1 /*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67
68 #include <debug.h>
69 #include <mach_pagemap.h>
70 #include <mach_cluster_stats.h>
71
72 #include <mach/mach_types.h>
73 #include <mach/memory_object.h>
74 #include <mach/memory_object_default.h>
75 #include <mach/memory_object_control_server.h>
76 #include <mach/mach_host_server.h>
77 #include <mach/upl.h>
78 #include <mach/vm_map.h>
79 #include <mach/vm_param.h>
80 #include <mach/vm_statistics.h>
81 #include <mach/sdt.h>
82
83 #include <kern/kern_types.h>
84 #include <kern/counters.h>
85 #include <kern/host_statistics.h>
86 #include <kern/machine.h>
87 #include <kern/misc_protos.h>
88 #include <kern/sched.h>
89 #include <kern/thread.h>
90 #include <kern/xpr.h>
91 #include <kern/kalloc.h>
92 #include <kern/policy_internal.h>
93
94 #include <machine/vm_tuning.h>
95 #include <machine/commpage.h>
96
97 #include <vm/pmap.h>
98 #include <vm/vm_compressor_pager.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_page.h>
103 #include <vm/vm_pageout.h>
104 #include <vm/vm_protos.h> /* must be last */
105 #include <vm/memory_object.h>
106 #include <vm/vm_purgeable_internal.h>
107 #include <vm/vm_shared_region.h>
108 #include <vm/vm_compressor.h>
109
110 #if CONFIG_PHANTOM_CACHE
111 #include <vm/vm_phantom_cache.h>
112 #endif
113 /*
114 * ENCRYPTED SWAP:
115 */
116 #include <libkern/crypto/aes.h>
117 extern u_int32_t random(void); /* from <libkern/libkern.h> */
118
119 extern int cs_debug;
120
121 #if UPL_DEBUG
122 #include <libkern/OSDebug.h>
123 #endif
124
125 extern void m_drain(void);
126
127 #if VM_PRESSURE_EVENTS
128 extern unsigned int memorystatus_available_pages;
129 extern unsigned int memorystatus_available_pages_pressure;
130 extern unsigned int memorystatus_available_pages_critical;
131 extern unsigned int memorystatus_frozen_count;
132 extern unsigned int memorystatus_suspended_count;
133
134 extern vm_pressure_level_t memorystatus_vm_pressure_level;
135 int memorystatus_purge_on_warning = 2;
136 int memorystatus_purge_on_urgent = 5;
137 int memorystatus_purge_on_critical = 8;
138
139 void vm_pressure_response(void);
140 boolean_t vm_pressure_thread_running = FALSE;
141 extern void consider_vm_pressure_events(void);
142
143 #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
144 #endif /* VM_PRESSURE_EVENTS */
145
146 boolean_t vm_pressure_changed = FALSE;
147
148 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */
149 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100
150 #endif
151
152 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
153 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
154 #endif
155
156 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
157 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
158 #endif
159
160 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
161 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
162 #endif
163
164 #ifndef VM_PAGE_LAUNDRY_MAX
165 #define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
166 #endif /* VM_PAGE_LAUNDRY_MAX */
167
168 #ifndef VM_PAGEOUT_BURST_WAIT
169 #define VM_PAGEOUT_BURST_WAIT 10 /* milliseconds */
170 #endif /* VM_PAGEOUT_BURST_WAIT */
171
172 #ifndef VM_PAGEOUT_EMPTY_WAIT
173 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
174 #endif /* VM_PAGEOUT_EMPTY_WAIT */
175
176 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
177 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
178 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
179
180 #ifndef VM_PAGEOUT_IDLE_WAIT
181 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
182 #endif /* VM_PAGEOUT_IDLE_WAIT */
183
184 #ifndef VM_PAGEOUT_SWAP_WAIT
185 #define VM_PAGEOUT_SWAP_WAIT 50 /* milliseconds */
186 #endif /* VM_PAGEOUT_SWAP_WAIT */
187
188 #ifndef VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED
189 #define VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED 1000 /* maximum pages considered before we issue a pressure event */
190 #endif /* VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED */
191
192 #ifndef VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS
193 #define VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS 5 /* seconds */
194 #endif /* VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS */
195
196 unsigned int vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
197 unsigned int vm_page_speculative_percentage = 5;
198
199 #ifndef VM_PAGE_SPECULATIVE_TARGET
200 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_page_speculative_percentage))
201 #endif /* VM_PAGE_SPECULATIVE_TARGET */
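/*
 * For example, with the default vm_page_speculative_percentage of 5 the
 * divisor is 100 / 5 = 20, so VM_PAGE_SPECULATIVE_TARGET(total) resolves
 * to total / 20, i.e. roughly 5% of the pages passed in.  Because 100 is
 * divided by the percentage using integer division, percentages that do
 * not divide 100 evenly are effectively rounded up (e.g. 30 behaves
 * like ~33%).
 */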
202
203
204 #ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
205 #define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
206 #endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
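/*
 * VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) is simply total / 200, i.e. 0.5%
 * of the page count handed in; vm_pageout_scan() applies it below to
 * vm_page_active_count to decide when the inactive + speculative pool
 * needs to catch up.
 */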
207
208
209 /*
210 * To obtain a reasonable LRU approximation, the inactive queue
211 * needs to be large enough to give pages on it a chance to be
212 * referenced a second time. This macro defines the fraction
213 * of active+inactive pages that should be inactive.
214 * The pageout daemon uses it to update vm_page_inactive_target.
215 *
216 * If vm_page_free_count falls below vm_page_free_target and
217 * vm_page_inactive_count is below vm_page_inactive_target,
218 * then the pageout daemon starts running.
219 */
220
221 #ifndef VM_PAGE_INACTIVE_TARGET
222 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
223 #endif /* VM_PAGE_INACTIVE_TARGET */
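/*
 * For example, with 1,000,000 pages spread across the active, inactive
 * and speculative queues, VM_PAGE_INACTIVE_TARGET yields 500,000: the
 * daemon aims to keep about half of those pages on the inactive queue so
 * each page gets a reasonable window in which a second reference can be
 * observed before it is reclaimed.
 */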
224
225 /*
226 * Once the pageout daemon starts running, it keeps going
227 * until vm_page_free_count meets or exceeds vm_page_free_target.
228 */
229
230 #ifndef VM_PAGE_FREE_TARGET
231 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
232 #endif /* VM_PAGE_FREE_TARGET */
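/*
 * For example, VM_PAGE_FREE_TARGET(800000) = 15 + 800000 / 80 = 10,015
 * pages; the fixed 15-page term only matters on very small pools.
 */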
233
234
235 /*
236 * The pageout daemon always starts running once vm_page_free_count
237 * falls below vm_page_free_min.
238 */
239
240 #ifndef VM_PAGE_FREE_MIN
241 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
242 #endif /* VM_PAGE_FREE_MIN */
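/*
 * Using the same example pool, VM_PAGE_FREE_MIN(800000) = 10 +
 * 800000 / 100 = 8,010 pages, which sits below the corresponding free
 * target, as expected for the "always start paging" floor.
 */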
243
244 #define VM_PAGE_FREE_RESERVED_LIMIT 1700
245 #define VM_PAGE_FREE_MIN_LIMIT 3500
246 #define VM_PAGE_FREE_TARGET_LIMIT 4000
247
248 /*
249 * When vm_page_free_count falls below vm_page_free_reserved,
250 * only vm-privileged threads can allocate pages. vm-privilege
251 * allows the pageout daemon and default pager (and any other
252 * associated threads needed for default pageout) to continue
253 * operation by dipping into the reserved pool of pages.
254 */
255
256 #ifndef VM_PAGE_FREE_RESERVED
257 #define VM_PAGE_FREE_RESERVED(n) \
258 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
259 #endif /* VM_PAGE_FREE_RESERVED */
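/*
 * With the default VM_PAGE_LAUNDRY_MAX of 128, VM_PAGE_FREE_RESERVED(n)
 * is 6 * 128 + n = 768 + n pages held back for vm-privileged threads.
 */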
260
261 /*
262 * When we dequeue pages from the inactive list, they are
263 * reactivated (i.e., put back on the active queue) if referenced.
264 * However, it is possible to starve the free list if other
265 * processors are referencing pages faster than we can turn off
266 * the referenced bit. So we limit the number of reactivations
267 * we will make per call of vm_pageout_scan().
268 */
269 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
270 #ifndef VM_PAGE_REACTIVATE_LIMIT
271 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
272 #endif /* VM_PAGE_REACTIVATE_LIMIT */
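/*
 * For example, with 1,000,000 available pages the limit is
 * MAX(1000000 / 20, VM_PAGE_REACTIVATE_LIMIT_MAX) = MAX(50000, 20000) =
 * 50,000 reactivations per call of vm_pageout_scan(); for pools smaller
 * than 400,000 pages the 20,000-page term dominates instead.
 */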
273 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
274
275
276 extern boolean_t hibernate_cleaning_in_progress;
277
278 /*
279 * Exported variable used to broadcast the activation of the pageout scan.
280 * Working Set uses this to throttle its use of pmap removes. In this
281 * way, code which runs within memory in an uncontested context does
282 * not keep encountering soft faults.
283 */
284
285 unsigned int vm_pageout_scan_event_counter = 0;
286
287 /*
288 * Forward declarations for internal routines.
289 */
290 struct cq {
291 struct vm_pageout_queue *q;
292 void *current_chead;
293 char *scratch_buf;
294 int id;
295 };
296 #define MAX_COMPRESSOR_THREAD_COUNT 8
297
298 struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
299
300 void *vm_pageout_immediate_chead;
301 char *vm_pageout_immediate_scratch_buf;
302
303
304 #if VM_PRESSURE_EVENTS
305 void vm_pressure_thread(void);
306
307 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
308 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
309
310 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
311 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
312 #endif
313 static void vm_pageout_garbage_collect(int);
314 static void vm_pageout_iothread_external(void);
315 static void vm_pageout_iothread_internal(struct cq *cq);
316 static void vm_pageout_adjust_io_throttles(struct vm_pageout_queue *, struct vm_pageout_queue *, boolean_t);
317
318 extern void vm_pageout_continue(void);
319 extern void vm_pageout_scan(void);
320
321 static void vm_pageout_immediate(vm_page_t, boolean_t);
322 boolean_t vm_compressor_immediate_preferred = FALSE;
323 boolean_t vm_compressor_immediate_preferred_override = FALSE;
324 boolean_t vm_restricted_to_single_processor = FALSE;
325 static boolean_t vm_pageout_waiter = FALSE;
326 static boolean_t vm_pageout_running = FALSE;
327
328
329 static thread_t vm_pageout_external_iothread = THREAD_NULL;
330 static thread_t vm_pageout_internal_iothread = THREAD_NULL;
331
332 unsigned int vm_pageout_reserved_internal = 0;
333 unsigned int vm_pageout_reserved_really = 0;
334
335 unsigned int vm_pageout_swap_wait = 0;
336 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
337 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
338 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
339 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
340 unsigned int vm_pageout_deadlock_relief = 0;
341 unsigned int vm_pageout_inactive_relief = 0;
342 unsigned int vm_pageout_burst_active_throttle = 0;
343 unsigned int vm_pageout_burst_inactive_throttle = 0;
344
345 int vm_upl_wait_for_pages = 0;
346
347
348 /*
349 * These variables record the pageout daemon's actions:
350 * how many pages it looks at and what happens to those pages.
351 * No locking needed because only one thread modifies the variables.
352 */
353
354 unsigned int vm_pageout_active = 0; /* debugging */
355 unsigned int vm_pageout_inactive = 0; /* debugging */
356 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
357 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
358 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
359 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
360 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
361 unsigned int vm_pageout_inactive_error = 0; /* debugging */
362 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
363 unsigned int vm_pageout_inactive_notalive = 0; /* debugging */
364 unsigned int vm_pageout_inactive_used = 0; /* debugging */
365 unsigned int vm_pageout_cache_evicted = 0; /* debugging */
366 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
367 unsigned int vm_pageout_speculative_clean = 0; /* debugging */
368
369 unsigned int vm_pageout_freed_from_cleaned = 0;
370 unsigned int vm_pageout_freed_from_speculative = 0;
371 unsigned int vm_pageout_freed_from_inactive_clean = 0;
372
373 unsigned int vm_pageout_enqueued_cleaned_from_inactive_clean = 0;
374 unsigned int vm_pageout_enqueued_cleaned_from_inactive_dirty = 0;
375
376 unsigned int vm_pageout_cleaned_reclaimed = 0; /* debugging; how many cleaned pages are reclaimed by the pageout scan */
377 unsigned int vm_pageout_cleaned_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on pageout (and are therefore reactivated) */
378 unsigned int vm_pageout_cleaned_reference_reactivated = 0;
379 unsigned int vm_pageout_cleaned_volatile_reactivated = 0;
380 unsigned int vm_pageout_cleaned_fault_reactivated = 0;
381 unsigned int vm_pageout_cleaned_commit_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on commit (and are therefore reactivated) */
382 unsigned int vm_pageout_cleaned_busy = 0;
383 unsigned int vm_pageout_cleaned_nolock = 0;
384
385 unsigned int vm_pageout_inactive_dirty_internal = 0; /* debugging */
386 unsigned int vm_pageout_inactive_dirty_external = 0; /* debugging */
387 unsigned int vm_pageout_inactive_deactivated = 0; /* debugging */
388 unsigned int vm_pageout_inactive_anonymous = 0; /* debugging */
389 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
390 unsigned int vm_pageout_purged_objects = 0; /* used for sysctl vm stats */
391 unsigned int vm_stat_discard = 0; /* debugging */
392 unsigned int vm_stat_discard_sent = 0; /* debugging */
393 unsigned int vm_stat_discard_failure = 0; /* debugging */
394 unsigned int vm_stat_discard_throttle = 0; /* debugging */
395 unsigned int vm_pageout_reactivation_limit_exceeded = 0; /* debugging */
396 unsigned int vm_pageout_catch_ups = 0; /* debugging */
397 unsigned int vm_pageout_inactive_force_reclaim = 0; /* debugging */
398
399 unsigned int vm_pageout_scan_reclaimed_throttled = 0;
400 unsigned int vm_pageout_scan_active_throttled = 0;
401 unsigned int vm_pageout_scan_inactive_throttled_internal = 0;
402 unsigned int vm_pageout_scan_inactive_throttled_external = 0;
403 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
404 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
405 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
406 unsigned int vm_pageout_scan_swap_throttle = 0; /* debugging */
407 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
408 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
409 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
410 unsigned int vm_pageout_inactive_external_forced_jetsam_count = 0; /* debugging */
411 unsigned int vm_pageout_scan_throttle_deferred = 0; /* debugging */
412 unsigned int vm_pageout_scan_yield_unthrottled = 0; /* debugging */
413 unsigned int vm_page_speculative_count_drifts = 0;
414 unsigned int vm_page_speculative_count_drift_max = 0;
415
416
417 /*
418 * Backing store throttle when BS is exhausted
419 */
420 unsigned int vm_backing_store_low = 0;
421
422 unsigned int vm_pageout_out_of_line = 0;
423 unsigned int vm_pageout_in_place = 0;
424
425 unsigned int vm_page_steal_pageout_page = 0;
426
427 struct vm_config vm_config;
428
429 /*
430 * ENCRYPTED SWAP:
431 * counters and statistics...
432 */
433 unsigned long vm_page_decrypt_counter = 0;
434 unsigned long vm_page_decrypt_for_upl_counter = 0;
435 unsigned long vm_page_encrypt_counter = 0;
436 unsigned long vm_page_encrypt_abort_counter = 0;
437 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
438 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
439
440 struct vm_pageout_queue vm_pageout_queue_internal __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
441 struct vm_pageout_queue vm_pageout_queue_external __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
442
443 unsigned int vm_page_speculative_target = 0;
444
445 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
446
447 boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL;
448
449 #if DEVELOPMENT || DEBUG
450 unsigned long vm_cs_validated_resets = 0;
451 #endif
452
453 int vm_debug_events = 0;
454
455 #if CONFIG_MEMORYSTATUS
456 #if !CONFIG_JETSAM
457 extern boolean_t memorystatus_idle_exit_from_VM(void);
458 #endif
459 extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async);
460 extern void memorystatus_on_pageout_scan_end(void);
461
462 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
463 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
464 #if DEVELOPMENT || DEBUG
465 uint32_t vm_grab_anon_overrides = 0;
466 uint32_t vm_grab_anon_nops = 0;
467 #endif
468
469 #endif
470
471 /*
472 * Routine: vm_backing_store_disable
473 * Purpose:
474 * Suspend non-privileged threads wishing to extend
475 * backing store when we are low on backing store
476 * (Synchronized by caller)
477 */
478 void
479 vm_backing_store_disable(
480 boolean_t disable)
481 {
482 if(disable) {
483 vm_backing_store_low = 1;
484 } else {
485 if(vm_backing_store_low) {
486 vm_backing_store_low = 0;
487 thread_wakeup((event_t) &vm_backing_store_low);
488 }
489 }
490 }
491
492
493 #if MACH_CLUSTER_STATS
494 unsigned long vm_pageout_cluster_dirtied = 0;
495 unsigned long vm_pageout_cluster_cleaned = 0;
496 unsigned long vm_pageout_cluster_collisions = 0;
497 unsigned long vm_pageout_cluster_clusters = 0;
498 unsigned long vm_pageout_cluster_conversions = 0;
499 unsigned long vm_pageout_target_collisions = 0;
500 unsigned long vm_pageout_target_page_dirtied = 0;
501 unsigned long vm_pageout_target_page_freed = 0;
502 #define CLUSTER_STAT(clause) clause
503 #else /* MACH_CLUSTER_STATS */
504 #define CLUSTER_STAT(clause)
505 #endif /* MACH_CLUSTER_STATS */
506
507 /*
508 * Routine: vm_pageout_object_terminate
509 * Purpose:
510 * Destroy the pageout_object, and perform all of the
511 * required cleanup actions.
512 *
513 * In/Out conditions:
514 * The object must be locked, and will be returned locked.
515 */
516 void
517 vm_pageout_object_terminate(
518 vm_object_t object)
519 {
520 vm_object_t shadow_object;
521
522 /*
523 * Deal with the deallocation (last reference) of a pageout object
524 * (used for cleaning-in-place) by dropping the paging references/
525 * freeing pages in the original object.
526 */
527
528 assert(object->pageout);
529 shadow_object = object->shadow;
530 vm_object_lock(shadow_object);
531
532 while (!vm_page_queue_empty(&object->memq)) {
533 vm_page_t p, m;
534 vm_object_offset_t offset;
535
536 p = (vm_page_t) vm_page_queue_first(&object->memq);
537
538 assert(p->private);
539 assert(p->free_when_done);
540 p->free_when_done = FALSE;
541 assert(!p->cleaning);
542 assert(!p->laundry);
543
544 offset = p->offset;
545 VM_PAGE_FREE(p);
546 p = VM_PAGE_NULL;
547
548 m = vm_page_lookup(shadow_object,
549 offset + object->vo_shadow_offset);
550
551 if(m == VM_PAGE_NULL)
552 continue;
553
554 assert((m->dirty) || (m->precious) ||
555 (m->busy && m->cleaning));
556
557 /*
558 * Handle the trusted pager throttle.
559 * Also decrement the burst throttle (if external).
560 */
561 vm_page_lock_queues();
562 if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)
563 vm_pageout_throttle_up(m);
564
565 /*
566 * Handle the "target" page(s). These pages are to be freed if
567 * successfully cleaned. Target pages are always busy, and are
568 * wired exactly once. The initial target pages are not mapped,
569 * (so cannot be referenced or modified) but converted target
570 * pages may have been modified between the selection as an
571 * adjacent page and conversion to a target.
572 */
573 if (m->free_when_done) {
574 assert(m->busy);
575 assert(m->vm_page_q_state == VM_PAGE_IS_WIRED);
576 assert(m->wire_count == 1);
577 m->cleaning = FALSE;
578 m->encrypted_cleaning = FALSE;
579 m->free_when_done = FALSE;
580 #if MACH_CLUSTER_STATS
581 if (m->wanted) vm_pageout_target_collisions++;
582 #endif
583 /*
584 * Revoke all access to the page. Since the object is
585 * locked, and the page is busy, this prevents the page
586 * from being dirtied after the pmap_disconnect() call
587 * returns.
588 *
589 * Since the page is left "dirty" but "not modified", we
590 * can detect whether the page was redirtied during
591 * pageout by checking the modify state.
592 */
593 if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
594 SET_PAGE_DIRTY(m, FALSE);
595 } else {
596 m->dirty = FALSE;
597 }
598
599 if (m->dirty) {
600 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
601 vm_page_unwire(m, TRUE); /* reactivates */
602 VM_STAT_INCR(reactivations);
603 PAGE_WAKEUP_DONE(m);
604 } else {
605 CLUSTER_STAT(vm_pageout_target_page_freed++;)
606 vm_page_free(m);/* clears busy, etc. */
607 }
608 vm_page_unlock_queues();
609 continue;
610 }
611 /*
612 * Handle the "adjacent" pages. These pages were cleaned in
613 * place, and should be left alone.
614 * If prep_pin_count is nonzero, then someone is using the
615 * page, so make it active.
616 */
617 if ((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) && !m->private) {
618 if (m->reference)
619 vm_page_activate(m);
620 else
621 vm_page_deactivate(m);
622 }
623 if (m->overwriting) {
624 /*
625 * the (COPY_OUT_FROM == FALSE) request_page_list case
626 */
627 if (m->busy) {
628 /*
629 * We do not re-set m->dirty !
630 * The page was busy so no extraneous activity
631 * could have occurred. COPY_INTO is a read into the
632 * new pages. CLEAN_IN_PLACE does actually write
633 * out the pages but handling outside of this code
634 * will take care of resetting dirty. We clear the
635 * modify however for the Programmed I/O case.
636 */
637 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
638
639 m->busy = FALSE;
640 m->absent = FALSE;
641 } else {
642 /*
643 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
644 * Occurs when the original page was wired
645 * at the time of the list request
646 */
647 assert(VM_PAGE_WIRED(m));
648 vm_page_unwire(m, TRUE); /* reactivates */
649 }
650 m->overwriting = FALSE;
651 } else {
652 /*
653 * Set the dirty state according to whether or not the page was
654 * modified during the pageout. Note that we purposefully do
655 * NOT call pmap_clear_modify since the page is still mapped.
656 * If the page were to be dirtied between the 2 calls, this
657 * fact would be lost. This code is only necessary to
658 * maintain statistics, since the pmap module is always
659 * consulted if m->dirty is false.
660 */
661 #if MACH_CLUSTER_STATS
662 m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m));
663
664 if (m->dirty) vm_pageout_cluster_dirtied++;
665 else vm_pageout_cluster_cleaned++;
666 if (m->wanted) vm_pageout_cluster_collisions++;
667 #else
668 m->dirty = FALSE;
669 #endif
670 }
671 if (m->encrypted_cleaning == TRUE) {
672 m->encrypted_cleaning = FALSE;
673 m->busy = FALSE;
674 }
675 m->cleaning = FALSE;
676
677 /*
678 * Wakeup any thread waiting for the page to be un-cleaning.
679 */
680 PAGE_WAKEUP(m);
681 vm_page_unlock_queues();
682 }
683 /*
684 * Account for the paging reference taken in vm_paging_object_allocate.
685 */
686 vm_object_activity_end(shadow_object);
687 vm_object_unlock(shadow_object);
688
689 assert(object->ref_count == 0);
690 assert(object->paging_in_progress == 0);
691 assert(object->activity_in_progress == 0);
692 assert(object->resident_page_count == 0);
693 return;
694 }
695
696 /*
697 * Routine: vm_pageclean_setup
698 *
699 * Purpose: setup a page to be cleaned (made non-dirty), but not
700 * necessarily flushed from the VM page cache.
701 * This is accomplished by cleaning in place.
702 *
703 * The page must not be busy, and new_object
704 * must be locked.
705 *
706 */
707 static void
708 vm_pageclean_setup(
709 vm_page_t m,
710 vm_page_t new_m,
711 vm_object_t new_object,
712 vm_object_offset_t new_offset)
713 {
714 assert(!m->busy);
715 #if 0
716 assert(!m->cleaning);
717 #endif
718
719 XPR(XPR_VM_PAGEOUT,
720 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
721 VM_PAGE_OBJECT(m), m->offset, m,
722 new_m, new_offset);
723
724 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
725
726 /*
727 * Mark original page as cleaning in place.
728 */
729 m->cleaning = TRUE;
730 SET_PAGE_DIRTY(m, FALSE);
731 m->precious = FALSE;
732
733 /*
734 * Convert the fictitious page to a private shadow of
735 * the real page.
736 */
737 assert(new_m->fictitious);
738 assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
739 new_m->fictitious = FALSE;
740 new_m->private = TRUE;
741 new_m->free_when_done = TRUE;
742 VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
743
744 vm_page_lockspin_queues();
745 vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
746 vm_page_unlock_queues();
747
748 vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
749 assert(!new_m->wanted);
750 new_m->busy = FALSE;
751 }
752
753 /*
754 * Routine: vm_pageout_initialize_page
755 * Purpose:
756 * Causes the specified page to be initialized in
757 * the appropriate memory object. This routine is used to push
758 * pages into a copy-object when they are modified in the
759 * permanent object.
760 *
761 * The page is moved to a temporary object and paged out.
762 *
763 * In/out conditions:
764 * The page in question must not be on any pageout queues.
765 * The object to which it belongs must be locked.
766 * The page must be busy, but not hold a paging reference.
767 *
768 * Implementation:
769 * Move this page to a completely new object.
770 */
771 void
772 vm_pageout_initialize_page(
773 vm_page_t m)
774 {
775 vm_object_t object;
776 vm_object_offset_t paging_offset;
777 memory_object_t pager;
778
779 XPR(XPR_VM_PAGEOUT,
780 "vm_pageout_initialize_page, page 0x%X\n",
781 m, 0, 0, 0, 0);
782
783 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
784
785 object = VM_PAGE_OBJECT(m);
786
787 assert(m->busy);
788 assert(object->internal);
789
790 /*
791 * Verify that we really want to clean this page
792 */
793 assert(!m->absent);
794 assert(!m->error);
795 assert(m->dirty);
796
797 /*
798 * Create a paging reference to let us play with the object.
799 */
800 paging_offset = m->offset + object->paging_offset;
801
802 if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
803 panic("reservation without pageout?"); /* alan */
804
805 VM_PAGE_FREE(m);
806 vm_object_unlock(object);
807
808 return;
809 }
810
811 /*
812 * If there's no pager, then we can't clean the page. This should
813 * never happen since this should be a copy object and therefore not
814 * an external object, so the pager should always be there.
815 */
816
817 pager = object->pager;
818
819 if (pager == MEMORY_OBJECT_NULL) {
820 panic("missing pager for copy object");
821
822 VM_PAGE_FREE(m);
823 return;
824 }
825
826 /*
827 * set the page for future call to vm_fault_list_request
828 */
829 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
830 SET_PAGE_DIRTY(m, FALSE);
831
832 /*
833 * keep the object from collapsing or terminating
834 */
835 vm_object_paging_begin(object);
836 vm_object_unlock(object);
837
838 /*
839 * Write the data to its pager.
840 * Note that the data is passed by naming the new object,
841 * not a virtual address; the pager interface has been
842 * manipulated to use the "internal memory" data type.
843 * [The object reference from its allocation is donated
844 * to the eventual recipient.]
845 */
846 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
847
848 vm_object_lock(object);
849 vm_object_paging_end(object);
850 }
851
852 #if MACH_CLUSTER_STATS
853 #define MAXCLUSTERPAGES 16
854 struct {
855 unsigned long pages_in_cluster;
856 unsigned long pages_at_higher_offsets;
857 unsigned long pages_at_lower_offsets;
858 } cluster_stats[MAXCLUSTERPAGES];
859 #endif /* MACH_CLUSTER_STATS */
860
861
862 /*
863 * vm_pageout_cluster:
864 *
865 * Given a page, queue it to the appropriate I/O thread,
866 * which will page it out and attempt to clean adjacent pages
867 * in the same operation.
868 *
869 * The object and queues must be locked. We will take a
870 * paging reference to prevent deallocation or collapse when we
871 * release the object lock back at the call site. The I/O thread
872 * is responsible for consuming this reference
873 *
874 * The page must not be on any pageout queue.
875 */
876
877 int
878 vm_pageout_cluster(vm_page_t m, boolean_t immediate_ok, boolean_t keep_object_locked)
879 {
880 vm_object_t object = VM_PAGE_OBJECT(m);
881 struct vm_pageout_queue *q;
882
883
884 XPR(XPR_VM_PAGEOUT,
885 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
886 object, m->offset, m, 0, 0);
887
888 VM_PAGE_CHECK(m);
889 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
890 vm_object_lock_assert_exclusive(object);
891
892 /*
893 * Only a certain kind of page is appreciated here.
894 */
895 assert((m->dirty || m->precious) && (!VM_PAGE_WIRED(m)));
896 assert(!m->cleaning && !m->laundry);
897 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
898
899 /*
900 * protect the object from collapse or termination
901 */
902 vm_object_activity_begin(object);
903
904 if (object->internal == TRUE) {
905 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
906
907 m->busy = TRUE;
908
909 if (vm_compressor_immediate_preferred == TRUE && immediate_ok == TRUE) {
910 panic("immediate compressor mode no longer supported\n");
911
912 if (keep_object_locked == FALSE)
913 vm_object_unlock(object);
914 vm_page_unlock_queues();
915
916 vm_pageout_immediate(m, keep_object_locked);
917
918 return (1);
919 }
920 q = &vm_pageout_queue_internal;
921 } else
922 q = &vm_pageout_queue_external;
923
924 /*
925 * pgo_laundry count is tied to the laundry bit
926 */
927 m->laundry = TRUE;
928 q->pgo_laundry++;
929
930 m->vm_page_q_state = VM_PAGE_ON_PAGEOUT_Q;
931 vm_page_queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
932
933 if (q->pgo_idle == TRUE) {
934 q->pgo_idle = FALSE;
935 thread_wakeup((event_t) &q->pgo_pending);
936 }
937 VM_PAGE_CHECK(m);
938
939 return (0);
940 }
941
942
943 unsigned long vm_pageout_throttle_up_count = 0;
944
945 /*
946 * A page is back from laundry or we are stealing it back from
947 * the laundering state. See if there are some pages waiting to
948 * go to laundry and if we can let some of them go now.
949 *
950 * Object and page queues must be locked.
951 */
952 void
953 vm_pageout_throttle_up(
954 vm_page_t m)
955 {
956 struct vm_pageout_queue *q;
957 vm_object_t m_object;
958
959 m_object = VM_PAGE_OBJECT(m);
960
961 assert(m_object != VM_OBJECT_NULL);
962 assert(m_object != kernel_object);
963
964 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
965 vm_object_lock_assert_exclusive(m_object);
966
967 vm_pageout_throttle_up_count++;
968
969 if (m_object->internal == TRUE)
970 q = &vm_pageout_queue_internal;
971 else
972 q = &vm_pageout_queue_external;
973
974 if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
975
976 vm_page_queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
977 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
978
979 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
980
981 vm_object_activity_end(m_object);
982 }
983 if (m->laundry == TRUE) {
984
985 m->laundry = FALSE;
986 q->pgo_laundry--;
987
988 if (q->pgo_throttled == TRUE) {
989 q->pgo_throttled = FALSE;
990 thread_wakeup((event_t) &q->pgo_laundry);
991 }
992 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
993 q->pgo_draining = FALSE;
994 thread_wakeup((event_t) (&q->pgo_laundry+1));
995 }
996 }
997 }
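/*
 * Note on the two wakeups above: &q->pgo_laundry is the event slept on by
 * whoever set q->pgo_throttled while waiting for the laundry count to
 * drop, while &q->pgo_laundry + 1 is a distinct event used by callers
 * draining the queue completely (see the matching assert_wait on
 * (&iq->pgo_laundry + 1) in vm_pageout_page_queue below).
 * vm_pageout_throttle_up_batch uses the same pairing.
 */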
998
999
1000 static void
1001 vm_pageout_throttle_up_batch(
1002 struct vm_pageout_queue *q,
1003 int batch_cnt)
1004 {
1005 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1006
1007 vm_pageout_throttle_up_count += batch_cnt;
1008
1009 q->pgo_laundry -= batch_cnt;
1010
1011 if (q->pgo_throttled == TRUE) {
1012 q->pgo_throttled = FALSE;
1013 thread_wakeup((event_t) &q->pgo_laundry);
1014 }
1015 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
1016 q->pgo_draining = FALSE;
1017 thread_wakeup((event_t) (&q->pgo_laundry+1));
1018 }
1019 }
1020
1021
1022
1023 /*
1024 * VM memory pressure monitoring.
1025 *
1026 * vm_pageout_scan() keeps track of the number of pages it considers and
1027 * reclaims, in the currently active vm_pageout_stats[vm_pageout_stat_now].
1028 *
1029 * compute_memory_pressure() is called every second from compute_averages()
1030 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
1031 * of reclaimed pages in a new vm_pageout_stats[] bucket.
1032 *
1033 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
1034 * The caller provides the number of seconds ("nsecs") worth of statistics
1035 * it wants, up to 30 seconds.
1036 * It computes the number of pages reclaimed in the past "nsecs" seconds and
1037 * also returns the number of pages the system still needs to reclaim at this
1038 * moment in time.
1039 */
1040 #define VM_PAGEOUT_STAT_SIZE 31
1041 struct vm_pageout_stat {
1042 unsigned int considered;
1043 unsigned int reclaimed;
1044 } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0}, };
1045 unsigned int vm_pageout_stat_now = 0;
1046 unsigned int vm_memory_pressure = 0;
1047
1048 #define VM_PAGEOUT_STAT_BEFORE(i) \
1049 (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
1050 #define VM_PAGEOUT_STAT_AFTER(i) \
1051 (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
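/*
 * The two macros above walk the 31-entry ring with wrap-around: for
 * example, VM_PAGEOUT_STAT_BEFORE(0) is 30 and VM_PAGEOUT_STAT_AFTER(30)
 * is 0.  compute_memory_pressure() advances "now" once per second, and
 * mach_vm_pressure_monitor() can walk backwards through up to 30 seconds
 * of history without ever indexing outside vm_pageout_stats[].
 */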
1052
1053 #if VM_PAGE_BUCKETS_CHECK
1054 int vm_page_buckets_check_interval = 10; /* in seconds */
1055 #endif /* VM_PAGE_BUCKETS_CHECK */
1056
1057 /*
1058 * Called from compute_averages().
1059 */
1060 void
1061 compute_memory_pressure(
1062 __unused void *arg)
1063 {
1064 unsigned int vm_pageout_next;
1065
1066 #if VM_PAGE_BUCKETS_CHECK
1067 /* check the consistency of VM page buckets at regular interval */
1068 static int counter = 0;
1069 if ((++counter % vm_page_buckets_check_interval) == 0) {
1070 vm_page_buckets_check();
1071 }
1072 #endif /* VM_PAGE_BUCKETS_CHECK */
1073
1074 vm_memory_pressure =
1075 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed;
1076
1077 commpage_set_memory_pressure( vm_memory_pressure );
1078
1079 /* move "now" forward */
1080 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
1081 vm_pageout_stats[vm_pageout_next].considered = 0;
1082 vm_pageout_stats[vm_pageout_next].reclaimed = 0;
1083 vm_pageout_stat_now = vm_pageout_next;
1084 }
1085
1086
1087 /*
1088 * IMPORTANT
1089 * mach_vm_ctl_page_free_wanted() is called indirectly, via
1090 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
1091 * it must be safe in the restricted stackshot context. Locks and/or
1092 * blocking are not allowable.
1093 */
1094 unsigned int
1095 mach_vm_ctl_page_free_wanted(void)
1096 {
1097 unsigned int page_free_target, page_free_count, page_free_wanted;
1098
1099 page_free_target = vm_page_free_target;
1100 page_free_count = vm_page_free_count;
1101 if (page_free_target > page_free_count) {
1102 page_free_wanted = page_free_target - page_free_count;
1103 } else {
1104 page_free_wanted = 0;
1105 }
1106
1107 return page_free_wanted;
1108 }
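/*
 * For example, if vm_page_free_target is 4,000 pages and
 * vm_page_free_count has dropped to 3,200, this reports a shortfall of
 * 800 pages; once the free count meets or exceeds the target it reports
 * 0.  Only two global counters are read, with no locking or blocking,
 * which is what keeps this usable from the restricted stackshot context
 * described above.
 */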
1109
1110
1111 /*
1112 * IMPORTANT:
1113 * mach_vm_pressure_monitor() is called when taking a stackshot, with
1114 * wait_for_pressure FALSE, so that code path must remain safe in the
1115 * restricted stackshot context. No blocking or locks are allowable
1116 * on that code path.
1117 */
1118
1119 kern_return_t
1120 mach_vm_pressure_monitor(
1121 boolean_t wait_for_pressure,
1122 unsigned int nsecs_monitored,
1123 unsigned int *pages_reclaimed_p,
1124 unsigned int *pages_wanted_p)
1125 {
1126 wait_result_t wr;
1127 unsigned int vm_pageout_then, vm_pageout_now;
1128 unsigned int pages_reclaimed;
1129
1130 /*
1131 * We don't take the vm_page_queue_lock here because we don't want
1132 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1133 * thread when it's trying to reclaim memory. We don't need fully
1134 * accurate monitoring anyway...
1135 */
1136
1137 if (wait_for_pressure) {
1138 /* wait until there's memory pressure */
1139 while (vm_page_free_count >= vm_page_free_target) {
1140 wr = assert_wait((event_t) &vm_page_free_wanted,
1141 THREAD_INTERRUPTIBLE);
1142 if (wr == THREAD_WAITING) {
1143 wr = thread_block(THREAD_CONTINUE_NULL);
1144 }
1145 if (wr == THREAD_INTERRUPTED) {
1146 return KERN_ABORTED;
1147 }
1148 if (wr == THREAD_AWAKENED) {
1149 /*
1150 * The memory pressure might have already
1151 * been relieved but let's not block again
1152 * and let's report that there was memory
1153 * pressure at some point.
1154 */
1155 break;
1156 }
1157 }
1158 }
1159
1160 /* provide the number of pages the system wants to reclaim */
1161 if (pages_wanted_p != NULL) {
1162 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1163 }
1164
1165 if (pages_reclaimed_p == NULL) {
1166 return KERN_SUCCESS;
1167 }
1168
1169 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1170 vm_pageout_now = vm_pageout_stat_now;
1171 pages_reclaimed = 0;
1172 for (vm_pageout_then =
1173 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1174 vm_pageout_then != vm_pageout_now &&
1175 nsecs_monitored-- != 0;
1176 vm_pageout_then =
1177 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1178 pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed;
1179 }
1180 *pages_reclaimed_p = pages_reclaimed;
1181
1182 return KERN_SUCCESS;
1183 }
1184
1185
1186
1187 #if DEVELOPMENT || DEBUG
1188
1189 static void
1190 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1191
1192 /*
1193 * condition variable used to make sure there is
1194 * only a single sweep going on at a time
1195 */
1196 boolean_t vm_pageout_disconnect_all_pages_active = FALSE;
1197
1198
1199 void
1200 vm_pageout_disconnect_all_pages()
1201 {
1202 vm_page_lock_queues();
1203
1204 if (vm_pageout_disconnect_all_pages_active == TRUE) {
1205 vm_page_unlock_queues();
1206 return;
1207 }
1208 vm_pageout_disconnect_all_pages_active = TRUE;
1209 vm_page_unlock_queues();
1210
1211 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1212 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1213 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1214
1215 vm_pageout_disconnect_all_pages_active = FALSE;
1216 }
1217
1218
1219 void
1220 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1221 {
1222 vm_page_t m;
1223 vm_object_t t_object = NULL;
1224 vm_object_t l_object = NULL;
1225 vm_object_t m_object = NULL;
1226 int delayed_unlock = 0;
1227 int try_failed_count = 0;
1228 int disconnected_count = 0;
1229 int paused_count = 0;
1230 int object_locked_count = 0;
1231
1232 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
1233 q, qcount, 0, 0, 0);
1234
1235 vm_page_lock_queues();
1236
1237 while (qcount && !vm_page_queue_empty(q)) {
1238
1239 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1240
1241 m = (vm_page_t) vm_page_queue_first(q);
1242 m_object = VM_PAGE_OBJECT(m);
1243
1244 /*
1245 * check to see if we currently are working
1246 * with the same object... if so, we've
1247 * already got the lock
1248 */
1249 if (m_object != l_object) {
1250 /*
1251 * the object associated with candidate page is
1252 * different from the one we were just working
1253 * with... dump the lock if we still own it
1254 */
1255 if (l_object != NULL) {
1256 vm_object_unlock(l_object);
1257 l_object = NULL;
1258 }
1259 if (m_object != t_object)
1260 try_failed_count = 0;
1261
1262 /*
1263 * Try to lock object; since we've already got the
1264 * page queues lock, we can only 'try' for this one.
1265 * if the 'try' fails, we need to do a mutex_pause
1266 * to allow the owner of the object lock a chance to
1267 * run...
1268 */
1269 if ( !vm_object_lock_try_scan(m_object)) {
1270
1271 if (try_failed_count > 20) {
1272 goto reenter_pg_on_q;
1273 }
1274 vm_page_unlock_queues();
1275 mutex_pause(try_failed_count++);
1276 vm_page_lock_queues();
1277 delayed_unlock = 0;
1278
1279 paused_count++;
1280
1281 t_object = m_object;
1282 continue;
1283 }
1284 object_locked_count++;
1285
1286 l_object = m_object;
1287 }
1288 if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) {
1289 /*
1290 * put it back on the head of its queue
1291 */
1292 goto reenter_pg_on_q;
1293 }
1294 if (m->pmapped == TRUE) {
1295
1296 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1297
1298 disconnected_count++;
1299 }
1300 reenter_pg_on_q:
1301 vm_page_queue_remove(q, m, vm_page_t, pageq);
1302 vm_page_queue_enter(q, m, vm_page_t, pageq);
1303
1304 qcount--;
1305 try_failed_count = 0;
1306
1307 if (delayed_unlock++ > 128) {
1308
1309 if (l_object != NULL) {
1310 vm_object_unlock(l_object);
1311 l_object = NULL;
1312 }
1313 lck_mtx_yield(&vm_page_queue_lock);
1314 delayed_unlock = 0;
1315 }
1316 }
1317 if (l_object != NULL) {
1318 vm_object_unlock(l_object);
1319 l_object = NULL;
1320 }
1321 vm_page_unlock_queues();
1322
1323 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
1324 q, disconnected_count, object_locked_count, paused_count, 0);
1325 }
1326
1327 #endif
1328
1329
1330 static void
1331 vm_pageout_page_queue(vm_page_queue_head_t *, int);
1332
1333 /*
1334 * condition variable used to make sure there is
1335 * only a single sweep going on at a time
1336 */
1337 boolean_t vm_pageout_anonymous_pages_active = FALSE;
1338
1339
1340 void
1341 vm_pageout_anonymous_pages()
1342 {
1343 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1344
1345 vm_page_lock_queues();
1346
1347 if (vm_pageout_anonymous_pages_active == TRUE) {
1348 vm_page_unlock_queues();
1349 return;
1350 }
1351 vm_pageout_anonymous_pages_active = TRUE;
1352 vm_page_unlock_queues();
1353
1354 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1355 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1356 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count);
1357
1358 if (VM_CONFIG_SWAP_IS_PRESENT)
1359 vm_consider_swapping();
1360
1361 vm_page_lock_queues();
1362 vm_pageout_anonymous_pages_active = FALSE;
1363 vm_page_unlock_queues();
1364 }
1365 }
1366
1367
1368 void
1369 vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
1370 {
1371 vm_page_t m;
1372 vm_object_t t_object = NULL;
1373 vm_object_t l_object = NULL;
1374 vm_object_t m_object = NULL;
1375 int delayed_unlock = 0;
1376 int try_failed_count = 0;
1377 int refmod_state;
1378 int pmap_options;
1379 struct vm_pageout_queue *iq;
1380 ppnum_t phys_page;
1381
1382
1383 iq = &vm_pageout_queue_internal;
1384
1385 vm_page_lock_queues();
1386
1387 while (qcount && !vm_page_queue_empty(q)) {
1388
1389 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1390
1391 if (VM_PAGE_Q_THROTTLED(iq)) {
1392
1393 if (l_object != NULL) {
1394 vm_object_unlock(l_object);
1395 l_object = NULL;
1396 }
1397 iq->pgo_draining = TRUE;
1398
1399 assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1400 vm_page_unlock_queues();
1401
1402 thread_block(THREAD_CONTINUE_NULL);
1403
1404 vm_page_lock_queues();
1405 delayed_unlock = 0;
1406 continue;
1407 }
1408 m = (vm_page_t) vm_page_queue_first(q);
1409 m_object = VM_PAGE_OBJECT(m);
1410
1411 /*
1412 * check to see if we currently are working
1413 * with the same object... if so, we've
1414 * already got the lock
1415 */
1416 if (m_object != l_object) {
1417 if ( !m_object->internal)
1418 goto reenter_pg_on_q;
1419
1420 /*
1421 * the object associated with candidate page is
1422 * different from the one we were just working
1423 * with... dump the lock if we still own it
1424 */
1425 if (l_object != NULL) {
1426 vm_object_unlock(l_object);
1427 l_object = NULL;
1428 }
1429 if (m_object != t_object)
1430 try_failed_count = 0;
1431
1432 /*
1433 * Try to lock object; since we've already got the
1434 * page queues lock, we can only 'try' for this one.
1435 * if the 'try' fails, we need to do a mutex_pause
1436 * to allow the owner of the object lock a chance to
1437 * run...
1438 */
1439 if ( !vm_object_lock_try_scan(m_object)) {
1440
1441 if (try_failed_count > 20) {
1442 goto reenter_pg_on_q;
1443 }
1444 vm_page_unlock_queues();
1445 mutex_pause(try_failed_count++);
1446 vm_page_lock_queues();
1447 delayed_unlock = 0;
1448
1449 t_object = m_object;
1450 continue;
1451 }
1452 l_object = m_object;
1453 }
1454 if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) {
1455 /*
1456 * page is not to be cleaned
1457 * put it back on the head of its queue
1458 */
1459 goto reenter_pg_on_q;
1460 }
1461 phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1462
1463 if (m->reference == FALSE && m->pmapped == TRUE) {
1464 refmod_state = pmap_get_refmod(phys_page);
1465
1466 if (refmod_state & VM_MEM_REFERENCED)
1467 m->reference = TRUE;
1468 if (refmod_state & VM_MEM_MODIFIED) {
1469 SET_PAGE_DIRTY(m, FALSE);
1470 }
1471 }
1472 if (m->reference == TRUE) {
1473 m->reference = FALSE;
1474 pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1475 goto reenter_pg_on_q;
1476 }
1477 if (m->pmapped == TRUE) {
1478 if (m->dirty || m->precious) {
1479 pmap_options = PMAP_OPTIONS_COMPRESSOR;
1480 } else {
1481 pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1482 }
1483 refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1484 if (refmod_state & VM_MEM_MODIFIED) {
1485 SET_PAGE_DIRTY(m, FALSE);
1486 }
1487 }
1488 if ( !m->dirty && !m->precious) {
1489 vm_page_unlock_queues();
1490 VM_PAGE_FREE(m);
1491 vm_page_lock_queues();
1492 delayed_unlock = 0;
1493
1494 goto next_pg;
1495 }
1496 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1497
1498 if (!m_object->pager_initialized) {
1499
1500 vm_page_unlock_queues();
1501
1502 vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1503
1504 if (!m_object->pager_initialized)
1505 vm_object_compressor_pager_create(m_object);
1506
1507 vm_page_lock_queues();
1508 delayed_unlock = 0;
1509 }
1510 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL)
1511 goto reenter_pg_on_q;
1512 /*
1513 * vm_object_compressor_pager_create will drop the object lock
1514 * which means 'm' may no longer be valid to use
1515 */
1516 continue;
1517 }
1518 /*
1519 * we've already factored out pages in the laundry which
1520 * means this page can't be on the pageout queue so it's
1521 * safe to do the vm_page_queues_remove
1522 */
1523 vm_page_queues_remove(m, TRUE);
1524
1525 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1526
1527 vm_pageout_cluster(m, FALSE, FALSE);
1528
1529 goto next_pg;
1530
1531 reenter_pg_on_q:
1532 vm_page_queue_remove(q, m, vm_page_t, pageq);
1533 vm_page_queue_enter(q, m, vm_page_t, pageq);
1534 next_pg:
1535 qcount--;
1536 try_failed_count = 0;
1537
1538 if (delayed_unlock++ > 128) {
1539
1540 if (l_object != NULL) {
1541 vm_object_unlock(l_object);
1542 l_object = NULL;
1543 }
1544 lck_mtx_yield(&vm_page_queue_lock);
1545 delayed_unlock = 0;
1546 }
1547 }
1548 if (l_object != NULL) {
1549 vm_object_unlock(l_object);
1550 l_object = NULL;
1551 }
1552 vm_page_unlock_queues();
1553 }
1554
1555
1556
1557 /*
1558 * function in BSD to apply I/O throttle to the pageout thread
1559 */
1560 extern void vm_pageout_io_throttle(void);
1561
1562 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \
1563 MACRO_BEGIN \
1564 /* \
1565 * If a "reusable" page somehow made it back into \
1566 * the active queue, it's been re-used and is not \
1567 * quite re-usable. \
1568 * If the VM object was "all_reusable", consider it \
1569 * as "all re-used" instead of converting it to \
1570 * "partially re-used", which could be expensive. \
1571 */ \
1572 assert(VM_PAGE_OBJECT((m)) == (obj)); \
1573 if ((m)->reusable || \
1574 (obj)->all_reusable) { \
1575 vm_object_reuse_pages((obj), \
1576 (m)->offset, \
1577 (m)->offset + PAGE_SIZE_64, \
1578 FALSE); \
1579 } \
1580 MACRO_END
1581
1582
1583 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
1584 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
1585
1586 #define FCS_IDLE 0
1587 #define FCS_DELAYED 1
1588 #define FCS_DEADLOCK_DETECTED 2
1589
1590 struct flow_control {
1591 int state;
1592 mach_timespec_t ts;
1593 };
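/*
 * Roughly: FCS_IDLE means no flow control is in effect; FCS_DELAYED means
 * the scan is waiting for the pageout queues to drain, with "ts" holding
 * the deadline derived from VM_PAGEOUT_DEADLOCK_WAIT; FCS_DEADLOCK_DETECTED
 * means that deadline passed without the laundry count dropping, so the
 * scan falls back to deadlock relief (cf. VM_PAGEOUT_DEADLOCK_RELIEF).
 */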
1594
1595 #if CONFIG_BACKGROUND_QUEUE
1596 uint64_t vm_pageout_considered_bq_internal = 0;
1597 uint64_t vm_pageout_considered_bq_external = 0;
1598 uint64_t vm_pageout_rejected_bq_internal = 0;
1599 uint64_t vm_pageout_rejected_bq_external = 0;
1600 #endif
1601 uint32_t vm_pageout_considered_page = 0;
1602 uint32_t vm_page_filecache_min = 0;
1603
1604 #define ANONS_GRABBED_LIMIT 2
1605
1606 #if CONFIG_SECLUDED_MEMORY
1607 extern vm_page_t vm_page_grab_secluded(void);
1608 uint64_t vm_pageout_freed_from_secluded = 0;
1609 uint64_t vm_pageout_secluded_reactivated = 0; /* debugging; how many secluded pages are found to be referenced on pageout (and are therefore reactivated) */
1610 uint64_t vm_pageout_secluded_burst_count = 0;
1611 #endif /* CONFIG_SECLUDED_MEMORY */
1612
1613 /*
1614 * vm_pageout_scan does the dirty work for the pageout daemon.
1615 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
1616 * held and vm_page_free_wanted == 0.
1617 */
1618 void
1619 vm_pageout_scan(void)
1620 {
1621 unsigned int loop_count = 0;
1622 unsigned int inactive_burst_count = 0;
1623 unsigned int active_burst_count = 0;
1624 unsigned int reactivated_this_call;
1625 unsigned int reactivate_limit;
1626 vm_page_t local_freeq = NULL;
1627 int local_freed = 0;
1628 int delayed_unlock;
1629 int delayed_unlock_limit = 0;
1630 int refmod_state = 0;
1631 int vm_pageout_deadlock_target = 0;
1632 struct vm_pageout_queue *iq;
1633 struct vm_pageout_queue *eq;
1634 struct vm_speculative_age_q *sq;
1635 struct flow_control flow_control = { 0, { 0, 0 } };
1636 boolean_t inactive_throttled = FALSE;
1637 boolean_t try_failed;
1638 mach_timespec_t ts;
1639 unsigned int msecs = 0;
1640 vm_object_t object;
1641 vm_object_t last_object_tried;
1642 uint32_t catch_up_count = 0;
1643 uint32_t inactive_reclaim_run;
1644 boolean_t exceeded_burst_throttle;
1645 boolean_t grab_anonymous = FALSE;
1646 boolean_t force_anonymous = FALSE;
1647 int anons_grabbed = 0;
1648 int page_prev_q_state = 0;
1649 #if CONFIG_BACKGROUND_QUEUE
1650 boolean_t ignore_reference = FALSE;
1651 #endif
1652 #if CONFIG_SECLUDED_MEMORY
1653 boolean_t ignore_reference_secluded;
1654 #endif /* CONFIG_SECLUDED_MEMORY */
1655 int cache_evict_throttle = 0;
1656 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
1657 int force_purge = 0;
1658 #define DELAY_SPECULATIVE_AGE 1000
1659 int delay_speculative_age = 0;
1660 vm_object_t m_object = VM_OBJECT_NULL;
1661
1662 #if VM_PRESSURE_EVENTS
1663 vm_pressure_level_t pressure_level;
1664 #endif /* VM_PRESSURE_EVENTS */
1665
1666 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
1667 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
1668 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
1669
1670 flow_control.state = FCS_IDLE;
1671 iq = &vm_pageout_queue_internal;
1672 eq = &vm_pageout_queue_external;
1673 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1674
1675
1676 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1677
1678
1679 vm_page_lock_queues();
1680 delayed_unlock = 1; /* must be nonzero if Qs are locked, 0 if unlocked */
1681
1682 /*
1683 * Calculate the max number of referenced pages on the inactive
1684 * queue that we will reactivate.
1685 */
1686 reactivated_this_call = 0;
1687 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
1688 vm_page_inactive_count);
1689 inactive_reclaim_run = 0;
1690
1691 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
1692
1693 /*
1694 * We want to gradually dribble pages from the active queue
1695 * to the inactive queue. If we let the inactive queue get
1696 * very small, and then suddenly dump many pages into it,
1697 * those pages won't get a sufficient chance to be referenced
1698 * before we start taking them from the inactive queue.
1699 *
1700 * We must limit the rate at which we send pages to the pagers
1701 * so that we don't tie up too many pages in the I/O queues.
1702 * We implement a throttling mechanism using the laundry count
1703 * to limit the number of pages outstanding to the default
1704 * and external pagers. We can bypass the throttles and look
1705 * for clean pages if the pageout queues don't drain in a timely
1706 * fashion since this may indicate that the pageout paths are
1707 * stalled waiting for memory, which only we can provide.
1708 */
1709
1710
1711 Restart:
1712
1713
1714 assert(delayed_unlock!=0);
1715
1716 /*
1717 * Recalculate vm_page_inactive_target.
1718 */
1719 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1720 vm_page_inactive_count +
1721 vm_page_speculative_count);
1722
1723 vm_page_anonymous_min = vm_page_inactive_target / 20;
1724
1725
1726 /*
1727 * don't want to wake the pageout_scan thread up every time we fall below
1728 * the targets... set a low water mark at 0.25% below the target
1729 */
1730 vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
1731
1732 if (vm_page_speculative_percentage > 50)
1733 vm_page_speculative_percentage = 50;
1734 else if (vm_page_speculative_percentage <= 0)
1735 vm_page_speculative_percentage = 1;
1736
1737 vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1738 vm_page_inactive_count);
1739
1740 object = NULL;
1741 last_object_tried = NULL;
1742 try_failed = FALSE;
1743
1744 if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
1745 catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
1746 else
1747 catch_up_count = 0;
1748
1749 for (;;) {
1750 vm_page_t m;
1751
1752 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1753
1754 #if CONFIG_SECLUDED_MEMORY
1755 if (vm_page_secluded_count > vm_page_secluded_target &&
1756 object != NULL) {
1757 vm_object_unlock(object);
1758 object = NULL;
1759 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1760 }
1761
1762 /*
1763 * Deal with secluded_q overflow.
1764 */
1765 if (vm_page_secluded_count > vm_page_secluded_target &&
1766 secluded_aging_policy == SECLUDED_AGING_FIFO) {
1767 unsigned int secluded_overflow;
1768 vm_page_t secluded_page;
1769
1770 /*
1771 * SECLUDED_AGING_FIFO:
1772 * No aging, just reclaim the excess pages
1773 * at the tail of the secluded queue.
1774 * We're reclaiming pages and we're not hogging
1775 * any global lock, so no need for throttling.
1776 */
1777
1778 secluded_overflow = (vm_page_secluded_count -
1779 vm_page_secluded_target);
1780 /* transfer to free queue */
1781 vm_page_unlock_queues();
1782 while (secluded_overflow--) {
1783 secluded_page = vm_page_grab_secluded();
1784 if (secluded_page == VM_PAGE_NULL) {
1785 break;
1786 }
1787 assert(secluded_page->busy);
1788 assert(secluded_page->pageq.next == 0 &&
1789 secluded_page->pageq.prev == 0);
1790
1791 secluded_page->snext = local_freeq;
1792 local_freeq = secluded_page;
1793 local_freed++;
1794 secluded_page = VM_PAGE_NULL;
1795 }
1796 } else if (vm_page_secluded_count > vm_page_secluded_target &&
1797 secluded_aging_policy == SECLUDED_AGING_ALONG_ACTIVE) {
1798 unsigned int secluded_overflow;
1799 vm_page_t secluded_page;
1800
1801 /*
1802 * SECLUDED_AGING_ALONG_ACTIVE:
1803 * There might be free pages at the tail of the
1804 * secluded queue:
1805 * just move them to the free queue (in batches).
1806 * There can also be an excessive number of "inuse"
1807 * pages:
1808 * we age them by resetting their "referenced" bit and
1809 * moving them to the inactive queue. Their trip
1810 * through the secluded queue was equivalent to a trip
1811 * through the active queue.
1812 *
1813 * We're holding the page queue lock, so we need
1814 * to throttle and give someone else a chance to
1815 * grab that lock if needed.
1816 *
1817 * We're also limiting the number of secluded "inuse"
1818 * pages that get moved to the inactive queue, using
1819 * the same "active_burst_count" method we use when
1820 * balancing the active and inactive queues, because
1821 * there can be a large number
1822 * of extra "inuse" pages and handling them gets in the
1823 * way of actually reclaiming memory.
1824 */
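/*
 * The loop below is bounded by two of the knobs described above:
 * active_burst_count caps how many "inuse" secluded pages are pushed to
 * the inactive queue in one pass, and delayed_unlock_limit caps how many
 * iterations we hold the page queue lock before flushing local_freeq
 * (or simply yielding the lock if there is nothing to free).
 */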
1825
1826 active_burst_count = MIN(vm_pageout_burst_active_throttle,
1827 vm_page_secluded_count_inuse);
1828 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT;
1829 delayed_unlock = 1;
1830 secluded_overflow = (vm_page_secluded_count -
1831 vm_page_secluded_target);
1832 while (secluded_overflow-- > 0 &&
1833 vm_page_secluded_count > vm_page_secluded_target) {
1834 assert((vm_page_secluded_count_free +
1835 vm_page_secluded_count_inuse) ==
1836 vm_page_secluded_count);
1837 secluded_page = vm_page_queue_first(&vm_page_queue_secluded);
1838 assert(secluded_page->vm_page_q_state ==
1839 VM_PAGE_ON_SECLUDED_Q);
1840 vm_page_queues_remove(secluded_page, FALSE);
1841 assert(!secluded_page->fictitious);
1842 assert(!VM_PAGE_WIRED(secluded_page));
1843 if (secluded_page->vm_page_object == 0) {
1844 /* transfer to free queue */
1845 assert(secluded_page->busy);
1846 secluded_page->snext = local_freeq;
1847 local_freeq = secluded_page;
1848 local_freed++;
1849 } else {
1850 /* transfer to head of inactive queue */
1851 pmap_clear_refmod_options(
1852 VM_PAGE_GET_PHYS_PAGE(secluded_page),
1853 VM_MEM_REFERENCED,
1854 PMAP_OPTIONS_NOFLUSH,
1855 (void *)NULL);
1856 vm_page_enqueue_inactive(secluded_page,
1857 FALSE);
1858 if (active_burst_count-- == 0) {
1859 vm_pageout_secluded_burst_count++;
1860 break;
1861 }
1862 }
1863 secluded_page = VM_PAGE_NULL;
1864 if (delayed_unlock++ > delayed_unlock_limit) {
1865 if (local_freeq) {
1866 vm_page_unlock_queues();
1867 VM_DEBUG_EVENT(
1868 vm_pageout_freelist,
1869 VM_PAGEOUT_FREELIST,
1870 DBG_FUNC_START,
1871 vm_page_free_count,
1872 local_freed,
1873 delayed_unlock_limit,
1874 1);
1875 vm_page_free_list(local_freeq,
1876 TRUE);
1877 VM_DEBUG_EVENT(
1878 vm_pageout_freelist,
1879 VM_PAGEOUT_FREELIST,
1880 DBG_FUNC_END,
1881 vm_page_free_count,
1882 0, 0, 1);
1883 local_freeq = NULL;
1884 local_freed = 0;
1885 vm_page_lock_queues();
1886 } else {
1887 lck_mtx_yield(&vm_page_queue_lock);
1888 }
1889 delayed_unlock = 1;
1890 }
1891 }
1892 delayed_unlock = 1;
1893 } else if (vm_page_secluded_count > vm_page_secluded_target &&
1894 secluded_aging_policy == SECLUDED_AGING_AFTER_INACTIVE) {
1895 /*
1896 * SECLUDED_AGING_AFTER_INACTIVE:
1897 * No balancing needed at this point: when we get to
1898 * the "choose a victim" part below, we'll consider the
1899 * extra secluded pages before any inactive page.
1900 */
1901 } else if (vm_page_secluded_count > vm_page_secluded_target &&
1902 secluded_aging_policy == SECLUDED_AGING_BEFORE_ACTIVE) {
1903 unsigned int secluded_overflow;
1904 vm_page_t secluded_page;
1905
1906 /*
1907 * SECLUDED_AGING_BEFORE_ACTIVE:
1908 * Excess secluded pages go to the active queue and
1909 * will later go to the inactive queue.
1910 */
1911 active_burst_count = MIN(vm_pageout_burst_active_throttle,
1912 vm_page_secluded_count_inuse);
1913 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT;
1914 delayed_unlock = 1;
1915 secluded_overflow = (vm_page_secluded_count -
1916 vm_page_secluded_target);
1917 while (secluded_overflow-- > 0 &&
1918 vm_page_secluded_count > vm_page_secluded_target) {
1919 assert((vm_page_secluded_count_free +
1920 vm_page_secluded_count_inuse) ==
1921 vm_page_secluded_count);
1922 secluded_page = vm_page_queue_first(&vm_page_queue_secluded);
1923 assert(secluded_page->vm_page_q_state ==
1924 VM_PAGE_ON_SECLUDED_Q);
1925 vm_page_queues_remove(secluded_page, FALSE);
1926 assert(!secluded_page->fictitious);
1927 assert(!VM_PAGE_WIRED(secluded_page));
1928 if (secluded_page->vm_page_object == 0) {
1929 /* transfer to free queue */
1930 assert(secluded_page->busy);
1931 secluded_page->snext = local_freeq;
1932 local_freeq = secluded_page;
1933 local_freed++;
1934 } else {
1935 /* transfer to head of active queue */
1936 vm_page_enqueue_active(secluded_page,
1937 FALSE);
1938 if (active_burst_count-- == 0) {
1939 vm_pageout_secluded_burst_count++;
1940 break;
1941 }
1942 }
1943 secluded_page = VM_PAGE_NULL;
1944 if (delayed_unlock++ > delayed_unlock_limit) {
1945 if (local_freeq) {
1946 vm_page_unlock_queues();
1947 VM_DEBUG_EVENT(
1948 vm_pageout_freelist,
1949 VM_PAGEOUT_FREELIST,
1950 DBG_FUNC_START,
1951 vm_page_free_count,
1952 local_freed,
1953 delayed_unlock_limit,
1954 1);
1955 vm_page_free_list(local_freeq,
1956 TRUE);
1957 VM_DEBUG_EVENT(
1958 vm_pageout_freelist,
1959 VM_PAGEOUT_FREELIST,
1960 DBG_FUNC_END,
1961 vm_page_free_count,
1962 0, 0, 1);
1963 local_freeq = NULL;
1964 local_freed = 0;
1965 vm_page_lock_queues();
1966 } else {
1967 lck_mtx_yield(&vm_page_queue_lock);
1968 }
1969 delayed_unlock = 1;
1970 }
1971 }
1972 delayed_unlock = 1;
1973 } else if (vm_page_secluded_count > vm_page_secluded_target) {
1974 panic("unsupported secluded_aging_policy %d\n",
1975 secluded_aging_policy);
1976 }
1977 if (local_freeq) {
1978 vm_page_unlock_queues();
1979 VM_DEBUG_EVENT(vm_pageout_freelist,
1980 VM_PAGEOUT_FREELIST,
1981 DBG_FUNC_START,
1982 vm_page_free_count,
1983 local_freed,
1984 0,
1985 0);
1986 vm_page_free_list(local_freeq, TRUE);
1987 VM_DEBUG_EVENT(vm_pageout_freelist,
1988 VM_PAGEOUT_FREELIST,
1989 DBG_FUNC_END,
1990 vm_page_free_count, 0, 0, 0);
1991 local_freeq = NULL;
1992 local_freed = 0;
1993 vm_page_lock_queues();
1994 }
1995 #endif /* CONFIG_SECLUDED_MEMORY */
1996
1997 assert(delayed_unlock);
1998
1999 if (vm_upl_wait_for_pages < 0)
2000 vm_upl_wait_for_pages = 0;
2001
2002 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
2003
2004 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX)
2005 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
2006
2007 /*
2008 * Move pages from active to inactive if we're below the target
2009 */
2010 /* if we are trying to make clean, we need to make sure we actually have inactive - mj */
2011 if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
2012 goto done_moving_active_pages;
2013
2014 if (object != NULL) {
2015 vm_object_unlock(object);
2016 object = NULL;
2017 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2018 }
2019 /*
2020 * Don't sweep through active queue more than the throttle
2021 * which should be kept relatively low
2022 */
2023 active_burst_count = MIN(vm_pageout_burst_active_throttle, vm_page_active_count);
2024
2025 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_START,
2026 vm_pageout_inactive, vm_pageout_inactive_used, vm_page_free_count, local_freed);
2027
2028 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_NONE,
2029 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
2030 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
2031 memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_START);
2032
2033
2034 while (!vm_page_queue_empty(&vm_page_queue_active) && active_burst_count--) {
2035
2036 vm_pageout_active++;
2037
2038 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2039
2040 assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q);
2041 assert(!m->laundry);
2042 assert(VM_PAGE_OBJECT(m) != kernel_object);
2043 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2044
2045 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2046
2047 /*
2048 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2049 *
2050 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2051 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2052 * new reference happens. If no futher references happen on the page after that remote TLB flushes
2053 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2054 * by pageout_scan, which is just fine since the last reference would have happened quite far
2055 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2056 * have happened before we moved the page
2057 */
2058 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2059
2060 /*
2061 * The page might be absent or busy,
2062 * but vm_page_deactivate can handle that.
2063 * FALSE indicates that we don't want a H/W clear reference
2064 */
2065 vm_page_deactivate_internal(m, FALSE);
2066
2067 if (delayed_unlock++ > delayed_unlock_limit) {
2068
2069 if (local_freeq) {
2070 vm_page_unlock_queues();
2071
2072 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2073 vm_page_free_count, local_freed, delayed_unlock_limit, 1);
2074
2075 vm_page_free_list(local_freeq, TRUE);
2076
2077 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2078 vm_page_free_count, 0, 0, 1);
2079
2080 local_freeq = NULL;
2081 local_freed = 0;
2082 vm_page_lock_queues();
2083 } else {
2084 lck_mtx_yield(&vm_page_queue_lock);
2085 }
2086
2087 delayed_unlock = 1;
2088
2089 /*
2090 * continue the while loop processing
2091 * the active queue... need to hold
2092 * the page queues lock
2093 */
2094 }
2095 }
2096
2097 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_END,
2098 vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count, vm_page_inactive_target);
2099 memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_END);
2100
2101 /**********************************************************************
2102 * above this point we're playing with the active queue
2103 * below this point we're playing with the throttling mechanisms
2104 * and the inactive queue
2105 **********************************************************************/
2106
2107 done_moving_active_pages:
2108
2109 #if CONFIG_BACKGROUND_QUEUE
2110 if ((vm_page_free_count + local_freed >= vm_page_free_target) &&
2111 ((vm_page_background_mode < VM_PAGE_BG_LEVEL_2) || (vm_page_background_count <= vm_page_background_target)))
2112 #else
2113 if (vm_page_free_count + local_freed >= vm_page_free_target)
2114 #endif
2115 {
2116 if (object != NULL) {
2117 vm_object_unlock(object);
2118 object = NULL;
2119 }
2120 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2121
2122 vm_page_unlock_queues();
2123
2124 if (local_freeq) {
2125
2126 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2127 vm_page_free_count, local_freed, delayed_unlock_limit, 2);
2128
2129 vm_page_free_list(local_freeq, TRUE);
2130
2131 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2132 vm_page_free_count, local_freed, 0, 2);
2133
2134 local_freeq = NULL;
2135 local_freed = 0;
2136 }
2137 vm_consider_waking_compactor_swapper();
2138
2139 vm_page_lock_queues();
2140
2141 /*
2142 * make sure the pageout I/O threads are running
2143 * throttled in case there are still requests
2144 * in the laundry... since we have met our targets
2145 * we don't need the laundry to be cleaned in a timely
2146 * fashion... so let's avoid interfering with foreground
2147 * activity
2148 */
2149 vm_pageout_adjust_io_throttles(iq, eq, TRUE);
2150
2151 /*
2152 * recalculate vm_page_inactive_target
2153 */
2154 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2155 vm_page_inactive_count +
2156 vm_page_speculative_count);
2157 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
2158 !vm_page_queue_empty(&vm_page_queue_active)) {
2159 /*
2160 * inactive target still not met... keep going
2161 * until we get the queues balanced...
2162 */
2163 continue;
2164 }
2165 lck_mtx_lock(&vm_page_queue_free_lock);
2166
2167 if ((vm_page_free_count >= vm_page_free_target) &&
2168 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2169 /*
2170 * done - we have met our target *and*
2171 * there is no one waiting for a page.
2172 */
2173 return_from_scan:
2174 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2175
2176 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
2177 vm_pageout_inactive, vm_pageout_inactive_used, 0, 0);
2178 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
2179 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
2180 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
2181
2182 return;
2183 }
2184 lck_mtx_unlock(&vm_page_queue_free_lock);
2185 }
2186
2187 /*
2188 * Before anything, we check if we have any ripe volatile
2189 * objects around. If so, try to purge the first object.
2190 * If the purge fails, fall through to reclaim a page instead.
2191 * If the purge succeeds, go back to the top and re-evaluate
2192 * the new memory situation.
2193 */
2194
2195 assert(available_for_purge >= 0);
2196 force_purge = 0; /* no force-purging */
2197
2198 #if VM_PRESSURE_EVENTS
2199 pressure_level = memorystatus_vm_pressure_level;
2200
2201 if (pressure_level > kVMPressureNormal) {
2202
2203 if (pressure_level >= kVMPressureCritical) {
2204 force_purge = memorystatus_purge_on_critical;
2205 } else if (pressure_level >= kVMPressureUrgent) {
2206 force_purge = memorystatus_purge_on_urgent;
2207 } else if (pressure_level >= kVMPressureWarning) {
2208 force_purge = memorystatus_purge_on_warning;
2209 }
2210 }
2211 #endif /* VM_PRESSURE_EVENTS */
2212
2213 if (available_for_purge || force_purge) {
2214
2215 if (object != NULL) {
2216 vm_object_unlock(object);
2217 object = NULL;
2218 }
2219
2220 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2221
2222 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2223 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
2224 vm_pageout_purged_objects++;
2225 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
2226 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2227 continue;
2228 }
2229 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2230 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2231 }
2232
2233 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
2234 /*
2235 * try to pull pages from the aging bins...
2236 * see vm_page.h for an explanation of how
2237 * this mechanism works
2238 */
2239 struct vm_speculative_age_q *aq;
2240 boolean_t can_steal = FALSE;
2241 int num_scanned_queues;
2242
2243 aq = &vm_page_queue_speculative[speculative_steal_index];
2244
2245 num_scanned_queues = 0;
2246 while (vm_page_queue_empty(&aq->age_q) &&
2247 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2248
2249 speculative_steal_index++;
2250
2251 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
2252 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2253
2254 aq = &vm_page_queue_speculative[speculative_steal_index];
2255 }
2256
2257 if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
2258 /*
2259 * XXX We've scanned all the speculative
2260 * queues but still haven't found one
2261 * that is not empty, even though
2262 * vm_page_speculative_count is not 0.
2263 *
2264 * report the anomaly...
2265 */
2266 printf("vm_pageout_scan: "
2267 "all speculative queues empty "
2268 "but count=%d. Re-adjusting.\n",
2269 vm_page_speculative_count);
2270 if (vm_page_speculative_count > vm_page_speculative_count_drift_max)
2271 vm_page_speculative_count_drift_max = vm_page_speculative_count;
2272 vm_page_speculative_count_drifts++;
2273 #if DEVELOPMENT || DEBUG
2274 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2275 #endif /* DEVELOPMENT || DEBUG */
2276 /* readjust... */
2277 vm_page_speculative_count = 0;
2278 /* ... and continue */
2279 continue;
2280 }
2281
2282 if (vm_page_speculative_count > vm_page_speculative_target)
2283 can_steal = TRUE;
2284 else {
2285 if (!delay_speculative_age) {
2286 mach_timespec_t ts_fully_aged;
2287
2288 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) / 1000;
2289 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) % 1000)
2290 * 1000 * NSEC_PER_USEC;
2291
2292 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2293
2294 clock_sec_t sec;
2295 clock_nsec_t nsec;
2296 clock_get_system_nanotime(&sec, &nsec);
2297 ts.tv_sec = (unsigned int) sec;
2298 ts.tv_nsec = nsec;
2299
2300 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
2301 can_steal = TRUE;
2302 else
2303 delay_speculative_age++;
2304 } else {
2305 delay_speculative_age++;
2306 if (delay_speculative_age == DELAY_SPECULATIVE_AGE)
2307 delay_speculative_age = 0;
2308 }
2309 }
2310 if (can_steal == TRUE)
2311 vm_page_speculate_ageit(aq);
2312 }
2313 #if CONFIG_BACKGROUND_QUEUE
2314 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0 &&
2315 ((vm_page_background_mode == VM_PAGE_BG_DISABLED) || (vm_page_background_count <= vm_page_background_target)))
2316 #else
2317 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0)
2318 #endif
2319 {
2320 int pages_evicted;
2321
2322 if (object != NULL) {
2323 vm_object_unlock(object);
2324 object = NULL;
2325 }
2326 pages_evicted = vm_object_cache_evict(100, 10);
2327
2328 if (pages_evicted) {
2329
2330 vm_pageout_cache_evicted += pages_evicted;
2331
2332 VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2333 vm_page_free_count, pages_evicted, vm_pageout_cache_evicted, 0);
2334 memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2335
2336 /*
2337 * we just freed up to 100 pages,
2338 * so go back to the top of the main loop
2339 * and re-evaluate the memory situation
2340 */
2341 continue;
2342 } else
2343 cache_evict_throttle = 100;
2344 }
2345 if (cache_evict_throttle)
2346 cache_evict_throttle--;
2347
2348 #if CONFIG_JETSAM
2349 /*
2350 * don't let the filecache_min fall below 15% of available memory
2351 * on systems with an active compressor that isn't nearing its
2352 * limits w/r to accepting new data
2353 *
2354 * on systems w/o the compressor/swapper, the filecache is always
2355 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2356 * since most (if not all) of the anonymous pages are in the
2357 * throttled queue (which isn't counted as available) which
2358 * effectively disables this filter
2359 */
2360 if (vm_compressor_low_on_space())
2361 vm_page_filecache_min = 0;
2362 else
2363 vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 7);
2364 #else
2365 /*
2366 * don't let the filecache_min fall below 33% of available memory...
2367 */
2368 vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 3);
2369 #endif
2370 if (vm_page_free_count < (vm_page_free_reserved / 4))
2371 vm_page_filecache_min = 0;
2372
2373 exceeded_burst_throttle = FALSE;
2374 /*
2375 * Sometimes we have to pause:
2376 * 1) No inactive pages - nothing to do.
2377 * 2) Loop control - no acceptable pages found on the inactive queue
2378 * within the last vm_pageout_burst_inactive_throttle iterations
2379 * 3) Flow control - default pageout queue is full
2380 */
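/*
 * Which wait is used for each pause (see the branches below):
 *   all reclaimable queues empty   -> msecs = vm_pageout_empty_wait
 *   burst throttle exceeded        -> msecs = vm_pageout_burst_wait
 *   compressor/swap throttled      -> msecs = vm_pageout_swap_wait
 *   default pageout queue full     -> flow_control state machine, using
 *                                     vm_pageout_deadlock_wait / vm_pageout_idle_wait
 */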
2381 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2382 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2383 vm_page_queue_empty(&sq->age_q)) {
2384 vm_pageout_scan_empty_throttle++;
2385 msecs = vm_pageout_empty_wait;
2386 goto vm_pageout_scan_delay;
2387
2388 } else if (inactive_burst_count >=
2389 MIN(vm_pageout_burst_inactive_throttle,
2390 (vm_page_inactive_count +
2391 vm_page_speculative_count))) {
2392 vm_pageout_scan_burst_throttle++;
2393 msecs = vm_pageout_burst_wait;
2394
2395 exceeded_burst_throttle = TRUE;
2396 goto vm_pageout_scan_delay;
2397
2398 } else if (vm_page_free_count > (vm_page_free_reserved / 4) &&
2399 VM_PAGEOUT_SCAN_NEEDS_TO_THROTTLE()) {
2400 vm_pageout_scan_swap_throttle++;
2401 msecs = vm_pageout_swap_wait;
2402 goto vm_pageout_scan_delay;
2403
2404 } else if (VM_PAGE_Q_THROTTLED(iq) &&
2405 VM_DYNAMIC_PAGING_ENABLED()) {
2406 clock_sec_t sec;
2407 clock_nsec_t nsec;
2408
2409 switch (flow_control.state) {
2410
2411 case FCS_IDLE:
2412 if ((vm_page_free_count + local_freed) < vm_page_free_target) {
2413
2414 if (object != NULL) {
2415 vm_object_unlock(object);
2416 object = NULL;
2417 }
2418 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2419
2420 vm_page_unlock_queues();
2421
2422 if (local_freeq) {
2423
2424 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2425 vm_page_free_count, local_freed, delayed_unlock_limit, 3);
2426
2427 vm_page_free_list(local_freeq, TRUE);
2428
2429 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2430 vm_page_free_count, local_freed, 0, 3);
2431
2432 local_freeq = NULL;
2433 local_freed = 0;
2434 }
2435 thread_yield_internal(1);
2436
2437 vm_page_lock_queues();
2438
2439 if (!VM_PAGE_Q_THROTTLED(iq)) {
2440 vm_pageout_scan_yield_unthrottled++;
2441 continue;
2442 }
2443 if (vm_page_pageable_external_count > vm_page_filecache_min &&
2444 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2445 anons_grabbed = ANONS_GRABBED_LIMIT;
2446 vm_pageout_scan_throttle_deferred++;
2447 goto consider_inactive;
2448 }
2449 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) && vm_page_active_count)
2450 continue;
2451 }
2452 reset_deadlock_timer:
2453 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
2454 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2455 clock_get_system_nanotime(&sec, &nsec);
2456 flow_control.ts.tv_sec = (unsigned int) sec;
2457 flow_control.ts.tv_nsec = nsec;
2458 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
2459
2460 flow_control.state = FCS_DELAYED;
2461 msecs = vm_pageout_deadlock_wait;
2462
2463 break;
2464
2465 case FCS_DELAYED:
2466 clock_get_system_nanotime(&sec, &nsec);
2467 ts.tv_sec = (unsigned int) sec;
2468 ts.tv_nsec = nsec;
2469
2470 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
2471 /*
2472 * the pageout thread for the default pager is potentially
2473 * deadlocked since the
2474 * default pager queue has been throttled for more than the
2475 * allowable time... we need to move some clean pages or dirty
2476 * pages belonging to the external pagers if they aren't throttled...
2477 * vm_page_free_wanted represents the number of threads currently
2478 * blocked waiting for pages... we'll move one page for each of
2479 * these plus a fixed amount to break the logjam... once we're done
2480 * moving this number of pages, we'll re-enter the FCS_DELAYED state
2481 * with a new timeout target since we have no way of knowing
2482 * whether we've broken the deadlock except through observation
2483 * of the queue associated with the default pager... we need to
2484 * stop moving pages and allow the system to run to see what
2485 * state it settles into.
2486 */
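/*
 * The enclosing flow_control switch implements these transitions:
 *   FCS_IDLE              -> FCS_DELAYED            queue still throttled and below the
 *                                                   free target; arm a deadlock timer of
 *                                                   vm_pageout_deadlock_wait and block
 *   FCS_DELAYED           -> FCS_DEADLOCK_DETECTED  timer expired; set
 *                                                   vm_pageout_deadlock_target and start
 *                                                   moving pages via consider_inactive
 *   FCS_DEADLOCK_DETECTED -> FCS_DELAYED            target drained; re-arm the timer
 *                                                   (reset_deadlock_timer)
 *   FCS_DELAYED           -> FCS_IDLE               the queue unthrottled while we were
 *                                                   preparing to block
 */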
2487 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
2488 vm_pageout_scan_deadlock_detected++;
2489 flow_control.state = FCS_DEADLOCK_DETECTED;
2490 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2491 goto consider_inactive;
2492 }
2493 /*
2494 * just resniff instead of trying
2495 * to compute a new delay time... we're going to be
2496 * awakened immediately upon a laundry completion,
2497 * so we won't wait any longer than necessary
2498 */
2499 msecs = vm_pageout_idle_wait;
2500 break;
2501
2502 case FCS_DEADLOCK_DETECTED:
2503 if (vm_pageout_deadlock_target)
2504 goto consider_inactive;
2505 goto reset_deadlock_timer;
2506
2507 }
2508 vm_pageout_scan_delay:
2509 if (object != NULL) {
2510 vm_object_unlock(object);
2511 object = NULL;
2512 }
2513 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2514
2515 vm_page_unlock_queues();
2516
2517 if (local_freeq) {
2518
2519 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2520 vm_page_free_count, local_freed, delayed_unlock_limit, 3);
2521
2522 vm_page_free_list(local_freeq, TRUE);
2523
2524 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2525 vm_page_free_count, local_freed, 0, 3);
2526
2527 local_freeq = NULL;
2528 local_freed = 0;
2529 }
2530 vm_consider_waking_compactor_swapper();
2531
2532 vm_page_lock_queues();
2533
2534 if (flow_control.state == FCS_DELAYED &&
2535 !VM_PAGE_Q_THROTTLED(iq)) {
2536 flow_control.state = FCS_IDLE;
2537 goto consider_inactive;
2538 }
2539
2540 if (vm_page_free_count >= vm_page_free_target) {
2541 /*
2542 * we're here because
2543 * 1) someone else freed up some pages while we had
2544 * the queues unlocked above
2545 * and we've hit one of the 3 conditions that
2546 * cause us to pause the pageout scan thread
2547 *
2548 * since we already have enough free pages,
2549 * let's avoid stalling and return normally
2550 *
2551 * before we return, make sure the pageout I/O threads
2552 * are running throttled in case there are still requests
2553 * in the laundry... since we have enough free pages
2554 * we don't need the laundry to be cleaned in a timely
2555 * fashion... so let's avoid interfering with foreground
2556 * activity
2557 *
2558 * we don't want to hold vm_page_queue_free_lock when
2559 * calling vm_pageout_adjust_io_throttles (since it
2560 * may cause other locks to be taken), we do the initial
2561 * check outside of the lock. Once we take the lock,
2562 * we recheck the condition since it may have changed.
2563 * if it has, no problem, we will make the threads
2564 * non-throttled before actually blocking
2565 */
2566 vm_pageout_adjust_io_throttles(iq, eq, TRUE);
2567 }
2568 lck_mtx_lock(&vm_page_queue_free_lock);
2569
2570 if (vm_page_free_count >= vm_page_free_target &&
2571 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2572 goto return_from_scan;
2573 }
2574 lck_mtx_unlock(&vm_page_queue_free_lock);
2575
2576 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2577 /*
2578 * we're most likely about to block due to one of
2579 * the 3 conditions that cause vm_pageout_scan to
2580 * not be able to make forward progress w/r
2581 * to providing new pages to the free queue,
2582 * so unthrottle the I/O threads in case we
2583 * have laundry to be cleaned... it needs
2584 * to be completed ASAP.
2585 *
2586 * even if we don't block, we want the io threads
2587 * running unthrottled since the sum of free +
2588 * clean pages is still under our free target
2589 */
2590 vm_pageout_adjust_io_throttles(iq, eq, FALSE);
2591 }
2592 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2593 /*
2594 * if we get here we're below our free target and
2595 * we're stalling due to a full laundry queue or
2596 * we don't have any inactive pages other than
2597 * those in the clean queue...
2598 * however, we have pages on the clean queue that
2599 * can be moved to the free queue, so let's not
2600 * stall the pageout scan
2601 */
2602 flow_control.state = FCS_IDLE;
2603 goto consider_inactive;
2604 }
2605 VM_CHECK_MEMORYSTATUS;
2606
2607 if (flow_control.state != FCS_IDLE)
2608 vm_pageout_scan_throttle++;
2609 iq->pgo_throttled = TRUE;
2610
2611 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
2612 counter(c_vm_pageout_scan_block++);
2613
2614 vm_page_unlock_queues();
2615
2616 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2617
2618 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2619 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2620 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2621
2622 thread_block(THREAD_CONTINUE_NULL);
2623
2624 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2625 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2626 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2627
2628 vm_page_lock_queues();
2629 delayed_unlock = 1;
2630
2631 iq->pgo_throttled = FALSE;
2632
2633 if (loop_count >= vm_page_inactive_count)
2634 loop_count = 0;
2635 inactive_burst_count = 0;
2636
2637 goto Restart;
2638 /*NOTREACHED*/
2639 }
2640
2641
2642 flow_control.state = FCS_IDLE;
2643 consider_inactive:
2644 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
2645 vm_pageout_inactive_external_forced_reactivate_limit);
2646 loop_count++;
2647 inactive_burst_count++;
2648 vm_pageout_inactive++;
2649
2650
2651 /*
2652 * Choose a victim.
2653 */
2654 while (1) {
2655 uint32_t inactive_external_count;
2656
2657 #if CONFIG_BACKGROUND_QUEUE
2658 ignore_reference = FALSE;
2659 #endif /* CONFIG_BACKGROUND_QUEUE */
2660
2661 m = NULL;
2662 m_object = VM_OBJECT_NULL;
2663
2664 if (VM_DYNAMIC_PAGING_ENABLED()) {
2665 assert(vm_page_throttled_count == 0);
2666 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2667 }
2668
2669
2670 #if CONFIG_SECLUDED_MEMORY
2671 if ((secluded_aging_policy ==
2672 SECLUDED_AGING_AFTER_INACTIVE) &&
2673 vm_page_secluded_count > vm_page_secluded_target) {
2674 /*
2675 * SECLUDED_AGING_AFTER_INACTIVE:
2676 * Secluded pages have already been aged
2677 * through the active and inactive queues, and
2678 * we now have too many of them, so let's
2679 * balance that queue by considering reclaiming
2680 * the oldest page in the secluded queue.
2681 */
2682 assert(!vm_page_queue_empty(&vm_page_queue_secluded));
2683 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_secluded);
2684 if (m->vm_page_object == 0) {
2685 /*
2686 * It's already a free page:
2687 * just move it to a free queue.
2688 */
2689 vm_page_queues_remove(m, TRUE);
2690 assert(m->busy);
2691 assert(m->pageq.next == 0);
2692 assert(m->pageq.prev == 0);
2693 m->snext = local_freeq;
2694 local_freeq = m;
2695 local_freed++;
2696 goto done_with_inactivepage;
2697 }
2698 /*
2699 * Not a free page: we've found our next
2700 * "victim".
2701 */
2702 break;
2703 }
2704 #endif /* CONFIG_SECLUDED_MEMORY */
2705
2706 #if CONFIG_BACKGROUND_QUEUE
2707 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2708 vm_object_t bg_m_object = NULL;
2709
2710 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2711
2712 bg_m_object = VM_PAGE_OBJECT(m);
2713
2714 if (!VM_PAGE_PAGEABLE(m)) {
2715 /*
2716 * This page is on the background queue
2717 * but not on a pageable queue. This is
2718 * likely a transient state and whoever
2719 * took it out of its pageable queue
2720 * will likely put it back on a pageable
2721 * queue soon but we can't deal with it
2722 * at this point, so let's ignore this
2723 * page.
2724 */
2725 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2726 ignore_reference = TRUE;
2727
2728 if (bg_m_object->internal)
2729 vm_pageout_considered_bq_internal++;
2730 else
2731 vm_pageout_considered_bq_external++;
2732
2733 break;
2734 }
2735 }
2736 #endif
2737
2738 /*
2739 * The most eligible pages are ones we paged in speculatively,
2740 * but which have not yet been touched.
2741 */
2742 if (!vm_page_queue_empty(&sq->age_q) && force_anonymous == FALSE) {
2743 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2744
2745 assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2746
2747 break;
2748 }
2749 /*
2750 * Try a clean-queue inactive page.
2751 */
2752 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2753 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2754
2755 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2756
2757 break;
2758 }
2759
2760 grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2761 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2762
2763 if ((vm_page_pageable_external_count < vm_page_filecache_min || force_anonymous == TRUE) ||
2764 ((inactive_external_count < vm_page_anonymous_count) && (inactive_external_count < (vm_page_pageable_external_count / 3)))) {
2765 grab_anonymous = TRUE;
2766 anons_grabbed = 0;
2767 }
2768 #if CONFIG_JETSAM
2769 /* If the file-backed pool has accumulated
2770 * significantly more pages than the jetsam
2771 * threshold, prefer to reclaim those
2772 * inline to minimise compute overhead of reclaiming
2773 * anonymous pages.
2774 * This calculation does not account for the CPU local
2775 * external page queues, as those are expected to be
2776 * much smaller relative to the global pools.
2777 */
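/*
 * In code terms: grab_anonymous is cleared (so a file-backed page is
 * stolen instead) whenever the file cache is above vm_page_filecache_min
 * and
 *
 *   vm_page_pageable_external_count * vm_pageout_memorystatus_fb_factor_dr >
 *       memorystatus_available_pages_critical * vm_pageout_memorystatus_fb_factor_nr
 */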
2778 if (grab_anonymous) {
2779 if (vm_page_pageable_external_count >
2780 vm_page_filecache_min) {
2781 if ((vm_page_pageable_external_count *
2782 vm_pageout_memorystatus_fb_factor_dr) >
2783 (memorystatus_available_pages_critical *
2784 vm_pageout_memorystatus_fb_factor_nr)) {
2785 grab_anonymous = FALSE;
2786 #if DEVELOPMENT || DEBUG
2787 vm_grab_anon_overrides++;
2788 #endif
2789 }
2790 }
2791 #if DEVELOPMENT || DEBUG
2792 if (grab_anonymous) {
2793 vm_grab_anon_nops++;
2794
2795 }
2796 #endif
2797 }
2798 #endif /* CONFIG_JETSAM */
2799
2800 if (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2801
2802 if ( !vm_page_queue_empty(&vm_page_queue_inactive) ) {
2803 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2804
2805 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2806 anons_grabbed = 0;
2807
2808 if (vm_page_pageable_external_count < vm_page_filecache_min) {
2809 if ((++reactivated_this_call % 100))
2810 goto must_activate_page;
2811 /*
2812 * steal 1% of the file backed pages even if
2813 * we are under the limit that has been set
2814 * for a healthy filecache
2815 */
2816 }
2817 break;
2818 }
2819 }
2820 if ( !vm_page_queue_empty(&vm_page_queue_anonymous) ) {
2821 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2822
2823 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2824 anons_grabbed++;
2825
2826 break;
2827 }
2828
2829 /*
2830 * if we've gotten here, we have no victim page.
2831 * if making clean, free the local freed list and return.
2832 * if making free, check to see if we've finished balancing the queues
2833 * yet; if we haven't, just continue, else panic
2834 */
2835 vm_page_unlock_queues();
2836
2837 if (object != NULL) {
2838 vm_object_unlock(object);
2839 object = NULL;
2840 }
2841 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2842
2843 if (local_freeq) {
2844 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2845 vm_page_free_count, local_freed, delayed_unlock_limit, 5);
2846
2847 vm_page_free_list(local_freeq, TRUE);
2848
2849 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2850 vm_page_free_count, local_freed, 0, 5);
2851
2852 local_freeq = NULL;
2853 local_freed = 0;
2854 }
2855 vm_page_lock_queues();
2856 delayed_unlock = 1;
2857
2858 force_anonymous = FALSE;
2859
2860 if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target)
2861 goto Restart;
2862
2863 if (!vm_page_queue_empty(&sq->age_q))
2864 goto Restart;
2865
2866 panic("vm_pageout: no victim");
2867
2868 /* NOTREACHED */
2869 }
2870 assert(VM_PAGE_PAGEABLE(m));
2871 m_object = VM_PAGE_OBJECT(m);
2872 force_anonymous = FALSE;
2873
2874 page_prev_q_state = m->vm_page_q_state;
2875 /*
2876 * we just found this page on one of our queues...
2877 * it can't also be on the pageout queue, so safe
2878 * to call vm_page_queues_remove
2879 */
2880 vm_page_queues_remove(m, TRUE);
2881
2882 assert(!m->laundry);
2883 assert(!m->private);
2884 assert(!m->fictitious);
2885 assert(m_object != kernel_object);
2886 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2887
2888
2889 if (page_prev_q_state != VM_PAGE_ON_SPECULATIVE_Q &&
2890 page_prev_q_state != VM_PAGE_ON_SECLUDED_Q)
2891 vm_pageout_stats[vm_pageout_stat_now].considered++;
2892
2893 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2894
2895 /*
2896 * check to see if we currently are working
2897 * with the same object... if so, we've
2898 * already got the lock
2899 */
2900 if (m_object != object) {
2901 /*
2902 * the object associated with candidate page is
2903 * different from the one we were just working
2904 * with... dump the lock if we still own it
2905 */
2906 if (object != NULL) {
2907 vm_object_unlock(object);
2908 object = NULL;
2909 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2910 }
2911 /*
2912 * Try to lock object; since we've already got the
2913 * page queues lock, we can only 'try' for this one.
2914 * if the 'try' fails, we need to do a mutex_pause
2915 * to allow the owner of the object lock a chance to
2916 * run... otherwise, we're likely to trip over this
2917 * object in the same state as we work our way through
2918 * the queue... clumps of pages associated with the same
2919 * object are fairly typical on the inactive and active queues
2920 */
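/*
 * On a failed try-lock the code below clears the page's reference state,
 * sets m_object->scan_collisions as a hint, records the object of the next
 * likely victim in vm_pageout_scan_wants_object so its owner can make it
 * available, and requeues the current page; try_failed then forces the
 * collected free pages to be dumped and a short pause before moving on.
 */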
2921 if (!vm_object_lock_try_scan(m_object)) {
2922 vm_page_t m_want = NULL;
2923
2924 vm_pageout_inactive_nolock++;
2925
2926 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
2927 vm_pageout_cleaned_nolock++;
2928
2929 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2930 m->reference = FALSE;
2931
2932 /*
2933 * m->object must be stable since we hold the page queues lock...
2934 * we can update the scan_collisions field sans the object lock
2935 * since it is a separate field and this is the only spot that does
2936 * a read-modify-write operation and it is never executed concurrently...
2937 * we can asynchronously set this field to 0 when creating a UPL, so it
2938 * is possible for the value to be a bit non-deterministic, but that's ok
2939 * since it's only used as a hint
2940 */
2941 m_object->scan_collisions = 1;
2942
2943 if ( !vm_page_queue_empty(&sq->age_q) )
2944 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2945 else if ( !vm_page_queue_empty(&vm_page_queue_cleaned))
2946 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2947 else if ( !vm_page_queue_empty(&vm_page_queue_inactive) &&
2948 (anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)))
2949 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2950 else if ( !vm_page_queue_empty(&vm_page_queue_anonymous))
2951 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2952
2953 /*
2954 * this is the next object we're going to be interested in
2955 * try to make sure its available after the mutex_yield
2956 * returns control
2957 */
2958 if (m_want)
2959 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2960
2961 /*
2962 * force us to dump any collected free pages
2963 * and to pause before moving on
2964 */
2965 try_failed = TRUE;
2966
2967 goto requeue_page;
2968 }
2969 object = m_object;
2970 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2971
2972 try_failed = FALSE;
2973 }
2974 assert(m_object == object);
2975 assert(VM_PAGE_OBJECT(m) == m_object);
2976
2977 if (catch_up_count)
2978 catch_up_count--;
2979
2980 if (m->busy) {
2981 if (m->encrypted_cleaning) {
2982 /*
2983 * ENCRYPTED SWAP:
2984 * if this page has already been picked up as
2985 * part of a page-out cluster, it will be busy
2986 * because it is being encrypted (see
2987 * vm_object_upl_request()). But we still
2988 * want to demote it from "clean-in-place"
2989 * (aka "adjacent") to "clean-and-free" (aka
2990 * "target"), so let's ignore its "busy" bit
2991 * here and proceed to check for "cleaning" a
2992 * little bit below...
2993 *
2994 * CAUTION CAUTION:
2995 * A "busy" page should still be left alone for
2996 * most purposes, so we have to be very careful
2997 * not to process that page too much.
2998 */
2999 assert(m->cleaning);
3000 goto consider_inactive_page;
3001 }
3002
3003 /*
3004 * Somebody is already playing with this page.
3005 * Put it back on the appropriate queue
3006 *
3007 */
3008 vm_pageout_inactive_busy++;
3009
3010 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3011 vm_pageout_cleaned_busy++;
3012 requeue_page:
3013 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
3014 vm_page_enqueue_inactive(m, FALSE);
3015 else
3016 vm_page_activate(m);
3017
3018 #if CONFIG_BACKGROUND_QUEUE
3019 if (ignore_reference == TRUE) {
3020 if (m_object->internal)
3021 vm_pageout_rejected_bq_internal++;
3022 else
3023 vm_pageout_rejected_bq_external++;
3024 }
3025 #endif
3026 goto done_with_inactivepage;
3027 }
3028
3029
3030 /*
3031 * If it's absent, in error or the object is no longer alive,
3032 * we can reclaim the page... in the no longer alive case,
3033 * there are 2 states the page can be in that preclude us
3034 * from reclaiming it - busy or cleaning - that we've already
3035 * dealt with
3036 */
3037 if (m->absent || m->error || !object->alive) {
3038
3039 if (m->absent)
3040 vm_pageout_inactive_absent++;
3041 else if (!object->alive)
3042 vm_pageout_inactive_notalive++;
3043 else
3044 vm_pageout_inactive_error++;
3045 reclaim_page:
3046 if (vm_pageout_deadlock_target) {
3047 vm_pageout_scan_inactive_throttle_success++;
3048 vm_pageout_deadlock_target--;
3049 }
3050
3051 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3052
3053 if (object->internal) {
3054 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3055 } else {
3056 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3057 }
3058 assert(!m->cleaning);
3059 assert(!m->laundry);
3060
3061 m->busy = TRUE;
3062
3063 /*
3064 * remove page from object here since we're already
3065 * behind the object lock... defer the rest of the work
3066 * we'd normally do in vm_page_free_prepare_object
3067 * until 'vm_page_free_list' is called
3068 */
3069 if (m->tabled)
3070 vm_page_remove(m, TRUE);
3071
3072 assert(m->pageq.next == 0 && m->pageq.prev == 0);
3073 m->snext = local_freeq;
3074 local_freeq = m;
3075 local_freed++;
3076
3077 #if CONFIG_SECLUDED_MEMORY
3078 if (page_prev_q_state == VM_PAGE_ON_SECLUDED_Q)
3079 vm_pageout_freed_from_secluded++;
3080 #endif /* CONFIG_SECLUDED_MEMORY */
3081 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
3082 vm_pageout_freed_from_speculative++;
3083 else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3084 vm_pageout_freed_from_cleaned++;
3085 else
3086 vm_pageout_freed_from_inactive_clean++;
3087
3088 if (page_prev_q_state != VM_PAGE_ON_SPECULATIVE_Q &&
3089 page_prev_q_state != VM_PAGE_ON_SECLUDED_Q)
3090 vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
3091
3092 inactive_burst_count = 0;
3093 goto done_with_inactivepage;
3094 }
3095 /*
3096 * If the object is empty, the page must be reclaimed even
3097 * if dirty or used.
3098 * If the page belongs to a volatile object, we stick it back
3099 * on.
3100 */
3101 if (object->copy == VM_OBJECT_NULL) {
3102 if (object->purgable == VM_PURGABLE_EMPTY) {
3103 if (m->pmapped == TRUE) {
3104 /* unmap the page */
3105 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3106 if (refmod_state & VM_MEM_MODIFIED) {
3107 SET_PAGE_DIRTY(m, FALSE);
3108 }
3109 }
3110 if (m->dirty || m->precious) {
3111 /* we saved the cost of cleaning this page ! */
3112 vm_page_purged_count++;
3113 }
3114 goto reclaim_page;
3115 }
3116
3117 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3118 /*
3119 * With the VM compressor, the cost of
3120 * reclaiming a page is much lower (no I/O),
3121 * so if we find a "volatile" page, it's better
3122 * to let it get compressed rather than letting
3123 * it occupy a full page until it gets purged.
3124 * So no need to check for "volatile" here.
3125 */
3126 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3127 /*
3128 * Avoid cleaning a "volatile" page which might
3129 * be purged soon.
3130 */
3131
3132 /* if it's wired, we can't put it on our queue */
3133 assert(!VM_PAGE_WIRED(m));
3134
3135 /* just stick it back on! */
3136 reactivated_this_call++;
3137
3138 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3139 vm_pageout_cleaned_volatile_reactivated++;
3140
3141 goto reactivate_page;
3142 }
3143 }
3144
3145 consider_inactive_page:
3146 if (m->busy) {
3147 /*
3148 * CAUTION CAUTION:
3149 * A "busy" page should always be left alone, except...
3150 */
3151 if (m->cleaning && m->encrypted_cleaning) {
3152 /*
3153 * ENCRYPTED_SWAP:
3154 * We could get here with a "busy" page
3155 * if it's being encrypted during a
3156 * "clean-in-place" operation. We'll deal
3157 * with it right away by testing if it has been
3158 * referenced and either reactivating it or
3159 * promoting it from "clean-in-place" to
3160 * "clean-and-free".
3161 */
3162 } else {
3163 panic("\"busy\" page considered for pageout\n");
3164 }
3165 }
3166
3167 /*
3168 * If it's being used, reactivate.
3169 * (Fictitious pages are either busy or absent.)
3170 * First, update the reference and dirty bits
3171 * to make sure the page is unreferenced.
3172 */
3173 refmod_state = -1;
3174
3175 if (m->reference == FALSE && m->pmapped == TRUE) {
3176 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3177
3178 if (refmod_state & VM_MEM_REFERENCED)
3179 m->reference = TRUE;
3180 if (refmod_state & VM_MEM_MODIFIED) {
3181 SET_PAGE_DIRTY(m, FALSE);
3182 }
3183 }
3184
3185 /*
3186 * if (m->cleaning && !m->free_when_done)
3187 * If already cleaning this page in place and it hasn't
3188 * been recently referenced, just pull off the queue.
3189 * We can leave the page mapped, and upl_commit_range
3190 * will put it on the clean queue.
3191 *
3192 * note: if m->encrypted_cleaning == TRUE, then
3193 * m->cleaning == TRUE
3194 * and we'll handle it here
3195 *
3196 * if (m->free_when_done && !m->cleaning)
3197 * an msync INVALIDATE is in progress...
3198 * this page has been marked for destruction
3199 * after it has been cleaned,
3200 * but not yet gathered into a UPL
3201 * where 'cleaning' will be set...
3202 * just leave it off the paging queues
3203 *
3204 * if (m->free_when_done && m->cleaning)
3205 * an msync INVALIDATE is in progress
3206 * and the UPL has already gathered this page...
3207 * just leave it off the paging queues
3208 */
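/*
 * The checks that follow boil down to:
 *
 *   free_when_done  cleaning  referenced   action
 *   --------------  --------  ----------   --------------------------------
 *       TRUE          any        any       leave it off the paging queues
 *       FALSE         TRUE       TRUE      reactivate it
 *       FALSE         TRUE       FALSE     leave it off the paging queues
 *       FALSE         FALSE       -        fall through to the steal path
 */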
3209
3210 /*
3211 * page with m->free_when_done and still on the queues means that an
3212 * MS_INVALIDATE is in progress on this page... leave it alone
3213 */
3214 if (m->free_when_done) {
3215 goto done_with_inactivepage;
3216 }
3217
3218 /* if cleaning, reactivate if referenced. otherwise, just pull off queue */
3219 if (m->cleaning) {
3220 if (m->reference == TRUE) {
3221 reactivated_this_call++;
3222 goto reactivate_page;
3223 } else {
3224 goto done_with_inactivepage;
3225 }
3226 }
3227
3228 if (m->reference || m->dirty) {
3229 /* deal with a rogue "reusable" page */
3230 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3231 }
3232
3233 #if CONFIG_SECLUDED_MEMORY
3234 if (secluded_for_filecache &&
3235 vm_page_secluded_target > 0 &&
3236 m_object->eligible_for_secluded &&
3237 secluded_aging_policy == SECLUDED_AGING_FIFO) {
3238 /*
3239 * SECLUDED_AGING_FIFO:
3240 * This victim page is eligible for the secluded pool
3241 * and we're not aging secluded pages, so let's not
3242 * reactivate it if it's been re-referenced.
3243 * Later on, we'll move it to the secluded queue
3244 * instead of freeing it.
3245 */
3246 ignore_reference_secluded = TRUE;
3247 } else {
3248 ignore_reference_secluded = FALSE;
3249 }
3250 #endif /* CONFIG_SECLUDED_MEMORY */
3251
3252 if (!m->no_cache &&
3253 #if CONFIG_BACKGROUND_QUEUE
3254 ignore_reference == FALSE &&
3255 #endif
3256 #if CONFIG_SECLUDED_MEMORY
3257 ignore_reference_secluded == FALSE &&
3258 #endif /* CONFIG_SECLUDED_MEMORY */
3259 (m->reference ||
3260 (m->xpmapped && !object->internal && (vm_page_xpmapped_external_count < (vm_page_external_count / 4))))) {
3261 /*
3262 * The page we pulled off the inactive list has
3263 * been referenced. It is possible for other
3264 * processors to be touching pages faster than we
3265 * can clear the referenced bit and traverse the
3266 * inactive queue, so we limit the number of
3267 * reactivations.
3268 */
3269 if (++reactivated_this_call >= reactivate_limit) {
3270 vm_pageout_reactivation_limit_exceeded++;
3271 } else if (catch_up_count) {
3272 vm_pageout_catch_ups++;
3273 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3274 vm_pageout_inactive_force_reclaim++;
3275 } else {
3276 uint32_t isinuse;
3277
3278 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3279 vm_pageout_cleaned_reference_reactivated++;
3280
3281 reactivate_page:
3282 if ( !object->internal && object->pager != MEMORY_OBJECT_NULL &&
3283 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3284 /*
3285 * no explicit mappings of this object exist
3286 * and it's not open via the filesystem
3287 */
3288 vm_page_deactivate(m);
3289 vm_pageout_inactive_deactivated++;
3290 } else {
3291 must_activate_page:
3292 /*
3293 * The page was/is being used, so put back on active list.
3294 */
3295 vm_page_activate(m);
3296 VM_STAT_INCR(reactivations);
3297 inactive_burst_count = 0;
3298 }
3299 #if CONFIG_BACKGROUND_QUEUE
3300 if (ignore_reference == TRUE) {
3301 if (m_object->internal)
3302 vm_pageout_rejected_bq_internal++;
3303 else
3304 vm_pageout_rejected_bq_external++;
3305 }
3306 #endif
3307 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3308 vm_pageout_cleaned_reactivated++;
3309 #if CONFIG_SECLUDED_MEMORY
3310 if (page_prev_q_state == VM_PAGE_ON_SECLUDED_Q)
3311 vm_pageout_secluded_reactivated++;
3312 #endif /* CONFIG_SECLUDED_MEMORY */
3313
3314 vm_pageout_inactive_used++;
3315
3316 goto done_with_inactivepage;
3317 }
3318 /*
3319 * Make sure we call pmap_get_refmod() if it
3320 * wasn't already called just above, to update
3321 * the dirty bit.
3322 */
3323 if ((refmod_state == -1) && !m->dirty && m->pmapped) {
3324 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3325 if (refmod_state & VM_MEM_MODIFIED) {
3326 SET_PAGE_DIRTY(m, FALSE);
3327 }
3328 }
3329 }
3330
3331 XPR(XPR_VM_PAGEOUT,
3332 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
3333 object, m->offset, m, 0,0);
3334
3335 /*
3336 * we've got a candidate page to steal...
3337 *
3338 * m->dirty is up to date courtesy of the
3339 * preceding check for m->reference... if
3340 * we get here, then m->reference had to be
3341 * FALSE (or possibly "reactivate_limit" was
3342 * exceeded), but in either case we called
3343 * pmap_get_refmod() and updated both
3344 * m->reference and m->dirty
3345 *
3346 * if it's dirty or precious we need to
3347 * see if the target queue is throttled...
3348 * if it is, we need to skip over it by moving it back
3349 * to the end of the inactive queue
3350 */
3351
3352 inactive_throttled = FALSE;
3353
3354 if (m->dirty || m->precious) {
3355 if (object->internal) {
3356 if (VM_PAGE_Q_THROTTLED(iq))
3357 inactive_throttled = TRUE;
3358 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3359 inactive_throttled = TRUE;
3360 }
3361 }
3362 throttle_inactive:
3363 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3364 object->internal && m->dirty &&
3365 (object->purgable == VM_PURGABLE_DENY ||
3366 object->purgable == VM_PURGABLE_NONVOLATILE ||
3367 object->purgable == VM_PURGABLE_VOLATILE)) {
3368 vm_page_check_pageable_safe(m);
3369 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3370 vm_page_queue_enter(&vm_page_queue_throttled, m,
3371 vm_page_t, pageq);
3372 m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
3373 vm_page_throttled_count++;
3374
3375 vm_pageout_scan_reclaimed_throttled++;
3376
3377 inactive_burst_count = 0;
3378 goto done_with_inactivepage;
3379 }
3380 if (inactive_throttled == TRUE) {
3381
3382 if (object->internal == FALSE) {
3383 /*
3384 * we need to break up the following potential deadlock case...
3385 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
3386 * b) The thread doing the writing is waiting for pages while holding the truncate lock
3387 * c) Most of the pages in the inactive queue belong to this file.
3388 *
3389 * we are potentially in this deadlock because...
3390 * a) the external pageout queue is throttled
3391 * b) we're done with the active queue and moved on to the inactive queue
3392 * c) we've got a dirty external page
3393 *
3394 * since we don't know the reason for the external pageout queue being throttled we
3395 * must suspect that we are deadlocked, so move the current page onto the active queue
3396 * in an effort to cause a page from the active queue to 'age' to the inactive queue
3397 *
3398 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
3399 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
3400 * pool the next time we select a victim page... if we can make enough new free pages,
3401 * the deadlock will break, the external pageout queue will empty and it will no longer
3402 * be throttled
3403 *
3404 * if we have jetsam configured, keep a count of the pages reactivated this way so
3405 * that we can try to find clean pages in the active/inactive queues before
3406 * deciding to jetsam a process
3407 */
3408 vm_pageout_scan_inactive_throttled_external++;
3409
3410 vm_page_check_pageable_safe(m);
3411 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3412 vm_page_queue_enter(&vm_page_queue_active, m, vm_page_t, pageq);
3413 m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
3414 vm_page_active_count++;
3415 vm_page_pageable_external_count++;
3416
3417 vm_pageout_adjust_io_throttles(iq, eq, FALSE);
3418
3419 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
3420 vm_pageout_inactive_external_forced_reactivate_limit--;
3421
3422 if (vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
3423 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3424 /*
3425 * Possible deadlock scenario so request jetsam action
3426 */
3427 assert(object);
3428 vm_object_unlock(object);
3429 object = VM_OBJECT_NULL;
3430 vm_page_unlock_queues();
3431
3432 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
3433 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
3434
3435 /* Kill first suitable process */
3436 if (memorystatus_kill_on_VM_page_shortage(FALSE) == FALSE) {
3437 panic("vm_pageout_scan: Jetsam request failed\n");
3438 }
3439
3440 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END, 0, 0, 0, 0);
3441
3442 vm_pageout_inactive_external_forced_jetsam_count++;
3443 vm_page_lock_queues();
3444 delayed_unlock = 1;
3445 }
3446 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
3447 force_anonymous = TRUE;
3448 #endif
3449 inactive_burst_count = 0;
3450 goto done_with_inactivepage;
3451 } else {
3452 vm_pageout_scan_inactive_throttled_internal++;
3453
3454 goto must_activate_page;
3455 }
3456 }
3457
3458 /*
3459 * we've got a page that we can steal...
3460 * eliminate all mappings and make sure
3461 * we have the up-to-date modified state
3462 *
3463 * if we need to do a pmap_disconnect then we
3464 * need to re-evaluate m->dirty since the pmap_disconnect
3465 * provides the true state atomically... the
3466 * page was still mapped up to the pmap_disconnect
3467 * and may have been dirtied at the last microsecond
3468 *
3469 * Note that if 'pmapped' is FALSE then the page is not in,
3470 * and has not been in, any map, so there is no point calling
3471 * pmap_disconnect(). m->dirty could have been set in anticipation
3472 * of likely usage of the page.
3473 */
3474 if (m->pmapped == TRUE) {
3475 int pmap_options;
3476
3477 /*
3478 * Don't count this page as going into the compressor
3479 * if any of these are true:
3480 * 1) compressed pager isn't enabled
3481 * 2) Freezer enabled device with compressed pager
3482 * backend (exclusive use) i.e. most of the VM system
3483 * (including vm_pageout_scan) has no knowledge of
3484 * the compressor
3485 * 3) This page belongs to a file and hence will not be
3486 * sent into the compressor
3487 */
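/*
 * Which translates to the pmap_options chosen below:
 *   compressor not active, or file-backed page  -> 0
 *   dirty/precious internal page                -> PMAP_OPTIONS_COMPRESSOR
 *   other internal page                         -> PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED
 */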
3488 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3489 object->internal == FALSE) {
3490 pmap_options = 0;
3491 } else if (m->dirty || m->precious) {
3492 /*
3493 * VM knows that this page is dirty (or
3494 * precious) and needs to be compressed
3495 * rather than freed.
3496 * Tell the pmap layer to count this page
3497 * as "compressed".
3498 */
3499 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3500 } else {
3501 /*
3502 * VM does not know if the page needs to
3503 * be preserved but the pmap layer might tell
3504 * us if any mapping has "modified" it.
3505 * Let the pmap layer count this page
3506 * as compressed if and only if it has been
3507 * modified.
3508 */
3509 pmap_options =
3510 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3511 }
3512 refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3513 pmap_options,
3514 NULL);
3515 if (refmod_state & VM_MEM_MODIFIED) {
3516 SET_PAGE_DIRTY(m, FALSE);
3517 }
3518 }
3519 /*
3520 * reset our count of pages that have been reclaimed
3521 * since the last page was 'stolen'
3522 */
3523 inactive_reclaim_run = 0;
3524
3525 /*
3526 * If it's clean and not precious, we can free the page.
3527 */
3528 if (!m->dirty && !m->precious) {
3529
3530 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
3531 vm_pageout_speculative_clean++;
3532 else {
3533 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q)
3534 vm_pageout_inactive_anonymous++;
3535 else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3536 vm_pageout_cleaned_reclaimed++;
3537
3538 vm_pageout_inactive_clean++;
3539 }
3540
3541 #if CONFIG_SECLUDED_MEMORY
3542 if (secluded_for_filecache &&
3543 vm_page_secluded_target > 0 &&
3544 !m->fictitious &&
3545 m_object->eligible_for_secluded &&
3546 num_tasks_can_use_secluded_mem == 0 &&
3547 (secluded_aging_policy == SECLUDED_AGING_FIFO ||
3548 ((secluded_aging_policy ==
3549 SECLUDED_AGING_AFTER_INACTIVE) &&
3550 (page_prev_q_state != VM_PAGE_ON_SECLUDED_Q)))) {
3551 assert(page_prev_q_state != VM_PAGE_ON_SECLUDED_Q);
3552 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3553 LCK_MTX_ASSERT(&vm_page_queue_lock,
3554 LCK_MTX_ASSERT_OWNED);
3555 vm_page_queue_enter(&vm_page_queue_secluded,
3556 m,
3557 vm_page_t,
3558 pageq);
3559 m->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q;
3560 vm_object_unlock(m_object);
3561 object = VM_OBJECT_NULL;
3562 vm_page_secluded_count++;
3563 vm_page_secluded_count_inuse++;
3564 assert(!m_object->internal);
3565 // vm_page_pageable_external_count++;
3566 m = VM_PAGE_NULL;
3567 goto done_with_inactivepage;
3568 }
3569 #endif /* CONFIG_SECLUDED_MEMORY */
3570
3571 /*
3572 * OK, at this point we have found a page we are going to free.
3573 */
3574 #if CONFIG_PHANTOM_CACHE
3575 if (!object->internal)
3576 vm_phantom_cache_add_ghost(m);
3577 #endif
3578 goto reclaim_page;
3579 }
3580
3581 /*
3582 * The page may have been dirtied since the last check
3583 * for a throttled target queue (which may have been skipped
3584 * if the page was clean then). With the dirty page
3585 * disconnected here, we can make one final check.
3586 */
3587 if (object->internal) {
3588 if (VM_PAGE_Q_THROTTLED(iq))
3589 inactive_throttled = TRUE;
3590 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3591 inactive_throttled = TRUE;
3592 }
3593
3594 if (inactive_throttled == TRUE)
3595 goto throttle_inactive;
3596
3597 #if VM_PRESSURE_EVENTS
3598 #if CONFIG_JETSAM
3599
3600 /*
3601 * If Jetsam is enabled, then the sending
3602 * of memory pressure notifications is handled
3603 * from the same thread that takes care of high-water
3604 * and other jetsams i.e. the memorystatus_thread.
3605 */
3606
3607 #else /* CONFIG_JETSAM */
3608
3609 vm_pressure_response();
3610
3611 #endif /* CONFIG_JETSAM */
3612 #endif /* VM_PRESSURE_EVENTS */
3613
3614 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q)
3615 vm_pageout_inactive_anonymous++;
3616 if (object->internal)
3617 vm_pageout_inactive_dirty_internal++;
3618 else
3619 vm_pageout_inactive_dirty_external++;
3620
3621 /*
3622 * do NOT set the pageout bit!
3623 * sure, we might need free pages, but this page is going to take time to become free
3624 * anyway, so we may as well put it on the clean queue first and take it from there later
3625 * if necessary. that way, we'll ensure we don't free up too much. -mj
3626 */
3627 vm_pageout_cluster(m, FALSE, FALSE);
3628
3629 done_with_inactivepage:
3630
3631 if (delayed_unlock++ > delayed_unlock_limit || try_failed == TRUE) {
3632 boolean_t need_delay = TRUE;
3633
3634 if (object != NULL) {
3635 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3636 vm_object_unlock(object);
3637 object = NULL;
3638 }
3639 vm_page_unlock_queues();
3640
3641 if (local_freeq) {
3642
3643 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
3644 vm_page_free_count, local_freed, delayed_unlock_limit, 4);
3645
3646 vm_page_free_list(local_freeq, TRUE);
3647
3648 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
3649 vm_page_free_count, local_freed, 0, 4);
3650
3651 local_freeq = NULL;
3652 local_freed = 0;
3653 need_delay = FALSE;
3654 }
3655 vm_consider_waking_compactor_swapper();
3656
3657 vm_page_lock_queues();
3658
3659 if (need_delay == TRUE)
3660 lck_mtx_yield(&vm_page_queue_lock);
3661
3662 delayed_unlock = 1;
3663 }
3664 vm_pageout_considered_page++;
3665
3666 /*
3667 * back to top of pageout scan loop
3668 */
3669 }
3670 }
3671
3672
3673 int vm_page_free_count_init;
3674
3675 void
3676 vm_page_free_reserve(
3677 int pages)
3678 {
3679 int free_after_reserve;
3680
3681 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3682
3683 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT))
3684 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3685 else
3686 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3687
3688 } else {
3689 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT)
3690 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3691 else
3692 vm_page_free_reserved += pages;
3693 }
3694 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
3695
3696 vm_page_free_min = vm_page_free_reserved +
3697 VM_PAGE_FREE_MIN(free_after_reserve);
3698
3699 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
3700 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3701
3702 vm_page_free_target = vm_page_free_reserved +
3703 VM_PAGE_FREE_TARGET(free_after_reserve);
3704
3705 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
3706 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3707
3708 if (vm_page_free_target < vm_page_free_min + 5)
3709 vm_page_free_target = vm_page_free_min + 5;
3710
3711 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3712 }
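/*
 * A minimal usage sketch (not compiled; the helper name below is
 * hypothetical): the argument is the number of additional pages to add to
 * the reserve, and passing 0 simply re-derives vm_page_free_min,
 * vm_page_free_target and vm_page_throttle_limit from the current reserve
 * and vm_page_free_count_init.
 */
#if 0
static void
example_recompute_free_thresholds(void)
{
	vm_page_free_reserve(0);

	/* the function always leaves free_target at least 5 pages above free_min */
	assert(vm_page_free_target >= vm_page_free_min + 5);
}
#endif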
3713
3714 /*
3715 * vm_pageout is the high level pageout daemon.
3716 */
3717
3718 void
3719 vm_pageout_continue(void)
3720 {
3721 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3722 vm_pageout_scan_event_counter++;
3723
3724 lck_mtx_lock(&vm_page_queue_free_lock);
3725 vm_pageout_running = TRUE;
3726 lck_mtx_unlock(&vm_page_queue_free_lock);
3727
3728 vm_pageout_scan();
3729 /*
3730 * we hold both the vm_page_queue_free_lock
3731 * and the vm_page_queue_lock at this point
3732 */
3733 assert(vm_page_free_wanted == 0);
3734 assert(vm_page_free_wanted_privileged == 0);
3735 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3736
3737 vm_pageout_running = FALSE;
3738 if (vm_pageout_waiter) {
3739 vm_pageout_waiter = FALSE;
3740 thread_wakeup((event_t)&vm_pageout_waiter);
3741 }
3742
3743 lck_mtx_unlock(&vm_page_queue_free_lock);
3744 vm_page_unlock_queues();
3745
3746 counter(c_vm_pageout_block++);
3747 thread_block((thread_continue_t)vm_pageout_continue);
3748 /*NOTREACHED*/
3749 }
3750
3751 kern_return_t
3752 vm_pageout_wait(uint64_t deadline)
3753 {
3754 kern_return_t kr;
3755
3756 lck_mtx_lock(&vm_page_queue_free_lock);
3757 for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr); ) {
3758 vm_pageout_waiter = TRUE;
3759 if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3760 &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3761 (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3762 kr = KERN_OPERATION_TIMED_OUT;
3763 }
3764 }
3765 lck_mtx_unlock(&vm_page_queue_free_lock);
3766
3767 return (kr);
3768 }
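/*
 * A minimal usage sketch (not compiled; the caller name is hypothetical and
 * clock_interval_to_deadline() is assumed to be available here): wait up to
 * one second for the in-flight pageout pass to complete.
 */
#if 0
static void
example_wait_for_pageout(void)
{
	uint64_t	deadline;
	kern_return_t	kr;

	/* vm_pageout_wait() expects an absolute deadline */
	clock_interval_to_deadline(1, NSEC_PER_SEC, &deadline);

	kr = vm_pageout_wait(deadline);
	if (kr == KERN_OPERATION_TIMED_OUT) {
		/* the pageout daemon was still running when the deadline expired */
	}
}
#endif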
3769
3770
3771 static void
3772 vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
3773 {
3774 vm_page_t m = NULL;
3775 vm_object_t object;
3776 vm_object_offset_t offset;
3777 memory_object_t pager;
3778
3779
3780 if (vm_pageout_internal_iothread != THREAD_NULL)
3781 current_thread()->options &= ~TH_OPT_VMPRIV;
3782
3783 vm_page_lockspin_queues();
3784
3785 while ( !vm_page_queue_empty(&q->pgo_pending) ) {
3786
3787 q->pgo_busy = TRUE;
3788 vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
3789
3790 assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q);
3791 VM_PAGE_CHECK(m);
3792 /*
3793 * grab a snapshot of the object and offset this
3794 * page is tabled in so that we can relookup this
3795 * page after we've taken the object lock - these
3796 * fields are stable while we hold the page queues lock
3797 * but as soon as we drop it, there is nothing to keep
3798 * this page in this object... we hold an activity_in_progress
3799 * on this object which will keep it from terminating
3800 */
3801 object = VM_PAGE_OBJECT(m);
3802 offset = m->offset;
3803
3804 if (object->object_slid) {
3805 panic("slid page %p not allowed on this path\n", m);
3806 }
3807 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
3808 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3809
3810 vm_page_unlock_queues();
3811
3812 vm_object_lock(object);
3813
3814 m = vm_page_lookup(object, offset);
3815
3816 if (m == NULL ||
3817 m->busy || m->cleaning || !m->laundry || (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
3818 /*
3819 * it's either the same page that someone else has
3820 * started cleaning (or it's finished cleaning or
3821 * been put back on the pageout queue), or
3822 * the page has been freed or we have found a
3823 * new page at this offset... in all of these cases
3824 * we merely need to release the activity_in_progress
3825 * we took when we put the page on the pageout queue
3826 */
3827 vm_object_activity_end(object);
3828 vm_object_unlock(object);
3829
3830 vm_page_lockspin_queues();
3831 continue;
3832 }
3833 pager = object->pager;
3834
3835 if (pager == MEMORY_OBJECT_NULL) {
3836 /*
3837 * This pager has been destroyed by either
3838 * memory_object_destroy or vm_object_destroy, and
3839 * so there is nowhere for the page to go.
3840 */
3841 if (m->free_when_done) {
3842 /*
3843 * Just free the page... VM_PAGE_FREE takes
3844 * care of cleaning up all the state...
3845 * including doing the vm_pageout_throttle_up
3846 */
3847 VM_PAGE_FREE(m);
3848 } else {
3849 vm_page_lockspin_queues();
3850
3851 vm_pageout_throttle_up(m);
3852 vm_page_activate(m);
3853
3854 vm_page_unlock_queues();
3855
3856 /*
3857 * And we are done with it.
3858 */
3859 }
3860 vm_object_activity_end(object);
3861 vm_object_unlock(object);
3862
3863 vm_page_lockspin_queues();
3864 continue;
3865 }
3866 #if 0
3867 /*
3868 * we don't hold the page queue lock
3869 * so this check isn't safe to make
3870 */
3871 VM_PAGE_CHECK(m);
3872 #endif
3873 /*
3874 * give back the activity_in_progress reference we
3875 * took when we queued up this page and replace it
3876 * with a paging_in_progress reference that will
3877 * also keep the paging offset from changing and
3878 * prevent the object from terminating
3879 */
3880 vm_object_activity_end(object);
3881 vm_object_paging_begin(object);
3882 vm_object_unlock(object);
3883
3884 /*
3885 * Send the data to the pager.
3886 * any pageout clustering happens there
3887 */
3888 memory_object_data_return(pager,
3889 m->offset + object->paging_offset,
3890 PAGE_SIZE,
3891 NULL,
3892 NULL,
3893 FALSE,
3894 FALSE,
3895 0);
3896
3897 vm_object_lock(object);
3898 vm_object_paging_end(object);
3899 vm_object_unlock(object);
3900
3901 vm_pageout_io_throttle();
3902
3903 vm_page_lockspin_queues();
3904 }
3905 q->pgo_busy = FALSE;
3906 q->pgo_idle = TRUE;
3907
3908 assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
3909 vm_page_unlock_queues();
3910
3911 thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
3912 /*NOTREACHED*/
3913 }
3914
3915
3916 uint32_t vm_compressor_failed;
3917
3918 #define MAX_FREE_BATCH 32
3919 uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
3920 * this thread.
3921 */
3922 uint64_t vm_compressor_thread_runtime;
3923
3924 static void
3925 vm_pageout_iothread_internal_continue(struct cq *cq)
3926 {
3927 struct vm_pageout_queue *q;
3928 vm_page_t m = NULL;
3929 boolean_t pgo_draining;
3930 vm_page_t local_q;
3931 int local_cnt;
3932 vm_page_t local_freeq = NULL;
3933 int local_freed = 0;
3934 int local_batch_size;
3935
3936
3937 KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
3938
3939 q = cq->q;
3940 local_batch_size = q->pgo_maxlaundry / (vm_compressor_thread_count * 2);
3941
3942 #if RECORD_THE_COMPRESSED_DATA
3943 if (q->pgo_laundry)
3944 c_compressed_record_init();
3945 #endif
3946 while (TRUE) {
3947 int pages_left_on_q = 0;
3948
3949 local_cnt = 0;
3950 local_q = NULL;
3951
3952 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
3953
3954 vm_page_lock_queues();
3955
3956 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3957
3958 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
3959
3960 while ( !vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
3961
3962 vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
3963 assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q);
3964 VM_PAGE_CHECK(m);
3965
3966 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
3967 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3968 m->laundry = FALSE;
3969
3970 m->snext = local_q;
3971 local_q = m;
3972 local_cnt++;
3973 }
3974 if (local_q == NULL)
3975 break;
3976
3977 q->pgo_busy = TRUE;
3978
3979 if ((pgo_draining = q->pgo_draining) == FALSE) {
3980 vm_pageout_throttle_up_batch(q, local_cnt);
3981 pages_left_on_q = q->pgo_laundry;
3982 } else
3983 pages_left_on_q = q->pgo_laundry - local_cnt;
3984
3985 vm_page_unlock_queues();
3986
3987 #if !RECORD_THE_COMPRESSED_DATA
3988 if (pages_left_on_q >= local_batch_size && cq->id < (vm_compressor_thread_count - 1))
3989 thread_wakeup((event_t) ((uintptr_t)&q->pgo_pending + cq->id + 1));
3990 #endif
3991 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
3992
3993 while (local_q) {
3994
3995 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
3996
3997 m = local_q;
3998 local_q = m->snext;
3999 m->snext = NULL;
4000
4001 if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m, FALSE) == KERN_SUCCESS) {
4002
4003 m->snext = local_freeq;
4004 local_freeq = m;
4005 local_freed++;
4006
4007 if (local_freed >= MAX_FREE_BATCH) {
4008
4009 vm_page_free_list(local_freeq, TRUE);
4010 local_freeq = NULL;
4011 local_freed = 0;
4012 }
4013 }
4014 #if !CONFIG_JETSAM
4015 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4016 kern_return_t wait_result;
4017 int need_wakeup = 0;
4018
4019 if (local_freeq) {
4020 vm_page_free_list(local_freeq, TRUE);
4021
4022 local_freeq = NULL;
4023 local_freed = 0;
4024
4025 continue;
4026 }
4027 lck_mtx_lock_spin(&vm_page_queue_free_lock);
4028
4029 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4030
4031 if (vm_page_free_wanted_privileged++ == 0)
4032 need_wakeup = 1;
4033 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4034
4035 lck_mtx_unlock(&vm_page_queue_free_lock);
4036
4037 if (need_wakeup)
4038 thread_wakeup((event_t)&vm_page_free_wanted);
4039
4040 if (wait_result == THREAD_WAITING)
4041
4042 thread_block(THREAD_CONTINUE_NULL);
4043 } else
4044 lck_mtx_unlock(&vm_page_queue_free_lock);
4045 }
4046 #endif
4047 }
4048 if (local_freeq) {
4049 vm_page_free_list(local_freeq, TRUE);
4050
4051 local_freeq = NULL;
4052 local_freed = 0;
4053 }
4054 if (pgo_draining == TRUE) {
4055 vm_page_lockspin_queues();
4056 vm_pageout_throttle_up_batch(q, local_cnt);
4057 vm_page_unlock_queues();
4058 }
4059 }
4060 KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4061
4062 /*
4063 * queue lock is held and our q is empty
4064 */
4065 q->pgo_busy = FALSE;
4066 q->pgo_idle = TRUE;
4067
4068 assert_wait((event_t) ((uintptr_t)&q->pgo_pending + cq->id), THREAD_UNINT);
4069 vm_page_unlock_queues();
4070
4071 if (__improbable(vm_compressor_time_thread)) {
4072 vm_compressor_thread_runtime = thread_get_runtime_self();
4073 }
4074
4075 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4076
4077 thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4078 /*NOTREACHED*/
4079 }
4080
4081
4082
4083 static void
4084 vm_pageout_immediate(vm_page_t m, boolean_t object_locked_by_caller)
4085 {
4086 assert(vm_pageout_immediate_scratch_buf);
4087
4088 if (vm_pageout_compress_page(&vm_pageout_immediate_chead, vm_pageout_immediate_scratch_buf, m, object_locked_by_caller) == KERN_SUCCESS) {
4089
4090 vm_page_free_prepare_object(m, TRUE);
4091 vm_page_release(m, TRUE);
4092 }
4093 }
4094
4095
4096 kern_return_t
4097 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, boolean_t object_locked_by_caller)
4098 {
4099 vm_object_t object;
4100 memory_object_t pager;
4101 int compressed_count_delta;
4102 kern_return_t retval;
4103
4104 object = VM_PAGE_OBJECT(m);
4105
4106 if (object->object_slid) {
4107 panic("slid page %p not allowed on this path\n", m);
4108 }
4109 assert(!m->free_when_done);
4110 assert(!m->laundry);
4111
4112 pager = object->pager;
4113
4114 if (object_locked_by_caller == FALSE && (!object->pager_initialized || pager == MEMORY_OBJECT_NULL)) {
4115
4116 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4117
4118 vm_object_lock(object);
4119
4120 /*
4121 * If there is no memory object for the page, create
4122 * one and hand it to the compression pager.
4123 */
4124
4125 if (!object->pager_initialized)
4126 vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4127 if (!object->pager_initialized)
4128 vm_object_compressor_pager_create(object);
4129
4130 pager = object->pager;
4131
4132 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4133 /*
4134 * Still no pager for the object,
4135 * or the pager has been destroyed.
4136 * Reactivate the page.
4137 *
4138 * Should only happen if there is no
4139 * compression pager
4140 */
4141 PAGE_WAKEUP_DONE(m);
4142
4143 vm_page_lockspin_queues();
4144 vm_page_activate(m);
4145 vm_pageout_dirty_no_pager++;
4146 vm_page_unlock_queues();
4147
4148 /*
4149 * And we are done with it.
4150 */
4151 vm_object_activity_end(object);
4152 vm_object_unlock(object);
4153
4154 return KERN_FAILURE;
4155 }
4156 vm_object_unlock(object);
4157
4158 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4159 }
4160 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4161
4162 if (object_locked_by_caller == FALSE)
4163 assert(object->activity_in_progress > 0);
4164
4165 retval = vm_compressor_pager_put(
4166 pager,
4167 m->offset + object->paging_offset,
4168 VM_PAGE_GET_PHYS_PAGE(m),
4169 current_chead,
4170 scratch_buf,
4171 &compressed_count_delta);
4172
4173 if (object_locked_by_caller == FALSE) {
4174 vm_object_lock(object);
4175
4176 assert(object->activity_in_progress > 0);
4177 assert(VM_PAGE_OBJECT(m) == object);
4178 }
4179
4180 vm_compressor_pager_count(pager,
4181 compressed_count_delta,
4182 FALSE, /* shared_lock */
4183 object);
4184
4185 assert( !VM_PAGE_WIRED(m));
4186
4187 if (retval == KERN_SUCCESS) {
4188 /*
4189 * If the object is purgeable, its owner's
4190 * purgeable ledgers will be updated in
4191 * vm_page_remove() but the page still
4192 * contributes to the owner's memory footprint,
4193 * so account for it as such.
4194 */
4195 if (object->purgable != VM_PURGABLE_DENY &&
4196 object->vo_purgeable_owner != NULL) {
4197 /* one more compressed purgeable page */
4198 vm_purgeable_compressed_update(object,
4199 +1);
4200 }
4201 VM_STAT_INCR(compressions);
4202
4203 if (m->tabled)
4204 vm_page_remove(m, TRUE);
4205
4206 } else {
4207 PAGE_WAKEUP_DONE(m);
4208
4209 vm_page_lockspin_queues();
4210
4211 vm_page_activate(m);
4212 vm_compressor_failed++;
4213
4214 vm_page_unlock_queues();
4215 }
4216 if (object_locked_by_caller == FALSE) {
4217 vm_object_activity_end(object);
4218 vm_object_unlock(object);
4219 }
4220 return retval;
4221 }
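/*
 * A minimal sketch (not compiled; the helper name is hypothetical) of the
 * two calling conventions seen above: the compressor iothreads pass their
 * per-thread compression head and scratch buffer with the object unlocked
 * (object_locked_by_caller == FALSE), while vm_pageout_immediate() passes
 * TRUE because its caller already holds the object lock.
 */
#if 0
static boolean_t
example_compress_from_iothread(struct cq *cq, vm_page_t m)
{
	if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf,
	                             m, FALSE) == KERN_SUCCESS) {
		/* on success the page has been removed from its object
		 * and can be queued for freeing by the caller */
		return TRUE;
	}
	return FALSE;
}
#endif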
4222
4223
4224 static void
4225 vm_pageout_adjust_io_throttles(struct vm_pageout_queue *iq, struct vm_pageout_queue *eq, boolean_t req_lowpriority)
4226 {
4227 uint32_t policy;
4228 boolean_t set_iq = FALSE;
4229 boolean_t set_eq = FALSE;
4230
4231 if (hibernate_cleaning_in_progress == TRUE)
4232 req_lowpriority = FALSE;
4233
4234 if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority)
4235 set_eq = TRUE;
4236
4237 if (set_iq == TRUE || set_eq == TRUE) {
4238
4239 vm_page_unlock_queues();
4240
4241 if (req_lowpriority == TRUE) {
4242 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4243 DTRACE_VM(laundrythrottle);
4244 } else {
4245 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4246 DTRACE_VM(laundryunthrottle);
4247 }
4248 if (set_iq == TRUE) {
4249 proc_set_thread_policy_with_tid(kernel_task, iq->pgo_tid,
4250 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4251
4252 iq->pgo_lowpriority = req_lowpriority;
4253 }
4254 if (set_eq == TRUE) {
4255 proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid,
4256 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4257
4258 eq->pgo_lowpriority = req_lowpriority;
4259 }
4260 vm_page_lock_queues();
4261 }
4262 }
4263
4264
4265 static void
4266 vm_pageout_iothread_external(void)
4267 {
4268 thread_t self = current_thread();
4269
4270 self->options |= TH_OPT_VMPRIV;
4271
4272 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4273
4274 proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4275 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4276
4277 vm_page_lock_queues();
4278
4279 vm_pageout_queue_external.pgo_tid = self->thread_id;
4280 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4281 vm_pageout_queue_external.pgo_inited = TRUE;
4282
4283 vm_page_unlock_queues();
4284
4285 vm_pageout_iothread_external_continue(&vm_pageout_queue_external);
4286
4287 /*NOTREACHED*/
4288 }
4289
4290
4291 static void
4292 vm_pageout_iothread_internal(struct cq *cq)
4293 {
4294 thread_t self = current_thread();
4295
4296 self->options |= TH_OPT_VMPRIV;
4297
4298 vm_page_lock_queues();
4299
4300 vm_pageout_queue_internal.pgo_tid = self->thread_id;
4301 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4302 vm_pageout_queue_internal.pgo_inited = TRUE;
4303
4304 vm_page_unlock_queues();
4305
4306 if (vm_restricted_to_single_processor == TRUE)
4307 thread_vm_bind_group_add();
4308
4309 vm_pageout_iothread_internal_continue(cq);
4310
4311 /*NOTREACHED*/
4312 }
4313
4314 kern_return_t
4315 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4316 {
4317 if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) {
4318 return KERN_SUCCESS;
4319 } else {
4320 return KERN_FAILURE; /* Already set */
4321 }
4322 }
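/*
 * A minimal registration sketch (not compiled; both function names are
 * hypothetical): the buffer-cache layer hands the VM a collection callback
 * exactly once; a second registration fails with KERN_FAILURE. The
 * callback's return value feeds the buf_large_zfree loop in
 * vm_pageout_garbage_collect() below.
 */
#if 0
static boolean_t
example_buffer_cache_collect(__unused int all)
{
	/* return TRUE only if enough was freed to justify another GC pass */
	return FALSE;
}

static void
example_register_cleanup_callout(void)
{
	if (vm_set_buffer_cleanup_callout(example_buffer_cache_collect) != KERN_SUCCESS) {
		/* a callout was already registered */
	}
}
#endif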
4323
4324 extern boolean_t memorystatus_manual_testing_on;
4325 extern unsigned int memorystatus_level;
4326
4327
4328 #if VM_PRESSURE_EVENTS
4329
4330 boolean_t vm_pressure_events_enabled = FALSE;
4331
4332 void
4333 vm_pressure_response(void)
4334 {
4335
4336 vm_pressure_level_t old_level = kVMPressureNormal;
4337 int new_level = -1;
4338 unsigned int total_pages;
4339 uint64_t available_memory = 0;
4340
4341 if (vm_pressure_events_enabled == FALSE)
4342 return;
4343
4344
4345 available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4346
4347
4348 total_pages = (unsigned int) atop_64(max_mem);
4349 #if CONFIG_SECLUDED_MEMORY
4350 total_pages -= vm_page_secluded_count;
4351 #endif /* CONFIG_SECLUDED_MEMORY */
4352 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4353
4354 if (memorystatus_manual_testing_on) {
4355 return;
4356 }
4357
4358 old_level = memorystatus_vm_pressure_level;
4359
4360 switch (memorystatus_vm_pressure_level) {
4361
4362 case kVMPressureNormal:
4363 {
4364 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4365 new_level = kVMPressureCritical;
4366 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4367 new_level = kVMPressureWarning;
4368 }
4369 break;
4370 }
4371
4372 case kVMPressureWarning:
4373 case kVMPressureUrgent:
4374 {
4375 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4376 new_level = kVMPressureNormal;
4377 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4378 new_level = kVMPressureCritical;
4379 }
4380 break;
4381 }
4382
4383 case kVMPressureCritical:
4384 {
4385 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4386 new_level = kVMPressureNormal;
4387 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4388 new_level = kVMPressureWarning;
4389 }
4390 break;
4391 }
4392
4393 default:
4394 return;
4395 }
4396
4397 if (new_level != -1) {
4398 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4399
4400 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != new_level)) {
4401 if (vm_pressure_thread_running == FALSE) {
4402 thread_wakeup(&vm_pressure_thread);
4403 }
4404
4405 if (old_level != new_level) {
4406 thread_wakeup(&vm_pressure_changed);
4407 }
4408 }
4409 }
4410
4411 }
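/*
 * A minimal worked example (not compiled; the helper name is hypothetical)
 * of the level computation above: memorystatus_level is the percentage of
 * total pages still available in non-compressed form, so 250,000 available
 * pages out of 1,000,000 total pages yields a level of 25.
 */
#if 0
static unsigned int
example_memorystatus_level(uint64_t available_pages, unsigned int total_pages)
{
	return (unsigned int) ((available_pages * 100) / total_pages);
}
#endif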
4412 #endif /* VM_PRESSURE_EVENTS */
4413
4414 kern_return_t
4415 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level) {
4416
4417 #if !VM_PRESSURE_EVENTS
4418
4419 return KERN_FAILURE;
4420
4421 #else /* VM_PRESSURE_EVENTS */
4422
4423 kern_return_t kr = KERN_SUCCESS;
4424
4425 if (pressure_level != NULL) {
4426
4427 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4428
4429 if (wait_for_pressure == TRUE) {
4430 wait_result_t wr = 0;
4431
4432 while (old_level == *pressure_level) {
4433 wr = assert_wait((event_t) &vm_pressure_changed,
4434 THREAD_INTERRUPTIBLE);
4435 if (wr == THREAD_WAITING) {
4436 wr = thread_block(THREAD_CONTINUE_NULL);
4437 }
4438 if (wr == THREAD_INTERRUPTED) {
4439 return KERN_ABORTED;
4440 }
4441 if (wr == THREAD_AWAKENED) {
4442
4443 old_level = memorystatus_vm_pressure_level;
4444
4445 if (old_level != *pressure_level) {
4446 break;
4447 }
4448 }
4449 }
4450 }
4451
4452 *pressure_level = old_level;
4453 kr = KERN_SUCCESS;
4454 } else {
4455 kr = KERN_INVALID_ARGUMENT;
4456 }
4457
4458 return kr;
4459 #endif /* VM_PRESSURE_EVENTS */
4460 }
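/*
 * A minimal usage sketch (not compiled; the caller name is hypothetical):
 * block until the pressure level moves away from the last level we
 * observed, then act on the new value returned in 'level'.
 */
#if 0
static void
example_watch_pressure_level(void)
{
	unsigned int level = memorystatus_vm_pressure_level;

	if (mach_vm_pressure_level_monitor(TRUE, &level) == KERN_SUCCESS) {
		/* 'level' now holds the new vm_pressure_level_t value */
	}
}
#endif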
4461
4462 #if VM_PRESSURE_EVENTS
4463 void
4464 vm_pressure_thread(void) {
4465 static boolean_t thread_initialized = FALSE;
4466
4467 if (thread_initialized == TRUE) {
4468 vm_pressure_thread_running = TRUE;
4469 consider_vm_pressure_events();
4470 vm_pressure_thread_running = FALSE;
4471 }
4472
4473 thread_initialized = TRUE;
4474 assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4475 thread_block((thread_continue_t)vm_pressure_thread);
4476 }
4477 #endif /* VM_PRESSURE_EVENTS */
4478
4479
4480 uint32_t vm_pageout_considered_page_last = 0;
4481
4482 /*
4483 * called once per-second via "compute_averages"
4484 */
4485 void
4486 compute_pageout_gc_throttle(__unused void *arg)
4487 {
4488 if (vm_pageout_considered_page != vm_pageout_considered_page_last) {
4489
4490 vm_pageout_considered_page_last = vm_pageout_considered_page;
4491
4492 thread_wakeup((event_t) &vm_pageout_garbage_collect);
4493 }
4494 }
4495
4496
4497 static void
4498 vm_pageout_garbage_collect(int collect)
4499 {
4500
4501 if (collect) {
4502 boolean_t buf_large_zfree = FALSE;
4503 boolean_t first_try = TRUE;
4504
4505 stack_collect();
4506
4507 consider_machine_collect();
4508 m_drain();
4509
4510 do {
4511 if (consider_buffer_cache_collect != NULL) {
4512 buf_large_zfree = (*consider_buffer_cache_collect)(0);
4513 }
4514 if (first_try == TRUE || buf_large_zfree == TRUE) {
4515 /*
4516 * consider_zone_gc should be last, because the other operations
4517 * might return memory to zones.
4518 */
4519 consider_zone_gc();
4520 }
4521 first_try = FALSE;
4522
4523 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4524
4525 consider_machine_adjust();
4526 }
4527 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
4528
4529 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
4530 /*NOTREACHED*/
4531 }
4532
4533
4534 #if VM_PAGE_BUCKETS_CHECK
4535 #if VM_PAGE_FAKE_BUCKETS
4536 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4537 #endif /* VM_PAGE_FAKE_BUCKETS */
4538 #endif /* VM_PAGE_BUCKETS_CHECK */
4539
4540
4541 #define FBDP_TEST_COLLAPSE_COMPRESSOR 0
4542 #define FBDP_TEST_WIRE_AND_EXTRACT 0
4543 #define FBDP_TEST_PAGE_WIRE_OVERFLOW 0
4544 #define FBDP_TEST_KERNEL_OBJECT_FAULT 0
4545
4546 #if FBDP_TEST_KERNEL_OBJECT_FAULT
4547 #endif /* FBDP_TEST_KERNEL_OBJECT_FAULT */
4548
4549 #if FBDP_TEST_COLLAPSE_COMPRESSOR
4550 extern boolean_t vm_object_collapse_compressor_allowed;
4551 #include <IOKit/IOLib.h>
4552 #endif /* FBDP_TEST_COLLAPSE_COMPRESSOR */
4553
4554 #if FBDP_TEST_WIRE_AND_EXTRACT
4555 extern ledger_template_t task_ledger_template;
4556 #include <mach/mach_vm.h>
4557 extern ppnum_t vm_map_get_phys_page(vm_map_t map,
4558 vm_offset_t offset);
4559 #endif /* FBDP_TEST_WIRE_AND_EXTRACT */
4560
4561
4562 void
4563 vm_set_restrictions()
4564 {
4565 host_basic_info_data_t hinfo;
4566 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
4567
4568 #define BSD_HOST 1
4569 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
4570
4571 assert(hinfo.max_cpus > 0);
4572
4573 if (hinfo.max_cpus <= 3) {
4574 /*
4575 * on systems with a limited number of CPUS, bind the
4576 * 4 major threads that can free memory and that tend to use
4577 * a fair bit of CPU under pressured conditions to a single processor.
4578 * This ensures that these threads don't hog all of the available CPUs
4579 * (important for camera launch), while allowing them to run independently
4580 * with respect to locks... the 4 threads are
4581 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
4582 * vm_compressor_swap_trigger_thread (minor and major compactions),
4583 * memorystatus_thread (jetsams).
4584 *
4585 * the first time the thread is run, it is responsible for checking the
4586 * state of vm_restricted_to_single_processor, and if TRUE it calls
4587 * thread_vm_bind_group_add()... someday this should be replaced with a group
4588 * scheduling mechanism and KPI.
4589 */
4590 vm_restricted_to_single_processor = TRUE;
4591 }
4592 }
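/*
 * A minimal sketch (not compiled; the function name is hypothetical) of the
 * per-thread pattern described above: each of the four threads checks the
 * flag once, at startup, and joins the VM bind group when restriction is in
 * effect (see vm_pageout_iothread_internal() and vm_pageout()).
 */
#if 0
static void
example_vm_thread_startup(void)
{
	if (vm_restricted_to_single_processor == TRUE)
		thread_vm_bind_group_add();
}
#endif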
4593
4594
4595 void
4596 vm_pageout(void)
4597 {
4598 thread_t self = current_thread();
4599 thread_t thread;
4600 kern_return_t result;
4601 spl_t s;
4602
4603 /*
4604 * Set thread privileges.
4605 */
4606 s = splsched();
4607
4608 thread_lock(self);
4609 self->options |= TH_OPT_VMPRIV;
4610 sched_set_thread_base_priority(self, BASEPRI_PREEMPT - 1);
4611 thread_unlock(self);
4612
4613 if (!self->reserved_stack)
4614 self->reserved_stack = self->kernel_stack;
4615
4616 if (vm_restricted_to_single_processor == TRUE)
4617 thread_vm_bind_group_add();
4618
4619 splx(s);
4620
4621 /*
4622 * Initialize some paging parameters.
4623 */
4624
4625 if (vm_pageout_swap_wait == 0)
4626 vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
4627
4628 if (vm_pageout_idle_wait == 0)
4629 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
4630
4631 if (vm_pageout_burst_wait == 0)
4632 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
4633
4634 if (vm_pageout_empty_wait == 0)
4635 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
4636
4637 if (vm_pageout_deadlock_wait == 0)
4638 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
4639
4640 if (vm_pageout_deadlock_relief == 0)
4641 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
4642
4643 if (vm_pageout_inactive_relief == 0)
4644 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
4645
4646 if (vm_pageout_burst_active_throttle == 0)
4647 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
4648
4649 if (vm_pageout_burst_inactive_throttle == 0)
4650 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
4651
4652 /*
4653 * Set kernel task to low backing store privileged
4654 * status
4655 */
4656 task_lock(kernel_task);
4657 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
4658 task_unlock(kernel_task);
4659
4660 vm_page_free_count_init = vm_page_free_count;
4661
4662 /*
4663 * even if we've already called vm_page_free_reserve
4664 * call it again here to ensure that the targets are
4665 * accurately calculated (it uses vm_page_free_count_init)
4666 * calling it with an arg of 0 will not change the reserve
4667 * but will re-calculate free_min and free_target
4668 */
4669 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
4670 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
4671 } else
4672 vm_page_free_reserve(0);
4673
4674
4675 vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
4676 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
4677 vm_pageout_queue_external.pgo_laundry = 0;
4678 vm_pageout_queue_external.pgo_idle = FALSE;
4679 vm_pageout_queue_external.pgo_busy = FALSE;
4680 vm_pageout_queue_external.pgo_throttled = FALSE;
4681 vm_pageout_queue_external.pgo_draining = FALSE;
4682 vm_pageout_queue_external.pgo_lowpriority = FALSE;
4683 vm_pageout_queue_external.pgo_tid = -1;
4684 vm_pageout_queue_external.pgo_inited = FALSE;
4685
4686 vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
4687 vm_pageout_queue_internal.pgo_maxlaundry = 0;
4688 vm_pageout_queue_internal.pgo_laundry = 0;
4689 vm_pageout_queue_internal.pgo_idle = FALSE;
4690 vm_pageout_queue_internal.pgo_busy = FALSE;
4691 vm_pageout_queue_internal.pgo_throttled = FALSE;
4692 vm_pageout_queue_internal.pgo_draining = FALSE;
4693 vm_pageout_queue_internal.pgo_lowpriority = FALSE;
4694 vm_pageout_queue_internal.pgo_tid = -1;
4695 vm_pageout_queue_internal.pgo_inited = FALSE;
4696
4697 /* internal pageout thread started when default pager registered first time */
4698 /* external pageout and garbage collection threads started here */
4699
4700 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
4701 BASEPRI_PREEMPT - 1,
4702 &vm_pageout_external_iothread);
4703 if (result != KERN_SUCCESS)
4704 panic("vm_pageout_iothread_external: create failed");
4705
4706 thread_deallocate(vm_pageout_external_iothread);
4707
4708 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
4709 BASEPRI_DEFAULT,
4710 &thread);
4711 if (result != KERN_SUCCESS)
4712 panic("vm_pageout_garbage_collect: create failed");
4713
4714 thread_deallocate(thread);
4715
4716 #if VM_PRESSURE_EVENTS
4717 result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
4718 BASEPRI_DEFAULT,
4719 &thread);
4720
4721 if (result != KERN_SUCCESS)
4722 panic("vm_pressure_thread: create failed");
4723
4724 thread_deallocate(thread);
4725 #endif
4726
4727 vm_object_reaper_init();
4728
4729
4730 bzero(&vm_config, sizeof(vm_config));
4731
4732 switch(vm_compressor_mode) {
4733
4734 case VM_PAGER_DEFAULT:
4735 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
4736
4737 case VM_PAGER_COMPRESSOR_WITH_SWAP:
4738 vm_config.compressor_is_present = TRUE;
4739 vm_config.swap_is_present = TRUE;
4740 vm_config.compressor_is_active = TRUE;
4741 vm_config.swap_is_active = TRUE;
4742 break;
4743
4744 case VM_PAGER_COMPRESSOR_NO_SWAP:
4745 vm_config.compressor_is_present = TRUE;
4746 vm_config.swap_is_present = TRUE;
4747 vm_config.compressor_is_active = TRUE;
4748 break;
4749
4750 case VM_PAGER_FREEZER_DEFAULT:
4751 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
4752
4753 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
4754 vm_config.compressor_is_present = TRUE;
4755 vm_config.swap_is_present = TRUE;
4756 break;
4757
4758 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
4759 vm_config.compressor_is_present = TRUE;
4760 vm_config.swap_is_present = TRUE;
4761 vm_config.compressor_is_active = TRUE;
4762 vm_config.freezer_swap_is_active = TRUE;
4763 break;
4764
4765 case VM_PAGER_NOT_CONFIGURED:
4766 break;
4767
4768 default:
4769 printf("unknown compressor mode - %x\n", vm_compressor_mode);
4770 break;
4771 }
4772 if (VM_CONFIG_COMPRESSOR_IS_PRESENT)
4773 vm_compressor_pager_init();
4774
4775 #if VM_PRESSURE_EVENTS
4776 vm_pressure_events_enabled = TRUE;
4777 #endif /* VM_PRESSURE_EVENTS */
4778
4779 #if CONFIG_PHANTOM_CACHE
4780 vm_phantom_cache_init();
4781 #endif
4782 #if VM_PAGE_BUCKETS_CHECK
4783 #if VM_PAGE_FAKE_BUCKETS
4784 printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
4785 (uint64_t) vm_page_fake_buckets_start,
4786 (uint64_t) vm_page_fake_buckets_end);
4787 pmap_protect(kernel_pmap,
4788 vm_page_fake_buckets_start,
4789 vm_page_fake_buckets_end,
4790 VM_PROT_READ);
4791 // *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
4792 #endif /* VM_PAGE_FAKE_BUCKETS */
4793 #endif /* VM_PAGE_BUCKETS_CHECK */
4794
4795 #if VM_OBJECT_TRACKING
4796 vm_object_tracking_init();
4797 #endif /* VM_OBJECT_TRACKING */
4798
4799
4800 #if FBDP_TEST_COLLAPSE_COMPRESSOR
4801 vm_object_size_t backing_size, top_size;
4802 vm_object_t backing_object, top_object;
4803 vm_map_offset_t backing_offset, top_offset;
4804 unsigned char *backing_address, *top_address;
4805 kern_return_t kr;
4806
4807 printf("FBDP_TEST_COLLAPSE_COMPRESSOR:\n");
4808
4809 /* create backing object */
4810 backing_size = 15 * PAGE_SIZE;
4811 backing_object = vm_object_allocate(backing_size);
4812 assert(backing_object != VM_OBJECT_NULL);
4813 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: created backing object %p\n",
4814 backing_object);
4815 /* map backing object */
4816 backing_offset = 0;
4817 kr = vm_map_enter(kernel_map, &backing_offset, backing_size, 0,
4818 VM_FLAGS_ANYWHERE, backing_object, 0, FALSE,
4819 VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT);
4820 assert(kr == KERN_SUCCESS);
4821 backing_address = (unsigned char *) backing_offset;
4822 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4823 "mapped backing object %p at 0x%llx\n",
4824 backing_object, (uint64_t) backing_offset);
4825 /* populate with pages to be compressed in backing object */
4826 backing_address[0x1*PAGE_SIZE] = 0xB1;
4827 backing_address[0x4*PAGE_SIZE] = 0xB4;
4828 backing_address[0x7*PAGE_SIZE] = 0xB7;
4829 backing_address[0xa*PAGE_SIZE] = 0xBA;
4830 backing_address[0xd*PAGE_SIZE] = 0xBD;
4831 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4832 "populated pages to be compressed in "
4833 "backing_object %p\n", backing_object);
4834 /* compress backing object */
4835 vm_object_pageout(backing_object);
4836 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: compressing backing_object %p\n",
4837 backing_object);
4838 /* wait for all the pages to be gone */
4839 while (*(volatile int *)&backing_object->resident_page_count != 0)
4840 IODelay(10);
4841 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: backing_object %p compressed\n",
4842 backing_object);
4843 /* populate with pages to be resident in backing object */
4844 backing_address[0x0*PAGE_SIZE] = 0xB0;
4845 backing_address[0x3*PAGE_SIZE] = 0xB3;
4846 backing_address[0x6*PAGE_SIZE] = 0xB6;
4847 backing_address[0x9*PAGE_SIZE] = 0xB9;
4848 backing_address[0xc*PAGE_SIZE] = 0xBC;
4849 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4850 "populated pages to be resident in "
4851 "backing_object %p\n", backing_object);
4852 /* leave the other pages absent */
4853 /* mess with the paging_offset of the backing_object */
4854 assert(backing_object->paging_offset == 0);
4855 backing_object->paging_offset = 0x3000;
4856
4857 /* create top object */
4858 top_size = 9 * PAGE_SIZE;
4859 top_object = vm_object_allocate(top_size);
4860 assert(top_object != VM_OBJECT_NULL);
4861 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: created top object %p\n",
4862 top_object);
4863 /* map top object */
4864 top_offset = 0;
4865 kr = vm_map_enter(kernel_map, &top_offset, top_size, 0,
4866 VM_FLAGS_ANYWHERE, top_object, 0, FALSE,
4867 VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT);
4868 assert(kr == KERN_SUCCESS);
4869 top_address = (unsigned char *) top_offset;
4870 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4871 "mapped top object %p at 0x%llx\n",
4872 top_object, (uint64_t) top_offset);
4873 /* populate with pages to be compressed in top object */
4874 top_address[0x3*PAGE_SIZE] = 0xA3;
4875 top_address[0x4*PAGE_SIZE] = 0xA4;
4876 top_address[0x5*PAGE_SIZE] = 0xA5;
4877 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4878 "populated pages to be compressed in "
4879 "top_object %p\n", top_object);
4880 /* compress top object */
4881 vm_object_pageout(top_object);
4882 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: compressing top_object %p\n",
4883 top_object);
4884 /* wait for all the pages to be gone */
4885 while (top_object->resident_page_count != 0);
4886 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: top_object %p compressed\n",
4887 top_object);
4888 /* populate with pages to be resident in top object */
4889 top_address[0x0*PAGE_SIZE] = 0xA0;
4890 top_address[0x1*PAGE_SIZE] = 0xA1;
4891 top_address[0x2*PAGE_SIZE] = 0xA2;
4892 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4893 "populated pages to be resident in "
4894 "top_object %p\n", top_object);
4895 /* leave the other pages absent */
4896
4897 /* link the 2 objects */
4898 vm_object_reference(backing_object);
4899 top_object->shadow = backing_object;
4900 top_object->vo_shadow_offset = 0x3000;
4901 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: linked %p and %p\n",
4902 top_object, backing_object);
4903
4904 /* unmap backing object */
4905 vm_map_remove(kernel_map,
4906 backing_offset,
4907 backing_offset + backing_size,
4908 0);
4909 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4910 "unmapped backing_object %p [0x%llx:0x%llx]\n",
4911 backing_object,
4912 (uint64_t) backing_offset,
4913 (uint64_t) (backing_offset + backing_size));
4914
4915 /* collapse */
4916 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: collapsing %p\n", top_object);
4917 vm_object_lock(top_object);
4918 vm_object_collapse(top_object, 0, FALSE);
4919 vm_object_unlock(top_object);
4920 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: collapsed %p\n", top_object);
4921
4922 /* did it work? */
4923 if (top_object->shadow != VM_OBJECT_NULL) {
4924 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: not collapsed\n");
4925 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
4926 if (vm_object_collapse_compressor_allowed) {
4927 panic("FBDP_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
4928 }
4929 } else {
4930 /* check the contents of the mapping */
4931 unsigned char expect[9] =
4932 { 0xA0, 0xA1, 0xA2, /* resident in top */
4933 0xA3, 0xA4, 0xA5, /* compressed in top */
4934 0xB9, /* resident in backing + shadow_offset */
4935 0xBD, /* compressed in backing + shadow_offset + paging_offset */
4936 0x00 }; /* absent in both */
4937 unsigned char actual[9];
4938 unsigned int i, errors;
4939
4940 errors = 0;
4941 for (i = 0; i < sizeof (actual); i++) {
4942 actual[i] = (unsigned char) top_address[i*PAGE_SIZE];
4943 if (actual[i] != expect[i]) {
4944 errors++;
4945 }
4946 }
4947 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4948 "actual [%x %x %x %x %x %x %x %x %x] "
4949 "expect [%x %x %x %x %x %x %x %x %x] "
4950 "%d errors\n",
4951 actual[0], actual[1], actual[2], actual[3],
4952 actual[4], actual[5], actual[6], actual[7],
4953 actual[8],
4954 expect[0], expect[1], expect[2], expect[3],
4955 expect[4], expect[5], expect[6], expect[7],
4956 expect[8],
4957 errors);
4958 if (errors) {
4959 panic("FBDP_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
4960 } else {
4961 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: PASS\n");
4962 }
4963 }
4964 #endif /* FBDP_TEST_COLLAPSE_COMPRESSOR */
4965
4966 #if FBDP_TEST_WIRE_AND_EXTRACT
4967 ledger_t ledger;
4968 vm_map_t user_map, wire_map;
4969 mach_vm_address_t user_addr, wire_addr;
4970 mach_vm_size_t user_size, wire_size;
4971 mach_vm_offset_t cur_offset;
4972 vm_prot_t cur_prot, max_prot;
4973 ppnum_t user_ppnum, wire_ppnum;
4974 kern_return_t kr;
4975
4976 ledger = ledger_instantiate(task_ledger_template,
4977 LEDGER_CREATE_ACTIVE_ENTRIES);
4978 user_map = vm_map_create(pmap_create(ledger, 0, PMAP_CREATE_64BIT),
4979 0x100000000ULL,
4980 0x200000000ULL,
4981 TRUE);
4982 wire_map = vm_map_create(NULL,
4983 0x100000000ULL,
4984 0x200000000ULL,
4985 TRUE);
4986 user_addr = 0;
4987 user_size = 0x10000;
4988 kr = mach_vm_allocate(user_map,
4989 &user_addr,
4990 user_size,
4991 VM_FLAGS_ANYWHERE);
4992 assert(kr == KERN_SUCCESS);
4993 wire_addr = 0;
4994 wire_size = user_size;
4995 kr = mach_vm_remap(wire_map,
4996 &wire_addr,
4997 wire_size,
4998 0,
4999 VM_FLAGS_ANYWHERE,
5000 user_map,
5001 user_addr,
5002 FALSE,
5003 &cur_prot,
5004 &max_prot,
5005 VM_INHERIT_NONE);
5006 assert(kr == KERN_SUCCESS);
5007 for (cur_offset = 0;
5008 cur_offset < wire_size;
5009 cur_offset += PAGE_SIZE) {
5010 kr = vm_map_wire_and_extract(wire_map,
5011 wire_addr + cur_offset,
5012 VM_PROT_DEFAULT | VM_PROT_MEMORY_TAG_MAKE(VM_KERN_MEMORY_OSFMK),
5013 TRUE,
5014 &wire_ppnum);
5015 assert(kr == KERN_SUCCESS);
5016 user_ppnum = vm_map_get_phys_page(user_map,
5017 user_addr + cur_offset);
5018 printf("FBDP_TEST_WIRE_AND_EXTRACT: kr=0x%x "
5019 "user[%p:0x%llx:0x%x] wire[%p:0x%llx:0x%x]\n",
5020 kr,
5021 user_map, user_addr + cur_offset, user_ppnum,
5022 wire_map, wire_addr + cur_offset, wire_ppnum);
5023 if (kr != KERN_SUCCESS ||
5024 wire_ppnum == 0 ||
5025 wire_ppnum != user_ppnum) {
5026 panic("FBDP_TEST_WIRE_AND_EXTRACT: FAIL\n");
5027 }
5028 }
5029 cur_offset -= PAGE_SIZE;
5030 kr = vm_map_wire_and_extract(wire_map,
5031 wire_addr + cur_offset,
5032 VM_PROT_DEFAULT,
5033 TRUE,
5034 &wire_ppnum);
5035 assert(kr == KERN_SUCCESS);
5036 printf("FBDP_TEST_WIRE_AND_EXTRACT: re-wire kr=0x%x "
5037 "user[%p:0x%llx:0x%x] wire[%p:0x%llx:0x%x]\n",
5038 kr,
5039 user_map, user_addr + cur_offset, user_ppnum,
5040 wire_map, wire_addr + cur_offset, wire_ppnum);
5041 if (kr != KERN_SUCCESS ||
5042 wire_ppnum == 0 ||
5043 wire_ppnum != user_ppnum) {
5044 panic("FBDP_TEST_WIRE_AND_EXTRACT: FAIL\n");
5045 }
5046
5047 printf("FBDP_TEST_WIRE_AND_EXTRACT: PASS\n");
5048 #endif /* FBDP_TEST_WIRE_AND_EXTRACT */
5049
5050 #if FBDP_TEST_PAGE_WIRE_OVERFLOW
5051 vm_object_t fbdp_object;
5052 vm_page_t fbdp_page;
5053
5054 printf("FBDP_TEST_PAGE_WIRE_OVERFLOW: starting...\n");
5055
5056 fbdp_object = vm_object_allocate(PAGE_SIZE);
5057 vm_object_lock(fbdp_object);
5058 fbdp_page = vm_page_alloc(fbdp_object, 0x0);
5059 vm_page_lock_queues();
5060 do {
5061 vm_page_wire(fbdp_page, 1, FALSE);
5062 } while (fbdp_page->wire_count != 0);
5063 vm_page_unlock_queues();
5064 vm_object_unlock(fbdp_object);
5065 panic("FBDP(%p,%p): wire_count overflow not detected\n",
5066 fbdp_object, fbdp_page);
5067 #endif /* FBDP_TEST_PAGE_WIRE_OVERFLOW */
5068
5069 #if FBDP_TEST_KERNEL_OBJECT_FAULT
5070 {
5071 }
5072 #endif /* FBDP_TEST_KERNEL_OBJECT_FAULT */
5073
5074 vm_pageout_continue();
5075
5076 /*
5077 * Unreached code!
5078 *
5079 * The vm_pageout_continue() call above never returns, so the code below is never
5080 * executed. We take advantage of this to declare several DTrace VM related probe
5081 * points that our kernel doesn't have an analog for. These are probe points that
5082 * exist in Solaris and are in the DTrace documentation, so people may have written
5083 * scripts that use them. Declaring the probe points here means their scripts will
5084 * compile and execute which we want for portability of the scripts, but since this
5085 * section of code is never reached, the probe points will simply never fire. Yes,
5086 * this is basically a hack. The problem is the DTrace probe points were chosen with
5087 * Solaris specific VM events in mind, not portability to different VM implementations.
5088 */
5089
5090 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5091 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5092 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5093 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5094 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5095 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5096 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5097 /*NOTREACHED*/
5098 }
5099
5100
5101
5102 int vm_compressor_thread_count = 2;
5103
5104 kern_return_t
5105 vm_pageout_internal_start(void)
5106 {
5107 kern_return_t result;
5108 int i;
5109 host_basic_info_data_t hinfo;
5110
5111 assert (VM_CONFIG_COMPRESSOR_IS_PRESENT);
5112
5113 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5114 #define BSD_HOST 1
5115 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5116
5117 assert(hinfo.max_cpus > 0);
5118
5119 if (vm_compressor_thread_count >= hinfo.max_cpus)
5120 vm_compressor_thread_count = hinfo.max_cpus - 1;
5121 if (vm_compressor_thread_count <= 0)
5122 vm_compressor_thread_count = 1;
5123 else if (vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT)
5124 vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5125
5126 if (vm_compressor_immediate_preferred == TRUE) {
5127 vm_pageout_immediate_chead = NULL;
5128 vm_pageout_immediate_scratch_buf = kalloc(vm_compressor_get_encode_scratch_size());
5129
5130 vm_compressor_thread_count = 1;
5131 }
5132
5133 vm_pageout_queue_internal.pgo_maxlaundry = (vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5134
5135 for (i = 0; i < vm_compressor_thread_count; i++) {
5136 ciq[i].id = i;
5137 ciq[i].q = &vm_pageout_queue_internal;
5138 ciq[i].current_chead = NULL;
5139 ciq[i].scratch_buf = kalloc(COMPRESSOR_SCRATCH_BUF_SIZE);
5140
5141 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, (void *)&ciq[i], BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
5142
5143 if (result == KERN_SUCCESS)
5144 thread_deallocate(vm_pageout_internal_iothread);
5145 else
5146 break;
5147 }
5148 return result;
5149 }
5150
5151 #if CONFIG_IOSCHED
5152 /*
5153 * To support I/O Expedite for compressed files we mark the upls with special flags.
5154 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5155 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5156 * then issues the smaller compressed I/Os, decompresses the data and puts it into the pages
5157 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5158 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5159 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5160 * by the req upl lock (the reverse link doesnt need synch. since we never inspect this link
5161 * unless the real I/O upl is being destroyed).
5162 */
5163
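/*
 * A minimal lifecycle sketch (not compiled; the variable names are
 * hypothetical): decmpfs tags the big request UPL, creates the smaller
 * real-I/O UPLs (upl_create() links each one back through the creating
 * thread's decmp_upl field via upl_set_decmp_info()), then clears the tag
 * once the request is complete.
 */
#if 0
static void
example_decmp_upl_lifecycle(upl_t big_req_upl)
{
	upl_mark_decmp(big_req_upl);	/* sets UPL_DECMP_REQ and the creator's decmp_upl */

	/*
	 * ... issue the smaller compressed I/Os here; any internal UPL
	 * created with UPL_CREATE_EXPEDITE_SUP while decmp_upl is set is
	 * marked UPL_DECMP_REAL_IO and linked to big_req_upl ...
	 */

	upl_unmark_decmp(big_req_upl);	/* drops the thread's decmp_upl link */
}
#endif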
5164
5165 static void
5166 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5167 {
5168 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5169
5170 upl_lock(src_upl);
5171 if (src_upl->decmp_io_upl) {
5172 /*
5173 * If there is already an alive real I/O UPL, ignore this new UPL.
5174 * This case should rarely happen and even if it does, it just means
5175 * that we might issue a spurious expedite which the driver is expected
5176 * to handle.
5177 */
5178 upl_unlock(src_upl);
5179 return;
5180 }
5181 src_upl->decmp_io_upl = (void *)upl;
5182 src_upl->ref_count++;
5183
5184 upl->flags |= UPL_DECMP_REAL_IO;
5185 upl->decmp_io_upl = (void *)src_upl;
5186 upl_unlock(src_upl);
5187 }
5188 #endif /* CONFIG_IOSCHED */
5189
5190 #if UPL_DEBUG
5191 int upl_debug_enabled = 1;
5192 #else
5193 int upl_debug_enabled = 0;
5194 #endif
5195
5196 static upl_t
5197 upl_create(int type, int flags, upl_size_t size)
5198 {
5199 upl_t upl;
5200 vm_size_t page_field_size = 0;
5201 int upl_flags = 0;
5202 vm_size_t upl_size = sizeof(struct upl);
5203
5204 size = round_page_32(size);
5205
5206 if (type & UPL_CREATE_LITE) {
5207 page_field_size = (atop(size) + 7) >> 3;
5208 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
5209
5210 upl_flags |= UPL_LITE;
5211 }
5212 if (type & UPL_CREATE_INTERNAL) {
5213 upl_size += sizeof(struct upl_page_info) * atop(size);
5214
5215 upl_flags |= UPL_INTERNAL;
5216 }
5217 upl = (upl_t)kalloc(upl_size + page_field_size);
5218
5219 if (page_field_size)
5220 bzero((char *)upl + upl_size, page_field_size);
5221
5222 upl->flags = upl_flags | flags;
5223 upl->kaddr = (vm_offset_t)0;
5224 upl->size = 0;
5225 upl->map_object = NULL;
5226 upl->ref_count = 1;
5227 upl->ext_ref_count = 0;
5228 upl->highest_page = 0;
5229 upl_lock_init(upl);
5230 upl->vector_upl = NULL;
5231 upl->associated_upl = NULL;
5232 #if CONFIG_IOSCHED
5233 if (type & UPL_CREATE_IO_TRACKING) {
5234 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5235 }
5236
5237 upl->upl_reprio_info = 0;
5238 upl->decmp_io_upl = 0;
5239 if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5240 /* Only support expedite on internal UPLs */
5241 thread_t curthread = current_thread();
5242 upl->upl_reprio_info = (uint64_t *)kalloc(sizeof(uint64_t) * atop(size));
5243 bzero(upl->upl_reprio_info, (sizeof(uint64_t) * atop(size)));
5244 upl->flags |= UPL_EXPEDITE_SUPPORTED;
5245 if (curthread->decmp_upl != NULL)
5246 upl_set_decmp_info(upl, curthread->decmp_upl);
5247 }
5248 #endif
5249 #if CONFIG_IOSCHED || UPL_DEBUG
5250 if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5251 upl->upl_creator = current_thread();
5252 upl->uplq.next = 0;
5253 upl->uplq.prev = 0;
5254 upl->flags |= UPL_TRACKED_BY_OBJECT;
5255 }
5256 #endif
5257
5258 #if UPL_DEBUG
5259 upl->ubc_alias1 = 0;
5260 upl->ubc_alias2 = 0;
5261
5262 upl->upl_state = 0;
5263 upl->upl_commit_index = 0;
5264 bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records));
5265
5266 (void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
5267 #endif /* UPL_DEBUG */
5268
5269 return(upl);
5270 }
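/*
 * A minimal worked example (not compiled; the helper name is hypothetical)
 * of the UPL_LITE bitmap sizing in upl_create(): one bit per page, rounded
 * up to whole bytes and then to a 4-byte boundary. A 1MB request with 4KB
 * pages needs 256 bits, i.e. 32 bytes.
 */
#if 0
static vm_size_t
example_lite_bitmap_size(upl_size_t size)
{
	vm_size_t page_field_size;

	page_field_size = (atop(round_page_32(size)) + 7) >> 3;
	page_field_size = (page_field_size + 3) & 0xFFFFFFFC;

	return page_field_size;
}
#endif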
5271
5272 static void
5273 upl_destroy(upl_t upl)
5274 {
5275 int page_field_size; /* bit field in word size buf */
5276 int size;
5277
5278 if (upl->ext_ref_count) {
5279 panic("upl(%p) ext_ref_count", upl);
5280 }
5281
5282 #if CONFIG_IOSCHED
5283 if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5284 upl_t src_upl;
5285 src_upl = upl->decmp_io_upl;
5286 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5287 upl_lock(src_upl);
5288 src_upl->decmp_io_upl = NULL;
5289 upl_unlock(src_upl);
5290 upl_deallocate(src_upl);
5291 }
5292 #endif /* CONFIG_IOSCHED */
5293
5294 #if CONFIG_IOSCHED || UPL_DEBUG
5295 if ((upl->flags & UPL_TRACKED_BY_OBJECT) && !(upl->flags & UPL_VECTOR)) {
5296 vm_object_t object;
5297
5298 if (upl->flags & UPL_SHADOWED) {
5299 object = upl->map_object->shadow;
5300 } else {
5301 object = upl->map_object;
5302 }
5303
5304 vm_object_lock(object);
5305 queue_remove(&object->uplq, upl, upl_t, uplq);
5306 vm_object_activity_end(object);
5307 vm_object_collapse(object, 0, TRUE);
5308 vm_object_unlock(object);
5309 }
5310 #endif
5311 /*
5312 * drop a reference on the map_object whether or
5313 * not a pageout object is inserted
5314 */
5315 if (upl->flags & UPL_SHADOWED)
5316 vm_object_deallocate(upl->map_object);
5317
5318 if (upl->flags & UPL_DEVICE_MEMORY)
5319 size = PAGE_SIZE;
5320 else
5321 size = upl->size;
5322 page_field_size = 0;
5323
5324 if (upl->flags & UPL_LITE) {
5325 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
5326 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
5327 }
5328 upl_lock_destroy(upl);
5329 upl->vector_upl = (vector_upl_t) 0xfeedbeef;
5330
5331 #if CONFIG_IOSCHED
5332 if (upl->flags & UPL_EXPEDITE_SUPPORTED)
5333 kfree(upl->upl_reprio_info, sizeof(uint64_t) * (size/PAGE_SIZE));
5334 #endif
5335
5336 if (upl->flags & UPL_INTERNAL) {
5337 kfree(upl,
5338 sizeof(struct upl) +
5339 (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
5340 + page_field_size);
5341 } else {
5342 kfree(upl, sizeof(struct upl) + page_field_size);
5343 }
5344 }
5345
5346 void
5347 upl_deallocate(upl_t upl)
5348 {
5349 upl_lock(upl);
5350 if (--upl->ref_count == 0) {
5351 if(vector_upl_is_valid(upl))
5352 vector_upl_deallocate(upl);
5353 upl_unlock(upl);
5354 upl_destroy(upl);
5355 }
5356 else
5357 upl_unlock(upl);
5358 }
5359
5360 #if CONFIG_IOSCHED
5361 void
5362 upl_mark_decmp(upl_t upl)
5363 {
5364 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5365 upl->flags |= UPL_DECMP_REQ;
5366 upl->upl_creator->decmp_upl = (void *)upl;
5367 }
5368 }
5369
5370 void
5371 upl_unmark_decmp(upl_t upl)
5372 {
5373 if(upl && (upl->flags & UPL_DECMP_REQ)) {
5374 upl->upl_creator->decmp_upl = NULL;
5375 }
5376 }
5377
5378 #endif /* CONFIG_IOSCHED */
5379
5380 #define VM_PAGE_Q_BACKING_UP(q) \
5381 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5382
5383 boolean_t must_throttle_writes(void);
5384
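/*
 * must_throttle_writes() returns TRUE when the external pageout queue
 * is "backing up" (its laundry count has reached 80% of pgo_maxlaundry,
 * per VM_PAGE_Q_BACKING_UP) and externally-backed pageable pages make
 * up more than 60% of the available non-compressed memory.
 */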
5385 boolean_t
5386 must_throttle_writes()
5387 {
5388 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5389 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10)
5390 return (TRUE);
5391
5392 return (FALSE);
5393 }
5394
5395
5396 #if DEVELOPMENT || DEBUG
5397 /*
5398 * Statistics about UPL enforcement of copy-on-write obligations.
5399 */
5400 unsigned long upl_cow = 0;
5401 unsigned long upl_cow_again = 0;
5402 unsigned long upl_cow_pages = 0;
5403 unsigned long upl_cow_again_pages = 0;
5404
5405 unsigned long iopl_cow = 0;
5406 unsigned long iopl_cow_pages = 0;
5407 #endif
5408
5409 /*
5410 * Routine: vm_object_upl_request
5411 * Purpose:
5412 * Cause the population of a portion of a vm_object.
5413 * Depending on the nature of the request, the pages
5414 * returned may contain valid data or be uninitialized.
5415 * A page list structure, listing the physical pages,
5416 * will be returned upon request.
5417 * This function is called by the file system or any other
5418 * supplier of backing store to a pager.
5419 * IMPORTANT NOTE: The caller must still respect the relationship
5420 * between the vm_object and its backing memory object. The
5421 * caller MUST NOT substitute changes in the backing file
5422 * without first doing a memory_object_lock_request on the
5423 * target range unless it is known that the pages are not
5424 * shared with another entity at the pager level.
5425 * Copy_in_to:
5426 * if a page list structure is present
5427 * return the mapped physical pages, where a
5428 * page is not present, return a non-initialized
5429 * one. If the no_sync bit is turned on, don't
5430 * call the pager unlock to synchronize with other
5431 * possible copies of the page. Leave pages busy
5432 * in the original object, if a page list structure
5433 * was specified. When a commit of the page list
5434 * pages is done, the dirty bit will be set for each one.
5435 * Copy_out_from:
5436 * If a page list structure is present, return
5437 * all mapped pages. Where a page does not exist
5438 * map a zero filled one. Leave pages busy in
5439 * the original object. If a page list structure
5440 * is not specified, this call is a no-op.
5441 *
5442 * Note: access of default pager objects has a rather interesting
5443 * twist. The caller of this routine, presumably the file system
5444 * page cache handling code, will never actually make a request
5445 * against a default pager backed object. Only the default
5446 * pager will make requests on backing store related vm_objects.
5447 * In this way the default pager can maintain the relationship
5448 * between backing store files (abstract memory objects) and
5449 * the vm_objects (cache objects) they support.
5450 *
5451 */
5452
5453 __private_extern__ kern_return_t
5454 vm_object_upl_request(
5455 vm_object_t object,
5456 vm_object_offset_t offset,
5457 upl_size_t size,
5458 upl_t *upl_ptr,
5459 upl_page_info_array_t user_page_list,
5460 unsigned int *page_list_count,
5461 upl_control_flags_t cntrl_flags)
5462 {
5463 vm_page_t dst_page = VM_PAGE_NULL;
5464 vm_object_offset_t dst_offset;
5465 upl_size_t xfer_size;
5466 unsigned int size_in_pages;
5467 boolean_t dirty;
5468 boolean_t hw_dirty;
5469 upl_t upl = NULL;
5470 unsigned int entry;
5471 #if MACH_CLUSTER_STATS
5472 boolean_t encountered_lrp = FALSE;
5473 #endif
5474 vm_page_t alias_page = NULL;
5475 int refmod_state = 0;
5476 wpl_array_t lite_list = NULL;
5477 vm_object_t last_copy_object;
5478 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
5479 struct vm_page_delayed_work *dwp;
5480 int dw_count;
5481 int dw_limit;
5482 int io_tracking_flag = 0;
5483 int grab_options;
5484 ppnum_t phys_page;
5485
5486 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5487 /*
5488 * For forward compatibility's sake,
5489 * reject any unknown flag.
5490 */
5491 return KERN_INVALID_VALUE;
5492 }
5493 if ( (!object->internal) && (object->paging_offset != 0) )
5494 panic("vm_object_upl_request: external object with non-zero paging offset\n");
5495 if (object->phys_contiguous)
5496 panic("vm_object_upl_request: contiguous object specified\n");
5497
5498
5499 if (size > MAX_UPL_SIZE_BYTES)
5500 size = MAX_UPL_SIZE_BYTES;
5501
5502 if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
5503 *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5504
5505 #if CONFIG_IOSCHED || UPL_DEBUG
5506 if (object->io_tracking || upl_debug_enabled)
5507 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5508 #endif
5509 #if CONFIG_IOSCHED
5510 if (object->io_tracking)
5511 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5512 #endif
5513
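/*
 * An internal UPL is laid out as a single allocation: the upl structure
 * itself, followed by the upl_page_info array (one entry per page) and,
 * for lite UPLs, by the lite-list bitmap.  An external lite UPL carries
 * only the lite-list bitmap after the upl structure.  The pointer
 * arithmetic below simply locates those trailing regions.
 */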
5514 if (cntrl_flags & UPL_SET_INTERNAL) {
5515 if (cntrl_flags & UPL_SET_LITE) {
5516
5517 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5518
5519 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5520 lite_list = (wpl_array_t)
5521 (((uintptr_t)user_page_list) +
5522 ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
5523 if (size == 0) {
5524 user_page_list = NULL;
5525 lite_list = NULL;
5526 }
5527 } else {
5528 upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5529
5530 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5531 if (size == 0) {
5532 user_page_list = NULL;
5533 }
5534 }
5535 } else {
5536 if (cntrl_flags & UPL_SET_LITE) {
5537
5538 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5539
5540 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5541 if (size == 0) {
5542 lite_list = NULL;
5543 }
5544 } else {
5545 upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5546 }
5547 }
5548 *upl_ptr = upl;
5549
5550 if (user_page_list)
5551 user_page_list[0].device = FALSE;
5552
5553 if (cntrl_flags & UPL_SET_LITE) {
5554 upl->map_object = object;
5555 } else {
5556 upl->map_object = vm_object_allocate(size);
5557 /*
5558 * No need to lock the new object: nobody else knows
5559 * about it yet, so it's all ours so far.
5560 */
5561 upl->map_object->shadow = object;
5562 upl->map_object->pageout = TRUE;
5563 upl->map_object->can_persist = FALSE;
5564 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5565 upl->map_object->vo_shadow_offset = offset;
5566 upl->map_object->wimg_bits = object->wimg_bits;
5567
5568 VM_PAGE_GRAB_FICTITIOUS(alias_page);
5569
5570 upl->flags |= UPL_SHADOWED;
5571 }
5572 /*
5573 * ENCRYPTED SWAP:
5574 * Just mark the UPL as "encrypted" here.
5575 * We'll actually encrypt the pages later,
5576 * in upl_encrypt(), when the caller has
5577 * selected which pages need to go to swap.
5578 */
5579 if (cntrl_flags & UPL_ENCRYPT)
5580 upl->flags |= UPL_ENCRYPTED;
5581
5582 if (cntrl_flags & UPL_FOR_PAGEOUT)
5583 upl->flags |= UPL_PAGEOUT;
5584
5585 vm_object_lock(object);
5586 vm_object_activity_begin(object);
5587
5588 grab_options = 0;
5589 #if CONFIG_SECLUDED_MEMORY
5590 if (object->can_grab_secluded) {
5591 grab_options |= VM_PAGE_GRAB_SECLUDED;
5592 }
5593 #endif /* CONFIG_SECLUDED_MEMORY */
5594
5595 /*
5596 * we can lock in the paging_offset once paging_in_progress is set
5597 */
5598 upl->size = size;
5599 upl->offset = offset + object->paging_offset;
5600
5601 #if CONFIG_IOSCHED || UPL_DEBUG
5602 if (object->io_tracking || upl_debug_enabled) {
5603 vm_object_activity_begin(object);
5604 queue_enter(&object->uplq, upl, upl_t, uplq);
5605 }
5606 #endif
5607 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
5608 /*
5609 * Honor copy-on-write obligations
5610 *
5611 * The caller is gathering these pages and
5612 * might modify their contents. We need to
5613 * make sure that the copy object has its own
5614 * private copies of these pages before we let
5615 * the caller modify them.
5616 */
5617 vm_object_update(object,
5618 offset,
5619 size,
5620 NULL,
5621 NULL,
5622 FALSE, /* should_return */
5623 MEMORY_OBJECT_COPY_SYNC,
5624 VM_PROT_NO_CHANGE);
5625 #if DEVELOPMENT || DEBUG
5626 upl_cow++;
5627 upl_cow_pages += size >> PAGE_SHIFT;
5628 #endif
5629 }
5630 /*
5631 * remember which copy object we synchronized with
5632 */
5633 last_copy_object = object->copy;
5634 entry = 0;
5635
5636 xfer_size = size;
5637 dst_offset = offset;
5638 size_in_pages = size / PAGE_SIZE;
5639
5640 dwp = &dw_array[0];
5641 dw_count = 0;
5642 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
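/*
 * Page-queue state changes are not applied immediately: each page's
 * work is recorded in dw_array and flushed in batches through
 * vm_page_do_delayed_work() once dw_limit entries have accumulated,
 * so the page queues lock is taken far less often.
 */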
5643
5644 if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5645 object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT))
5646 object->scan_collisions = 0;
5647
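/*
 * If this UPL is gathering pages that are about to be modified and the
 * external pageout queue is already backed up, stall this thread for a
 * while (scaled by the number of pages in the request and by whether
 * the backing vnode is on an SSD) to give the pageout queues a chance
 * to drain before we add to the load.
 */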
5648 if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5649 boolean_t isSSD = FALSE;
5650
5651 vnode_pager_get_isSSD(object->pager, &isSSD);
5652 vm_object_unlock(object);
5653
5654 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5655
5656 if (isSSD == TRUE)
5657 delay(1000 * size_in_pages);
5658 else
5659 delay(5000 * size_in_pages);
5660 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5661
5662 vm_object_lock(object);
5663 }
5664
5665 while (xfer_size) {
5666
5667 dwp->dw_mask = 0;
5668
5669 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
5670 vm_object_unlock(object);
5671 VM_PAGE_GRAB_FICTITIOUS(alias_page);
5672 vm_object_lock(object);
5673 }
5674 if (cntrl_flags & UPL_COPYOUT_FROM) {
5675 upl->flags |= UPL_PAGE_SYNC_DONE;
5676
5677 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5678 dst_page->fictitious ||
5679 dst_page->absent ||
5680 dst_page->error ||
5681 dst_page->cleaning ||
5682 (VM_PAGE_WIRED(dst_page))) {
5683
5684 if (user_page_list)
5685 user_page_list[entry].phys_addr = 0;
5686
5687 goto try_next_page;
5688 }
5689 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
5690
5691 /*
5692 * grab this up front...
5693 * a high percentage of the time we're going to
5694 * need the hardware modification state a bit later
5695 * anyway... so we can eliminate an extra call into
5696 * the pmap layer by grabbing it here and recording it
5697 */
5698 if (dst_page->pmapped)
5699 refmod_state = pmap_get_refmod(phys_page);
5700 else
5701 refmod_state = 0;
5702
5703 if ( (refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
5704 /*
5705 * page is on inactive list and referenced...
5706 * reactivate it now... this gets it out of the
5707 * way of vm_pageout_scan which would have to
5708 * reactivate it upon tripping over it
5709 */
5710 dwp->dw_mask |= DW_vm_page_activate;
5711 }
5712 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
5713 /*
5714 * we're only asking for DIRTY pages to be returned
5715 */
5716 if (dst_page->laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
5717 /*
5718 * if we were the page stolen by vm_pageout_scan to be
5719 * cleaned (as opposed to a buddy being clustered in),
5720 * or this request is not being driven by a PAGEOUT cluster,
5721 * then we only need to check for the page being dirty or
5722 * precious to decide whether to return it
5723 */
5724 if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
5725 goto check_busy;
5726 goto dont_return;
5727 }
5728 /*
5729 * this is a request for a PAGEOUT cluster and this page
5730 * is merely along for the ride as a 'buddy'... not only
5731 * does it have to be dirty to be returned, but it also
5732 * can't have been referenced recently...
5733 */
5734 if ( (hibernate_cleaning_in_progress == TRUE ||
5735 (!((refmod_state & VM_MEM_REFERENCED) || dst_page->reference) ||
5736 (dst_page->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
5737 ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
5738 goto check_busy;
5739 }
5740 dont_return:
5741 /*
5742 * if we reach here, we're not to return
5743 * the page... go on to the next one
5744 */
5745 if (dst_page->laundry == TRUE) {
5746 /*
5747 * if we get here, the page is not 'cleaning' (filtered out above).
5748 * since it has been referenced, remove it from the laundry
5749 * so we don't pay the cost of an I/O to clean a page
5750 * we're just going to take back
5751 */
5752 vm_page_lockspin_queues();
5753
5754 vm_pageout_steal_laundry(dst_page, TRUE);
5755 vm_page_activate(dst_page);
5756
5757 vm_page_unlock_queues();
5758 }
5759 if (user_page_list)
5760 user_page_list[entry].phys_addr = 0;
5761
5762 goto try_next_page;
5763 }
5764 check_busy:
5765 if (dst_page->busy) {
5766 if (cntrl_flags & UPL_NOBLOCK) {
5767 if (user_page_list)
5768 user_page_list[entry].phys_addr = 0;
5769 dwp->dw_mask = 0;
5770
5771 goto try_next_page;
5772 }
5773 /*
5774 * someone else is playing with the
5775 * page. We will have to wait.
5776 */
5777 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5778
5779 continue;
5780 }
5781 /*
5782 * ENCRYPTED SWAP:
5783 * The caller is gathering this page and might
5784 * access its contents later on. Decrypt the
5785 * page before adding it to the UPL, so that
5786 * the caller never sees encrypted data.
5787 */
5788 if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
5789 int was_busy;
5790
5791 /*
5792 * save the current state of busy
5793 * mark page as busy while decrypt
5794 * is in progress since it will drop
5795 * the object lock...
5796 */
5797 was_busy = dst_page->busy;
5798 dst_page->busy = TRUE;
5799
5800 vm_page_decrypt(dst_page, 0);
5801 vm_page_decrypt_for_upl_counter++;
5802 /*
5803 * restore to original busy state
5804 */
5805 dst_page->busy = was_busy;
5806 }
5807 if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5808
5809 vm_page_lockspin_queues();
5810
5811 if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5812 /*
5813 * we've buddied up a page for a clustered pageout
5814 * that has already been moved to the pageout
5815 * queue by pageout_scan... we need to remove
5816 * it from the queue and drop the laundry count
5817 * on that queue
5818 */
5819 vm_pageout_throttle_up(dst_page);
5820 }
5821 vm_page_unlock_queues();
5822 }
5823 #if MACH_CLUSTER_STATS
5824 /*
5825 * pageout statistics gathering. count
5826 * all the pages we will page out that
5827 * were not counted in the initial
5828 * vm_pageout_scan work
5829 */
5830 if (dst_page->pageout)
5831 encountered_lrp = TRUE;
5832 if ((dst_page->dirty || (object->internal && dst_page->precious))) {
5833 if (encountered_lrp)
5834 CLUSTER_STAT(pages_at_higher_offsets++;)
5835 else
5836 CLUSTER_STAT(pages_at_lower_offsets++;)
5837 }
5838 #endif
5839 hw_dirty = refmod_state & VM_MEM_MODIFIED;
5840 dirty = hw_dirty ? TRUE : dst_page->dirty;
5841
5842 if (phys_page > upl->highest_page)
5843 upl->highest_page = phys_page;
5844
5845 assert (!pmap_is_noencrypt(phys_page));
5846
5847 if (cntrl_flags & UPL_SET_LITE) {
5848 unsigned int pg_num;
5849
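/*
 * record this page in the lite-list bitmap: 32 pages
 * per 32-bit word, indexed by the page's offset within
 * the UPL.
 */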
5850 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
5851 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
5852 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5853
5854 if (hw_dirty)
5855 pmap_clear_modify(phys_page);
5856
5857 /*
5858 * Mark original page as cleaning
5859 * in place.
5860 */
5861 dst_page->cleaning = TRUE;
5862 dst_page->precious = FALSE;
5863 } else {
5864 /*
5865 * use pageclean setup, it is more
5866 * convenient even for the pageout
5867 * cases here
5868 */
5869 vm_object_lock(upl->map_object);
5870 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
5871 vm_object_unlock(upl->map_object);
5872
5873 alias_page->absent = FALSE;
5874 alias_page = NULL;
5875 }
5876 if (dirty) {
5877 SET_PAGE_DIRTY(dst_page, FALSE);
5878 } else {
5879 dst_page->dirty = FALSE;
5880 }
5881
5882 if (!dirty)
5883 dst_page->precious = TRUE;
5884
5885 if ( (cntrl_flags & UPL_ENCRYPT) ) {
5886 /*
5887 * ENCRYPTED SWAP:
5888 * We want to deny access to the target page
5889 * because its contents are about to be
5890 * encrypted and the user would be very
5891 * confused to see encrypted data instead
5892 * of their data.
5893 * We also set "encrypted_cleaning" to allow
5894 * vm_pageout_scan() to demote that page
5895 * from "adjacent/clean-in-place" to
5896 * "target/clean-and-free" if it bumps into
5897 * this page during its scanning while we're
5898 * still processing this cluster.
5899 */
5900 dst_page->busy = TRUE;
5901 dst_page->encrypted_cleaning = TRUE;
5902 }
5903 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
5904 if ( !VM_PAGE_WIRED(dst_page))
5905 dst_page->free_when_done = TRUE;
5906 }
5907 } else {
5908 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
5909 /*
5910 * Honor copy-on-write obligations
5911 *
5912 * The copy object has changed since we
5913 * last synchronized for copy-on-write.
5914 * Another copy object might have been
5915 * inserted while we released the object's
5916 * lock. Since someone could have seen the
5917 * original contents of the remaining pages
5918 * through that new object, we have to
5919 * synchronize with it again for the remaining
5920 * pages only. The previous pages are "busy"
5921 * so they can not be seen through the new
5922 * mapping. The new mapping will see our
5923 * upcoming changes for those previous pages,
5924 * but that's OK since they couldn't see what
5925 * was there before. It's just a race anyway
5926 * and there's no guarantee of consistency or
5927 * atomicity. We just don't want new mappings
5928 * to see both the *before* and *after* pages.
5929 */
5930 if (object->copy != VM_OBJECT_NULL) {
5931 vm_object_update(
5932 object,
5933 dst_offset,/* current offset */
5934 xfer_size, /* remaining size */
5935 NULL,
5936 NULL,
5937 FALSE, /* should_return */
5938 MEMORY_OBJECT_COPY_SYNC,
5939 VM_PROT_NO_CHANGE);
5940
5941 #if DEVELOPMENT || DEBUG
5942 upl_cow_again++;
5943 upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
5944 #endif
5945 }
5946 /*
5947 * remember the copy object we synced with
5948 */
5949 last_copy_object = object->copy;
5950 }
5951 dst_page = vm_page_lookup(object, dst_offset);
5952
5953 if (dst_page != VM_PAGE_NULL) {
5954
5955 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
5956 /*
5957 * skip over pages already present in the cache
5958 */
5959 if (user_page_list)
5960 user_page_list[entry].phys_addr = 0;
5961
5962 goto try_next_page;
5963 }
5964 if (dst_page->fictitious) {
5965 panic("need corner case for fictitious page");
5966 }
5967
5968 if (dst_page->busy || dst_page->cleaning) {
5969 /*
5970 * someone else is playing with the
5971 * page. We will have to wait.
5972 */
5973 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5974
5975 continue;
5976 }
5977 if (dst_page->laundry)
5978 vm_pageout_steal_laundry(dst_page, FALSE);
5979 } else {
5980 if (object->private) {
5981 /*
5982 * This is a nasty wrinkle for users
5983 * of upl who encounter device or
5984 * private memory.  However, it is
5985 * unavoidable: only a fault can
5986 * resolve the actual backing
5987 * physical page by asking the
5988 * backing device.
5989 */
5990 if (user_page_list)
5991 user_page_list[entry].phys_addr = 0;
5992
5993 goto try_next_page;
5994 }
5995 if (object->scan_collisions) {
5996 /*
5997 * the pageout_scan thread is trying to steal
5998 * pages from this object, but has run into our
5999 * lock... grab 2 pages from the head of the object...
6000 * the first is freed on behalf of pageout_scan, the
6001 * 2nd is for our own use... we use vm_object_page_grab
6002 * in both cases to avoid taking pages from the free
6003 * list since we are under memory pressure and our
6004 * lock on this object is getting in the way of
6005 * relieving it
6006 */
6007 dst_page = vm_object_page_grab(object);
6008
6009 if (dst_page != VM_PAGE_NULL)
6010 vm_page_release(dst_page,
6011 FALSE);
6012
6013 dst_page = vm_object_page_grab(object);
6014 }
6015 if (dst_page == VM_PAGE_NULL) {
6016 /*
6017 * need to allocate a page
6018 */
6019 dst_page = vm_page_grab_options(grab_options);
6020 }
6021 if (dst_page == VM_PAGE_NULL) {
6022 if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6023 /*
6024 * we don't want to stall waiting for pages to come onto the free list
6025 * while we're already holding absent pages in this UPL;
6026 * the caller will deal with the empty slots
6027 */
6028 if (user_page_list)
6029 user_page_list[entry].phys_addr = 0;
6030
6031 goto try_next_page;
6032 }
6033 /*
6034 * no pages available... wait
6035 * then try again for the same
6036 * offset...
6037 */
6038 vm_object_unlock(object);
6039
6040 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6041
6042 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6043
6044 VM_PAGE_WAIT();
6045 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6046
6047 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6048
6049 vm_object_lock(object);
6050
6051 continue;
6052 }
6053 vm_page_insert(dst_page, object, dst_offset);
6054
6055 dst_page->absent = TRUE;
6056 dst_page->busy = FALSE;
6057
6058 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6059 /*
6060 * if UPL_RET_ONLY_ABSENT was specified,
6061 * then we're definitely setting up an
6062 * upl for a clustered read/pagein
6063 * operation... mark the pages as clustered
6064 * so upl_commit_range can put them on the
6065 * speculative list
6066 */
6067 dst_page->clustered = TRUE;
6068
6069 if ( !(cntrl_flags & UPL_FILE_IO))
6070 VM_STAT_INCR(pageins);
6071 }
6072 }
6073 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6074
6075 /*
6076 * ENCRYPTED SWAP:
6077 */
6078 if (cntrl_flags & UPL_ENCRYPT) {
6079 /*
6080 * The page is going to be encrypted when we
6081 * get it from the pager, so mark it so.
6082 */
6083 dst_page->encrypted = TRUE;
6084 } else {
6085 /*
6086 * Otherwise, the page will not contain
6087 * encrypted data.
6088 */
6089 dst_page->encrypted = FALSE;
6090 }
6091 dst_page->overwriting = TRUE;
6092
6093 if (dst_page->pmapped) {
6094 if ( !(cntrl_flags & UPL_FILE_IO))
6095 /*
6096 * eliminate all mappings from the
6097 * original object and its prodigy
6098 */
6099 refmod_state = pmap_disconnect(phys_page);
6100 else
6101 refmod_state = pmap_get_refmod(phys_page);
6102 } else
6103 refmod_state = 0;
6104
6105 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6106 dirty = hw_dirty ? TRUE : dst_page->dirty;
6107
6108 if (cntrl_flags & UPL_SET_LITE) {
6109 unsigned int pg_num;
6110
6111 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
6112 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
6113 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
6114
6115 if (hw_dirty)
6116 pmap_clear_modify(phys_page);
6117
6118 /*
6119 * Mark original page as cleaning
6120 * in place.
6121 */
6122 dst_page->cleaning = TRUE;
6123 dst_page->precious = FALSE;
6124 } else {
6125 /*
6126 * use pageclean setup, it is more
6127 * convenient even for the pageout
6128 * cases here
6129 */
6130 vm_object_lock(upl->map_object);
6131 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6132 vm_object_unlock(upl->map_object);
6133
6134 alias_page->absent = FALSE;
6135 alias_page = NULL;
6136 }
6137
6138 if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6139 upl->flags &= ~UPL_CLEAR_DIRTY;
6140 upl->flags |= UPL_SET_DIRTY;
6141 dirty = TRUE;
6142 upl->flags |= UPL_SET_DIRTY;
6143 } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6144 /*
6145 * clean in place for read implies
6146 * that a write will be done on all
6147 * the pages that are dirty before
6148 * a upl commit is done. The caller
6149 * is obligated to preserve the
6150 * contents of all pages marked dirty
6151 */
6152 upl->flags |= UPL_CLEAR_DIRTY;
6153 }
6154 dst_page->dirty = dirty;
6155
6156 if (!dirty)
6157 dst_page->precious = TRUE;
6158
6159 if ( !VM_PAGE_WIRED(dst_page)) {
6160 /*
6161 * deny access to the target page while
6162 * it is being worked on
6163 */
6164 dst_page->busy = TRUE;
6165 } else
6166 dwp->dw_mask |= DW_vm_page_wire;
6167
6168 /*
6169 * We might be about to satisfy a fault which has been
6170 * requested. So no need for the "restart" bit.
6171 */
6172 dst_page->restart = FALSE;
6173 if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6174 /*
6175 * expect the page to be used
6176 */
6177 dwp->dw_mask |= DW_set_reference;
6178 }
6179 if (cntrl_flags & UPL_PRECIOUS) {
6180 if (object->internal) {
6181 SET_PAGE_DIRTY(dst_page, FALSE);
6182 dst_page->precious = FALSE;
6183 } else {
6184 dst_page->precious = TRUE;
6185 }
6186 } else {
6187 dst_page->precious = FALSE;
6188 }
6189 }
6190 if (dst_page->busy)
6191 upl->flags |= UPL_HAS_BUSY;
6192
6193 if (phys_page > upl->highest_page)
6194 upl->highest_page = phys_page;
6195 assert (!pmap_is_noencrypt(phys_page));
6196 if (user_page_list) {
6197 user_page_list[entry].phys_addr = phys_page;
6198 user_page_list[entry].free_when_done = dst_page->free_when_done;
6199 user_page_list[entry].absent = dst_page->absent;
6200 user_page_list[entry].dirty = dst_page->dirty;
6201 user_page_list[entry].precious = dst_page->precious;
6202 user_page_list[entry].device = FALSE;
6203 user_page_list[entry].needed = FALSE;
6204 if (dst_page->clustered == TRUE)
6205 user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6206 else
6207 user_page_list[entry].speculative = FALSE;
6208 user_page_list[entry].cs_validated = dst_page->cs_validated;
6209 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
6210 user_page_list[entry].cs_nx = dst_page->cs_nx;
6211 user_page_list[entry].mark = FALSE;
6212 }
6213 /*
6214 * if UPL_RET_ONLY_ABSENT is set, then
6215 * we are working with a fresh page and we've
6216 * just set the clustered flag on it to
6217 * indicate that it was dragged in as part of a
6218 * speculative cluster... so leave it alone
6219 */
6220 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6221 /*
6222 * someone is explicitly grabbing this page...
6223 * update clustered and speculative state
6224 *
6225 */
6226 if (dst_page->clustered)
6227 VM_PAGE_CONSUME_CLUSTERED(dst_page);
6228 }
6229 try_next_page:
6230 if (dwp->dw_mask) {
6231 if (dwp->dw_mask & DW_vm_page_activate)
6232 VM_STAT_INCR(reactivations);
6233
6234 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6235
6236 if (dw_count >= dw_limit) {
6237 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
6238
6239 dwp = &dw_array[0];
6240 dw_count = 0;
6241 }
6242 }
6243 entry++;
6244 dst_offset += PAGE_SIZE_64;
6245 xfer_size -= PAGE_SIZE;
6246 }
6247 if (dw_count)
6248 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
6249
6250 if (alias_page != NULL) {
6251 VM_PAGE_FREE(alias_page);
6252 }
6253
6254 if (page_list_count != NULL) {
6255 if (upl->flags & UPL_INTERNAL)
6256 *page_list_count = 0;
6257 else if (*page_list_count > entry)
6258 *page_list_count = entry;
6259 }
6260 #if UPL_DEBUG
6261 upl->upl_state = 1;
6262 #endif
6263 vm_object_unlock(object);
6264
6265 return KERN_SUCCESS;
6266 }
6267
6268 /*
6269 * Routine: vm_object_super_upl_request
6270 * Purpose:
6271 * Cause the population of a portion of a vm_object
6272 * in much the same way as memory_object_upl_request.
6273 * Depending on the nature of the request, the pages
6274 * returned may contain valid data or be uninitialized.
6275 * However, the region may be expanded up to the super
6276 * cluster size provided.
6277 */
6278
6279 __private_extern__ kern_return_t
6280 vm_object_super_upl_request(
6281 vm_object_t object,
6282 vm_object_offset_t offset,
6283 upl_size_t size,
6284 upl_size_t super_cluster,
6285 upl_t *upl,
6286 upl_page_info_t *user_page_list,
6287 unsigned int *page_list_count,
6288 upl_control_flags_t cntrl_flags)
6289 {
6290 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR)==UPL_VECTOR))
6291 return KERN_FAILURE;
6292
6293 assert(object->paging_in_progress);
6294 offset = offset - object->paging_offset;
6295
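/*
 * When the caller allows a super cluster larger than the request,
 * expand the request: align the start down to a super_cluster
 * boundary, double the cluster if the original range straddles a
 * boundary, clip the result against the object size, and grow it
 * back if the original request extends past that.
 */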
6296 if (super_cluster > size) {
6297
6298 vm_object_offset_t base_offset;
6299 upl_size_t super_size;
6300 vm_object_size_t super_size_64;
6301
6302 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6303 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
6304 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6305 super_size = (upl_size_t) super_size_64;
6306 assert(super_size == super_size_64);
6307
6308 if (offset > (base_offset + super_size)) {
6309 panic("vm_object_super_upl_request: Missed target pageout"
6310 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6311 offset, base_offset, super_size, super_cluster,
6312 size, object->paging_offset);
6313 }
6314 /*
6315 * apparently there is a case where the vm requests a
6316 * page to be written out whose offset is beyond the
6317 * object size
6318 */
6319 if ((offset + size) > (base_offset + super_size)) {
6320 super_size_64 = (offset + size) - base_offset;
6321 super_size = (upl_size_t) super_size_64;
6322 assert(super_size == super_size_64);
6323 }
6324
6325 offset = base_offset;
6326 size = super_size;
6327 }
6328 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
6329 }
6330
6331
6332 kern_return_t
6333 vm_map_create_upl(
6334 vm_map_t map,
6335 vm_map_address_t offset,
6336 upl_size_t *upl_size,
6337 upl_t *upl,
6338 upl_page_info_array_t page_list,
6339 unsigned int *count,
6340 upl_control_flags_t *flags)
6341 {
6342 vm_map_entry_t entry;
6343 upl_control_flags_t caller_flags;
6344 int force_data_sync;
6345 int sync_cow_data;
6346 vm_object_t local_object;
6347 vm_map_offset_t local_offset;
6348 vm_map_offset_t local_start;
6349 kern_return_t ret;
6350
6351 assert(page_aligned(offset));
6352
6353 caller_flags = *flags;
6354
6355 if (caller_flags & ~UPL_VALID_FLAGS) {
6356 /*
6357 * For forward compatibility's sake,
6358 * reject any unknown flag.
6359 */
6360 return KERN_INVALID_VALUE;
6361 }
6362 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6363 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6364
6365 if (upl == NULL)
6366 return KERN_INVALID_ARGUMENT;
6367
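/*
 * Everything below runs with the map lock held for read.  Whenever we
 * have to give that lock up (to create or shadow an object, resolve a
 * copy-on-write fault, or synchronize with a copy/shadow object), we
 * jump back here and redo the lookup, since the map may have changed
 * while it was unlocked.
 */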
6368 REDISCOVER_ENTRY:
6369 vm_map_lock_read(map);
6370
6371 if (!vm_map_lookup_entry(map, offset, &entry)) {
6372 vm_map_unlock_read(map);
6373 return KERN_FAILURE;
6374 }
6375
6376 if ((entry->vme_end - offset) < *upl_size) {
6377 *upl_size = (upl_size_t) (entry->vme_end - offset);
6378 assert(*upl_size == entry->vme_end - offset);
6379 }
6380
6381 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6382 *flags = 0;
6383
6384 if (!entry->is_sub_map &&
6385 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6386 if (VME_OBJECT(entry)->private)
6387 *flags = UPL_DEV_MEMORY;
6388
6389 if (VME_OBJECT(entry)->phys_contiguous)
6390 *flags |= UPL_PHYS_CONTIG;
6391 }
6392 vm_map_unlock_read(map);
6393 return KERN_SUCCESS;
6394 }
6395
6396 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6397 !VME_OBJECT(entry)->phys_contiguous) {
6398 if (*upl_size > MAX_UPL_SIZE_BYTES)
6399 *upl_size = MAX_UPL_SIZE_BYTES;
6400 }
6401
6402 /*
6403 * Create an object if necessary.
6404 */
6405 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6406
6407 if (vm_map_lock_read_to_write(map))
6408 goto REDISCOVER_ENTRY;
6409
6410 VME_OBJECT_SET(entry,
6411 vm_object_allocate((vm_size_t)
6412 (entry->vme_end -
6413 entry->vme_start)));
6414 VME_OFFSET_SET(entry, 0);
6415
6416 vm_map_lock_write_to_read(map);
6417 }
6418
6419 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6420 !(entry->protection & VM_PROT_WRITE)) {
6421 vm_map_unlock_read(map);
6422 return KERN_PROTECTION_FAILURE;
6423 }
6424
6425
6426 local_object = VME_OBJECT(entry);
6427 assert(local_object != VM_OBJECT_NULL);
6428
6429 if (!entry->is_sub_map &&
6430 !entry->needs_copy &&
6431 *upl_size != 0 &&
6432 local_object->vo_size > *upl_size && /* partial UPL */
6433 entry->wired_count == 0 && /* No COW for entries that are wired */
6434 (map->pmap != kernel_pmap) && /* alias checks */
6435 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6436 ||
6437 (/* case 2 */
6438 local_object->internal &&
6439 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6440 local_object->ref_count > 1))) {
6441 vm_prot_t prot;
6442
6443 /*
6444 * Case 1:
6445 * Set up the targeted range for copy-on-write to avoid
6446 * applying true_share/copy_delay to the entire object.
6447 *
6448 * Case 2:
6449 * This map entry covers only part of an internal
6450 * object. There could be other map entries covering
6451 * other areas of this object and some of these map
6452 * entries could be marked as "needs_copy", which
6453 * assumes that the object is COPY_SYMMETRIC.
6454 * To avoid marking this object as COPY_DELAY and
6455 * "true_share", let's shadow it and mark the new
6456 * (smaller) object as "true_share" and COPY_DELAY.
6457 */
6458
6459 if (vm_map_lock_read_to_write(map)) {
6460 goto REDISCOVER_ENTRY;
6461 }
6462 vm_map_lock_assert_exclusive(map);
6463 assert(VME_OBJECT(entry) == local_object);
6464
6465 vm_map_clip_start(map,
6466 entry,
6467 vm_map_trunc_page(offset,
6468 VM_MAP_PAGE_MASK(map)));
6469 vm_map_clip_end(map,
6470 entry,
6471 vm_map_round_page(offset + *upl_size,
6472 VM_MAP_PAGE_MASK(map)));
6473 if ((entry->vme_end - offset) < *upl_size) {
6474 *upl_size = (upl_size_t) (entry->vme_end - offset);
6475 assert(*upl_size == entry->vme_end - offset);
6476 }
6477
6478 prot = entry->protection & ~VM_PROT_WRITE;
6479 if (override_nx(map, VME_ALIAS(entry)) && prot)
6480 prot |= VM_PROT_EXECUTE;
6481 vm_object_pmap_protect(local_object,
6482 VME_OFFSET(entry),
6483 entry->vme_end - entry->vme_start,
6484 ((entry->is_shared ||
6485 map->mapped_in_other_pmaps)
6486 ? PMAP_NULL
6487 : map->pmap),
6488 entry->vme_start,
6489 prot);
6490
6491 assert(entry->wired_count == 0);
6492
6493 /*
6494 * Lock the VM object and re-check its status: if it's mapped
6495 * in another address space, we could still be racing with
6496 * another thread holding that other VM map exclusively.
6497 */
6498 vm_object_lock(local_object);
6499 if (local_object->true_share) {
6500 /* object is already in proper state: no COW needed */
6501 assert(local_object->copy_strategy !=
6502 MEMORY_OBJECT_COPY_SYMMETRIC);
6503 } else {
6504 /* not true_share: ask for copy-on-write below */
6505 assert(local_object->copy_strategy ==
6506 MEMORY_OBJECT_COPY_SYMMETRIC);
6507 entry->needs_copy = TRUE;
6508 }
6509 vm_object_unlock(local_object);
6510
6511 vm_map_lock_write_to_read(map);
6512 }
6513
6514 if (entry->needs_copy) {
6515 /*
6516 * Honor copy-on-write for COPY_SYMMETRIC
6517 * strategy.
6518 */
6519 vm_map_t local_map;
6520 vm_object_t object;
6521 vm_object_offset_t new_offset;
6522 vm_prot_t prot;
6523 boolean_t wired;
6524 vm_map_version_t version;
6525 vm_map_t real_map;
6526 vm_prot_t fault_type;
6527
6528 local_map = map;
6529
6530 if (caller_flags & UPL_COPYOUT_FROM) {
6531 fault_type = VM_PROT_READ | VM_PROT_COPY;
6532 vm_counters.create_upl_extra_cow++;
6533 vm_counters.create_upl_extra_cow_pages +=
6534 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6535 } else {
6536 fault_type = VM_PROT_WRITE;
6537 }
6538 if (vm_map_lookup_locked(&local_map,
6539 offset, fault_type,
6540 OBJECT_LOCK_EXCLUSIVE,
6541 &version, &object,
6542 &new_offset, &prot, &wired,
6543 NULL,
6544 &real_map) != KERN_SUCCESS) {
6545 if (fault_type == VM_PROT_WRITE) {
6546 vm_counters.create_upl_lookup_failure_write++;
6547 } else {
6548 vm_counters.create_upl_lookup_failure_copy++;
6549 }
6550 vm_map_unlock_read(local_map);
6551 return KERN_FAILURE;
6552 }
6553 if (real_map != map)
6554 vm_map_unlock(real_map);
6555 vm_map_unlock_read(local_map);
6556
6557 vm_object_unlock(object);
6558
6559 goto REDISCOVER_ENTRY;
6560 }
6561
6562 if (entry->is_sub_map) {
6563 vm_map_t submap;
6564
6565 submap = VME_SUBMAP(entry);
6566 local_start = entry->vme_start;
6567 local_offset = VME_OFFSET(entry);
6568
6569 vm_map_reference(submap);
6570 vm_map_unlock_read(map);
6571
6572 ret = vm_map_create_upl(submap,
6573 local_offset + (offset - local_start),
6574 upl_size, upl, page_list, count, flags);
6575 vm_map_deallocate(submap);
6576
6577 return ret;
6578 }
6579
6580 if (sync_cow_data &&
6581 (VME_OBJECT(entry)->shadow ||
6582 VME_OBJECT(entry)->copy)) {
6583 local_object = VME_OBJECT(entry);
6584 local_start = entry->vme_start;
6585 local_offset = VME_OFFSET(entry);
6586
6587 vm_object_reference(local_object);
6588 vm_map_unlock_read(map);
6589
6590 if (local_object->shadow && local_object->copy) {
6591 vm_object_lock_request(local_object->shadow,
6592 ((vm_object_offset_t)
6593 ((offset - local_start) +
6594 local_offset) +
6595 local_object->vo_shadow_offset),
6596 *upl_size, FALSE,
6597 MEMORY_OBJECT_DATA_SYNC,
6598 VM_PROT_NO_CHANGE);
6599 }
6600 sync_cow_data = FALSE;
6601 vm_object_deallocate(local_object);
6602
6603 goto REDISCOVER_ENTRY;
6604 }
6605 if (force_data_sync) {
6606 local_object = VME_OBJECT(entry);
6607 local_start = entry->vme_start;
6608 local_offset = VME_OFFSET(entry);
6609
6610 vm_object_reference(local_object);
6611 vm_map_unlock_read(map);
6612
6613 vm_object_lock_request(local_object,
6614 ((vm_object_offset_t)
6615 ((offset - local_start) +
6616 local_offset)),
6617 (vm_object_size_t)*upl_size,
6618 FALSE,
6619 MEMORY_OBJECT_DATA_SYNC,
6620 VM_PROT_NO_CHANGE);
6621
6622 force_data_sync = FALSE;
6623 vm_object_deallocate(local_object);
6624
6625 goto REDISCOVER_ENTRY;
6626 }
6627 if (VME_OBJECT(entry)->private)
6628 *flags = UPL_DEV_MEMORY;
6629 else
6630 *flags = 0;
6631
6632 if (VME_OBJECT(entry)->phys_contiguous)
6633 *flags |= UPL_PHYS_CONTIG;
6634
6635 local_object = VME_OBJECT(entry);
6636 local_offset = VME_OFFSET(entry);
6637 local_start = entry->vme_start;
6638
6639
6640 vm_object_lock(local_object);
6641
6642 /*
6643 * Ensure that this object is "true_share" and "copy_delay" now,
6644 * while we're still holding the VM map lock. After we unlock the map,
6645 * anything could happen to that mapping, including some copy-on-write
6646 * activity. We need to make sure that the IOPL will point at the
6647 * same memory as the mapping.
6648 */
6649 if (local_object->true_share) {
6650 assert(local_object->copy_strategy !=
6651 MEMORY_OBJECT_COPY_SYMMETRIC);
6652 } else if (local_object != kernel_object &&
6653 local_object != compressor_object &&
6654 !local_object->phys_contiguous) {
6655 #if VM_OBJECT_TRACKING_OP_TRUESHARE
6656 if (!local_object->true_share &&
6657 vm_object_tracking_inited) {
6658 void *bt[VM_OBJECT_TRACKING_BTDEPTH];
6659 int num = 0;
6660 num = OSBacktrace(bt,
6661 VM_OBJECT_TRACKING_BTDEPTH);
6662 btlog_add_entry(vm_object_tracking_btlog,
6663 local_object,
6664 VM_OBJECT_TRACKING_OP_TRUESHARE,
6665 bt,
6666 num);
6667 }
6668 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
6669 local_object->true_share = TRUE;
6670 if (local_object->copy_strategy ==
6671 MEMORY_OBJECT_COPY_SYMMETRIC) {
6672 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6673 }
6674 }
6675
6676 vm_object_reference_locked(local_object);
6677 vm_object_unlock(local_object);
6678
6679 vm_map_unlock_read(map);
6680
6681 ret = vm_object_iopl_request(local_object,
6682 ((vm_object_offset_t)
6683 ((offset - local_start) + local_offset)),
6684 *upl_size,
6685 upl,
6686 page_list,
6687 count,
6688 caller_flags);
6689 vm_object_deallocate(local_object);
6690
6691 return ret;
6692 }
6693
6694 /*
6695 * Internal routine to enter a UPL into a VM map.
6696 *
6697 * JMM - This should just be doable through the standard
6698 * vm_map_enter() API.
6699 */
6700 kern_return_t
6701 vm_map_enter_upl(
6702 vm_map_t map,
6703 upl_t upl,
6704 vm_map_offset_t *dst_addr)
6705 {
6706 vm_map_size_t size;
6707 vm_object_offset_t offset;
6708 vm_map_offset_t addr;
6709 vm_page_t m;
6710 kern_return_t kr;
6711 int isVectorUPL = 0, curr_upl=0;
6712 upl_t vector_upl = NULL;
6713 vm_offset_t vector_upl_dst_addr = 0;
6714 vm_map_t vector_upl_submap = NULL;
6715 upl_offset_t subupl_offset = 0;
6716 upl_size_t subupl_size = 0;
6717
6718 if (upl == UPL_NULL)
6719 return KERN_INVALID_ARGUMENT;
6720
6721 if((isVectorUPL = vector_upl_is_valid(upl))) {
6722 int mapped=0,valid_upls=0;
6723 vector_upl = upl;
6724
6725 upl_lock(vector_upl);
6726 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
6727 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
6728 if(upl == NULL)
6729 continue;
6730 valid_upls++;
6731 if (UPL_PAGE_LIST_MAPPED & upl->flags)
6732 mapped++;
6733 }
6734
6735 if(mapped) {
6736 if(mapped != valid_upls)
6737 panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped\n", mapped, valid_upls);
6738 else {
6739 upl_unlock(vector_upl);
6740 return KERN_FAILURE;
6741 }
6742 }
6743
6744 kr = kmem_suballoc(map, &vector_upl_dst_addr, vector_upl->size, FALSE, VM_FLAGS_ANYWHERE, &vector_upl_submap);
6745 if( kr != KERN_SUCCESS )
6746 panic("Vector UPL submap allocation failed\n");
6747 map = vector_upl_submap;
6748 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
6749 curr_upl=0;
6750 }
6751 else
6752 upl_lock(upl);
6753
6754 process_upl_to_enter:
6755 if(isVectorUPL){
6756 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
6757 *dst_addr = vector_upl_dst_addr;
6758 upl_unlock(vector_upl);
6759 return KERN_SUCCESS;
6760 }
6761 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
6762 if(upl == NULL)
6763 goto process_upl_to_enter;
6764
6765 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
6766 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
6767 } else {
6768 /*
6769 * check to see if already mapped
6770 */
6771 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
6772 upl_unlock(upl);
6773 return KERN_FAILURE;
6774 }
6775 }
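/*
 * Unless the UPL can be mapped directly (device memory or an IO-wire
 * UPL with no busy pages), build a shadow object: for each page named
 * in the lite list, allocate a fictitious "private" alias page that
 * points at the same physical page, wire it, and insert it into the
 * new map_object.  The mapping below is then made against that shadow
 * object instead of the original.
 */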
6776 if ((!(upl->flags & UPL_SHADOWED)) &&
6777 ((upl->flags & UPL_HAS_BUSY) ||
6778 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
6779
6780 vm_object_t object;
6781 vm_page_t alias_page;
6782 vm_object_offset_t new_offset;
6783 unsigned int pg_num;
6784 wpl_array_t lite_list;
6785
6786 if (upl->flags & UPL_INTERNAL) {
6787 lite_list = (wpl_array_t)
6788 ((((uintptr_t)upl) + sizeof(struct upl))
6789 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
6790 } else {
6791 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
6792 }
6793 object = upl->map_object;
6794 upl->map_object = vm_object_allocate(upl->size);
6795
6796 vm_object_lock(upl->map_object);
6797
6798 upl->map_object->shadow = object;
6799 upl->map_object->pageout = TRUE;
6800 upl->map_object->can_persist = FALSE;
6801 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
6802 upl->map_object->vo_shadow_offset = upl->offset - object->paging_offset;
6803 upl->map_object->wimg_bits = object->wimg_bits;
6804 offset = upl->map_object->vo_shadow_offset;
6805 new_offset = 0;
6806 size = upl->size;
6807
6808 upl->flags |= UPL_SHADOWED;
6809
6810 while (size) {
6811 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
6812 assert(pg_num == new_offset / PAGE_SIZE);
6813
6814 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
6815
6816 VM_PAGE_GRAB_FICTITIOUS(alias_page);
6817
6818 vm_object_lock(object);
6819
6820 m = vm_page_lookup(object, offset);
6821 if (m == VM_PAGE_NULL) {
6822 panic("vm_upl_map: page missing\n");
6823 }
6824
6825 /*
6826 * Convert the fictitious page to a private
6827 * shadow of the real page.
6828 */
6829 assert(alias_page->fictitious);
6830 alias_page->fictitious = FALSE;
6831 alias_page->private = TRUE;
6832 alias_page->free_when_done = TRUE;
6833 /*
6834 * since m is a page in the upl it must
6835 * already be wired or BUSY, so it's
6836 * safe to assign the underlying physical
6837 * page to the alias
6838 */
6839 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
6840
6841 vm_object_unlock(object);
6842
6843 vm_page_lockspin_queues();
6844 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
6845 vm_page_unlock_queues();
6846
6847 /*
6848 * ENCRYPTED SWAP:
6849 * The virtual page ("m") has to be wired in some way
6850 * here or its backing physical page could
6851 * be recycled at any time.
6852 * Assuming this is enforced by the caller, we can't
6853 * get an encrypted page here. Since the encryption
6854 * key depends on the VM page's "pager" object and
6855 * the "paging_offset", we couldn't handle 2 pageable
6856 * VM pages (with different pagers and paging_offsets)
6857 * sharing the same physical page: we could end up
6858 * encrypting with one key (via one VM page) and
6859 * decrypting with another key (via the alias VM page).
6860 */
6861 ASSERT_PAGE_DECRYPTED(m);
6862
6863 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
6864
6865 assert(!alias_page->wanted);
6866 alias_page->busy = FALSE;
6867 alias_page->absent = FALSE;
6868 }
6869 size -= PAGE_SIZE;
6870 offset += PAGE_SIZE_64;
6871 new_offset += PAGE_SIZE_64;
6872 }
6873 vm_object_unlock(upl->map_object);
6874 }
6875 if (upl->flags & UPL_SHADOWED)
6876 offset = 0;
6877 else
6878 offset = upl->offset - upl->map_object->paging_offset;
6879
6880 size = upl->size;
6881
6882 vm_object_reference(upl->map_object);
6883
6884 if(!isVectorUPL) {
6885 *dst_addr = 0;
6886 /*
6887 * NEED A UPL_MAP ALIAS
6888 */
6889 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
6890 VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK),
6891 upl->map_object, offset, FALSE,
6892 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
6893
6894 if (kr != KERN_SUCCESS) {
6895 vm_object_deallocate(upl->map_object);
6896 upl_unlock(upl);
6897 return(kr);
6898 }
6899 }
6900 else {
6901 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
6902 VM_FLAGS_FIXED | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK),
6903 upl->map_object, offset, FALSE,
6904 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
6905 if(kr)
6906 panic("vm_map_enter failed for a Vector UPL\n");
6907 }
6908 vm_object_lock(upl->map_object);
6909
6910 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
6911 m = vm_page_lookup(upl->map_object, offset);
6912
6913 if (m) {
6914 m->pmapped = TRUE;
6915
6916 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
6917 * but only in kernel space. If this was on a user map,
6918 * we'd have to set the wpmapped bit. */
6919 /* m->wpmapped = TRUE; */
6920 assert(map->pmap == kernel_pmap);
6921
6922 PMAP_ENTER(map->pmap, addr, m, VM_PROT_DEFAULT, VM_PROT_NONE, 0, TRUE);
6923 }
6924 offset += PAGE_SIZE_64;
6925 }
6926 vm_object_unlock(upl->map_object);
6927
6928 /*
6929 * hold a reference for the mapping
6930 */
6931 upl->ref_count++;
6932 upl->flags |= UPL_PAGE_LIST_MAPPED;
6933 upl->kaddr = (vm_offset_t) *dst_addr;
6934 assert(upl->kaddr == *dst_addr);
6935
6936 if(isVectorUPL)
6937 goto process_upl_to_enter;
6938
6939 upl_unlock(upl);
6940
6941 return KERN_SUCCESS;
6942 }
6943
6944 /*
6945 * Internal routine to remove a UPL mapping from a VM map.
6946 *
6947 * XXX - This should just be doable through a standard
6948 * vm_map_remove() operation. Otherwise, implicit clean-up
6949 * of the target map won't be able to correctly remove
6950 * these (and release the reference on the UPL). Having
6951 * to do this means we can't map these into user-space
6952 * maps yet.
6953 */
6954 kern_return_t
6955 vm_map_remove_upl(
6956 vm_map_t map,
6957 upl_t upl)
6958 {
6959 vm_address_t addr;
6960 upl_size_t size;
6961 int isVectorUPL = 0, curr_upl = 0;
6962 upl_t vector_upl = NULL;
6963
6964 if (upl == UPL_NULL)
6965 return KERN_INVALID_ARGUMENT;
6966
6967 if((isVectorUPL = vector_upl_is_valid(upl))) {
6968 int unmapped=0, valid_upls=0;
6969 vector_upl = upl;
6970 upl_lock(vector_upl);
6971 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
6972 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
6973 if(upl == NULL)
6974 continue;
6975 valid_upls++;
6976 if (!(UPL_PAGE_LIST_MAPPED & upl->flags))
6977 unmapped++;
6978 }
6979
6980 if(unmapped) {
6981 if(unmapped != valid_upls)
6982 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped\n", unmapped, valid_upls);
6983 else {
6984 upl_unlock(vector_upl);
6985 return KERN_FAILURE;
6986 }
6987 }
6988 curr_upl=0;
6989 }
6990 else
6991 upl_lock(upl);
6992
6993 process_upl_to_remove:
6994 if(isVectorUPL) {
6995 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
6996 vm_map_t v_upl_submap;
6997 vm_offset_t v_upl_submap_dst_addr;
6998 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
6999
7000 vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS);
7001 vm_map_deallocate(v_upl_submap);
7002 upl_unlock(vector_upl);
7003 return KERN_SUCCESS;
7004 }
7005
7006 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7007 if(upl == NULL)
7008 goto process_upl_to_remove;
7009 }
7010
7011 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7012 addr = upl->kaddr;
7013 size = upl->size;
7014
7015 assert(upl->ref_count > 1);
7016 upl->ref_count--; /* removing mapping ref */
7017
7018 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7019 upl->kaddr = (vm_offset_t) 0;
7020
7021 if(!isVectorUPL) {
7022 upl_unlock(upl);
7023
7024 vm_map_remove(
7025 map,
7026 vm_map_trunc_page(addr,
7027 VM_MAP_PAGE_MASK(map)),
7028 vm_map_round_page(addr + size,
7029 VM_MAP_PAGE_MASK(map)),
7030 VM_MAP_NO_FLAGS);
7031
7032 return KERN_SUCCESS;
7033 }
7034 else {
7035 /*
7036 * If it's a Vectored UPL, we'll be removing the entire
7037 * submap anyway, so no need to remove individual UPL
7038 * element mappings from within the submap
7039 */
7040 goto process_upl_to_remove;
7041 }
7042 }
7043 upl_unlock(upl);
7044
7045 return KERN_FAILURE;
7046 }
7047
7048
7049 kern_return_t
7050 upl_commit_range(
7051 upl_t upl,
7052 upl_offset_t offset,
7053 upl_size_t size,
7054 int flags,
7055 upl_page_info_t *page_list,
7056 mach_msg_type_number_t count,
7057 boolean_t *empty)
7058 {
7059 upl_size_t xfer_size, subupl_size = size;
7060 vm_object_t shadow_object;
7061 vm_object_t object;
7062 vm_object_t m_object;
7063 vm_object_offset_t target_offset;
7064 upl_offset_t subupl_offset = offset;
7065 int entry;
7066 wpl_array_t lite_list;
7067 int occupied;
7068 int clear_refmod = 0;
7069 int pgpgout_count = 0;
7070 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
7071 struct vm_page_delayed_work *dwp;
7072 int dw_count;
7073 int dw_limit;
7074 int isVectorUPL = 0;
7075 upl_t vector_upl = NULL;
7076 boolean_t should_be_throttled = FALSE;
7077
7078 vm_page_t nxt_page = VM_PAGE_NULL;
7079 int fast_path_possible = 0;
7080 int fast_path_full_commit = 0;
7081 int throttle_page = 0;
7082 int unwired_count = 0;
7083 int local_queue_count = 0;
7084 vm_page_t first_local, last_local;
7085
7086 *empty = FALSE;
7087
7088 if (upl == UPL_NULL)
7089 return KERN_INVALID_ARGUMENT;
7090
7091 if (count == 0)
7092 page_list = NULL;
7093
7094 if((isVectorUPL = vector_upl_is_valid(upl))) {
7095 vector_upl = upl;
7096 upl_lock(vector_upl);
7097 }
7098 else
7099 upl_lock(upl);
7100
7101 process_upl_to_commit:
7102
7103 if(isVectorUPL) {
7104 size = subupl_size;
7105 offset = subupl_offset;
7106 if(size == 0) {
7107 upl_unlock(vector_upl);
7108 return KERN_SUCCESS;
7109 }
7110 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7111 if(upl == NULL) {
7112 upl_unlock(vector_upl);
7113 return KERN_FAILURE;
7114 }
7115 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
7116 subupl_size -= size;
7117 subupl_offset += size;
7118 }
7119
7120 #if UPL_DEBUG
7121 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7122 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7123
7124 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7125 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7126
7127 upl->upl_commit_index++;
7128 }
7129 #endif
7130 if (upl->flags & UPL_DEVICE_MEMORY)
7131 xfer_size = 0;
7132 else if ((offset + size) <= upl->size)
7133 xfer_size = size;
7134 else {
7135 if(!isVectorUPL)
7136 upl_unlock(upl);
7137 else {
7138 upl_unlock(vector_upl);
7139 }
7140 return KERN_FAILURE;
7141 }
7142 if (upl->flags & UPL_SET_DIRTY)
7143 flags |= UPL_COMMIT_SET_DIRTY;
7144 if (upl->flags & UPL_CLEAR_DIRTY)
7145 flags |= UPL_COMMIT_CLEAR_DIRTY;
7146
7147 if (upl->flags & UPL_INTERNAL)
7148 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
7149 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
7150 else
7151 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
7152
7153 object = upl->map_object;
7154
7155 if (upl->flags & UPL_SHADOWED) {
7156 vm_object_lock(object);
7157 shadow_object = object->shadow;
7158 } else {
7159 shadow_object = object;
7160 }
7161 entry = offset/PAGE_SIZE;
7162 target_offset = (vm_object_offset_t)offset;
7163
7164 assert(!(target_offset & PAGE_MASK));
7165 assert(!(xfer_size & PAGE_MASK));
7166
7167 if (upl->flags & UPL_KERNEL_OBJECT)
7168 vm_object_lock_shared(shadow_object);
7169 else
7170 vm_object_lock(shadow_object);
7171
7172 if (upl->flags & UPL_ACCESS_BLOCKED) {
7173 assert(shadow_object->blocked_access);
7174 shadow_object->blocked_access = FALSE;
7175 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7176 }
7177
7178 if (shadow_object->code_signed) {
7179 /*
7180 * CODE SIGNING:
7181 * If the object is code-signed, do not let this UPL tell
7182 * us if the pages are valid or not. Let the pages be
7183 * validated by VM the normal way (when they get mapped or
7184 * copied).
7185 */
7186 flags &= ~UPL_COMMIT_CS_VALIDATED;
7187 }
7188 if (! page_list) {
7189 /*
7190 * No page list to get the code-signing info from !?
7191 */
7192 flags &= ~UPL_COMMIT_CS_VALIDATED;
7193 }
7194 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal)
7195 should_be_throttled = TRUE;
7196
7197 dwp = &dw_array[0];
7198 dw_count = 0;
7199 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7200
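/*
 * Fast path: for a plain (non-vector) IO-wire UPL committed against an
 * object that is neither volatile nor empty, pages being unwired here
 * are collected on a local list instead of going through the per-page
 * delayed-work machinery, and if the commit spans the whole object we
 * can walk the object's memq directly rather than looking up each page
 * by offset.
 */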
7201 if ((upl->flags & UPL_IO_WIRE) &&
7202 !(flags & UPL_COMMIT_FREE_ABSENT) &&
7203 !isVectorUPL &&
7204 shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7205 shadow_object->purgable != VM_PURGABLE_EMPTY) {
7206
7207 if (!vm_page_queue_empty(&shadow_object->memq)) {
7208
7209 if (size == shadow_object->vo_size) {
7210 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7211 fast_path_full_commit = 1;
7212 }
7213 fast_path_possible = 1;
7214
7215 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7216 (shadow_object->purgable == VM_PURGABLE_DENY ||
7217 shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7218 shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7219 throttle_page = 1;
7220 }
7221 }
7222 }
7223 first_local = VM_PAGE_NULL;
7224 last_local = VM_PAGE_NULL;
7225
7226 while (xfer_size) {
7227 vm_page_t t, m;
7228
7229 dwp->dw_mask = 0;
7230 clear_refmod = 0;
7231
7232 m = VM_PAGE_NULL;
7233
7234 if (upl->flags & UPL_LITE) {
7235 unsigned int pg_num;
7236
7237 if (nxt_page != VM_PAGE_NULL) {
7238 m = nxt_page;
7239 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq);
7240 target_offset = m->offset;
7241 }
7242 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
7243 assert(pg_num == target_offset/PAGE_SIZE);
7244
7245 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
7246 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
7247
7248 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL)
7249 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
7250 } else
7251 m = NULL;
7252 }
7253 if (upl->flags & UPL_SHADOWED) {
7254 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7255
7256 t->free_when_done = FALSE;
7257
7258 VM_PAGE_FREE(t);
7259
7260 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL)
7261 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7262 }
7263 }
7264 if (m == VM_PAGE_NULL)
7265 goto commit_next_page;
7266
7267 m_object = VM_PAGE_OBJECT(m);
7268
7269 if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7270 assert(m->busy);
7271
7272 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7273 goto commit_next_page;
7274 }
7275
7276 if (flags & UPL_COMMIT_CS_VALIDATED) {
7277 /*
7278 * CODE SIGNING:
7279 * Set the code signing bits according to
7280 * what the UPL says they should be.
7281 */
7282 m->cs_validated = page_list[entry].cs_validated;
7283 m->cs_tainted = page_list[entry].cs_tainted;
7284 m->cs_nx = page_list[entry].cs_nx;
7285 }
7286 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL)
7287 m->written_by_kernel = TRUE;
7288
7289 if (upl->flags & UPL_IO_WIRE) {
7290
7291 if (page_list)
7292 page_list[entry].phys_addr = 0;
7293
7294 if (flags & UPL_COMMIT_SET_DIRTY) {
7295 SET_PAGE_DIRTY(m, FALSE);
7296 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7297 m->dirty = FALSE;
7298
7299 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
7300 m->cs_validated && !m->cs_tainted) {
7301 /*
7302 * CODE SIGNING:
7303 * This page is no longer dirty
7304 * but could have been modified,
7305 * so it will need to be
7306 * re-validated.
7307 */
7308 if (m->slid) {
7309 panic("upl_commit_range(%p): page %p was slid\n",
7310 upl, m);
7311 }
7312 assert(!m->slid);
7313 m->cs_validated = FALSE;
7314 #if DEVELOPMENT || DEBUG
7315 vm_cs_validated_resets++;
7316 #endif
7317 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7318 }
7319 clear_refmod |= VM_MEM_MODIFIED;
7320 }
7321 if (upl->flags & UPL_ACCESS_BLOCKED) {
7322 /*
7323 * We blocked access to the pages in this UPL.
7324 * Clear the "busy" bit and wake up any waiter
7325 * for this page.
7326 */
7327 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7328 }
7329 if (fast_path_possible) {
7330 assert(m_object->purgable != VM_PURGABLE_EMPTY);
7331 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7332 if (m->absent) {
7333 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
7334 assert(m->wire_count == 0);
7335 assert(m->busy);
7336
7337 m->absent = FALSE;
7338 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7339 } else {
7340 if (m->wire_count == 0)
7341 panic("wire_count == 0, m = %p, obj = %p\n", m, shadow_object);
7342 assert(m->vm_page_q_state == VM_PAGE_IS_WIRED);
7343
7344 /*
7345 * XXX FBDP need to update some other
7346 * counters here (purgeable_wired_count)
7347 * (ledgers), ...
7348 */
7349 assert(m->wire_count > 0);
7350 m->wire_count--;
7351
7352 if (m->wire_count == 0) {
7353 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
7354 unwired_count++;
7355 }
7356 }
7357 if (m->wire_count == 0) {
7358 assert(m->pageq.next == 0 && m->pageq.prev == 0);
7359
7360 if (last_local == VM_PAGE_NULL) {
7361 assert(first_local == VM_PAGE_NULL);
7362
7363 last_local = m;
7364 first_local = m;
7365 } else {
7366 assert(first_local != VM_PAGE_NULL);
7367
7368 m->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7369 first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7370 first_local = m;
7371 }
7372 local_queue_count++;
7373
7374 if (throttle_page) {
7375 m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
7376 } else {
7377 if (flags & UPL_COMMIT_INACTIVATE) {
7378 if (shadow_object->internal)
7379 m->vm_page_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7380 else
7381 m->vm_page_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7382 } else
7383 m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
7384 }
7385 }
7386 } else {
7387 if (flags & UPL_COMMIT_INACTIVATE) {
7388 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7389 clear_refmod |= VM_MEM_REFERENCED;
7390 }
7391 if (m->absent) {
7392 if (flags & UPL_COMMIT_FREE_ABSENT)
7393 dwp->dw_mask |= DW_vm_page_free;
7394 else {
7395 m->absent = FALSE;
7396 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7397
7398 if ( !(dwp->dw_mask & DW_vm_page_deactivate_internal))
7399 dwp->dw_mask |= DW_vm_page_activate;
7400 }
7401 } else
7402 dwp->dw_mask |= DW_vm_page_unwire;
7403 }
7404 goto commit_next_page;
7405 }
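		/*
		 * Non-IO_WIRE case: this is a regular pagein/pageout UPL
		 * commit, so finish the cleaning/laundry handling for this
		 * page below.
		 */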
7406 assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7407
7408 if (page_list)
7409 page_list[entry].phys_addr = 0;
7410
7411 /*
7412 * make sure to clear the hardware
7413 * modify or reference bits before
7414 * releasing the BUSY bit on this page
7415 * otherwise we risk losing a legitimate
7416 * change of state
7417 */
7418 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7419 m->dirty = FALSE;
7420
7421 clear_refmod |= VM_MEM_MODIFIED;
7422 }
7423 if (m->laundry)
7424 dwp->dw_mask |= DW_vm_pageout_throttle_up;
7425
7426 if (VM_PAGE_WIRED(m))
7427 m->free_when_done = FALSE;
7428
7429 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
7430 m->cs_validated && !m->cs_tainted) {
7431 /*
7432 * CODE SIGNING:
7433 * This page is no longer dirty
7434 * but could have been modified,
7435 * so it will need to be
7436 * re-validated.
7437 */
7438 if (m->slid) {
7439 panic("upl_commit_range(%p): page %p was slid\n",
7440 upl, m);
7441 }
7442 assert(!m->slid);
7443 m->cs_validated = FALSE;
7444 #if DEVELOPMENT || DEBUG
7445 vm_cs_validated_resets++;
7446 #endif
7447 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7448 }
7449 if (m->overwriting) {
7450 /*
7451 * the (COPY_OUT_FROM == FALSE) request_page_list case
7452 */
7453 if (m->busy) {
7454 #if CONFIG_PHANTOM_CACHE
7455 if (m->absent && !m_object->internal)
7456 dwp->dw_mask |= DW_vm_phantom_cache_update;
7457 #endif
7458 m->absent = FALSE;
7459
7460 dwp->dw_mask |= DW_clear_busy;
7461 } else {
7462 /*
7463 * alternate (COPY_OUT_FROM == FALSE) page_list case
7464 * Occurs when the original page was wired
7465 * at the time of the list request
7466 */
7467 assert(VM_PAGE_WIRED(m));
7468
7469 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
7470 }
7471 m->overwriting = FALSE;
7472 }
7473 if (m->encrypted_cleaning == TRUE) {
7474 m->encrypted_cleaning = FALSE;
7475
7476 dwp->dw_mask |= DW_clear_busy | DW_PAGE_WAKEUP;
7477 }
7478 m->cleaning = FALSE;
7479
7480 if (m->free_when_done) {
7481 /*
7482 * With the clean queue enabled, UPL_PAGEOUT should
7483				 * no longer set the pageout bit. Its pages now go
7484 * to the clean queue.
7485 */
7486 assert(!(flags & UPL_PAGEOUT));
7487 assert(!m_object->internal);
7488
7489 m->free_when_done = FALSE;
7490 #if MACH_CLUSTER_STATS
7491 if (m->wanted) vm_pageout_target_collisions++;
7492 #endif
7493 if ((flags & UPL_COMMIT_SET_DIRTY) ||
7494 (m->pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
7495 /*
7496 * page was re-dirtied after we started
7497 * the pageout... reactivate it since
7498 * we don't know whether the on-disk
7499 * copy matches what is now in memory
7500 */
7501 SET_PAGE_DIRTY(m, FALSE);
7502
7503 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
7504
7505 if (upl->flags & UPL_PAGEOUT) {
7506 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
7507 VM_STAT_INCR(reactivations);
7508 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
7509 }
7510 } else {
7511 /*
7512 * page has been successfully cleaned
7513 * go ahead and free it for other use
7514 */
7515 if (m_object->internal) {
7516 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
7517 } else {
7518 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
7519 }
7520 m->dirty = FALSE;
7521 m->busy = TRUE;
7522
7523 dwp->dw_mask |= DW_vm_page_free;
7524 }
7525 goto commit_next_page;
7526 }
7527 #if MACH_CLUSTER_STATS
7528 if (m->wpmapped)
7529 m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m));
7530
7531 if (m->dirty) vm_pageout_cluster_dirtied++;
7532 else vm_pageout_cluster_cleaned++;
7533 if (m->wanted) vm_pageout_cluster_collisions++;
7534 #endif
7535 /*
7536 * It is a part of the semantic of COPYOUT_FROM
7537 * UPLs that a commit implies cache sync
7538 * between the vm page and the backing store
7539 * this can be used to strip the precious bit
7540 * as well as clean
7541 */
7542 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS))
7543 m->precious = FALSE;
7544
7545 if (flags & UPL_COMMIT_SET_DIRTY) {
7546 SET_PAGE_DIRTY(m, FALSE);
7547 } else {
7548 m->dirty = FALSE;
7549 }
7550
7551 /* with the clean queue on, move *all* cleaned pages to the clean queue */
7552 if (hibernate_cleaning_in_progress == FALSE && !m->dirty && (upl->flags & UPL_PAGEOUT)) {
7553 pgpgout_count++;
7554
7555 VM_STAT_INCR(pageouts);
7556 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
7557
7558 dwp->dw_mask |= DW_enqueue_cleaned;
7559 vm_pageout_enqueued_cleaned_from_inactive_dirty++;
7560 } else if (should_be_throttled == TRUE && (m->vm_page_q_state == VM_PAGE_NOT_ON_Q)) {
7561 /*
7562 * page coming back in from being 'frozen'...
7563 * it was dirty before it was frozen, so keep it so
7564 * the vm_page_activate will notice that it really belongs
7565 * on the throttle queue and put it there
7566 */
7567 SET_PAGE_DIRTY(m, FALSE);
7568 dwp->dw_mask |= DW_vm_page_activate;
7569
7570 } else {
7571 if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && (m->vm_page_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
7572 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7573 clear_refmod |= VM_MEM_REFERENCED;
7574 } else if ( !VM_PAGE_PAGEABLE(m)) {
7575
7576 if (m->clustered || (flags & UPL_COMMIT_SPECULATE))
7577 dwp->dw_mask |= DW_vm_page_speculate;
7578 else if (m->reference)
7579 dwp->dw_mask |= DW_vm_page_activate;
7580 else {
7581 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7582 clear_refmod |= VM_MEM_REFERENCED;
7583 }
7584 }
7585 }
7586 if (upl->flags & UPL_ACCESS_BLOCKED) {
7587 /*
7588				 * We blocked access to the pages in this UPL.
7589 * Clear the "busy" bit on this page before we
7590 * wake up any waiter.
7591 */
7592 dwp->dw_mask |= DW_clear_busy;
7593 }
7594 /*
7595			 * Wake up any thread waiting for this page to finish cleaning.
7596 */
7597 dwp->dw_mask |= DW_PAGE_WAKEUP;
7598
7599 commit_next_page:
7600 if (clear_refmod)
7601 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
7602
7603 target_offset += PAGE_SIZE_64;
7604 xfer_size -= PAGE_SIZE;
7605 entry++;
7606
7607 if (dwp->dw_mask) {
7608 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
7609 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
7610
7611 if (dw_count >= dw_limit) {
7612 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
7613
7614 dwp = &dw_array[0];
7615 dw_count = 0;
7616 }
7617 } else {
7618 if (dwp->dw_mask & DW_clear_busy)
7619 m->busy = FALSE;
7620
7621 if (dwp->dw_mask & DW_PAGE_WAKEUP)
7622 PAGE_WAKEUP(m);
7623 }
7624 }
7625 }
7626 if (dw_count)
7627 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
7628
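	/*
	 * Fast path epilogue: splice the locally collected pages onto the
	 * throttled, anonymous/inactive or active queue selected below, then
	 * update the global page counts and the object's wired page count.
	 */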
7629 if (fast_path_possible) {
7630
7631 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
7632 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
7633
7634 if (local_queue_count || unwired_count) {
7635
7636 if (local_queue_count) {
7637 vm_page_t first_target;
7638 vm_page_queue_head_t *target_queue;
7639
7640 if (throttle_page)
7641 target_queue = &vm_page_queue_throttled;
7642 else {
7643 if (flags & UPL_COMMIT_INACTIVATE) {
7644 if (shadow_object->internal)
7645 target_queue = &vm_page_queue_anonymous;
7646 else
7647 target_queue = &vm_page_queue_inactive;
7648 } else
7649 target_queue = &vm_page_queue_active;
7650 }
7651 /*
7652				 * Transfer the entire local queue to the appropriate regular LRU page queue.
7653 */
7654 vm_page_lockspin_queues();
7655
7656 first_target = (vm_page_t) vm_page_queue_first(target_queue);
7657
7658 if (vm_page_queue_empty(target_queue))
7659 target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
7660 else
7661 first_target->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
7662
7663 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7664 first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
7665 last_local->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
7666
7667 /*
7668 * Adjust the global page counts.
7669 */
7670 if (throttle_page) {
7671 vm_page_throttled_count += local_queue_count;
7672 } else {
7673 if (flags & UPL_COMMIT_INACTIVATE) {
7674 if (shadow_object->internal)
7675 vm_page_anonymous_count += local_queue_count;
7676 vm_page_inactive_count += local_queue_count;
7677
7678 token_new_pagecount += local_queue_count;
7679 } else
7680 vm_page_active_count += local_queue_count;
7681
7682 if (shadow_object->internal)
7683 vm_page_pageable_internal_count += local_queue_count;
7684 else
7685 vm_page_pageable_external_count += local_queue_count;
7686 }
7687 } else {
7688 vm_page_lockspin_queues();
7689 }
7690 if (unwired_count) {
7691 vm_page_wire_count -= unwired_count;
7692 VM_CHECK_MEMORYSTATUS;
7693 }
7694 vm_page_unlock_queues();
7695
7696 shadow_object->wired_page_count -= unwired_count;
7697
7698 if (!shadow_object->wired_page_count) {
7699 VM_OBJECT_UNWIRED(shadow_object);
7700 }
7701 }
7702 }
7703 occupied = 1;
7704
7705 if (upl->flags & UPL_DEVICE_MEMORY) {
7706 occupied = 0;
7707 } else if (upl->flags & UPL_LITE) {
7708 int pg_num;
7709 int i;
7710
7711 occupied = 0;
7712
7713 if (!fast_path_full_commit) {
7714 pg_num = upl->size/PAGE_SIZE;
7715 pg_num = (pg_num + 31) >> 5;
7716
7717 for (i = 0; i < pg_num; i++) {
7718 if (lite_list[i] != 0) {
7719 occupied = 1;
7720 break;
7721 }
7722 }
7723 }
7724 } else {
7725 if (vm_page_queue_empty(&upl->map_object->memq))
7726 occupied = 0;
7727 }
7728 if (occupied == 0) {
7729 /*
7730 * If this UPL element belongs to a Vector UPL and is
7731 * empty, then this is the right function to deallocate
7732		 * it. So go ahead and set the *empty variable. The flag
7733 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
7734 * should be considered relevant for the Vector UPL and not
7735 * the internal UPLs.
7736 */
7737 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
7738 *empty = TRUE;
7739
7740 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
7741 /*
7742 * this is not a paging object
7743 * so we need to drop the paging reference
7744 * that was taken when we created the UPL
7745 * against this object
7746 */
7747 vm_object_activity_end(shadow_object);
7748 vm_object_collapse(shadow_object, 0, TRUE);
7749 } else {
7750 /*
7751			 * we donated the paging reference to
7752 * the map object... vm_pageout_object_terminate
7753 * will drop this reference
7754 */
7755 }
7756 }
7757 vm_object_unlock(shadow_object);
7758 if (object != shadow_object)
7759 vm_object_unlock(object);
7760
7761 if(!isVectorUPL)
7762 upl_unlock(upl);
7763 else {
7764 /*
7765 * If we completed our operations on an UPL that is
7766 * part of a Vectored UPL and if empty is TRUE, then
7767 * we should go ahead and deallocate this UPL element.
7768 * Then we check if this was the last of the UPL elements
7769 * within that Vectored UPL. If so, set empty to TRUE
7770 * so that in ubc_upl_commit_range or ubc_upl_commit, we
7771 * can go ahead and deallocate the Vector UPL too.
7772 */
7773 if(*empty==TRUE) {
7774 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
7775 upl_deallocate(upl);
7776 }
7777 goto process_upl_to_commit;
7778 }
7779 if (pgpgout_count) {
7780 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
7781 }
7782
7783 return KERN_SUCCESS;
7784 }
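/*
 * Illustrative caller sketch (not part of the original source): commit an
 * entire UPL in one call and learn whether it is now empty.  "pl" and
 * "pl_count" are hypothetical placeholders for the page list returned when
 * the UPL was created; the creator would then typically drop its own
 * reference with upl_deallocate() once it is done with the UPL.
 *
 *	boolean_t	empty = FALSE;
 *	kern_return_t	kr;
 *
 *	kr = upl_commit_range(upl, 0, upl->size, UPL_COMMIT_CLEAR_DIRTY,
 *			      pl, pl_count, &empty);
 *	if (kr == KERN_SUCCESS && empty)
 *		upl_deallocate(upl);
 */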
7785
7786 kern_return_t
7787 upl_abort_range(
7788 upl_t upl,
7789 upl_offset_t offset,
7790 upl_size_t size,
7791 int error,
7792 boolean_t *empty)
7793 {
7794 upl_page_info_t *user_page_list = NULL;
7795 upl_size_t xfer_size, subupl_size = size;
7796 vm_object_t shadow_object;
7797 vm_object_t object;
7798 vm_object_offset_t target_offset;
7799 upl_offset_t subupl_offset = offset;
7800 int entry;
7801 wpl_array_t lite_list;
7802 int occupied;
7803 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
7804 struct vm_page_delayed_work *dwp;
7805 int dw_count;
7806 int dw_limit;
7807 int isVectorUPL = 0;
7808 upl_t vector_upl = NULL;
7809
7810 *empty = FALSE;
7811
7812 if (upl == UPL_NULL)
7813 return KERN_INVALID_ARGUMENT;
7814
7815 if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
7816 return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
7817
7818 if((isVectorUPL = vector_upl_is_valid(upl))) {
7819 vector_upl = upl;
7820 upl_lock(vector_upl);
7821 }
7822 else
7823 upl_lock(upl);
7824
7825 process_upl_to_abort:
7826 if(isVectorUPL) {
7827 size = subupl_size;
7828 offset = subupl_offset;
7829 if(size == 0) {
7830 upl_unlock(vector_upl);
7831 return KERN_SUCCESS;
7832 }
7833 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7834 if(upl == NULL) {
7835 upl_unlock(vector_upl);
7836 return KERN_FAILURE;
7837 }
7838 subupl_size -= size;
7839 subupl_offset += size;
7840 }
7841
7842 *empty = FALSE;
7843
7844 #if UPL_DEBUG
7845 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7846 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7847
7848 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7849 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7850 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
7851
7852 upl->upl_commit_index++;
7853 }
7854 #endif
7855 if (upl->flags & UPL_DEVICE_MEMORY)
7856 xfer_size = 0;
7857 else if ((offset + size) <= upl->size)
7858 xfer_size = size;
7859 else {
7860 if(!isVectorUPL)
7861 upl_unlock(upl);
7862 else {
7863 upl_unlock(vector_upl);
7864 }
7865
7866 return KERN_FAILURE;
7867 }
7868 if (upl->flags & UPL_INTERNAL) {
7869 lite_list = (wpl_array_t)
7870 ((((uintptr_t)upl) + sizeof(struct upl))
7871 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
7872
7873 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
7874 } else {
7875 lite_list = (wpl_array_t)
7876 (((uintptr_t)upl) + sizeof(struct upl));
7877 }
7878 object = upl->map_object;
7879
7880 if (upl->flags & UPL_SHADOWED) {
7881 vm_object_lock(object);
7882 shadow_object = object->shadow;
7883 } else
7884 shadow_object = object;
7885
7886 entry = offset/PAGE_SIZE;
7887 target_offset = (vm_object_offset_t)offset;
7888
7889 assert(!(target_offset & PAGE_MASK));
7890 assert(!(xfer_size & PAGE_MASK));
7891
7892 if (upl->flags & UPL_KERNEL_OBJECT)
7893 vm_object_lock_shared(shadow_object);
7894 else
7895 vm_object_lock(shadow_object);
7896
7897 if (upl->flags & UPL_ACCESS_BLOCKED) {
7898 assert(shadow_object->blocked_access);
7899 shadow_object->blocked_access = FALSE;
7900 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7901 }
7902
7903 dwp = &dw_array[0];
7904 dw_count = 0;
7905 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7906
7907 if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT))
7908 panic("upl_abort_range: kernel_object being DUMPED");
7909
7910 while (xfer_size) {
7911 vm_page_t t, m;
7912 unsigned int pg_num;
7913 boolean_t needed;
7914
7915 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
7916 assert(pg_num == target_offset/PAGE_SIZE);
7917
7918 needed = FALSE;
7919
7920 if (user_page_list)
7921 needed = user_page_list[pg_num].needed;
7922
7923 dwp->dw_mask = 0;
7924 m = VM_PAGE_NULL;
7925
7926 if (upl->flags & UPL_LITE) {
7927
7928 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
7929 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
7930
7931 if ( !(upl->flags & UPL_KERNEL_OBJECT))
7932 m = vm_page_lookup(shadow_object, target_offset +
7933 (upl->offset - shadow_object->paging_offset));
7934 }
7935 }
7936 if (upl->flags & UPL_SHADOWED) {
7937 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7938 t->free_when_done = FALSE;
7939
7940 VM_PAGE_FREE(t);
7941
7942 if (m == VM_PAGE_NULL)
7943 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7944 }
7945 }
7946 if ((upl->flags & UPL_KERNEL_OBJECT))
7947 goto abort_next_page;
7948
7949 if (m != VM_PAGE_NULL) {
7950
7951 assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7952
7953 if (m->absent) {
7954 boolean_t must_free = TRUE;
7955
7956 /*
7957 * COPYOUT = FALSE case
7958 * check for error conditions which must
7959				 * be passed back to the pages' customer
7960 */
7961 if (error & UPL_ABORT_RESTART) {
7962 m->restart = TRUE;
7963 m->absent = FALSE;
7964 m->unusual = TRUE;
7965 must_free = FALSE;
7966 } else if (error & UPL_ABORT_UNAVAILABLE) {
7967 m->restart = FALSE;
7968 m->unusual = TRUE;
7969 must_free = FALSE;
7970 } else if (error & UPL_ABORT_ERROR) {
7971 m->restart = FALSE;
7972 m->absent = FALSE;
7973 m->error = TRUE;
7974 m->unusual = TRUE;
7975 must_free = FALSE;
7976 }
7977 if (m->clustered && needed == FALSE) {
7978 /*
7979 * This page was a part of a speculative
7980 * read-ahead initiated by the kernel
7981 * itself. No one is expecting this
7982 * page and no one will clean up its
7983 * error state if it ever becomes valid
7984 * in the future.
7985 * We have to free it here.
7986 */
7987 must_free = TRUE;
7988 }
7989
7990 /*
7991 * ENCRYPTED SWAP:
7992 * If the page was already encrypted,
7993 * we don't really need to decrypt it
7994 * now. It will get decrypted later,
7995 * on demand, as soon as someone needs
7996 * to access its contents.
7997 */
7998
7999 m->cleaning = FALSE;
8000 m->encrypted_cleaning = FALSE;
8001
8002 if (m->overwriting && !m->busy) {
8003 /*
8004 * this shouldn't happen since
8005 * this is an 'absent' page, but
8006 * it doesn't hurt to check for
8007 * the 'alternate' method of
8008 * stabilizing the page...
8009 * we will mark 'busy' to be cleared
8010 * in the following code which will
8011					 * take care of the primary stabilization
8012 * method (i.e. setting 'busy' to TRUE)
8013 */
8014 dwp->dw_mask |= DW_vm_page_unwire;
8015 }
8016 m->overwriting = FALSE;
8017
8018 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
8019
8020 if (must_free == TRUE)
8021 dwp->dw_mask |= DW_vm_page_free;
8022 else
8023 dwp->dw_mask |= DW_vm_page_activate;
8024 } else {
8025 /*
8026 * Handle the trusted pager throttle.
8027 */
8028 if (m->laundry)
8029 dwp->dw_mask |= DW_vm_pageout_throttle_up;
8030
8031 if (upl->flags & UPL_ACCESS_BLOCKED) {
8032 /*
8033 * We blocked access to the pages in this UPL.
8034 * Clear the "busy" bit and wake up any waiter
8035 * for this page.
8036 */
8037 dwp->dw_mask |= DW_clear_busy;
8038 }
8039 if (m->overwriting) {
8040 if (m->busy)
8041 dwp->dw_mask |= DW_clear_busy;
8042 else {
8043 /*
8044 * deal with the 'alternate' method
8045 * of stabilizing the page...
8046 * we will either free the page
8047 * or mark 'busy' to be cleared
8048 * in the following code which will
8049					 * take care of the primary stabilization
8050 * method (i.e. setting 'busy' to TRUE)
8051 */
8052 dwp->dw_mask |= DW_vm_page_unwire;
8053 }
8054 m->overwriting = FALSE;
8055 }
8056 if (m->encrypted_cleaning == TRUE) {
8057 m->encrypted_cleaning = FALSE;
8058
8059 dwp->dw_mask |= DW_clear_busy;
8060 }
8061 m->free_when_done = FALSE;
8062 m->cleaning = FALSE;
8063
8064 if (error & UPL_ABORT_DUMP_PAGES) {
8065 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8066
8067 dwp->dw_mask |= DW_vm_page_free;
8068 } else {
8069 if (!(dwp->dw_mask & DW_vm_page_unwire)) {
8070 if (error & UPL_ABORT_REFERENCE) {
8071 /*
8072					 * we've been told to explicitly
8073 * reference this page... for
8074 * file I/O, this is done by
8075 * implementing an LRU on the inactive q
8076 */
8077 dwp->dw_mask |= DW_vm_page_lru;
8078
8079 } else if ( !VM_PAGE_PAGEABLE(m))
8080 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8081 }
8082 dwp->dw_mask |= DW_PAGE_WAKEUP;
8083 }
8084 }
8085 }
8086 abort_next_page:
8087 target_offset += PAGE_SIZE_64;
8088 xfer_size -= PAGE_SIZE;
8089 entry++;
8090
8091 if (dwp->dw_mask) {
8092 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8093 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8094
8095 if (dw_count >= dw_limit) {
8096 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
8097
8098 dwp = &dw_array[0];
8099 dw_count = 0;
8100 }
8101 } else {
8102 if (dwp->dw_mask & DW_clear_busy)
8103 m->busy = FALSE;
8104
8105 if (dwp->dw_mask & DW_PAGE_WAKEUP)
8106 PAGE_WAKEUP(m);
8107 }
8108 }
8109 }
8110 if (dw_count)
8111 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
8112
8113 occupied = 1;
8114
8115 if (upl->flags & UPL_DEVICE_MEMORY) {
8116 occupied = 0;
8117 } else if (upl->flags & UPL_LITE) {
8118 int pg_num;
8119 int i;
8120
8121 pg_num = upl->size/PAGE_SIZE;
8122 pg_num = (pg_num + 31) >> 5;
8123 occupied = 0;
8124
8125 for (i = 0; i < pg_num; i++) {
8126 if (lite_list[i] != 0) {
8127 occupied = 1;
8128 break;
8129 }
8130 }
8131 } else {
8132 if (vm_page_queue_empty(&upl->map_object->memq))
8133 occupied = 0;
8134 }
8135 if (occupied == 0) {
8136 /*
8137 * If this UPL element belongs to a Vector UPL and is
8138 * empty, then this is the right function to deallocate
8139		 * it. So go ahead and set the *empty variable. The flag
8140 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8141 * should be considered relevant for the Vector UPL and
8142 * not the internal UPLs.
8143 */
8144 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
8145 *empty = TRUE;
8146
8147 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8148 /*
8149 * this is not a paging object
8150 * so we need to drop the paging reference
8151 * that was taken when we created the UPL
8152 * against this object
8153 */
8154 vm_object_activity_end(shadow_object);
8155 vm_object_collapse(shadow_object, 0, TRUE);
8156 } else {
8157 /*
8158			 * we donated the paging reference to
8159 * the map object... vm_pageout_object_terminate
8160 * will drop this reference
8161 */
8162 }
8163 }
8164 vm_object_unlock(shadow_object);
8165 if (object != shadow_object)
8166 vm_object_unlock(object);
8167
8168 if(!isVectorUPL)
8169 upl_unlock(upl);
8170 else {
8171 /*
8172 * If we completed our operations on an UPL that is
8173 * part of a Vectored UPL and if empty is TRUE, then
8174 * we should go ahead and deallocate this UPL element.
8175 * Then we check if this was the last of the UPL elements
8176 * within that Vectored UPL. If so, set empty to TRUE
8177 * so that in ubc_upl_abort_range or ubc_upl_abort, we
8178 * can go ahead and deallocate the Vector UPL too.
8179 */
8180 if(*empty == TRUE) {
8181 *empty = vector_upl_set_subupl(vector_upl, upl,0);
8182 upl_deallocate(upl);
8183 }
8184 goto process_upl_to_abort;
8185 }
8186
8187 return KERN_SUCCESS;
8188 }
8189
8190
8191 kern_return_t
8192 upl_abort(
8193 upl_t upl,
8194 int error)
8195 {
8196 boolean_t empty;
8197
8198 if (upl == UPL_NULL)
8199 return KERN_INVALID_ARGUMENT;
8200
8201 return upl_abort_range(upl, 0, upl->size, error, &empty);
8202 }
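/*
 * Illustrative sketch (not part of the original source): a caller that hit
 * an I/O error and wants the pages discarded rather than preserved might
 * abort the whole UPL with something like
 *
 *	(void) upl_abort(upl, UPL_ABORT_ERROR | UPL_ABORT_DUMP_PAGES);
 *
 * The exact flag combination depends on the caller's recovery policy.
 */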
8203
8204
8205 /* an option on commit should be wire */
8206 kern_return_t
8207 upl_commit(
8208 upl_t upl,
8209 upl_page_info_t *page_list,
8210 mach_msg_type_number_t count)
8211 {
8212 boolean_t empty;
8213
8214 if (upl == UPL_NULL)
8215 return KERN_INVALID_ARGUMENT;
8216
8217 return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
8218 }
8219
8220
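/*
 * iopl_valid_data() marks the pages of an UPL_IO_WIRE upl as containing
 * valid data: pages that were left "busy"/"absent" when the iopl was built
 * have their absent state cleared, are dirtied, wired, and woken up, and
 * the object and global wire counts are adjusted to match.
 */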
8221 void
8222 iopl_valid_data(
8223 upl_t upl)
8224 {
8225 vm_object_t object;
8226 vm_offset_t offset;
8227 vm_page_t m, nxt_page = VM_PAGE_NULL;
8228 upl_size_t size;
8229 int wired_count = 0;
8230
8231 if (upl == NULL)
8232 panic("iopl_valid_data: NULL upl");
8233 if (vector_upl_is_valid(upl))
8234 panic("iopl_valid_data: vector upl");
8235 if ((upl->flags & (UPL_DEVICE_MEMORY|UPL_SHADOWED|UPL_ACCESS_BLOCKED|UPL_IO_WIRE|UPL_INTERNAL)) != UPL_IO_WIRE)
8236 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8237
8238 object = upl->map_object;
8239
8240 if (object == kernel_object || object == compressor_object)
8241 panic("iopl_valid_data: object == kernel or compressor");
8242
8243 if (object->purgable == VM_PURGABLE_VOLATILE ||
8244 object->purgable == VM_PURGABLE_EMPTY)
8245 panic("iopl_valid_data: object %p purgable %d",
8246 object, object->purgable);
8247
8248 size = upl->size;
8249
8250 vm_object_lock(object);
8251
8252 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE))
8253 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
8254 else
8255 offset = 0 + upl->offset - object->paging_offset;
8256
8257 while (size) {
8258
8259 if (nxt_page != VM_PAGE_NULL) {
8260 m = nxt_page;
8261 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq);
8262 } else {
8263 m = vm_page_lookup(object, offset);
8264 offset += PAGE_SIZE;
8265
8266 if (m == VM_PAGE_NULL)
8267 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8268 }
8269 if (m->busy) {
8270 if (!m->absent)
8271 panic("iopl_valid_data: busy page w/o absent");
8272
8273 if (m->pageq.next || m->pageq.prev)
8274 panic("iopl_valid_data: busy+absent page on page queue");
8275 if (m->reusable) {
8276 panic("iopl_valid_data: %p is reusable", m);
8277 }
8278
8279 m->absent = FALSE;
8280 m->dirty = TRUE;
8281 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
8282 assert(m->wire_count == 0);
8283 m->wire_count++;
8284 assert(m->wire_count);
8285 if (m->wire_count == 1) {
8286 m->vm_page_q_state = VM_PAGE_IS_WIRED;
8287 wired_count++;
8288 } else {
8289 panic("iopl_valid_data: %p already wired\n", m);
8290 }
8291
8292 PAGE_WAKEUP_DONE(m);
8293 }
8294 size -= PAGE_SIZE;
8295 }
8296 if (wired_count) {
8297
8298 if (!object->wired_page_count) {
8299 VM_OBJECT_WIRED(object);
8300 }
8301 object->wired_page_count += wired_count;
8302 assert(object->resident_page_count >= object->wired_page_count);
8303
8304 /* no need to adjust purgeable accounting for this object: */
8305 assert(object->purgable != VM_PURGABLE_VOLATILE);
8306 assert(object->purgable != VM_PURGABLE_EMPTY);
8307
8308 vm_page_lockspin_queues();
8309 vm_page_wire_count += wired_count;
8310 vm_page_unlock_queues();
8311 }
8312 vm_object_unlock(object);
8313 }
8314
8315 vm_tag_t
8316 iopl_set_tag(
8317 upl_t upl,
8318 vm_tag_t tag)
8319 {
8320 vm_object_t object;
8321 vm_tag_t prior_tag;
8322
8323 if (upl == NULL)
8324 panic("%s: NULL upl", __FUNCTION__);
8325 if (vector_upl_is_valid(upl))
8326 panic("%s: vector upl", __FUNCTION__);
8327 if (kernel_object == upl->map_object)
8328 return (tag);
8329 if ((upl->flags & (UPL_DEVICE_MEMORY|UPL_SHADOWED|UPL_ACCESS_BLOCKED|UPL_IO_WIRE|UPL_INTERNAL)) != UPL_IO_WIRE)
8330 return (tag);
8331
8332 object = upl->map_object;
8333 vm_object_lock(object);
8334
8335 prior_tag = object->wire_tag;
8336 object->wire_tag = tag;
8337 if (VM_KERN_MEMORY_NONE == prior_tag) prior_tag = tag;
8338 vm_object_unlock(object);
8339
8340 return (prior_tag);
8341 }
8342
8343
8344 void
8345 vm_object_set_pmap_cache_attr(
8346 vm_object_t object,
8347 upl_page_info_array_t user_page_list,
8348 unsigned int num_pages,
8349 boolean_t batch_pmap_op)
8350 {
8351 unsigned int cache_attr = 0;
8352
8353 cache_attr = object->wimg_bits & VM_WIMG_MASK;
8354 assert(user_page_list);
8355 if (cache_attr != VM_WIMG_USE_DEFAULT) {
8356 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8357 }
8358 }
8359
8360
8361 boolean_t vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t);
8362 kern_return_t vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_object_offset_t *, int);
8363
8364
8365
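/*
 * Fast-path helpers for vm_object_iopl_request():
 *
 * vm_object_iopl_wire_full() handles the case where every resident page of
 * the object is being wired; it walks the object's memq directly and returns
 * FALSE if it encounters a page state it cannot handle, so the caller can
 * fall back to the slow path.
 *
 * vm_object_iopl_wire_empty() handles an object with no resident pages; it
 * grabs fresh pages, zero-filling them unless UPL_NOZEROFILL/UPL_NOZEROFILLIO
 * was requested, inserting them into the object and wiring the ones that are
 * not left absent.
 */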
8366 boolean_t
8367 vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8368 wpl_array_t lite_list, upl_control_flags_t cntrl_flags)
8369 {
8370 vm_page_t dst_page;
8371 vm_tag_t tag;
8372 unsigned int entry;
8373 int page_count;
8374 int delayed_unlock = 0;
8375 boolean_t retval = TRUE;
8376 ppnum_t phys_page;
8377
8378 vm_object_lock_assert_exclusive(object);
8379 assert(object->purgable != VM_PURGABLE_VOLATILE);
8380 assert(object->purgable != VM_PURGABLE_EMPTY);
8381 assert(object->pager == NULL);
8382 assert(object->copy == NULL);
8383 assert(object->shadow == NULL);
8384
8385 tag = UPL_MEMORY_TAG(cntrl_flags);
8386 page_count = object->resident_page_count;
8387 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8388
8389 vm_page_lock_queues();
8390
8391 while (page_count--) {
8392
8393 if (dst_page->busy ||
8394 dst_page->fictitious ||
8395 dst_page->absent ||
8396 dst_page->error ||
8397 dst_page->cleaning ||
8398 dst_page->restart ||
8399 dst_page->encrypted ||
8400 dst_page->laundry) {
8401 retval = FALSE;
8402 goto done;
8403 }
8404 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) {
8405 retval = FALSE;
8406 goto done;
8407 }
8408 dst_page->reference = TRUE;
8409
8410 vm_page_wire(dst_page, tag, FALSE);
8411
8412 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8413 SET_PAGE_DIRTY(dst_page, FALSE);
8414 }
8415 entry = (unsigned int)(dst_page->offset / PAGE_SIZE);
8416 assert(entry >= 0 && entry < object->resident_page_count);
8417 lite_list[entry>>5] |= 1 << (entry & 31);
8418
8419 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8420
8421 if (phys_page > upl->highest_page)
8422 upl->highest_page = phys_page;
8423
8424 if (user_page_list) {
8425 user_page_list[entry].phys_addr = phys_page;
8426 user_page_list[entry].absent = dst_page->absent;
8427 user_page_list[entry].dirty = dst_page->dirty;
8428 user_page_list[entry].free_when_done = dst_page->free_when_done;
8429 user_page_list[entry].precious = dst_page->precious;
8430 user_page_list[entry].device = FALSE;
8431 user_page_list[entry].speculative = FALSE;
8432 user_page_list[entry].cs_validated = FALSE;
8433 user_page_list[entry].cs_tainted = FALSE;
8434 user_page_list[entry].cs_nx = FALSE;
8435 user_page_list[entry].needed = FALSE;
8436 user_page_list[entry].mark = FALSE;
8437 }
8438 if (delayed_unlock++ > 256) {
8439 delayed_unlock = 0;
8440 lck_mtx_yield(&vm_page_queue_lock);
8441
8442 VM_CHECK_MEMORYSTATUS;
8443 }
8444 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->listq);
8445 }
8446 done:
8447 vm_page_unlock_queues();
8448
8449 VM_CHECK_MEMORYSTATUS;
8450
8451 return (retval);
8452 }
8453
8454
8455 kern_return_t
8456 vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8457 wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_object_offset_t *dst_offset, int page_count)
8458 {
8459 vm_page_t dst_page;
8460 vm_tag_t tag;
8461 boolean_t no_zero_fill = FALSE;
8462 int interruptible;
8463 int pages_wired = 0;
8464 int pages_inserted = 0;
8465 int entry = 0;
8466 uint64_t delayed_ledger_update = 0;
8467 kern_return_t ret = KERN_SUCCESS;
8468 int grab_options;
8469 ppnum_t phys_page;
8470
8471 vm_object_lock_assert_exclusive(object);
8472 assert(object->purgable != VM_PURGABLE_VOLATILE);
8473 assert(object->purgable != VM_PURGABLE_EMPTY);
8474 assert(object->pager == NULL);
8475 assert(object->copy == NULL);
8476 assert(object->shadow == NULL);
8477
8478 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
8479 interruptible = THREAD_ABORTSAFE;
8480 else
8481 interruptible = THREAD_UNINT;
8482
8483 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO))
8484 no_zero_fill = TRUE;
8485
8486 tag = UPL_MEMORY_TAG(cntrl_flags);
8487
8488 grab_options = 0;
8489 #if CONFIG_SECLUDED_MEMORY
8490 if (object->can_grab_secluded) {
8491 grab_options |= VM_PAGE_GRAB_SECLUDED;
8492 }
8493 #endif /* CONFIG_SECLUDED_MEMORY */
8494
8495 while (page_count--) {
8496
8497 while ((dst_page = vm_page_grab_options(grab_options))
8498 == VM_PAGE_NULL) {
8499
8500 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
8501
8502 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8503
8504 if (vm_page_wait(interruptible) == FALSE) {
8505 /*
8506 * interrupted case
8507 */
8508 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8509
8510 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8511
8512 ret = MACH_SEND_INTERRUPTED;
8513 goto done;
8514 }
8515 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8516
8517 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8518 }
8519 if (no_zero_fill == FALSE)
8520 vm_page_zero_fill(dst_page);
8521 else
8522 dst_page->absent = TRUE;
8523
8524 dst_page->reference = TRUE;
8525
8526 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8527 SET_PAGE_DIRTY(dst_page, FALSE);
8528 }
8529 if (dst_page->absent == FALSE) {
8530 assert(dst_page->vm_page_q_state == VM_PAGE_NOT_ON_Q);
8531 assert(dst_page->wire_count == 0);
8532 dst_page->wire_count++;
8533 dst_page->vm_page_q_state = VM_PAGE_IS_WIRED;
8534 assert(dst_page->wire_count);
8535 pages_wired++;
8536 PAGE_WAKEUP_DONE(dst_page);
8537 }
8538 pages_inserted++;
8539
8540 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
8541
8542 lite_list[entry>>5] |= 1 << (entry & 31);
8543
8544 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8545
8546 if (phys_page > upl->highest_page)
8547 upl->highest_page = phys_page;
8548
8549 if (user_page_list) {
8550 user_page_list[entry].phys_addr = phys_page;
8551 user_page_list[entry].absent = dst_page->absent;
8552 user_page_list[entry].dirty = dst_page->dirty;
8553 user_page_list[entry].free_when_done = FALSE;
8554 user_page_list[entry].precious = FALSE;
8555 user_page_list[entry].device = FALSE;
8556 user_page_list[entry].speculative = FALSE;
8557 user_page_list[entry].cs_validated = FALSE;
8558 user_page_list[entry].cs_tainted = FALSE;
8559 user_page_list[entry].cs_nx = FALSE;
8560 user_page_list[entry].needed = FALSE;
8561 user_page_list[entry].mark = FALSE;
8562 }
8563 entry++;
8564 *dst_offset += PAGE_SIZE_64;
8565 }
8566 done:
8567 if (pages_wired) {
8568 vm_page_lockspin_queues();
8569 vm_page_wire_count += pages_wired;
8570 vm_page_unlock_queues();
8571 }
8572 if (pages_inserted) {
8573 if (object->internal) {
8574 OSAddAtomic(pages_inserted, &vm_page_internal_count);
8575 } else {
8576 OSAddAtomic(pages_inserted, &vm_page_external_count);
8577 }
8578 }
8579 if (delayed_ledger_update) {
8580 task_t owner;
8581
8582 owner = object->vo_purgeable_owner;
8583 assert(owner);
8584
8585 /* more non-volatile bytes */
8586 ledger_credit(owner->ledger,
8587 task_ledgers.purgeable_nonvolatile,
8588 delayed_ledger_update);
8589 /* more footprint */
8590 ledger_credit(owner->ledger,
8591 task_ledgers.phys_footprint,
8592 delayed_ledger_update);
8593 }
8594 return (ret);
8595 }
8596
8597
8598 unsigned int vm_object_iopl_request_sleep_for_cleaning = 0;
8599
8600
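/*
 * vm_object_iopl_request() builds an I/O-wired UPL against "object": each
 * page in the requested range is looked up (or faulted/grabbed in), wired,
 * and recorded in the lite list and the optional user_page_list.  Objects
 * whose resident population exactly matches the request (all pages present,
 * or none at all) are handled by the fast-path helpers above.
 *
 * Hypothetical usage sketch (flag choice and variables are assumptions, not
 * taken from this file):
 *
 *	upl_t		iopl = NULL;
 *	unsigned int	count = 1;
 *
 *	kr = vm_object_iopl_request(object, offset, PAGE_SIZE, &iopl,
 *				    NULL, &count,
 *				    UPL_SET_INTERNAL | UPL_SET_LITE |
 *				    UPL_SET_IO_WIRE);
 */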
8601 kern_return_t
8602 vm_object_iopl_request(
8603 vm_object_t object,
8604 vm_object_offset_t offset,
8605 upl_size_t size,
8606 upl_t *upl_ptr,
8607 upl_page_info_array_t user_page_list,
8608 unsigned int *page_list_count,
8609 upl_control_flags_t cntrl_flags)
8610 {
8611 vm_page_t dst_page;
8612 vm_object_offset_t dst_offset;
8613 upl_size_t xfer_size;
8614 upl_t upl = NULL;
8615 unsigned int entry;
8616 wpl_array_t lite_list = NULL;
8617 int no_zero_fill = FALSE;
8618 unsigned int size_in_pages;
8619 u_int32_t psize;
8620 kern_return_t ret;
8621 vm_prot_t prot;
8622 struct vm_object_fault_info fault_info;
8623 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
8624 struct vm_page_delayed_work *dwp;
8625 int dw_count;
8626 int dw_limit;
8627 int dw_index;
8628 boolean_t caller_lookup;
8629 int io_tracking_flag = 0;
8630 int interruptible;
8631 ppnum_t phys_page;
8632
8633 boolean_t set_cache_attr_needed = FALSE;
8634 boolean_t free_wired_pages = FALSE;
8635 boolean_t fast_path_empty_req = FALSE;
8636 boolean_t fast_path_full_req = FALSE;
8637
8638 if (cntrl_flags & ~UPL_VALID_FLAGS) {
8639 /*
8640 * For forward compatibility's sake,
8641 * reject any unknown flag.
8642 */
8643 return KERN_INVALID_VALUE;
8644 }
8645 if (vm_lopage_needed == FALSE)
8646 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8647
8648 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8649 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
8650 return KERN_INVALID_VALUE;
8651
8652 if (object->phys_contiguous) {
8653 if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
8654 return KERN_INVALID_ADDRESS;
8655
8656 if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
8657 return KERN_INVALID_ADDRESS;
8658 }
8659 }
8660
8661 if (cntrl_flags & UPL_ENCRYPT) {
8662 /*
8663 * ENCRYPTED SWAP:
8664 * The paging path doesn't use this interface,
8665 * so we don't support the UPL_ENCRYPT flag
8666 * here. We won't encrypt the pages.
8667 */
8668 assert(! (cntrl_flags & UPL_ENCRYPT));
8669 }
8670 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO))
8671 no_zero_fill = TRUE;
8672
8673 if (cntrl_flags & UPL_COPYOUT_FROM)
8674 prot = VM_PROT_READ;
8675 else
8676 prot = VM_PROT_READ | VM_PROT_WRITE;
8677
8678 if ((!object->internal) && (object->paging_offset != 0))
8679 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
8680
8681 #if CONFIG_IOSCHED || UPL_DEBUG
8682 if ((object->io_tracking && object != kernel_object) || upl_debug_enabled)
8683 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8684 #endif
8685
8686 #if CONFIG_IOSCHED
8687 if (object->io_tracking) {
8688 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8689 if (object != kernel_object)
8690 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8691 }
8692 #endif
8693
8694 if (object->phys_contiguous)
8695 psize = PAGE_SIZE;
8696 else
8697 psize = size;
8698
8699 if (cntrl_flags & UPL_SET_INTERNAL) {
8700 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8701
8702 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
8703 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
8704 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
8705 if (size == 0) {
8706 user_page_list = NULL;
8707 lite_list = NULL;
8708 }
8709 } else {
8710 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8711
8712 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
8713 if (size == 0) {
8714 lite_list = NULL;
8715 }
8716 }
8717 if (user_page_list)
8718 user_page_list[0].device = FALSE;
8719 *upl_ptr = upl;
8720
8721 upl->map_object = object;
8722 upl->size = size;
8723
8724 size_in_pages = size / PAGE_SIZE;
8725
8726 if (object == kernel_object &&
8727 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
8728 upl->flags |= UPL_KERNEL_OBJECT;
8729 #if UPL_DEBUG
8730 vm_object_lock(object);
8731 #else
8732 vm_object_lock_shared(object);
8733 #endif
8734 } else {
8735 vm_object_lock(object);
8736 vm_object_activity_begin(object);
8737 }
8738 /*
8739 * paging in progress also protects the paging_offset
8740 */
8741 upl->offset = offset + object->paging_offset;
8742
8743 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8744 /*
8745 * The user requested that access to the pages in this UPL
8746		 * be blocked until the UPL is committed or aborted.
8747 */
8748 upl->flags |= UPL_ACCESS_BLOCKED;
8749 }
8750
8751 #if CONFIG_IOSCHED || UPL_DEBUG
8752 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
8753 vm_object_activity_begin(object);
8754 queue_enter(&object->uplq, upl, upl_t, uplq);
8755 }
8756 #endif
8757
8758 if (object->phys_contiguous) {
8759
8760 if (upl->flags & UPL_ACCESS_BLOCKED) {
8761 assert(!object->blocked_access);
8762 object->blocked_access = TRUE;
8763 }
8764
8765 vm_object_unlock(object);
8766
8767 /*
8768 * don't need any shadow mappings for this one
8769 * since it is already I/O memory
8770 */
8771 upl->flags |= UPL_DEVICE_MEMORY;
8772
8773 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1)>>PAGE_SHIFT);
8774
8775 if (user_page_list) {
8776 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset)>>PAGE_SHIFT);
8777 user_page_list[0].device = TRUE;
8778 }
8779 if (page_list_count != NULL) {
8780 if (upl->flags & UPL_INTERNAL)
8781 *page_list_count = 0;
8782 else
8783 *page_list_count = 1;
8784 }
8785 return KERN_SUCCESS;
8786 }
8787 if (object != kernel_object && object != compressor_object) {
8788 /*
8789 * Protect user space from future COW operations
8790 */
8791 #if VM_OBJECT_TRACKING_OP_TRUESHARE
8792 if (!object->true_share &&
8793 vm_object_tracking_inited) {
8794 void *bt[VM_OBJECT_TRACKING_BTDEPTH];
8795 int num = 0;
8796
8797 num = OSBacktrace(bt,
8798 VM_OBJECT_TRACKING_BTDEPTH);
8799 btlog_add_entry(vm_object_tracking_btlog,
8800 object,
8801 VM_OBJECT_TRACKING_OP_TRUESHARE,
8802 bt,
8803 num);
8804 }
8805 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
8806
8807 vm_object_lock_assert_exclusive(object);
8808 object->true_share = TRUE;
8809
8810 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
8811 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
8812 }
8813
8814 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
8815 object->copy != VM_OBJECT_NULL) {
8816 /*
8817 * Honor copy-on-write obligations
8818 *
8819 * The caller is gathering these pages and
8820 * might modify their contents. We need to
8821 * make sure that the copy object has its own
8822 * private copies of these pages before we let
8823 * the caller modify them.
8824 *
8825 * NOTE: someone else could map the original object
8826 * after we've done this copy-on-write here, and they
8827 * could then see an inconsistent picture of the memory
8828 * while it's being modified via the UPL. To prevent this,
8829 * we would have to block access to these pages until the
8830 * UPL is released. We could use the UPL_BLOCK_ACCESS
8831 * code path for that...
8832 */
8833 vm_object_update(object,
8834 offset,
8835 size,
8836 NULL,
8837 NULL,
8838 FALSE, /* should_return */
8839 MEMORY_OBJECT_COPY_SYNC,
8840 VM_PROT_NO_CHANGE);
8841 #if DEVELOPMENT || DEBUG
8842 iopl_cow++;
8843 iopl_cow_pages += size >> PAGE_SHIFT;
8844 #endif
8845 }
8846 if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
8847 object->purgable != VM_PURGABLE_VOLATILE &&
8848 object->purgable != VM_PURGABLE_EMPTY &&
8849 object->copy == NULL &&
8850 size == object->vo_size &&
8851 offset == 0 &&
8852 object->shadow == NULL &&
8853 object->pager == NULL)
8854 {
8855 if (object->resident_page_count == size_in_pages)
8856 {
8857 assert(object != compressor_object);
8858 assert(object != kernel_object);
8859 fast_path_full_req = TRUE;
8860 }
8861 else if (object->resident_page_count == 0)
8862 {
8863 assert(object != compressor_object);
8864 assert(object != kernel_object);
8865 fast_path_empty_req = TRUE;
8866 set_cache_attr_needed = TRUE;
8867 }
8868 }
8869
8870 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
8871 interruptible = THREAD_ABORTSAFE;
8872 else
8873 interruptible = THREAD_UNINT;
8874
8875 entry = 0;
8876
8877 xfer_size = size;
8878 dst_offset = offset;
8879 dw_count = 0;
8880
8881 if (fast_path_full_req) {
8882
8883 if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags) == TRUE)
8884 goto finish;
8885 /*
8886 * we couldn't complete the processing of this request on the fast path
8887 * so fall through to the slow path and finish up
8888 */
8889
8890 } else if (fast_path_empty_req) {
8891
8892 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8893 ret = KERN_MEMORY_ERROR;
8894 goto return_err;
8895 }
8896 ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, &dst_offset, size_in_pages);
8897
8898 if (ret) {
8899 free_wired_pages = TRUE;
8900 goto return_err;
8901 }
8902 goto finish;
8903 }
8904
8905 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
8906 fault_info.user_tag = 0;
8907 fault_info.lo_offset = offset;
8908 fault_info.hi_offset = offset + xfer_size;
8909 fault_info.no_cache = FALSE;
8910 fault_info.stealth = FALSE;
8911 fault_info.io_sync = FALSE;
8912 fault_info.cs_bypass = FALSE;
8913 fault_info.mark_zf_absent = TRUE;
8914 fault_info.interruptible = interruptible;
8915 fault_info.batch_pmap_op = TRUE;
8916
8917 dwp = &dw_array[0];
8918 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8919
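	/*
	 * Slow path: walk the range page by page.  Missing, busy, encrypted
	 * or otherwise unusable pages are brought in through vm_fault_page();
	 * pages above the 32-bit DMA limit may be replaced with low pages
	 * when UPL_NEED_32BIT_ADDR is set; wiring and reference updates are
	 * batched through the delayed-work array.
	 */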
8920 while (xfer_size) {
8921 vm_fault_return_t result;
8922
8923 dwp->dw_mask = 0;
8924
8925 if (fast_path_full_req) {
8926 /*
8927 * if we get here, it means that we ran into a page
8928 * state we couldn't handle in the fast path and
8929 * bailed out to the slow path... since the order
8930 * we look at pages is different between the 2 paths,
8931 * the following check is needed to determine whether
8932 * this page was already processed in the fast path
8933 */
8934 if (lite_list[entry>>5] & (1 << (entry & 31)))
8935 goto skip_page;
8936 }
8937 dst_page = vm_page_lookup(object, dst_offset);
8938
8939 /*
8940 * ENCRYPTED SWAP:
8941 * If the page is encrypted, we need to decrypt it,
8942 * so force a soft page fault.
8943 */
8944 if (dst_page == VM_PAGE_NULL ||
8945 dst_page->busy ||
8946 dst_page->encrypted ||
8947 dst_page->error ||
8948 dst_page->restart ||
8949 dst_page->absent ||
8950 dst_page->fictitious) {
8951
8952 if (object == kernel_object)
8953 panic("vm_object_iopl_request: missing/bad page in kernel object\n");
8954 if (object == compressor_object)
8955 panic("vm_object_iopl_request: missing/bad page in compressor object\n");
8956
8957 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8958 ret = KERN_MEMORY_ERROR;
8959 goto return_err;
8960 }
8961 set_cache_attr_needed = TRUE;
8962
8963 /*
8964 * We just looked up the page and the result remains valid
8965			 * until the object lock is released, so send it to
8966 * vm_fault_page() (as "dst_page"), to avoid having to
8967 * look it up again there.
8968 */
8969 caller_lookup = TRUE;
8970
8971 do {
8972 vm_page_t top_page;
8973 kern_return_t error_code;
8974
8975 fault_info.cluster_size = xfer_size;
8976
8977 vm_object_paging_begin(object);
8978
8979 result = vm_fault_page(object, dst_offset,
8980 prot | VM_PROT_WRITE, FALSE,
8981 caller_lookup,
8982 &prot, &dst_page, &top_page,
8983 (int *)0,
8984 &error_code, no_zero_fill,
8985 FALSE, &fault_info);
8986
8987 /* our lookup is no longer valid at this point */
8988 caller_lookup = FALSE;
8989
8990 switch (result) {
8991
8992 case VM_FAULT_SUCCESS:
8993
8994 if ( !dst_page->absent) {
8995 PAGE_WAKEUP_DONE(dst_page);
8996 } else {
8997 /*
8998 * we only get back an absent page if we
8999 * requested that it not be zero-filled
9000 * because we are about to fill it via I/O
9001 *
9002 * absent pages should be left BUSY
9003 * to prevent them from being faulted
9004 * into an address space before we've
9005 * had a chance to complete the I/O on
9006 * them since they may contain info that
9007 * shouldn't be seen by the faulting task
9008 */
9009 }
9010 /*
9011 * Release paging references and
9012 * top-level placeholder page, if any.
9013 */
9014 if (top_page != VM_PAGE_NULL) {
9015 vm_object_t local_object;
9016
9017 local_object = VM_PAGE_OBJECT(top_page);
9018
9019 /*
9020 * comparing 2 packed pointers
9021 */
9022 if (top_page->vm_page_object != dst_page->vm_page_object) {
9023 vm_object_lock(local_object);
9024 VM_PAGE_FREE(top_page);
9025 vm_object_paging_end(local_object);
9026 vm_object_unlock(local_object);
9027 } else {
9028 VM_PAGE_FREE(top_page);
9029 vm_object_paging_end(local_object);
9030 }
9031 }
9032 vm_object_paging_end(object);
9033 break;
9034
9035 case VM_FAULT_RETRY:
9036 vm_object_lock(object);
9037 break;
9038
9039 case VM_FAULT_MEMORY_SHORTAGE:
9040 OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
9041
9042 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9043
9044 if (vm_page_wait(interruptible)) {
9045 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9046
9047 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9048 vm_object_lock(object);
9049
9050 break;
9051 }
9052 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9053
9054 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9055
9056 /* fall thru */
9057
9058 case VM_FAULT_INTERRUPTED:
9059 error_code = MACH_SEND_INTERRUPTED;
9060 case VM_FAULT_MEMORY_ERROR:
9061 memory_error:
9062 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
9063
9064 vm_object_lock(object);
9065 goto return_err;
9066
9067 case VM_FAULT_SUCCESS_NO_VM_PAGE:
9068 /* success but no page: fail */
9069 vm_object_paging_end(object);
9070 vm_object_unlock(object);
9071 goto memory_error;
9072
9073 default:
9074 panic("vm_object_iopl_request: unexpected error"
9075 " 0x%x from vm_fault_page()\n", result);
9076 }
9077 } while (result != VM_FAULT_SUCCESS);
9078
9079 }
9080 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9081
9082 if (upl->flags & UPL_KERNEL_OBJECT)
9083 goto record_phys_addr;
9084
9085 if (dst_page->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
9086 dst_page->busy = TRUE;
9087 goto record_phys_addr;
9088 }
9089
9090 if (dst_page->cleaning) {
9091 /*
9092 * Someone else is cleaning this page in place.
9093 * In theory, we should be able to proceed and use this
9094			 * page, but they will probably end up clearing the "busy"
9095			 * bit on it in upl_commit_range() even though they did not
9096			 * set it, which would clear our "busy" bit and open
9097			 * us up to race conditions.
9098 * We'd better wait for the cleaning to complete and
9099 * then try again.
9100 */
9101 vm_object_iopl_request_sleep_for_cleaning++;
9102 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
9103 continue;
9104 }
9105 if (dst_page->laundry)
9106 vm_pageout_steal_laundry(dst_page, FALSE);
9107
9108 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
9109 phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
9110 vm_page_t low_page;
9111 int refmod;
9112
9113 /*
9114 * support devices that can't DMA above 32 bits
9115 * by substituting pages from a pool of low address
9116 * memory for any pages we find above the 4G mark
9117 * can't substitute if the page is already wired because
9118 * we don't know whether that physical address has been
9119 * handed out to some other 64 bit capable DMA device to use
9120 */
9121 if (VM_PAGE_WIRED(dst_page)) {
9122 ret = KERN_PROTECTION_FAILURE;
9123 goto return_err;
9124 }
9125 low_page = vm_page_grablo();
9126
9127 if (low_page == VM_PAGE_NULL) {
9128 ret = KERN_RESOURCE_SHORTAGE;
9129 goto return_err;
9130 }
9131 /*
9132 * from here until the vm_page_replace completes
9133			 * we mustn't drop the object lock... we don't
9134 * want anyone refaulting this page in and using
9135 * it after we disconnect it... we want the fault
9136 * to find the new page being substituted.
9137 */
9138 if (dst_page->pmapped)
9139 refmod = pmap_disconnect(phys_page);
9140 else
9141 refmod = 0;
9142
9143 if (!dst_page->absent)
9144 vm_page_copy(dst_page, low_page);
9145
9146 low_page->reference = dst_page->reference;
9147 low_page->dirty = dst_page->dirty;
9148 low_page->absent = dst_page->absent;
9149
9150 if (refmod & VM_MEM_REFERENCED)
9151 low_page->reference = TRUE;
9152 if (refmod & VM_MEM_MODIFIED) {
9153 SET_PAGE_DIRTY(low_page, FALSE);
9154 }
9155
9156 vm_page_replace(low_page, object, dst_offset);
9157
9158 dst_page = low_page;
9159 /*
9160 * vm_page_grablo returned the page marked
9161 * BUSY... we don't need a PAGE_WAKEUP_DONE
9162 * here, because we've never dropped the object lock
9163 */
9164 if ( !dst_page->absent)
9165 dst_page->busy = FALSE;
9166
9167 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9168 }
9169 if ( !dst_page->busy)
9170 dwp->dw_mask |= DW_vm_page_wire;
9171
9172 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9173 /*
9174 * Mark the page "busy" to block any future page fault
9175 * on this page in addition to wiring it.
9176 * We'll also remove the mapping
9177 * of all these pages before leaving this routine.
9178 */
9179 assert(!dst_page->fictitious);
9180 dst_page->busy = TRUE;
9181 }
9182 /*
9183 * expect the page to be used
9184 * page queues lock must be held to set 'reference'
9185 */
9186 dwp->dw_mask |= DW_set_reference;
9187
9188 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9189 SET_PAGE_DIRTY(dst_page, TRUE);
9190 }
9191 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) {
9192 pmap_sync_page_attributes_phys(phys_page);
9193 dst_page->written_by_kernel = FALSE;
9194 }
9195
9196 record_phys_addr:
9197 if (dst_page->busy)
9198 upl->flags |= UPL_HAS_BUSY;
9199
9200 lite_list[entry>>5] |= 1 << (entry & 31);
9201
9202 if (phys_page > upl->highest_page)
9203 upl->highest_page = phys_page;
9204
9205 if (user_page_list) {
9206 user_page_list[entry].phys_addr = phys_page;
9207 user_page_list[entry].free_when_done = dst_page->free_when_done;
9208 user_page_list[entry].absent = dst_page->absent;
9209 user_page_list[entry].dirty = dst_page->dirty;
9210 user_page_list[entry].precious = dst_page->precious;
9211 user_page_list[entry].device = FALSE;
9212 user_page_list[entry].needed = FALSE;
9213 if (dst_page->clustered == TRUE)
9214 user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
9215 else
9216 user_page_list[entry].speculative = FALSE;
9217 user_page_list[entry].cs_validated = dst_page->cs_validated;
9218 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
9219 user_page_list[entry].cs_nx = dst_page->cs_nx;
9220 user_page_list[entry].mark = FALSE;
9221 }
9222 if (object != kernel_object && object != compressor_object) {
9223 /*
9224 * someone is explicitly grabbing this page...
9225 * update clustered and speculative state
9226 *
9227 */
9228 if (dst_page->clustered)
9229 VM_PAGE_CONSUME_CLUSTERED(dst_page);
9230 }
9231 skip_page:
9232 entry++;
9233 dst_offset += PAGE_SIZE_64;
9234 xfer_size -= PAGE_SIZE;
9235
9236 if (dwp->dw_mask) {
9237 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
9238
9239 if (dw_count >= dw_limit) {
9240 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
9241
9242 dwp = &dw_array[0];
9243 dw_count = 0;
9244 }
9245 }
9246 }
9247 assert(entry == size_in_pages);
9248
9249 if (dw_count)
9250 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
9251 finish:
9252 if (user_page_list && set_cache_attr_needed == TRUE)
9253 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
9254
9255 if (page_list_count != NULL) {
9256 if (upl->flags & UPL_INTERNAL)
9257 *page_list_count = 0;
9258 else if (*page_list_count > size_in_pages)
9259 *page_list_count = size_in_pages;
9260 }
9261 vm_object_unlock(object);
9262
9263 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9264 /*
9265 * We've marked all the pages "busy" so that future
9266 * page faults will block.
9267 * Now remove the mapping for these pages, so that they
9268 * can't be accessed without causing a page fault.
9269 */
9270 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
9271 PMAP_NULL, 0, VM_PROT_NONE);
9272 assert(!object->blocked_access);
9273 object->blocked_access = TRUE;
9274 }
9275
9276 return KERN_SUCCESS;
9277
9278 return_err:
9279 dw_index = 0;
9280
9281 for (; offset < dst_offset; offset += PAGE_SIZE) {
9282 boolean_t need_unwire;
9283
9284 dst_page = vm_page_lookup(object, offset);
9285
9286 if (dst_page == VM_PAGE_NULL)
9287 panic("vm_object_iopl_request: Wired page missing.\n");
9288
9289 /*
9290 * if we've already processed this page in an earlier
9291 * dw_do_work, we need to undo the wiring... we will
9292 * leave the dirty and reference bits on if they
9293 * were set, since we don't have a good way of knowing
9294 * what the previous state was and we won't get here
9295 * under any normal circumstances... we will always
9296 * clear BUSY and wakeup any waiters via vm_page_free
9297 * or PAGE_WAKEUP_DONE
9298 */
9299 need_unwire = TRUE;
9300
9301 if (dw_count) {
9302 if (dw_array[dw_index].dw_m == dst_page) {
9303 /*
9304 * still in the deferred work list
9305 * which means we haven't yet called
9306 * vm_page_wire on this page
9307 */
9308 need_unwire = FALSE;
9309
9310 dw_index++;
9311 dw_count--;
9312 }
9313 }
9314 vm_page_lock_queues();
9315
9316 if (dst_page->absent || free_wired_pages == TRUE) {
9317 vm_page_free(dst_page);
9318
9319 need_unwire = FALSE;
9320 } else {
9321 if (need_unwire == TRUE)
9322 vm_page_unwire(dst_page, TRUE);
9323
9324 PAGE_WAKEUP_DONE(dst_page);
9325 }
9326 vm_page_unlock_queues();
9327
9328 if (need_unwire == TRUE)
9329 VM_STAT_INCR(reactivations);
9330 }
9331 #if UPL_DEBUG
9332 upl->upl_state = 2;
9333 #endif
9334 if (! (upl->flags & UPL_KERNEL_OBJECT)) {
9335 vm_object_activity_end(object);
9336 vm_object_collapse(object, 0, TRUE);
9337 }
9338 vm_object_unlock(object);
9339 upl_destroy(upl);
9340
9341 return ret;
9342 }
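/*
 * Illustrative sketch (editor's note, not part of the original source):
 * the delayed-work batching used in vm_object_iopl_request() above.
 * Rather than taking the page-queues lock once per page, per-page work
 * bits are accumulated in dw_array[] and applied in batches of up to
 * dw_limit entries, with a final flush after the loop:
 *
 *	dwp->dw_mask |= DW_vm_page_wire | DW_set_reference;
 *	VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
 *
 *	if (dw_count >= dw_limit) {
 *		vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags),
 *					&dw_array[0], dw_count);
 *		dwp = &dw_array[0];
 *		dw_count = 0;
 *	}
 *	...
 *	if (dw_count)
 *		vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags),
 *					&dw_array[0], dw_count);
 */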
9343
9344 kern_return_t
9345 upl_transpose(
9346 upl_t upl1,
9347 upl_t upl2)
9348 {
9349 kern_return_t retval;
9350 boolean_t upls_locked;
9351 vm_object_t object1, object2;
9352
9353 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR)==UPL_VECTOR) || ((upl2->flags & UPL_VECTOR)==UPL_VECTOR)) {
9354 return KERN_INVALID_ARGUMENT;
9355 }
9356
9357 upls_locked = FALSE;
9358
9359 /*
9360 * Since we need to lock both UPLs at the same time,
9361 * avoid deadlocks by always taking locks in the same order.
9362 */
9363 if (upl1 < upl2) {
9364 upl_lock(upl1);
9365 upl_lock(upl2);
9366 } else {
9367 upl_lock(upl2);
9368 upl_lock(upl1);
9369 }
9370 upls_locked = TRUE; /* the UPLs will need to be unlocked */
9371
9372 object1 = upl1->map_object;
9373 object2 = upl2->map_object;
9374
9375 if (upl1->offset != 0 || upl2->offset != 0 ||
9376 upl1->size != upl2->size) {
9377 /*
9378 * We deal only with full objects, not subsets.
9379 * That's because we exchange the entire backing store info
9380 * for the objects: pager, resident pages, etc... We can't do
9381 * only part of it.
9382 */
9383 retval = KERN_INVALID_VALUE;
9384 goto done;
9385 }
9386
9387 /*
9388 * Transpose the VM objects' backing store.
9389 */
9390 retval = vm_object_transpose(object1, object2,
9391 (vm_object_size_t) upl1->size);
9392
9393 if (retval == KERN_SUCCESS) {
9394 /*
9395 * Make each UPL point to the correct VM object, i.e. the
9396 * object holding the pages that the UPL refers to...
9397 */
9398 #if CONFIG_IOSCHED || UPL_DEBUG
9399 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9400 vm_object_lock(object1);
9401 vm_object_lock(object2);
9402 }
9403 if (upl1->flags & UPL_TRACKED_BY_OBJECT)
9404 queue_remove(&object1->uplq, upl1, upl_t, uplq);
9405 if (upl2->flags & UPL_TRACKED_BY_OBJECT)
9406 queue_remove(&object2->uplq, upl2, upl_t, uplq);
9407 #endif
9408 upl1->map_object = object2;
9409 upl2->map_object = object1;
9410
9411 #if CONFIG_IOSCHED || UPL_DEBUG
9412 if (upl1->flags & UPL_TRACKED_BY_OBJECT)
9413 queue_enter(&object2->uplq, upl1, upl_t, uplq);
9414 if (upl2->flags & UPL_TRACKED_BY_OBJECT)
9415 queue_enter(&object1->uplq, upl2, upl_t, uplq);
9416 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9417 vm_object_unlock(object2);
9418 vm_object_unlock(object1);
9419 }
9420 #endif
9421 }
9422
9423 done:
9424 /*
9425 * Cleanup.
9426 */
9427 if (upls_locked) {
9428 upl_unlock(upl1);
9429 upl_unlock(upl2);
9430 upls_locked = FALSE;
9431 }
9432
9433 return retval;
9434 }
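/*
 * Illustrative sketch (editor's note, not part of the original source):
 * the address-ordered locking idiom that upl_transpose() uses above,
 * shown in isolation.  Comparing the two UPL pointers gives every thread
 * the same total order, so no pair of threads can acquire the two locks
 * in opposite orders and deadlock.  lock_upl_pair() is a hypothetical
 * name used only for this sketch.
 *
 *	static void
 *	lock_upl_pair(upl_t a, upl_t b)
 *	{
 *		if (a < b) {
 *			upl_lock(a);
 *			upl_lock(b);
 *		} else {
 *			upl_lock(b);
 *			upl_lock(a);
 *		}
 *	}
 */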
9435
9436 void
9437 upl_range_needed(
9438 upl_t upl,
9439 int index,
9440 int count)
9441 {
9442 upl_page_info_t *user_page_list;
9443 int size_in_pages;
9444
9445 if ( !(upl->flags & UPL_INTERNAL) || count <= 0)
9446 return;
9447
9448 size_in_pages = upl->size / PAGE_SIZE;
9449
9450 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
9451
9452 while (count-- && index < size_in_pages)
9453 user_page_list[index++].needed = TRUE;
9454 }
9455
9456
9457 /*
9458 * ENCRYPTED SWAP:
9459 *
9460 * Rationale: the user might have some encrypted data on disk (via
9461 * FileVault or any other mechanism). That data is then decrypted in
9462 * memory, which is safe as long as the machine is secure. But that
9463 * decrypted data in memory could be paged out to disk by the default
9464 * pager. The data would then be stored on disk in clear (not encrypted)
9465 * and it could be accessed by anyone who gets physical access to the
9466 * disk (if the laptop or the disk gets stolen for example). This weakens
9467 * the security offered by FileVault.
9468 *
9469 * Solution: the default pager will optionally request that all the
9470 * pages it gathers for pageout be encrypted, via the UPL interfaces,
9471 * before it sends this UPL to disk via the vnode_pageout() path.
9472 *
9473 * Notes:
9474 *
9475 * To avoid disrupting the VM LRU algorithms, we want to keep the
9476 * clean-in-place mechanisms, which allow us to send some extra pages to
9477 * swap (clustering) without actually removing them from the user's
9478 * address space. We don't want the user to unknowingly access encrypted
9479 * data, so we have to actually remove the encrypted pages from the page
9480 * table. When the user accesses the data, the hardware will fail to
9481 * locate the virtual page in its page table and will trigger a page
9482 * fault. We can then decrypt the page and enter it in the page table
9483 * again. Whenever we allow the user to access the contents of a page,
9484 * we have to make sure it's not encrypted.
9485 *
9486 *
9487 */
9488 /*
9489 * ENCRYPTED SWAP:
9490 * Reserve of virtual addresses in the kernel address space.
9491 * We need to map the physical pages in the kernel, so that we
9492 * can call the encryption/decryption routines with a kernel
9493 * virtual address. We keep this pool of pre-allocated kernel
9494 * virtual addresses so that we don't have to scan the kernel's
9495 * virtual address space each time we need to encrypt or decrypt
9496 * a physical page.
9497 * It would be nice to be able to encrypt and decrypt in physical
9498 * mode but that might not always be more efficient...
9499 */
9500 decl_simple_lock_data(,vm_paging_lock)
9501 #define VM_PAGING_NUM_PAGES 64
9502 vm_map_offset_t vm_paging_base_address = 0;
9503 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
9504 int vm_paging_max_index = 0;
9505 int vm_paging_page_waiter = 0;
9506 int vm_paging_page_waiter_total = 0;
9507 unsigned long vm_paging_no_kernel_page = 0;
9508 unsigned long vm_paging_objects_mapped = 0;
9509 unsigned long vm_paging_pages_mapped = 0;
9510 unsigned long vm_paging_objects_mapped_slow = 0;
9511 unsigned long vm_paging_pages_mapped_slow = 0;
9512
9513 void
9514 vm_paging_map_init(void)
9515 {
9516 kern_return_t kr;
9517 vm_map_offset_t page_map_offset;
9518 vm_map_entry_t map_entry;
9519
9520 assert(vm_paging_base_address == 0);
9521
9522 /*
9523 * Initialize our pool of pre-allocated kernel
9524 * virtual addresses.
9525 */
9526 page_map_offset = 0;
9527 kr = vm_map_find_space(kernel_map,
9528 &page_map_offset,
9529 VM_PAGING_NUM_PAGES * PAGE_SIZE,
9530 0,
9531 0,
9532 &map_entry);
9533 if (kr != KERN_SUCCESS) {
9534 panic("vm_paging_map_init: kernel_map full\n");
9535 }
9536 VME_OBJECT_SET(map_entry, kernel_object);
9537 VME_OFFSET_SET(map_entry, page_map_offset);
9538 map_entry->protection = VM_PROT_NONE;
9539 map_entry->max_protection = VM_PROT_NONE;
9540 map_entry->permanent = TRUE;
9541 vm_object_reference(kernel_object);
9542 vm_map_unlock(kernel_map);
9543
9544 assert(vm_paging_base_address == 0);
9545 vm_paging_base_address = page_map_offset;
9546 }
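/*
 * Editor's note (illustrative, not part of the original source): with
 * VM_PAGING_NUM_PAGES == 64 and, e.g., 4KB pages, the reserve carved out
 * above spans 256KB of contiguous kernel virtual address space.  Slot i
 * of vm_paging_page_inuse[] corresponds to the address
 * vm_paging_base_address + (i * PAGE_SIZE), which is how
 * vm_paging_map_object() and vm_paging_unmap_object() below convert
 * between pool indices and kernel virtual addresses.
 */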
9547
9548 /*
9549 * ENCRYPTED SWAP:
9550 * vm_paging_map_object:
9551 * Maps part of a VM object's pages in the kernel
9552 * virtual address space, using the pre-allocated
9553 * kernel virtual addresses, if possible.
9554 * Context:
9555 * The VM object is locked. This lock will get
9556 * dropped and re-acquired though, so the caller
9557 * must make sure the VM object is kept alive
9558 * (by holding a VM map that has a reference
9559 * on it, for example, or taking an extra reference).
9560 * The page should also be kept busy to prevent
9561 * it from being reclaimed.
9562 */
9563 kern_return_t
9564 vm_paging_map_object(
9565 vm_page_t page,
9566 vm_object_t object,
9567 vm_object_offset_t offset,
9568 vm_prot_t protection,
9569 boolean_t can_unlock_object,
9570 vm_map_size_t *size, /* IN/OUT */
9571 vm_map_offset_t *address, /* OUT */
9572 boolean_t *need_unmap) /* OUT */
9573 {
9574 kern_return_t kr;
9575 vm_map_offset_t page_map_offset;
9576 vm_map_size_t map_size;
9577 vm_object_offset_t object_offset;
9578 int i;
9579
9580 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
9581 /* use permanent 1-to-1 kernel mapping of physical memory ? */
9582 #if __x86_64__
9583 *address = (vm_map_offset_t)
9584 PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) <<
9585 PAGE_SHIFT);
9586 *need_unmap = FALSE;
9587 return KERN_SUCCESS;
9588 #else
9589 #warning "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
9590 #endif
9591
9592 assert(page->busy);
9593 /*
9594 * Use one of the pre-allocated kernel virtual addresses
9595 * and just enter the VM page in the kernel address space
9596 * at that virtual address.
9597 */
9598 simple_lock(&vm_paging_lock);
9599
9600 /*
9601 * Try and find an available kernel virtual address
9602 * from our pre-allocated pool.
9603 */
9604 page_map_offset = 0;
9605 for (;;) {
9606 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
9607 if (vm_paging_page_inuse[i] == FALSE) {
9608 page_map_offset =
9609 vm_paging_base_address +
9610 (i * PAGE_SIZE);
9611 break;
9612 }
9613 }
9614 if (page_map_offset != 0) {
9615 /* found a space to map our page ! */
9616 break;
9617 }
9618
9619 if (can_unlock_object) {
9620 /*
9621 * If we can afford to unlock the VM object,
9622 * let's take the slow path now...
9623 */
9624 break;
9625 }
9626 /*
9627 * We can't afford to unlock the VM object, so
9628 * let's wait for a space to become available...
9629 */
9630 vm_paging_page_waiter_total++;
9631 vm_paging_page_waiter++;
9632 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
9633 if (kr == THREAD_WAITING) {
9634 simple_unlock(&vm_paging_lock);
9635 kr = thread_block(THREAD_CONTINUE_NULL);
9636 simple_lock(&vm_paging_lock);
9637 }
9638 vm_paging_page_waiter--;
9639 /* ... and try again */
9640 }
9641
9642 if (page_map_offset != 0) {
9643 /*
9644 * We found a kernel virtual address;
9645 * map the physical page to that virtual address.
9646 */
9647 if (i > vm_paging_max_index) {
9648 vm_paging_max_index = i;
9649 }
9650 vm_paging_page_inuse[i] = TRUE;
9651 simple_unlock(&vm_paging_lock);
9652
9653 page->pmapped = TRUE;
9654
9655 /*
9656 * Keep the VM object locked over the PMAP_ENTER
9657 * and the actual use of the page by the kernel,
9658 * or this pmap mapping might get undone by a
9659 * vm_object_pmap_protect() call...
9660 */
9661 PMAP_ENTER(kernel_pmap,
9662 page_map_offset,
9663 page,
9664 protection,
9665 VM_PROT_NONE,
9666 0,
9667 TRUE);
9668 vm_paging_objects_mapped++;
9669 vm_paging_pages_mapped++;
9670 *address = page_map_offset;
9671 *need_unmap = TRUE;
9672
9673 /* all done and mapped, ready to use ! */
9674 return KERN_SUCCESS;
9675 }
9676
9677 /*
9678 * We ran out of pre-allocated kernel virtual
9679 * addresses. Just map the page in the kernel
9680 * the slow and regular way.
9681 */
9682 vm_paging_no_kernel_page++;
9683 simple_unlock(&vm_paging_lock);
9684 }
9685
9686 if (! can_unlock_object) {
9687 *address = 0;
9688 *size = 0;
9689 *need_unmap = FALSE;
9690 return KERN_NOT_SUPPORTED;
9691 }
9692
9693 object_offset = vm_object_trunc_page(offset);
9694 map_size = vm_map_round_page(*size,
9695 VM_MAP_PAGE_MASK(kernel_map));
9696
9697 /*
9698 * Try and map the required range of the object
9699 * in the kernel_map
9700 */
9701
9702 vm_object_reference_locked(object); /* for the map entry */
9703 vm_object_unlock(object);
9704
9705 kr = vm_map_enter(kernel_map,
9706 address,
9707 map_size,
9708 0,
9709 VM_FLAGS_ANYWHERE,
9710 object,
9711 object_offset,
9712 FALSE,
9713 protection,
9714 VM_PROT_ALL,
9715 VM_INHERIT_NONE);
9716 if (kr != KERN_SUCCESS) {
9717 *address = 0;
9718 *size = 0;
9719 *need_unmap = FALSE;
9720 vm_object_deallocate(object); /* for the map entry */
9721 vm_object_lock(object);
9722 return kr;
9723 }
9724
9725 *size = map_size;
9726
9727 /*
9728 * Enter the mapped pages in the page table now.
9729 */
9730 vm_object_lock(object);
9731 /*
9732 * VM object must be kept locked from before PMAP_ENTER()
9733 * until after the kernel is done accessing the page(s).
9734 * Otherwise, the pmap mappings in the kernel could be
9735 * undone by a call to vm_object_pmap_protect().
9736 */
9737
9738 for (page_map_offset = 0;
9739 map_size != 0;
9740 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
9741
9742 page = vm_page_lookup(object, offset + page_map_offset);
9743 if (page == VM_PAGE_NULL) {
9744 printf("vm_paging_map_object: no page!?\n");
9745 vm_object_unlock(object);
9746 kr = vm_map_remove(kernel_map, *address, *size,
9747 VM_MAP_NO_FLAGS);
9748 assert(kr == KERN_SUCCESS);
9749 *address = 0;
9750 *size = 0;
9751 *need_unmap = FALSE;
9752 vm_object_lock(object);
9753 return KERN_MEMORY_ERROR;
9754 }
9755 page->pmapped = TRUE;
9756
9757 //assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page)));
9758 PMAP_ENTER(kernel_pmap,
9759 *address + page_map_offset,
9760 page,
9761 protection,
9762 VM_PROT_NONE,
9763 0,
9764 TRUE);
9765 }
9766
9767 vm_paging_objects_mapped_slow++;
9768 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
9769
9770 *need_unmap = TRUE;
9771
9772 return KERN_SUCCESS;
9773 }
9774
9775 /*
9776 * ENCRYPTED SWAP:
9777 * vm_paging_unmap_object:
9778 * Unmaps part of a VM object's pages from the kernel
9779 * virtual address space.
9780 * Context:
9781 * The VM object is locked. This lock will get
9782 * dropped and re-acquired though.
9783 */
9784 void
9785 vm_paging_unmap_object(
9786 vm_object_t object,
9787 vm_map_offset_t start,
9788 vm_map_offset_t end)
9789 {
9790 kern_return_t kr;
9791 int i;
9792
9793 if ((vm_paging_base_address == 0) ||
9794 (start < vm_paging_base_address) ||
9795 (end > (vm_paging_base_address
9796 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
9797 /*
9798 * We didn't use our pre-allocated pool of
9799 * kernel virtual addresses. Deallocate the
9800 * virtual memory.
9801 */
9802 if (object != VM_OBJECT_NULL) {
9803 vm_object_unlock(object);
9804 }
9805 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
9806 if (object != VM_OBJECT_NULL) {
9807 vm_object_lock(object);
9808 }
9809 assert(kr == KERN_SUCCESS);
9810 } else {
9811 /*
9812 * We used a kernel virtual address from our
9813 * pre-allocated pool. Put it back in the pool
9814 * for next time.
9815 */
9816 assert(end - start == PAGE_SIZE);
9817 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
9818 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
9819
9820 /* undo the pmap mapping */
9821 pmap_remove(kernel_pmap, start, end);
9822
9823 simple_lock(&vm_paging_lock);
9824 vm_paging_page_inuse[i] = FALSE;
9825 if (vm_paging_page_waiter) {
9826 thread_wakeup(&vm_paging_page_waiter);
9827 }
9828 simple_unlock(&vm_paging_lock);
9829 }
9830 }
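/*
 * Illustrative sketch (editor's note, not part of the original source):
 * the map / use / unmap pairing expected by the two routines above, as
 * seen from a caller that holds the object lock and has marked the page
 * busy.  Error handling is elided; this mirrors what vm_page_encrypt()
 * and vm_page_decrypt() below actually do.
 *
 *	vm_map_size_t	ksize = PAGE_SIZE;
 *	vm_map_offset_t	kaddr = 0;
 *	boolean_t	needs_unmap = FALSE;
 *	kern_return_t	kr;
 *
 *	kr = vm_paging_map_object(page, object, page->offset,
 *				  VM_PROT_READ | VM_PROT_WRITE,
 *				  FALSE,		// can_unlock_object
 *				  &ksize, &kaddr, &needs_unmap);
 *	if (kr == KERN_SUCCESS) {
 *		// ... access the page through (void *)kaddr ...
 *		if (needs_unmap)
 *			vm_paging_unmap_object(object, kaddr, kaddr + ksize);
 *	}
 */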
9831
9832 #if ENCRYPTED_SWAP
9833 /*
9834 * Encryption data.
9835 * "iv" is the "initial vector". Ideally, we want to
9836 * have a different one for each page we encrypt, so that
9837 * crackers can't find encryption patterns too easily.
9838 */
9839 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
9840 boolean_t swap_crypt_ctx_initialized = FALSE;
9841 uint32_t swap_crypt_key[8]; /* big enough for a 256-bit key */
9842 aes_ctx swap_crypt_ctx;
9843 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
9844
9845 #if DEBUG
9846 boolean_t swap_crypt_ctx_tested = FALSE;
9847 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
9848 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
9849 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
9850 #endif /* DEBUG */
9851
9852 /*
9853 * Initialize the encryption context: key and key size.
9854 */
9855 void swap_crypt_ctx_initialize(void); /* forward */
9856 void
9857 swap_crypt_ctx_initialize(void)
9858 {
9859 unsigned int i;
9860
9861 /*
9862 * No need for locking to protect swap_crypt_ctx_initialized
9863 * because the first use of encryption will come from the
9864 * pageout thread (we won't pagein before there's been a pageout)
9865 * and there's only one pageout thread.
9866 */
9867 if (swap_crypt_ctx_initialized == FALSE) {
9868 for (i = 0;
9869 i < (sizeof (swap_crypt_key) /
9870 sizeof (swap_crypt_key[0]));
9871 i++) {
9872 swap_crypt_key[i] = random();
9873 }
9874 aes_encrypt_key((const unsigned char *) swap_crypt_key,
9875 SWAP_CRYPT_AES_KEY_SIZE,
9876 &swap_crypt_ctx.encrypt);
9877 aes_decrypt_key((const unsigned char *) swap_crypt_key,
9878 SWAP_CRYPT_AES_KEY_SIZE,
9879 &swap_crypt_ctx.decrypt);
9880 swap_crypt_ctx_initialized = TRUE;
9881 }
9882
9883 #if DEBUG
9884 /*
9885 * Validate the encryption algorithms.
9886 */
9887 if (swap_crypt_ctx_tested == FALSE) {
9888 /* initialize */
9889 for (i = 0; i < 4096; i++) {
9890 swap_crypt_test_page_ref[i] = (char) i;
9891 }
9892 /* encrypt */
9893 aes_encrypt_cbc(swap_crypt_test_page_ref,
9894 swap_crypt_null_iv,
9895 PAGE_SIZE / AES_BLOCK_SIZE,
9896 swap_crypt_test_page_encrypt,
9897 &swap_crypt_ctx.encrypt);
9898 /* decrypt */
9899 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
9900 swap_crypt_null_iv,
9901 PAGE_SIZE / AES_BLOCK_SIZE,
9902 swap_crypt_test_page_decrypt,
9903 &swap_crypt_ctx.decrypt);
9904 /* compare result with original */
9905 for (i = 0; i < 4096; i ++) {
9906 if (swap_crypt_test_page_decrypt[i] !=
9907 swap_crypt_test_page_ref[i]) {
9908 panic("encryption test failed");
9909 }
9910 }
9911
9912 /* encrypt again */
9913 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
9914 swap_crypt_null_iv,
9915 PAGE_SIZE / AES_BLOCK_SIZE,
9916 swap_crypt_test_page_decrypt,
9917 &swap_crypt_ctx.encrypt);
9918 /* decrypt in place */
9919 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
9920 swap_crypt_null_iv,
9921 PAGE_SIZE / AES_BLOCK_SIZE,
9922 swap_crypt_test_page_decrypt,
9923 &swap_crypt_ctx.decrypt);
9924 for (i = 0; i < 4096; i ++) {
9925 if (swap_crypt_test_page_decrypt[i] !=
9926 swap_crypt_test_page_ref[i]) {
9927 panic("in place encryption test failed");
9928 }
9929 }
9930
9931 swap_crypt_ctx_tested = TRUE;
9932 }
9933 #endif /* DEBUG */
9934 }
9935
9936 /*
9937 * ENCRYPTED SWAP:
9938 * vm_page_encrypt:
9939 * Encrypt the given page, for secure paging.
9940 * The page might already be mapped at kernel virtual
9941 * address "kernel_mapping_offset". Otherwise, we need
9942 * to map it.
9943 *
9944 * Context:
9945 * The page's object is locked, but this lock will be released
9946 * and re-acquired.
9947 * The page is busy and not accessible by users (not entered in any pmap).
9948 */
9949 void
9950 vm_page_encrypt(
9951 vm_page_t page,
9952 vm_map_offset_t kernel_mapping_offset)
9953 {
9954 kern_return_t kr;
9955 vm_map_size_t kernel_mapping_size;
9956 boolean_t kernel_mapping_needs_unmap;
9957 vm_offset_t kernel_vaddr;
9958 vm_object_t page_object;
9959 union {
9960 unsigned char aes_iv[AES_BLOCK_SIZE];
9961 struct {
9962 memory_object_t pager_object;
9963 vm_object_offset_t paging_offset;
9964 } vm;
9965 } encrypt_iv;
9966
9967 if (! vm_pages_encrypted) {
9968 vm_pages_encrypted = TRUE;
9969 }
9970
9971 assert(page->busy);
9972
9973 if (page->encrypted) {
9974 /*
9975 * Already encrypted: no need to do it again.
9976 */
9977 vm_page_encrypt_already_encrypted_counter++;
9978 return;
9979 }
9980 assert(page->dirty || page->precious);
9981
9982 ASSERT_PAGE_DECRYPTED(page);
9983
9984 page_object = VM_PAGE_OBJECT(page);
9985
9986 /*
9987 * Take a paging-in-progress reference to keep the object
9988 * alive even if we have to unlock it (in vm_paging_map_object()
9989 * for example)...
9990 */
9991 vm_object_paging_begin(page_object);
9992
9993 if (kernel_mapping_offset == 0) {
9994 /*
9995 * The page hasn't already been mapped in kernel space
9996 * by the caller. Map it now, so that we can access
9997 * its contents and encrypt them.
9998 */
9999 kernel_mapping_size = PAGE_SIZE;
10000 kernel_mapping_needs_unmap = FALSE;
10001 kr = vm_paging_map_object(page,
10002 page_object,
10003 page->offset,
10004 VM_PROT_READ | VM_PROT_WRITE,
10005 FALSE,
10006 &kernel_mapping_size,
10007 &kernel_mapping_offset,
10008 &kernel_mapping_needs_unmap);
10009 if (kr != KERN_SUCCESS) {
10010 panic("vm_page_encrypt: "
10011 "could not map page in kernel: 0x%x\n",
10012 kr);
10013 }
10014 } else {
10015 kernel_mapping_size = 0;
10016 kernel_mapping_needs_unmap = FALSE;
10017 }
10018 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
10019
10020 if (swap_crypt_ctx_initialized == FALSE) {
10021 swap_crypt_ctx_initialize();
10022 }
10023 assert(swap_crypt_ctx_initialized);
10024
10025 /*
10026 * Prepare an "initial vector" for the encryption.
10027 * We use the "pager" and the "paging_offset" for that
10028 * page to obfuscate the encrypted data a bit more and
10029 * prevent crackers from finding patterns that they could
10030 * use to break the key.
10031 */
10032 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
10033 encrypt_iv.vm.pager_object = page_object->pager;
10034 encrypt_iv.vm.paging_offset =
10035 page_object->paging_offset + page->offset;
10036
10037 /* encrypt the "initial vector" */
10038 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
10039 swap_crypt_null_iv,
10040 1,
10041 &encrypt_iv.aes_iv[0],
10042 &swap_crypt_ctx.encrypt);
10043
10044 /*
10045 * Encrypt the page.
10046 */
10047 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
10048 &encrypt_iv.aes_iv[0],
10049 PAGE_SIZE / AES_BLOCK_SIZE,
10050 (unsigned char *) kernel_vaddr,
10051 &swap_crypt_ctx.encrypt);
10052
10053 vm_page_encrypt_counter++;
10054
10055 /*
10056 * Unmap the page from the kernel's address space,
10057 * if we had to map it ourselves. Otherwise, let
10058 * the caller undo the mapping if needed.
10059 */
10060 if (kernel_mapping_needs_unmap) {
10061 vm_paging_unmap_object(page_object,
10062 kernel_mapping_offset,
10063 kernel_mapping_offset + kernel_mapping_size);
10064 }
10065
10066 /*
10067 * Clear the "reference" and "modified" bits.
10068 * This should clean up any impact the encryption had
10069 * on them.
10070 * The page was kept busy and disconnected from all pmaps,
10071 * so it can't have been referenced or modified from user
10072 * space.
10073 * The software bits will be reset later after the I/O
10074 * has completed (in upl_commit_range()).
10075 */
10076 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_REFERENCED | VM_MEM_MODIFIED);
10077
10078 page->encrypted = TRUE;
10079
10080 vm_object_paging_end(page_object);
10081 }
10082
10083 /*
10084 * ENCRYPTED SWAP:
10085 * vm_page_decrypt:
10086 * Decrypt the given page.
10087 * The page might already be mapped at kernel virtual
10088 * address "kernel_mapping_offset". Otherwise, we need
10089 * to map it.
10090 *
10091 * Context:
10092 * The page's VM object is locked but will be unlocked and relocked.
10093 * The page is busy and not accessible by users (not entered in any pmap).
10094 */
10095 void
10096 vm_page_decrypt(
10097 vm_page_t page,
10098 vm_map_offset_t kernel_mapping_offset)
10099 {
10100 kern_return_t kr;
10101 vm_map_size_t kernel_mapping_size;
10102 vm_offset_t kernel_vaddr;
10103 boolean_t kernel_mapping_needs_unmap;
10104 vm_object_t page_object;
10105 union {
10106 unsigned char aes_iv[AES_BLOCK_SIZE];
10107 struct {
10108 memory_object_t pager_object;
10109 vm_object_offset_t paging_offset;
10110 } vm;
10111 } decrypt_iv;
10112 boolean_t was_dirty;
10113
10114 assert(page->busy);
10115 assert(page->encrypted);
10116
10117 page_object = VM_PAGE_OBJECT(page);
10118 was_dirty = page->dirty;
10119
10120 /*
10121 * Take a paging-in-progress reference to keep the object
10122 * alive even if we have to unlock it (in vm_paging_map_object()
10123 * for example)...
10124 */
10125 vm_object_paging_begin(page_object);
10126
10127 if (kernel_mapping_offset == 0) {
10128 /*
10129 * The page hasn't already been mapped in kernel space
10130 * by the caller. Map it now, so that we can access
10131 * its contents and decrypt them.
10132 */
10133 kernel_mapping_size = PAGE_SIZE;
10134 kernel_mapping_needs_unmap = FALSE;
10135 kr = vm_paging_map_object(page,
10136 page_object,
10137 page->offset,
10138 VM_PROT_READ | VM_PROT_WRITE,
10139 FALSE,
10140 &kernel_mapping_size,
10141 &kernel_mapping_offset,
10142 &kernel_mapping_needs_unmap);
10143 if (kr != KERN_SUCCESS) {
10144 panic("vm_page_decrypt: "
10145 "could not map page in kernel: 0x%x\n",
10146 kr);
10147 }
10148 } else {
10149 kernel_mapping_size = 0;
10150 kernel_mapping_needs_unmap = FALSE;
10151 }
10152 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
10153
10154 assert(swap_crypt_ctx_initialized);
10155
10156 /*
10157 * Prepare an "initial vector" for the decryption.
10158 * It has to be the same as the "initial vector" we
10159 * used to encrypt that page.
10160 */
10161 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
10162 decrypt_iv.vm.pager_object = page_object->pager;
10163 decrypt_iv.vm.paging_offset =
10164 page_object->paging_offset + page->offset;
10165
10166 /* encrypt the "initial vector" */
10167 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
10168 swap_crypt_null_iv,
10169 1,
10170 &decrypt_iv.aes_iv[0],
10171 &swap_crypt_ctx.encrypt);
10172
10173 /*
10174 * Decrypt the page.
10175 */
10176 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
10177 &decrypt_iv.aes_iv[0],
10178 PAGE_SIZE / AES_BLOCK_SIZE,
10179 (unsigned char *) kernel_vaddr,
10180 &swap_crypt_ctx.decrypt);
10181 vm_page_decrypt_counter++;
10182
10183 /*
10184 * Unmap the page from the kernel's address space,
10185 * if we had to map it ourselves. Otherwise, let
10186 * the caller undo the mapping if needed.
10187 */
10188 if (kernel_mapping_needs_unmap) {
10189 vm_paging_unmap_object(page_object,
10190 kernel_vaddr,
10191 kernel_vaddr + PAGE_SIZE);
10192 }
10193
10194 if (was_dirty) {
10195 /*
10196 * The pager did not specify that the page would be
10197 * clean when it got paged in, so let's not clean it here
10198 * either.
10199 */
10200 } else {
10201 /*
10202 * After decryption, the page is actually still clean.
10203 * It was encrypted as part of paging, which "cleans"
10204 * the "dirty" pages.
10205 * No one could access it after it was encrypted
10206 * and the decryption doesn't count.
10207 */
10208 page->dirty = FALSE;
10209 assert (page->cs_validated == FALSE);
10210 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_MODIFIED | VM_MEM_REFERENCED);
10211 }
10212 page->encrypted = FALSE;
10213
10214 /*
10215 * We've just modified the page's contents via the data cache and part
10216 * of the new contents might still be in the cache and not yet in RAM.
10217 * Since the page is now available and might get gathered in a UPL to
10218 * be part of a DMA transfer from a driver that expects the memory to
10219 * be coherent at this point, we have to flush the data cache.
10220 */
10221 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(page));
10222 /*
10223 * Since the page is not mapped yet, some code might assume that it
10224 * doesn't need to invalidate the instruction cache when writing to
10225 * that page. That code relies on "pmapped" being FALSE, so that the
10226 * caches get synchronized when the page is first mapped.
10227 */
10228 assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page)));
10229 page->pmapped = FALSE;
10230 page->wpmapped = FALSE;
10231
10232 vm_object_paging_end(page_object);
10233 }
10234
10235 #if DEVELOPMENT || DEBUG
10236 unsigned long upl_encrypt_upls = 0;
10237 unsigned long upl_encrypt_pages = 0;
10238 #endif
10239
10240 /*
10241 * ENCRYPTED SWAP:
10242 *
10243 * upl_encrypt:
10244 * Encrypts all the pages in the UPL, within the specified range.
10245 *
10246 */
10247 void
10248 upl_encrypt(
10249 upl_t upl,
10250 upl_offset_t crypt_offset,
10251 upl_size_t crypt_size)
10252 {
10253 upl_size_t upl_size, subupl_size=crypt_size;
10254 upl_offset_t offset_in_upl, subupl_offset=crypt_offset;
10255 vm_object_t upl_object;
10256 vm_object_offset_t upl_offset;
10257 vm_page_t page;
10258 vm_object_t shadow_object;
10259 vm_object_offset_t shadow_offset;
10260 vm_object_offset_t paging_offset;
10261 vm_object_offset_t base_offset;
10262 int isVectorUPL = 0;
10263 upl_t vector_upl = NULL;
10264
10265 if((isVectorUPL = vector_upl_is_valid(upl)))
10266 vector_upl = upl;
10267
10268 process_upl_to_encrypt:
10269 if(isVectorUPL) {
10270 crypt_size = subupl_size;
10271 crypt_offset = subupl_offset;
10272 upl = vector_upl_subupl_byoffset(vector_upl, &crypt_offset, &crypt_size);
10273 if(upl == NULL)
10274 panic("upl_encrypt: Accessing a sub-upl that doesn't exist\n");
10275 subupl_size -= crypt_size;
10276 subupl_offset += crypt_size;
10277 }
10278
10279 #if DEVELOPMENT || DEBUG
10280 upl_encrypt_upls++;
10281 upl_encrypt_pages += crypt_size / PAGE_SIZE;
10282 #endif
10283 upl_object = upl->map_object;
10284 upl_offset = upl->offset;
10285 upl_size = upl->size;
10286
10287 vm_object_lock(upl_object);
10288
10289 /*
10290 * Find the VM object that contains the actual pages.
10291 */
10292 if (upl_object->pageout) {
10293 shadow_object = upl_object->shadow;
10294 /*
10295 * The offset in the shadow object is actually also
10296 * accounted for in upl->offset. It possibly shouldn't be
10297 * this way, but for now don't account for it twice.
10298 */
10299 shadow_offset = 0;
10300 assert(upl_object->paging_offset == 0); /* XXX ? */
10301 vm_object_lock(shadow_object);
10302 } else {
10303 shadow_object = upl_object;
10304 shadow_offset = 0;
10305 }
10306
10307 paging_offset = shadow_object->paging_offset;
10308 vm_object_paging_begin(shadow_object);
10309
10310 if (shadow_object != upl_object)
10311 vm_object_unlock(upl_object);
10312
10313
10314 base_offset = shadow_offset;
10315 base_offset += upl_offset;
10316 base_offset += crypt_offset;
10317 base_offset -= paging_offset;
10318
10319 assert(crypt_offset + crypt_size <= upl_size);
10320
10321 for (offset_in_upl = 0;
10322 offset_in_upl < crypt_size;
10323 offset_in_upl += PAGE_SIZE) {
10324 page = vm_page_lookup(shadow_object,
10325 base_offset + offset_in_upl);
10326 if (page == VM_PAGE_NULL) {
10327 panic("upl_encrypt: "
10328 "no page for (obj=%p,off=0x%llx+0x%x)!\n",
10329 shadow_object,
10330 base_offset,
10331 offset_in_upl);
10332 }
10333 /*
10334 * Disconnect the page from all pmaps, so that nobody can
10335 * access it while it's encrypted. After that point, all
10336 * accesses to this page will cause a page fault and block
10337 * while the page is busy being encrypted. After the
10338 * encryption completes, any access will cause a
10339 * page fault and the page gets decrypted at that time.
10340 */
10341 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
10342 vm_page_encrypt(page, 0);
10343
10344 if (vm_object_lock_avoid(shadow_object)) {
10345 /*
10346 * Give vm_pageout_scan() a chance to convert more
10347 * pages from "clean-in-place" to "clean-and-free",
10348 * if it's interested in the same pages we selected
10349 * in this cluster.
10350 */
10351 vm_object_unlock(shadow_object);
10352 mutex_pause(2);
10353 vm_object_lock(shadow_object);
10354 }
10355 }
10356
10357 vm_object_paging_end(shadow_object);
10358 vm_object_unlock(shadow_object);
10359
10360 if(isVectorUPL && subupl_size)
10361 goto process_upl_to_encrypt;
10362 }
10363
10364 #else /* ENCRYPTED_SWAP */
10365 void
10366 upl_encrypt(
10367 __unused upl_t upl,
10368 __unused upl_offset_t crypt_offset,
10369 __unused upl_size_t crypt_size)
10370 {
10371 }
10372
10373 void
10374 vm_page_encrypt(
10375 __unused vm_page_t page,
10376 __unused vm_map_offset_t kernel_mapping_offset)
10377 {
10378 }
10379
10380 void
10381 vm_page_decrypt(
10382 __unused vm_page_t page,
10383 __unused vm_map_offset_t kernel_mapping_offset)
10384 {
10385 }
10386
10387 #endif /* ENCRYPTED_SWAP */
10388
10389 /*
10390 * page->object must be locked
10391 */
10392 void
10393 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10394 {
10395 if (!queues_locked) {
10396 vm_page_lockspin_queues();
10397 }
10398
10399 page->free_when_done = FALSE;
10400 /*
10401 * need to drop the laundry count...
10402 * we may also need to remove it
10403 * from the I/O paging queue...
10404 * vm_pageout_throttle_up handles both cases
10405 *
10406 * the laundry and pageout_queue flags are cleared...
10407 */
10408 vm_pageout_throttle_up(page);
10409
10410 vm_page_steal_pageout_page++;
10411
10412 if (!queues_locked) {
10413 vm_page_unlock_queues();
10414 }
10415 }
10416
10417 upl_t
10418 vector_upl_create(vm_offset_t upl_offset)
10419 {
10420 int vector_upl_size = sizeof(struct _vector_upl);
10421 int i=0;
10422 upl_t upl;
10423 vector_upl_t vector_upl = (vector_upl_t)kalloc(vector_upl_size);
10424
10425 upl = upl_create(0,UPL_VECTOR,0);
10426 upl->vector_upl = vector_upl;
10427 upl->offset = upl_offset;
10428 vector_upl->size = 0;
10429 vector_upl->offset = upl_offset;
10430 vector_upl->invalid_upls=0;
10431 vector_upl->num_upls=0;
10432 vector_upl->pagelist = NULL;
10433
10434 for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) {
10435 vector_upl->upl_iostates[i].size = 0;
10436 vector_upl->upl_iostates[i].offset = 0;
10437
10438 }
10439 return upl;
10440 }
10441
10442 void
10443 vector_upl_deallocate(upl_t upl)
10444 {
10445 if(upl) {
10446 vector_upl_t vector_upl = upl->vector_upl;
10447 if(vector_upl) {
10448 if(vector_upl->invalid_upls != vector_upl->num_upls)
10449 panic("Deallocating non-empty Vectored UPL\n");
10450 kfree(vector_upl->pagelist,(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)));
10451 vector_upl->invalid_upls=0;
10452 vector_upl->num_upls = 0;
10453 vector_upl->pagelist = NULL;
10454 vector_upl->size = 0;
10455 vector_upl->offset = 0;
10456 kfree(vector_upl, sizeof(struct _vector_upl));
10457 vector_upl = (vector_upl_t)0xfeedfeed;
10458 }
10459 else
10460 panic("vector_upl_deallocate was passed a non-vectored upl\n");
10461 }
10462 else
10463 panic("vector_upl_deallocate was passed a NULL upl\n");
10464 }
10465
10466 boolean_t
10467 vector_upl_is_valid(upl_t upl)
10468 {
10469 if(upl && ((upl->flags & UPL_VECTOR)==UPL_VECTOR)) {
10470 vector_upl_t vector_upl = upl->vector_upl;
10471 if(vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef)
10472 return FALSE;
10473 else
10474 return TRUE;
10475 }
10476 return FALSE;
10477 }
10478
10479 boolean_t
10480 vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
10481 {
10482 if(vector_upl_is_valid(upl)) {
10483 vector_upl_t vector_upl = upl->vector_upl;
10484
10485 if(vector_upl) {
10486 if(subupl) {
10487 if(io_size) {
10488 if(io_size < PAGE_SIZE)
10489 io_size = PAGE_SIZE;
10490 subupl->vector_upl = (void*)vector_upl;
10491 vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
10492 vector_upl->size += io_size;
10493 upl->size += io_size;
10494 }
10495 else {
10496 uint32_t i=0,invalid_upls=0;
10497 for(i = 0; i < vector_upl->num_upls; i++) {
10498 if(vector_upl->upl_elems[i] == subupl)
10499 break;
10500 }
10501 if(i == vector_upl->num_upls)
10502 panic("Trying to remove sub-upl when none exists");
10503
10504 vector_upl->upl_elems[i] = NULL;
10505 invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
10506 if(invalid_upls == vector_upl->num_upls)
10507 return TRUE;
10508 else
10509 return FALSE;
10510 }
10511 }
10512 else
10513 panic("vector_upl_set_subupl was passed a NULL upl element\n");
10514 }
10515 else
10516 panic("vector_upl_set_subupl was passed a non-vectored upl\n");
10517 }
10518 else
10519 panic("vector_upl_set_subupl was passed a NULL upl\n");
10520
10521 return FALSE;
10522 }
10523
10524 void
10525 vector_upl_set_pagelist(upl_t upl)
10526 {
10527 if(vector_upl_is_valid(upl)) {
10528 uint32_t i=0;
10529 vector_upl_t vector_upl = upl->vector_upl;
10530
10531 if(vector_upl) {
10532 vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0;
10533
10534 vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE));
10535
10536 for(i=0; i < vector_upl->num_upls; i++) {
10537 cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE;
10538 bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10539 pagelist_size += cur_upl_pagelist_size;
10540 if(vector_upl->upl_elems[i]->highest_page > upl->highest_page)
10541 upl->highest_page = vector_upl->upl_elems[i]->highest_page;
10542 }
10543 assert( pagelist_size == (sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)) );
10544 }
10545 else
10546 panic("vector_upl_set_pagelist was passed a non-vectored upl\n");
10547 }
10548 else
10549 panic("vector_upl_set_pagelist was passed a NULL upl\n");
10550
10551 }
10552
10553 upl_t
10554 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10555 {
10556 if(vector_upl_is_valid(upl)) {
10557 vector_upl_t vector_upl = upl->vector_upl;
10558 if(vector_upl) {
10559 if(index < vector_upl->num_upls)
10560 return vector_upl->upl_elems[index];
10561 }
10562 else
10563 panic("vector_upl_subupl_byindex was passed a non-vectored upl\n");
10564 }
10565 return NULL;
10566 }
10567
10568 upl_t
10569 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10570 {
10571 if(vector_upl_is_valid(upl)) {
10572 uint32_t i=0;
10573 vector_upl_t vector_upl = upl->vector_upl;
10574
10575 if(vector_upl) {
10576 upl_t subupl = NULL;
10577 vector_upl_iostates_t subupl_state;
10578
10579 for(i=0; i < vector_upl->num_upls; i++) {
10580 subupl = vector_upl->upl_elems[i];
10581 subupl_state = vector_upl->upl_iostates[i];
10582 if( *upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10583 /* We could have been passed an offset/size pair that belongs
10584 * to an UPL element that has already been committed/aborted.
10585 * If so, return NULL.
10586 */
10587 if(subupl == NULL)
10588 return NULL;
10589 if((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10590 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10591 if(*upl_size > subupl_state.size)
10592 *upl_size = subupl_state.size;
10593 }
10594 if(*upl_offset >= subupl_state.offset)
10595 *upl_offset -= subupl_state.offset;
10596 else if(i)
10597 panic("Vector UPL offset miscalculation\n");
10598 return subupl;
10599 }
10600 }
10601 }
10602 else
10603 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL\n");
10604 }
10605 return NULL;
10606 }
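/*
 * Illustrative sketch (editor's note, not part of the original source):
 * how a caller such as upl_encrypt() above walks a vectored UPL, peeling
 * off one sub-UPL at a time by offset until the requested range is
 * exhausted.  The names off/size are hypothetical locals for this sketch.
 *
 *	upl_offset_t	off  = start_offset;
 *	upl_size_t	size = total_size;
 *
 *	while (size) {
 *		upl_offset_t	sub_off  = off;
 *		upl_size_t	sub_size = size;
 *		upl_t		subupl;
 *
 *		subupl = vector_upl_subupl_byoffset(vector_upl,
 *						    &sub_off, &sub_size);
 *		if (subupl == NULL)
 *			break;	// sub-UPL already committed/aborted
 *		// ... operate on subupl at sub_off for sub_size bytes ...
 *		off  += sub_size;
 *		size -= sub_size;
 *	}
 */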
10607
10608 void
10609 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10610 {
10611 *v_upl_submap = NULL;
10612
10613 if(vector_upl_is_valid(upl)) {
10614 vector_upl_t vector_upl = upl->vector_upl;
10615 if(vector_upl) {
10616 *v_upl_submap = vector_upl->submap;
10617 *submap_dst_addr = vector_upl->submap_dst_addr;
10618 }
10619 else
10620 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
10621 }
10622 else
10623 panic("vector_upl_get_submap was passed a null UPL\n");
10624 }
10625
10626 void
10627 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10628 {
10629 if(vector_upl_is_valid(upl)) {
10630 vector_upl_t vector_upl = upl->vector_upl;
10631 if(vector_upl) {
10632 vector_upl->submap = submap;
10633 vector_upl->submap_dst_addr = submap_dst_addr;
10634 }
10635 else
10636 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
10637 }
10638 else
10639 panic("vector_upl_get_submap was passed a NULL UPL\n");
10640 }
10641
10642 void
10643 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10644 {
10645 if(vector_upl_is_valid(upl)) {
10646 uint32_t i = 0;
10647 vector_upl_t vector_upl = upl->vector_upl;
10648
10649 if(vector_upl) {
10650 for(i = 0; i < vector_upl->num_upls; i++) {
10651 if(vector_upl->upl_elems[i] == subupl)
10652 break;
10653 }
10654
10655 if(i == vector_upl->num_upls)
10656 panic("setting sub-upl iostate when none exists");
10657
10658 vector_upl->upl_iostates[i].offset = offset;
10659 if(size < PAGE_SIZE)
10660 size = PAGE_SIZE;
10661 vector_upl->upl_iostates[i].size = size;
10662 }
10663 else
10664 panic("vector_upl_set_iostate was passed a non-vectored UPL\n");
10665 }
10666 else
10667 panic("vector_upl_set_iostate was passed a NULL UPL\n");
10668 }
10669
10670 void
10671 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10672 {
10673 if(vector_upl_is_valid(upl)) {
10674 uint32_t i = 0;
10675 vector_upl_t vector_upl = upl->vector_upl;
10676
10677 if(vector_upl) {
10678 for(i = 0; i < vector_upl->num_upls; i++) {
10679 if(vector_upl->upl_elems[i] == subupl)
10680 break;
10681 }
10682
10683 if(i == vector_upl->num_upls)
10684 panic("getting sub-upl iostate when none exists");
10685
10686 *offset = vector_upl->upl_iostates[i].offset;
10687 *size = vector_upl->upl_iostates[i].size;
10688 }
10689 else
10690 panic("vector_upl_get_iostate was passed a non-vectored UPL\n");
10691 }
10692 else
10693 panic("vector_upl_get_iostate was passed a NULL UPL\n");
10694 }
10695
10696 void
10697 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10698 {
10699 if(vector_upl_is_valid(upl)) {
10700 vector_upl_t vector_upl = upl->vector_upl;
10701 if(vector_upl) {
10702 if(index < vector_upl->num_upls) {
10703 *offset = vector_upl->upl_iostates[index].offset;
10704 *size = vector_upl->upl_iostates[index].size;
10705 }
10706 else
10707 *offset = *size = 0;
10708 }
10709 else
10710 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL\n");
10711 }
10712 else
10713 panic("vector_upl_get_iostate_byindex was passed a NULL UPL\n");
10714 }
10715
10716 upl_page_info_t *
10717 upl_get_internal_vectorupl_pagelist(upl_t upl)
10718 {
10719 return ((vector_upl_t)(upl->vector_upl))->pagelist;
10720 }
10721
10722 void *
10723 upl_get_internal_vectorupl(upl_t upl)
10724 {
10725 return upl->vector_upl;
10726 }
10727
10728 vm_size_t
10729 upl_get_internal_pagelist_offset(void)
10730 {
10731 return sizeof(struct upl);
10732 }
10733
10734 void
10735 upl_clear_dirty(
10736 upl_t upl,
10737 boolean_t value)
10738 {
10739 if (value) {
10740 upl->flags |= UPL_CLEAR_DIRTY;
10741 } else {
10742 upl->flags &= ~UPL_CLEAR_DIRTY;
10743 }
10744 }
10745
10746 void
10747 upl_set_referenced(
10748 upl_t upl,
10749 boolean_t value)
10750 {
10751 upl_lock(upl);
10752 if (value) {
10753 upl->ext_ref_count++;
10754 } else {
10755 if (!upl->ext_ref_count) {
10756 panic("upl_set_referenced not %p\n", upl);
10757 }
10758 upl->ext_ref_count--;
10759 }
10760 upl_unlock(upl);
10761 }
10762
10763 #if CONFIG_IOSCHED
10764 void
10765 upl_set_blkno(
10766 upl_t upl,
10767 vm_offset_t upl_offset,
10768 int io_size,
10769 int64_t blkno)
10770 {
10771 int i,j;
10772 if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0)
10773 return;
10774
10775 assert(upl->upl_reprio_info != 0);
10776 for(i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10777 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10778 }
10779 }
10780 #endif
10781
10782 boolean_t
10783 vm_page_is_slideable(vm_page_t m)
10784 {
10785 boolean_t result = FALSE;
10786 vm_shared_region_slide_info_t si;
10787 vm_object_t m_object;
10788
10789 m_object = VM_PAGE_OBJECT(m);
10790
10791 vm_object_lock_assert_held(m_object);
10792
10793 /* make sure our page belongs to the one object allowed to do this */
10794 if (!m_object->object_slid) {
10795 goto done;
10796 }
10797
10798 si = m_object->vo_slide_info;
10799 if (si == NULL) {
10800 goto done;
10801 }
10802
10803 if(!m->slid && (si->start <= m->offset && si->end > m->offset)) {
10804 result = TRUE;
10805 }
10806
10807 done:
10808 return result;
10809 }
10810
10811 int vm_page_slide_counter = 0;
10812 int vm_page_slide_errors = 0;
10813 kern_return_t
10814 vm_page_slide(
10815 vm_page_t page,
10816 vm_map_offset_t kernel_mapping_offset)
10817 {
10818 kern_return_t kr;
10819 vm_map_size_t kernel_mapping_size;
10820 boolean_t kernel_mapping_needs_unmap;
10821 vm_offset_t kernel_vaddr;
10822 uint32_t pageIndex;
10823 uint32_t slide_chunk;
10824 vm_object_t page_object;
10825
10826 page_object = VM_PAGE_OBJECT(page);
10827
10828 assert(!page->slid);
10829 assert(page_object->object_slid);
10830 vm_object_lock_assert_exclusive(page_object);
10831
10832 if (page->error)
10833 return KERN_FAILURE;
10834
10835 /*
10836 * Take a paging-in-progress reference to keep the object
10837 * alive even if we have to unlock it (in vm_paging_map_object()
10838 * for example)...
10839 */
10840 vm_object_paging_begin(page_object);
10841
10842 if (kernel_mapping_offset == 0) {
10843 /*
10844 * The page hasn't already been mapped in kernel space
10845 * by the caller. Map it now, so that we can access
10846 * its contents and slide them.
10847 */
10848 kernel_mapping_size = PAGE_SIZE;
10849 kernel_mapping_needs_unmap = FALSE;
10850 kr = vm_paging_map_object(page,
10851 page_object,
10852 page->offset,
10853 VM_PROT_READ | VM_PROT_WRITE,
10854 FALSE,
10855 &kernel_mapping_size,
10856 &kernel_mapping_offset,
10857 &kernel_mapping_needs_unmap);
10858 if (kr != KERN_SUCCESS) {
10859 panic("vm_page_slide: "
10860 "could not map page in kernel: 0x%x\n",
10861 kr);
10862 }
10863 } else {
10864 kernel_mapping_size = 0;
10865 kernel_mapping_needs_unmap = FALSE;
10866 }
10867 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
10868
10869 /*
10870 * Slide the pointers on the page.
10871 */
10872
10873 /*assert that slide_file_info.start/end are page-aligned?*/
10874
10875 assert(!page->slid);
10876 assert(page_object->object_slid);
10877
10878 pageIndex = (uint32_t)((page->offset -
10879 page_object->vo_slide_info->start) /
10880 PAGE_SIZE_FOR_SR_SLIDE);
10881 for (slide_chunk = 0;
10882 slide_chunk < PAGE_SIZE / PAGE_SIZE_FOR_SR_SLIDE;
10883 slide_chunk++) {
10884 kr = vm_shared_region_slide_page(page_object->vo_slide_info,
10885 (kernel_vaddr +
10886 (slide_chunk *
10887 PAGE_SIZE_FOR_SR_SLIDE)),
10888 (pageIndex + slide_chunk));
10889 if (kr != KERN_SUCCESS) {
10890 break;
10891 }
10892 }
10893
10894 vm_page_slide_counter++;
10895
10896 /*
10897 * Unmap the page from the kernel's address space,
10898 */
10899 if (kernel_mapping_needs_unmap) {
10900 vm_paging_unmap_object(page_object,
10901 kernel_vaddr,
10902 kernel_vaddr + PAGE_SIZE);
10903 }
10904
10905 page->dirty = FALSE;
10906 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_MODIFIED | VM_MEM_REFERENCED);
10907
10908 if (kr != KERN_SUCCESS || cs_debug > 1) {
10909 printf("vm_page_slide(%p): "
10910 "obj %p off 0x%llx mobj %p moff 0x%llx\n",
10911 page,
10912 page_object, page->offset,
10913 page_object->pager,
10914 page->offset + page_object->paging_offset);
10915 }
10916
10917 if (kr == KERN_SUCCESS) {
10918 page->slid = TRUE;
10919 } else {
10920 page->error = TRUE;
10921 vm_page_slide_errors++;
10922 }
10923
10924 vm_object_paging_end(page_object);
10925
10926 return kr;
10927 }
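/*
 * Editor's note (illustrative, not part of the original source): when the
 * kernel page size is larger than PAGE_SIZE_FOR_SR_SLIDE, a single
 * vm_page_t spans several shared-region slide chunks, which is why the
 * loop above slides PAGE_SIZE / PAGE_SIZE_FOR_SR_SLIDE consecutive chunks
 * starting at pageIndex = (page->offset - start) / PAGE_SIZE_FOR_SR_SLIDE.
 * For example, assuming 4KB slide chunks, a 16KB kernel page would be
 * processed as four chunk slides per call.
 */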
10928
10929 void inline memoryshot(unsigned int event, unsigned int control)
10930 {
10931 if (vm_debug_events) {
10932 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10933 vm_page_active_count, vm_page_inactive_count,
10934 vm_page_free_count, vm_page_speculative_count,
10935 vm_page_throttled_count);
10936 } else {
10937 (void) event;
10938 (void) control;
10939 }
10940
10941 }
10942
10943 #ifdef MACH_BSD
10944
10945 boolean_t upl_device_page(upl_page_info_t *upl)
10946 {
10947 return(UPL_DEVICE_PAGE(upl));
10948 }
10949 boolean_t upl_page_present(upl_page_info_t *upl, int index)
10950 {
10951 return(UPL_PAGE_PRESENT(upl, index));
10952 }
10953 boolean_t upl_speculative_page(upl_page_info_t *upl, int index)
10954 {
10955 return(UPL_SPECULATIVE_PAGE(upl, index));
10956 }
10957 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
10958 {
10959 return(UPL_DIRTY_PAGE(upl, index));
10960 }
10961 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
10962 {
10963 return(UPL_VALID_PAGE(upl, index));
10964 }
10965 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
10966 {
10967 return(UPL_PHYS_PAGE(upl, index));
10968 }
10969
10970 void upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10971 {
10972 upl[index].mark = v;
10973 }
10974
10975 boolean_t upl_page_get_mark(upl_page_info_t *upl, int index)
10976 {
10977 return upl[index].mark;
10978 }
10979
10980 void
10981 vm_countdirtypages(void)
10982 {
10983 vm_page_t m;
10984 int dpages;
10985 int pgopages;
10986 int precpages;
10987
10988
10989 dpages=0;
10990 pgopages=0;
10991 precpages=0;
10992
10993 vm_page_lock_queues();
10994 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10995 do {
10996 if (m ==(vm_page_t )0) break;
10997
10998 if(m->dirty) dpages++;
10999 if(m->free_when_done) pgopages++;
11000 if(m->precious) precpages++;
11001
11002 assert(VM_PAGE_OBJECT(m) != kernel_object);
11003 m = (vm_page_t) vm_page_queue_next(&m->pageq);
11004 if (m ==(vm_page_t )0) break;
11005
11006 } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
11007 vm_page_unlock_queues();
11008
11009 vm_page_lock_queues();
11010 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
11011 do {
11012 if (m ==(vm_page_t )0) break;
11013
11014 dpages++;
11015 assert(m->dirty);
11016 assert(!m->free_when_done);
11017 assert(VM_PAGE_OBJECT(m) != kernel_object);
11018 m = (vm_page_t) vm_page_queue_next(&m->pageq);
11019 if (m ==(vm_page_t )0) break;
11020
11021 } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
11022 vm_page_unlock_queues();
11023
11024 vm_page_lock_queues();
11025 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
11026 do {
11027 if (m ==(vm_page_t )0) break;
11028
11029 if(m->dirty) dpages++;
11030 if(m->free_when_done) pgopages++;
11031 if(m->precious) precpages++;
11032
11033 assert(VM_PAGE_OBJECT(m) != kernel_object);
11034 m = (vm_page_t) vm_page_queue_next(&m->pageq);
11035 if (m ==(vm_page_t )0) break;
11036
11037 } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
11038 vm_page_unlock_queues();
11039
11040 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
11041
11042 dpages=0;
11043 pgopages=0;
11044 precpages=0;
11045
11046 vm_page_lock_queues();
11047 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
11048
11049 do {
11050 if(m == (vm_page_t )0) break;
11051 if(m->dirty) dpages++;
11052 if(m->free_when_done) pgopages++;
11053 if(m->precious) precpages++;
11054
11055 assert(VM_PAGE_OBJECT(m) != kernel_object);
11056 m = (vm_page_t) vm_page_queue_next(&m->pageq);
11057 if(m == (vm_page_t )0) break;
11058
11059 } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
11060 vm_page_unlock_queues();
11061
11062 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
11063
11064 }
11065 #endif /* MACH_BSD */
11066
11067 ppnum_t upl_get_highest_page(
11068 upl_t upl)
11069 {
11070 return upl->highest_page;
11071 }
11072
11073 upl_size_t upl_get_size(
11074 upl_t upl)
11075 {
11076 return upl->size;
11077 }
11078
11079 upl_t upl_associated_upl(upl_t upl)
11080 {
11081 return upl->associated_upl;
11082 }
11083
11084 void upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11085 {
11086 upl->associated_upl = associated_upl;
11087 }
11088
11089 struct vnode * upl_lookup_vnode(upl_t upl)
11090 {
11091 if (!upl->map_object->internal)
11092 return vnode_pager_lookup_vnode(upl->map_object->pager);
11093 else
11094 return NULL;
11095 }
11096
11097 #if UPL_DEBUG
11098 kern_return_t upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
11099 {
11100 upl->ubc_alias1 = alias1;
11101 upl->ubc_alias2 = alias2;
11102 return KERN_SUCCESS;
11103 }
11104 int upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
11105 {
11106 if(al)
11107 *al = upl->ubc_alias1;
11108 if(al2)
11109 *al2 = upl->ubc_alias2;
11110 return KERN_SUCCESS;
11111 }
11112 #endif /* UPL_DEBUG */
11113
11114 #if VM_PRESSURE_EVENTS
11115 /*
11116 * Upward trajectory.
11117 */
11118 extern boolean_t vm_compressor_low_on_space(void);
11119
11120 boolean_t
11121 VM_PRESSURE_NORMAL_TO_WARNING(void) {
11122
11123 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11124
11125 /* Available pages below our threshold */
11126 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11127 /* No frozen processes to kill */
11128 if (memorystatus_frozen_count == 0) {
11129 /* Not enough suspended processes available. */
11130 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11131 return TRUE;
11132 }
11133 }
11134 }
11135 return FALSE;
11136
11137 } else {
11138 return ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0);
11139 }
11140 }
11141
11142 boolean_t
11143 VM_PRESSURE_WARNING_TO_CRITICAL(void) {
11144
11145 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11146
11147 /* Available pages below our threshold */
11148 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11149 return TRUE;
11150 }
11151 return FALSE;
11152 } else {
11153 return (vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0);
11154 }
11155 }
11156
11157 /*
11158 * Downward trajectory.
11159 */
11160 boolean_t
11161 VM_PRESSURE_WARNING_TO_NORMAL(void) {
11162
11163 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11164
11165 /* Available pages above our threshold */
11166 unsigned int target_threshold = memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100);
11167 if (memorystatus_available_pages > target_threshold) {
11168 return TRUE;
11169 }
11170 return FALSE;
11171 } else {
11172 return ((AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0);
11173 }
11174 }
11175
11176 boolean_t
11177 VM_PRESSURE_CRITICAL_TO_WARNING(void) {
11178
11179 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11180
11181 /* Available pages above our threshold */
11182 unsigned int target_threshold = memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100);
11183 if (memorystatus_available_pages > target_threshold) {
11184 return TRUE;
11185 }
11186 return FALSE;
11187 } else {
11188 return ((AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0);
11189 }
11190 }
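/*
 * Editor's note (illustrative arithmetic, not part of the original
 * source): the 15% padding above provides hysteresis between pressure
 * levels.  If memorystatus_available_pages_pressure were, say, 4000
 * pages, the system would report the warning level once available pages
 * fall below 4000, but would not report a return to normal until they
 * climb back above 4000 + (15 * 4000) / 100 = 4600, which avoids rapid
 * flapping around a single threshold.
 */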
11191 #endif /* VM_PRESSURE_EVENTS */
11192