[apple/xnu.git] / osfmk / vm / vm_pageout.c (xnu-1228.0.2)
1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67
68 #include <debug.h>
69 #include <mach_pagemap.h>
70 #include <mach_cluster_stats.h>
71 #include <mach_kdb.h>
72 #include <advisory_pageout.h>
73
74 #include <mach/mach_types.h>
75 #include <mach/memory_object.h>
76 #include <mach/memory_object_default.h>
77 #include <mach/memory_object_control_server.h>
78 #include <mach/mach_host_server.h>
79 #include <mach/upl.h>
80 #include <mach/vm_map.h>
81 #include <mach/vm_param.h>
82 #include <mach/vm_statistics.h>
83 #include <mach/sdt.h>
84
85 #include <kern/kern_types.h>
86 #include <kern/counters.h>
87 #include <kern/host_statistics.h>
88 #include <kern/machine.h>
89 #include <kern/misc_protos.h>
90 #include <kern/thread.h>
91 #include <kern/xpr.h>
92 #include <kern/kalloc.h>
93
94 #include <machine/vm_tuning.h>
95
96 #if CONFIG_EMBEDDED
97 #include <sys/kern_memorystatus.h>
98 #endif
99
100 #include <vm/pmap.h>
101 #include <vm/vm_fault.h>
102 #include <vm/vm_map.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_page.h>
105 #include <vm/vm_pageout.h>
106 #include <vm/vm_protos.h> /* must be last */
107 #include <vm/memory_object.h>
108 #include <vm/vm_purgeable_internal.h>
109
110 /*
111 * ENCRYPTED SWAP:
112 */
113 #include <../bsd/crypto/aes/aes.h>
114
115
116 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */
117 #ifdef CONFIG_EMBEDDED
118 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 2048
119 #else
120 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100
121 #endif
122 #endif
123
124 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
125 #ifdef CONFIG_EMBEDDED
126 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
127 #else
128 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
129 #endif
130 #endif
131
132 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
133 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
134 #endif
135
136 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
137 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
138 #endif
139
140 #ifndef VM_PAGE_LAUNDRY_MAX
141 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
142 #endif /* VM_PAGE_LAUNDRY_MAX */
143
144 #ifndef VM_PAGEOUT_BURST_WAIT
145 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
146 #endif /* VM_PAGEOUT_BURST_WAIT */
147
148 #ifndef VM_PAGEOUT_EMPTY_WAIT
149 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
150 #endif /* VM_PAGEOUT_EMPTY_WAIT */
151
152 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
153 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
154 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
155
156 #ifndef VM_PAGEOUT_IDLE_WAIT
157 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
158 #endif /* VM_PAGEOUT_IDLE_WAIT */
159
160 #ifndef VM_PAGE_SPECULATIVE_TARGET
161 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / 20)
162 #endif /* VM_PAGE_SPECULATIVE_TARGET */
163
164 #ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
165 #define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
166 #endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
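/*
 * Worked example (illustrative, not part of the original source):
 * for an argument of 100000 pages, VM_PAGE_SPECULATIVE_TARGET gives
 * 100000 / 20 == 5000 pages (5%), and VM_PAGE_INACTIVE_HEALTHY_LIMIT
 * gives 100000 / 200 == 500 pages (0.5%), the threshold below which
 * the inactive+speculative queues are considered starved.
 */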
167
168
169 /*
170 * To obtain a reasonable LRU approximation, the inactive queue
171 * needs to be large enough to give pages on it a chance to be
172 * referenced a second time. This macro defines the fraction
173 * of active+inactive pages that should be inactive.
174 * The pageout daemon uses it to update vm_page_inactive_target.
175 *
176 * If vm_page_free_count falls below vm_page_free_target and
177 * vm_page_inactive_count is below vm_page_inactive_target,
178 * then the pageout daemon starts running.
179 */
180
181 #ifndef VM_PAGE_INACTIVE_TARGET
182 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
183 #endif /* VM_PAGE_INACTIVE_TARGET */
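/*
 * Worked example (illustrative, not part of the original source):
 * with the default ratio above, 120000 active+inactive pages yield
 *
 *	VM_PAGE_INACTIVE_TARGET(120000) == 120000 * 1 / 3 == 40000
 *
 * i.e. roughly a third of the reclaimable pages are kept inactive so
 * that each gets a chance at a second reference before being stolen.
 */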
184
185 /*
186 * Once the pageout daemon starts running, it keeps going
187 * until vm_page_free_count meets or exceeds vm_page_free_target.
188 */
189
190 #ifndef VM_PAGE_FREE_TARGET
191 #ifdef CONFIG_EMBEDDED
192 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
193 #else
194 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
195 #endif
196 #endif /* VM_PAGE_FREE_TARGET */
197
198 /*
199 * The pageout daemon always starts running once vm_page_free_count
200 * falls below vm_page_free_min.
201 */
202
203 #ifndef VM_PAGE_FREE_MIN
204 #ifdef CONFIG_EMBEDDED
205 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
206 #else
207 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
208 #endif
209 #endif /* VM_PAGE_FREE_MIN */
210
211 #define VM_PAGE_FREE_MIN_LIMIT 1500
212 #define VM_PAGE_FREE_TARGET_LIMIT 2000
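/*
 * Worked example (illustrative, not part of the original source):
 * on a non-embedded configuration, an argument of 100000 pages gives
 *
 *	VM_PAGE_FREE_TARGET(100000) == 15 + 100000 / 80  == 1265
 *	VM_PAGE_FREE_MIN(100000)    == 10 + 100000 / 100 == 1010
 *
 * The *_LIMIT constants above presumably act as upper bounds on the
 * computed values elsewhere in this file.
 */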
213
214
215 /*
216 * When vm_page_free_count falls below vm_page_free_reserved,
217 * only vm-privileged threads can allocate pages. vm-privilege
218 * allows the pageout daemon and default pager (and any other
219 * associated threads needed for default pageout) to continue
220 * operation by dipping into the reserved pool of pages.
221 */
222
223 #ifndef VM_PAGE_FREE_RESERVED
224 #define VM_PAGE_FREE_RESERVED(n) \
225 ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
226 #endif /* VM_PAGE_FREE_RESERVED */
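/*
 * Worked example (illustrative, not part of the original source):
 * with VM_PAGE_LAUNDRY_MAX == 16 as defined above,
 *
 *	VM_PAGE_FREE_RESERVED(n) == (6 * 16) + n == 96 + n
 *
 * i.e. the reserved pool is 96 pages plus whatever the caller adds.
 */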
227
228 /*
229 * When we dequeue pages from the inactive list, they are
230 * reactivated (i.e., put back on the active queue) if referenced.
231 * However, it is possible to starve the free list if other
232 * processors are referencing pages faster than we can turn off
233 * the referenced bit. So we limit the number of reactivations
234 * we will make per call of vm_pageout_scan().
235 */
236 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
237 #ifndef VM_PAGE_REACTIVATE_LIMIT
238 #ifdef CONFIG_EMBEDDED
239 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
240 #else
241 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
242 #endif
243 #endif /* VM_PAGE_REACTIVATE_LIMIT */
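/*
 * Worked example (illustrative, not part of the original source):
 * on a non-embedded configuration with avail == 500000 pages,
 *
 *	VM_PAGE_REACTIVATE_LIMIT(500000) == MAX(500000 / 20, 20000) == 25000
 *
 * Note that because MAX() is used here, VM_PAGE_REACTIVATE_LIMIT_MAX
 * acts as a floor on the per-call reactivation limit, not a ceiling.
 */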
244 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 100
245
246
247 /*
248 * must hold the page queues lock to
249 * manipulate this structure
250 */
251 struct vm_pageout_queue {
252 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
253 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
254 unsigned int pgo_maxlaundry;
255
256 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
257 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
258 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
259 :0;
260 };
261
262 #define VM_PAGE_Q_THROTTLED(q) \
263 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
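/*
 * Usage sketch (illustrative summary of how this file uses the queue):
 *
 *	if (VM_PAGE_Q_THROTTLED(q))
 *		... the pager cannot keep up: vm_pageout_scan() sets
 *		q->pgo_throttled and waits on &q->pgo_laundry ...
 *
 * vm_pageout_cluster() bumps pgo_laundry as it queues a page for the
 * iothread; vm_pageout_throttle_up() drops it when the page returns
 * from laundry and wakes the scan thread if pgo_throttled was set.
 */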
264
265
266 /*
267 * Exported variable used to broadcast the activation of the pageout scan.
268 * The Working Set code uses this to throttle its use of pmap removals. In
269 * this way, code which runs within memory in an uncontested context does
270 * not keep encountering soft faults.
271 */
272
273 unsigned int vm_pageout_scan_event_counter = 0;
274
275 /*
276 * Forward declarations for internal routines.
277 */
278
279 static void vm_pageout_garbage_collect(int);
280 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
281 static void vm_pageout_iothread_external(void);
282 static void vm_pageout_iothread_internal(void);
283 static void vm_pageout_queue_steal(vm_page_t);
284
285 extern void vm_pageout_continue(void);
286 extern void vm_pageout_scan(void);
287
288 static thread_t vm_pageout_external_iothread = THREAD_NULL;
289 static thread_t vm_pageout_internal_iothread = THREAD_NULL;
290
291 unsigned int vm_pageout_reserved_internal = 0;
292 unsigned int vm_pageout_reserved_really = 0;
293
294 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
295 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
296 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
297 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
298 unsigned int vm_pageout_deadlock_relief = 0;
299 unsigned int vm_pageout_inactive_relief = 0;
300 unsigned int vm_pageout_burst_active_throttle = 0;
301 unsigned int vm_pageout_burst_inactive_throttle = 0;
302
303 /*
304 * Protection against zero fill flushing live working sets derived
305 * from existing backing store and files
306 */
307 unsigned int vm_accellerate_zf_pageout_trigger = 400;
308 unsigned int zf_queue_min_count = 100;
309 unsigned int vm_zf_count = 0;
310 unsigned int vm_zf_queue_count = 0;
311
312 /*
313 * These variables record the pageout daemon's actions:
314 * how many pages it looks at and what happens to those pages.
315 * No locking needed because only one thread modifies the variables.
316 */
317
318 unsigned int vm_pageout_active = 0; /* debugging */
319 unsigned int vm_pageout_inactive = 0; /* debugging */
320 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
321 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
322 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
323 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
324 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
325 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
326 unsigned int vm_pageout_inactive_used = 0; /* debugging */
327 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
328 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
329 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
330 unsigned int vm_pageout_purged_objects = 0; /* debugging */
331 unsigned int vm_stat_discard = 0; /* debugging */
332 unsigned int vm_stat_discard_sent = 0; /* debugging */
333 unsigned int vm_stat_discard_failure = 0; /* debugging */
334 unsigned int vm_stat_discard_throttle = 0; /* debugging */
335 unsigned int vm_pageout_reactivation_limit_exceeded = 0; /* debugging */
336 unsigned int vm_pageout_catch_ups = 0; /* debugging */
337 unsigned int vm_pageout_inactive_force_reclaim = 0; /* debugging */
338
339 unsigned int vm_pageout_scan_active_throttled = 0;
340 unsigned int vm_pageout_scan_inactive_throttled = 0;
341 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
342 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
343 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
344 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
345 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
346 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
347 /*
348 * Backing store throttle when BS is exhausted
349 */
350 unsigned int vm_backing_store_low = 0;
351
352 unsigned int vm_pageout_out_of_line = 0;
353 unsigned int vm_pageout_in_place = 0;
354
355 /*
356 * ENCRYPTED SWAP:
357 * counters and statistics...
358 */
359 unsigned long vm_page_decrypt_counter = 0;
360 unsigned long vm_page_decrypt_for_upl_counter = 0;
361 unsigned long vm_page_encrypt_counter = 0;
362 unsigned long vm_page_encrypt_abort_counter = 0;
363 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
364 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
365
366 struct vm_pageout_queue vm_pageout_queue_internal;
367 struct vm_pageout_queue vm_pageout_queue_external;
368
369 unsigned int vm_page_speculative_target = 0;
370
371 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
372
373
374 /*
375 * Routine: vm_backing_store_disable
376 * Purpose:
377 * Suspend non-privileged threads wishing to extend
378 * backing store when we are low on backing store
379 * (Synchronized by caller)
380 */
381 void
382 vm_backing_store_disable(
383 boolean_t disable)
384 {
385 if(disable) {
386 vm_backing_store_low = 1;
387 } else {
388 if(vm_backing_store_low) {
389 vm_backing_store_low = 0;
390 thread_wakeup((event_t) &vm_backing_store_low);
391 }
392 }
393 }
394
395
396 #if MACH_CLUSTER_STATS
397 unsigned long vm_pageout_cluster_dirtied = 0;
398 unsigned long vm_pageout_cluster_cleaned = 0;
399 unsigned long vm_pageout_cluster_collisions = 0;
400 unsigned long vm_pageout_cluster_clusters = 0;
401 unsigned long vm_pageout_cluster_conversions = 0;
402 unsigned long vm_pageout_target_collisions = 0;
403 unsigned long vm_pageout_target_page_dirtied = 0;
404 unsigned long vm_pageout_target_page_freed = 0;
405 #define CLUSTER_STAT(clause) clause
406 #else /* MACH_CLUSTER_STATS */
407 #define CLUSTER_STAT(clause)
408 #endif /* MACH_CLUSTER_STATS */
409
410 /*
411 * Routine: vm_pageout_object_terminate
412 * Purpose:
413 * Destroy the pageout_object, and perform all of the
414 * required cleanup actions.
415 *
416 * In/Out conditions:
417 * The object must be locked, and will be returned locked.
418 */
419 void
420 vm_pageout_object_terminate(
421 vm_object_t object)
422 {
423 vm_object_t shadow_object;
424
425 /*
426 * Deal with the deallocation (last reference) of a pageout object
427 * (used for cleaning-in-place) by dropping the paging references/
428 * freeing pages in the original object.
429 */
430
431 assert(object->pageout);
432 shadow_object = object->shadow;
433 vm_object_lock(shadow_object);
434
435 while (!queue_empty(&object->memq)) {
436 vm_page_t p, m;
437 vm_object_offset_t offset;
438
439 p = (vm_page_t) queue_first(&object->memq);
440
441 assert(p->private);
442 assert(p->pageout);
443 p->pageout = FALSE;
444 assert(!p->cleaning);
445
446 offset = p->offset;
447 VM_PAGE_FREE(p);
448 p = VM_PAGE_NULL;
449
450 m = vm_page_lookup(shadow_object,
451 offset + object->shadow_offset);
452
453 if(m == VM_PAGE_NULL)
454 continue;
455 assert(m->cleaning);
456 /* used as a trigger on upl_commit etc to recognize the */
457 /* pageout daemon's subsequent desire to page out a cleaning */
458 /* page. When the bit is on the upl commit code will */
459 /* respect the pageout bit in the target page over the */
460 /* caller's page list indication */
461 m->dump_cleaning = FALSE;
462
463 assert((m->dirty) || (m->precious) ||
464 (m->busy && m->cleaning));
465
466 /*
467 * Handle the trusted pager throttle.
468 * Also decrement the burst throttle (if external).
469 */
470 vm_page_lock_queues();
471 if (m->laundry) {
472 vm_pageout_throttle_up(m);
473 }
474
475 /*
476 * Handle the "target" page(s). These pages are to be freed if
477 * successfully cleaned. Target pages are always busy, and are
478 * wired exactly once. The initial target pages are not mapped,
479 * (so cannot be referenced or modified) but converted target
480 * pages may have been modified between the selection as an
481 * adjacent page and conversion to a target.
482 */
483 if (m->pageout) {
484 assert(m->busy);
485 assert(m->wire_count == 1);
486 m->cleaning = FALSE;
487 m->encrypted_cleaning = FALSE;
488 m->pageout = FALSE;
489 #if MACH_CLUSTER_STATS
490 if (m->wanted) vm_pageout_target_collisions++;
491 #endif
492 /*
493 * Revoke all access to the page. Since the object is
494 * locked, and the page is busy, this prevents the page
495 * from being dirtied after the pmap_disconnect() call
496 * returns.
497 *
498 * Since the page is left "dirty" but "not modified", we
499 * can detect whether the page was redirtied during
500 * pageout by checking the modify state.
501 */
502 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
503 m->dirty = TRUE;
504 else
505 m->dirty = FALSE;
506
507 if (m->dirty) {
508 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
509 vm_page_unwire(m);/* reactivates */
510 VM_STAT_INCR(reactivations);
511 PAGE_WAKEUP_DONE(m);
512 } else {
513 CLUSTER_STAT(vm_pageout_target_page_freed++;)
514 vm_page_free(m);/* clears busy, etc. */
515 }
516 vm_page_unlock_queues();
517 continue;
518 }
519 /*
520 * Handle the "adjacent" pages. These pages were cleaned in
521 * place, and should be left alone.
522 * If the page was referenced while being cleaned, make it
523 * active again; otherwise deactivate it.
524 */
525 if (!m->active && !m->inactive && !m->throttled && !m->private) {
526 if (m->reference)
527 vm_page_activate(m);
528 else
529 vm_page_deactivate(m);
530 }
531 if((m->busy) && (m->cleaning)) {
532
533 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
534 m->busy = FALSE;
535
536 /* We do not re-set m->dirty ! */
537 /* The page was busy so no extraneous activity */
538 /* could have occurred. COPY_INTO is a read into the */
539 /* new pages. CLEAN_IN_PLACE does actually write */
540 /* out the pages but handling outside of this code */
541 /* will take care of resetting dirty. We clear the */
542 /* modify however for the Programmed I/O case. */
543 pmap_clear_modify(m->phys_page);
544
545 m->absent = FALSE;
546 m->overwriting = FALSE;
547 } else if (m->overwriting) {
548 /* alternate request page list, write to page_list */
549 /* case. Occurs when the original page was wired */
550 /* at the time of the list request */
551 assert(m->wire_count != 0);
552 vm_page_unwire(m);/* reactivates */
553 m->overwriting = FALSE;
554 } else {
555 /*
556 * Set the dirty state according to whether or not the page was
557 * modified during the pageout. Note that we purposefully do
558 * NOT call pmap_clear_modify since the page is still mapped.
559 * If the page were to be dirtied between the 2 calls, this
560 * fact would be lost. This code is only necessary to
561 * maintain statistics, since the pmap module is always
562 * consulted if m->dirty is false.
563 */
564 #if MACH_CLUSTER_STATS
565 m->dirty = pmap_is_modified(m->phys_page);
566
567 if (m->dirty) vm_pageout_cluster_dirtied++;
568 else vm_pageout_cluster_cleaned++;
569 if (m->wanted) vm_pageout_cluster_collisions++;
570 #else
571 m->dirty = 0;
572 #endif
573 }
574 m->cleaning = FALSE;
575 m->encrypted_cleaning = FALSE;
576
577 /*
578 * Wakeup any thread waiting for the page to be un-cleaning.
579 */
580 PAGE_WAKEUP(m);
581 vm_page_unlock_queues();
582 }
583 /*
584 * Account for the paging reference taken in vm_paging_object_allocate.
585 */
586 vm_object_paging_end(shadow_object);
587 vm_object_unlock(shadow_object);
588
589 assert(object->ref_count == 0);
590 assert(object->paging_in_progress == 0);
591 assert(object->resident_page_count == 0);
592 return;
593 }
594
595 /*
596 * Routine: vm_pageclean_setup
597 *
598 * Purpose: setup a page to be cleaned (made non-dirty), but not
599 * necessarily flushed from the VM page cache.
600 * This is accomplished by cleaning in place.
601 *
602 * The page must not be busy, and the object and page
603 * queues must be locked.
604 *
605 */
606 void
607 vm_pageclean_setup(
608 vm_page_t m,
609 vm_page_t new_m,
610 vm_object_t new_object,
611 vm_object_offset_t new_offset)
612 {
613 assert(!m->busy);
614 #if 0
615 assert(!m->cleaning);
616 #endif
617
618 XPR(XPR_VM_PAGEOUT,
619 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
620 (integer_t)m->object, m->offset, (integer_t)m,
621 (integer_t)new_m, new_offset);
622
623 pmap_clear_modify(m->phys_page);
624
625 /*
626 * Mark original page as cleaning in place.
627 */
628 m->cleaning = TRUE;
629 m->dirty = TRUE;
630 m->precious = FALSE;
631
632 /*
633 * Convert the fictitious page to a private shadow of
634 * the real page.
635 */
636 assert(new_m->fictitious);
637 assert(new_m->phys_page == vm_page_fictitious_addr);
638 new_m->fictitious = FALSE;
639 new_m->private = TRUE;
640 new_m->pageout = TRUE;
641 new_m->phys_page = m->phys_page;
642 vm_page_wire(new_m);
643
644 vm_page_insert(new_m, new_object, new_offset);
645 assert(!new_m->wanted);
646 new_m->busy = FALSE;
647 }
648
649 /*
650 * Routine: vm_pageout_initialize_page
651 * Purpose:
652 * Causes the specified page to be initialized in
653 * the appropriate memory object. This routine is used to push
654 * pages into a copy-object when they are modified in the
655 * permanent object.
656 *
657 * The page is moved to a temporary object and paged out.
658 *
659 * In/out conditions:
660 * The page in question must not be on any pageout queues.
661 * The object to which it belongs must be locked.
662 * The page must be busy, but not hold a paging reference.
663 *
664 * Implementation:
665 * Move this page to a completely new object.
666 */
667 void
668 vm_pageout_initialize_page(
669 vm_page_t m)
670 {
671 vm_object_t object;
672 vm_object_offset_t paging_offset;
673 vm_page_t holding_page;
674 memory_object_t pager;
675
676 XPR(XPR_VM_PAGEOUT,
677 "vm_pageout_initialize_page, page 0x%X\n",
678 (integer_t)m, 0, 0, 0, 0);
679 assert(m->busy);
680
681 /*
682 * Verify that we really want to clean this page
683 */
684 assert(!m->absent);
685 assert(!m->error);
686 assert(m->dirty);
687
688 /*
689 * Create a paging reference to let us play with the object.
690 */
691 object = m->object;
692 paging_offset = m->offset + object->paging_offset;
693
694 if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
695 VM_PAGE_FREE(m);
696 panic("reservation without pageout?"); /* alan */
697 vm_object_unlock(object);
698
699 return;
700 }
701
702 /*
703 * If there's no pager, then we can't clean the page. This should
704 * never happen since this should be a copy object and therefore not
705 * an external object, so the pager should always be there.
706 */
707
708 pager = object->pager;
709
710 if (pager == MEMORY_OBJECT_NULL) {
711 VM_PAGE_FREE(m);
712 panic("missing pager for copy object");
713 return;
714 }
715
716 /* set the page for future call to vm_fault_list_request */
717 vm_object_paging_begin(object);
718 holding_page = NULL;
719 vm_page_lock_queues();
720 pmap_clear_modify(m->phys_page);
721 m->dirty = TRUE;
722 m->busy = TRUE;
723 m->list_req_pending = TRUE;
724 m->cleaning = TRUE;
725 m->pageout = TRUE;
726 vm_page_wire(m);
727 vm_page_unlock_queues();
728 vm_object_unlock(object);
729
730 /*
731 * Write the data to its pager.
732 * Note that the data is passed by naming the new object,
733 * not a virtual address; the pager interface has been
734 * manipulated to use the "internal memory" data type.
735 * [The object reference from its allocation is donated
736 * to the eventual recipient.]
737 */
738 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
739
740 vm_object_lock(object);
741 vm_object_paging_end(object);
742 }
743
744 #if MACH_CLUSTER_STATS
745 #define MAXCLUSTERPAGES 16
746 struct {
747 unsigned long pages_in_cluster;
748 unsigned long pages_at_higher_offsets;
749 unsigned long pages_at_lower_offsets;
750 } cluster_stats[MAXCLUSTERPAGES];
751 #endif /* MACH_CLUSTER_STATS */
752
753
754 /*
755 * vm_pageout_cluster:
756 *
757 * Given a page, queue it to the appropriate I/O thread,
758 * which will page it out and attempt to clean adjacent pages
759 * in the same operation.
760 *
761 * The page must be busy, and the object and queues locked. We will take a
762 * paging reference to prevent deallocation or collapse when we
763 * release the object lock back at the call site. The I/O thread
764 * is responsible for consuming this reference.
765 *
766 * The page must not be on any pageout queue.
767 */
768
769 void
770 vm_pageout_cluster(vm_page_t m)
771 {
772 vm_object_t object = m->object;
773 struct vm_pageout_queue *q;
774
775
776 XPR(XPR_VM_PAGEOUT,
777 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
778 (integer_t)object, m->offset, (integer_t)m, 0, 0);
779
780 /*
781 * Only a certain kind of page is appreciated here.
782 */
783 assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
784 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
785 assert(!m->throttled);
786
787 /*
788 * protect the object from collapse -
789 * locking in the object's paging_offset.
790 */
791 vm_object_paging_begin(object);
792
793 /*
794 * set the page for future call to vm_fault_list_request
795 * page should already be marked busy
796 */
797 vm_page_wire(m);
798 m->list_req_pending = TRUE;
799 m->cleaning = TRUE;
800 m->pageout = TRUE;
801 m->laundry = TRUE;
802
803 if (object->internal == TRUE)
804 q = &vm_pageout_queue_internal;
805 else
806 q = &vm_pageout_queue_external;
807 q->pgo_laundry++;
808
809 m->pageout_queue = TRUE;
810 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
811
812 if (q->pgo_idle == TRUE) {
813 q->pgo_idle = FALSE;
814 thread_wakeup((event_t) &q->pgo_pending);
815 }
816 }
817
818
819 unsigned long vm_pageout_throttle_up_count = 0;
820
821 /*
822 * A page is back from laundry. See if there are some pages waiting to
823 * go to laundry and if we can let some of them go now.
824 *
825 * Object and page queues must be locked.
826 */
827 void
828 vm_pageout_throttle_up(
829 vm_page_t m)
830 {
831 struct vm_pageout_queue *q;
832
833 vm_pageout_throttle_up_count++;
834
835 assert(m->laundry);
836 assert(m->object != VM_OBJECT_NULL);
837 assert(m->object != kernel_object);
838
839 if (m->object->internal == TRUE)
840 q = &vm_pageout_queue_internal;
841 else
842 q = &vm_pageout_queue_external;
843
844 m->laundry = FALSE;
845 q->pgo_laundry--;
846
847 if (q->pgo_throttled == TRUE) {
848 q->pgo_throttled = FALSE;
849 thread_wakeup((event_t) &q->pgo_laundry);
850 }
851 }
852
853
854 /*
855 * vm_pageout_scan does the dirty work for the pageout daemon.
856 * It returns with vm_page_queue_free_lock held and
857 * vm_page_free_wanted == 0.
858 */
859
860 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
861
862 #define FCS_IDLE 0
863 #define FCS_DELAYED 1
864 #define FCS_DEADLOCK_DETECTED 2
865
866 struct flow_control {
867 int state;
868 mach_timespec_t ts;
869 };
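/*
 * Summary of the flow-control state machine driven below (illustrative):
 *
 *	FCS_IDLE              -> default pager queue throttled: arm a
 *	                         vm_pageout_deadlock_wait timeout and
 *	                         move to FCS_DELAYED
 *	FCS_DELAYED           -> timeout expired with the queue still
 *	                         throttled: assume a potential deadlock,
 *	                         set vm_pageout_deadlock_target and move
 *	                         to FCS_DEADLOCK_DETECTED
 *	FCS_DEADLOCK_DETECTED -> steal pages until the target is met,
 *	                         then re-arm the timer
 */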
870
871 void
872 vm_pageout_scan(void)
873 {
874 unsigned int loop_count = 0;
875 unsigned int inactive_burst_count = 0;
876 unsigned int active_burst_count = 0;
877 unsigned int reactivated_this_call;
878 unsigned int reactivate_limit;
879 vm_page_t local_freeq = NULL;
880 int local_freed = 0;
881 int delayed_unlock;
882 int need_internal_inactive = 0;
883 int refmod_state = 0;
884 int vm_pageout_deadlock_target = 0;
885 struct vm_pageout_queue *iq;
886 struct vm_pageout_queue *eq;
887 struct vm_speculative_age_q *sq;
888 struct flow_control flow_control;
889 boolean_t inactive_throttled = FALSE;
890 boolean_t try_failed;
891 mach_timespec_t ts;
892 unsigned int msecs = 0;
893 vm_object_t object;
894 vm_object_t last_object_tried;
895 int zf_ratio;
896 int zf_run_count;
897 uint32_t catch_up_count = 0;
898 uint32_t inactive_reclaim_run;
899 boolean_t forced_reclaim;
900
901 flow_control.state = FCS_IDLE;
902 iq = &vm_pageout_queue_internal;
903 eq = &vm_pageout_queue_external;
904 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
905
906
907 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
908
909
910 vm_page_lock_queues();
911 delayed_unlock = 1; /* must be nonzero if Qs are locked, 0 if unlocked */
912
913 /*
914 * Calculate the max number of referenced pages on the inactive
915 * queue that we will reactivate.
916 */
917 reactivated_this_call = 0;
918 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
919 vm_page_inactive_count);
920 inactive_reclaim_run = 0;
921
922
923 /*???*/ /*
924 * We want to gradually dribble pages from the active queue
925 * to the inactive queue. If we let the inactive queue get
926 * very small, and then suddenly dump many pages into it,
927 * those pages won't get a sufficient chance to be referenced
928 * before we start taking them from the inactive queue.
929 *
930 * We must limit the rate at which we send pages to the pagers.
931 * data_write messages consume memory, for message buffers and
932 * for map-copy objects. If we get too far ahead of the pagers,
933 * we can potentially run out of memory.
934 *
935 * We can use the laundry count to limit directly the number
936 * of pages outstanding to the default pager. A similar
937 * strategy for external pagers doesn't work, because
938 * external pagers don't have to deallocate the pages sent them,
939 * and because we might have to send pages to external pagers
940 * even if they aren't processing writes. So we also
941 * use a burst count to limit writes to external pagers.
942 *
943 * When memory is very tight, we can't rely on external pagers to
944 * clean pages. They probably aren't running, because they
945 * aren't vm-privileged. If we kept sending dirty pages to them,
946 * we could exhaust the free list.
947 */
948
949
950 Restart:
951 assert(delayed_unlock!=0);
952
953 /*
954 * A page is "zero-filled" if it was not paged in from somewhere,
955 * and it belongs to an object at least VM_ZF_OBJECT_SIZE_THRESHOLD big.
956 * Recalculate the zero-filled page ratio. We use this to apportion
957 * victimized pages between the normal and zero-filled inactive
958 * queues according to their relative abundance in memory. Thus if a task
959 * is flooding memory with zf pages, we begin to hunt them down.
960 * It would be better to throttle greedy tasks at a higher level,
961 * but at the moment mach vm cannot do this.
962 */
963 {
964 uint32_t total = vm_page_active_count + vm_page_inactive_count;
965 uint32_t normal = total - vm_zf_count;
966
967 /* zf_ratio is the number of zf pages we victimize per normal page */
968
969 if (vm_zf_count < vm_accellerate_zf_pageout_trigger)
970 zf_ratio = 0;
971 else if ((vm_zf_count <= normal) || (normal == 0))
972 zf_ratio = 1;
973 else
974 zf_ratio = vm_zf_count / normal;
975
976 zf_run_count = 0;
977 }
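	/*
	 * Worked example (illustrative, not part of the original source):
	 * with 100000 active+inactive pages of which vm_zf_count == 75000,
	 * normal == 25000 and zf_ratio == 75000 / 25000 == 3, so the scan
	 * below victimizes up to three zero-filled pages for every normal
	 * inactive page it takes.
	 */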
978
979 /*
980 * Recalculate vm_page_inactive_target.
981 */
982 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
983 vm_page_inactive_count +
984 vm_page_speculative_count);
985 /*
986 * don't want to wake the pageout_scan thread up every time we fall below
987 * the targets... set a low water mark at 0.25% below the target
988 */
989 vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
990
991 vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
992 vm_page_inactive_count);
993 object = NULL;
994 last_object_tried = NULL;
995 try_failed = FALSE;
996
997 if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
998 catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
999 else
1000 catch_up_count = 0;
1001
1002 for (;;) {
1003 vm_page_t m;
1004
1005 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1006
1007 if (delayed_unlock == 0) {
1008 vm_page_lock_queues();
1009 delayed_unlock = 1;
1010 }
1011
1012 /*
1013 * Don't sweep through active queue more than the throttle
1014 * which should be kept relatively low
1015 */
1016 active_burst_count = vm_pageout_burst_active_throttle;
1017
1018 /*
1019 * Move pages from active to inactive.
1020 */
1021 if (need_internal_inactive == 0 && (vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
1022 goto done_moving_active_pages;
1023
1024 while (!queue_empty(&vm_page_queue_active) &&
1025 (need_internal_inactive || active_burst_count)) {
1026
1027 if (active_burst_count)
1028 active_burst_count--;
1029
1030 vm_pageout_active++;
1031
1032 m = (vm_page_t) queue_first(&vm_page_queue_active);
1033
1034 assert(m->active && !m->inactive);
1035 assert(!m->laundry);
1036 assert(m->object != kernel_object);
1037 assert(m->phys_page != vm_page_guard_addr);
1038
1039 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1040
1041 /*
1042 * Try to lock object; since we've already got the
1043 * page queues lock, we can only 'try' for this one.
1044 * if the 'try' fails, we need to do a mutex_pause
1045 * to allow the owner of the object lock a chance to
1046 * run... otherwise, we're likely to trip over this
1047 * object in the same state as we work our way through
1048 * the queue... clumps of pages associated with the same
1049 * object are fairly typical on the inactive and active queues
1050 */
1051 if (m->object != object) {
1052 if (object != NULL) {
1053 vm_object_unlock(object);
1054 object = NULL;
1055 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1056 }
1057 if (!vm_object_lock_try_scan(m->object)) {
1058 /*
1059 * move page to end of active queue and continue
1060 */
1061 queue_remove(&vm_page_queue_active, m,
1062 vm_page_t, pageq);
1063 queue_enter(&vm_page_queue_active, m,
1064 vm_page_t, pageq);
1065
1066 try_failed = TRUE;
1067
1068 m = (vm_page_t) queue_first(&vm_page_queue_active);
1069 /*
1070 * this is the next object we're going to be interested in
1071 * try to make sure it's available after the mutex_yield
1072 * returns control
1073 */
1074 vm_pageout_scan_wants_object = m->object;
1075
1076 goto done_with_activepage;
1077 }
1078 object = m->object;
1079
1080 try_failed = FALSE;
1081 }
1082
1083 /*
1084 * if the page is BUSY, then we pull it
1085 * off the active queue and leave it alone.
1086 * when BUSY is cleared, it will get stuck
1087 * back on the appropriate queue
1088 */
1089 if (m->busy) {
1090 queue_remove(&vm_page_queue_active, m,
1091 vm_page_t, pageq);
1092 m->pageq.next = NULL;
1093 m->pageq.prev = NULL;
1094
1095 if (!m->fictitious)
1096 vm_page_active_count--;
1097 m->active = FALSE;
1098
1099 goto done_with_activepage;
1100 }
1101
1102 /*
1103 * Deactivate the page while holding the object
1104 * locked, so we know the page is still not busy.
1105 * This should prevent races between pmap_enter
1106 * and pmap_clear_reference. The page might be
1107 * absent or fictitious, but vm_page_deactivate
1108 * can handle that.
1109 */
1110 vm_page_deactivate(m);
1111
1112 if (need_internal_inactive) {
1113 vm_pageout_scan_active_throttle_success++;
1114 need_internal_inactive--;
1115 }
1116 done_with_activepage:
1117 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1118
1119 if (object != NULL) {
1120 vm_object_unlock(object);
1121 object = NULL;
1122 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1123 }
1124 if (local_freeq) {
1125 vm_page_free_list(local_freeq);
1126
1127 local_freeq = NULL;
1128 local_freed = 0;
1129 }
1130 mutex_yield(&vm_page_queue_lock);
1131
1132 delayed_unlock = 1;
1133
1134 /*
1135 * continue the while loop processing
1136 * the active queue... need to hold
1137 * the page queues lock
1138 */
1139 }
1140 }
1141
1142
1143
1144 /**********************************************************************
1145 * above this point we're playing with the active queue
1146 * below this point we're playing with the throttling mechanisms
1147 * and the inactive queue
1148 **********************************************************************/
1149
1150 done_moving_active_pages:
1151
1152 /*
1153 * We are done if we have met our target *and*
1154 * nobody is still waiting for a page.
1155 */
1156 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1157 if (object != NULL) {
1158 vm_object_unlock(object);
1159 object = NULL;
1160 }
1161 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1162
1163 if (local_freeq) {
1164 vm_page_free_list(local_freeq);
1165
1166 local_freeq = NULL;
1167 local_freed = 0;
1168 }
1169 /*
1170 * inactive target still not met... keep going
1171 * until we get the queues balanced
1172 */
1173 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
1174 !queue_empty(&vm_page_queue_active))
1175 continue;
1176
1177 mutex_lock(&vm_page_queue_free_lock);
1178
1179 if ((vm_page_free_count >= vm_page_free_target) &&
1180 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1181
1182 vm_page_unlock_queues();
1183
1184 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1185
1186 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1187
1188 return;
1189 }
1190 mutex_unlock(&vm_page_queue_free_lock);
1191 }
1192 /*
1193 * Before anything, we check if we have any ripe volatile objects around.
1194 * If so, purge the first and see what it gives us.
1195 */
1196 assert (available_for_purge>=0);
1197 if (available_for_purge)
1198 {
1199 if (object != NULL) {
1200 vm_object_unlock(object);
1201 object = NULL;
1202 }
1203 vm_purgeable_object_purge_one();
1204 continue;
1205 }
1206
1207 if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1208 /*
1209 * try to pull pages from the aging bins
1210 * see vm_page.h for an explanation of how
1211 * this mechanism works
1212 */
1213 struct vm_speculative_age_q *aq;
1214 mach_timespec_t ts_fully_aged;
1215 boolean_t can_steal = FALSE;
1216
1217 aq = &vm_page_queue_speculative[speculative_steal_index];
1218
1219 while (queue_empty(&aq->age_q)) {
1220
1221 speculative_steal_index++;
1222
1223 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1224 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1225
1226 aq = &vm_page_queue_speculative[speculative_steal_index];
1227 }
1228 if (vm_page_speculative_count > vm_page_speculative_target)
1229 can_steal = TRUE;
1230 else {
1231 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) / 1000;
1232 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) % 1000)
1233 * 1000 * NSEC_PER_USEC;
1234
1235 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1236
1237 clock_get_system_nanotime(&ts.tv_sec, (unsigned *)&ts.tv_nsec);
1238
1239 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1240 can_steal = TRUE;
1241 }
1242 if (can_steal == TRUE)
1243 vm_page_speculate_ageit(aq);
1244 }
1245
1246 /*
1247 * Sometimes we have to pause:
1248 * 1) No inactive pages - nothing to do.
1249 * 2) Flow control - default pageout queue is full
1250 * 3) Loop control - no acceptable pages found on the inactive queue
1251 * within the last vm_pageout_burst_inactive_throttle iterations
1252 */
1253 if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q) &&
1254 (VM_PAGE_Q_THROTTLED(iq) || queue_empty(&vm_page_queue_throttled))) {
1255 vm_pageout_scan_empty_throttle++;
1256 msecs = vm_pageout_empty_wait;
1257 goto vm_pageout_scan_delay;
1258
1259 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1260 vm_pageout_scan_burst_throttle++;
1261 msecs = vm_pageout_burst_wait;
1262 goto vm_pageout_scan_delay;
1263
1264 } else if (VM_PAGE_Q_THROTTLED(iq) && IP_VALID(memory_manager_default)) {
1265
1266 switch (flow_control.state) {
1267
1268 case FCS_IDLE:
1269 reset_deadlock_timer:
1270 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1271 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1272 clock_get_system_nanotime(&flow_control.ts.tv_sec,
1273 (unsigned *)&flow_control.ts.tv_nsec);
1274 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1275
1276 flow_control.state = FCS_DELAYED;
1277 msecs = vm_pageout_deadlock_wait;
1278
1279 break;
1280
1281 case FCS_DELAYED:
1282 clock_get_system_nanotime(&ts.tv_sec,
1283 (unsigned *)&ts.tv_nsec);
1284
1285 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1286 /*
1287 * the pageout thread for the default pager is potentially
1288 * deadlocked since the
1289 * default pager queue has been throttled for more than the
1290 * allowable time... we need to move some clean pages or dirty
1291 * pages belonging to the external pagers if they aren't throttled
1292 * vm_page_free_wanted represents the number of threads currently
1293 * blocked waiting for pages... we'll move one page for each of
1294 * these plus a fixed amount to break the logjam... once we're done
1295 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1296 * with a new timeout target since we have no way of knowing
1297 * whether we've broken the deadlock except through observation
1298 * of the queue associated with the default pager... we need to
1299 * stop moving pages and allow the system to run to see what
1300 * state it settles into.
1301 */
1302 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
1303 vm_pageout_scan_deadlock_detected++;
1304 flow_control.state = FCS_DEADLOCK_DETECTED;
1305
1306 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1307 goto consider_inactive;
1308 }
1309 /*
1310 * just resniff instead of trying
1311 * to compute a new delay time... we're going to be
1312 * awakened immediately upon a laundry completion,
1313 * so we won't wait any longer than necessary
1314 */
1315 msecs = vm_pageout_idle_wait;
1316 break;
1317
1318 case FCS_DEADLOCK_DETECTED:
1319 if (vm_pageout_deadlock_target)
1320 goto consider_inactive;
1321 goto reset_deadlock_timer;
1322
1323 }
1324 vm_pageout_scan_throttle++;
1325 iq->pgo_throttled = TRUE;
1326 vm_pageout_scan_delay:
1327 if (object != NULL) {
1328 vm_object_unlock(object);
1329 object = NULL;
1330 }
1331 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1332
1333 if (local_freeq) {
1334 vm_page_free_list(local_freeq);
1335
1336 local_freeq = NULL;
1337 local_freed = 0;
1338 }
1339 #if CONFIG_EMBEDDED
1340 {
1341 int percent_avail;
1342
1343 /*
1344 * Decide if we need to send a memory status notification.
1345 */
1346 percent_avail =
1347 (vm_page_active_count + vm_page_inactive_count +
1348 vm_page_speculative_count + vm_page_free_count +
1349 vm_page_purgeable_count ) * 100 /
1350 atop_64(max_mem);
1351 if (percent_avail >= (kern_memorystatus_level + 5) ||
1352 percent_avail <= (kern_memorystatus_level - 5)) {
1353 kern_memorystatus_level = percent_avail;
1354 thread_wakeup((event_t)&kern_memorystatus_wakeup);
1355 }
1356 }
1357 #endif
1358 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1359
1360 counter(c_vm_pageout_scan_block++);
1361
1362 vm_page_unlock_queues();
1363
1364 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1365
1366 thread_block(THREAD_CONTINUE_NULL);
1367
1368 vm_page_lock_queues();
1369 delayed_unlock = 1;
1370
1371 iq->pgo_throttled = FALSE;
1372
1373 if (loop_count >= vm_page_inactive_count)
1374 loop_count = 0;
1375 inactive_burst_count = 0;
1376
1377 goto Restart;
1378 /*NOTREACHED*/
1379 }
1380
1381
1382 flow_control.state = FCS_IDLE;
1383 consider_inactive:
1384 loop_count++;
1385 inactive_burst_count++;
1386 vm_pageout_inactive++;
1387
1388 /* Choose a victim. */
1389
1390 while (1) {
1391 m = NULL;
1392
1393 /*
1394 * the most eligible pages are ones that were throttled because the
1395 * pager wasn't ready at the time. If a pager is ready now,
1396 * see if one of these is useful.
1397 */
1398 if (!VM_PAGE_Q_THROTTLED(iq) && !queue_empty(&vm_page_queue_throttled)) {
1399 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
1400 break;
1401 }
1402
1403 /*
1404 * The second most eligible pages are ones we paged in speculatively,
1405 * but which have not yet been touched.
1406 */
1407 if ( !queue_empty(&sq->age_q) ) {
1408 m = (vm_page_t) queue_first(&sq->age_q);
1409 break;
1410 }
1411 /*
1412 * Time for a zero-filled inactive page?
1413 */
1414 if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1415 queue_empty(&vm_page_queue_inactive)) {
1416 if ( !queue_empty(&vm_page_queue_zf) ) {
1417 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1418 zf_run_count++;
1419 break;
1420 }
1421 }
1422 /*
1423 * It's either a normal inactive page or nothing.
1424 */
1425 if ( !queue_empty(&vm_page_queue_inactive) ) {
1426 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1427 zf_run_count = 0;
1428 break;
1429 }
1430
1431 panic("vm_pageout: no victim");
1432 }
1433
1434 assert(!m->active && (m->inactive || m->speculative || m->throttled));
1435 assert(!m->laundry);
1436 assert(m->object != kernel_object);
1437 assert(m->phys_page != vm_page_guard_addr);
1438
1439 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1440
1441 /*
1442 * check to see if we currently are working
1443 * with the same object... if so, we've
1444 * already got the lock
1445 */
1446 if (m->object != object) {
1447 /*
1448 * the object associated with candidate page is
1449 * different from the one we were just working
1450 * with... dump the lock if we still own it
1451 */
1452 if (object != NULL) {
1453 vm_object_unlock(object);
1454 object = NULL;
1455 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1456 }
1457 /*
1458 * Try to lock object; since we've already got the
1459 * page queues lock, we can only 'try' for this one.
1460 * if the 'try' fails, we need to do a mutex_pause
1461 * to allow the owner of the object lock a chance to
1462 * run... otherwise, we're likely to trip over this
1463 * object in the same state as we work our way through
1464 * the queue... clumps of pages associated with the same
1465 * object are fairly typical on the inactive and active queues
1466 */
1467 if (!vm_object_lock_try_scan(m->object)) {
1468 /*
1469 * Move page to end and continue.
1470 * Don't re-issue ticket
1471 */
1472 if (m->zero_fill) {
1473 queue_remove(&vm_page_queue_zf, m,
1474 vm_page_t, pageq);
1475 queue_enter(&vm_page_queue_zf, m,
1476 vm_page_t, pageq);
1477 } else if (m->speculative) {
1478 remque(&m->pageq);
1479 m->speculative = FALSE;
1480 vm_page_speculative_count--;
1481
1482 /*
1483 * move to the tail of the inactive queue
1484 * to get it out of the way... the speculative
1485 * queue is generally too small to depend
1486 * on there being enough pages from other
1487 * objects to make cycling it back on the
1488 * same queue a winning proposition
1489 */
1490 queue_enter(&vm_page_queue_inactive, m,
1491 vm_page_t, pageq);
1492 m->inactive = TRUE;
1493 vm_page_inactive_count++;
1494 token_new_pagecount++;
1495 } else if (m->throttled) {
1496 queue_remove(&vm_page_queue_throttled, m,
1497 vm_page_t, pageq);
1498 m->throttled = FALSE;
1499 vm_page_throttled_count--;
1500
1501 /*
1502 * not throttled any more, so can stick
1503 * it on the inactive queue.
1504 */
1505 queue_enter(&vm_page_queue_inactive, m,
1506 vm_page_t, pageq);
1507 m->inactive = TRUE;
1508 vm_page_inactive_count++;
1509 token_new_pagecount++;
1510 } else {
1511 queue_remove(&vm_page_queue_inactive, m,
1512 vm_page_t, pageq);
1513 #if MACH_ASSERT
1514 vm_page_inactive_count--; /* balance for purgeable queue asserts */
1515 #endif
1516 vm_purgeable_q_advance_all(1);
1517
1518 queue_enter(&vm_page_queue_inactive, m,
1519 vm_page_t, pageq);
1520 #if MACH_ASSERT
1521 vm_page_inactive_count++; /* balance for purgeable queue asserts */
1522 #endif
1523 token_new_pagecount++;
1524 }
1525 pmap_clear_reference(m->phys_page);
1526 m->reference = FALSE;
1527
1528 vm_pageout_inactive_nolock++;
1529
1530 if ( !queue_empty(&sq->age_q) )
1531 m = (vm_page_t) queue_first(&sq->age_q);
1532 else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1533 queue_empty(&vm_page_queue_inactive)) {
1534 if ( !queue_empty(&vm_page_queue_zf) )
1535 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1536 } else if ( !queue_empty(&vm_page_queue_inactive) ) {
1537 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1538 }
1539 /*
1540 * this is the next object we're going to be interested in
1541 * try to make sure it's available after the mutex_yield
1542 * returns control
1543 */
1544 vm_pageout_scan_wants_object = m->object;
1545
1546 /*
1547 * force us to dump any collected free pages
1548 * and to pause before moving on
1549 */
1550 try_failed = TRUE;
1551
1552 goto done_with_inactivepage;
1553 }
1554 object = m->object;
1555 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1556
1557 try_failed = FALSE;
1558 }
1559
1560 /*
1561 * Paging out pages of external objects which
1562 * are currently being created must be avoided.
1563 * The pager may need to allocate memory, which could lead to a
1564 * deadlock between it and the pageout thread if such pages are
1565 * chosen. The assumption is that there will eventually be enough
1566 * available pages in the inactive pool to page out, so that the
1567 * thread which is concurrently creating the pager can satisfy all
1568 * of the memory it claims.
1569 */
1570 if (!object->pager_initialized && object->pager_created) {
1571 /*
1572 * Move page to end and continue, hoping that
1573 * there will be enough other inactive pages to
1574 * page out so that the thread which currently
1575 * initializes the pager will succeed.
1576 * Don't re-grant the ticket; the page should be
1577 * pulled from the queue and paged out whenever
1578 * one of its logically adjacent fellows is
1579 * targeted.
1580 *
1581 * Pages found on the speculative list can never be
1582 * in this state... they always have a pager associated
1583 * with them.
1584 */
1585 assert(!m->speculative);
1586
1587 if (m->zero_fill) {
1588 queue_remove(&vm_page_queue_zf, m,
1589 vm_page_t, pageq);
1590 queue_enter(&vm_page_queue_zf, m,
1591 vm_page_t, pageq);
1592 } else {
1593 queue_remove(&vm_page_queue_inactive, m,
1594 vm_page_t, pageq);
1595 #if MACH_ASSERT
1596 vm_page_inactive_count--; /* balance for purgeable queue asserts */
1597 #endif
1598 vm_purgeable_q_advance_all(1);
1599
1600 queue_enter(&vm_page_queue_inactive, m,
1601 vm_page_t, pageq);
1602 #if MACH_ASSERT
1603 vm_page_inactive_count++; /* balance for purgeable queue asserts */
1604 #endif
1605 token_new_pagecount++;
1606 }
1607 vm_pageout_inactive_avoid++;
1608
1609 goto done_with_inactivepage;
1610 }
1611 /*
1612 * Remove the page from its list.
1613 */
1614 if (m->speculative) {
1615 remque(&m->pageq);
1616 m->speculative = FALSE;
1617 vm_page_speculative_count--;
1618 } else if (m->throttled) {
1619 queue_remove(&vm_page_queue_throttled, m, vm_page_t, pageq);
1620 m->throttled = FALSE;
1621 vm_page_throttled_count--;
1622 } else {
1623 if (m->zero_fill) {
1624 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1625 vm_zf_queue_count--;
1626 } else {
1627 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1628 }
1629 m->inactive = FALSE;
1630 if (!m->fictitious)
1631 vm_page_inactive_count--;
1632 vm_purgeable_q_advance_all(1);
1633 }
1634
1635 if (object->copy == VM_OBJECT_NULL &&
1636 (object->purgable == VM_PURGABLE_EMPTY ||
1637 object->purgable == VM_PURGABLE_VOLATILE)) {
1638 assert(m->wire_count == 0); /* if it's wired, we can't put it on our queue */
1639 /* just stick it back on! */
1640 goto reactivate_page;
1641 }
1642 m->pageq.next = NULL;
1643 m->pageq.prev = NULL;
1644
1645 if ( !m->fictitious && catch_up_count)
1646 catch_up_count--;
1647
1648 /*
1649 * ENCRYPTED SWAP:
1650 * if this page has already been picked up as part of a
1651 * page-out cluster, it will be busy because it is being
1652 * encrypted (see vm_object_upl_request()). But we still
1653 * want to demote it from "clean-in-place" (aka "adjacent")
1654 * to "clean-and-free" (aka "target"), so let's ignore its
1655 * "busy" bit here and proceed to check for "cleaning" a
1656 * little bit below...
1657 */
1658 if ( !m->encrypted_cleaning && (m->busy || !object->alive)) {
1659 /*
1660 * Somebody is already playing with this page.
1661 * Leave it off the pageout queues.
1662 *
1663 */
1664 vm_pageout_inactive_busy++;
1665
1666 goto done_with_inactivepage;
1667 }
1668
1669 /*
1670 * If it's absent or in error, we can reclaim the page.
1671 */
1672
1673 if (m->absent || m->error) {
1674 vm_pageout_inactive_absent++;
1675 reclaim_page:
1676 if (vm_pageout_deadlock_target) {
1677 vm_pageout_scan_inactive_throttle_success++;
1678 vm_pageout_deadlock_target--;
1679 }
1680
1681 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
1682
1683 if (m->object->internal) {
1684 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
1685 } else {
1686 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
1687 }
1688
1689 vm_page_free_prepare(m);
1690
1691 assert(m->pageq.next == NULL &&
1692 m->pageq.prev == NULL);
1693 m->pageq.next = (queue_entry_t)local_freeq;
1694 local_freeq = m;
1695 local_freed++;
1696
1697 inactive_burst_count = 0;
1698
1699 goto done_with_inactivepage;
1700 }
1701
1702 assert(!m->private);
1703 assert(!m->fictitious);
1704
1705 /*
1706 * If already cleaning this page in place, convert from
1707 * "adjacent" to "target". We can leave the page mapped,
1708 * and vm_pageout_object_terminate will determine whether
1709 * to free or reactivate.
1710 */
1711
1712 if (m->cleaning) {
1713 m->busy = TRUE;
1714 m->pageout = TRUE;
1715 m->dump_cleaning = TRUE;
1716 vm_page_wire(m);
1717
1718 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1719
1720 inactive_burst_count = 0;
1721
1722 goto done_with_inactivepage;
1723 }
1724
1725 /*
1726 * If it's being used, reactivate.
1727 * (Fictitious pages are either busy or absent.)
1728 * First, update the reference and dirty bits
1729 * to make sure the page is unreferenced.
1730 */
1731 refmod_state = -1;
1732
1733 if (m->reference == FALSE && m->pmapped == TRUE) {
1734 refmod_state = pmap_get_refmod(m->phys_page);
1735
1736 if (refmod_state & VM_MEM_REFERENCED)
1737 m->reference = TRUE;
1738 if (refmod_state & VM_MEM_MODIFIED)
1739 m->dirty = TRUE;
1740 }
1741 if (m->reference && !m->no_cache) {
1742 /*
1743 * The page we pulled off the inactive list has
1744 * been referenced. It is possible for other
1745 * processors to be touching pages faster than we
1746 * can clear the referenced bit and traverse the
1747 * inactive queue, so we limit the number of
1748 * reactivations.
1749 */
1750 if (++reactivated_this_call >= reactivate_limit) {
1751 vm_pageout_reactivation_limit_exceeded++;
1752 } else if (catch_up_count) {
1753 vm_pageout_catch_ups++;
1754 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
1755 vm_pageout_inactive_force_reclaim++;
1756 } else {
1757 /*
1758 * The page was being used, so put back on active list.
1759 */
1760 reactivate_page:
1761 vm_page_activate(m);
1762 VM_STAT_INCR(reactivations);
1763
1764 vm_pageout_inactive_used++;
1765 inactive_burst_count = 0;
1766
1767 goto done_with_inactivepage;
1768 }
1769 /*
1770 * Make sure we call pmap_get_refmod() if it
1771 * wasn't already called just above, to update
1772 * the dirty bit.
1773 */
1774 if ((refmod_state == -1) && !m->dirty && m->pmapped) {
1775 refmod_state = pmap_get_refmod(m->phys_page);
1776 if (refmod_state & VM_MEM_MODIFIED)
1777 m->dirty = TRUE;
1778 }
1779 forced_reclaim = TRUE;
1780 } else {
1781 forced_reclaim = FALSE;
1782 }
1783
1784 XPR(XPR_VM_PAGEOUT,
1785 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1786 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1787
1788 /*
1789 * we've got a candidate page to steal...
1790 *
1791 * m->dirty is up to date courtesy of the
1792 * preceding check for m->reference... if
1793 * we get here, then m->reference had to be
1794 * FALSE (or possibly "reactivate_limit" was
1795 * exceeded), but in either case we called
1796 * pmap_get_refmod() and updated both
1797 * m->reference and m->dirty
1798 *
1799 * if it's dirty or precious we need to
1800 * see if the target queue is throttled
1801 * if it is, we need to skip over it by moving it back
1802 * to the end of the inactive queue
1803 */
1804 inactive_throttled = FALSE;
1805
1806 if (m->dirty || m->precious) {
1807 if (object->internal) {
1808 if (VM_PAGE_Q_THROTTLED(iq))
1809 inactive_throttled = TRUE;
1810 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1811 inactive_throttled = TRUE;
1812 }
1813 }
1814 if (inactive_throttled == TRUE) {
1815 throttle_inactive:
1816 if (!IP_VALID(memory_manager_default) &&
1817 object->internal &&
1818 (object->purgable == VM_PURGABLE_DENY ||
1819 object->purgable == VM_PURGABLE_NONVOLATILE)) {
1820 queue_enter(&vm_page_queue_throttled, m,
1821 vm_page_t, pageq);
1822 m->throttled = TRUE;
1823 vm_page_throttled_count++;
1824 } else {
1825 if (m->zero_fill) {
1826 queue_enter(&vm_page_queue_zf, m,
1827 vm_page_t, pageq);
1828 vm_zf_queue_count++;
1829 } else
1830 queue_enter(&vm_page_queue_inactive, m,
1831 vm_page_t, pageq);
1832 m->inactive = TRUE;
1833 if (!m->fictitious) {
1834 vm_page_inactive_count++;
1835 token_new_pagecount++;
1836 }
1837 }
1838 vm_pageout_scan_inactive_throttled++;
1839 goto done_with_inactivepage;
1840 }
1841
1842 /*
1843 * we've got a page that we can steal...
1844 * eliminate all mappings and make sure
1845 * we have the up-to-date modified state
1846 * first take the page BUSY, so that no new
1847 * mappings can be made
1848 */
1849 m->busy = TRUE;
1850
1851 /*
1852 * if we need to do a pmap_disconnect then we
1853 * need to re-evaluate m->dirty since the pmap_disconnect
1854 * provides the true state atomically... the
1855 * page was still mapped up to the pmap_disconnect
1856 * and may have been dirtied at the last microsecond
1857 *
1858 * we also check for the page being referenced 'late'
1859 * if it was, we first need to do a WAKEUP_DONE on it
1860 * since we already set m->busy = TRUE, before
1861 * going off to reactivate it
1862 *
1863 * Note that if 'pmapped' is FALSE then the page is not
1864 * and has not been in any map, so there is no point calling
1865 * pmap_disconnect(). m->dirty and/or m->reference could
1866 * have been set in anticipation of likely usage of the page.
1867 */
1868 if (m->pmapped == TRUE) {
1869 refmod_state = pmap_disconnect(m->phys_page);
1870
1871 if (refmod_state & VM_MEM_MODIFIED)
1872 m->dirty = TRUE;
1873 if (refmod_state & VM_MEM_REFERENCED) {
1874
1875 /* If m->reference is already set, this page must have
1876 * already failed the reactivate_limit test, so don't
1877 * bump the counts twice.
1878 */
1879 if ( ! m->reference ) {
1880 m->reference = TRUE;
1881 if (forced_reclaim ||
1882 ++reactivated_this_call >= reactivate_limit)
1883 vm_pageout_reactivation_limit_exceeded++;
1884 else {
1885 PAGE_WAKEUP_DONE(m);
1886 goto reactivate_page;
1887 }
1888 }
1889 }
1890 }
1891 /*
1892 * reset our count of pages that have been reclaimed
1893 * since the last page was 'stolen'
1894 */
1895 inactive_reclaim_run = 0;
1896
1897 /*
1898 * If it's clean and not precious, we can free the page.
1899 */
1900 if (!m->dirty && !m->precious) {
1901 vm_pageout_inactive_clean++;
1902 goto reclaim_page;
1903 }
1904
1905 /*
1906 * The page may have been dirtied since the last check
1907 * for a throttled target queue (which may have been skipped
1908 * if the page was clean then). With the dirty page
1909 * disconnected here, we can make one final check.
1910 */
1911 {
1912 boolean_t disconnect_throttled = FALSE;
1913 if (object->internal) {
1914 if (VM_PAGE_Q_THROTTLED(iq))
1915 disconnect_throttled = TRUE;
1916 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1917 disconnect_throttled = TRUE;
1918 }
1919
1920 if (disconnect_throttled == TRUE) {
1921 PAGE_WAKEUP_DONE(m);
1922 goto throttle_inactive;
1923 }
1924 }
1925
1926 vm_pageout_cluster(m);
1927
1928 vm_pageout_inactive_dirty++;
1929
1930 inactive_burst_count = 0;
1931
1932 done_with_inactivepage:
1933 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1934
1935 if (object != NULL) {
1936 vm_object_unlock(object);
1937 object = NULL;
1938 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1939 }
1940 if (local_freeq) {
1941 vm_page_free_list(local_freeq);
1942
1943 local_freeq = NULL;
1944 local_freed = 0;
1945 }
1946 mutex_yield(&vm_page_queue_lock);
1947
1948 delayed_unlock = 1;
1949 }
1950 /*
1951 * back to top of pageout scan loop
1952 */
1953 }
1954 }
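/*
 * Tail of the inactive-scan loop above: each candidate page is either
 * reclaimed on the spot (absent, in error, or clean and not precious),
 * reactivated (recently referenced), requeued because its target
 * pageout queue is throttled, or handed to vm_pageout_cluster() for
 * laundering.  Freed pages are batched on local_freeq, and the page
 * queues lock is yielded via mutex_yield() once delayed_unlock exceeds
 * VM_PAGEOUT_DELAYED_UNLOCK_LIMIT or a lock try failed.
 */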
1955
1956
1957 int vm_page_free_count_init;
1958
1959 void
1960 vm_page_free_reserve(
1961 int pages)
1962 {
1963 int free_after_reserve;
1964
1965 vm_page_free_reserved += pages;
1966
1967 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1968
1969 vm_page_free_min = vm_page_free_reserved +
1970 VM_PAGE_FREE_MIN(free_after_reserve);
1971
1972 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
1973 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
1974
1975 vm_page_free_target = vm_page_free_reserved +
1976 VM_PAGE_FREE_TARGET(free_after_reserve);
1977
1978 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
1979 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
1980
1981 if (vm_page_free_target < vm_page_free_min + 5)
1982 vm_page_free_target = vm_page_free_min + 5;
1983
1984 }
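/*
 * Illustration with hypothetical numbers (not taken from this file):
 * if vm_page_free_count_init were 100000 pages and the reserve grew to
 * 300, free_after_reserve would be 99700; vm_page_free_min and
 * vm_page_free_target are then derived from that figure through the
 * VM_PAGE_FREE_MIN()/VM_PAGE_FREE_TARGET() macros, clipped to their
 * *_LIMIT ceilings, with the target always held at least 5 pages above
 * the minimum.
 */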
1985
1986 /*
1987 * vm_pageout is the high level pageout daemon.
1988 */
1989
1990 void
1991 vm_pageout_continue(void)
1992 {
1993 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
1994 vm_pageout_scan_event_counter++;
1995 vm_pageout_scan();
1996 /* we hold vm_page_queue_free_lock now */
1997 assert(vm_page_free_wanted == 0);
1998 assert(vm_page_free_wanted_privileged == 0);
1999 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
2000 mutex_unlock(&vm_page_queue_free_lock);
2001
2002 counter(c_vm_pageout_block++);
2003 thread_block((thread_continue_t)vm_pageout_continue);
2004 /*NOTREACHED*/
2005 }
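/*
 * vm_pageout_continue() never returns: each pass ends by asserting a
 * wait on &vm_page_free_wanted and re-entering itself as the
 * thread_block() continuation, so the daemon loops forever without
 * holding a kernel stack while asleep.  A minimal sketch of that
 * continuation idiom follows; do_one_pass() and my_event are
 * hypothetical stand-ins, not symbols from this file.
 */
#if 0
static void
my_daemon_continue(void)
{
	do_one_pass();					/* e.g. a scan pass */
	assert_wait((event_t) &my_event, THREAD_UNINT);	/* arm the wakeup */
	thread_block((thread_continue_t) my_daemon_continue);
	/*NOTREACHED*/
}
#endif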
2006
2007
2008 /*
2009 * must be called with the
2010 * queues and object locks held
2011 */
2012 static void
2013 vm_pageout_queue_steal(vm_page_t m)
2014 {
2015 struct vm_pageout_queue *q;
2016
2017 if (m->object->internal == TRUE)
2018 q = &vm_pageout_queue_internal;
2019 else
2020 q = &vm_pageout_queue_external;
2021
2022 m->laundry = FALSE;
2023 m->pageout_queue = FALSE;
2024 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
2025
2026 m->pageq.next = NULL;
2027 m->pageq.prev = NULL;
2028
2029 vm_object_paging_end(m->object);
2030
2031 q->pgo_laundry--;
2032 }
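/*
 * vm_pageout_queue_steal() is used when a page already queued for
 * pageout gets picked up by another path (see the pageout_queue check
 * in vm_object_upl_request below): the page is pulled off the internal
 * or external pending queue that matches its object, its laundry and
 * pageout_queue state is cleared, the object's paging reference is
 * dropped, and the queue's laundry count is decremented.
 */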
2033
2034
2035 #ifdef FAKE_DEADLOCK
2036
2037 #define FAKE_COUNT 5000
2038
2039 int internal_count = 0;
2040 int fake_deadlock = 0;
2041
2042 #endif
2043
2044 static void
2045 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2046 {
2047 vm_page_t m = NULL;
2048 vm_object_t object;
2049 boolean_t need_wakeup;
2050 memory_object_t pager;
2051 thread_t self = current_thread();
2052
2053 if ((vm_pageout_internal_iothread != THREAD_NULL)
2054 && (self == vm_pageout_external_iothread )
2055 && (self->options & TH_OPT_VMPRIV))
2056 self->options &= ~TH_OPT_VMPRIV;
2057
2058 vm_page_lockspin_queues();
2059
2060 while ( !queue_empty(&q->pgo_pending) ) {
2061
2062 q->pgo_busy = TRUE;
2063 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
2064 m->pageout_queue = FALSE;
2065 vm_page_unlock_queues();
2066
2067 m->pageq.next = NULL;
2068 m->pageq.prev = NULL;
2069 #ifdef FAKE_DEADLOCK
2070 if (q == &vm_pageout_queue_internal) {
2071 vm_offset_t addr;
2072 int pg_count;
2073
2074 internal_count++;
2075
2076 if ((internal_count == FAKE_COUNT)) {
2077
2078 pg_count = vm_page_free_count + vm_page_free_reserved;
2079
2080 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2081 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2082 }
2083 internal_count = 0;
2084 fake_deadlock++;
2085 }
2086 }
2087 #endif
2088 object = m->object;
2089
2090 vm_object_lock(object);
2091
2092 if (!object->pager_initialized) {
2093
2094 /*
2095 * If there is no memory object for the page, create
2096 * one and hand it to the default pager.
2097 */
2098
2099 if (!object->pager_initialized)
2100 vm_object_collapse(object,
2101 (vm_object_offset_t) 0,
2102 TRUE);
2103 if (!object->pager_initialized)
2104 vm_object_pager_create(object);
2105 if (!object->pager_initialized) {
2106 /*
2107 * Still no pager for the object.
2108 * Reactivate the page.
2109 *
2110 * Should only happen if there is no
2111 * default pager.
2112 */
2113 m->list_req_pending = FALSE;
2114 m->cleaning = FALSE;
2115 m->pageout = FALSE;
2116
2117 vm_page_lockspin_queues();
2118 vm_page_unwire(m);
2119 vm_pageout_throttle_up(m);
2120 vm_pageout_dirty_no_pager++;
2121 vm_page_activate(m);
2122 vm_page_unlock_queues();
2123
2124 /*
2125 * And we are done with it.
2126 */
2127 PAGE_WAKEUP_DONE(m);
2128
2129 vm_object_paging_end(object);
2130 vm_object_unlock(object);
2131
2132 vm_page_lockspin_queues();
2133 continue;
2134 }
2135 }
2136 pager = object->pager;
2137 if (pager == MEMORY_OBJECT_NULL) {
2138 /*
2139 * This pager has been destroyed by either
2140 * memory_object_destroy or vm_object_destroy, and
2141 * so there is nowhere for the page to go.
2142 * Just free the page... VM_PAGE_FREE takes
2143 * care of cleaning up all the state...
2144 * including doing the vm_pageout_throttle_up
2145 */
2146
2147 VM_PAGE_FREE(m);
2148
2149 vm_object_paging_end(object);
2150 vm_object_unlock(object);
2151
2152 vm_page_lockspin_queues();
2153 continue;
2154 }
2155 vm_object_unlock(object);
2156 /*
2157 * we expect the paging_in_progress reference to have
2158 * already been taken on the object before it was added
2159 * to the appropriate pageout I/O queue... this will
2160 * keep the object from being terminated and/or the
2161 * paging_offset from changing until the I/O has
2162 * completed... therefore no need to lock the object to
2163 * pull the paging_offset from it.
2164 *
2165 * Send the data to the pager.
2166 * any pageout clustering happens there
2167 */
2168 memory_object_data_return(pager,
2169 m->offset + object->paging_offset,
2170 PAGE_SIZE,
2171 NULL,
2172 NULL,
2173 FALSE,
2174 FALSE,
2175 0);
2176
2177 vm_object_lock(object);
2178 vm_object_paging_end(object);
2179 vm_object_unlock(object);
2180
2181 vm_page_lockspin_queues();
2182 }
2183 assert_wait((event_t) q, THREAD_UNINT);
2184
2185
2186 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2187 q->pgo_throttled = FALSE;
2188 need_wakeup = TRUE;
2189 } else
2190 need_wakeup = FALSE;
2191
2192 q->pgo_busy = FALSE;
2193 q->pgo_idle = TRUE;
2194 vm_page_unlock_queues();
2195
2196 if (need_wakeup == TRUE)
2197 thread_wakeup((event_t) &q->pgo_laundry);
2198
2199 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2200 /*NOTREACHED*/
2201 }
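/*
 * Summary of the I/O thread loop above: it drains q->pgo_pending one
 * page at a time, making sure the page's object has a pager (trying a
 * collapse and then vm_object_pager_create() before giving up and
 * reactivating the page), freeing pages whose pager has already been
 * destroyed, and pushing everything else to the pager with
 * memory_object_data_return().  When the queue is empty it parks on
 * the queue event via thread_block_parameter(), first waking any
 * throttled waiters if the laundry has drained below the throttle
 * limit.
 */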
2202
2203
2204 static void
2205 vm_pageout_iothread_external(void)
2206 {
2207 thread_t self = current_thread();
2208
2209 self->options |= TH_OPT_VMPRIV;
2210
2211 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2212 /*NOTREACHED*/
2213 }
2214
2215
2216 static void
2217 vm_pageout_iothread_internal(void)
2218 {
2219 thread_t self = current_thread();
2220
2221 self->options |= TH_OPT_VMPRIV;
2222
2223 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2224 /*NOTREACHED*/
2225 }
2226
2227 static void
2228 vm_pageout_garbage_collect(int collect)
2229 {
2230 if (collect) {
2231 stack_collect();
2232
2233 /*
2234 * consider_zone_gc should be last, because the other operations
2235 * might return memory to zones.
2236 */
2237 consider_machine_collect();
2238 consider_zone_gc();
2239
2240 consider_machine_adjust();
2241 }
2242
2243 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2244
2245 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2246 /*NOTREACHED*/
2247 }
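/*
 * The garbage-collection thread sleeps on &vm_pageout_garbage_collect
 * and, when woken with a non-zero argument, reclaims kernel resources:
 * unused kernel stacks (stack_collect), machine-dependent caches, and
 * zone memory (consider_zone_gc), followed by consider_machine_adjust().
 * It re-blocks with an argument of 1, so every subsequent wakeup
 * performs a full collection pass.
 */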
2248
2249
2250
2251 void
2252 vm_pageout(void)
2253 {
2254 thread_t self = current_thread();
2255 thread_t thread;
2256 kern_return_t result;
2257 spl_t s;
2258
2259 /*
2260 * Set thread privileges.
2261 */
2262 s = splsched();
2263 thread_lock(self);
2264 self->priority = BASEPRI_PREEMPT - 1;
2265 set_sched_pri(self, self->priority);
2266 thread_unlock(self);
2267
2268 if (!self->reserved_stack)
2269 self->reserved_stack = self->kernel_stack;
2270
2271 splx(s);
2272
2273 /*
2274 * Initialize some paging parameters.
2275 */
2276
2277 if (vm_pageout_idle_wait == 0)
2278 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2279
2280 if (vm_pageout_burst_wait == 0)
2281 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2282
2283 if (vm_pageout_empty_wait == 0)
2284 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2285
2286 if (vm_pageout_deadlock_wait == 0)
2287 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2288
2289 if (vm_pageout_deadlock_relief == 0)
2290 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2291
2292 if (vm_pageout_inactive_relief == 0)
2293 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2294
2295 if (vm_pageout_burst_active_throttle == 0)
2296 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2297
2298 if (vm_pageout_burst_inactive_throttle == 0)
2299 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2300
2301 /*
2302 * Set kernel task to low backing store privileged
2303 * status
2304 */
2305 task_lock(kernel_task);
2306 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2307 task_unlock(kernel_task);
2308
2309 vm_page_free_count_init = vm_page_free_count;
2310
2311 /*
2312 * even if we've already called vm_page_free_reserve
2313 * call it again here to ensure that the targets are
2314 * accurately calculated (it uses vm_page_free_count_init)
2315 * calling it with an arg of 0 will not change the reserve
2316 * but will re-calculate free_min and free_target
2317 */
2318 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2319 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2320 } else
2321 vm_page_free_reserve(0);
2322
2323
2324 queue_init(&vm_pageout_queue_external.pgo_pending);
2325 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2326 vm_pageout_queue_external.pgo_laundry = 0;
2327 vm_pageout_queue_external.pgo_idle = FALSE;
2328 vm_pageout_queue_external.pgo_busy = FALSE;
2329 vm_pageout_queue_external.pgo_throttled = FALSE;
2330
2331 queue_init(&vm_pageout_queue_internal.pgo_pending);
2332 vm_pageout_queue_internal.pgo_maxlaundry = 0;
2333 vm_pageout_queue_internal.pgo_laundry = 0;
2334 vm_pageout_queue_internal.pgo_idle = FALSE;
2335 vm_pageout_queue_internal.pgo_busy = FALSE;
2336 vm_pageout_queue_internal.pgo_throttled = FALSE;
2337
2338
2339 /* internal pageout thread started when default pager registered first time */
2340 /* external pageout and garbage collection threads started here */
2341
2342 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
2343 BASEPRI_PREEMPT - 1,
2344 &vm_pageout_external_iothread);
2345 if (result != KERN_SUCCESS)
2346 panic("vm_pageout_iothread_external: create failed");
2347
2348 thread_deallocate(vm_pageout_external_iothread);
2349
2350 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
2351 MINPRI_KERNEL,
2352 &thread);
2353 if (result != KERN_SUCCESS)
2354 panic("vm_pageout_garbage_collect: create failed");
2355
2356 thread_deallocate(thread);
2357
2358 vm_object_reaper_init();
2359
2360
2361 vm_pageout_continue();
2362
2363 /*
2364 * Unreached code!
2365 *
2366 * The vm_pageout_continue() call above never returns, so the code below is never
2367 * executed. We take advantage of this to declare several DTrace VM related probe
2368 * points that our kernel doesn't have an analog for. These are probe points that
2369 * exist in Solaris and are in the DTrace documentation, so people may have written
2370 * scripts that use them. Declaring the probe points here means their scripts will
2371 * compile and execute which we want for portability of the scripts, but since this
2372 * section of code is never reached, the probe points will simply never fire. Yes,
2373 * this is basically a hack. The problem is the DTrace probe points were chosen with
2374 * Solaris specific VM events in mind, not portability to different VM implementations.
2375 */
2376
2377 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
2378 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
2379 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
2380 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
2381 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
2382 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
2383 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
2384 /*NOTREACHED*/
2385 }
2386
2387 kern_return_t
2388 vm_pageout_internal_start(void)
2389 {
2390 kern_return_t result;
2391
2392 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2393 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
2394 if (result == KERN_SUCCESS)
2395 thread_deallocate(vm_pageout_internal_iothread);
2396 return result;
2397 }
2398
2399 #define UPL_DELAYED_UNLOCK_LIMIT (MAX_UPL_TRANSFER / 2)
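/*
 * UPL_DELAYED_UNLOCK_LIMIT bounds how many pages vm_object_upl_request()
 * processes while continuously holding the vm_page_queue_lock; once the
 * limit is crossed the lock is briefly yielded via mutex_yield() (see
 * delay_unlock_queues below) so other users of the page queues are not
 * starved.
 */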
2400
2401 static upl_t
2402 upl_create(int type, int flags, upl_size_t size)
2403 {
2404 upl_t upl;
2405 int page_field_size = 0;
2406 int upl_flags = 0;
2407 int upl_size = sizeof(struct upl);
2408
2409 if (type & UPL_CREATE_LITE) {
2410 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2411 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2412
2413 upl_flags |= UPL_LITE;
2414 }
2415 if (type & UPL_CREATE_INTERNAL) {
2416 upl_size += sizeof(struct upl_page_info) * (size/PAGE_SIZE);
2417
2418 upl_flags |= UPL_INTERNAL;
2419 }
2420 upl = (upl_t)kalloc(upl_size + page_field_size);
2421
2422 if (page_field_size)
2423 bzero((char *)upl + upl_size, page_field_size);
2424
2425 upl->flags = upl_flags | flags;
2426 upl->src_object = NULL;
2427 upl->kaddr = (vm_offset_t)0;
2428 upl->size = 0;
2429 upl->map_object = NULL;
2430 upl->ref_count = 1;
2431 upl->highest_page = 0;
2432 upl_lock_init(upl);
2433 #ifdef UPL_DEBUG
2434 upl->ubc_alias1 = 0;
2435 upl->ubc_alias2 = 0;
2436 #endif /* UPL_DEBUG */
2437 return(upl);
2438 }
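/*
 * Sizing illustration (hypothetical numbers, assuming 4 KB pages): a
 * 1 MB request (256 pages) created with UPL_CREATE_INTERNAL |
 * UPL_CREATE_LITE is satisfied by the single kalloc() above covering
 * sizeof(struct upl), plus 256 * sizeof(struct upl_page_info) for the
 * embedded page list, plus a 32-byte lite bitmap (256 bits rounded up
 * to a 4-byte boundary); only the bitmap portion is bzero'd.
 */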
2439
2440 static void
2441 upl_destroy(upl_t upl)
2442 {
2443 int page_field_size; /* bit field in word size buf */
2444 int size;
2445
2446 #ifdef UPL_DEBUG
2447 {
2448 vm_object_t object;
2449
2450 if (upl->flags & UPL_SHADOWED) {
2451 object = upl->map_object->shadow;
2452 } else {
2453 object = upl->map_object;
2454 }
2455 vm_object_lock(object);
2456 queue_remove(&object->uplq, upl, upl_t, uplq);
2457 vm_object_unlock(object);
2458 }
2459 #endif /* UPL_DEBUG */
2460 /*
2461 * drop a reference on the map_object whether or
2462 * not a pageout object is inserted
2463 */
2464 if (upl->flags & UPL_SHADOWED)
2465 vm_object_deallocate(upl->map_object);
2466
2467 if (upl->flags & UPL_DEVICE_MEMORY)
2468 size = PAGE_SIZE;
2469 else
2470 size = upl->size;
2471 page_field_size = 0;
2472
2473 if (upl->flags & UPL_LITE) {
2474 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2475 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2476 }
2477 if (upl->flags & UPL_INTERNAL) {
2478 kfree(upl,
2479 sizeof(struct upl) +
2480 (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
2481 + page_field_size);
2482 } else {
2483 kfree(upl, sizeof(struct upl) + page_field_size);
2484 }
2485 }
2486
2487 void uc_upl_dealloc(upl_t upl);
2488 __private_extern__ void
2489 uc_upl_dealloc(upl_t upl)
2490 {
2491 if (--upl->ref_count == 0)
2492 upl_destroy(upl);
2493 }
2494
2495 void
2496 upl_deallocate(upl_t upl)
2497 {
2498 if (--upl->ref_count == 0)
2499 upl_destroy(upl);
2500 }
2501
2502 /*
2503 * Statistics about UPL enforcement of copy-on-write obligations.
2504 */
2505 unsigned long upl_cow = 0;
2506 unsigned long upl_cow_again = 0;
2507 unsigned long upl_cow_contiguous = 0;
2508 unsigned long upl_cow_pages = 0;
2509 unsigned long upl_cow_again_pages = 0;
2510 unsigned long upl_cow_contiguous_pages = 0;
2511
2512 /*
2513 * Routine: vm_object_upl_request
2514 * Purpose:
2515 * Cause the population of a portion of a vm_object.
2516 * Depending on the nature of the request, the pages
2517 * returned may contain valid data or be uninitialized.
2518 * A page list structure, listing the physical pages
2519 * will be returned upon request.
2520 * This function is called by the file system or any other
2521 * supplier of backing store to a pager.
2522 * IMPORTANT NOTE: The caller must still respect the relationship
2523 * between the vm_object and its backing memory object. The
2524 * caller MUST NOT substitute changes in the backing file
2525 * without first doing a memory_object_lock_request on the
2526 * target range unless it is known that the pages are not
2527 * shared with another entity at the pager level.
2528 * Copy_in_to:
2529 * if a page list structure is present
2530 * return the mapped physical pages, where a
2531 * page is not present, return a non-initialized
2532 * one. If the no_sync bit is turned on, don't
2533 * call the pager unlock to synchronize with other
2534 * possible copies of the page. Leave pages busy
2535 * in the original object, if a page list structure
2536 * was specified. When a commit of the page list
2537 * pages is done, the dirty bit will be set for each one.
2538 * Copy_out_from:
2539 * If a page list structure is present, return
2540 * all mapped pages. Where a page does not exist
2541 * map a zero filled one. Leave pages busy in
2542 * the original object. If a page list structure
2543 * is not specified, this call is a no-op.
2544 *
2545 * Note: access of default pager objects has a rather interesting
2546 * twist. The caller of this routine, presumably the file system
2547 * page cache handling code, will never actually make a request
2548 * against a default pager backed object. Only the default
2549 * pager will make requests on backing store related vm_objects.
2550 * In this way the default pager can maintain the relationship
2551 * between backing store files (abstract memory objects) and
2552 * the vm_objects (cache objects) they support.
2553 *
2554 */
2555
2556 __private_extern__ kern_return_t
2557 vm_object_upl_request(
2558 vm_object_t object,
2559 vm_object_offset_t offset,
2560 upl_size_t size,
2561 upl_t *upl_ptr,
2562 upl_page_info_array_t user_page_list,
2563 unsigned int *page_list_count,
2564 int cntrl_flags)
2565 {
2566 vm_page_t dst_page = VM_PAGE_NULL;
2567 vm_object_offset_t dst_offset;
2568 upl_size_t xfer_size;
2569 boolean_t dirty;
2570 boolean_t hw_dirty;
2571 upl_t upl = NULL;
2572 unsigned int entry;
2573 #if MACH_CLUSTER_STATS
2574 boolean_t encountered_lrp = FALSE;
2575 #endif
2576 vm_page_t alias_page = NULL;
2577 int refmod_state = 0;
2578 wpl_array_t lite_list = NULL;
2579 vm_object_t last_copy_object;
2580 int delayed_unlock = 0;
2581
2582 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2583 /*
2584 * For forward compatibility's sake,
2585 * reject any unknown flag.
2586 */
2587 return KERN_INVALID_VALUE;
2588 }
2589 if ( (!object->internal) && (object->paging_offset != 0) )
2590 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2591 if (object->phys_contiguous)
2592 panic("vm_object_upl_request: contiguous object specified\n");
2593
2594
2595 if ((size / PAGE_SIZE) > MAX_UPL_TRANSFER)
2596 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2597
2598 if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
2599 *page_list_count = MAX_UPL_TRANSFER;
2600
2601 if (cntrl_flags & UPL_SET_INTERNAL) {
2602 if (cntrl_flags & UPL_SET_LITE) {
2603
2604 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
2605
2606 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2607 lite_list = (wpl_array_t)
2608 (((uintptr_t)user_page_list) +
2609 ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
2610 } else {
2611 upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
2612
2613 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2614 }
2615 } else {
2616 if (cntrl_flags & UPL_SET_LITE) {
2617
2618 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
2619
2620 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
2621 } else {
2622 upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
2623 }
2624 }
2625 *upl_ptr = upl;
2626
2627 if (user_page_list)
2628 user_page_list[0].device = FALSE;
2629
2630 if (cntrl_flags & UPL_SET_LITE) {
2631 upl->map_object = object;
2632 } else {
2633 upl->map_object = vm_object_allocate(size);
2634 /*
2635 * No need to lock the new object: nobody else knows
2636 * about it yet, so it's all ours so far.
2637 */
2638 upl->map_object->shadow = object;
2639 upl->map_object->pageout = TRUE;
2640 upl->map_object->can_persist = FALSE;
2641 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2642 upl->map_object->shadow_offset = offset;
2643 upl->map_object->wimg_bits = object->wimg_bits;
2644
2645 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2646
2647 upl->flags |= UPL_SHADOWED;
2648 }
2649 /*
2650 * ENCRYPTED SWAP:
2651 * Just mark the UPL as "encrypted" here.
2652 * We'll actually encrypt the pages later,
2653 * in upl_encrypt(), when the caller has
2654 * selected which pages need to go to swap.
2655 */
2656 if (cntrl_flags & UPL_ENCRYPT)
2657 upl->flags |= UPL_ENCRYPTED;
2658
2659 if (cntrl_flags & UPL_FOR_PAGEOUT)
2660 upl->flags |= UPL_PAGEOUT;
2661
2662 vm_object_lock(object);
2663 vm_object_paging_begin(object);
2664
2665 /*
2666 * we can lock in the paging_offset once paging_in_progress is set
2667 */
2668 upl->size = size;
2669 upl->offset = offset + object->paging_offset;
2670
2671 #ifdef UPL_DEBUG
2672 queue_enter(&object->uplq, upl, upl_t, uplq);
2673 #endif /* UPL_DEBUG */
2674
2675 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
2676 /*
2677 * Honor copy-on-write obligations
2678 *
2679 * The caller is gathering these pages and
2680 * might modify their contents. We need to
2681 * make sure that the copy object has its own
2682 * private copies of these pages before we let
2683 * the caller modify them.
2684 */
2685 vm_object_update(object,
2686 offset,
2687 size,
2688 NULL,
2689 NULL,
2690 FALSE, /* should_return */
2691 MEMORY_OBJECT_COPY_SYNC,
2692 VM_PROT_NO_CHANGE);
2693 upl_cow++;
2694 upl_cow_pages += size >> PAGE_SHIFT;
2695 }
2696 /*
2697 * remember which copy object we synchronized with
2698 */
2699 last_copy_object = object->copy;
2700 entry = 0;
2701
2702 xfer_size = size;
2703 dst_offset = offset;
2704
2705 while (xfer_size) {
2706
2707 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
2708 if (delayed_unlock) {
2709 delayed_unlock = 0;
2710 vm_page_unlock_queues();
2711 }
2712 vm_object_unlock(object);
2713 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2714 vm_object_lock(object);
2715 }
2716 if (delayed_unlock == 0)
2717 vm_page_lock_queues();
2718
2719 if (cntrl_flags & UPL_COPYOUT_FROM) {
2720 upl->flags |= UPL_PAGE_SYNC_DONE;
2721
2722 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2723 dst_page->fictitious ||
2724 dst_page->absent ||
2725 dst_page->error ||
2726 (dst_page->wire_count && !dst_page->pageout && !dst_page->list_req_pending)) {
2727
2728 if (user_page_list)
2729 user_page_list[entry].phys_addr = 0;
2730
2731 goto delay_unlock_queues;
2732 }
2733 /*
2734 * grab this up front...
2735 * a high percentage of the time we're going to
2736 * need the hardware modification state a bit later
2737 * anyway... so we can eliminate an extra call into
2738 * the pmap layer by grabbing it here and recording it
2739 */
2740 if (dst_page->pmapped)
2741 refmod_state = pmap_get_refmod(dst_page->phys_page);
2742 else
2743 refmod_state = 0;
2744
2745 if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
2746 /*
2747 * page is on inactive list and referenced...
2748 * reactivate it now... this gets it out of the
2749 * way of vm_pageout_scan which would have to
2750 * reactivate it upon tripping over it
2751 */
2752 vm_page_activate(dst_page);
2753 VM_STAT_INCR(reactivations);
2754 }
2755 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2756 /*
2757 * we're only asking for DIRTY pages to be returned
2758 */
2759 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2760 /*
2761 * if this is the page stolen by vm_pageout_scan to be
2762 * cleaned (as opposed to a buddy being clustered in),
2763 * or this request is not being driven by a PAGEOUT cluster,
2764 * then we only need to check for the page being dirty or
2765 * precious to decide whether to return it
2766 */
2767 if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
2768 goto check_busy;
2769 goto dont_return;
2770 }
2771 /*
2772 * this is a request for a PAGEOUT cluster and this page
2773 * is merely along for the ride as a 'buddy'... not only
2774 * does it have to be dirty to be returned, but it also
2775 * can't have been referenced recently... note that we've
2776 * already filtered above based on whether this page is
2777 * currently on the inactive queue or it meets the page
2778 * ticket (generation count) check
2779 */
2780 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2781 ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
2782 goto check_busy;
2783 }
2784 dont_return:
2785 /*
2786 * if we reach here, we're not to return
2787 * the page... go on to the next one
2788 */
2789 if (user_page_list)
2790 user_page_list[entry].phys_addr = 0;
2791
2792 goto delay_unlock_queues;
2793 }
2794 check_busy:
2795 if (dst_page->busy && (!(dst_page->list_req_pending && dst_page->pageout))) {
2796 if (cntrl_flags & UPL_NOBLOCK) {
2797 if (user_page_list)
2798 user_page_list[entry].phys_addr = 0;
2799
2800 goto delay_unlock_queues;
2801 }
2802 /*
2803 * someone else is playing with the
2804 * page. We will have to wait.
2805 */
2806 delayed_unlock = 0;
2807 vm_page_unlock_queues();
2808
2809 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2810
2811 continue;
2812 }
2813 /*
2814 * Someone else already cleaning the page?
2815 */
2816 if ((dst_page->cleaning || dst_page->absent || dst_page->wire_count != 0) && !dst_page->list_req_pending) {
2817 if (user_page_list)
2818 user_page_list[entry].phys_addr = 0;
2819
2820 goto delay_unlock_queues;
2821 }
2822 /*
2823 * ENCRYPTED SWAP:
2824 * The caller is gathering this page and might
2825 * access its contents later on. Decrypt the
2826 * page before adding it to the UPL, so that
2827 * the caller never sees encrypted data.
2828 */
2829 if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
2830 int was_busy;
2831
2832 delayed_unlock = 0;
2833 vm_page_unlock_queues();
2834 /*
2835 * save the current state of busy
2836 * mark page as busy while decrypt
2837 * is in progress since it will drop
2838 * the object lock...
2839 */
2840 was_busy = dst_page->busy;
2841 dst_page->busy = TRUE;
2842
2843 vm_page_decrypt(dst_page, 0);
2844 vm_page_decrypt_for_upl_counter++;
2845 /*
2846 * restore to original busy state
2847 */
2848 dst_page->busy = was_busy;
2849
2850 vm_page_lock_queues();
2851 }
2852 if (dst_page->pageout_queue == TRUE)
2853 /*
2854 * we've buddied up a page for a clustered pageout
2855 * that has already been moved to the pageout
2856 * queue by pageout_scan... we need to remove
2857 * it from the queue and drop the laundry count
2858 * on that queue
2859 */
2860 vm_pageout_queue_steal(dst_page);
2861 #if MACH_CLUSTER_STATS
2862 /*
2863 * pageout statistics gathering. count
2864 * all the pages we will page out that
2865 * were not counted in the initial
2866 * vm_pageout_scan work
2867 */
2868 if (dst_page->list_req_pending)
2869 encountered_lrp = TRUE;
2870 if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious)) && !dst_page->list_req_pending) {
2871 if (encountered_lrp)
2872 CLUSTER_STAT(pages_at_higher_offsets++;)
2873 else
2874 CLUSTER_STAT(pages_at_lower_offsets++;)
2875 }
2876 #endif
2877 /*
2878 * Turn off busy indication on pending
2879 * pageout. Note: we can only get here
2880 * in the request pending case.
2881 */
2882 dst_page->list_req_pending = FALSE;
2883 dst_page->busy = FALSE;
2884
2885 hw_dirty = refmod_state & VM_MEM_MODIFIED;
2886 dirty = hw_dirty ? TRUE : dst_page->dirty;
2887
2888 if (dst_page->phys_page > upl->highest_page)
2889 upl->highest_page = dst_page->phys_page;
2890
2891 if (cntrl_flags & UPL_SET_LITE) {
2892 int pg_num;
2893
2894 pg_num = (dst_offset-offset)/PAGE_SIZE;
2895 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
2896
2897 if (hw_dirty)
2898 pmap_clear_modify(dst_page->phys_page);
2899
2900 /*
2901 * Mark original page as cleaning
2902 * in place.
2903 */
2904 dst_page->cleaning = TRUE;
2905 dst_page->precious = FALSE;
2906 } else {
2907 /*
2908 * use pageclean setup, it is more
2909 * convenient even for the pageout
2910 * cases here
2911 */
2912 vm_object_lock(upl->map_object);
2913 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
2914 vm_object_unlock(upl->map_object);
2915
2916 alias_page->absent = FALSE;
2917 alias_page = NULL;
2918 }
2919 #if MACH_PAGEMAP
2920 /*
2921 * Record that this page has been
2922 * written out
2923 */
2924 vm_external_state_set(object->existence_map, dst_page->offset);
2925 #endif /*MACH_PAGEMAP*/
2926 dst_page->dirty = dirty;
2927
2928 if (!dirty)
2929 dst_page->precious = TRUE;
2930
2931 if (dst_page->pageout)
2932 dst_page->busy = TRUE;
2933
2934 if ( (cntrl_flags & UPL_ENCRYPT) ) {
2935 /*
2936 * ENCRYPTED SWAP:
2937 * We want to deny access to the target page
2938 * because its contents are about to be
2939 * encrypted and the user would be very
2940 * confused to see encrypted data instead
2941 * of their data.
2942 * We also set "encrypted_cleaning" to allow
2943 * vm_pageout_scan() to demote that page
2944 * from "adjacent/clean-in-place" to
2945 * "target/clean-and-free" if it bumps into
2946 * this page during its scanning while we're
2947 * still processing this cluster.
2948 */
2949 dst_page->busy = TRUE;
2950 dst_page->encrypted_cleaning = TRUE;
2951 }
2952 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2953 /*
2954 * deny access to the target page
2955 * while it is being worked on
2956 */
2957 if ((!dst_page->pageout) && (dst_page->wire_count == 0)) {
2958 dst_page->busy = TRUE;
2959 dst_page->pageout = TRUE;
2960 vm_page_wire(dst_page);
2961 }
2962 }
2963 } else {
2964 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
2965 /*
2966 * Honor copy-on-write obligations
2967 *
2968 * The copy object has changed since we
2969 * last synchronized for copy-on-write.
2970 * Another copy object might have been
2971 * inserted while we released the object's
2972 * lock. Since someone could have seen the
2973 * original contents of the remaining pages
2974 * through that new object, we have to
2975 * synchronize with it again for the remaining
2976 * pages only. The previous pages are "busy"
2977 * so they can not be seen through the new
2978 * mapping. The new mapping will see our
2979 * upcoming changes for those previous pages,
2980 * but that's OK since they couldn't see what
2981 * was there before. It's just a race anyway
2982 * and there's no guarantee of consistency or
2983 * atomicity. We just don't want new mappings
2984 * to see both the *before* and *after* pages.
2985 */
2986 if (object->copy != VM_OBJECT_NULL) {
2987 delayed_unlock = 0;
2988 vm_page_unlock_queues();
2989
2990 vm_object_update(
2991 object,
2992 dst_offset,/* current offset */
2993 xfer_size, /* remaining size */
2994 NULL,
2995 NULL,
2996 FALSE, /* should_return */
2997 MEMORY_OBJECT_COPY_SYNC,
2998 VM_PROT_NO_CHANGE);
2999
3000 upl_cow_again++;
3001 upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
3002
3003 vm_page_lock_queues();
3004 }
3005 /*
3006 * remember the copy object we synced with
3007 */
3008 last_copy_object = object->copy;
3009 }
3010 dst_page = vm_page_lookup(object, dst_offset);
3011
3012 if (dst_page != VM_PAGE_NULL) {
3013 if ( !(dst_page->list_req_pending) ) {
3014 if ((cntrl_flags & UPL_RET_ONLY_ABSENT) && !dst_page->absent) {
3015 /*
3016 * skip over pages already present in the cache
3017 */
3018 if (user_page_list)
3019 user_page_list[entry].phys_addr = 0;
3020
3021 goto delay_unlock_queues;
3022 }
3023 if (dst_page->cleaning) {
3024 /*
3025 * someone else is writing to the page... wait...
3026 */
3027 delayed_unlock = 0;
3028 vm_page_unlock_queues();
3029
3030 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3031
3032 continue;
3033 }
3034 } else {
3035 if (dst_page->fictitious &&
3036 dst_page->phys_page == vm_page_fictitious_addr) {
3037 assert( !dst_page->speculative);
3038 /*
3039 * dump the fictitious page
3040 */
3041 dst_page->list_req_pending = FALSE;
3042
3043 vm_page_free(dst_page);
3044
3045 dst_page = NULL;
3046 } else if (dst_page->absent) {
3047 /*
3048 * the default_pager case
3049 */
3050 dst_page->list_req_pending = FALSE;
3051 dst_page->busy = FALSE;
3052 }
3053 }
3054 }
3055 if (dst_page == VM_PAGE_NULL) {
3056 if (object->private) {
3057 /*
3058 * This is a nasty wrinkle for users
3059 * of upl who encounter device or
3060 * private memory; however, it is
3061 * unavoidable: only a fault can
3062 * resolve the actual backing
3063 * physical page by asking the
3064 * backing device.
3065 */
3066 if (user_page_list)
3067 user_page_list[entry].phys_addr = 0;
3068
3069 goto delay_unlock_queues;
3070 }
3071 /*
3072 * need to allocate a page
3073 * vm_page_alloc may grab the
3074 * queues lock for a purgeable object
3075 * so drop it
3076 */
3077 delayed_unlock = 0;
3078 vm_page_unlock_queues();
3079
3080 dst_page = vm_page_alloc(object, dst_offset);
3081
3082 if (dst_page == VM_PAGE_NULL) {
3083 if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
3084 /*
3085 * we don't want to stall waiting for pages to come onto the free list
3086 * while we're already holding absent pages in this UPL
3087 * the caller will deal with the empty slots
3088 */
3089 if (user_page_list)
3090 user_page_list[entry].phys_addr = 0;
3091
3092 goto try_next_page;
3093 }
3094 /*
3095 * no pages available... wait
3096 * then try again for the same
3097 * offset...
3098 */
3099 vm_object_unlock(object);
3100 VM_PAGE_WAIT();
3101 vm_object_lock(object);
3102
3103 continue;
3104 }
3105 dst_page->busy = FALSE;
3106 dst_page->absent = TRUE;
3107
3108 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
3109 /*
3110 * if UPL_RET_ONLY_ABSENT was specified,
3111 * then we're definitely setting up an
3112 * upl for a clustered read/pagein
3113 * operation... mark the pages as clustered
3114 * so upl_commit_range can put them on the
3115 * speculative list
3116 */
3117 dst_page->clustered = TRUE;
3118 }
3119 vm_page_lock_queues();
3120 }
3121 /*
3122 * ENCRYPTED SWAP:
3123 */
3124 if (cntrl_flags & UPL_ENCRYPT) {
3125 /*
3126 * The page is going to be encrypted when we
3127 * get it from the pager, so mark it so.
3128 */
3129 dst_page->encrypted = TRUE;
3130 } else {
3131 /*
3132 * Otherwise, the page will not contain
3133 * encrypted data.
3134 */
3135 dst_page->encrypted = FALSE;
3136 }
3137 dst_page->overwriting = TRUE;
3138
3139 if (dst_page->fictitious) {
3140 panic("need corner case for fictitious page");
3141 }
3142 if (dst_page->busy) {
3143 /*
3144 * someone else is playing with the
3145 * page. We will have to wait.
3146 */
3147 delayed_unlock = 0;
3148 vm_page_unlock_queues();
3149
3150 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3151
3152 continue;
3153 }
3154 if (dst_page->pmapped) {
3155 if ( !(cntrl_flags & UPL_FILE_IO))
3156 /*
3157 * eliminate all mappings from the
3158 * original object and its progeny
3159 */
3160 refmod_state = pmap_disconnect(dst_page->phys_page);
3161 else
3162 refmod_state = pmap_get_refmod(dst_page->phys_page);
3163 } else
3164 refmod_state = 0;
3165
3166 hw_dirty = refmod_state & VM_MEM_MODIFIED;
3167 dirty = hw_dirty ? TRUE : dst_page->dirty;
3168
3169 if (cntrl_flags & UPL_SET_LITE) {
3170 int pg_num;
3171
3172 pg_num = (dst_offset-offset)/PAGE_SIZE;
3173 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3174
3175 if (hw_dirty)
3176 pmap_clear_modify(dst_page->phys_page);
3177
3178 /*
3179 * Mark original page as cleaning
3180 * in place.
3181 */
3182 dst_page->cleaning = TRUE;
3183 dst_page->precious = FALSE;
3184 } else {
3185 /*
3186 * use pageclean setup, it is more
3187 * convenient even for the pageout
3188 * cases here
3189 */
3190 vm_object_lock(upl->map_object);
3191 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3192 vm_object_unlock(upl->map_object);
3193
3194 alias_page->absent = FALSE;
3195 alias_page = NULL;
3196 }
3197
3198 if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
3199 /*
3200 * clean in place for read implies
3201 * that a write will be done on all
3202 * the pages that are dirty before
3203 * a upl commit is done. The caller
3204 * is obligated to preserve the
3205 * contents of all pages marked dirty
3206 */
3207 upl->flags |= UPL_CLEAR_DIRTY;
3208 }
3209 dst_page->dirty = dirty;
3210
3211 if (!dirty)
3212 dst_page->precious = TRUE;
3213
3214 if (dst_page->wire_count == 0) {
3215 /*
3216 * deny access to the target page while
3217 * it is being worked on
3218 */
3219 dst_page->busy = TRUE;
3220 } else
3221 vm_page_wire(dst_page);
3222
3223 if (dst_page->clustered) {
3224 /*
3225 * expect the page not to be used
3226 * since it's coming in as part
3227 * of a speculative cluster...
3228 * pages that are 'consumed' will
3229 * get a hardware reference
3230 */
3231 dst_page->reference = FALSE;
3232 } else {
3233 /*
3234 * expect the page to be used
3235 */
3236 dst_page->reference = TRUE;
3237 }
3238 dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? TRUE : FALSE;
3239 }
3240 if (dst_page->phys_page > upl->highest_page)
3241 upl->highest_page = dst_page->phys_page;
3242 if (user_page_list) {
3243 user_page_list[entry].phys_addr = dst_page->phys_page;
3244 user_page_list[entry].dirty = dst_page->dirty;
3245 user_page_list[entry].pageout = dst_page->pageout;
3246 user_page_list[entry].absent = dst_page->absent;
3247 user_page_list[entry].precious = dst_page->precious;
3248
3249 if (dst_page->clustered == TRUE)
3250 user_page_list[entry].speculative = dst_page->speculative;
3251 else
3252 user_page_list[entry].speculative = FALSE;
3253 }
3254 /*
3255 * if UPL_RET_ONLY_ABSENT is set, then
3256 * we are working with a fresh page and we've
3257 * just set the clustered flag on it to
3258 * indicate that it was drug in as part of a
3259 * speculative cluster... so leave it alone
3260 */
3261 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3262 /*
3263 * someone is explicitly grabbing this page...
3264 * update clustered and speculative state
3265 *
3266 */
3267 VM_PAGE_CONSUME_CLUSTERED(dst_page);
3268 }
3269 delay_unlock_queues:
3270 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
3271 mutex_yield(&vm_page_queue_lock);
3272 delayed_unlock = 1;
3273 }
3274 try_next_page:
3275 entry++;
3276 dst_offset += PAGE_SIZE_64;
3277 xfer_size -= PAGE_SIZE;
3278 }
3279 if (alias_page != NULL) {
3280 if (delayed_unlock == 0) {
3281 vm_page_lock_queues();
3282 delayed_unlock++;
3283 }
3284 vm_page_free(alias_page);
3285 }
3286 if (delayed_unlock)
3287 vm_page_unlock_queues();
3288
3289 if (page_list_count != NULL) {
3290 if (upl->flags & UPL_INTERNAL)
3291 *page_list_count = 0;
3292 else if (*page_list_count > entry)
3293 *page_list_count = entry;
3294 }
3295 vm_object_unlock(object);
3296
3297 return KERN_SUCCESS;
3298 }
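/*
 * Illustrative caller sketch for the routine above; 'object', 'off' and
 * 'len' are hypothetical, and error handling plus the eventual commit
 * or abort of the range are elided.  It builds an internal, lite UPL
 * for pageout and picks up the embedded page list with
 * UPL_GET_INTERNAL_PAGE_LIST().
 */
#if 0
{
	upl_t		upl;
	upl_page_info_t	*pl;
	unsigned int	count = MAX_UPL_TRANSFER;
	kern_return_t	kr;

	kr = vm_object_upl_request(object, off, len, &upl, NULL, &count,
				   UPL_SET_INTERNAL | UPL_SET_LITE |
				   UPL_COPYOUT_FROM | UPL_FOR_PAGEOUT);
	if (kr == KERN_SUCCESS) {
		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
		/* hand the pages described by pl[] to the pager,
		 * then commit or abort the range and drop the UPL */
		upl_deallocate(upl);
	}
}
#endif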
3299
3300 /* JMM - Backward compatibility for now */
3301 kern_return_t
3302 vm_fault_list_request( /* forward */
3303 memory_object_control_t control,
3304 vm_object_offset_t offset,
3305 upl_size_t size,
3306 upl_t *upl_ptr,
3307 upl_page_info_t **user_page_list_ptr,
3308 unsigned int page_list_count,
3309 int cntrl_flags);
3310 kern_return_t
3311 vm_fault_list_request(
3312 memory_object_control_t control,
3313 vm_object_offset_t offset,
3314 upl_size_t size,
3315 upl_t *upl_ptr,
3316 upl_page_info_t **user_page_list_ptr,
3317 unsigned int page_list_count,
3318 int cntrl_flags)
3319 {
3320 unsigned int local_list_count;
3321 upl_page_info_t *user_page_list;
3322 kern_return_t kr;
3323
3324 if (user_page_list_ptr != NULL) {
3325 local_list_count = page_list_count;
3326 user_page_list = *user_page_list_ptr;
3327 } else {
3328 local_list_count = 0;
3329 user_page_list = NULL;
3330 }
3331 kr = memory_object_upl_request(control,
3332 offset,
3333 size,
3334 upl_ptr,
3335 user_page_list,
3336 &local_list_count,
3337 cntrl_flags);
3338
3339 if(kr != KERN_SUCCESS)
3340 return kr;
3341
3342 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3343 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3344 }
3345
3346 return KERN_SUCCESS;
3347 }
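/*
 * vm_fault_list_request() is a thin backward-compatibility wrapper: it
 * forwards to memory_object_upl_request() and, when the caller asked
 * for an internal page list (UPL_INTERNAL), hands back a pointer to the
 * list embedded in the returned UPL.
 */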
3348
3349
3350
3351 /*
3352 * Routine: vm_object_super_upl_request
3353 * Purpose:
3354 * Cause the population of a portion of a vm_object
3355 * in much the same way as memory_object_upl_request.
3356 * Depending on the nature of the request, the pages
3357 * returned may contain valid data or be uninitialized.
3358 * However, the region may be expanded up to the super
3359 * cluster size provided.
3360 */
3361
3362 __private_extern__ kern_return_t
3363 vm_object_super_upl_request(
3364 vm_object_t object,
3365 vm_object_offset_t offset,
3366 upl_size_t size,
3367 upl_size_t super_cluster,
3368 upl_t *upl,
3369 upl_page_info_t *user_page_list,
3370 unsigned int *page_list_count,
3371 int cntrl_flags)
3372 {
3373 if (object->paging_offset > offset)
3374 return KERN_FAILURE;
3375
3376 assert(object->paging_in_progress);
3377 offset = offset - object->paging_offset;
3378
3379 if (super_cluster > size) {
3380
3381 vm_object_offset_t base_offset;
3382 upl_size_t super_size;
3383
3384 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
3385 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
3386 super_size = ((base_offset + super_size) > object->size) ? (object->size - base_offset) : super_size;
3387
3388 if (offset > (base_offset + super_size)) {
3389 panic("vm_object_super_upl_request: Missed target pageout"
3390 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3391 offset, base_offset, super_size, super_cluster,
3392 size, object->paging_offset);
3393 }
3394 /*
3395 * apparently there is a case where the vm requests a
3396 * page to be written out whose offset is beyond the
3397 * object size
3398 */
3399 if ((offset + size) > (base_offset + super_size))
3400 super_size = (offset + size) - base_offset;
3401
3402 offset = base_offset;
3403 size = super_size;
3404 }
3405 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
3406 }
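/*
 * The super-cluster expansion above aligns the request down to a
 * super_cluster boundary, doubles the window if the original range
 * straddles the boundary, and clips the result to the object size
 * before delegating to vm_object_upl_request() for the actual
 * population.
 */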
3407
3408
3409 kern_return_t
3410 vm_map_create_upl(
3411 vm_map_t map,
3412 vm_map_address_t offset,
3413 upl_size_t *upl_size,
3414 upl_t *upl,
3415 upl_page_info_array_t page_list,
3416 unsigned int *count,
3417 int *flags)
3418 {
3419 vm_map_entry_t entry;
3420 int caller_flags;
3421 int force_data_sync;
3422 int sync_cow_data;
3423 vm_object_t local_object;
3424 vm_map_offset_t local_offset;
3425 vm_map_offset_t local_start;
3426 kern_return_t ret;
3427
3428 caller_flags = *flags;
3429
3430 if (caller_flags & ~UPL_VALID_FLAGS) {
3431 /*
3432 * For forward compatibility's sake,
3433 * reject any unknown flag.
3434 */
3435 return KERN_INVALID_VALUE;
3436 }
3437 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3438 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3439
3440 if (upl == NULL)
3441 return KERN_INVALID_ARGUMENT;
3442
3443 REDISCOVER_ENTRY:
3444 vm_map_lock(map);
3445
3446 if (vm_map_lookup_entry(map, offset, &entry)) {
3447
3448 if ((entry->vme_end - offset) < *upl_size)
3449 *upl_size = entry->vme_end - offset;
3450
3451 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3452 *flags = 0;
3453
3454 if (entry->object.vm_object != VM_OBJECT_NULL) {
3455 if (entry->object.vm_object->private)
3456 *flags = UPL_DEV_MEMORY;
3457
3458 if (entry->object.vm_object->phys_contiguous)
3459 *flags |= UPL_PHYS_CONTIG;
3460 }
3461 vm_map_unlock(map);
3462
3463 return KERN_SUCCESS;
3464 }
3465 if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
3466 if ((*upl_size/page_size) > MAX_UPL_TRANSFER)
3467 *upl_size = MAX_UPL_TRANSFER * page_size;
3468 }
3469 /*
3470 * Create an object if necessary.
3471 */
3472 if (entry->object.vm_object == VM_OBJECT_NULL) {
3473 entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
3474 entry->offset = 0;
3475 }
3476 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3477 if (!(entry->protection & VM_PROT_WRITE)) {
3478 vm_map_unlock(map);
3479 return KERN_PROTECTION_FAILURE;
3480 }
3481 if (entry->needs_copy) {
3482 vm_map_t local_map;
3483 vm_object_t object;
3484 vm_object_offset_t new_offset;
3485 vm_prot_t prot;
3486 boolean_t wired;
3487 vm_map_version_t version;
3488 vm_map_t real_map;
3489
3490 local_map = map;
3491 vm_map_lock_write_to_read(map);
3492
3493 if (vm_map_lookup_locked(&local_map,
3494 offset, VM_PROT_WRITE,
3495 OBJECT_LOCK_EXCLUSIVE,
3496 &version, &object,
3497 &new_offset, &prot, &wired,
3498 NULL,
3499 &real_map)) {
3500 vm_map_unlock(local_map);
3501 return KERN_FAILURE;
3502 }
3503 if (real_map != map)
3504 vm_map_unlock(real_map);
3505 vm_object_unlock(object);
3506 vm_map_unlock(local_map);
3507
3508 goto REDISCOVER_ENTRY;
3509 }
3510 }
3511 if (entry->is_sub_map) {
3512 vm_map_t submap;
3513
3514 submap = entry->object.sub_map;
3515 local_start = entry->vme_start;
3516 local_offset = entry->offset;
3517
3518 vm_map_reference(submap);
3519 vm_map_unlock(map);
3520
3521 ret = vm_map_create_upl(submap,
3522 local_offset + (offset - local_start),
3523 upl_size, upl, page_list, count, flags);
3524 vm_map_deallocate(submap);
3525
3526 return ret;
3527 }
3528 if (sync_cow_data) {
3529 if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
3530 local_object = entry->object.vm_object;
3531 local_start = entry->vme_start;
3532 local_offset = entry->offset;
3533
3534 vm_object_reference(local_object);
3535 vm_map_unlock(map);
3536
3537 if (entry->object.vm_object->shadow && entry->object.vm_object->copy) {
3538 vm_object_lock_request(
3539 local_object->shadow,
3540 (vm_object_offset_t)
3541 ((offset - local_start) +
3542 local_offset) +
3543 local_object->shadow_offset,
3544 *upl_size, FALSE,
3545 MEMORY_OBJECT_DATA_SYNC,
3546 VM_PROT_NO_CHANGE);
3547 }
3548 sync_cow_data = FALSE;
3549 vm_object_deallocate(local_object);
3550
3551 goto REDISCOVER_ENTRY;
3552 }
3553 }
3554 if (force_data_sync) {
3555 local_object = entry->object.vm_object;
3556 local_start = entry->vme_start;
3557 local_offset = entry->offset;
3558
3559 vm_object_reference(local_object);
3560 vm_map_unlock(map);
3561
3562 vm_object_lock_request(
3563 local_object,
3564 (vm_object_offset_t)
3565 ((offset - local_start) + local_offset),
3566 (vm_object_size_t)*upl_size, FALSE,
3567 MEMORY_OBJECT_DATA_SYNC,
3568 VM_PROT_NO_CHANGE);
3569
3570 force_data_sync = FALSE;
3571 vm_object_deallocate(local_object);
3572
3573 goto REDISCOVER_ENTRY;
3574 }
3575 if (entry->object.vm_object->private)
3576 *flags = UPL_DEV_MEMORY;
3577 else
3578 *flags = 0;
3579
3580 if (entry->object.vm_object->phys_contiguous)
3581 *flags |= UPL_PHYS_CONTIG;
3582
3583 local_object = entry->object.vm_object;
3584 local_offset = entry->offset;
3585 local_start = entry->vme_start;
3586
3587 vm_object_reference(local_object);
3588 vm_map_unlock(map);
3589
3590 ret = vm_object_iopl_request(local_object,
3591 (vm_object_offset_t) ((offset - local_start) + local_offset),
3592 *upl_size,
3593 upl,
3594 page_list,
3595 count,
3596 caller_flags);
3597 vm_object_deallocate(local_object);
3598
3599 return(ret);
3600 }
3601 vm_map_unlock(map);
3602
3603 return(KERN_FAILURE);
3604 }
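/*
 * vm_map_create_upl() resolves the map entry covering 'offset'
 * (recursing into submaps, forcing a copy-on-write push for writable
 * requests on needs_copy entries, and optionally syncing shadowed or
 * dirty data first), then builds the UPL against the backing VM object
 * with vm_object_iopl_request().  UPL_QUERY_OBJECT_TYPE short-circuits
 * all of that and just reports whether the backing memory is device
 * and/or physically contiguous.
 */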
3605
3606 /*
3607 * Internal routine to enter a UPL into a VM map.
3608 *
3609 * JMM - This should just be doable through the standard
3610 * vm_map_enter() API.
3611 */
3612 kern_return_t
3613 vm_map_enter_upl(
3614 vm_map_t map,
3615 upl_t upl,
3616 vm_map_offset_t *dst_addr)
3617 {
3618 vm_map_size_t size;
3619 vm_object_offset_t offset;
3620 vm_map_offset_t addr;
3621 vm_page_t m;
3622 kern_return_t kr;
3623
3624 if (upl == UPL_NULL)
3625 return KERN_INVALID_ARGUMENT;
3626
3627 upl_lock(upl);
3628
3629 /*
3630 * check to see if already mapped
3631 */
3632 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
3633 upl_unlock(upl);
3634 return KERN_FAILURE;
3635 }
3636
3637 if ((!(upl->flags & UPL_SHADOWED)) && !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3638 (upl->map_object->phys_contiguous))) {
3639 vm_object_t object;
3640 vm_page_t alias_page;
3641 vm_object_offset_t new_offset;
3642 int pg_num;
3643 wpl_array_t lite_list;
3644
3645 if (upl->flags & UPL_INTERNAL) {
3646 lite_list = (wpl_array_t)
3647 ((((uintptr_t)upl) + sizeof(struct upl))
3648 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3649 } else {
3650 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
3651 }
3652 object = upl->map_object;
3653 upl->map_object = vm_object_allocate(upl->size);
3654
3655 vm_object_lock(upl->map_object);
3656
3657 upl->map_object->shadow = object;
3658 upl->map_object->pageout = TRUE;
3659 upl->map_object->can_persist = FALSE;
3660 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3661 upl->map_object->shadow_offset = upl->offset - object->paging_offset;
3662 upl->map_object->wimg_bits = object->wimg_bits;
3663 offset = upl->map_object->shadow_offset;
3664 new_offset = 0;
3665 size = upl->size;
3666
3667 upl->flags |= UPL_SHADOWED;
3668
3669 while (size) {
3670 pg_num = (new_offset)/PAGE_SIZE;
3671
3672 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3673
3674 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3675
3676 vm_object_lock(object);
3677
3678 m = vm_page_lookup(object, offset);
3679 if (m == VM_PAGE_NULL) {
3680 panic("vm_upl_map: page missing\n");
3681 }
3682
3683 /*
3684 * Convert the fictitious page to a private
3685 * shadow of the real page.
3686 */
3687 assert(alias_page->fictitious);
3688 alias_page->fictitious = FALSE;
3689 alias_page->private = TRUE;
3690 alias_page->pageout = TRUE;
3691 /*
3692 * since m is a page in the upl it must
3693 * already be wired or BUSY, so it's
3694 * safe to assign the underlying physical
3695 * page to the alias
3696 */
3697 alias_page->phys_page = m->phys_page;
3698
3699 vm_object_unlock(object);
3700
3701 vm_page_lockspin_queues();
3702 vm_page_wire(alias_page);
3703 vm_page_unlock_queues();
3704
3705 /*
3706 * ENCRYPTED SWAP:
3707 * The virtual page ("m") has to be wired in some way
3708 * here or its physical page ("m->phys_page") could
3709 * be recycled at any time.
3710 * Assuming this is enforced by the caller, we can't
3711 * get an encrypted page here. Since the encryption
3712 * key depends on the VM page's "pager" object and
3713 * the "paging_offset", we couldn't handle 2 pageable
3714 * VM pages (with different pagers and paging_offsets)
3715 * sharing the same physical page: we could end up
3716 * encrypting with one key (via one VM page) and
3717 * decrypting with another key (via the alias VM page).
3718 */
3719 ASSERT_PAGE_DECRYPTED(m);
3720
3721 vm_page_insert(alias_page, upl->map_object, new_offset);
3722
3723 assert(!alias_page->wanted);
3724 alias_page->busy = FALSE;
3725 alias_page->absent = FALSE;
3726 }
3727 size -= PAGE_SIZE;
3728 offset += PAGE_SIZE_64;
3729 new_offset += PAGE_SIZE_64;
3730 }
3731 vm_object_unlock(upl->map_object);
3732 }
3733 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3734 offset = upl->offset - upl->map_object->paging_offset;
3735 else
3736 offset = 0;
3737 size = upl->size;
3738
3739 vm_object_reference(upl->map_object);
3740
3741 *dst_addr = 0;
3742 /*
3743 * NEED A UPL_MAP ALIAS
3744 */
3745 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3746 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3747 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3748
3749 if (kr != KERN_SUCCESS) {
3750 upl_unlock(upl);
3751 return(kr);
3752 }
3753 vm_object_lock(upl->map_object);
3754
3755 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
3756 m = vm_page_lookup(upl->map_object, offset);
3757
3758 if (m) {
3759 unsigned int cache_attr;
3760 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3761
3762 m->pmapped = TRUE;
3763
3764 PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE);
3765 }
3766 offset += PAGE_SIZE_64;
3767 }
3768 vm_object_unlock(upl->map_object);
3769
3770 /*
3771 * hold a reference for the mapping
3772 */
3773 upl->ref_count++;
3774 upl->flags |= UPL_PAGE_LIST_MAPPED;
3775 upl->kaddr = *dst_addr;
3776 upl_unlock(upl);
3777
3778 return KERN_SUCCESS;
3779 }
3780
3781 /*
3782 * Internal routine to remove a UPL mapping from a VM map.
3783 *
3784 * XXX - This should just be doable through a standard
3785 * vm_map_remove() operation. Otherwise, implicit clean-up
3786 * of the target map won't be able to correctly remove
3787 * these (and release the reference on the UPL). Having
3788 * to do this means we can't map these into user-space
3789 * maps yet.
3790 */
3791 kern_return_t
3792 vm_map_remove_upl(
3793 vm_map_t map,
3794 upl_t upl)
3795 {
3796 vm_address_t addr;
3797 upl_size_t size;
3798
3799 if (upl == UPL_NULL)
3800 return KERN_INVALID_ARGUMENT;
3801
3802 upl_lock(upl);
3803
3804 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
3805 addr = upl->kaddr;
3806 size = upl->size;
3807
3808 assert(upl->ref_count > 1);
3809 upl->ref_count--; /* removing mapping ref */
3810
3811 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3812 upl->kaddr = (vm_offset_t) 0;
3813 upl_unlock(upl);
3814
3815 vm_map_remove(map,
3816 vm_map_trunc_page(addr),
3817 vm_map_round_page(addr + size),
3818 VM_MAP_NO_FLAGS);
3819
3820 return KERN_SUCCESS;
3821 }
3822 upl_unlock(upl);
3823
3824 return KERN_FAILURE;
3825 }
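/*
 * Usage sketch (illustrative only): pairing the UPL mapping routine
 * above (vm_upl_map, per the panic string it contains) with
 * vm_map_remove_upl().  The vm_upl_map() prototype and the helper name
 * shown here are assumptions, not taken from this listing.
 */
#if 0	/* example only -- never compiled */
static kern_return_t
upl_map_example(upl_t upl)
{
	vm_map_offset_t	kaddr = 0;	/* receives the kernel mapping address */
	kern_return_t	kr;

	kr = vm_upl_map(kernel_map, upl, &kaddr);	/* takes a mapping ref on the UPL */
	if (kr != KERN_SUCCESS)
		return kr;

	/* ... access the UPL's pages through kaddr ... */

	return vm_map_remove_upl(kernel_map, upl);	/* drops the mapping and its ref */
}
#endif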
3826
3827 kern_return_t
3828 upl_commit_range(
3829 upl_t upl,
3830 upl_offset_t offset,
3831 upl_size_t size,
3832 int flags,
3833 upl_page_info_t *page_list,
3834 mach_msg_type_number_t count,
3835 boolean_t *empty)
3836 {
3837 upl_size_t xfer_size;
3838 vm_object_t shadow_object;
3839 vm_object_t object;
3840 vm_object_offset_t target_offset;
3841 int entry;
3842 wpl_array_t lite_list;
3843 int occupied;
3844 int delayed_unlock = 0;
3845 int clear_refmod = 0;
3846 int pgpgout_count = 0;
3847
3848 *empty = FALSE;
3849
3850 if (upl == UPL_NULL)
3851 return KERN_INVALID_ARGUMENT;
3852
3853 if (count == 0)
3854 page_list = NULL;
3855
3856 if (upl->flags & UPL_DEVICE_MEMORY)
3857 xfer_size = 0;
3858 else if ((offset + size) <= upl->size)
3859 xfer_size = size;
3860 else
3861 return KERN_FAILURE;
3862
3863 upl_lock(upl);
3864
3865 if (upl->flags & UPL_ACCESS_BLOCKED) {
3866 /*
3867 * We used this UPL to block access to the pages by marking
3868 * them "busy". Now we need to clear the "busy" bit to allow
3869 * access to these pages again.
3870 */
3871 flags |= UPL_COMMIT_ALLOW_ACCESS;
3872 }
3873 if (upl->flags & UPL_CLEAR_DIRTY)
3874 flags |= UPL_COMMIT_CLEAR_DIRTY;
3875
3876 if (upl->flags & UPL_INTERNAL)
3877 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
3878 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3879 else
3880 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
3881
3882 object = upl->map_object;
3883
3884 if (upl->flags & UPL_SHADOWED) {
3885 vm_object_lock(object);
3886 shadow_object = object->shadow;
3887 } else {
3888 shadow_object = object;
3889 }
3890 vm_object_lock(shadow_object);
3891
3892 entry = offset/PAGE_SIZE;
3893 target_offset = (vm_object_offset_t)offset;
3894
3895 while (xfer_size) {
3896 vm_page_t t, m;
3897
3898 if (delayed_unlock == 0)
3899 vm_page_lock_queues();
3900
3901 m = VM_PAGE_NULL;
3902
3903 if (upl->flags & UPL_LITE) {
3904 int pg_num;
3905
3906 pg_num = target_offset/PAGE_SIZE;
3907
3908 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3909 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3910
3911 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
3912 }
3913 }
3914 if (upl->flags & UPL_SHADOWED) {
3915 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
3916
3917 t->pageout = FALSE;
3918
3919 vm_page_free(t);
3920
3921 if (m == VM_PAGE_NULL)
3922 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
3923 }
3924 }
3925 if (m != VM_PAGE_NULL) {
3926
3927 clear_refmod = 0;
3928
3929 if (upl->flags & UPL_IO_WIRE) {
3930
3931 vm_page_unwire(m);
3932
3933 if (page_list)
3934 page_list[entry].phys_addr = 0;
3935
3936 if (flags & UPL_COMMIT_SET_DIRTY)
3937 m->dirty = TRUE;
3938 else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3939 m->dirty = FALSE;
3940 clear_refmod |= VM_MEM_MODIFIED;
3941 }
3942 if (flags & UPL_COMMIT_INACTIVATE)
3943 vm_page_deactivate(m);
3944
3945 if (clear_refmod)
3946 pmap_clear_refmod(m->phys_page, clear_refmod);
3947
3948 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3949 /*
3950 * We blocked access to the pages in this UPL.
3951 * Clear the "busy" bit and wake up any waiter
3952 * for this page.
3953 */
3954 PAGE_WAKEUP_DONE(m);
3955 }
3956 goto commit_next_page;
3957 }
3958 /*
3959 * make sure to clear the hardware
3960 * modify or reference bits before
3961 * releasing the BUSY bit on this page;
3962 * otherwise we risk losing a legitimate
3963 * change of state
3964 */
3965 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3966 m->dirty = FALSE;
3967 clear_refmod |= VM_MEM_MODIFIED;
3968 }
3969 if (clear_refmod)
3970 pmap_clear_refmod(m->phys_page, clear_refmod);
3971
3972 if (page_list) {
3973 upl_page_info_t *p;
3974
3975 p = &(page_list[entry]);
3976
3977 if (p->phys_addr && p->pageout && !m->pageout) {
3978 m->busy = TRUE;
3979 m->pageout = TRUE;
3980 vm_page_wire(m);
3981 } else if (p->phys_addr &&
3982 !p->pageout && m->pageout &&
3983 !m->dump_cleaning) {
3984 m->pageout = FALSE;
3985 m->absent = FALSE;
3986 m->overwriting = FALSE;
3987 vm_page_unwire(m);
3988
3989 PAGE_WAKEUP_DONE(m);
3990 }
3991 page_list[entry].phys_addr = 0;
3992 }
3993 m->dump_cleaning = FALSE;
3994
3995 if (m->laundry)
3996 vm_pageout_throttle_up(m);
3997
3998 if (m->pageout) {
3999 m->cleaning = FALSE;
4000 m->encrypted_cleaning = FALSE;
4001 m->pageout = FALSE;
4002 #if MACH_CLUSTER_STATS
4003 if (m->wanted) vm_pageout_target_collisions++;
4004 #endif
4005 m->dirty = FALSE;
4006
4007 if (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED))
4008 m->dirty = TRUE;
4009
4010 if (m->dirty) {
4011 /*
4012 * page was re-dirtied after we started
4013 * the pageout... reactivate it since
4014 * we don't know whether the on-disk
4015 * copy matches what is now in memory
4016 */
4017 vm_page_unwire(m);
4018
4019 if (upl->flags & UPL_PAGEOUT) {
4020 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4021 VM_STAT_INCR(reactivations);
4022 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
4023 }
4024 PAGE_WAKEUP_DONE(m);
4025 } else {
4026 /*
4027 * page has been successfully cleaned
4028 * go ahead and free it for other use
4029 */
4030
4031 if (m->object->internal) {
4032 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
4033 } else {
4034 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
4035 }
4036
4037 vm_page_free(m);
4038
4039 if (upl->flags & UPL_PAGEOUT) {
4040 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4041
4042 if (page_list[entry].dirty) {
4043 VM_STAT_INCR(pageouts);
4044 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
4045 pgpgout_count++;
4046 }
4047 }
4048 }
4049 goto commit_next_page;
4050 }
4051 #if MACH_CLUSTER_STATS
4052 if (m->pmapped)
4053 m->dirty = pmap_is_modified(m->phys_page);
4054
4055 if (m->dirty) vm_pageout_cluster_dirtied++;
4056 else vm_pageout_cluster_cleaned++;
4057 if (m->wanted) vm_pageout_cluster_collisions++;
4058 #endif
4059 m->dirty = FALSE;
4060
4061 if ((m->busy) && (m->cleaning)) {
4062 /*
4063 * the request_page_list case
4064 */
4065 m->absent = FALSE;
4066 m->overwriting = FALSE;
4067 m->busy = FALSE;
4068 } else if (m->overwriting) {
4069 /*
4070 * alternate request page list, write to
4071 * page_list case. Occurs when the original
4072 * page was wired at the time of the list
4073 * request
4074 */
4075 assert(m->wire_count != 0);
4076 vm_page_unwire(m);/* reactivates */
4077 m->overwriting = FALSE;
4078 }
4079 m->cleaning = FALSE;
4080 m->encrypted_cleaning = FALSE;
4081
4082 /*
4083 * It is part of the semantics of COPYOUT_FROM
4084 * UPLs that a commit implies a cache sync
4085 * between the vm page and the backing store;
4086 * this can be used to strip the precious bit
4087 * as well as to clean
4088 */
4089 if (upl->flags & UPL_PAGE_SYNC_DONE)
4090 m->precious = FALSE;
4091
4092 if (flags & UPL_COMMIT_SET_DIRTY)
4093 m->dirty = TRUE;
4094
4095 if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
4096 vm_page_deactivate(m);
4097 } else if (!m->active && !m->inactive && !m->speculative) {
4098
4099 if (m->clustered)
4100 vm_page_speculate(m, TRUE);
4101 else if (m->reference)
4102 vm_page_activate(m);
4103 else
4104 vm_page_deactivate(m);
4105 }
4106 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4107 /*
4108 * We blocked access to the pages in this UPL.
4109 * Clear the "busy" bit on this page before we
4110 * wake up any waiter.
4111 */
4112 m->busy = FALSE;
4113 }
4114 /*
4115 * Wake up any thread waiting for this page to finish being cleaned.
4116 */
4117 PAGE_WAKEUP(m);
4118 }
4119 commit_next_page:
4120 target_offset += PAGE_SIZE_64;
4121 xfer_size -= PAGE_SIZE;
4122 entry++;
4123
4124 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4125 mutex_yield(&vm_page_queue_lock);
4126 delayed_unlock = 1;
4127 }
4128 }
4129 if (delayed_unlock)
4130 vm_page_unlock_queues();
4131
4132 occupied = 1;
4133
4134 if (upl->flags & UPL_DEVICE_MEMORY) {
4135 occupied = 0;
4136 } else if (upl->flags & UPL_LITE) {
4137 int pg_num;
4138 int i;
4139
4140 pg_num = upl->size/PAGE_SIZE;
4141 pg_num = (pg_num + 31) >> 5;
4142 occupied = 0;
4143
4144 for (i = 0; i < pg_num; i++) {
4145 if (lite_list[i] != 0) {
4146 occupied = 1;
4147 break;
4148 }
4149 }
4150 } else {
4151 if (queue_empty(&upl->map_object->memq))
4152 occupied = 0;
4153 }
4154 if (occupied == 0) {
4155 if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4156 *empty = TRUE;
4157
4158 if (object == shadow_object) {
4159 /*
4160 * this is not a paging object
4161 * so we need to drop the paging reference
4162 * that was taken when we created the UPL
4163 * against this object
4164 */
4165 vm_object_paging_end(shadow_object);
4166 } else {
4167 /*
4168 * we donated the paging reference to
4169 * the map object... vm_pageout_object_terminate
4170 * will drop this reference
4171 */
4172 }
4173 }
4174 vm_object_unlock(shadow_object);
4175 if (object != shadow_object)
4176 vm_object_unlock(object);
4177 upl_unlock(upl);
4178
4179 if (pgpgout_count) {
4180 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
4181 }
4182
4183 return KERN_SUCCESS;
4184 }
4185
4186 kern_return_t
4187 upl_abort_range(
4188 upl_t upl,
4189 upl_offset_t offset,
4190 upl_size_t size,
4191 int error,
4192 boolean_t *empty)
4193 {
4194 upl_size_t xfer_size;
4195 vm_object_t shadow_object;
4196 vm_object_t object;
4197 vm_object_offset_t target_offset;
4198 int entry;
4199 wpl_array_t lite_list;
4200 int occupied;
4201 int delayed_unlock = 0;
4202
4203 *empty = FALSE;
4204
4205 if (upl == UPL_NULL)
4206 return KERN_INVALID_ARGUMENT;
4207
4208 if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
4209 return upl_commit_range(upl, offset, size, 0, NULL, 0, empty);
4210
4211 if (upl->flags & UPL_DEVICE_MEMORY)
4212 xfer_size = 0;
4213 else if ((offset + size) <= upl->size)
4214 xfer_size = size;
4215 else
4216 return KERN_FAILURE;
4217
4218 upl_lock(upl);
4219
4220 if (upl->flags & UPL_INTERNAL) {
4221 lite_list = (wpl_array_t)
4222 ((((uintptr_t)upl) + sizeof(struct upl))
4223 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4224 } else {
4225 lite_list = (wpl_array_t)
4226 (((uintptr_t)upl) + sizeof(struct upl));
4227 }
4228 object = upl->map_object;
4229
4230 if (upl->flags & UPL_SHADOWED) {
4231 vm_object_lock(object);
4232 shadow_object = object->shadow;
4233 } else
4234 shadow_object = object;
4235
4236 vm_object_lock(shadow_object);
4237
4238 entry = offset/PAGE_SIZE;
4239 target_offset = (vm_object_offset_t)offset;
4240
4241 while (xfer_size) {
4242 vm_page_t t, m;
4243
4244 if (delayed_unlock == 0)
4245 vm_page_lock_queues();
4246
4247 m = VM_PAGE_NULL;
4248
4249 if (upl->flags & UPL_LITE) {
4250 int pg_num;
4251 pg_num = target_offset/PAGE_SIZE;
4252
4253 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4254 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4255
4256 m = vm_page_lookup(shadow_object, target_offset +
4257 (upl->offset - shadow_object->paging_offset));
4258 }
4259 }
4260 if (upl->flags & UPL_SHADOWED) {
4261 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4262 t->pageout = FALSE;
4263
4264 vm_page_free(t);
4265
4266 if (m == VM_PAGE_NULL)
4267 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4268 }
4269 }
4270 if (m != VM_PAGE_NULL) {
4271
4272 if (m->absent) {
4273 boolean_t must_free = TRUE;
4274
4275 m->clustered = FALSE;
4276 /*
4277 * COPYOUT = FALSE case
4278 * check for error conditions which must
4279 * be passed back to the pages customer
4280 */
4281 if (error & UPL_ABORT_RESTART) {
4282 m->restart = TRUE;
4283 m->absent = FALSE;
4284 m->error = TRUE;
4285 m->unusual = TRUE;
4286 must_free = FALSE;
4287 } else if (error & UPL_ABORT_UNAVAILABLE) {
4288 m->restart = FALSE;
4289 m->unusual = TRUE;
4290 must_free = FALSE;
4291 } else if (error & UPL_ABORT_ERROR) {
4292 m->restart = FALSE;
4293 m->absent = FALSE;
4294 m->error = TRUE;
4295 m->unusual = TRUE;
4296 must_free = FALSE;
4297 }
4298
4299 /*
4300 * ENCRYPTED SWAP:
4301 * If the page was already encrypted,
4302 * we don't really need to decrypt it
4303 * now. It will get decrypted later,
4304 * on demand, as soon as someone needs
4305 * to access its contents.
4306 */
4307
4308 m->cleaning = FALSE;
4309 m->encrypted_cleaning = FALSE;
4310 m->overwriting = FALSE;
4311 PAGE_WAKEUP_DONE(m);
4312
4313 if (must_free == TRUE)
4314 vm_page_free(m);
4315 else
4316 vm_page_activate(m);
4317 } else {
4318 /*
4319 * Handle the trusted pager throttle.
4320 */
4321 if (m->laundry)
4322 vm_pageout_throttle_up(m);
4323
4324 if (m->pageout) {
4325 assert(m->busy);
4326 assert(m->wire_count == 1);
4327 m->pageout = FALSE;
4328 vm_page_unwire(m);
4329 }
4330 m->dump_cleaning = FALSE;
4331 m->cleaning = FALSE;
4332 m->encrypted_cleaning = FALSE;
4333 m->overwriting = FALSE;
4334 #if MACH_PAGEMAP
4335 vm_external_state_clr(m->object->existence_map, m->offset);
4336 #endif /* MACH_PAGEMAP */
4337 if (error & UPL_ABORT_DUMP_PAGES) {
4338 pmap_disconnect(m->phys_page);
4339 vm_page_free(m);
4340 } else {
4341 if (error & UPL_ABORT_REFERENCE) {
4342 /*
4343 * we've been told to explicitly
4344 * reference this page... for
4345 * file I/O, this is done by
4346 * implementing an LRU on the inactive q
4347 */
4348 vm_page_lru(m);
4349 }
4350 PAGE_WAKEUP_DONE(m);
4351 }
4352 }
4353 }
4354 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4355 mutex_yield(&vm_page_queue_lock);
4356 delayed_unlock = 1;
4357 }
4358 target_offset += PAGE_SIZE_64;
4359 xfer_size -= PAGE_SIZE;
4360 entry++;
4361 }
4362 if (delayed_unlock)
4363 vm_page_unlock_queues();
4364
4365 occupied = 1;
4366
4367 if (upl->flags & UPL_DEVICE_MEMORY) {
4368 occupied = 0;
4369 } else if (upl->flags & UPL_LITE) {
4370 int pg_num;
4371 int i;
4372
4373 pg_num = upl->size/PAGE_SIZE;
4374 pg_num = (pg_num + 31) >> 5;
4375 occupied = 0;
4376
4377 for (i = 0; i < pg_num; i++) {
4378 if (lite_list[i] != 0) {
4379 occupied = 1;
4380 break;
4381 }
4382 }
4383 } else {
4384 if (queue_empty(&upl->map_object->memq))
4385 occupied = 0;
4386 }
4387 if (occupied == 0) {
4388 if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4389 *empty = TRUE;
4390
4391 if (object == shadow_object) {
4392 /*
4393 * this is not a paging object
4394 * so we need to drop the paging reference
4395 * that was taken when we created the UPL
4396 * against this object
4397 */
4398 vm_object_paging_end(shadow_object);
4399 } else {
4400 /*
4401 * we donated the paging reference to
4402 * the map object... vm_pageout_object_terminate
4403 * will drop this reference
4404 */
4405 }
4406 }
4407 vm_object_unlock(shadow_object);
4408 if (object != shadow_object)
4409 vm_object_unlock(object);
4410 upl_unlock(upl);
4411
4412 return KERN_SUCCESS;
4413 }
4414
4415
4416 kern_return_t
4417 upl_abort(
4418 upl_t upl,
4419 int error)
4420 {
4421 boolean_t empty;
4422
4423 return upl_abort_range(upl, 0, upl->size, error, &empty);
4424 }
4425
4426
4427 /* an option on commit should be wire */
4428 kern_return_t
4429 upl_commit(
4430 upl_t upl,
4431 upl_page_info_t *page_list,
4432 mach_msg_type_number_t count)
4433 {
4434 boolean_t empty;
4435
4436 return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
4437 }
4438
4439
4440 kern_return_t
4441 vm_object_iopl_request(
4442 vm_object_t object,
4443 vm_object_offset_t offset,
4444 upl_size_t size,
4445 upl_t *upl_ptr,
4446 upl_page_info_array_t user_page_list,
4447 unsigned int *page_list_count,
4448 int cntrl_flags)
4449 {
4450 vm_page_t dst_page;
4451 vm_object_offset_t dst_offset;
4452 upl_size_t xfer_size;
4453 upl_t upl = NULL;
4454 unsigned int entry;
4455 wpl_array_t lite_list = NULL;
4456 int delayed_unlock = 0;
4457 int no_zero_fill = FALSE;
4458 u_int32_t psize;
4459 kern_return_t ret;
4460 vm_prot_t prot;
4461 struct vm_object_fault_info fault_info;
4462
4463
4464 if (cntrl_flags & ~UPL_VALID_FLAGS) {
4465 /*
4466 * For forward compatibility's sake,
4467 * reject any unknown flag.
4468 */
4469 return KERN_INVALID_VALUE;
4470 }
4471 if (vm_lopage_poolsize == 0)
4472 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4473
4474 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4475 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4476 return KERN_INVALID_VALUE;
4477
4478 if (object->phys_contiguous) {
4479 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4480 return KERN_INVALID_ADDRESS;
4481
4482 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4483 return KERN_INVALID_ADDRESS;
4484 }
4485 }
4486
4487 if (cntrl_flags & UPL_ENCRYPT) {
4488 /*
4489 * ENCRYPTED SWAP:
4490 * The paging path doesn't use this interface,
4491 * so we don't support the UPL_ENCRYPT flag
4492 * here. We won't encrypt the pages.
4493 */
4494 assert(! (cntrl_flags & UPL_ENCRYPT));
4495 }
4496 if (cntrl_flags & UPL_NOZEROFILL)
4497 no_zero_fill = TRUE;
4498
4499 if (cntrl_flags & UPL_COPYOUT_FROM)
4500 prot = VM_PROT_READ;
4501 else
4502 prot = VM_PROT_READ | VM_PROT_WRITE;
4503
4504 if (((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous)
4505 size = MAX_UPL_TRANSFER * page_size;
4506
4507 if (cntrl_flags & UPL_SET_INTERNAL) {
4508 if (page_list_count != NULL)
4509 *page_list_count = MAX_UPL_TRANSFER;
4510 }
4511 if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4512 ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
4513 return KERN_INVALID_ARGUMENT;
4514
4515 if ((!object->internal) && (object->paging_offset != 0))
4516 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
4517
4518
4519 if (object->phys_contiguous)
4520 psize = PAGE_SIZE;
4521 else
4522 psize = size;
4523
4524 if (cntrl_flags & UPL_SET_INTERNAL) {
4525 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4526
4527 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
4528 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
4529 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
4530 } else {
4531 upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4532
4533 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
4534 }
4535 if (user_page_list)
4536 user_page_list[0].device = FALSE;
4537 *upl_ptr = upl;
4538
4539 upl->map_object = object;
4540 upl->size = size;
4541
4542 vm_object_lock(object);
4543 vm_object_paging_begin(object);
4544 /*
4545 * paging in progress also protects the paging_offset
4546 */
4547 upl->offset = offset + object->paging_offset;
4548
4549 if (object->phys_contiguous) {
4550 #ifdef UPL_DEBUG
4551 queue_enter(&object->uplq, upl, upl_t, uplq);
4552 #endif /* UPL_DEBUG */
4553
4554 vm_object_unlock(object);
4555
4556 /*
4557 * don't need any shadow mappings for this one
4558 * since it is already I/O memory
4559 */
4560 upl->flags |= UPL_DEVICE_MEMORY;
4561
4562 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4563
4564 if (user_page_list) {
4565 user_page_list[0].phys_addr = (offset + object->shadow_offset)>>PAGE_SHIFT;
4566 user_page_list[0].device = TRUE;
4567 }
4568 if (page_list_count != NULL) {
4569 if (upl->flags & UPL_INTERNAL)
4570 *page_list_count = 0;
4571 else
4572 *page_list_count = 1;
4573 }
4574 return KERN_SUCCESS;
4575 }
4576 /*
4577 * Protect user space from future COW operations
4578 */
4579 object->true_share = TRUE;
4580
4581 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4582 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4583
4584 #ifdef UPL_DEBUG
4585 queue_enter(&object->uplq, upl, upl_t, uplq);
4586 #endif /* UPL_DEBUG */
4587
4588 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4589 /*
4590 * The user requested that access to the pages in this UPL
4591 * be blocked until the UPL is committed or aborted.
4592 */
4593 upl->flags |= UPL_ACCESS_BLOCKED;
4594 }
4595 entry = 0;
4596
4597 xfer_size = size;
4598 dst_offset = offset;
4599
4600 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
4601 fault_info.user_tag = 0;
4602 fault_info.lo_offset = offset;
4603 fault_info.hi_offset = offset + xfer_size;
4604 fault_info.no_cache = FALSE;
4605
4606 while (xfer_size) {
4607 vm_fault_return_t result;
4608 int pg_num;
4609
4610 dst_page = vm_page_lookup(object, dst_offset);
4611
4612 /*
4613 * ENCRYPTED SWAP:
4614 * If the page is encrypted, we need to decrypt it,
4615 * so force a soft page fault.
4616 */
4617 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4618 (dst_page->encrypted) ||
4619 (dst_page->unusual && (dst_page->error ||
4620 dst_page->restart ||
4621 dst_page->absent ||
4622 dst_page->fictitious))) {
4623
4624 do {
4625 vm_page_t top_page;
4626 kern_return_t error_code;
4627 int interruptible;
4628
4629 if (delayed_unlock) {
4630 delayed_unlock = 0;
4631 vm_page_unlock_queues();
4632 }
4633 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
4634 interruptible = THREAD_ABORTSAFE;
4635 else
4636 interruptible = THREAD_UNINT;
4637
4638 fault_info.interruptible = interruptible;
4639 fault_info.cluster_size = xfer_size;
4640
4641 result = vm_fault_page(object, dst_offset,
4642 prot | VM_PROT_WRITE, FALSE,
4643 &prot, &dst_page, &top_page,
4644 (int *)0,
4645 &error_code, no_zero_fill,
4646 FALSE, &fault_info);
4647
4648 switch (result) {
4649
4650 case VM_FAULT_SUCCESS:
4651
4652 PAGE_WAKEUP_DONE(dst_page);
4653 /*
4654 * Release paging references and
4655 * top-level placeholder page, if any.
4656 */
4657 if (top_page != VM_PAGE_NULL) {
4658 vm_object_t local_object;
4659
4660 local_object = top_page->object;
4661
4662 if (top_page->object != dst_page->object) {
4663 vm_object_lock(local_object);
4664 VM_PAGE_FREE(top_page);
4665 vm_object_paging_end(local_object);
4666 vm_object_unlock(local_object);
4667 } else {
4668 VM_PAGE_FREE(top_page);
4669 vm_object_paging_end(local_object);
4670 }
4671 }
4672 break;
4673
4674 case VM_FAULT_RETRY:
4675 vm_object_lock(object);
4676 vm_object_paging_begin(object);
4677 break;
4678
4679 case VM_FAULT_FICTITIOUS_SHORTAGE:
4680 vm_page_more_fictitious();
4681
4682 vm_object_lock(object);
4683 vm_object_paging_begin(object);
4684 break;
4685
4686 case VM_FAULT_MEMORY_SHORTAGE:
4687 if (vm_page_wait(interruptible)) {
4688 vm_object_lock(object);
4689 vm_object_paging_begin(object);
4690 break;
4691 }
4692 /* fall thru */
4693
4694 case VM_FAULT_INTERRUPTED:
4695 error_code = MACH_SEND_INTERRUPTED;
4696 case VM_FAULT_MEMORY_ERROR:
4697 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
4698
4699 vm_object_lock(object);
4700 vm_object_paging_begin(object);
4701 goto return_err;
4702 }
4703 } while (result != VM_FAULT_SUCCESS);
4704 }
4705
4706 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
4707 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
4708 vm_page_t low_page;
4709 int refmod;
4710
4711 /*
4712 * support devices that can't DMA above 32 bits
4713 * by substituting pages from a pool of low address
4714 * memory for any pages we find above the 4G mark...
4715 * we can't substitute if the page is already wired because
4716 * we don't know whether that physical address has been
4717 * handed out to some other 64 bit capable DMA device to use
4718 */
4719 if (dst_page->wire_count) {
4720 ret = KERN_PROTECTION_FAILURE;
4721 goto return_err;
4722 }
4723 if (delayed_unlock) {
4724 delayed_unlock = 0;
4725 vm_page_unlock_queues();
4726 }
4727 low_page = vm_page_grablo();
4728
4729 if (low_page == VM_PAGE_NULL) {
4730 ret = KERN_RESOURCE_SHORTAGE;
4731 goto return_err;
4732 }
4733 /*
4734 * from here until the vm_page_replace completes
4735 * we mustn't drop the object lock... we don't
4736 * want anyone refaulting this page in and using
4737 * it after we disconnect it... we want the fault
4738 * to find the new page being substituted.
4739 */
4740 if (dst_page->pmapped)
4741 refmod = pmap_disconnect(dst_page->phys_page);
4742 else
4743 refmod = 0;
4744 vm_page_copy(dst_page, low_page);
4745
4746 low_page->reference = dst_page->reference;
4747 low_page->dirty = dst_page->dirty;
4748
4749 if (refmod & VM_MEM_REFERENCED)
4750 low_page->reference = TRUE;
4751 if (refmod & VM_MEM_MODIFIED)
4752 low_page->dirty = TRUE;
4753
4754 vm_page_lock_queues();
4755 vm_page_replace(low_page, object, dst_offset);
4756 /*
4757 * keep the queue lock since we're going to
4758 * need it immediately
4759 */
4760 delayed_unlock = 1;
4761
4762 dst_page = low_page;
4763 /*
4764 * vm_page_grablo returned the page marked
4765 * BUSY... we don't need a PAGE_WAKEUP_DONE
4766 * here, because we've never dropped the object lock
4767 */
4768 dst_page->busy = FALSE;
4769 }
4770 if (delayed_unlock == 0)
4771 vm_page_lock_queues();
4772
4773 vm_page_wire(dst_page);
4774
4775 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4776 /*
4777 * Mark the page "busy" to block any future page fault
4778 * on this page. We'll also remove the mapping
4779 * of all these pages before leaving this routine.
4780 */
4781 assert(!dst_page->fictitious);
4782 dst_page->busy = TRUE;
4783 }
4784 pg_num = (dst_offset-offset)/PAGE_SIZE;
4785 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
4786
4787 /*
4788 * expect the page to be used
4789 * page queues lock must be held to set 'reference'
4790 */
4791 dst_page->reference = TRUE;
4792
4793 if (!(cntrl_flags & UPL_COPYOUT_FROM))
4794 dst_page->dirty = TRUE;
4795
4796 if (dst_page->phys_page > upl->highest_page)
4797 upl->highest_page = dst_page->phys_page;
4798
4799 if (user_page_list) {
4800 user_page_list[entry].phys_addr = dst_page->phys_page;
4801 user_page_list[entry].dirty = dst_page->dirty;
4802 user_page_list[entry].pageout = dst_page->pageout;
4803 user_page_list[entry].absent = dst_page->absent;
4804 user_page_list[entry].precious = dst_page->precious;
4805
4806 if (dst_page->clustered == TRUE)
4807 user_page_list[entry].speculative = dst_page->speculative;
4808 else
4809 user_page_list[entry].speculative = FALSE;
4810 }
4811 /*
4812 * someone is explicitly grabbing this page...
4813 * update clustered and speculative state
4814 *
4815 */
4816 VM_PAGE_CONSUME_CLUSTERED(dst_page);
4817
4818 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4819 mutex_yield(&vm_page_queue_lock);
4820 delayed_unlock = 1;
4821 }
4822 entry++;
4823 dst_offset += PAGE_SIZE_64;
4824 xfer_size -= PAGE_SIZE;
4825 }
4826 if (delayed_unlock)
4827 vm_page_unlock_queues();
4828
4829 if (page_list_count != NULL) {
4830 if (upl->flags & UPL_INTERNAL)
4831 *page_list_count = 0;
4832 else if (*page_list_count > entry)
4833 *page_list_count = entry;
4834 }
4835 vm_object_unlock(object);
4836
4837 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4838 /*
4839 * We've marked all the pages "busy" so that future
4840 * page faults will block.
4841 * Now remove the mapping for these pages, so that they
4842 * can't be accessed without causing a page fault.
4843 */
4844 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
4845 PMAP_NULL, 0, VM_PROT_NONE);
4846 }
4847 return KERN_SUCCESS;
4848
4849 return_err:
4850 if (delayed_unlock)
4851 vm_page_unlock_queues();
4852
4853 for (; offset < dst_offset; offset += PAGE_SIZE) {
4854 dst_page = vm_page_lookup(object, offset);
4855
4856 if (dst_page == VM_PAGE_NULL)
4857 panic("vm_object_iopl_request: Wired pages missing. \n");
4858
4859 vm_page_lockspin_queues();
4860 vm_page_unwire(dst_page);
4861 vm_page_unlock_queues();
4862
4863 VM_STAT_INCR(reactivations);
4864 }
4865 vm_object_paging_end(object);
4866 vm_object_unlock(object);
4867 upl_destroy(upl);
4868
4869 return ret;
4870 }
4871
4872 kern_return_t
4873 upl_transpose(
4874 upl_t upl1,
4875 upl_t upl2)
4876 {
4877 kern_return_t retval;
4878 boolean_t upls_locked;
4879 vm_object_t object1, object2;
4880
4881 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
4882 return KERN_INVALID_ARGUMENT;
4883 }
4884
4885 upls_locked = FALSE;
4886
4887 /*
4888 * Since we need to lock both UPLs at the same time,
4889 * avoid deadlocks by always taking locks in the same order.
4890 */
4891 if (upl1 < upl2) {
4892 upl_lock(upl1);
4893 upl_lock(upl2);
4894 } else {
4895 upl_lock(upl2);
4896 upl_lock(upl1);
4897 }
4898 upls_locked = TRUE; /* the UPLs will need to be unlocked */
4899
4900 object1 = upl1->map_object;
4901 object2 = upl2->map_object;
4902
4903 if (upl1->offset != 0 || upl2->offset != 0 ||
4904 upl1->size != upl2->size) {
4905 /*
4906 * We deal only with full objects, not subsets.
4907 * That's because we exchange the entire backing store info
4908 * for the objects: pager, resident pages, etc... We can't do
4909 * only part of it.
4910 */
4911 retval = KERN_INVALID_VALUE;
4912 goto done;
4913 }
4914
4915 /*
4916 * Transpose the VM objects' backing store.
4917 */
4918 retval = vm_object_transpose(object1, object2,
4919 (vm_object_size_t) upl1->size);
4920
4921 if (retval == KERN_SUCCESS) {
4922 /*
4923 * Make each UPL point to the correct VM object, i.e. the
4924 * object holding the pages that the UPL refers to...
4925 */
4926 #ifdef UPL_DEBUG
4927 queue_remove(&object1->uplq, upl1, upl_t, uplq);
4928 queue_remove(&object2->uplq, upl2, upl_t, uplq);
4929 #endif
4930 upl1->map_object = object2;
4931 upl2->map_object = object1;
4932 #ifdef UPL_DEBUG
4933 queue_enter(&object1->uplq, upl2, upl_t, uplq);
4934 queue_enter(&object2->uplq, upl1, upl_t, uplq);
4935 #endif
4936 }
4937
4938 done:
4939 /*
4940 * Cleanup.
4941 */
4942 if (upls_locked) {
4943 upl_unlock(upl1);
4944 upl_unlock(upl2);
4945 upls_locked = FALSE;
4946 }
4947
4948 return retval;
4949 }
4950
4951 /*
4952 * ENCRYPTED SWAP:
4953 *
4954 * Rationale: the user might have some encrypted data on disk (via
4955 * FileVault or any other mechanism). That data is then decrypted in
4956 * memory, which is safe as long as the machine is secure. But that
4957 * decrypted data in memory could be paged out to disk by the default
4958 * pager. The data would then be stored on disk in clear (not encrypted)
4959 * and it could be accessed by anyone who gets physical access to the
4960 * disk (if the laptop or the disk gets stolen for example). This weakens
4961 * the security offered by FileVault.
4962 *
4963 * Solution: the default pager will optionally request that all the
4964 * pages it gathers for pageout be encrypted, via the UPL interfaces,
4965 * before it sends this UPL to disk via the vnode_pageout() path.
4966 *
4967 * Notes:
4968 *
4969 * To avoid disrupting the VM LRU algorithms, we want to keep the
4970 * clean-in-place mechanisms, which allow us to send some extra pages to
4971 * swap (clustering) without actually removing them from the user's
4972 * address space. We don't want the user to unknowingly access encrypted
4973 * data, so we have to actually remove the encrypted pages from the page
4974 * table. When the user accesses the data, the hardware will fail to
4975 * locate the virtual page in its page table and will trigger a page
4976 * fault. We can then decrypt the page and enter it in the page table
4977 * again. Whenever we allow the user to access the contents of a page,
4978 * we have to make sure it's not encrypted.
4979 *
4980 *
4981 */
4982 /*
4983 * ENCRYPTED SWAP:
4984 * Reserve of virtual addresses in the kernel address space.
4985 * We need to map the physical pages in the kernel, so that we
4986 * can call the encryption/decryption routines with a kernel
4987 * virtual address. We keep this pool of pre-allocated kernel
4988 * virtual addresses so that we don't have to scan the kernel's
4989 * virtual address space each time we need to encrypt or decrypt
4990 * a physical page.
4991 * It would be nice to be able to encrypt and decrypt in physical
4992 * mode but that might not always be more efficient...
4993 */
4994 decl_simple_lock_data(,vm_paging_lock)
4995 #define VM_PAGING_NUM_PAGES 64
4996 vm_map_offset_t vm_paging_base_address = 0;
4997 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
4998 int vm_paging_max_index = 0;
4999 int vm_paging_page_waiter = 0;
5000 int vm_paging_page_waiter_total = 0;
5001 unsigned long vm_paging_no_kernel_page = 0;
5002 unsigned long vm_paging_objects_mapped = 0;
5003 unsigned long vm_paging_pages_mapped = 0;
5004 unsigned long vm_paging_objects_mapped_slow = 0;
5005 unsigned long vm_paging_pages_mapped_slow = 0;
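/*
 * Layout sketch for the pool described above (illustrative only; the
 * helper name is hypothetical): slot "i" of vm_paging_page_inuse[]
 * covers exactly one page of the reserved range, at the same address
 * that vm_paging_map_object() computes below.
 */
#if 0	/* example only -- never compiled */
static vm_map_offset_t
vm_paging_slot_address(int i)
{
	assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
	return vm_paging_base_address + ((vm_map_offset_t)i * PAGE_SIZE);
}
#endif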
5006
5007 void
5008 vm_paging_map_init(void)
5009 {
5010 kern_return_t kr;
5011 vm_map_offset_t page_map_offset;
5012 vm_map_entry_t map_entry;
5013
5014 assert(vm_paging_base_address == 0);
5015
5016 /*
5017 * Initialize our pool of pre-allocated kernel
5018 * virtual addresses.
5019 */
5020 page_map_offset = 0;
5021 kr = vm_map_find_space(kernel_map,
5022 &page_map_offset,
5023 VM_PAGING_NUM_PAGES * PAGE_SIZE,
5024 0,
5025 0,
5026 &map_entry);
5027 if (kr != KERN_SUCCESS) {
5028 panic("vm_paging_map_init: kernel_map full\n");
5029 }
5030 map_entry->object.vm_object = kernel_object;
5031 map_entry->offset =
5032 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5033 vm_object_reference(kernel_object);
5034 vm_map_unlock(kernel_map);
5035
5036 assert(vm_paging_base_address == 0);
5037 vm_paging_base_address = page_map_offset;
5038 }
5039
5040 /*
5041 * ENCRYPTED SWAP:
5042 * vm_paging_map_object:
5043 * Maps part of a VM object's pages in the kernel
5044 * virtual address space, using the pre-allocated
5045 * kernel virtual addresses, if possible.
5046 * Context:
5047 * The VM object is locked. This lock will get
5048 * dropped and re-acquired though, so the caller
5049 * must make sure the VM object is kept alive
5050 * (by holding a VM map that has a reference
5051 * on it, for example, or taking an extra reference).
5052 * The page should also be kept busy to prevent
5053 * it from being reclaimed.
5054 */
5055 kern_return_t
5056 vm_paging_map_object(
5057 vm_map_offset_t *address,
5058 vm_page_t page,
5059 vm_object_t object,
5060 vm_object_offset_t offset,
5061 vm_map_size_t *size,
5062 boolean_t can_unlock_object)
5063 {
5064 kern_return_t kr;
5065 vm_map_offset_t page_map_offset;
5066 vm_map_size_t map_size;
5067 vm_object_offset_t object_offset;
5068 int i;
5069
5070
5071 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5072 assert(page->busy);
5073 /*
5074 * Use one of the pre-allocated kernel virtual addresses
5075 * and just enter the VM page in the kernel address space
5076 * at that virtual address.
5077 */
5078 simple_lock(&vm_paging_lock);
5079
5080 /*
5081 * Try and find an available kernel virtual address
5082 * from our pre-allocated pool.
5083 */
5084 page_map_offset = 0;
5085 for (;;) {
5086 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5087 if (vm_paging_page_inuse[i] == FALSE) {
5088 page_map_offset =
5089 vm_paging_base_address +
5090 (i * PAGE_SIZE);
5091 break;
5092 }
5093 }
5094 if (page_map_offset != 0) {
5095 /* found a space to map our page ! */
5096 break;
5097 }
5098
5099 if (can_unlock_object) {
5100 /*
5101 * If we can afford to unlock the VM object,
5102 * let's take the slow path now...
5103 */
5104 break;
5105 }
5106 /*
5107 * We can't afford to unlock the VM object, so
5108 * let's wait for a space to become available...
5109 */
5110 vm_paging_page_waiter_total++;
5111 vm_paging_page_waiter++;
5112 thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
5113 &vm_paging_lock,
5114 THREAD_UNINT);
5115 vm_paging_page_waiter--;
5116 /* ... and try again */
5117 }
5118
5119 if (page_map_offset != 0) {
5120 /*
5121 * We found a kernel virtual address;
5122 * map the physical page to that virtual address.
5123 */
5124 if (i > vm_paging_max_index) {
5125 vm_paging_max_index = i;
5126 }
5127 vm_paging_page_inuse[i] = TRUE;
5128 simple_unlock(&vm_paging_lock);
5129
5130 if (page->pmapped == FALSE) {
5131 pmap_sync_page_data_phys(page->phys_page);
5132 }
5133 page->pmapped = TRUE;
5134
5135 /*
5136 * Keep the VM object locked over the PMAP_ENTER
5137 * and the actual use of the page by the kernel,
5138 * or this pmap mapping might get undone by a
5139 * vm_object_pmap_protect() call...
5140 */
5141 PMAP_ENTER(kernel_pmap,
5142 page_map_offset,
5143 page,
5144 VM_PROT_DEFAULT,
5145 ((int) page->object->wimg_bits &
5146 VM_WIMG_MASK),
5147 TRUE);
5148 vm_paging_objects_mapped++;
5149 vm_paging_pages_mapped++;
5150 *address = page_map_offset;
5151
5152 /* all done and mapped, ready to use ! */
5153 return KERN_SUCCESS;
5154 }
5155
5156 /*
5157 * We ran out of pre-allocated kernel virtual
5158 * addresses. Just map the page in the kernel
5159 * the slow and regular way.
5160 */
5161 vm_paging_no_kernel_page++;
5162 simple_unlock(&vm_paging_lock);
5163 }
5164
5165 if (! can_unlock_object) {
5166 return KERN_NOT_SUPPORTED;
5167 }
5168
5169 object_offset = vm_object_trunc_page(offset);
5170 map_size = vm_map_round_page(*size);
5171
5172 /*
5173 * Try and map the required range of the object
5174 * in the kernel_map
5175 */
5176
5177 vm_object_reference_locked(object); /* for the map entry */
5178 vm_object_unlock(object);
5179
5180 kr = vm_map_enter(kernel_map,
5181 address,
5182 map_size,
5183 0,
5184 VM_FLAGS_ANYWHERE,
5185 object,
5186 object_offset,
5187 FALSE,
5188 VM_PROT_DEFAULT,
5189 VM_PROT_ALL,
5190 VM_INHERIT_NONE);
5191 if (kr != KERN_SUCCESS) {
5192 *address = 0;
5193 *size = 0;
5194 vm_object_deallocate(object); /* for the map entry */
5195 vm_object_lock(object);
5196 return kr;
5197 }
5198
5199 *size = map_size;
5200
5201 /*
5202 * Enter the mapped pages in the page table now.
5203 */
5204 vm_object_lock(object);
5205 /*
5206 * VM object must be kept locked from before PMAP_ENTER()
5207 * until after the kernel is done accessing the page(s).
5208 * Otherwise, the pmap mappings in the kernel could be
5209 * undone by a call to vm_object_pmap_protect().
5210 */
5211
5212 for (page_map_offset = 0;
5213 map_size != 0;
5214 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5215 unsigned int cache_attr;
5216
5217 page = vm_page_lookup(object, offset + page_map_offset);
5218 if (page == VM_PAGE_NULL) {
5219 printf("vm_paging_map_object: no page !?");
5220 vm_object_unlock(object);
5221 kr = vm_map_remove(kernel_map, *address, *size,
5222 VM_MAP_NO_FLAGS);
5223 assert(kr == KERN_SUCCESS);
5224 *address = 0;
5225 *size = 0;
5226 vm_object_lock(object);
5227 return KERN_MEMORY_ERROR;
5228 }
5229 if (page->pmapped == FALSE) {
5230 pmap_sync_page_data_phys(page->phys_page);
5231 }
5232 page->pmapped = TRUE;
5233 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5234
5235 //assert(pmap_verify_free(page->phys_page));
5236 PMAP_ENTER(kernel_pmap,
5237 *address + page_map_offset,
5238 page,
5239 VM_PROT_DEFAULT,
5240 cache_attr,
5241 TRUE);
5242 }
5243
5244 vm_paging_objects_mapped_slow++;
5245 vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5246
5247 return KERN_SUCCESS;
5248 }
5249
5250 /*
5251 * ENCRYPTED SWAP:
5252 * vm_paging_unmap_object:
5253 * Unmaps part of a VM object's pages from the kernel
5254 * virtual address space.
5255 * Context:
5256 * The VM object is locked. This lock will get
5257 * dropped and re-acquired though.
5258 */
5259 void
5260 vm_paging_unmap_object(
5261 vm_object_t object,
5262 vm_map_offset_t start,
5263 vm_map_offset_t end)
5264 {
5265 kern_return_t kr;
5266 int i;
5267
5268 if ((vm_paging_base_address == 0) ||
5269 (start < vm_paging_base_address) ||
5270 (end > (vm_paging_base_address
5271 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
5272 /*
5273 * We didn't use our pre-allocated pool of
5274 * kernel virtual addresses. Deallocate the
5275 * virtual memory.
5276 */
5277 if (object != VM_OBJECT_NULL) {
5278 vm_object_unlock(object);
5279 }
5280 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5281 if (object != VM_OBJECT_NULL) {
5282 vm_object_lock(object);
5283 }
5284 assert(kr == KERN_SUCCESS);
5285 } else {
5286 /*
5287 * We used a kernel virtual address from our
5288 * pre-allocated pool. Put it back in the pool
5289 * for next time.
5290 */
5291 assert(end - start == PAGE_SIZE);
5292 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5293
5294 /* undo the pmap mapping */
5295 pmap_remove(kernel_pmap, start, end);
5296
5297 simple_lock(&vm_paging_lock);
5298 vm_paging_page_inuse[i] = FALSE;
5299 if (vm_paging_page_waiter) {
5300 thread_wakeup(&vm_paging_page_waiter);
5301 }
5302 simple_unlock(&vm_paging_lock);
5303 }
5304 }
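/*
 * Usage sketch (illustrative only; the helper name is hypothetical) of
 * the map/use/unmap pairing documented above and used by
 * vm_page_encrypt()/vm_page_decrypt() below.  The caller holds the
 * object lock and keeps the page busy; both routines may drop and
 * retake that lock.
 */
#if 0	/* example only -- never compiled */
static void
vm_paging_map_usage_example(vm_object_t object, vm_page_t page)
{
	vm_map_offset_t	kernel_addr = 0;
	vm_map_size_t	map_size = PAGE_SIZE;
	kern_return_t	kr;

	/* object locked, page busy, paging-in-progress reference held */
	kr = vm_paging_map_object(&kernel_addr, page, object,
				  page->offset, &map_size, FALSE);
	if (kr != KERN_SUCCESS)
		return;

	/* ... touch the page's contents through kernel_addr ... */

	vm_paging_unmap_object(object, kernel_addr, kernel_addr + map_size);
}
#endif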
5305
5306 #if CRYPTO
5307 /*
5308 * Encryption data.
5309 * "iv" is the "initial vector". Ideally, we want to
5310 * have a different one for each page we encrypt, so that
5311 * crackers can't find encryption patterns too easily.
5312 */
5313 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
5314 boolean_t swap_crypt_ctx_initialized = FALSE;
5315 aes_32t swap_crypt_key[8]; /* big enough for a 256 key */
5316 aes_ctx swap_crypt_ctx;
5317 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5318
5319 #if DEBUG
5320 boolean_t swap_crypt_ctx_tested = FALSE;
5321 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5322 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5323 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5324 #endif /* DEBUG */
5325
5326 extern u_long random(void);
5327
5328 /*
5329 * Initialize the encryption context: key and key size.
5330 */
5331 void swap_crypt_ctx_initialize(void); /* forward */
5332 void
5333 swap_crypt_ctx_initialize(void)
5334 {
5335 unsigned int i;
5336
5337 /*
5338 * No need for locking to protect swap_crypt_ctx_initialized
5339 * because the first use of encryption will come from the
5340 * pageout thread (we won't pagein before there's been a pageout)
5341 * and there's only one pageout thread.
5342 */
5343 if (swap_crypt_ctx_initialized == FALSE) {
5344 for (i = 0;
5345 i < (sizeof (swap_crypt_key) /
5346 sizeof (swap_crypt_key[0]));
5347 i++) {
5348 swap_crypt_key[i] = random();
5349 }
5350 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5351 SWAP_CRYPT_AES_KEY_SIZE,
5352 &swap_crypt_ctx.encrypt);
5353 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5354 SWAP_CRYPT_AES_KEY_SIZE,
5355 &swap_crypt_ctx.decrypt);
5356 swap_crypt_ctx_initialized = TRUE;
5357 }
5358
5359 #if DEBUG
5360 /*
5361 * Validate the encryption algorithms.
5362 */
5363 if (swap_crypt_ctx_tested == FALSE) {
5364 /* initialize */
5365 for (i = 0; i < 4096; i++) {
5366 swap_crypt_test_page_ref[i] = (char) i;
5367 }
5368 /* encrypt */
5369 aes_encrypt_cbc(swap_crypt_test_page_ref,
5370 swap_crypt_null_iv,
5371 PAGE_SIZE / AES_BLOCK_SIZE,
5372 swap_crypt_test_page_encrypt,
5373 &swap_crypt_ctx.encrypt);
5374 /* decrypt */
5375 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5376 swap_crypt_null_iv,
5377 PAGE_SIZE / AES_BLOCK_SIZE,
5378 swap_crypt_test_page_decrypt,
5379 &swap_crypt_ctx.decrypt);
5380 /* compare result with original */
5381 for (i = 0; i < 4096; i ++) {
5382 if (swap_crypt_test_page_decrypt[i] !=
5383 swap_crypt_test_page_ref[i]) {
5384 panic("encryption test failed");
5385 }
5386 }
5387
5388 /* encrypt again */
5389 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5390 swap_crypt_null_iv,
5391 PAGE_SIZE / AES_BLOCK_SIZE,
5392 swap_crypt_test_page_decrypt,
5393 &swap_crypt_ctx.encrypt);
5394 /* decrypt in place */
5395 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5396 swap_crypt_null_iv,
5397 PAGE_SIZE / AES_BLOCK_SIZE,
5398 swap_crypt_test_page_decrypt,
5399 &swap_crypt_ctx.decrypt);
5400 for (i = 0; i < 4096; i ++) {
5401 if (swap_crypt_test_page_decrypt[i] !=
5402 swap_crypt_test_page_ref[i]) {
5403 panic("in place encryption test failed");
5404 }
5405 }
5406
5407 swap_crypt_ctx_tested = TRUE;
5408 }
5409 #endif /* DEBUG */
5410 }
5411
5412 /*
5413 * ENCRYPTED SWAP:
5414 * vm_page_encrypt:
5415 * Encrypt the given page, for secure paging.
5416 * The page might already be mapped at kernel virtual
5417 * address "kernel_mapping_offset". Otherwise, we need
5418 * to map it.
5419 *
5420 * Context:
5421 * The page's object is locked, but this lock will be released
5422 * and re-acquired.
5423 * The page is busy and not accessible by users (not entered in any pmap).
5424 */
5425 void
5426 vm_page_encrypt(
5427 vm_page_t page,
5428 vm_map_offset_t kernel_mapping_offset)
5429 {
5430 kern_return_t kr;
5431 vm_map_size_t kernel_mapping_size;
5432 vm_offset_t kernel_vaddr;
5433 union {
5434 unsigned char aes_iv[AES_BLOCK_SIZE];
5435 struct {
5436 memory_object_t pager_object;
5437 vm_object_offset_t paging_offset;
5438 } vm;
5439 } encrypt_iv;
5440
5441 if (! vm_pages_encrypted) {
5442 vm_pages_encrypted = TRUE;
5443 }
5444
5445 assert(page->busy);
5446 assert(page->dirty || page->precious);
5447
5448 if (page->encrypted) {
5449 /*
5450 * Already encrypted: no need to do it again.
5451 */
5452 vm_page_encrypt_already_encrypted_counter++;
5453 return;
5454 }
5455 ASSERT_PAGE_DECRYPTED(page);
5456
5457 /*
5458 * Take a paging-in-progress reference to keep the object
5459 * alive even if we have to unlock it (in vm_paging_map_object()
5460 * for example)...
5461 */
5462 vm_object_paging_begin(page->object);
5463
5464 if (kernel_mapping_offset == 0) {
5465 /*
5466 * The page hasn't already been mapped in kernel space
5467 * by the caller. Map it now, so that we can access
5468 * its contents and encrypt them.
5469 */
5470 kernel_mapping_size = PAGE_SIZE;
5471 kr = vm_paging_map_object(&kernel_mapping_offset,
5472 page,
5473 page->object,
5474 page->offset,
5475 &kernel_mapping_size,
5476 FALSE);
5477 if (kr != KERN_SUCCESS) {
5478 panic("vm_page_encrypt: "
5479 "could not map page in kernel: 0x%x\n",
5480 kr);
5481 }
5482 } else {
5483 kernel_mapping_size = 0;
5484 }
5485 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5486
5487 if (swap_crypt_ctx_initialized == FALSE) {
5488 swap_crypt_ctx_initialize();
5489 }
5490 assert(swap_crypt_ctx_initialized);
5491
5492 /*
5493 * Prepare an "initial vector" for the encryption.
5494 * We use the "pager" and the "paging_offset" for that
5495 * page to obfuscate the encrypted data a bit more and
5496 * prevent crackers from finding patterns that they could
5497 * use to break the key.
5498 */
5499 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5500 encrypt_iv.vm.pager_object = page->object->pager;
5501 encrypt_iv.vm.paging_offset =
5502 page->object->paging_offset + page->offset;
5503
5504 /* encrypt the "initial vector" */
5505 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5506 swap_crypt_null_iv,
5507 1,
5508 &encrypt_iv.aes_iv[0],
5509 &swap_crypt_ctx.encrypt);
5510
5511 /*
5512 * Encrypt the page.
5513 */
5514 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5515 &encrypt_iv.aes_iv[0],
5516 PAGE_SIZE / AES_BLOCK_SIZE,
5517 (unsigned char *) kernel_vaddr,
5518 &swap_crypt_ctx.encrypt);
5519
5520 vm_page_encrypt_counter++;
5521
5522 /*
5523 * Unmap the page from the kernel's address space,
5524 * if we had to map it ourselves. Otherwise, let
5525 * the caller undo the mapping if needed.
5526 */
5527 if (kernel_mapping_size != 0) {
5528 vm_paging_unmap_object(page->object,
5529 kernel_mapping_offset,
5530 kernel_mapping_offset + kernel_mapping_size);
5531 }
5532
5533 /*
5534 * Clear the "reference" and "modified" bits.
5535 * This should clean up any impact the encryption had
5536 * on them.
5537 * The page was kept busy and disconnected from all pmaps,
5538 * so it can't have been referenced or modified from user
5539 * space.
5540 * The software bits will be reset later after the I/O
5541 * has completed (in upl_commit_range()).
5542 */
5543 pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
5544
5545 page->encrypted = TRUE;
5546
5547 vm_object_paging_end(page->object);
5548 }
5549
5550 /*
5551 * ENCRYPTED SWAP:
5552 * vm_page_decrypt:
5553 * Decrypt the given page.
5554 * The page might already be mapped at kernel virtual
5555 * address "kernel_mapping_offset". Otherwise, we need
5556 * to map it.
5557 *
5558 * Context:
5559 * The page's VM object is locked but will be unlocked and relocked.
5560 * The page is busy and not accessible by users (not entered in any pmap).
5561 */
5562 void
5563 vm_page_decrypt(
5564 vm_page_t page,
5565 vm_map_offset_t kernel_mapping_offset)
5566 {
5567 kern_return_t kr;
5568 vm_map_size_t kernel_mapping_size;
5569 vm_offset_t kernel_vaddr;
5570 union {
5571 unsigned char aes_iv[AES_BLOCK_SIZE];
5572 struct {
5573 memory_object_t pager_object;
5574 vm_object_offset_t paging_offset;
5575 } vm;
5576 } decrypt_iv;
5577
5578 assert(page->busy);
5579 assert(page->encrypted);
5580
5581 /*
5582 * Take a paging-in-progress reference to keep the object
5583 * alive even if we have to unlock it (in vm_paging_map_object()
5584 * for example)...
5585 */
5586 vm_object_paging_begin(page->object);
5587
5588 if (kernel_mapping_offset == 0) {
5589 /*
5590 * The page hasn't already been mapped in kernel space
5591 * by the caller. Map it now, so that we can access
5592 * its contents and decrypt them.
5593 */
5594 kernel_mapping_size = PAGE_SIZE;
5595 kr = vm_paging_map_object(&kernel_mapping_offset,
5596 page,
5597 page->object,
5598 page->offset,
5599 &kernel_mapping_size,
5600 FALSE);
5601 if (kr != KERN_SUCCESS) {
5602 panic("vm_page_decrypt: "
5603 "could not map page in kernel: 0x%x\n",
5604 kr);
5605 }
5606 } else {
5607 kernel_mapping_size = 0;
5608 }
5609 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5610
5611 assert(swap_crypt_ctx_initialized);
5612
5613 /*
5614 * Prepare an "initial vector" for the decryption.
5615 * It has to be the same as the "initial vector" we
5616 * used to encrypt that page.
5617 */
5618 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5619 decrypt_iv.vm.pager_object = page->object->pager;
5620 decrypt_iv.vm.paging_offset =
5621 page->object->paging_offset + page->offset;
5622
5623 /* encrypt the "initial vector" */
5624 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5625 swap_crypt_null_iv,
5626 1,
5627 &decrypt_iv.aes_iv[0],
5628 &swap_crypt_ctx.encrypt);
5629
5630 /*
5631 * Decrypt the page.
5632 */
5633 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5634 &decrypt_iv.aes_iv[0],
5635 PAGE_SIZE / AES_BLOCK_SIZE,
5636 (unsigned char *) kernel_vaddr,
5637 &swap_crypt_ctx.decrypt);
5638 vm_page_decrypt_counter++;
5639
5640 /*
5641 * Unmap the page from the kernel's address space,
5642 * if we had to map it ourselves. Otherwise, let
5643 * the caller undo the mapping if needed.
5644 */
5645 if (kernel_mapping_size != 0) {
5646 vm_paging_unmap_object(page->object,
5647 kernel_vaddr,
5648 kernel_vaddr + PAGE_SIZE);
5649 }
5650
5651 /*
5652 * After decryption, the page is actually clean.
5653 * It was encrypted as part of paging, which "cleans"
5654 * the "dirty" pages.
5655 * No one could access it after it was encrypted
5656 * and the decryption doesn't count.
5657 */
5658 page->dirty = FALSE;
5659 pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
5660
5661 page->encrypted = FALSE;
5662
5663 /*
5664 * We've just modified the page's contents via the data cache and part
5665 * of the new contents might still be in the cache and not yet in RAM.
5666 * Since the page is now available and might get gathered in a UPL to
5667 * be part of a DMA transfer from a driver that expects the memory to
5668 * be coherent at this point, we have to flush the data cache.
5669 */
5670 pmap_sync_page_attributes_phys(page->phys_page);
5671 /*
5672 * Since the page is not mapped yet, some code might assume that it
5673 * doesn't need to invalidate the instruction cache when writing to
5674 * that page. That code relies on "pmapped" being FALSE, so that the
5675 * caches get synchronized when the page is first mapped.
5676 */
5677 assert(pmap_verify_free(page->phys_page));
5678 page->pmapped = FALSE;
5679
5680 vm_object_paging_end(page->object);
5681 }
5682
5683 unsigned long upl_encrypt_upls = 0;
5684 unsigned long upl_encrypt_pages = 0;
5685
5686 /*
5687 * ENCRYPTED SWAP:
5688 *
5689 * upl_encrypt:
5690 * Encrypts all the pages in the UPL, within the specified range.
5691 *
5692 */
5693 void
5694 upl_encrypt(
5695 upl_t upl,
5696 upl_offset_t crypt_offset,
5697 upl_size_t crypt_size)
5698 {
5699 upl_size_t upl_size;
5700 upl_offset_t upl_offset;
5701 vm_object_t upl_object;
5702 vm_page_t page;
5703 vm_object_t shadow_object;
5704 vm_object_offset_t shadow_offset;
5705 vm_object_offset_t paging_offset;
5706 vm_object_offset_t base_offset;
5707
5708 upl_encrypt_upls++;
5709 upl_encrypt_pages += crypt_size / PAGE_SIZE;
5710
5711 upl_object = upl->map_object;
5712 upl_offset = upl->offset;
5713 upl_size = upl->size;
5714
5715 vm_object_lock(upl_object);
5716
5717 /*
5718 * Find the VM object that contains the actual pages.
5719 */
5720 if (upl_object->pageout) {
5721 shadow_object = upl_object->shadow;
5722 /*
5723 * The offset in the shadow object is actually also
5724 * accounted for in upl->offset. It possibly shouldn't be
5725 * this way, but for now don't account for it twice.
5726 */
5727 shadow_offset = 0;
5728 assert(upl_object->paging_offset == 0); /* XXX ? */
5729 vm_object_lock(shadow_object);
5730 } else {
5731 shadow_object = upl_object;
5732 shadow_offset = 0;
5733 }
5734
5735 paging_offset = shadow_object->paging_offset;
5736 vm_object_paging_begin(shadow_object);
5737
5738 if (shadow_object != upl_object)
5739 vm_object_unlock(upl_object);
5740
5741
5742 base_offset = shadow_offset;
5743 base_offset += upl_offset;
5744 base_offset += crypt_offset;
5745 base_offset -= paging_offset;
5746
5747 assert(crypt_offset + crypt_size <= upl_size);
5748
5749 for (upl_offset = 0;
5750 upl_offset < crypt_size;
5751 upl_offset += PAGE_SIZE) {
5752 page = vm_page_lookup(shadow_object,
5753 base_offset + upl_offset);
5754 if (page == VM_PAGE_NULL) {
5755 panic("upl_encrypt: "
5756 "no page for (obj=%p,off=%lld+%d)!\n",
5757 shadow_object,
5758 base_offset,
5759 upl_offset);
5760 }
5761 /*
5762 * Disconnect the page from all pmaps, so that nobody can
5763 * access it while it's encrypted. After that point, all
5764 * accesses to this page will cause a page fault and block
5765 * while the page is busy being encrypted. After the
5766 * encryption completes, any access will cause a
5767 * page fault and the page gets decrypted at that time.
5768 */
5769 pmap_disconnect(page->phys_page);
5770 vm_page_encrypt(page, 0);
5771
5772 if (shadow_object == vm_pageout_scan_wants_object) {
5773 /*
5774 * Give vm_pageout_scan() a chance to convert more
5775 * pages from "clean-in-place" to "clean-and-free",
5776 * if it's interested in the same pages we selected
5777 * in this cluster.
5778 */
5779 vm_object_unlock(shadow_object);
5780 vm_object_lock(shadow_object);
5781 }
5782 }
5783
5784 vm_object_paging_end(shadow_object);
5785 vm_object_unlock(shadow_object);
5786 }
5787
5788 #else /* CRYPTO */
5789 void
5790 upl_encrypt(
5791 __unused upl_t upl,
5792 __unused upl_offset_t crypt_offset,
5793 __unused upl_size_t crypt_size)
5794 {
5795 }
5796
5797 void
5798 vm_page_encrypt(
5799 __unused vm_page_t page,
5800 __unused vm_map_offset_t kernel_mapping_offset)
5801 {
5802 }
5803
5804 void
5805 vm_page_decrypt(
5806 __unused vm_page_t page,
5807 __unused vm_map_offset_t kernel_mapping_offset)
5808 {
5809 }
5810
5811 #endif /* CRYPTO */
5812
5813 vm_size_t
5814 upl_get_internal_pagelist_offset(void)
5815 {
5816 return sizeof(struct upl);
5817 }
5818
5819 void
5820 upl_clear_dirty(
5821 upl_t upl,
5822 boolean_t value)
5823 {
5824 if (value) {
5825 upl->flags |= UPL_CLEAR_DIRTY;
5826 } else {
5827 upl->flags &= ~UPL_CLEAR_DIRTY;
5828 }
5829 }
5830
5831
5832 #ifdef MACH_BSD
5833
5834 boolean_t upl_device_page(upl_page_info_t *upl)
5835 {
5836 return(UPL_DEVICE_PAGE(upl));
5837 }
5838 boolean_t upl_page_present(upl_page_info_t *upl, int index)
5839 {
5840 return(UPL_PAGE_PRESENT(upl, index));
5841 }
5842 boolean_t upl_speculative_page(upl_page_info_t *upl, int index)
5843 {
5844 return(UPL_SPECULATIVE_PAGE(upl, index));
5845 }
5846 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
5847 {
5848 return(UPL_DIRTY_PAGE(upl, index));
5849 }
5850 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
5851 {
5852 return(UPL_VALID_PAGE(upl, index));
5853 }
5854 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
5855 {
5856 return(UPL_PHYS_PAGE(upl, index));
5857 }
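
/*
 * Illustrative sketch only (never compiled):  the accessors above let
 * BSD-side consumers inspect a upl_page_info_t array without knowing
 * its layout.  The function name and the "pl"/"page_count" parameters
 * are hypothetical.
 */
#if 0
static void
example_scan_page_list(upl_page_info_t *pl, int page_count)
{
	int i;

	for (i = 0; i < page_count; i++) {
		if (!upl_page_present(pl, i))
			continue;		/* hole in the UPL */
		if (upl_dirty_page(pl, i)) {
			/* physical page number backing this slot */
			ppnum_t pnum = upl_phys_page(pl, i);

			(void) pnum;
		}
	}
}
#endif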
5858
5859
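/*
 * vm_countdirtypages:  debugging aid.  Walks the inactive, throttled and
 * zero-fill queues (reported as "IN Q") and the active queue (reported
 * as "AC Q") and prints how many pages on them are dirty, marked for
 * pageout, or precious.
 */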
5860 void
5861 vm_countdirtypages(void)
5862 {
5863 vm_page_t m;
5864 int dpages;
5865 int pgopages;
5866 int precpages;
5867
5868
5869 dpages = 0;
5870 pgopages = 0;
5871 precpages = 0;
5872
5873 vm_page_lock_queues();
5874 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
5875 do {
5876 if (m == (vm_page_t) 0) break;
5877
5878 if (m->dirty) dpages++;
5879 if (m->pageout) pgopages++;
5880 if (m->precious) precpages++;
5881
5882 assert(m->object != kernel_object);
5883 m = (vm_page_t) queue_next(&m->pageq);
5884 if (m == (vm_page_t) 0) break;
5885
5886 } while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m));
5887 vm_page_unlock_queues();
5888
5889 vm_page_lock_queues();
5890 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
5891 do {
5892 if (m == (vm_page_t) 0) break;
5893
5894 dpages++;
5895 assert(m->dirty);
5896 assert(!m->pageout);
5897 assert(m->object != kernel_object);
5898 m = (vm_page_t) queue_next(&m->pageq);
5899 if (m == (vm_page_t) 0) break;
5900
5901 } while (!queue_end(&vm_page_queue_throttled, (queue_entry_t) m));
5902 vm_page_unlock_queues();
5903
5904 vm_page_lock_queues();
5905 m = (vm_page_t) queue_first(&vm_page_queue_zf);
5906 do {
5907 if (m == (vm_page_t) 0) break;
5908
5909 if (m->dirty) dpages++;
5910 if (m->pageout) pgopages++;
5911 if (m->precious) precpages++;
5912
5913 assert(m->object != kernel_object);
5914 m = (vm_page_t) queue_next(&m->pageq);
5915 if (m == (vm_page_t) 0) break;
5916
5917 } while (!queue_end(&vm_page_queue_zf, (queue_entry_t) m));
5918 vm_page_unlock_queues();
5919
5920 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
5921
5922 dpages = 0;
5923 pgopages = 0;
5924 precpages = 0;
5925
5926 vm_page_lock_queues();
5927 m = (vm_page_t) queue_first(&vm_page_queue_active);
5928
5929 do {
5930 if (m == (vm_page_t) 0) break;
5931 if (m->dirty) dpages++;
5932 if (m->pageout) pgopages++;
5933 if (m->precious) precpages++;
5934
5935 assert(m->object != kernel_object);
5936 m = (vm_page_t) queue_next(&m->pageq);
5937 if (m == (vm_page_t) 0) break;
5938
5939 } while (!queue_end(&vm_page_queue_active, (queue_entry_t) m));
5940 vm_page_unlock_queues();
5941
5942 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
5943
5944 }
5945 #endif /* MACH_BSD */
5946
5947 ppnum_t upl_get_highest_page(
5948 upl_t upl)
5949 {
5950 return upl->highest_page;
5951 }
5952
5953 #ifdef UPL_DEBUG
5954 kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
5955 {
5956 upl->ubc_alias1 = alias1;
5957 upl->ubc_alias2 = alias2;
5958 return KERN_SUCCESS;
5959 }
5960 int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
5961 {
5962 if (al)
5963 *al = upl->ubc_alias1;
5964 if (al2)
5965 *al2 = upl->ubc_alias2;
5966 return KERN_SUCCESS;
5967 }
5968 #endif /* UPL_DEBUG */
5969
5970
5971
5972 #if MACH_KDB
5973 #include <ddb/db_output.h>
5974 #include <ddb/db_print.h>
5975 #include <vm/vm_print.h>
5976
5977 #define printf kdbprintf
5978 void db_pageout(void);
5979
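/*
 * db_vm / db_pageout:  display routines for the built-in kernel
 * debugger (ddb).  They dump the global page counts and targets and
 * the pageout daemon's statistics counters.
 */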
5980 void
5981 db_vm(void)
5982 {
5983
5984 iprintf("VM Statistics:\n");
5985 db_indent += 2;
5986 iprintf("pages:\n");
5987 db_indent += 2;
5988 iprintf("activ %5d inact %5d free %5d",
5989 vm_page_active_count, vm_page_inactive_count,
5990 vm_page_free_count);
5991 printf(" wire %5d gobbl %5d\n",
5992 vm_page_wire_count, vm_page_gobble_count);
5993 db_indent -= 2;
5994 iprintf("target:\n");
5995 db_indent += 2;
5996 iprintf("min %5d inact %5d free %5d",
5997 vm_page_free_min, vm_page_inactive_target,
5998 vm_page_free_target);
5999 printf(" resrv %5d\n", vm_page_free_reserved);
6000 db_indent -= 2;
6001 iprintf("pause:\n");
6002 db_pageout();
6003 db_indent -= 2;
6004 }
6005
6006 #if MACH_COUNTERS
6007 extern int c_laundry_pages_freed;
6008 #endif /* MACH_COUNTERS */
6009
6010 void
6011 db_pageout(void)
6012 {
6013 iprintf("Pageout Statistics:\n");
6014 db_indent += 2;
6015 iprintf("active %5d inactv %5d\n",
6016 vm_pageout_active, vm_pageout_inactive);
6017 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
6018 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6019 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6020 iprintf("used %5d clean %5d dirty %5d\n",
6021 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6022 vm_pageout_inactive_dirty);
6023 #if MACH_COUNTERS
6024 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6025 #endif /* MACH_COUNTERS */
6026 #if MACH_CLUSTER_STATS
6027 iprintf("Cluster Statistics:\n");
6028 db_indent += 2;
6029 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
6030 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6031 vm_pageout_cluster_collisions);
6032 iprintf("clusters %5d conversions %5d\n",
6033 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6034 db_indent -= 2;
6035 iprintf("Target Statistics:\n");
6036 db_indent += 2;
6037 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
6038 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6039 vm_pageout_target_page_freed);
6040 db_indent -= 2;
6041 #endif /* MACH_CLUSTER_STATS */
6042 db_indent -= 2;
6043 }
6044
6045 #endif /* MACH_KDB */