1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /*
31 * @OSF_COPYRIGHT@
32 */
33 /*
34 * Mach Operating System
35 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
36 * All Rights Reserved.
37 *
38 * Permission to use, copy, modify and distribute this software and its
39 * documentation is hereby granted, provided that both the copyright
40 * notice and this permission notice appear in all copies of the
41 * software, derivative works or modified versions, and any portions
42 * thereof, and that both notices appear in supporting documentation.
43 *
44 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
45 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
46 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
47 *
48 * Carnegie Mellon requests users of this software to return to
49 *
50 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
51 * School of Computer Science
52 * Carnegie Mellon University
53 * Pittsburgh PA 15213-3890
54 *
55 * any improvements or extensions that they make and grant Carnegie Mellon
56 * the rights to redistribute these changes.
57 */
58 /*
59 */
60 /*
61 * File: vm/vm_pageout.c
62 * Author: Avadis Tevanian, Jr., Michael Wayne Young
63 * Date: 1985
64 *
65 * The proverbial page-out daemon.
66 */
67
68 #include <stdint.h>
69
70 #include <debug.h>
71 #include <mach_pagemap.h>
72 #include <mach_cluster_stats.h>
73 #include <mach_kdb.h>
74 #include <advisory_pageout.h>
75
76 #include <mach/mach_types.h>
77 #include <mach/memory_object.h>
78 #include <mach/memory_object_default.h>
79 #include <mach/memory_object_control_server.h>
80 #include <mach/mach_host_server.h>
81 #include <mach/upl.h>
82 #include <mach/vm_map.h>
83 #include <mach/vm_param.h>
84 #include <mach/vm_statistics.h>
85
86 #include <kern/kern_types.h>
87 #include <kern/counters.h>
88 #include <kern/host_statistics.h>
89 #include <kern/machine.h>
90 #include <kern/misc_protos.h>
91 #include <kern/thread.h>
92 #include <kern/xpr.h>
93 #include <kern/kalloc.h>
94
95 #include <machine/vm_tuning.h>
96
97 #include <vm/pmap.h>
98 #include <vm/vm_fault.h>
99 #include <vm/vm_map.h>
100 #include <vm/vm_object.h>
101 #include <vm/vm_page.h>
102 #include <vm/vm_pageout.h>
103 #include <vm/vm_protos.h> /* must be last */
104
105 /*
106 * ENCRYPTED SWAP:
107 */
108 #ifdef __ppc__
109 #include <ppc/mappings.h>
110 #endif /* __ppc__ */
111 #include <../bsd/crypto/aes/aes.h>
112
113 extern ipc_port_t memory_manager_default;
114
115
116 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE
117 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 10000 /* maximum iterations of the active queue to move pages to inactive */
118 #endif
119
120 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
121 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
122 #endif
123
124 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
125 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
126 #endif
127
128 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
129 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
130 #endif
131
132 #ifndef VM_PAGE_LAUNDRY_MAX
133 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
134 #endif  /* VM_PAGE_LAUNDRY_MAX */
135
136 #ifndef VM_PAGEOUT_BURST_WAIT
137 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
138 #endif /* VM_PAGEOUT_BURST_WAIT */
139
140 #ifndef VM_PAGEOUT_EMPTY_WAIT
141 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
142 #endif /* VM_PAGEOUT_EMPTY_WAIT */
143
144 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
145 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
146 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
147
148 #ifndef VM_PAGEOUT_IDLE_WAIT
149 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
150 #endif /* VM_PAGEOUT_IDLE_WAIT */
151
152
153 /*
154 * To obtain a reasonable LRU approximation, the inactive queue
155 * needs to be large enough to give pages on it a chance to be
156 * referenced a second time. This macro defines the fraction
157 * of active+inactive pages that should be inactive.
158 * The pageout daemon uses it to update vm_page_inactive_target.
159 *
160 * If vm_page_free_count falls below vm_page_free_target and
161 * vm_page_inactive_count is below vm_page_inactive_target,
162 * then the pageout daemon starts running.
163 */
164
165 #ifndef VM_PAGE_INACTIVE_TARGET
166 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
167 #endif /* VM_PAGE_INACTIVE_TARGET */
168
169 /*
170 * Once the pageout daemon starts running, it keeps going
171 * until vm_page_free_count meets or exceeds vm_page_free_target.
172 */
173
174 #ifndef VM_PAGE_FREE_TARGET
175 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
176 #endif /* VM_PAGE_FREE_TARGET */
177
178 /*
179 * The pageout daemon always starts running once vm_page_free_count
180 * falls below vm_page_free_min.
181 */
182
183 #ifndef VM_PAGE_FREE_MIN
184 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
185 #endif /* VM_PAGE_FREE_MIN */
186
187 /*
188 * When vm_page_free_count falls below vm_page_free_reserved,
189 * only vm-privileged threads can allocate pages. vm-privilege
190 * allows the pageout daemon and default pager (and any other
191 * associated threads needed for default pageout) to continue
192 * operation by dipping into the reserved pool of pages.
193 */
194
195 #ifndef VM_PAGE_FREE_RESERVED
196 #define VM_PAGE_FREE_RESERVED(n) \
197 ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
198 #endif /* VM_PAGE_FREE_RESERVED */
199
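/*
 * Illustrative sketch (not compiled in): how the free-page thresholds above
 * relate to one another, mirroring the arithmetic that vm_page_free_reserve()
 * performs later in this file. The page count below is purely hypothetical,
 * and "example_compute_free_thresholds" is not a real routine.
 */
#if 0
static void
example_compute_free_thresholds(void)
{
	unsigned int free_pages = 100000;	/* hypothetical size of the free pool */
	unsigned int reserved, free_after_reserve;
	unsigned int free_min, free_target;

	/* pages held back for vm-privileged threads (extra request n == 0 here) */
	reserved = VM_PAGE_FREE_RESERVED(0);		/* 6 * 16 + 0 = 96 */
	free_after_reserve = free_pages - reserved;

	/* the pageout daemon always starts once we drop below free_min... */
	free_min = reserved + VM_PAGE_FREE_MIN(free_after_reserve);

	/* ...and keeps running until the free count reaches free_target */
	free_target = reserved + VM_PAGE_FREE_TARGET(free_after_reserve);

	assert(free_target > free_min);
}
#endif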
200
201 /*
202 * must hold the page queues lock to
203 * manipulate this structure
204 */
205 struct vm_pageout_queue {
206 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
207 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
208 unsigned int pgo_maxlaundry;
209
210 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
211 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
212 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
213 :0;
214 };
215
216 #define VM_PAGE_Q_THROTTLED(q) \
217 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
218
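/*
 * Hedged usage sketch (not compiled in): how the VM_PAGE_Q_THROTTLED() check
 * interacts with the laundry accounting. vm_pageout_cluster() increments
 * pgo_laundry when it queues a page and vm_pageout_throttle_up() decrements
 * it when the page comes back; vm_pageout_scan() consults this macro to skip
 * dirty pages bound for a throttled queue. The queues referenced here are
 * declared later in this file; "example_queue_throttle_check" is hypothetical.
 */
#if 0
static void
example_queue_throttle_check(vm_page_t m)
{
	struct vm_pageout_queue	*q;

	/* pick the queue the page would be laundered through */
	if (m->object->internal == TRUE)
		q = &vm_pageout_queue_internal;
	else
		q = &vm_pageout_queue_external;

	if (VM_PAGE_Q_THROTTLED(q)) {
		/*
		 * Too many pageouts already in flight to this pager;
		 * the real scan code requeues the page rather than
		 * calling vm_pageout_cluster() right now.
		 */
		return;
	}
	vm_pageout_cluster(m);		/* bumps q->pgo_laundry */
}
#endif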
219
220 /*
221  * Exported variable used to broadcast the activation of the pageout scan.
222 * Working Set uses this to throttle its use of pmap removes. In this
223 * way, code which runs within memory in an uncontested context does
224 * not keep encountering soft faults.
225 */
226
227 unsigned int vm_pageout_scan_event_counter = 0;
228
229 /*
230 * Forward declarations for internal routines.
231 */
232
233 static void vm_pageout_garbage_collect(int);
234 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
235 static void vm_pageout_iothread_external(void);
236 static void vm_pageout_iothread_internal(void);
237 static void vm_pageout_queue_steal(vm_page_t);
238
239 extern void vm_pageout_continue(void);
240 extern void vm_pageout_scan(void);
241
242 unsigned int vm_pageout_reserved_internal = 0;
243 unsigned int vm_pageout_reserved_really = 0;
244
245 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
246 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
247 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
248 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
249 unsigned int vm_pageout_deadlock_relief = 0;
250 unsigned int vm_pageout_inactive_relief = 0;
251 unsigned int vm_pageout_burst_active_throttle = 0;
252 unsigned int vm_pageout_burst_inactive_throttle = 0;
253
254 /*
255 * Protection against zero fill flushing live working sets derived
256 * from existing backing store and files
257 */
258 unsigned int vm_accellerate_zf_pageout_trigger = 400;
259 unsigned int vm_zf_iterator;
260 unsigned int vm_zf_iterator_count = 40;
261 unsigned int last_page_zf;
262 unsigned int vm_zf_count = 0;
263
264 /*
265 * These variables record the pageout daemon's actions:
266 * how many pages it looks at and what happens to those pages.
267 * No locking needed because only one thread modifies the variables.
268 */
269
270 unsigned int vm_pageout_active = 0; /* debugging */
271 unsigned int vm_pageout_inactive = 0; /* debugging */
272 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
273 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
274 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
275 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
276 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
277 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
278 unsigned int vm_pageout_inactive_used = 0; /* debugging */
279 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
280 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
281 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
282 unsigned int vm_pageout_purged_objects = 0; /* debugging */
283 unsigned int vm_stat_discard = 0; /* debugging */
284 unsigned int vm_stat_discard_sent = 0; /* debugging */
285 unsigned int vm_stat_discard_failure = 0; /* debugging */
286 unsigned int vm_stat_discard_throttle = 0; /* debugging */
287
288 unsigned int vm_pageout_scan_active_throttled = 0;
289 unsigned int vm_pageout_scan_inactive_throttled = 0;
290 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
291 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
292 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
293 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
294 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
295 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
296 /*
297  * Backing store throttle, applied when backing store (BS) is exhausted
298 */
299 unsigned int vm_backing_store_low = 0;
300
301 unsigned int vm_pageout_out_of_line = 0;
302 unsigned int vm_pageout_in_place = 0;
303
304 /*
305 * ENCRYPTED SWAP:
306 * counters and statistics...
307 */
308 unsigned long vm_page_decrypt_counter = 0;
309 unsigned long vm_page_decrypt_for_upl_counter = 0;
310 unsigned long vm_page_encrypt_counter = 0;
311 unsigned long vm_page_encrypt_abort_counter = 0;
312 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
313 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
314
315
316 struct vm_pageout_queue vm_pageout_queue_internal;
317 struct vm_pageout_queue vm_pageout_queue_external;
318
319
320 /*
321 * Routine: vm_backing_store_disable
322 * Purpose:
323 * Suspend non-privileged threads wishing to extend
324 * backing store when we are low on backing store
325 * (Synchronized by caller)
326 */
327 void
328 vm_backing_store_disable(
329 boolean_t disable)
330 {
331 if(disable) {
332 vm_backing_store_low = 1;
333 } else {
334 if(vm_backing_store_low) {
335 vm_backing_store_low = 0;
336 thread_wakeup((event_t) &vm_backing_store_low);
337 }
338 }
339 }
340
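/*
 * Hedged sketch (not compiled in): how a non-privileged thread wishing to
 * extend backing store might wait for the condition that
 * vm_backing_store_disable(FALSE) signals above. The actual wait sites live
 * elsewhere in the VM code and hold their own locks before re-checking;
 * "example_wait_for_backing_store" is a hypothetical name.
 */
#if 0
static void
example_wait_for_backing_store(void)
{
	if (vm_backing_store_low) {
		/* sleep until thread_wakeup() is issued on this event */
		assert_wait((event_t) &vm_backing_store_low, THREAD_UNINT);
		thread_block(THREAD_CONTINUE_NULL);
	}
}
#endif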
341
342 /*
343 * Routine: vm_pageout_object_allocate
344 * Purpose:
345 * Allocate an object for use as out-of-line memory in a
346 * data_return/data_initialize message.
347 * The page must be in an unlocked object.
348 *
349 * If the page belongs to a trusted pager, cleaning in place
350 * will be used, which utilizes a special "pageout object"
351 * containing private alias pages for the real page frames.
352 * Untrusted pagers use normal out-of-line memory.
353 */
354 vm_object_t
355 vm_pageout_object_allocate(
356 vm_page_t m,
357 vm_size_t size,
358 vm_object_offset_t offset)
359 {
360 vm_object_t object = m->object;
361 vm_object_t new_object;
362
363 assert(object->pager_ready);
364
365 new_object = vm_object_allocate(size);
366
367 if (object->pager_trusted) {
368 assert (offset < object->size);
369
370 vm_object_lock(new_object);
371 new_object->pageout = TRUE;
372 new_object->shadow = object;
373 new_object->can_persist = FALSE;
374 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
375 new_object->shadow_offset = offset;
376 vm_object_unlock(new_object);
377
378 /*
379 * Take a paging reference on the object. This will be dropped
380 * in vm_pageout_object_terminate()
381 */
382 vm_object_lock(object);
383 vm_object_paging_begin(object);
384 vm_page_lock_queues();
385 vm_page_unlock_queues();
386 vm_object_unlock(object);
387
388 vm_pageout_in_place++;
389 } else
390 vm_pageout_out_of_line++;
391 return(new_object);
392 }
393
394 #if MACH_CLUSTER_STATS
395 unsigned long vm_pageout_cluster_dirtied = 0;
396 unsigned long vm_pageout_cluster_cleaned = 0;
397 unsigned long vm_pageout_cluster_collisions = 0;
398 unsigned long vm_pageout_cluster_clusters = 0;
399 unsigned long vm_pageout_cluster_conversions = 0;
400 unsigned long vm_pageout_target_collisions = 0;
401 unsigned long vm_pageout_target_page_dirtied = 0;
402 unsigned long vm_pageout_target_page_freed = 0;
403 #define CLUSTER_STAT(clause) clause
404 #else /* MACH_CLUSTER_STATS */
405 #define CLUSTER_STAT(clause)
406 #endif /* MACH_CLUSTER_STATS */
407
408 /*
409 * Routine: vm_pageout_object_terminate
410 * Purpose:
411 * Destroy the pageout_object allocated by
412 * vm_pageout_object_allocate(), and perform all of the
413 * required cleanup actions.
414 *
415 * In/Out conditions:
416 * The object must be locked, and will be returned locked.
417 */
418 void
419 vm_pageout_object_terminate(
420 vm_object_t object)
421 {
422 vm_object_t shadow_object;
423 boolean_t shadow_internal;
424
425 /*
426 * Deal with the deallocation (last reference) of a pageout object
427 * (used for cleaning-in-place) by dropping the paging references/
428 * freeing pages in the original object.
429 */
430
431 assert(object->pageout);
432 shadow_object = object->shadow;
433 vm_object_lock(shadow_object);
434 shadow_internal = shadow_object->internal;
435
436 while (!queue_empty(&object->memq)) {
437 vm_page_t p, m;
438 vm_object_offset_t offset;
439
440 p = (vm_page_t) queue_first(&object->memq);
441
442 assert(p->private);
443 assert(p->pageout);
444 p->pageout = FALSE;
445 assert(!p->cleaning);
446
447 offset = p->offset;
448 VM_PAGE_FREE(p);
449 p = VM_PAGE_NULL;
450
451 m = vm_page_lookup(shadow_object,
452 offset + object->shadow_offset);
453
454 if(m == VM_PAGE_NULL)
455 continue;
456 assert(m->cleaning);
457 /* used as a trigger on upl_commit etc to recognize the */
458                 /* pageout daemon's subsequent desire to pageout a cleaning */
459 /* page. When the bit is on the upl commit code will */
460 /* respect the pageout bit in the target page over the */
461 /* caller's page list indication */
462 m->dump_cleaning = FALSE;
463
464 /*
465 * Account for the paging reference taken when
466 * m->cleaning was set on this page.
467 */
468 vm_object_paging_end(shadow_object);
469 assert((m->dirty) || (m->precious) ||
470 (m->busy && m->cleaning));
471
472 /*
473 * Handle the trusted pager throttle.
474 * Also decrement the burst throttle (if external).
475 */
476 vm_page_lock_queues();
477 if (m->laundry) {
478 vm_pageout_throttle_up(m);
479 }
480
481 /*
482 * Handle the "target" page(s). These pages are to be freed if
483 * successfully cleaned. Target pages are always busy, and are
484                  * wired exactly once. The initial target pages are not mapped
485                  * (so they cannot be referenced or modified), but converted target
486 * pages may have been modified between the selection as an
487 * adjacent page and conversion to a target.
488 */
489 if (m->pageout) {
490 assert(m->busy);
491 assert(m->wire_count == 1);
492 m->cleaning = FALSE;
493 m->pageout = FALSE;
494 #if MACH_CLUSTER_STATS
495 if (m->wanted) vm_pageout_target_collisions++;
496 #endif
497 /*
498 * Revoke all access to the page. Since the object is
499 * locked, and the page is busy, this prevents the page
500 * from being dirtied after the pmap_disconnect() call
501 * returns.
502 *
503                          * Since the page is left "dirty" but "not modified", we
504 * can detect whether the page was redirtied during
505 * pageout by checking the modify state.
506 */
507 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
508 m->dirty = TRUE;
509 else
510 m->dirty = FALSE;
511
512 if (m->dirty) {
513 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
514 vm_page_unwire(m);/* reactivates */
515 VM_STAT(reactivations++);
516 PAGE_WAKEUP_DONE(m);
517 } else {
518 CLUSTER_STAT(vm_pageout_target_page_freed++;)
519 vm_page_free(m);/* clears busy, etc. */
520 }
521 vm_page_unlock_queues();
522 continue;
523 }
524 /*
525 * Handle the "adjacent" pages. These pages were cleaned in
526 * place, and should be left alone.
527                  * If the page was referenced while it was being cleaned,
528                  * make it active again; otherwise deactivate it.
529 */
530 if (!m->active && !m->inactive && !m->private) {
531 if (m->reference)
532 vm_page_activate(m);
533 else
534 vm_page_deactivate(m);
535 }
536 if((m->busy) && (m->cleaning)) {
537
538 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
539 m->busy = FALSE;
540
541 /* We do not re-set m->dirty ! */
542 /* The page was busy so no extraneous activity */
543 /* could have occurred. COPY_INTO is a read into the */
544 /* new pages. CLEAN_IN_PLACE does actually write */
545 /* out the pages but handling outside of this code */
546 /* will take care of resetting dirty. We clear the */
547                         /* modify bit, however, for the Programmed I/O case. */
548 pmap_clear_modify(m->phys_page);
549 if(m->absent) {
550 m->absent = FALSE;
551 if(shadow_object->absent_count == 1)
552 vm_object_absent_release(shadow_object);
553 else
554 shadow_object->absent_count--;
555 }
556 m->overwriting = FALSE;
557 } else if (m->overwriting) {
558 /* alternate request page list, write to page_list */
559 /* case. Occurs when the original page was wired */
560 /* at the time of the list request */
561 assert(m->wire_count != 0);
562 vm_page_unwire(m);/* reactivates */
563 m->overwriting = FALSE;
564 } else {
565 /*
566 * Set the dirty state according to whether or not the page was
567 * modified during the pageout. Note that we purposefully do
568 * NOT call pmap_clear_modify since the page is still mapped.
569                          * If the page were to be dirtied between the 2 calls,
570 * this fact would be lost. This code is only necessary to
571 * maintain statistics, since the pmap module is always
572 * consulted if m->dirty is false.
573 */
574 #if MACH_CLUSTER_STATS
575 m->dirty = pmap_is_modified(m->phys_page);
576
577 if (m->dirty) vm_pageout_cluster_dirtied++;
578 else vm_pageout_cluster_cleaned++;
579 if (m->wanted) vm_pageout_cluster_collisions++;
580 #else
581 m->dirty = 0;
582 #endif
583 }
584 m->cleaning = FALSE;
585
586 /*
587 * Wakeup any thread waiting for the page to be un-cleaning.
588 */
589 PAGE_WAKEUP(m);
590 vm_page_unlock_queues();
591 }
592 /*
593 * Account for the paging reference taken in vm_paging_object_allocate.
594 */
595 vm_object_paging_end(shadow_object);
596 vm_object_unlock(shadow_object);
597
598 assert(object->ref_count == 0);
599 assert(object->paging_in_progress == 0);
600 assert(object->resident_page_count == 0);
601 return;
602 }
603
604 /*
605 * Routine: vm_pageout_setup
606 * Purpose:
607 * Set up a page for pageout (clean & flush).
608 *
609 * Move the page to a new object, as part of which it will be
610 * sent to its memory manager in a memory_object_data_write or
611 * memory_object_initialize message.
612 *
613 * The "new_object" and "new_offset" arguments
614 * indicate where the page should be moved.
615 *
616 * In/Out conditions:
617 * The page in question must not be on any pageout queues,
618 * and must be busy. The object to which it belongs
619 * must be unlocked, and the caller must hold a paging
620 * reference to it. The new_object must not be locked.
621 *
622 * This routine returns a pointer to a place-holder page,
623 * inserted at the same offset, to block out-of-order
624 * requests for the page. The place-holder page must
625 * be freed after the data_write or initialize message
626 * has been sent.
627 *
628 * The original page is put on a paging queue and marked
629 * not busy on exit.
630 */
631 vm_page_t
632 vm_pageout_setup(
633 register vm_page_t m,
634 register vm_object_t new_object,
635 vm_object_offset_t new_offset)
636 {
637 register vm_object_t old_object = m->object;
638 vm_object_offset_t paging_offset;
639 vm_object_offset_t offset;
640 register vm_page_t holding_page;
641 register vm_page_t new_m;
642 boolean_t need_to_wire = FALSE;
643
644
645 XPR(XPR_VM_PAGEOUT,
646 "vm_pageout_setup, obj 0x%X off 0x%X page 0x%X new obj 0x%X offset 0x%X\n",
647 (integer_t)m->object, (integer_t)m->offset,
648 (integer_t)m, (integer_t)new_object,
649 (integer_t)new_offset);
650 assert(m && m->busy && !m->absent && !m->fictitious && !m->error &&
651 !m->restart);
652
653 assert(m->dirty || m->precious);
654
655 /*
656 * Create a place-holder page where the old one was, to prevent
657 * attempted pageins of this page while we're unlocked.
658 */
659 VM_PAGE_GRAB_FICTITIOUS(holding_page);
660
661 vm_object_lock(old_object);
662
663 offset = m->offset;
664 paging_offset = offset + old_object->paging_offset;
665
666 if (old_object->pager_trusted) {
667 /*
668 * This pager is trusted, so we can clean this page
669 * in place. Leave it in the old object, and mark it
670 * cleaning & pageout.
671 */
672 new_m = holding_page;
673 holding_page = VM_PAGE_NULL;
674
675 /*
676 * Set up new page to be private shadow of real page.
677 */
678 new_m->phys_page = m->phys_page;
679 new_m->fictitious = FALSE;
680 new_m->pageout = TRUE;
681
682 /*
683 * Mark real page as cleaning (indicating that we hold a
684 * paging reference to be released via m_o_d_r_c) and
685 * pageout (indicating that the page should be freed
686 * when the pageout completes).
687 */
688 pmap_clear_modify(m->phys_page);
689 vm_page_lock_queues();
690 new_m->private = TRUE;
691 vm_page_wire(new_m);
692 m->cleaning = TRUE;
693 m->pageout = TRUE;
694
695 vm_page_wire(m);
696 assert(m->wire_count == 1);
697 vm_page_unlock_queues();
698
699 m->dirty = TRUE;
700 m->precious = FALSE;
701 m->page_lock = VM_PROT_NONE;
702 m->unusual = FALSE;
703 m->unlock_request = VM_PROT_NONE;
704 } else {
705 /*
706 * Cannot clean in place, so rip the old page out of the
707 * object, and stick the holding page in. Set new_m to the
708 * page in the new object.
709 */
710 vm_page_lock_queues();
711 VM_PAGE_QUEUES_REMOVE(m);
712 vm_page_remove(m);
713
714 vm_page_insert(holding_page, old_object, offset);
715 vm_page_unlock_queues();
716
717 m->dirty = TRUE;
718 m->precious = FALSE;
719 new_m = m;
720 new_m->page_lock = VM_PROT_NONE;
721 new_m->unlock_request = VM_PROT_NONE;
722
723 if (old_object->internal)
724 need_to_wire = TRUE;
725 }
726 /*
727 * Record that this page has been written out
728 */
729 #if MACH_PAGEMAP
730 vm_external_state_set(old_object->existence_map, offset);
731 #endif /* MACH_PAGEMAP */
732
733 vm_object_unlock(old_object);
734
735 vm_object_lock(new_object);
736
737 /*
738          * Put the page into the new object. If it is not wired
739          * (i.e., if it is the real page), it will be activated.
740 */
741
742 vm_page_lock_queues();
743 vm_page_insert(new_m, new_object, new_offset);
744 if (need_to_wire)
745 vm_page_wire(new_m);
746 else
747 vm_page_activate(new_m);
748 PAGE_WAKEUP_DONE(new_m);
749 vm_page_unlock_queues();
750
751 vm_object_unlock(new_object);
752
753 /*
754 * Return the placeholder page to simplify cleanup.
755 */
756 return (holding_page);
757 }
758
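/*
 * Hedged caller sketch (not compiled in): the protocol described in the
 * header comment of vm_pageout_setup() above. The returned place-holder
 * blocks out-of-order requests and must be freed once the data_write (or
 * data_initialize) message has been sent; for a trusted pager the routine
 * returns VM_PAGE_NULL instead. "example_use_pageout_setup" is hypothetical.
 */
#if 0
static void
example_use_pageout_setup(vm_page_t m, vm_object_t new_object)
{
	vm_page_t	holding_page;

	/* the caller holds a paging reference on m's (unlocked) object */
	holding_page = vm_pageout_setup(m, new_object, (vm_object_offset_t) 0);

	/* ... name new_object in a memory_object_data_write message ... */

	if (holding_page != VM_PAGE_NULL)
		VM_PAGE_FREE(holding_page);	/* discard the place-holder */
}
#endif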
759 /*
760 * Routine: vm_pageclean_setup
761 *
762 * Purpose: setup a page to be cleaned (made non-dirty), but not
763 * necessarily flushed from the VM page cache.
764 * This is accomplished by cleaning in place.
765 *
766 * The page must not be busy, and the object and page
767 * queues must be locked.
768 *
769 */
770 void
771 vm_pageclean_setup(
772 vm_page_t m,
773 vm_page_t new_m,
774 vm_object_t new_object,
775 vm_object_offset_t new_offset)
776 {
777 vm_object_t old_object = m->object;
778 assert(!m->busy);
779 assert(!m->cleaning);
780
781 XPR(XPR_VM_PAGEOUT,
782 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
783 (integer_t)old_object, m->offset, (integer_t)m,
784 (integer_t)new_m, new_offset);
785
786 pmap_clear_modify(m->phys_page);
787 vm_object_paging_begin(old_object);
788
789 /*
790 * Record that this page has been written out
791 */
792 #if MACH_PAGEMAP
793 vm_external_state_set(old_object->existence_map, m->offset);
794 #endif /*MACH_PAGEMAP*/
795
796 /*
797 * Mark original page as cleaning in place.
798 */
799 m->cleaning = TRUE;
800 m->dirty = TRUE;
801 m->precious = FALSE;
802
803 /*
804 * Convert the fictitious page to a private shadow of
805 * the real page.
806 */
807 assert(new_m->fictitious);
808 new_m->fictitious = FALSE;
809 new_m->private = TRUE;
810 new_m->pageout = TRUE;
811 new_m->phys_page = m->phys_page;
812 vm_page_wire(new_m);
813
814 vm_page_insert(new_m, new_object, new_offset);
815 assert(!new_m->wanted);
816 new_m->busy = FALSE;
817 }
818
819 void
820 vm_pageclean_copy(
821 vm_page_t m,
822 vm_page_t new_m,
823 vm_object_t new_object,
824 vm_object_offset_t new_offset)
825 {
826 XPR(XPR_VM_PAGEOUT,
827 "vm_pageclean_copy, page 0x%X new_m 0x%X new_obj 0x%X offset 0x%X\n",
828 m, new_m, new_object, new_offset, 0);
829
830 assert((!m->busy) && (!m->cleaning));
831
832 assert(!new_m->private && !new_m->fictitious);
833
834 pmap_clear_modify(m->phys_page);
835
836 m->busy = TRUE;
837 vm_object_paging_begin(m->object);
838 vm_page_unlock_queues();
839 vm_object_unlock(m->object);
840
841 /*
842 * Copy the original page to the new page.
843 */
844 vm_page_copy(m, new_m);
845
846 /*
847 * Mark the old page as clean. A request to pmap_is_modified
848 * will get the right answer.
849 */
850 vm_object_lock(m->object);
851 m->dirty = FALSE;
852
853 vm_object_paging_end(m->object);
854
855 vm_page_lock_queues();
856 if (!m->active && !m->inactive)
857 vm_page_activate(m);
858 PAGE_WAKEUP_DONE(m);
859
860 vm_page_insert(new_m, new_object, new_offset);
861 vm_page_activate(new_m);
862 new_m->busy = FALSE; /* No other thread can be waiting */
863 }
864
865
866 /*
867 * Routine: vm_pageout_initialize_page
868 * Purpose:
869 * Causes the specified page to be initialized in
870 * the appropriate memory object. This routine is used to push
871 * pages into a copy-object when they are modified in the
872 * permanent object.
873 *
874 * The page is moved to a temporary object and paged out.
875 *
876 * In/out conditions:
877 * The page in question must not be on any pageout queues.
878 * The object to which it belongs must be locked.
879 * The page must be busy, but not hold a paging reference.
880 *
881 * Implementation:
882 * Move this page to a completely new object.
883 */
884 void
885 vm_pageout_initialize_page(
886 vm_page_t m)
887 {
888 vm_object_t object;
889 vm_object_offset_t paging_offset;
890 vm_page_t holding_page;
891
892
893 XPR(XPR_VM_PAGEOUT,
894 "vm_pageout_initialize_page, page 0x%X\n",
895 (integer_t)m, 0, 0, 0, 0);
896 assert(m->busy);
897
898 /*
899 * Verify that we really want to clean this page
900 */
901 assert(!m->absent);
902 assert(!m->error);
903 assert(m->dirty);
904
905 /*
906 * Create a paging reference to let us play with the object.
907 */
908 object = m->object;
909 paging_offset = m->offset + object->paging_offset;
910 vm_object_paging_begin(object);
911 if (m->absent || m->error || m->restart ||
912 (!m->dirty && !m->precious)) {
913 VM_PAGE_FREE(m);
914 panic("reservation without pageout?"); /* alan */
915 vm_object_unlock(object);
916 return;
917 }
918
919 /* set the page for future call to vm_fault_list_request */
920 holding_page = NULL;
921 vm_page_lock_queues();
922 pmap_clear_modify(m->phys_page);
923 m->dirty = TRUE;
924 m->busy = TRUE;
925 m->list_req_pending = TRUE;
926 m->cleaning = TRUE;
927 m->pageout = TRUE;
928 vm_page_wire(m);
929 vm_page_unlock_queues();
930 vm_object_unlock(object);
931
932 /*
933 * Write the data to its pager.
934 * Note that the data is passed by naming the new object,
935 * not a virtual address; the pager interface has been
936 * manipulated to use the "internal memory" data type.
937 * [The object reference from its allocation is donated
938 * to the eventual recipient.]
939 */
940 memory_object_data_initialize(object->pager,
941 paging_offset,
942 PAGE_SIZE);
943
944 vm_object_lock(object);
945 }
946
947 #if MACH_CLUSTER_STATS
948 #define MAXCLUSTERPAGES 16
949 struct {
950 unsigned long pages_in_cluster;
951 unsigned long pages_at_higher_offsets;
952 unsigned long pages_at_lower_offsets;
953 } cluster_stats[MAXCLUSTERPAGES];
954 #endif /* MACH_CLUSTER_STATS */
955
956 boolean_t allow_clustered_pageouts = FALSE;
957
958 /*
959 * vm_pageout_cluster:
960 *
961 * Given a page, queue it to the appropriate I/O thread,
962 * which will page it out and attempt to clean adjacent pages
963 * in the same operation.
964 *
965 * The page must be busy, and the object and queues locked. We will take a
966 * paging reference to prevent deallocation or collapse when we
967 * release the object lock back at the call site. The I/O thread
968  * is responsible for consuming this reference.
969 *
970 * The page must not be on any pageout queue.
971 */
972
973 void
974 vm_pageout_cluster(vm_page_t m)
975 {
976 vm_object_t object = m->object;
977 struct vm_pageout_queue *q;
978
979
980 XPR(XPR_VM_PAGEOUT,
981 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
982 (integer_t)object, m->offset, (integer_t)m, 0, 0);
983
984 /*
985 * Only a certain kind of page is appreciated here.
986 */
987 assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
988 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
989
990 /*
991 * protect the object from collapse -
992 * locking in the object's paging_offset.
993 */
994 vm_object_paging_begin(object);
995
996 /*
997          * set the page up for a future call to vm_fault_list_request;
998          * the page should already be marked busy
999 */
1000 vm_page_wire(m);
1001 m->list_req_pending = TRUE;
1002 m->cleaning = TRUE;
1003 m->pageout = TRUE;
1004 m->laundry = TRUE;
1005
1006 if (object->internal == TRUE)
1007 q = &vm_pageout_queue_internal;
1008 else
1009 q = &vm_pageout_queue_external;
1010 q->pgo_laundry++;
1011
1012 m->pageout_queue = TRUE;
1013 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
1014
1015 if (q->pgo_idle == TRUE) {
1016 q->pgo_idle = FALSE;
1017 thread_wakeup((event_t) &q->pgo_pending);
1018 }
1019 }
1020
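/*
 * Hedged sketch (not compiled in): the caller-side contract spelled out in
 * the header comment of vm_pageout_cluster() above, roughly the state that
 * vm_pageout_scan() establishes before handing a page over.
 * "example_queue_page_for_pageout" is a hypothetical name.
 */
#if 0
static void
example_queue_page_for_pageout(vm_page_t m)
{
	vm_object_t	object = m->object;

	vm_object_lock(object);
	vm_page_lock_queues();

	assert(m->busy);			/* caller marked the page busy */
	assert(m->dirty || m->precious);
	assert(m->wire_count == 0);
	assert(!m->active && !m->inactive);	/* already off the pageout queues */

	vm_pageout_cluster(m);			/* takes a paging reference and
						 * queues the page for the iothread */
	vm_page_unlock_queues();
	vm_object_unlock(object);		/* the iothread consumes the reference */
}
#endif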
1021
1022 unsigned long vm_pageout_throttle_up_count = 0;
1023
1024 /*
1025 * A page is back from laundry. See if there are some pages waiting to
1026 * go to laundry and if we can let some of them go now.
1027 *
1028 * Object and page queues must be locked.
1029 */
1030 void
1031 vm_pageout_throttle_up(
1032 vm_page_t m)
1033 {
1034 struct vm_pageout_queue *q;
1035
1036 vm_pageout_throttle_up_count++;
1037
1038 assert(m->laundry);
1039 assert(m->object != VM_OBJECT_NULL);
1040 assert(m->object != kernel_object);
1041
1042 if (m->object->internal == TRUE)
1043 q = &vm_pageout_queue_internal;
1044 else
1045 q = &vm_pageout_queue_external;
1046
1047 m->laundry = FALSE;
1048 q->pgo_laundry--;
1049
1050 if (q->pgo_throttled == TRUE) {
1051 q->pgo_throttled = FALSE;
1052 thread_wakeup((event_t) &q->pgo_laundry);
1053 }
1054 }
1055
1056
1057 /*
1058 * vm_pageout_scan does the dirty work for the pageout daemon.
1059 * It returns with vm_page_queue_free_lock held and
1060 * vm_page_free_wanted == 0.
1061 */
1062
1063 #define DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
1064
1065 #define FCS_IDLE 0
1066 #define FCS_DELAYED 1
1067 #define FCS_DEADLOCK_DETECTED 2
1068
1069 struct flow_control {
1070 int state;
1071 mach_timespec_t ts;
1072 };
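/*
 * Hedged sketch (not compiled in) of the delayed-unlock batching pattern
 * vm_pageout_scan() uses below: reclaimed pages are chained onto a local
 * list, and only every DELAYED_UNLOCK_LIMIT iterations is the page queues
 * lock dropped, the batch freed, and a mutex_pause() taken so contending
 * threads can run. "example_delayed_unlock_batch" is a hypothetical name.
 */
#if 0
static void
example_delayed_unlock_batch(void)
{
	vm_page_t	local_freeq = VM_PAGE_NULL;
	int		delayed_unlock = 0;

	vm_page_lock_queues();

	for (;;) {
		/*
		 * ... examine a page; if it is reclaimable, chain it onto
		 * local_freeq via its pageq.next link ...
		 */
		if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
			if (local_freeq) {
				vm_page_free_list(local_freeq);
				local_freeq = VM_PAGE_NULL;
			}
			delayed_unlock = 0;
			vm_page_unlock_queues();

			mutex_pause();

			vm_page_lock_queues();
		}
		/* ... loop ends when the scan's own termination tests fire ... */
	}
}
#endif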
1073
1074 void
1075 vm_pageout_scan(void)
1076 {
1077 unsigned int loop_count = 0;
1078 unsigned int inactive_burst_count = 0;
1079 unsigned int active_burst_count = 0;
1080 vm_page_t local_freeq = 0;
1081 int local_freed = 0;
1082 int delayed_unlock = 0;
1083 int need_internal_inactive = 0;
1084 int refmod_state = 0;
1085 int vm_pageout_deadlock_target = 0;
1086 struct vm_pageout_queue *iq;
1087 struct vm_pageout_queue *eq;
1088 struct flow_control flow_control;
1089 boolean_t active_throttled = FALSE;
1090 boolean_t inactive_throttled = FALSE;
1091 mach_timespec_t ts;
1092 unsigned int msecs = 0;
1093 vm_object_t object;
1094
1095
1096 flow_control.state = FCS_IDLE;
1097 iq = &vm_pageout_queue_internal;
1098 eq = &vm_pageout_queue_external;
1099
1100 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1101
1102 /*???*/ /*
1103 * We want to gradually dribble pages from the active queue
1104 * to the inactive queue. If we let the inactive queue get
1105 * very small, and then suddenly dump many pages into it,
1106 * those pages won't get a sufficient chance to be referenced
1107 * before we start taking them from the inactive queue.
1108 *
1109 * We must limit the rate at which we send pages to the pagers.
1110 * data_write messages consume memory, for message buffers and
1111 * for map-copy objects. If we get too far ahead of the pagers,
1112 * we can potentially run out of memory.
1113 *
1114 * We can use the laundry count to limit directly the number
1115 * of pages outstanding to the default pager. A similar
1116 * strategy for external pagers doesn't work, because
1117 * external pagers don't have to deallocate the pages sent them,
1118 * and because we might have to send pages to external pagers
1119 * even if they aren't processing writes. So we also
1120 * use a burst count to limit writes to external pagers.
1121 *
1122 * When memory is very tight, we can't rely on external pagers to
1123 * clean pages. They probably aren't running, because they
1124 * aren't vm-privileged. If we kept sending dirty pages to them,
1125 * we could exhaust the free list.
1126 */
1127 vm_page_lock_queues();
1128 delayed_unlock = 1;
1129
1130
1131 Restart:
1132 /*
1133          * Recalculate vm_page_inactive_target.
1134 */
1135 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1136 vm_page_inactive_count);
1137 object = NULL;
1138
1139 for (;;) {
1140 vm_page_t m;
1141
1142 if (delayed_unlock == 0)
1143 vm_page_lock_queues();
1144
1145 active_burst_count = vm_page_active_count;
1146
1147 if (active_burst_count > vm_pageout_burst_active_throttle)
1148 active_burst_count = vm_pageout_burst_active_throttle;
1149
1150 /*
1151 * Move pages from active to inactive.
1152 */
1153 while ((need_internal_inactive ||
1154 vm_page_inactive_count < vm_page_inactive_target) &&
1155 !queue_empty(&vm_page_queue_active) &&
1156 ((active_burst_count--) > 0)) {
1157
1158 vm_pageout_active++;
1159
1160 m = (vm_page_t) queue_first(&vm_page_queue_active);
1161
1162 assert(m->active && !m->inactive);
1163 assert(!m->laundry);
1164 assert(m->object != kernel_object);
1165
1166 /*
1167 * Try to lock object; since we've already got the
1168 * page queues lock, we can only 'try' for this one.
1169                          * If the 'try' fails, we need to do a mutex_pause
1170 * to allow the owner of the object lock a chance to
1171 * run... otherwise, we're likely to trip over this
1172 * object in the same state as we work our way through
1173 * the queue... clumps of pages associated with the same
1174 * object are fairly typical on the inactive and active queues
1175 */
1176 if (m->object != object) {
1177 if (object != NULL) {
1178 vm_object_unlock(object);
1179 object = NULL;
1180 }
1181 if (!vm_object_lock_try(m->object)) {
1182 /*
1183 * move page to end of active queue and continue
1184 */
1185 queue_remove(&vm_page_queue_active, m,
1186 vm_page_t, pageq);
1187 queue_enter(&vm_page_queue_active, m,
1188 vm_page_t, pageq);
1189
1190 goto done_with_activepage;
1191 }
1192 object = m->object;
1193 }
1194 /*
1195 * if the page is BUSY, then we pull it
1196 * off the active queue and leave it alone.
1197 * when BUSY is cleared, it will get stuck
1198 * back on the appropriate queue
1199 */
1200 if (m->busy) {
1201 queue_remove(&vm_page_queue_active, m,
1202 vm_page_t, pageq);
1203 m->pageq.next = NULL;
1204 m->pageq.prev = NULL;
1205
1206 if (!m->fictitious)
1207 vm_page_active_count--;
1208 m->active = FALSE;
1209
1210 goto done_with_activepage;
1211 }
1212 if (need_internal_inactive) {
1213 /*
1214 * If we're unable to make forward progress
1215 * with the current set of pages on the
1216 * inactive queue due to busy objects or
1217 * throttled pageout queues, then
1218 * move a page that is already clean
1219 * or belongs to a pageout queue that
1220 * isn't currently throttled
1221 */
1222 active_throttled = FALSE;
1223
1224 if (object->internal) {
1225 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1226 active_throttled = TRUE;
1227 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1228 active_throttled = TRUE;
1229 }
1230 if (active_throttled == TRUE) {
1231 if (!m->dirty) {
1232 refmod_state = pmap_get_refmod(m->phys_page);
1233
1234 if (refmod_state & VM_MEM_REFERENCED)
1235 m->reference = TRUE;
1236 if (refmod_state & VM_MEM_MODIFIED)
1237 m->dirty = TRUE;
1238 }
1239 if (m->dirty || m->precious) {
1240 /*
1241 * page is dirty and targets a THROTTLED queue
1242 * so all we can do is move it back to the
1243 * end of the active queue to get it out
1244 * of the way
1245 */
1246 queue_remove(&vm_page_queue_active, m,
1247 vm_page_t, pageq);
1248 queue_enter(&vm_page_queue_active, m,
1249 vm_page_t, pageq);
1250
1251 vm_pageout_scan_active_throttled++;
1252
1253 goto done_with_activepage;
1254 }
1255 }
1256 vm_pageout_scan_active_throttle_success++;
1257 need_internal_inactive--;
1258 }
1259 /*
1260 * Deactivate the page while holding the object
1261 * locked, so we know the page is still not busy.
1262 * This should prevent races between pmap_enter
1263 * and pmap_clear_reference. The page might be
1264 * absent or fictitious, but vm_page_deactivate
1265 * can handle that.
1266 */
1267 vm_page_deactivate(m);
1268 done_with_activepage:
1269 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1270
1271 if (object != NULL) {
1272 vm_object_unlock(object);
1273 object = NULL;
1274 }
1275 if (local_freeq) {
1276 vm_page_free_list(local_freeq);
1277
1278 local_freeq = 0;
1279 local_freed = 0;
1280 }
1281 delayed_unlock = 0;
1282 vm_page_unlock_queues();
1283
1284 mutex_pause();
1285 vm_page_lock_queues();
1286 /*
1287 * continue the while loop processing
1288 * the active queue... need to hold
1289 * the page queues lock
1290 */
1291 continue;
1292 }
1293 }
1294
1295
1296
1297 /**********************************************************************
1298 * above this point we're playing with the active queue
1299 * below this point we're playing with the throttling mechanisms
1300 * and the inactive queue
1301 **********************************************************************/
1302
1303
1304
1305 /*
1306 * We are done if we have met our target *and*
1307 * nobody is still waiting for a page.
1308 */
1309 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1310 if (object != NULL) {
1311 vm_object_unlock(object);
1312 object = NULL;
1313 }
1314 if (local_freeq) {
1315 vm_page_free_list(local_freeq);
1316
1317 local_freeq = 0;
1318 local_freed = 0;
1319 }
1320 mutex_lock(&vm_page_queue_free_lock);
1321
1322 if ((vm_page_free_count >= vm_page_free_target) &&
1323 (vm_page_free_wanted == 0)) {
1324
1325 vm_page_unlock_queues();
1326
1327 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1328 return;
1329 }
1330 mutex_unlock(&vm_page_queue_free_lock);
1331 }
1332
1333
1334 /*
1335 * Sometimes we have to pause:
1336 * 1) No inactive pages - nothing to do.
1337 * 2) Flow control - default pageout queue is full
1338 * 3) Loop control - no acceptable pages found on the inactive queue
1339 * within the last vm_pageout_burst_inactive_throttle iterations
1340 */
1341 if ((queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf))) {
1342 vm_pageout_scan_empty_throttle++;
1343 msecs = vm_pageout_empty_wait;
1344 goto vm_pageout_scan_delay;
1345
1346 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1347 vm_pageout_scan_burst_throttle++;
1348 msecs = vm_pageout_burst_wait;
1349 goto vm_pageout_scan_delay;
1350
1351 } else if (VM_PAGE_Q_THROTTLED(iq)) {
1352
1353 switch (flow_control.state) {
1354
1355 case FCS_IDLE:
1356 reset_deadlock_timer:
1357 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1358 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1359 clock_get_system_nanotime(
1360 &flow_control.ts.tv_sec,
1361 (uint32_t *) &flow_control.ts.tv_nsec);
1362 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1363
1364 flow_control.state = FCS_DELAYED;
1365 msecs = vm_pageout_deadlock_wait;
1366
1367 break;
1368
1369 case FCS_DELAYED:
1370 clock_get_system_nanotime(
1371 &ts.tv_sec,
1372 (uint32_t *) &ts.tv_nsec);
1373
1374 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1375 /*
1376 * the pageout thread for the default pager is potentially
1377 * deadlocked since the
1378 * default pager queue has been throttled for more than the
1379 * allowable time... we need to move some clean pages or dirty
1380 * pages belonging to the external pagers if they aren't throttled
1381 * vm_page_free_wanted represents the number of threads currently
1382 * blocked waiting for pages... we'll move one page for each of
1383 * these plus a fixed amount to break the logjam... once we're done
1384                                  * moving this number of pages, we'll re-enter the FCS_DELAYED state
1385 * with a new timeout target since we have no way of knowing
1386 * whether we've broken the deadlock except through observation
1387 * of the queue associated with the default pager... we need to
1388                                  * stop moving pages and allow the system to run to see what
1389 * state it settles into.
1390 */
1391 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted;
1392 vm_pageout_scan_deadlock_detected++;
1393 flow_control.state = FCS_DEADLOCK_DETECTED;
1394
1395 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1396 goto consider_inactive;
1397 }
1398 /*
1399 * just resniff instead of trying
1400 * to compute a new delay time... we're going to be
1401 * awakened immediately upon a laundry completion,
1402 * so we won't wait any longer than necessary
1403 */
1404 msecs = vm_pageout_idle_wait;
1405 break;
1406
1407 case FCS_DEADLOCK_DETECTED:
1408 if (vm_pageout_deadlock_target)
1409 goto consider_inactive;
1410 goto reset_deadlock_timer;
1411
1412 }
1413 vm_pageout_scan_throttle++;
1414 iq->pgo_throttled = TRUE;
1415 vm_pageout_scan_delay:
1416 if (object != NULL) {
1417 vm_object_unlock(object);
1418 object = NULL;
1419 }
1420 if (local_freeq) {
1421 vm_page_free_list(local_freeq);
1422
1423 local_freeq = 0;
1424 local_freed = 0;
1425 }
1426 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1427
1428 counter(c_vm_pageout_scan_block++);
1429
1430 vm_page_unlock_queues();
1431
1432 thread_block(THREAD_CONTINUE_NULL);
1433
1434 vm_page_lock_queues();
1435 delayed_unlock = 1;
1436
1437 iq->pgo_throttled = FALSE;
1438
1439 if (loop_count >= vm_page_inactive_count) {
1440 if (VM_PAGE_Q_THROTTLED(eq) || VM_PAGE_Q_THROTTLED(iq)) {
1441 /*
1442 * Make sure we move enough "appropriate"
1443 * pages to the inactive queue before trying
1444 * again.
1445 */
1446 need_internal_inactive = vm_pageout_inactive_relief;
1447 }
1448 loop_count = 0;
1449 }
1450 inactive_burst_count = 0;
1451
1452 goto Restart;
1453 /*NOTREACHED*/
1454 }
1455
1456
1457 flow_control.state = FCS_IDLE;
1458 consider_inactive:
1459 loop_count++;
1460 inactive_burst_count++;
1461 vm_pageout_inactive++;
1462
1463 if (!queue_empty(&vm_page_queue_inactive)) {
1464 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1465
1466 if (m->clustered && (m->no_isync == TRUE)) {
1467 goto use_this_page;
1468 }
1469 }
1470 if (vm_zf_count < vm_accellerate_zf_pageout_trigger) {
1471 vm_zf_iterator = 0;
1472 } else {
1473 last_page_zf = 0;
1474 if((vm_zf_iterator+=1) >= vm_zf_iterator_count) {
1475 vm_zf_iterator = 0;
1476 }
1477 }
1478 if (queue_empty(&vm_page_queue_zf) ||
1479 (((last_page_zf) || (vm_zf_iterator == 0)) &&
1480 !queue_empty(&vm_page_queue_inactive))) {
1481 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1482 last_page_zf = 0;
1483 } else {
1484 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1485 last_page_zf = 1;
1486 }
1487 use_this_page:
1488 assert(!m->active && m->inactive);
1489 assert(!m->laundry);
1490 assert(m->object != kernel_object);
1491
1492 /*
1493          * Try to lock object; since we've already got the
1494 * page queues lock, we can only 'try' for this one.
1495          * If the 'try' fails, we need to do a mutex_pause
1496 * to allow the owner of the object lock a chance to
1497 * run... otherwise, we're likely to trip over this
1498 * object in the same state as we work our way through
1499 * the queue... clumps of pages associated with the same
1500 * object are fairly typical on the inactive and active queues
1501 */
1502 if (m->object != object) {
1503 if (object != NULL) {
1504 vm_object_unlock(object);
1505 object = NULL;
1506 }
1507 if (!vm_object_lock_try(m->object)) {
1508 /*
1509 * Move page to end and continue.
1510 * Don't re-issue ticket
1511 */
1512 if (m->zero_fill) {
1513 queue_remove(&vm_page_queue_zf, m,
1514 vm_page_t, pageq);
1515 queue_enter(&vm_page_queue_zf, m,
1516 vm_page_t, pageq);
1517 } else {
1518 queue_remove(&vm_page_queue_inactive, m,
1519 vm_page_t, pageq);
1520 queue_enter(&vm_page_queue_inactive, m,
1521 vm_page_t, pageq);
1522 }
1523 vm_pageout_inactive_nolock++;
1524
1525 /*
1526 * force us to dump any collected free pages
1527 * and to pause before moving on
1528 */
1529 delayed_unlock = DELAYED_UNLOCK_LIMIT + 1;
1530
1531 goto done_with_inactivepage;
1532 }
1533 object = m->object;
1534 }
1535 /*
1536 * If the page belongs to a purgable object with no pending copies
1537 * against it, then we reap all of the pages in the object
1538 * and note that the object has been "emptied". It'll be up to the
1539          * application to discover this and recreate its contents if desired.
1540 */
1541 if ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE ||
1542 object->purgable == VM_OBJECT_PURGABLE_EMPTY) &&
1543 object->copy == VM_OBJECT_NULL) {
1544
1545 (void) vm_object_purge(object);
1546 vm_pageout_purged_objects++;
1547 /*
1548 * we've just taken all of the pages from this object,
1549 * so drop the lock now since we're not going to find
1550 * any more pages belonging to it anytime soon
1551 */
1552 vm_object_unlock(object);
1553 object = NULL;
1554
1555 inactive_burst_count = 0;
1556
1557 goto done_with_inactivepage;
1558 }
1559
1560 /*
1561 * Paging out pages of external objects which
1562 * are currently being created must be avoided.
1563          * The pager may need to allocate memory, possibly leading to a
1564          * deadlock between it and the pageout thread,
1565          * if such pages are finally chosen. The remaining assumption
1566          * is that there will eventually be enough available pages in the
1567 * inactive pool to page out in order to satisfy all memory
1568 * claimed by the thread which concurrently creates the pager.
1569 */
1570 if (!object->pager_initialized && object->pager_created) {
1571 /*
1572 * Move page to end and continue, hoping that
1573 * there will be enough other inactive pages to
1574 * page out so that the thread which currently
1575 * initializes the pager will succeed.
1576                  * Don't re-grant the ticket; the page should be
1577                  * pulled from the queue and paged out whenever
1578 * one of its logically adjacent fellows is
1579 * targeted.
1580 */
1581 if (m->zero_fill) {
1582 queue_remove(&vm_page_queue_zf, m,
1583 vm_page_t, pageq);
1584 queue_enter(&vm_page_queue_zf, m,
1585 vm_page_t, pageq);
1586 last_page_zf = 1;
1587 vm_zf_iterator = vm_zf_iterator_count - 1;
1588 } else {
1589 queue_remove(&vm_page_queue_inactive, m,
1590 vm_page_t, pageq);
1591 queue_enter(&vm_page_queue_inactive, m,
1592 vm_page_t, pageq);
1593 last_page_zf = 0;
1594 vm_zf_iterator = 1;
1595 }
1596 vm_pageout_inactive_avoid++;
1597
1598 goto done_with_inactivepage;
1599 }
1600 /*
1601 * Remove the page from the inactive list.
1602 */
1603 if (m->zero_fill) {
1604 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1605 } else {
1606 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1607 }
1608 m->pageq.next = NULL;
1609 m->pageq.prev = NULL;
1610 m->inactive = FALSE;
1611 if (!m->fictitious)
1612 vm_page_inactive_count--;
1613
1614 if (m->busy || !object->alive) {
1615 /*
1616 * Somebody is already playing with this page.
1617 * Leave it off the pageout queues.
1618 */
1619 vm_pageout_inactive_busy++;
1620
1621 goto done_with_inactivepage;
1622 }
1623
1624 /*
1625 * If it's absent or in error, we can reclaim the page.
1626 */
1627
1628 if (m->absent || m->error) {
1629 vm_pageout_inactive_absent++;
1630 reclaim_page:
1631 if (vm_pageout_deadlock_target) {
1632 vm_pageout_scan_inactive_throttle_success++;
1633 vm_pageout_deadlock_target--;
1634 }
1635 if (m->tabled)
1636 vm_page_remove(m); /* clears tabled, object, offset */
1637 if (m->absent)
1638 vm_object_absent_release(object);
1639
1640 assert(m->pageq.next == NULL &&
1641 m->pageq.prev == NULL);
1642 m->pageq.next = (queue_entry_t)local_freeq;
1643 local_freeq = m;
1644 local_freed++;
1645
1646 inactive_burst_count = 0;
1647
1648 goto done_with_inactivepage;
1649 }
1650
1651 assert(!m->private);
1652 assert(!m->fictitious);
1653
1654 /*
1655 * If already cleaning this page in place, convert from
1656 * "adjacent" to "target". We can leave the page mapped,
1657 * and vm_pageout_object_terminate will determine whether
1658 * to free or reactivate.
1659 */
1660
1661 if (m->cleaning) {
1662 m->busy = TRUE;
1663 m->pageout = TRUE;
1664 m->dump_cleaning = TRUE;
1665 vm_page_wire(m);
1666
1667 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1668
1669 inactive_burst_count = 0;
1670
1671 goto done_with_inactivepage;
1672 }
1673
1674 /*
1675 * If it's being used, reactivate.
1676 * (Fictitious pages are either busy or absent.)
1677 */
1678 if ( (!m->reference) ) {
1679 refmod_state = pmap_get_refmod(m->phys_page);
1680
1681 if (refmod_state & VM_MEM_REFERENCED)
1682 m->reference = TRUE;
1683 if (refmod_state & VM_MEM_MODIFIED)
1684 m->dirty = TRUE;
1685 }
1686 if (m->reference) {
1687 was_referenced:
1688 vm_page_activate(m);
1689 VM_STAT(reactivations++);
1690
1691 vm_pageout_inactive_used++;
1692 last_page_zf = 0;
1693 inactive_burst_count = 0;
1694
1695 goto done_with_inactivepage;
1696 }
1697
1698 XPR(XPR_VM_PAGEOUT,
1699 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1700 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1701
1702 /*
1703 * we've got a candidate page to steal...
1704 *
1705 * m->dirty is up to date courtesy of the
1706 * preceding check for m->reference... if
1707 * we get here, then m->reference had to be
1708 * FALSE which means we did a pmap_get_refmod
1709 * and updated both m->reference and m->dirty
1710 *
1711 * if it's dirty or precious we need to
1712          * see if the target queue is throttled...
1713          * if it is, we need to skip over it by moving it back
1714 * to the end of the inactive queue
1715 */
1716 inactive_throttled = FALSE;
1717
1718 if (m->dirty || m->precious) {
1719 if (object->internal) {
1720 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1721 inactive_throttled = TRUE;
1722 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1723 inactive_throttled = TRUE;
1724 }
1725 }
1726 if (inactive_throttled == TRUE) {
1727 if (m->zero_fill) {
1728 queue_enter(&vm_page_queue_zf, m,
1729 vm_page_t, pageq);
1730 } else {
1731 queue_enter(&vm_page_queue_inactive, m,
1732 vm_page_t, pageq);
1733 }
1734 if (!m->fictitious)
1735 vm_page_inactive_count++;
1736 m->inactive = TRUE;
1737
1738 vm_pageout_scan_inactive_throttled++;
1739
1740 goto done_with_inactivepage;
1741 }
1742 /*
1743 * we've got a page that we can steal...
1744 * eliminate all mappings and make sure
1745          * we have the up-to-date modified state;
1746 * first take the page BUSY, so that no new
1747 * mappings can be made
1748 */
1749 m->busy = TRUE;
1750
1751 /*
1752 * if we need to do a pmap_disconnect then we
1753 * need to re-evaluate m->dirty since the pmap_disconnect
1754 * provides the true state atomically... the
1755 * page was still mapped up to the pmap_disconnect
1756 * and may have been dirtied at the last microsecond
1757 *
1758 * we also check for the page being referenced 'late'
1759 * if it was, we first need to do a WAKEUP_DONE on it
1760 * since we already set m->busy = TRUE, before
1761 * going off to reactivate it
1762 *
1763 * if we don't need the pmap_disconnect, then
1764 * m->dirty is up to date courtesy of the
1765 * earlier check for m->reference... if
1766 * we get here, then m->reference had to be
1767 * FALSE which means we did a pmap_get_refmod
1768 * and updated both m->reference and m->dirty...
1769 */
1770 if (m->no_isync == FALSE) {
1771 refmod_state = pmap_disconnect(m->phys_page);
1772
1773 if (refmod_state & VM_MEM_MODIFIED)
1774 m->dirty = TRUE;
1775 if (refmod_state & VM_MEM_REFERENCED) {
1776 m->reference = TRUE;
1777
1778 PAGE_WAKEUP_DONE(m);
1779 goto was_referenced;
1780 }
1781 }
1782 /*
1783 * If it's clean and not precious, we can free the page.
1784 */
1785 if (!m->dirty && !m->precious) {
1786 vm_pageout_inactive_clean++;
1787 goto reclaim_page;
1788 }
1789 vm_pageout_cluster(m);
1790
1791 vm_pageout_inactive_dirty++;
1792
1793 inactive_burst_count = 0;
1794
1795 done_with_inactivepage:
1796 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1797
1798 if (object != NULL) {
1799 vm_object_unlock(object);
1800 object = NULL;
1801 }
1802 if (local_freeq) {
1803 vm_page_free_list(local_freeq);
1804
1805 local_freeq = 0;
1806 local_freed = 0;
1807 }
1808 delayed_unlock = 0;
1809 vm_page_unlock_queues();
1810 mutex_pause();
1811 }
1812 /*
1813 * back to top of pageout scan loop
1814 */
1815 }
1816 }
1817
1818
1819 int vm_page_free_count_init;
1820
1821 void
1822 vm_page_free_reserve(
1823 int pages)
1824 {
1825 int free_after_reserve;
1826
1827 vm_page_free_reserved += pages;
1828
1829 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1830
1831 vm_page_free_min = vm_page_free_reserved +
1832 VM_PAGE_FREE_MIN(free_after_reserve);
1833
1834 vm_page_free_target = vm_page_free_reserved +
1835 VM_PAGE_FREE_TARGET(free_after_reserve);
1836
1837 if (vm_page_free_target < vm_page_free_min + 5)
1838 vm_page_free_target = vm_page_free_min + 5;
1839 }
1840
1841 /*
1842 * vm_pageout is the high level pageout daemon.
1843 */
1844
1845 void
1846 vm_pageout_continue(void)
1847 {
1848 vm_pageout_scan_event_counter++;
1849 vm_pageout_scan();
1850 /* we hold vm_page_queue_free_lock now */
1851 assert(vm_page_free_wanted == 0);
1852 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
1853 mutex_unlock(&vm_page_queue_free_lock);
1854
1855 counter(c_vm_pageout_block++);
1856 thread_block((thread_continue_t)vm_pageout_continue);
1857 /*NOTREACHED*/
1858 }
1859
1860
1861 /*
1862 * must be called with the
1863 * queues and object locks held
1864 */
1865 static void
1866 vm_pageout_queue_steal(vm_page_t m)
1867 {
1868 struct vm_pageout_queue *q;
1869
1870 if (m->object->internal == TRUE)
1871 q = &vm_pageout_queue_internal;
1872 else
1873 q = &vm_pageout_queue_external;
1874
1875 m->laundry = FALSE;
1876 m->pageout_queue = FALSE;
1877 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
1878
1879 m->pageq.next = NULL;
1880 m->pageq.prev = NULL;
1881
1882 vm_object_paging_end(m->object);
1883
1884 q->pgo_laundry--;
1885 }
1886
1887
1888 #ifdef FAKE_DEADLOCK
1889
1890 #define FAKE_COUNT 5000
1891
1892 int internal_count = 0;
1893 int fake_deadlock = 0;
1894
1895 #endif
1896
1897 static void
1898 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
1899 {
1900 vm_page_t m = NULL;
1901 vm_object_t object;
1902 boolean_t need_wakeup;
1903
1904 vm_page_lock_queues();
1905
1906 while ( !queue_empty(&q->pgo_pending) ) {
1907
1908 q->pgo_busy = TRUE;
1909 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
1910 m->pageout_queue = FALSE;
1911 vm_page_unlock_queues();
1912
1913 m->pageq.next = NULL;
1914 m->pageq.prev = NULL;
1915 #ifdef FAKE_DEADLOCK
1916 if (q == &vm_pageout_queue_internal) {
1917 vm_offset_t addr;
1918 int pg_count;
1919
1920 internal_count++;
1921
1922 if ((internal_count == FAKE_COUNT)) {
1923
1924 pg_count = vm_page_free_count + vm_page_free_reserved;
1925
1926 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
1927 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
1928 }
1929 internal_count = 0;
1930 fake_deadlock++;
1931 }
1932 }
1933 #endif
1934 object = m->object;
1935
1936 if (!object->pager_initialized) {
1937 vm_object_lock(object);
1938
1939 /*
1940 * If there is no memory object for the page, create
1941 * one and hand it to the default pager.
1942 */
1943
1944 if (!object->pager_initialized)
1945 vm_object_collapse(object,
1946 (vm_object_offset_t) 0,
1947 TRUE);
1948 if (!object->pager_initialized)
1949 vm_object_pager_create(object);
1950 if (!object->pager_initialized) {
1951 /*
1952 * Still no pager for the object.
1953 * Reactivate the page.
1954 *
1955 * Should only happen if there is no
1956 * default pager.
1957 */
1958 m->list_req_pending = FALSE;
1959 m->cleaning = FALSE;
1960 m->pageout = FALSE;
1961 vm_page_unwire(m);
1962
1963 vm_pageout_throttle_up(m);
1964
1965 vm_page_lock_queues();
1966 vm_pageout_dirty_no_pager++;
1967 vm_page_activate(m);
1968 vm_page_unlock_queues();
1969
1970 /*
1971 * And we are done with it.
1972 */
1973 PAGE_WAKEUP_DONE(m);
1974
1975 vm_object_paging_end(object);
1976 vm_object_unlock(object);
1977
1978 vm_page_lock_queues();
1979 continue;
1980 } else if (object->pager == MEMORY_OBJECT_NULL) {
1981 /*
1982 * This pager has been destroyed by either
1983 * memory_object_destroy or vm_object_destroy, and
1984 * so there is nowhere for the page to go.
1985 * Just free the page... VM_PAGE_FREE takes
1986 * care of cleaning up all the state...
1987 * including doing the vm_pageout_throttle_up
1988 */
1989 VM_PAGE_FREE(m);
1990
1991 vm_object_paging_end(object);
1992 vm_object_unlock(object);
1993
1994 vm_page_lock_queues();
1995 continue;
1996 }
1997 vm_object_unlock(object);
1998 }
1999 /*
2000 * we expect the paging_in_progress reference to have
2001 * already been taken on the object before it was added
2002 * to the appropriate pageout I/O queue... this will
2003 * keep the object from being terminated and/or the
2004 * paging_offset from changing until the I/O has
2005 * completed... therefore no need to lock the object to
2006 * pull the paging_offset from it.
2007 *
2008 * Send the data to the pager.
2009 * any pageout clustering happens there
2010 */
2011 memory_object_data_return(object->pager,
2012 m->offset + object->paging_offset,
2013 PAGE_SIZE,
2014 NULL,
2015 NULL,
2016 FALSE,
2017 FALSE,
2018 0);
2019
2020 vm_object_lock(object);
2021 vm_object_paging_end(object);
2022 vm_object_unlock(object);
2023
2024 vm_page_lock_queues();
2025 }
2026 assert_wait((event_t) q, THREAD_UNINT);
2027
2028
2029 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2030 q->pgo_throttled = FALSE;
2031 need_wakeup = TRUE;
2032 } else
2033 need_wakeup = FALSE;
2034
2035 q->pgo_busy = FALSE;
2036 q->pgo_idle = TRUE;
2037 vm_page_unlock_queues();
2038
2039 if (need_wakeup == TRUE)
2040 thread_wakeup((event_t) &q->pgo_laundry);
2041
2042 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2043 /*NOTREACHED*/
2044 }
2045
2046
2047 static void
2048 vm_pageout_iothread_external(void)
2049 {
2050
2051 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2052 /*NOTREACHED*/
2053 }
2054
2055
2056 static void
2057 vm_pageout_iothread_internal(void)
2058 {
2059 thread_t self = current_thread();
2060
2061 self->options |= TH_OPT_VMPRIV;
2062
2063 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2064 /*NOTREACHED*/
2065 }
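/*
 * Each pageout queue gets its own I/O thread: the external thread
 * services file-backed (external) objects and the internal thread
 * services anonymous (internal) objects destined for the default
 * pager.  The internal thread also sets TH_OPT_VMPRIV above so that
 * it is treated as VM-privileged if it needs memory while pushing
 * pages out.
 */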
2066
2067 static void
2068 vm_pageout_garbage_collect(int collect)
2069 {
2070 if (collect) {
2071 stack_collect();
2072
2073 /*
2074 * consider_zone_gc should be last, because the other operations
2075 * might return memory to zones.
2076 */
2077 consider_machine_collect();
2078 consider_zone_gc();
2079
2080 consider_machine_adjust();
2081 }
2082
2083 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2084
2085 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2086 /*NOTREACHED*/
2087 }
2088
2089
2090
2091 void
2092 vm_pageout(void)
2093 {
2094 thread_t self = current_thread();
2095 thread_t thread;
2096 kern_return_t result;
2097 spl_t s;
2098
2099 /*
2100 * Set thread privileges.
2101 */
2102 s = splsched();
2103 thread_lock(self);
2104 self->priority = BASEPRI_PREEMPT - 1;
2105 set_sched_pri(self, self->priority);
2106 thread_unlock(self);
2107 splx(s);
2108
2109 /*
2110 * Initialize some paging parameters.
2111 */
2112
2113 if (vm_pageout_idle_wait == 0)
2114 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2115
2116 if (vm_pageout_burst_wait == 0)
2117 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2118
2119 if (vm_pageout_empty_wait == 0)
2120 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2121
2122 if (vm_pageout_deadlock_wait == 0)
2123 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2124
2125 if (vm_pageout_deadlock_relief == 0)
2126 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2127
2128 if (vm_pageout_inactive_relief == 0)
2129 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2130
2131 if (vm_pageout_burst_active_throttle == 0)
2132 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2133
2134 if (vm_pageout_burst_inactive_throttle == 0)
2135 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2136
2137 /*
2138 * Set kernel task to low backing store privileged
2139 * status
2140 */
2141 task_lock(kernel_task);
2142 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2143 task_unlock(kernel_task);
2144
2145 vm_page_free_count_init = vm_page_free_count;
2146 vm_zf_iterator = 0;
2147 /*
2148 * even if we've already called vm_page_free_reserve,
2149 * call it again here to ensure that the targets are
2150 * accurately calculated (it uses vm_page_free_count_init);
2151 * calling it with an arg of 0 will not change the reserve,
2152 * but will re-calculate free_min and free_target
2153 */
2154 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2155 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2156 } else
2157 vm_page_free_reserve(0);
2158
2159
2160 queue_init(&vm_pageout_queue_external.pgo_pending);
2161 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2162 vm_pageout_queue_external.pgo_laundry = 0;
2163 vm_pageout_queue_external.pgo_idle = FALSE;
2164 vm_pageout_queue_external.pgo_busy = FALSE;
2165 vm_pageout_queue_external.pgo_throttled = FALSE;
2166
2167 queue_init(&vm_pageout_queue_internal.pgo_pending);
2168 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2169 vm_pageout_queue_internal.pgo_laundry = 0;
2170 vm_pageout_queue_internal.pgo_idle = FALSE;
2171 vm_pageout_queue_internal.pgo_busy = FALSE;
2172 vm_pageout_queue_internal.pgo_throttled = FALSE;
2173
2174
2175 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &thread);
2176 if (result != KERN_SUCCESS)
2177 panic("vm_pageout_iothread_internal: create failed");
2178
2179 thread_deallocate(thread);
2180
2181
2182 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_PREEMPT - 1, &thread);
2183 if (result != KERN_SUCCESS)
2184 panic("vm_pageout_iothread_external: create failed");
2185
2186 thread_deallocate(thread);
2187
2188
2189 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_PREEMPT - 2, &thread);
2190 if (result != KERN_SUCCESS)
2191 panic("vm_pageout_garbage_collect: create failed");
2192
2193 thread_deallocate(thread);
2194
2195
2196 vm_pageout_continue();
2197 /*NOTREACHED*/
2198 }
2199
2200
2201 static upl_t
2202 upl_create(
2203 int flags,
2204 upl_size_t size)
2205 {
2206 upl_t upl;
2207 int page_field_size; /* bit field in word size buf */
2208
2209 page_field_size = 0;
2210 if (flags & UPL_CREATE_LITE) {
2211 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2212 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2213 }
2214 if(flags & UPL_CREATE_INTERNAL) {
2215 upl = (upl_t)kalloc(sizeof(struct upl)
2216 + (sizeof(struct upl_page_info)*(size/PAGE_SIZE))
2217 + page_field_size);
2218 } else {
2219 upl = (upl_t)kalloc(sizeof(struct upl) + page_field_size);
2220 }
2221 upl->flags = 0;
2222 upl->src_object = NULL;
2223 upl->kaddr = (vm_offset_t)0;
2224 upl->size = 0;
2225 upl->map_object = NULL;
2226 upl->ref_count = 1;
2227 upl->highest_page = 0;
2228 upl_lock_init(upl);
2229 #ifdef UPL_DEBUG
2230 upl->ubc_alias1 = 0;
2231 upl->ubc_alias2 = 0;
2232 #endif /* UPL_DEBUG */
2233 return(upl);
2234 }
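/*
 * Sizing sketch (assuming 4K pages): a UPL_CREATE_LITE request for a
 * 1MB range covers 256 pages, so the lite bitmap needs
 * ((256 + 7) >> 3) = 32 bytes, already a multiple of 4, so the
 * rounding step leaves it at 32.  A UPL_CREATE_INTERNAL | UPL_CREATE_LITE
 * upl is therefore allocated as
 *
 *	sizeof(struct upl) + 256 * sizeof(struct upl_page_info) + 32
 *
 * with the page_info array and the bitmap laid out immediately after
 * the upl structure itself (see vm_object_upl_request below, which
 * derives user_page_list and lite_list from those offsets).
 */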
2235
2236 static void
2237 upl_destroy(
2238 upl_t upl)
2239 {
2240 int page_field_size; /* bit field in word size buf */
2241
2242 #ifdef UPL_DEBUG
2243 {
2244 upl_t upl_ele;
2245 vm_object_t object;
2246 if (upl->map_object->pageout) {
2247 object = upl->map_object->shadow;
2248 } else {
2249 object = upl->map_object;
2250 }
2251 vm_object_lock(object);
2252 queue_iterate(&object->uplq, upl_ele, upl_t, uplq) {
2253 if(upl_ele == upl) {
2254 queue_remove(&object->uplq,
2255 upl_ele, upl_t, uplq);
2256 break;
2257 }
2258 }
2259 vm_object_unlock(object);
2260 }
2261 #endif /* UPL_DEBUG */
2262 /* drop a reference on the map_object whether or */
2263 /* not a pageout object is inserted */
2264 if(upl->map_object->pageout)
2265 vm_object_deallocate(upl->map_object);
2266
2267 page_field_size = 0;
2268 if (upl->flags & UPL_LITE) {
2269 page_field_size = ((upl->size/PAGE_SIZE) + 7) >> 3;
2270 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2271 }
2272 if(upl->flags & UPL_INTERNAL) {
2273 kfree(upl,
2274 sizeof(struct upl) +
2275 (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE))
2276 + page_field_size);
2277 } else {
2278 kfree(upl, sizeof(struct upl) + page_field_size);
2279 }
2280 }
2281
2282 void uc_upl_dealloc(upl_t upl);
2283 __private_extern__ void
2284 uc_upl_dealloc(
2285 upl_t upl)
2286 {
2287 upl->ref_count -= 1;
2288 if(upl->ref_count == 0) {
2289 upl_destroy(upl);
2290 }
2291 }
2292
2293 void
2294 upl_deallocate(
2295 upl_t upl)
2296 {
2297
2298 upl->ref_count -= 1;
2299 if(upl->ref_count == 0) {
2300 upl_destroy(upl);
2301 }
2302 }
2303
2304 /*
2305 * Statistics about UPL enforcement of copy-on-write obligations.
2306 */
2307 unsigned long upl_cow = 0;
2308 unsigned long upl_cow_again = 0;
2309 unsigned long upl_cow_contiguous = 0;
2310 unsigned long upl_cow_pages = 0;
2311 unsigned long upl_cow_again_pages = 0;
2312 unsigned long upl_cow_contiguous_pages = 0;
2313
2314 /*
2315 * Routine: vm_object_upl_request
2316 * Purpose:
2317 * Cause the population of a portion of a vm_object.
2318 * Depending on the nature of the request, the pages
2319 * returned may contain valid data or be uninitialized.
2320 * A page list structure, listing the physical pages,
2321 * will be returned upon request.
2322 * This function is called by the file system or any other
2323 * supplier of backing store to a pager.
2324 * IMPORTANT NOTE: The caller must still respect the relationship
2325 * between the vm_object and its backing memory object. The
2326 * caller MUST NOT substitute changes in the backing file
2327 * without first doing a memory_object_lock_request on the
2328 * target range unless it is known that the pages are not
2329 * shared with another entity at the pager level.
2330 * Copy_in_to:
2331 * if a page list structure is present,
2332 * return the mapped physical pages; where a
2333 * page is not present, return a non-initialized
2334 * one. If the no_sync bit is turned on, don't
2335 * call the pager unlock to synchronize with other
2336 * possible copies of the page. Leave pages busy
2337 * in the original object, if a page list structure
2338 * was specified. When a commit of the page list
2339 * pages is done, the dirty bit will be set for each one.
2340 * Copy_out_from:
2341 * If a page list structure is present, return
2342 * all mapped pages. Where a page does not exist,
2343 * map a zero-filled one. Leave pages busy in
2344 * the original object. If a page list structure
2345 * is not specified, this call is a no-op.
2346 *
2347 * Note: access of default pager objects has a rather interesting
2348 * twist. The caller of this routine, presumably the file system
2349 * page cache handling code, will never actually make a request
2350 * against a default pager backed object. Only the default
2351 * pager will make requests on backing store related vm_objects.
2352 * In this way the default pager can maintain the relationship
2353 * between backing store files (abstract memory objects) and
2354 * the vm_objects (cache objects) they support.
2355 *
2356 */
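/*
 * A minimal sketch of how a caller might drive this routine (the real
 * in-kernel callers are vm_map_create_upl() and
 * vm_object_super_upl_request() below; the flag combination here is
 * only illustrative):
 *
 *	upl_t			upl;
 *	upl_page_info_t		*pl;
 *	unsigned int		count = MAX_UPL_TRANSFER;
 *	boolean_t		empty;
 *	kern_return_t		kr;
 *
 *	kr = vm_object_upl_request(object, offset, size, &upl, NULL,
 *				   &count,
 *				   UPL_SET_INTERNAL | UPL_SET_LITE |
 *				   UPL_COPYOUT_FROM);
 *	if (kr == KERN_SUCCESS) {
 *		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
 *		... issue the write against the pages described by pl ...
 *		upl_commit_range(upl, 0, size, 0, pl, count, &empty);
 *		upl_deallocate(upl);
 *	}
 */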
2357
2358 __private_extern__ kern_return_t
2359 vm_object_upl_request(
2360 vm_object_t object,
2361 vm_object_offset_t offset,
2362 upl_size_t size,
2363 upl_t *upl_ptr,
2364 upl_page_info_array_t user_page_list,
2365 unsigned int *page_list_count,
2366 int cntrl_flags)
2367 {
2368 vm_page_t dst_page = VM_PAGE_NULL;
2369 vm_object_offset_t dst_offset = offset;
2370 upl_size_t xfer_size = size;
2371 boolean_t do_m_lock = FALSE;
2372 boolean_t dirty;
2373 boolean_t hw_dirty;
2374 upl_t upl = NULL;
2375 unsigned int entry;
2376 #if MACH_CLUSTER_STATS
2377 boolean_t encountered_lrp = FALSE;
2378 #endif
2379 vm_page_t alias_page = NULL;
2380 int page_ticket;
2381 int refmod_state;
2382 wpl_array_t lite_list = NULL;
2383 vm_object_t last_copy_object;
2384
2385
2386 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2387 /*
2388 * For forward compatibility's sake,
2389 * reject any unknown flag.
2390 */
2391 return KERN_INVALID_VALUE;
2392 }
2393
2394 page_ticket = (cntrl_flags & UPL_PAGE_TICKET_MASK)
2395 >> UPL_PAGE_TICKET_SHIFT;
2396
2397 if(((size/PAGE_SIZE) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
2398 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2399 }
2400
2401 if(cntrl_flags & UPL_SET_INTERNAL)
2402 if(page_list_count != NULL)
2403 *page_list_count = MAX_UPL_TRANSFER;
2404
2405 if((!object->internal) && (object->paging_offset != 0))
2406 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2407
2408 if((cntrl_flags & UPL_COPYOUT_FROM) && (upl_ptr == NULL)) {
2409 return KERN_SUCCESS;
2410 }
2411
2412 vm_object_lock(object);
2413 vm_object_paging_begin(object);
2414 vm_object_unlock(object);
2415
2416 if(upl_ptr) {
2417 if(cntrl_flags & UPL_SET_INTERNAL) {
2418 if(cntrl_flags & UPL_SET_LITE) {
2419 uintptr_t page_field_size;
2420 upl = upl_create(
2421 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
2422 size);
2423 user_page_list = (upl_page_info_t *)
2424 (((uintptr_t)upl) + sizeof(struct upl));
2425 lite_list = (wpl_array_t)
2426 (((uintptr_t)user_page_list) +
2427 ((size/PAGE_SIZE) *
2428 sizeof(upl_page_info_t)));
2429 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2430 page_field_size =
2431 (page_field_size + 3) & 0xFFFFFFFC;
2432 bzero((char *)lite_list, page_field_size);
2433 upl->flags =
2434 UPL_LITE | UPL_INTERNAL;
2435 } else {
2436 upl = upl_create(UPL_CREATE_INTERNAL, size);
2437 user_page_list = (upl_page_info_t *)
2438 (((uintptr_t)upl) + sizeof(struct upl));
2439 upl->flags = UPL_INTERNAL;
2440 }
2441 } else {
2442 if(cntrl_flags & UPL_SET_LITE) {
2443 uintptr_t page_field_size;
2444 upl = upl_create(UPL_CREATE_LITE, size);
2445 lite_list = (wpl_array_t)
2446 (((uintptr_t)upl) + sizeof(struct upl));
2447 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2448 page_field_size =
2449 (page_field_size + 3) & 0xFFFFFFFC;
2450 bzero((char *)lite_list, page_field_size);
2451 upl->flags = UPL_LITE;
2452 } else {
2453 upl = upl_create(UPL_CREATE_EXTERNAL, size);
2454 upl->flags = 0;
2455 }
2456 }
2457
2458 if (object->phys_contiguous) {
2459 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2460 object->copy != VM_OBJECT_NULL) {
2461 /* Honor copy-on-write obligations */
2462
2463 /*
2464 * XXX FBDP
2465 * We could still have a race...
2466 * A is here building the UPL for a write().
2467 * A pushes the pages to the current copy
2468 * object.
2469 * A returns the UPL to the caller.
2470 * B comes along and establishes another
2471 * private mapping on this object, inserting
2472 * a new copy object between the original
2473 * object and the old copy object.
2474 * B reads a page and gets the original contents
2475 * from the original object.
2476 * A modifies the page in the original object.
2477 * B reads the page again and sees A's changes,
2478 * which is wrong...
2479 *
2480 * The problem is that the pages are not
2481 * marked "busy" in the original object, so
2482 * nothing prevents B from reading it
2483 * before A's changes are completed.
2484 *
2485 * The "paging_in_progress" might protect us
2486 * from the insertion of a new copy object
2487 * though... To be verified.
2488 */
2489 vm_object_lock_request(object,
2490 offset,
2491 size,
2492 FALSE,
2493 MEMORY_OBJECT_COPY_SYNC,
2494 VM_PROT_NO_CHANGE);
2495 upl_cow_contiguous++;
2496 upl_cow_contiguous_pages += size >> PAGE_SHIFT;
2497 }
2498
2499 upl->map_object = object;
2500 /* don't need any shadow mappings for this one */
2501 /* since it is already I/O memory */
2502 upl->flags |= UPL_DEVICE_MEMORY;
2503
2504
2505 /* paging_in_progress protects paging_offset */
2506 upl->offset = offset + object->paging_offset;
2507 upl->size = size;
2508 *upl_ptr = upl;
2509 if(user_page_list) {
2510 user_page_list[0].phys_addr =
2511 (offset + object->shadow_offset)>>PAGE_SHIFT;
2512 user_page_list[0].device = TRUE;
2513 }
2514 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
2515
2516 if(page_list_count != NULL) {
2517 if (upl->flags & UPL_INTERNAL) {
2518 *page_list_count = 0;
2519 } else {
2520 *page_list_count = 1;
2521 }
2522 }
2523
2524 return KERN_SUCCESS;
2525 }
2526
2527 if(user_page_list)
2528 user_page_list[0].device = FALSE;
2529
2530 if(cntrl_flags & UPL_SET_LITE) {
2531 upl->map_object = object;
2532 } else {
2533 upl->map_object = vm_object_allocate(size);
2534 /*
2535 * No need to lock the new object: nobody else knows
2536 * about it yet, so it's all ours so far.
2537 */
2538 upl->map_object->shadow = object;
2539 upl->map_object->pageout = TRUE;
2540 upl->map_object->can_persist = FALSE;
2541 upl->map_object->copy_strategy =
2542 MEMORY_OBJECT_COPY_NONE;
2543 upl->map_object->shadow_offset = offset;
2544 upl->map_object->wimg_bits = object->wimg_bits;
2545 }
2546
2547 }
2548 if (!(cntrl_flags & UPL_SET_LITE)) {
2549 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2550 }
2551
2552 /*
2553 * ENCRYPTED SWAP:
2554 * Just mark the UPL as "encrypted" here.
2555 * We'll actually encrypt the pages later,
2556 * in upl_encrypt(), when the caller has
2557 * selected which pages need to go to swap.
2558 */
2559 if (cntrl_flags & UPL_ENCRYPT) {
2560 upl->flags |= UPL_ENCRYPTED;
2561 }
2562 if (cntrl_flags & UPL_FOR_PAGEOUT) {
2563 upl->flags |= UPL_PAGEOUT;
2564 }
2565 vm_object_lock(object);
2566
2567 /* we can lock in the paging_offset once paging_in_progress is set */
2568 if(upl_ptr) {
2569 upl->size = size;
2570 upl->offset = offset + object->paging_offset;
2571 *upl_ptr = upl;
2572 #ifdef UPL_DEBUG
2573 queue_enter(&object->uplq, upl, upl_t, uplq);
2574 #endif /* UPL_DEBUG */
2575 }
2576
2577 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2578 object->copy != VM_OBJECT_NULL) {
2579 /* Honor copy-on-write obligations */
2580
2581 /*
2582 * The caller is gathering these pages and
2583 * might modify their contents. We need to
2584 * make sure that the copy object has its own
2585 * private copies of these pages before we let
2586 * the caller modify them.
2587 */
2588 vm_object_update(object,
2589 offset,
2590 size,
2591 NULL,
2592 NULL,
2593 FALSE, /* should_return */
2594 MEMORY_OBJECT_COPY_SYNC,
2595 VM_PROT_NO_CHANGE);
2596 upl_cow++;
2597 upl_cow_pages += size >> PAGE_SHIFT;
2598
2599 }
2600 /* remember which copy object we synchronized with */
2601 last_copy_object = object->copy;
2602
2603 entry = 0;
2604 if(cntrl_flags & UPL_COPYOUT_FROM) {
2605 upl->flags |= UPL_PAGE_SYNC_DONE;
2606
2607 while (xfer_size) {
2608 if((alias_page == NULL) &&
2609 !(cntrl_flags & UPL_SET_LITE)) {
2610 vm_object_unlock(object);
2611 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2612 vm_object_lock(object);
2613 }
2614 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2615 dst_page->fictitious ||
2616 dst_page->absent ||
2617 dst_page->error ||
2618 (dst_page->wire_count && !dst_page->pageout) ||
2619
2620 ((!dst_page->inactive) && (cntrl_flags & UPL_FOR_PAGEOUT) &&
2621 (dst_page->page_ticket != page_ticket) &&
2622 ((dst_page->page_ticket+1) != page_ticket)) ) {
2623
2624 if (user_page_list)
2625 user_page_list[entry].phys_addr = 0;
2626 } else {
2627 /*
2628 * grab this up front...
2629 * a high percentage of the time we're going to
2630 * need the hardware modification state a bit later
2631 * anyway... so we can eliminate an extra call into
2632 * the pmap layer by grabbing it here and recording it
2633 */
2634 refmod_state = pmap_get_refmod(dst_page->phys_page);
2635
2636 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2637 /*
2638 * we're only asking for DIRTY pages to be returned
2639 */
2640
2641 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2642 /*
2643 * if this is the page stolen by vm_pageout_scan to be
2644 * cleaned (as opposed to a buddy being clustered in),
2645 * or this request is not being driven by a PAGEOUT cluster,
2646 * then we only need to check for the page being dirty or
2647 * precious to decide whether to return it
2648 */
2649 if (dst_page->dirty || dst_page->precious ||
2650 (refmod_state & VM_MEM_MODIFIED)) {
2651 goto check_busy;
2652 }
2653 }
2654 /*
2655 * this is a request for a PAGEOUT cluster and this page
2656 * is merely along for the ride as a 'buddy'... not only
2657 * does it have to be dirty to be returned, but it also
2658 * can't have been referenced recently... note that we've
2659 * already filtered above based on whether this page is
2660 * currently on the inactive queue or it meets the page
2661 * ticket (generation count) check
2662 */
2663 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2664 ((refmod_state & VM_MEM_MODIFIED) ||
2665 dst_page->dirty || dst_page->precious) ) {
2666 goto check_busy;
2667 }
2668 /*
2669 * if we reach here, we're not to return
2670 * the page... go on to the next one
2671 */
2672 if (user_page_list)
2673 user_page_list[entry].phys_addr = 0;
2674 entry++;
2675 dst_offset += PAGE_SIZE_64;
2676 xfer_size -= PAGE_SIZE;
2677 continue;
2678 }
2679 check_busy:
2680 if(dst_page->busy &&
2681 (!(dst_page->list_req_pending &&
2682 dst_page->pageout))) {
2683 if(cntrl_flags & UPL_NOBLOCK) {
2684 if(user_page_list) {
2685 user_page_list[entry].phys_addr = 0;
2686 }
2687 entry++;
2688 dst_offset += PAGE_SIZE_64;
2689 xfer_size -= PAGE_SIZE;
2690 continue;
2691 }
2692 /*
2693 * someone else is playing with the
2694 * page. We will have to wait.
2695 */
2696 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2697 continue;
2698 }
2699 /* Someone else already cleaning the page? */
2700 if((dst_page->cleaning || dst_page->absent ||
2701 dst_page->wire_count != 0) &&
2702 !dst_page->list_req_pending) {
2703 if(user_page_list) {
2704 user_page_list[entry].phys_addr = 0;
2705 }
2706 entry++;
2707 dst_offset += PAGE_SIZE_64;
2708 xfer_size -= PAGE_SIZE;
2709 continue;
2710 }
2711 /* eliminate all mappings from the */
2712 /* original object and its progeny */
2713
2714 vm_page_lock_queues();
2715
2716 if (dst_page->pageout_queue == TRUE)
2717 /*
2718 * we've buddied up a page for a clustered pageout
2719 * that has already been moved to the pageout
2720 * queue by pageout_scan... we need to remove
2721 * it from the queue and drop the laundry count
2722 * on that queue
2723 */
2724 vm_pageout_queue_steal(dst_page);
2725 #if MACH_CLUSTER_STATS
2726 /* pageout statistics gathering. count */
2727 /* all the pages we will page out that */
2728 /* were not counted in the initial */
2729 /* vm_pageout_scan work */
2730 if(dst_page->list_req_pending)
2731 encountered_lrp = TRUE;
2732 if((dst_page->dirty ||
2733 (dst_page->object->internal &&
2734 dst_page->precious)) &&
2735 (dst_page->list_req_pending
2736 == FALSE)) {
2737 if(encountered_lrp) {
2738 CLUSTER_STAT
2739 (pages_at_higher_offsets++;)
2740 } else {
2741 CLUSTER_STAT
2742 (pages_at_lower_offsets++;)
2743 }
2744 }
2745 #endif
2746 /* Turn off busy indication on pending */
2747 /* pageout. Note: we can only get here */
2748 /* in the request pending case. */
2749 dst_page->list_req_pending = FALSE;
2750 dst_page->busy = FALSE;
2751 dst_page->cleaning = FALSE;
2752
2753 hw_dirty = refmod_state & VM_MEM_MODIFIED;
2754 dirty = hw_dirty ? TRUE : dst_page->dirty;
2755
2756 if(cntrl_flags & UPL_SET_LITE) {
2757 int pg_num;
2758 pg_num = (dst_offset-offset)/PAGE_SIZE;
2759 lite_list[pg_num>>5] |=
2760 1 << (pg_num & 31);
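/*
 * pg_num >> 5 selects the 32-bit word of the lite bitmap and
 * (pg_num & 31) the bit within it; e.g. the 38th page of the
 * request (pg_num 37) sets bit 5 of lite_list[1].
 */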
2761 if (hw_dirty)
2762 pmap_clear_modify(dst_page->phys_page);
2763 /*
2764 * Record that this page has been
2765 * written out
2766 */
2767 #if MACH_PAGEMAP
2768 vm_external_state_set(
2769 object->existence_map,
2770 dst_page->offset);
2771 #endif /*MACH_PAGEMAP*/
2772
2773 /*
2774 * Mark original page as cleaning
2775 * in place.
2776 */
2777 dst_page->cleaning = TRUE;
2778 dst_page->dirty = TRUE;
2779 dst_page->precious = FALSE;
2780 } else {
2781 /* use pageclean setup, it is more */
2782 /* convenient even for the pageout */
2783 /* cases here */
2784
2785 vm_object_lock(upl->map_object);
2786 vm_pageclean_setup(dst_page,
2787 alias_page, upl->map_object,
2788 size - xfer_size);
2789 vm_object_unlock(upl->map_object);
2790
2791 alias_page->absent = FALSE;
2792 alias_page = NULL;
2793 }
2794
2795 if(!dirty) {
2796 dst_page->dirty = FALSE;
2797 dst_page->precious = TRUE;
2798 }
2799
2800 if(dst_page->pageout)
2801 dst_page->busy = TRUE;
2802
2803 if ( (cntrl_flags & UPL_ENCRYPT) ) {
2804 /*
2805 * ENCRYPTED SWAP:
2806 * We want to deny access to the target page
2807 * because its contents are about to be
2808 * encrypted and the user would be very
2809 * confused to see encrypted data instead
2810 * of their data.
2811 */
2812 dst_page->busy = TRUE;
2813 }
2814 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2815 /*
2816 * deny access to the target page
2817 * while it is being worked on
2818 */
2819 if ((!dst_page->pageout) &&
2820 (dst_page->wire_count == 0)) {
2821 dst_page->busy = TRUE;
2822 dst_page->pageout = TRUE;
2823 vm_page_wire(dst_page);
2824 }
2825 }
2826
2827 if (dst_page->phys_page > upl->highest_page)
2828 upl->highest_page = dst_page->phys_page;
2829
2830 if(user_page_list) {
2831 user_page_list[entry].phys_addr
2832 = dst_page->phys_page;
2833 user_page_list[entry].dirty =
2834 dst_page->dirty;
2835 user_page_list[entry].pageout =
2836 dst_page->pageout;
2837 user_page_list[entry].absent =
2838 dst_page->absent;
2839 user_page_list[entry].precious =
2840 dst_page->precious;
2841 }
2842 vm_page_unlock_queues();
2843
2844 /*
2845 * ENCRYPTED SWAP:
2846 * The caller is gathering this page and might
2847 * access its contents later on. Decrypt the
2848 * page before adding it to the UPL, so that
2849 * the caller never sees encrypted data.
2850 */
2851 if (! (cntrl_flags & UPL_ENCRYPT) &&
2852 dst_page->encrypted) {
2853 assert(dst_page->busy);
2854
2855 vm_page_decrypt(dst_page, 0);
2856 vm_page_decrypt_for_upl_counter++;
2857
2858 /*
2859 * Retry this page, since anything
2860 * could have changed while we were
2861 * decrypting.
2862 */
2863 continue;
2864 }
2865 }
2866 entry++;
2867 dst_offset += PAGE_SIZE_64;
2868 xfer_size -= PAGE_SIZE;
2869 }
2870 } else {
2871 while (xfer_size) {
2872 if((alias_page == NULL) &&
2873 !(cntrl_flags & UPL_SET_LITE)) {
2874 vm_object_unlock(object);
2875 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2876 vm_object_lock(object);
2877 }
2878
2879 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2880 object->copy != last_copy_object) {
2881 /* Honor copy-on-write obligations */
2882
2883 /*
2884 * The copy object has changed since we
2885 * last synchronized for copy-on-write.
2886 * Another copy object might have been
2887 * inserted while we released the object's
2888 * lock. Since someone could have seen the
2889 * original contents of the remaining pages
2890 * through that new object, we have to
2891 * synchronize with it again for the remaining
2892 * pages only. The previous pages are "busy"
2893 * so they can not be seen through the new
2894 * mapping. The new mapping will see our
2895 * upcoming changes for those previous pages,
2896 * but that's OK since they couldn't see what
2897 * was there before. It's just a race anyway
2898 * and there's no guarantee of consistency or
2899 * atomicity. We just don't want new mappings
2900 * to see both the *before* and *after* pages.
2901 */
2902 if (object->copy != VM_OBJECT_NULL) {
2903 vm_object_update(
2904 object,
2905 dst_offset,/* current offset */
2906 xfer_size, /* remaining size */
2907 NULL,
2908 NULL,
2909 FALSE, /* should_return */
2910 MEMORY_OBJECT_COPY_SYNC,
2911 VM_PROT_NO_CHANGE);
2912 upl_cow_again++;
2913 upl_cow_again_pages +=
2914 xfer_size >> PAGE_SHIFT;
2915 }
2916 /* remember the copy object we synced with */
2917 last_copy_object = object->copy;
2918 }
2919
2920 dst_page = vm_page_lookup(object, dst_offset);
2921
2922 if(dst_page != VM_PAGE_NULL) {
2923 if((cntrl_flags & UPL_RET_ONLY_ABSENT) &&
2924 !((dst_page->list_req_pending)
2925 && (dst_page->absent))) {
2926 /* we are doing extended range */
2927 /* requests. we want to grab */
2928 /* pages around some which are */
2929 /* already present. */
2930 if(user_page_list) {
2931 user_page_list[entry].phys_addr = 0;
2932 }
2933 entry++;
2934 dst_offset += PAGE_SIZE_64;
2935 xfer_size -= PAGE_SIZE;
2936 continue;
2937 }
2938 if((dst_page->cleaning) &&
2939 !(dst_page->list_req_pending)) {
2940 /*someone else is writing to the */
2941 /* page. We will have to wait. */
2942 PAGE_SLEEP(object,dst_page,THREAD_UNINT);
2943 continue;
2944 }
2945 if ((dst_page->fictitious &&
2946 dst_page->list_req_pending)) {
2947 /* dump the fictitious page */
2948 dst_page->list_req_pending = FALSE;
2949 dst_page->clustered = FALSE;
2950
2951 vm_page_lock_queues();
2952 vm_page_free(dst_page);
2953 vm_page_unlock_queues();
2954
2955 dst_page = NULL;
2956 } else if ((dst_page->absent &&
2957 dst_page->list_req_pending)) {
2958 /* the default_pager case */
2959 dst_page->list_req_pending = FALSE;
2960 dst_page->busy = FALSE;
2961 }
2962 }
2963 if(dst_page == VM_PAGE_NULL) {
2964 if(object->private) {
2965 /*
2966 * This is a nasty wrinkle for users
2967 * of upl who encounter device or
2968 * private memory; however, it is
2969 * unavoidable: only a fault can
2970 * resolve the actual backing
2971 * physical page by asking the
2972 * backing device.
2973 */
2974 if(user_page_list) {
2975 user_page_list[entry].phys_addr = 0;
2976 }
2977 entry++;
2978 dst_offset += PAGE_SIZE_64;
2979 xfer_size -= PAGE_SIZE;
2980 continue;
2981 }
2982 /* need to allocate a page */
2983 dst_page = vm_page_alloc(object, dst_offset);
2984 if (dst_page == VM_PAGE_NULL) {
2985 vm_object_unlock(object);
2986 VM_PAGE_WAIT();
2987 vm_object_lock(object);
2988 continue;
2989 }
2990 dst_page->busy = FALSE;
2991 #if 0
2992 if(cntrl_flags & UPL_NO_SYNC) {
2993 dst_page->page_lock = 0;
2994 dst_page->unlock_request = 0;
2995 }
2996 #endif
2997 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
2998 /*
2999 * if UPL_RET_ONLY_ABSENT was specified,
3000 * then we're definitely setting up a
3001 * UPL for a clustered read/pagein
3002 * operation... mark the pages as clustered
3003 * so vm_fault can correctly attribute them
3004 * to the 'pagein' bucket the first time
3005 * a fault happens on them
3006 */
3007 dst_page->clustered = TRUE;
3008 }
3009 dst_page->absent = TRUE;
3010 object->absent_count++;
3011 }
3012 #if 1
3013 if(cntrl_flags & UPL_NO_SYNC) {
3014 dst_page->page_lock = 0;
3015 dst_page->unlock_request = 0;
3016 }
3017 #endif /* 1 */
3018
3019 /*
3020 * ENCRYPTED SWAP:
3021 */
3022 if (cntrl_flags & UPL_ENCRYPT) {
3023 /*
3024 * The page is going to be encrypted when we
3025 * get it from the pager, so mark it so.
3026 */
3027 dst_page->encrypted = TRUE;
3028 } else {
3029 /*
3030 * Otherwise, the page will not contain
3031 * encrypted data.
3032 */
3033 dst_page->encrypted = FALSE;
3034 }
3035
3036 dst_page->overwriting = TRUE;
3037 if(dst_page->fictitious) {
3038 panic("need corner case for fictitious page");
3039 }
3040 if(dst_page->page_lock) {
3041 do_m_lock = TRUE;
3042 }
3043 if(upl_ptr) {
3044
3045 /* eliminate all mappings from the */
3046 /* original object and its progeny */
3047
3048 if(dst_page->busy) {
3049 /*someone else is playing with the */
3050 /* page. We will have to wait. */
3051 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3052 continue;
3053 }
3054 vm_page_lock_queues();
3055
3056 if( !(cntrl_flags & UPL_FILE_IO))
3057 hw_dirty = pmap_disconnect(dst_page->phys_page) & VM_MEM_MODIFIED;
3058 else
3059 hw_dirty = pmap_get_refmod(dst_page->phys_page) & VM_MEM_MODIFIED;
3060 dirty = hw_dirty ? TRUE : dst_page->dirty;
3061
3062 if(cntrl_flags & UPL_SET_LITE) {
3063 int pg_num;
3064 pg_num = (dst_offset-offset)/PAGE_SIZE;
3065 lite_list[pg_num>>5] |=
3066 1 << (pg_num & 31);
3067 if (hw_dirty)
3068 pmap_clear_modify(dst_page->phys_page);
3069 /*
3070 * Record that this page has been
3071 * written out
3072 */
3073 #if MACH_PAGEMAP
3074 vm_external_state_set(
3075 object->existence_map,
3076 dst_page->offset);
3077 #endif /*MACH_PAGEMAP*/
3078
3079 /*
3080 * Mark original page as cleaning
3081 * in place.
3082 */
3083 dst_page->cleaning = TRUE;
3084 dst_page->dirty = TRUE;
3085 dst_page->precious = FALSE;
3086 } else {
3087 /* use pageclean setup, it is more */
3088 /* convenient even for the pageout */
3089 /* cases here */
3090 vm_object_lock(upl->map_object);
3091 vm_pageclean_setup(dst_page,
3092 alias_page, upl->map_object,
3093 size - xfer_size);
3094 vm_object_unlock(upl->map_object);
3095
3096 alias_page->absent = FALSE;
3097 alias_page = NULL;
3098 }
3099
3100 if(cntrl_flags & UPL_CLEAN_IN_PLACE) {
3101 /* clean in place for read implies */
3102 /* that a write will be done on all */
3103 /* the pages that are dirty before */
3104 /* a UPL commit is done. The caller */
3105 /* is obligated to preserve the */
3106 /* contents of all pages marked */
3107 /* dirty. */
3108 upl->flags |= UPL_CLEAR_DIRTY;
3109 }
3110
3111 if(!dirty) {
3112 dst_page->dirty = FALSE;
3113 dst_page->precious = TRUE;
3114 }
3115
3116 if (dst_page->wire_count == 0) {
3117 /* deny access to the target page while */
3118 /* it is being worked on */
3119 dst_page->busy = TRUE;
3120 } else {
3121 vm_page_wire(dst_page);
3122 }
3123 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
3124 /*
3125 * expect the page not to be used
3126 * since it's coming in as part
3127 * of a cluster and could be
3128 * speculative... pages that
3129 * are 'consumed' will get a
3130 * hardware reference
3131 */
3132 dst_page->reference = FALSE;
3133 } else {
3134 /*
3135 * expect the page to be used
3136 */
3137 dst_page->reference = TRUE;
3138 }
3139 dst_page->precious =
3140 (cntrl_flags & UPL_PRECIOUS)
3141 ? TRUE : FALSE;
3142
3143 if (dst_page->phys_page > upl->highest_page)
3144 upl->highest_page = dst_page->phys_page;
3145
3146 if(user_page_list) {
3147 user_page_list[entry].phys_addr
3148 = dst_page->phys_page;
3149 user_page_list[entry].dirty =
3150 dst_page->dirty;
3151 user_page_list[entry].pageout =
3152 dst_page->pageout;
3153 user_page_list[entry].absent =
3154 dst_page->absent;
3155 user_page_list[entry].precious =
3156 dst_page->precious;
3157 }
3158 vm_page_unlock_queues();
3159 }
3160 entry++;
3161 dst_offset += PAGE_SIZE_64;
3162 xfer_size -= PAGE_SIZE;
3163 }
3164 }
3165
3166 if (upl->flags & UPL_INTERNAL) {
3167 if(page_list_count != NULL)
3168 *page_list_count = 0;
3169 } else if (page_list_count != NULL &&
3170 *page_list_count > entry) {
3171 *page_list_count = entry;
3172 }
3173
3174 if(alias_page != NULL) {
3175 vm_page_lock_queues();
3176 vm_page_free(alias_page);
3177 vm_page_unlock_queues();
3178 }
3179
3180 if(do_m_lock) {
3181 vm_prot_t access_required;
3182 /* call back all associated pages from other users of the pager */
3183 /* all future updates will be on data which is based on the */
3184 /* changes we are going to make here. Note: it is assumed that */
3185 /* we already hold copies of the data so we will not be seeing */
3186 /* an avalanche of incoming data from the pager */
3187 access_required = (cntrl_flags & UPL_COPYOUT_FROM)
3188 ? VM_PROT_READ : VM_PROT_WRITE;
3189 while (TRUE) {
3190 kern_return_t rc;
3191
3192 if(!object->pager_ready) {
3193 wait_result_t wait_result;
3194
3195 wait_result = vm_object_sleep(object,
3196 VM_OBJECT_EVENT_PAGER_READY,
3197 THREAD_UNINT);
3198 if (wait_result != THREAD_AWAKENED) {
3199 vm_object_unlock(object);
3200 return KERN_FAILURE;
3201 }
3202 continue;
3203 }
3204
3205 vm_object_unlock(object);
3206 rc = memory_object_data_unlock(
3207 object->pager,
3208 dst_offset + object->paging_offset,
3209 size,
3210 access_required);
3211 if (rc != KERN_SUCCESS && rc != MACH_SEND_INTERRUPTED)
3212 return KERN_FAILURE;
3213 vm_object_lock(object);
3214
3215 if (rc == KERN_SUCCESS)
3216 break;
3217 }
3218
3219 /* let's wait on the last page requested */
3220 /* NOTE: we will have to update lock completed routine to signal */
3221 if(dst_page != VM_PAGE_NULL &&
3222 (access_required & dst_page->page_lock) != access_required) {
3223 PAGE_ASSERT_WAIT(dst_page, THREAD_UNINT);
3224 vm_object_unlock(object);
3225 thread_block(THREAD_CONTINUE_NULL);
3226 return KERN_SUCCESS;
3227 }
3228 }
3229
3230 vm_object_unlock(object);
3231 return KERN_SUCCESS;
3232 }
3233
3234 /* JMM - Backward compatibility for now */
3235 kern_return_t
3236 vm_fault_list_request( /* forward */
3237 memory_object_control_t control,
3238 vm_object_offset_t offset,
3239 upl_size_t size,
3240 upl_t *upl_ptr,
3241 upl_page_info_t **user_page_list_ptr,
3242 int page_list_count,
3243 int cntrl_flags);
3244 kern_return_t
3245 vm_fault_list_request(
3246 memory_object_control_t control,
3247 vm_object_offset_t offset,
3248 upl_size_t size,
3249 upl_t *upl_ptr,
3250 upl_page_info_t **user_page_list_ptr,
3251 int page_list_count,
3252 int cntrl_flags)
3253 {
3254 unsigned int local_list_count;
3255 upl_page_info_t *user_page_list;
3256 kern_return_t kr;
3257
3258 if (user_page_list_ptr != NULL) {
3259 local_list_count = page_list_count;
3260 user_page_list = *user_page_list_ptr;
3261 } else {
3262 local_list_count = 0;
3263 user_page_list = NULL;
3264 }
3265 kr = memory_object_upl_request(control,
3266 offset,
3267 size,
3268 upl_ptr,
3269 user_page_list,
3270 &local_list_count,
3271 cntrl_flags);
3272
3273 if(kr != KERN_SUCCESS)
3274 return kr;
3275
3276 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3277 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3278 }
3279
3280 return KERN_SUCCESS;
3281 }
3282
3283
3284
3285 /*
3286 * Routine: vm_object_super_upl_request
3287 * Purpose:
3288 * Cause the population of a portion of a vm_object
3289 * in much the same way as memory_object_upl_request.
3290 * Depending on the nature of the request, the pages
3291 * returned may contain valid data or be uninitialized.
3292 * However, the region may be expanded up to the super
3293 * cluster size provided.
3294 */
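/*
 * Expansion sketch (assuming a 64K super_cluster and an object large
 * enough to hold it): a single-page pageout at offset 0x23000 has
 * base_offset = 0x23000 & ~0xffff = 0x20000; since 0x23000 + 0x1000
 * does not extend past 0x20000 + 0x10000, super_size stays 0x10000,
 * and the request passed to vm_object_upl_request() becomes the whole
 * 64K cluster starting at 0x20000 that surrounds the target page.
 */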
3295
3296 __private_extern__ kern_return_t
3297 vm_object_super_upl_request(
3298 vm_object_t object,
3299 vm_object_offset_t offset,
3300 upl_size_t size,
3301 upl_size_t super_cluster,
3302 upl_t *upl,
3303 upl_page_info_t *user_page_list,
3304 unsigned int *page_list_count,
3305 int cntrl_flags)
3306 {
3307 vm_page_t target_page;
3308 int ticket;
3309
3310
3311 if(object->paging_offset > offset)
3312 return KERN_FAILURE;
3313
3314 assert(object->paging_in_progress);
3315 offset = offset - object->paging_offset;
3316
3317 if(cntrl_flags & UPL_FOR_PAGEOUT) {
3318
3319 vm_object_lock(object);
3320
3321 if((target_page = vm_page_lookup(object, offset))
3322 != VM_PAGE_NULL) {
3323 ticket = target_page->page_ticket;
3324 cntrl_flags = cntrl_flags & ~(int)UPL_PAGE_TICKET_MASK;
3325 cntrl_flags = cntrl_flags |
3326 ((ticket << UPL_PAGE_TICKET_SHIFT)
3327 & UPL_PAGE_TICKET_MASK);
3328 }
3329 vm_object_unlock(object);
3330 }
3331
3332 if (super_cluster > size) {
3333
3334 vm_object_offset_t base_offset;
3335 upl_size_t super_size;
3336
3337 base_offset = (offset &
3338 ~((vm_object_offset_t) super_cluster - 1));
3339 super_size = (offset+size) > (base_offset + super_cluster) ?
3340 super_cluster<<1 : super_cluster;
3341 super_size = ((base_offset + super_size) > object->size) ?
3342 (object->size - base_offset) : super_size;
3343 if(offset > (base_offset + super_size))
3344 panic("vm_object_super_upl_request: Missed target pageout"
3345 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3346 offset, base_offset, super_size, super_cluster,
3347 size, object->paging_offset);
3348 /*
3349 * apparently there is a case where the vm requests a
3350 * page to be written out whose offset is beyond the
3351 * object size
3352 */
3353 if((offset + size) > (base_offset + super_size))
3354 super_size = (offset + size) - base_offset;
3355
3356 offset = base_offset;
3357 size = super_size;
3358 }
3359 return vm_object_upl_request(object, offset, size,
3360 upl, user_page_list, page_list_count,
3361 cntrl_flags);
3362 }
3363
3364
3365 kern_return_t
3366 vm_map_create_upl(
3367 vm_map_t map,
3368 vm_map_address_t offset,
3369 upl_size_t *upl_size,
3370 upl_t *upl,
3371 upl_page_info_array_t page_list,
3372 unsigned int *count,
3373 int *flags)
3374 {
3375 vm_map_entry_t entry;
3376 int caller_flags;
3377 int force_data_sync;
3378 int sync_cow_data;
3379 vm_object_t local_object;
3380 vm_map_offset_t local_offset;
3381 vm_map_offset_t local_start;
3382 kern_return_t ret;
3383
3384 caller_flags = *flags;
3385
3386 if (caller_flags & ~UPL_VALID_FLAGS) {
3387 /*
3388 * For forward compatibility's sake,
3389 * reject any unknown flag.
3390 */
3391 return KERN_INVALID_VALUE;
3392 }
3393
3394 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3395 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3396
3397 if(upl == NULL)
3398 return KERN_INVALID_ARGUMENT;
3399
3400
3401 REDISCOVER_ENTRY:
3402 vm_map_lock(map);
3403 if (vm_map_lookup_entry(map, offset, &entry)) {
3404 if (entry->object.vm_object == VM_OBJECT_NULL ||
3405 !entry->object.vm_object->phys_contiguous) {
3406 if((*upl_size/page_size) > MAX_UPL_TRANSFER) {
3407 *upl_size = MAX_UPL_TRANSFER * page_size;
3408 }
3409 }
3410 if((entry->vme_end - offset) < *upl_size) {
3411 *upl_size = entry->vme_end - offset;
3412 }
3413 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3414 if (entry->object.vm_object == VM_OBJECT_NULL) {
3415 *flags = 0;
3416 } else if (entry->object.vm_object->private) {
3417 *flags = UPL_DEV_MEMORY;
3418 if (entry->object.vm_object->phys_contiguous) {
3419 *flags |= UPL_PHYS_CONTIG;
3420 }
3421 } else {
3422 *flags = 0;
3423 }
3424 vm_map_unlock(map);
3425 return KERN_SUCCESS;
3426 }
3427 /*
3428 * Create an object if necessary.
3429 */
3430 if (entry->object.vm_object == VM_OBJECT_NULL) {
3431 entry->object.vm_object = vm_object_allocate(
3432 (vm_size_t)(entry->vme_end - entry->vme_start));
3433 entry->offset = 0;
3434 }
3435 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3436 if (!(entry->protection & VM_PROT_WRITE)) {
3437 vm_map_unlock(map);
3438 return KERN_PROTECTION_FAILURE;
3439 }
3440 if (entry->needs_copy) {
3441 vm_map_t local_map;
3442 vm_object_t object;
3443 vm_map_offset_t offset_hi;
3444 vm_map_offset_t offset_lo;
3445 vm_object_offset_t new_offset;
3446 vm_prot_t prot;
3447 boolean_t wired;
3448 vm_behavior_t behavior;
3449 vm_map_version_t version;
3450 vm_map_t real_map;
3451
3452 local_map = map;
3453 vm_map_lock_write_to_read(map);
3454 if(vm_map_lookup_locked(&local_map,
3455 offset, VM_PROT_WRITE,
3456 &version, &object,
3457 &new_offset, &prot, &wired,
3458 &behavior, &offset_lo,
3459 &offset_hi, &real_map)) {
3460 vm_map_unlock(local_map);
3461 return KERN_FAILURE;
3462 }
3463 if (real_map != map) {
3464 vm_map_unlock(real_map);
3465 }
3466 vm_object_unlock(object);
3467 vm_map_unlock(local_map);
3468
3469 goto REDISCOVER_ENTRY;
3470 }
3471 }
3472 if (entry->is_sub_map) {
3473 vm_map_t submap;
3474
3475 submap = entry->object.sub_map;
3476 local_start = entry->vme_start;
3477 local_offset = entry->offset;
3478 vm_map_reference(submap);
3479 vm_map_unlock(map);
3480
3481 ret = (vm_map_create_upl(submap,
3482 local_offset + (offset - local_start),
3483 upl_size, upl, page_list, count,
3484 flags));
3485
3486 vm_map_deallocate(submap);
3487 return ret;
3488 }
3489
3490 if (sync_cow_data) {
3491 if (entry->object.vm_object->shadow
3492 || entry->object.vm_object->copy) {
3493
3494 local_object = entry->object.vm_object;
3495 local_start = entry->vme_start;
3496 local_offset = entry->offset;
3497 vm_object_reference(local_object);
3498 vm_map_unlock(map);
3499
3500 if (entry->object.vm_object->shadow &&
3501 entry->object.vm_object->copy) {
3502 vm_object_lock_request(
3503 local_object->shadow,
3504 (vm_object_offset_t)
3505 ((offset - local_start) +
3506 local_offset) +
3507 local_object->shadow_offset,
3508 *upl_size, FALSE,
3509 MEMORY_OBJECT_DATA_SYNC,
3510 VM_PROT_NO_CHANGE);
3511 }
3512 sync_cow_data = FALSE;
3513 vm_object_deallocate(local_object);
3514 goto REDISCOVER_ENTRY;
3515 }
3516 }
3517
3518 if (force_data_sync) {
3519
3520 local_object = entry->object.vm_object;
3521 local_start = entry->vme_start;
3522 local_offset = entry->offset;
3523 vm_object_reference(local_object);
3524 vm_map_unlock(map);
3525
3526 vm_object_lock_request(
3527 local_object,
3528 (vm_object_offset_t)
3529 ((offset - local_start) + local_offset),
3530 (vm_object_size_t)*upl_size, FALSE,
3531 MEMORY_OBJECT_DATA_SYNC,
3532 VM_PROT_NO_CHANGE);
3533 force_data_sync = FALSE;
3534 vm_object_deallocate(local_object);
3535 goto REDISCOVER_ENTRY;
3536 }
3537
3538 if(!(entry->object.vm_object->private)) {
3539 if(*upl_size > (MAX_UPL_TRANSFER*PAGE_SIZE))
3540 *upl_size = (MAX_UPL_TRANSFER*PAGE_SIZE);
3541 if(entry->object.vm_object->phys_contiguous) {
3542 *flags = UPL_PHYS_CONTIG;
3543 } else {
3544 *flags = 0;
3545 }
3546 } else {
3547 *flags = UPL_DEV_MEMORY | UPL_PHYS_CONTIG;
3548 }
3549 local_object = entry->object.vm_object;
3550 local_offset = entry->offset;
3551 local_start = entry->vme_start;
3552 vm_object_reference(local_object);
3553 vm_map_unlock(map);
3554 if(caller_flags & UPL_SET_IO_WIRE) {
3555 ret = (vm_object_iopl_request(local_object,
3556 (vm_object_offset_t)
3557 ((offset - local_start)
3558 + local_offset),
3559 *upl_size,
3560 upl,
3561 page_list,
3562 count,
3563 caller_flags));
3564 } else {
3565 ret = (vm_object_upl_request(local_object,
3566 (vm_object_offset_t)
3567 ((offset - local_start)
3568 + local_offset),
3569 *upl_size,
3570 upl,
3571 page_list,
3572 count,
3573 caller_flags));
3574 }
3575 vm_object_deallocate(local_object);
3576 return(ret);
3577 }
3578
3579 vm_map_unlock(map);
3580 return(KERN_FAILURE);
3581
3582 }
3583
3584 /*
3585 * Internal routine to enter a UPL into a VM map.
3586 *
3587 * JMM - This should just be doable through the standard
3588 * vm_map_enter() API.
3589 */
3590 kern_return_t
3591 vm_map_enter_upl(
3592 vm_map_t map,
3593 upl_t upl,
3594 vm_map_offset_t *dst_addr)
3595 {
3596 vm_map_size_t size;
3597 vm_object_offset_t offset;
3598 vm_map_offset_t addr;
3599 vm_page_t m;
3600 kern_return_t kr;
3601
3602 if (upl == UPL_NULL)
3603 return KERN_INVALID_ARGUMENT;
3604
3605 upl_lock(upl);
3606
3607 /* check to see if already mapped */
3608 if(UPL_PAGE_LIST_MAPPED & upl->flags) {
3609 upl_unlock(upl);
3610 return KERN_FAILURE;
3611 }
3612
3613 if((!(upl->map_object->pageout)) &&
3614 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3615 (upl->map_object->phys_contiguous))) {
3616 vm_object_t object;
3617 vm_page_t alias_page;
3618 vm_object_offset_t new_offset;
3619 int pg_num;
3620 wpl_array_t lite_list;
3621
3622 if(upl->flags & UPL_INTERNAL) {
3623 lite_list = (wpl_array_t)
3624 ((((uintptr_t)upl) + sizeof(struct upl))
3625 + ((upl->size/PAGE_SIZE)
3626 * sizeof(upl_page_info_t)));
3627 } else {
3628 lite_list = (wpl_array_t)
3629 (((uintptr_t)upl) + sizeof(struct upl));
3630 }
3631 object = upl->map_object;
3632 upl->map_object = vm_object_allocate(upl->size);
3633 vm_object_lock(upl->map_object);
3634 upl->map_object->shadow = object;
3635 upl->map_object->pageout = TRUE;
3636 upl->map_object->can_persist = FALSE;
3637 upl->map_object->copy_strategy =
3638 MEMORY_OBJECT_COPY_NONE;
3639 upl->map_object->shadow_offset =
3640 upl->offset - object->paging_offset;
3641 upl->map_object->wimg_bits = object->wimg_bits;
3642 offset = upl->map_object->shadow_offset;
3643 new_offset = 0;
3644 size = upl->size;
3645
3646 vm_object_lock(object);
3647
3648 while(size) {
3649 pg_num = (new_offset)/PAGE_SIZE;
3650 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3651 vm_object_unlock(object);
3652 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3653 vm_object_lock(object);
3654 m = vm_page_lookup(object, offset);
3655 if (m == VM_PAGE_NULL) {
3656 panic("vm_upl_map: page missing\n");
3657 }
3658
3659 vm_object_paging_begin(object);
3660
3661 /*
3662 * Convert the fictitious page to a private
3663 * shadow of the real page.
3664 */
3665 assert(alias_page->fictitious);
3666 alias_page->fictitious = FALSE;
3667 alias_page->private = TRUE;
3668 alias_page->pageout = TRUE;
3669 alias_page->phys_page = m->phys_page;
3670
3671 vm_page_lock_queues();
3672 vm_page_wire(alias_page);
3673 vm_page_unlock_queues();
3674
3675 /*
3676 * ENCRYPTED SWAP:
3677 * The virtual page ("m") has to be wired in some way
3678 * here or its physical page ("m->phys_page") could
3679 * be recycled at any time.
3680 * Assuming this is enforced by the caller, we can't
3681 * get an encrypted page here. Since the encryption
3682 * key depends on the VM page's "pager" object and
3683 * the "paging_offset", we couldn't handle 2 pageable
3684 * VM pages (with different pagers and paging_offsets)
3685 * sharing the same physical page: we could end up
3686 * encrypting with one key (via one VM page) and
3687 * decrypting with another key (via the alias VM page).
3688 */
3689 ASSERT_PAGE_DECRYPTED(m);
3690
3691 vm_page_insert(alias_page,
3692 upl->map_object, new_offset);
3693 assert(!alias_page->wanted);
3694 alias_page->busy = FALSE;
3695 alias_page->absent = FALSE;
3696 }
3697
3698 size -= PAGE_SIZE;
3699 offset += PAGE_SIZE_64;
3700 new_offset += PAGE_SIZE_64;
3701 }
3702 vm_object_unlock(object);
3703 vm_object_unlock(upl->map_object);
3704 }
3705 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3706 offset = upl->offset - upl->map_object->paging_offset;
3707 else
3708 offset = 0;
3709
3710 size = upl->size;
3711
3712 vm_object_lock(upl->map_object);
3713 upl->map_object->ref_count++;
3714 vm_object_res_reference(upl->map_object);
3715 vm_object_unlock(upl->map_object);
3716
3717 *dst_addr = 0;
3718
3719
3720 /* NEED A UPL_MAP ALIAS */
3721 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3722 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3723 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3724
3725 if (kr != KERN_SUCCESS) {
3726 upl_unlock(upl);
3727 return(kr);
3728 }
3729
3730 vm_object_lock(upl->map_object);
3731
3732 for(addr=*dst_addr; size > 0; size-=PAGE_SIZE,addr+=PAGE_SIZE) {
3733 m = vm_page_lookup(upl->map_object, offset);
3734 if(m) {
3735 unsigned int cache_attr;
3736 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3737
3738 PMAP_ENTER(map->pmap, addr,
3739 m, VM_PROT_ALL,
3740 cache_attr, TRUE);
3741 }
3742 offset+=PAGE_SIZE_64;
3743 }
3744 vm_object_unlock(upl->map_object);
3745
3746 upl->ref_count++; /* hold a reference for the mapping */
3747 upl->flags |= UPL_PAGE_LIST_MAPPED;
3748 upl->kaddr = *dst_addr;
3749 upl_unlock(upl);
3750 return KERN_SUCCESS;
3751 }
3752
3753 /*
3754 * Internal routine to remove a UPL mapping from a VM map.
3755 *
3756 * XXX - This should just be doable through a standard
3757 * vm_map_remove() operation. Otherwise, implicit clean-up
3758 * of the target map won't be able to correctly remove
3759 * these (and release the reference on the UPL). Having
3760 * to do this means we can't map these into user-space
3761 * maps yet.
3762 */
3763 kern_return_t
3764 vm_map_remove_upl(
3765 vm_map_t map,
3766 upl_t upl)
3767 {
3768 vm_address_t addr;
3769 upl_size_t size;
3770
3771 if (upl == UPL_NULL)
3772 return KERN_INVALID_ARGUMENT;
3773
3774 upl_lock(upl);
3775 if(upl->flags & UPL_PAGE_LIST_MAPPED) {
3776 addr = upl->kaddr;
3777 size = upl->size;
3778 assert(upl->ref_count > 1);
3779 upl->ref_count--; /* removing mapping ref */
3780 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3781 upl->kaddr = (vm_offset_t) 0;
3782 upl_unlock(upl);
3783
3784 vm_map_remove( map,
3785 vm_map_trunc_page(addr),
3786 vm_map_round_page(addr + size),
3787 VM_MAP_NO_FLAGS);
3788 return KERN_SUCCESS;
3789 }
3790 upl_unlock(upl);
3791 return KERN_FAILURE;
3792 }
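/*
 * A sketch of the expected pairing (illustrative only): a kernel
 * client that needs a virtual mapping of a UPL's pages does
 *
 *	vm_map_offset_t	addr;
 *
 *	kr = vm_map_enter_upl(kernel_map, upl, &addr);
 *	... touch the pages through addr ...
 *	kr = vm_map_remove_upl(kernel_map, upl);
 *
 * vm_map_enter_upl() takes an extra reference on the upl for the
 * mapping and vm_map_remove_upl() drops it, so the upl itself must
 * still be committed/aborted and deallocated by its creator.
 */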
3793
3794 kern_return_t
3795 upl_commit_range(
3796 upl_t upl,
3797 upl_offset_t offset,
3798 upl_size_t size,
3799 int flags,
3800 upl_page_info_t *page_list,
3801 mach_msg_type_number_t count,
3802 boolean_t *empty)
3803 {
3804 upl_size_t xfer_size = size;
3805 vm_object_t shadow_object;
3806 vm_object_t object = upl->map_object;
3807 vm_object_offset_t target_offset;
3808 int entry;
3809 wpl_array_t lite_list;
3810 int occupied;
3811 int delayed_unlock = 0;
3812 int clear_refmod = 0;
3813 boolean_t shadow_internal;
3814
3815 *empty = FALSE;
3816
3817 if (upl == UPL_NULL)
3818 return KERN_INVALID_ARGUMENT;
3819
3820
3821 if (count == 0)
3822 page_list = NULL;
3823
3824 if (object->pageout) {
3825 shadow_object = object->shadow;
3826 } else {
3827 shadow_object = object;
3828 }
3829
3830 upl_lock(upl);
3831
3832 if (upl->flags & UPL_ACCESS_BLOCKED) {
3833 /*
3834 * We used this UPL to block access to the pages by marking
3835 * them "busy". Now we need to clear the "busy" bit to allow
3836 * access to these pages again.
3837 */
3838 flags |= UPL_COMMIT_ALLOW_ACCESS;
3839 }
3840
3841 if (upl->flags & UPL_CLEAR_DIRTY)
3842 flags |= UPL_COMMIT_CLEAR_DIRTY;
3843
3844 if (upl->flags & UPL_DEVICE_MEMORY) {
3845 xfer_size = 0;
3846 } else if ((offset + size) > upl->size) {
3847 upl_unlock(upl);
3848 return KERN_FAILURE;
3849 }
3850
3851 if (upl->flags & UPL_INTERNAL) {
3852 lite_list = (wpl_array_t)
3853 ((((uintptr_t)upl) + sizeof(struct upl))
3854 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3855 } else {
3856 lite_list = (wpl_array_t)
3857 (((uintptr_t)upl) + sizeof(struct upl));
3858 }
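/*
 * The lite bitmap location mirrors the layout set up in upl_create():
 * for an INTERNAL upl it sits past both the upl structure and the
 * upl_page_info array; otherwise it immediately follows the upl
 * structure.
 */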
3859 if (object != shadow_object)
3860 vm_object_lock(object);
3861 vm_object_lock(shadow_object);
3862
3863 shadow_internal = shadow_object->internal;
3864
3865 entry = offset/PAGE_SIZE;
3866 target_offset = (vm_object_offset_t)offset;
3867
3868 while (xfer_size) {
3869 vm_page_t t,m;
3870 upl_page_info_t *p;
3871
3872 m = VM_PAGE_NULL;
3873
3874 if (upl->flags & UPL_LITE) {
3875 int pg_num;
3876
3877 pg_num = target_offset/PAGE_SIZE;
3878
3879 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3880 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3881 m = vm_page_lookup(shadow_object,
3882 target_offset + (upl->offset -
3883 shadow_object->paging_offset));
3884 }
3885 }
3886 if (object->pageout) {
3887 if ((t = vm_page_lookup(object, target_offset)) != NULL) {
3888 t->pageout = FALSE;
3889
3890 if (delayed_unlock) {
3891 delayed_unlock = 0;
3892 vm_page_unlock_queues();
3893 }
3894 VM_PAGE_FREE(t);
3895
3896 if (m == NULL) {
3897 m = vm_page_lookup(
3898 shadow_object,
3899 target_offset +
3900 object->shadow_offset);
3901 }
3902 if (m != VM_PAGE_NULL)
3903 vm_object_paging_end(m->object);
3904 }
3905 }
3906 if (m != VM_PAGE_NULL) {
3907
3908 clear_refmod = 0;
3909
3910 if (upl->flags & UPL_IO_WIRE) {
3911
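/*
 * "delayed_unlock" batches work under the page queues lock:
 * the lock is taken once, held for up to DELAYED_UNLOCK_LIMIT
 * pages, then dropped and re-taken, to bound the hold time.
 */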
3912 if (delayed_unlock == 0)
3913 vm_page_lock_queues();
3914
3915 vm_page_unwire(m);
3916
3917 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
3918 delayed_unlock = 0;
3919 vm_page_unlock_queues();
3920 }
3921 if (page_list) {
3922 page_list[entry].phys_addr = 0;
3923 }
3924 if (flags & UPL_COMMIT_SET_DIRTY) {
3925 m->dirty = TRUE;
3926 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3927 m->dirty = FALSE;
3928 clear_refmod |= VM_MEM_MODIFIED;
3929 }
3930 if (flags & UPL_COMMIT_INACTIVATE) {
3931 m->reference = FALSE;
3932 clear_refmod |= VM_MEM_REFERENCED;
3933 vm_page_deactivate(m);
3934 }
3935 if (clear_refmod)
3936 pmap_clear_refmod(m->phys_page, clear_refmod);
3937
3938 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3939 /*
3940 * We blocked access to the pages in this UPL.
3941 * Clear the "busy" bit and wake up any waiter
3942 * for this page.
3943 */
3944 PAGE_WAKEUP_DONE(m);
3945 }
3946
3947 target_offset += PAGE_SIZE_64;
3948 xfer_size -= PAGE_SIZE;
3949 entry++;
3950 continue;
3951 }
3952 if (delayed_unlock == 0)
3953 vm_page_lock_queues();
3954 /*
3955 * make sure to clear the hardware
3956 * modify or reference bits before
3957 * releasing the BUSY bit on this page
3958 * otherwise we risk losing a legitimate
3959 * change of state
3960 */
3961 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3962 m->dirty = FALSE;
3963 clear_refmod |= VM_MEM_MODIFIED;
3964 }
3965 if (flags & UPL_COMMIT_INACTIVATE)
3966 clear_refmod |= VM_MEM_REFERENCED;
3967
3968 if (clear_refmod)
3969 pmap_clear_refmod(m->phys_page, clear_refmod);
3970
3971 if (page_list) {
3972 p = &(page_list[entry]);
3973 if(p->phys_addr && p->pageout && !m->pageout) {
3974 m->busy = TRUE;
3975 m->pageout = TRUE;
3976 vm_page_wire(m);
3977 } else if (page_list[entry].phys_addr &&
3978 !p->pageout && m->pageout &&
3979 !m->dump_cleaning) {
3980 m->pageout = FALSE;
3981 m->absent = FALSE;
3982 m->overwriting = FALSE;
3983 vm_page_unwire(m);
3984 PAGE_WAKEUP_DONE(m);
3985 }
3986 page_list[entry].phys_addr = 0;
3987 }
3988 m->dump_cleaning = FALSE;
3989 if(m->laundry) {
3990 vm_pageout_throttle_up(m);
3991 }
3992 if(m->pageout) {
3993 m->cleaning = FALSE;
3994 m->pageout = FALSE;
3995 #if MACH_CLUSTER_STATS
3996 if (m->wanted) vm_pageout_target_collisions++;
3997 #endif
3998 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
3999 m->dirty = TRUE;
4000 else
4001 m->dirty = FALSE;
4002
4003 if(m->dirty) {
4004 vm_page_unwire(m);/* reactivates */
4005
4006 if (upl->flags & UPL_PAGEOUT) {
4007 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4008 VM_STAT(reactivations++);
4009 }
4010 PAGE_WAKEUP_DONE(m);
4011 } else {
4012 vm_page_free(m);/* clears busy, etc. */
4013
4014 if (upl->flags & UPL_PAGEOUT) {
4015 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4016
4017 if (page_list[entry].dirty)
4018 VM_STAT(pageouts++);
4019 }
4020 }
4021 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4022 delayed_unlock = 0;
4023 vm_page_unlock_queues();
4024 }
4025 target_offset += PAGE_SIZE_64;
4026 xfer_size -= PAGE_SIZE;
4027 entry++;
4028 continue;
4029 }
4030 #if MACH_CLUSTER_STATS
4031 m->dirty = pmap_is_modified(m->phys_page);
4032
4033 if (m->dirty) vm_pageout_cluster_dirtied++;
4034 else vm_pageout_cluster_cleaned++;
4035 if (m->wanted) vm_pageout_cluster_collisions++;
4036 #else
4037 m->dirty = 0;
4038 #endif
4039
4040 if((m->busy) && (m->cleaning)) {
4041 /* the request_page_list case */
4042 if(m->absent) {
4043 m->absent = FALSE;
4044 if(shadow_object->absent_count == 1)
4045 vm_object_absent_release(shadow_object);
4046 else
4047 shadow_object->absent_count--;
4048 }
4049 m->overwriting = FALSE;
4050 m->busy = FALSE;
4051 m->dirty = FALSE;
4052 } else if (m->overwriting) {
4053 /* alternate request page list, write to
4054 * page_list case. Occurs when the original
4055 * page was wired at the time of the list
4056 * request */
4057 assert(m->wire_count != 0);
4058 vm_page_unwire(m);/* reactivates */
4059 m->overwriting = FALSE;
4060 }
4061 m->cleaning = FALSE;
4062
4063 /* It is a part of the semantic of COPYOUT_FROM */
4064 /* UPLs that a commit implies cache sync */
4065 /* between the vm page and the backing store; */
4066 /* this can be used to strip the precious bit */
4067 /* as well as to clean */
4068 if (upl->flags & UPL_PAGE_SYNC_DONE)
4069 m->precious = FALSE;
4070
4071 if (flags & UPL_COMMIT_SET_DIRTY)
4072 m->dirty = TRUE;
4073
4074 if (flags & UPL_COMMIT_INACTIVATE) {
4075 m->reference = FALSE;
4076 vm_page_deactivate(m);
4077 } else if (!m->active && !m->inactive) {
4078 if (m->reference)
4079 vm_page_activate(m);
4080 else
4081 vm_page_deactivate(m);
4082 }
4083
4084 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4085 /*
4086 * We blocked access to the pages in this UPL.
4087 * Clear the "busy" bit on this page before we
4088 * wake up any waiter.
4089 */
4090 m->busy = FALSE;
4091 }
4092
4093 /*
4094 * Wakeup any thread waiting for the page to be un-cleaning.
4095 */
4096 PAGE_WAKEUP(m);
4097
4098 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4099 delayed_unlock = 0;
4100 vm_page_unlock_queues();
4101 }
4102 }
4103 target_offset += PAGE_SIZE_64;
4104 xfer_size -= PAGE_SIZE;
4105 entry++;
4106 }
4107 if (delayed_unlock)
4108 vm_page_unlock_queues();
4109
4110 occupied = 1;
4111
4112 if (upl->flags & UPL_DEVICE_MEMORY) {
4113 occupied = 0;
4114 } else if (upl->flags & UPL_LITE) {
4115 int pg_num;
4116 int i;
4117 pg_num = upl->size/PAGE_SIZE;
4118 pg_num = (pg_num + 31) >> 5;
4119 occupied = 0;
4120 for(i= 0; i<pg_num; i++) {
4121 if(lite_list[i] != 0) {
4122 occupied = 1;
4123 break;
4124 }
4125 }
4126 } else {
4127 if(queue_empty(&upl->map_object->memq)) {
4128 occupied = 0;
4129 }
4130 }
4131
4132 if(occupied == 0) {
4133 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4134 *empty = TRUE;
4135 }
4136 if(object == shadow_object)
4137 vm_object_paging_end(shadow_object);
4138 }
4139 vm_object_unlock(shadow_object);
4140 if (object != shadow_object)
4141 vm_object_unlock(object);
4142 upl_unlock(upl);
4143
4144 return KERN_SUCCESS;
4145 }
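
/*
 * A minimal usage sketch (hypothetical caller, illustration only,
 * kept under #if 0): once the I/O covered by a UPL has completed,
 * commit the transferred range in a single call, clearing the dirty
 * state and letting the pages be aged out.  "io_done_commit" and its
 * parameters are made-up names.
 */
#if 0
static kern_return_t
io_done_commit(
	upl_t			upl,
	upl_size_t		io_size,
	upl_page_info_t		*pl,
	mach_msg_type_number_t	pl_count)
{
	boolean_t	empty;

	return upl_commit_range(upl,
				(upl_offset_t) 0,	/* start of the UPL */
				io_size,		/* bytes transferred */
				UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_INACTIVATE,
				pl, pl_count, &empty);
}
#endif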
4146
4147 kern_return_t
4148 upl_abort_range(
4149 upl_t upl,
4150 upl_offset_t offset,
4151 upl_size_t size,
4152 int error,
4153 boolean_t *empty)
4154 {
4155 upl_size_t xfer_size = size;
4156 vm_object_t shadow_object;
4157 vm_object_t object = upl->map_object;
4158 vm_object_offset_t target_offset;
4159 int entry;
4160 wpl_array_t lite_list;
4161 int occupied;
4162 boolean_t shadow_internal;
4163
4164 *empty = FALSE;
4165
4166 if (upl == UPL_NULL)
4167 return KERN_INVALID_ARGUMENT;
4168
4169 if (upl->flags & UPL_IO_WIRE) {
4170 return upl_commit_range(upl,
4171 offset, size, 0,
4172 NULL, 0, empty);
4173 }
4174
4175 if(object->pageout) {
4176 shadow_object = object->shadow;
4177 } else {
4178 shadow_object = object;
4179 }
4180
4181 upl_lock(upl);
4182 if(upl->flags & UPL_DEVICE_MEMORY) {
4183 xfer_size = 0;
4184 } else if ((offset + size) > upl->size) {
4185 upl_unlock(upl);
4186 return KERN_FAILURE;
4187 }
4188 if (object != shadow_object)
4189 vm_object_lock(object);
4190 vm_object_lock(shadow_object);
4191
4192 shadow_internal = shadow_object->internal;
4193
4194 if(upl->flags & UPL_INTERNAL) {
4195 lite_list = (wpl_array_t)
4196 ((((uintptr_t)upl) + sizeof(struct upl))
4197 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4198 } else {
4199 lite_list = (wpl_array_t)
4200 (((uintptr_t)upl) + sizeof(struct upl));
4201 }
4202
4203 entry = offset/PAGE_SIZE;
4204 target_offset = (vm_object_offset_t)offset;
4205 while(xfer_size) {
4206 vm_page_t t,m;
4207
4208 m = VM_PAGE_NULL;
4209 if(upl->flags & UPL_LITE) {
4210 int pg_num;
4211 pg_num = target_offset/PAGE_SIZE;
4212 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4213 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4214 m = vm_page_lookup(shadow_object,
4215 target_offset + (upl->offset -
4216 shadow_object->paging_offset));
4217 }
4218 }
4219 if(object->pageout) {
4220 if ((t = vm_page_lookup(object, target_offset))
4221 != NULL) {
4222 t->pageout = FALSE;
4223 VM_PAGE_FREE(t);
4224 if(m == NULL) {
4225 m = vm_page_lookup(
4226 shadow_object,
4227 target_offset +
4228 object->shadow_offset);
4229 }
4230 if(m != VM_PAGE_NULL)
4231 vm_object_paging_end(m->object);
4232 }
4233 }
4234 if(m != VM_PAGE_NULL) {
4235 vm_page_lock_queues();
4236 if(m->absent) {
4237 boolean_t must_free = TRUE;
4238
4239 /* COPYOUT = FALSE case */
4240 /* check for error conditions which must */
4241 /* be passed back to the page's customer */
4242 if(error & UPL_ABORT_RESTART) {
4243 m->restart = TRUE;
4244 m->absent = FALSE;
4245 vm_object_absent_release(m->object);
4246 m->page_error = KERN_MEMORY_ERROR;
4247 m->error = TRUE;
4248 must_free = FALSE;
4249 } else if(error & UPL_ABORT_UNAVAILABLE) {
4250 m->restart = FALSE;
4251 m->unusual = TRUE;
4252 must_free = FALSE;
4253 } else if(error & UPL_ABORT_ERROR) {
4254 m->restart = FALSE;
4255 m->absent = FALSE;
4256 vm_object_absent_release(m->object);
4257 m->page_error = KERN_MEMORY_ERROR;
4258 m->error = TRUE;
4259 must_free = FALSE;
4260 }
4261
4262 /*
4263 * ENCRYPTED SWAP:
4264 * If the page was already encrypted,
4265 * we don't really need to decrypt it
4266 * now. It will get decrypted later,
4267 * on demand, as soon as someone needs
4268 * to access its contents.
4269 */
4270
4271 m->cleaning = FALSE;
4272 m->overwriting = FALSE;
4273 PAGE_WAKEUP_DONE(m);
4274
4275 if (must_free == TRUE) {
4276 vm_page_free(m);
4277 } else {
4278 vm_page_activate(m);
4279 }
4280 vm_page_unlock_queues();
4281
4282 target_offset += PAGE_SIZE_64;
4283 xfer_size -= PAGE_SIZE;
4284 entry++;
4285 continue;
4286 }
4287 /*
4288 * Handle the trusted pager throttle.
4289 */
4290 if (m->laundry) {
4291 vm_pageout_throttle_up(m);
4292 }
4293 if(m->pageout) {
4294 assert(m->busy);
4295 assert(m->wire_count == 1);
4296 m->pageout = FALSE;
4297 vm_page_unwire(m);
4298 }
4299 m->dump_cleaning = FALSE;
4300 m->cleaning = FALSE;
4301 m->overwriting = FALSE;
4302 #if MACH_PAGEMAP
4303 vm_external_state_clr(
4304 m->object->existence_map, m->offset);
4305 #endif /* MACH_PAGEMAP */
4306 if(error & UPL_ABORT_DUMP_PAGES) {
4307 vm_page_free(m);
4308 pmap_disconnect(m->phys_page);
4309 } else {
4310 PAGE_WAKEUP_DONE(m);
4311 }
4312 vm_page_unlock_queues();
4313 }
4314 target_offset += PAGE_SIZE_64;
4315 xfer_size -= PAGE_SIZE;
4316 entry++;
4317 }
4318 occupied = 1;
4319 if (upl->flags & UPL_DEVICE_MEMORY) {
4320 occupied = 0;
4321 } else if (upl->flags & UPL_LITE) {
4322 int pg_num;
4323 int i;
4324 pg_num = upl->size/PAGE_SIZE;
4325 pg_num = (pg_num + 31) >> 5;
4326 occupied = 0;
4327 for(i= 0; i<pg_num; i++) {
4328 if(lite_list[i] != 0) {
4329 occupied = 1;
4330 break;
4331 }
4332 }
4333 } else {
4334 if(queue_empty(&upl->map_object->memq)) {
4335 occupied = 0;
4336 }
4337 }
4338
4339 if(occupied == 0) {
4340 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4341 *empty = TRUE;
4342 }
4343 if(object == shadow_object)
4344 vm_object_paging_end(shadow_object);
4345 }
4346 vm_object_unlock(shadow_object);
4347 if (object != shadow_object)
4348 vm_object_unlock(object);
4349
4350 upl_unlock(upl);
4351
4352 return KERN_SUCCESS;
4353 }
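
/*
 * A minimal abort sketch (hypothetical caller, illustration only,
 * kept under #if 0): if the backing I/O fails part way through,
 * abort the untransferred tail of the UPL, flagging absent pages in
 * error and dumping resident ones so a later fault can re-fetch them.
 * "io_failed_abort" and its parameters are made-up names.
 */
#if 0
static void
io_failed_abort(
	upl_t		upl,
	upl_offset_t	bytes_done,
	upl_size_t	total_size)
{
	boolean_t	empty;

	(void) upl_abort_range(upl, bytes_done, total_size - bytes_done,
			       UPL_ABORT_ERROR | UPL_ABORT_DUMP_PAGES,
			       &empty);
}
#endif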
4354
4355 kern_return_t
4356 upl_abort(
4357 upl_t upl,
4358 int error)
4359 {
4360 vm_object_t object = NULL;
4361 vm_object_t shadow_object = NULL;
4362 vm_object_offset_t offset;
4363 vm_object_offset_t shadow_offset;
4364 vm_object_offset_t target_offset;
4365 upl_size_t i;
4366 wpl_array_t lite_list;
4367 vm_page_t t,m;
4368 int occupied;
4369 boolean_t shadow_internal;
4370
4371 if (upl == UPL_NULL)
4372 return KERN_INVALID_ARGUMENT;
4373
4374 if (upl->flags & UPL_IO_WIRE) {
4375 boolean_t empty;
4376 return upl_commit_range(upl,
4377 0, upl->size, 0,
4378 NULL, 0, &empty);
4379 }
4380
4381 upl_lock(upl);
4382 if(upl->flags & UPL_DEVICE_MEMORY) {
4383 upl_unlock(upl);
4384 return KERN_SUCCESS;
4385 }
4386
4387 object = upl->map_object;
4388
4389 if (object == NULL) {
4390 panic("upl_abort: upl object is not backed by an object");
4391 upl_unlock(upl);
4392 return KERN_INVALID_ARGUMENT;
4393 }
4394
4395 if(object->pageout) {
4396 shadow_object = object->shadow;
4397 shadow_offset = object->shadow_offset;
4398 } else {
4399 shadow_object = object;
4400 shadow_offset = upl->offset - object->paging_offset;
4401 }
4402
4403 if(upl->flags & UPL_INTERNAL) {
4404 lite_list = (wpl_array_t)
4405 ((((uintptr_t)upl) + sizeof(struct upl))
4406 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4407 } else {
4408 lite_list = (wpl_array_t)
4409 (((uintptr_t)upl) + sizeof(struct upl));
4410 }
4411 offset = 0;
4412
4413 if (object != shadow_object)
4414 vm_object_lock(object);
4415 vm_object_lock(shadow_object);
4416
4417 shadow_internal = shadow_object->internal;
4418
4419 for(i = 0; i<(upl->size); i+=PAGE_SIZE, offset += PAGE_SIZE_64) {
4420 m = VM_PAGE_NULL;
4421 target_offset = offset + shadow_offset;
4422 if(upl->flags & UPL_LITE) {
4423 int pg_num;
4424 pg_num = offset/PAGE_SIZE;
4425 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4426 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4427 m = vm_page_lookup(
4428 shadow_object, target_offset);
4429 }
4430 }
4431 if(object->pageout) {
4432 if ((t = vm_page_lookup(object, offset)) != NULL) {
4433 t->pageout = FALSE;
4434 VM_PAGE_FREE(t);
4435 if(m == NULL) {
4436 m = vm_page_lookup(
4437 shadow_object, target_offset);
4438 }
4439 if(m != VM_PAGE_NULL)
4440 vm_object_paging_end(m->object);
4441 }
4442 }
4443 if(m != VM_PAGE_NULL) {
4444 vm_page_lock_queues();
4445 if(m->absent) {
4446 boolean_t must_free = TRUE;
4447
4448 /* COPYOUT = FALSE case */
4449 /* check for error conditions which must */
4450 /* be passed back to the page's customer */
4451 if(error & UPL_ABORT_RESTART) {
4452 m->restart = TRUE;
4453 m->absent = FALSE;
4454 vm_object_absent_release(m->object);
4455 m->page_error = KERN_MEMORY_ERROR;
4456 m->error = TRUE;
4457 must_free = FALSE;
4458 } else if(error & UPL_ABORT_UNAVAILABLE) {
4459 m->restart = FALSE;
4460 m->unusual = TRUE;
4461 must_free = FALSE;
4462 } else if(error & UPL_ABORT_ERROR) {
4463 m->restart = FALSE;
4464 m->absent = FALSE;
4465 vm_object_absent_release(m->object);
4466 m->page_error = KERN_MEMORY_ERROR;
4467 m->error = TRUE;
4468 must_free = FALSE;
4469 }
4470
4471 /*
4472 * ENCRYPTED SWAP:
4473 * If the page was already encrypted,
4474 * we don't really need to decrypt it
4475 * now. It will get decrypted later,
4476 * on demand, as soon as someone needs
4477 * to access its contents.
4478 */
4479
4480 m->cleaning = FALSE;
4481 m->overwriting = FALSE;
4482 PAGE_WAKEUP_DONE(m);
4483
4484 if (must_free == TRUE) {
4485 vm_page_free(m);
4486 } else {
4487 vm_page_activate(m);
4488 }
4489 vm_page_unlock_queues();
4490 continue;
4491 }
4492 /*
4493 * Handle the trusted pager throttle.
4494 */
4495 if (m->laundry) {
4496 vm_pageout_throttle_up(m);
4497 }
4498 if(m->pageout) {
4499 assert(m->busy);
4500 assert(m->wire_count == 1);
4501 m->pageout = FALSE;
4502 vm_page_unwire(m);
4503 }
4504 m->dump_cleaning = FALSE;
4505 m->cleaning = FALSE;
4506 m->overwriting = FALSE;
4507 #if MACH_PAGEMAP
4508 vm_external_state_clr(
4509 m->object->existence_map, m->offset);
4510 #endif /* MACH_PAGEMAP */
4511 if(error & UPL_ABORT_DUMP_PAGES) {
4512 vm_page_free(m);
4513 pmap_disconnect(m->phys_page);
4514 } else {
4515 PAGE_WAKEUP_DONE(m);
4516 }
4517 vm_page_unlock_queues();
4518 }
4519 }
4520 occupied = 1;
4521 if (upl->flags & UPL_DEVICE_MEMORY) {
4522 occupied = 0;
4523 } else if (upl->flags & UPL_LITE) {
4524 int pg_num;
4525 int j;
4526 pg_num = upl->size/PAGE_SIZE;
4527 pg_num = (pg_num + 31) >> 5;
4528 occupied = 0;
4529 for(j= 0; j<pg_num; j++) {
4530 if(lite_list[j] != 0) {
4531 occupied = 1;
4532 break;
4533 }
4534 }
4535 } else {
4536 if(queue_empty(&upl->map_object->memq)) {
4537 occupied = 0;
4538 }
4539 }
4540
4541 if(occupied == 0) {
4542 if(object == shadow_object)
4543 vm_object_paging_end(shadow_object);
4544 }
4545 vm_object_unlock(shadow_object);
4546 if (object != shadow_object)
4547 vm_object_unlock(object);
4548
4549 upl_unlock(upl);
4550 return KERN_SUCCESS;
4551 }
4552
4553 /* an option on commit should be wire */
4554 kern_return_t
4555 upl_commit(
4556 upl_t upl,
4557 upl_page_info_t *page_list,
4558 mach_msg_type_number_t count)
4559 {
4560 if (upl == UPL_NULL)
4561 return KERN_INVALID_ARGUMENT;
4562
4563 if(upl->flags & (UPL_LITE | UPL_IO_WIRE)) {
4564 boolean_t empty;
4565 return upl_commit_range(upl, 0, upl->size, 0,
4566 page_list, count, &empty);
4567 }
4568
4569 if (count == 0)
4570 page_list = NULL;
4571
4572 upl_lock(upl);
4573 if (upl->flags & UPL_DEVICE_MEMORY)
4574 page_list = NULL;
4575
4576 if (upl->flags & UPL_ENCRYPTED) {
4577 /*
4578 * ENCRYPTED SWAP:
4579 * This UPL was encrypted, but we don't need
4580 * to decrypt here. We'll decrypt each page
4581 * later, on demand, as soon as someone needs
4582 * to access the page's contents.
4583 */
4584 }
4585
4586 if ((upl->flags & UPL_CLEAR_DIRTY) ||
4587 (upl->flags & UPL_PAGE_SYNC_DONE) || page_list) {
4588 vm_object_t shadow_object = upl->map_object->shadow;
4589 vm_object_t object = upl->map_object;
4590 vm_object_offset_t target_offset;
4591 upl_size_t xfer_end;
4592 int entry;
4593
4594 vm_page_t t, m;
4595 upl_page_info_t *p;
4596
4597 if (object != shadow_object)
4598 vm_object_lock(object);
4599 vm_object_lock(shadow_object);
4600
4601 entry = 0;
4602 target_offset = object->shadow_offset;
4603 xfer_end = upl->size + object->shadow_offset;
4604
4605 while(target_offset < xfer_end) {
4606
4607 if ((t = vm_page_lookup(object,
4608 target_offset - object->shadow_offset))
4609 == NULL) {
4610 target_offset += PAGE_SIZE_64;
4611 entry++;
4612 continue;
4613 }
4614
4615 m = vm_page_lookup(shadow_object, target_offset);
4616 if(m != VM_PAGE_NULL) {
4617 /*
4618 * ENCRYPTED SWAP:
4619 * If this page was encrypted, we
4620 * don't need to decrypt it here.
4621 * We'll decrypt it later, on demand,
4622 * as soon as someone needs to access
4623 * its contents.
4624 */
4625
4626 if (upl->flags & UPL_CLEAR_DIRTY) {
4627 pmap_clear_modify(m->phys_page);
4628 m->dirty = FALSE;
4629 }
4630 /* It is a part of the semantic of */
4631 /* COPYOUT_FROM UPLs that a commit */
4632 /* implies cache sync between the */
4633 /* vm page and the backing store; */
4634 /* this can be used to strip the */
4635 /* precious bit as well as clean */
4636 if (upl->flags & UPL_PAGE_SYNC_DONE)
4637 m->precious = FALSE;
4638
4639 if(page_list) {
4640 p = &(page_list[entry]);
4641 if(page_list[entry].phys_addr &&
4642 p->pageout && !m->pageout) {
4643 vm_page_lock_queues();
4644 m->busy = TRUE;
4645 m->pageout = TRUE;
4646 vm_page_wire(m);
4647 vm_page_unlock_queues();
4648 } else if (page_list[entry].phys_addr &&
4649 !p->pageout && m->pageout &&
4650 !m->dump_cleaning) {
4651 vm_page_lock_queues();
4652 m->pageout = FALSE;
4653 m->absent = FALSE;
4654 m->overwriting = FALSE;
4655 vm_page_unwire(m);
4656 PAGE_WAKEUP_DONE(m);
4657 vm_page_unlock_queues();
4658 }
4659 page_list[entry].phys_addr = 0;
4660 }
4661 }
4662 target_offset += PAGE_SIZE_64;
4663 entry++;
4664 }
4665 vm_object_unlock(shadow_object);
4666 if (object != shadow_object)
4667 vm_object_unlock(object);
4668
4669 }
4670 if (upl->flags & UPL_DEVICE_MEMORY) {
4671 vm_object_lock(upl->map_object->shadow);
4672 if(upl->map_object == upl->map_object->shadow)
4673 vm_object_paging_end(upl->map_object->shadow);
4674 vm_object_unlock(upl->map_object->shadow);
4675 }
4676 upl_unlock(upl);
4677 return KERN_SUCCESS;
4678 }
4679
4680
4681
4682 kern_return_t
4683 vm_object_iopl_request(
4684 vm_object_t object,
4685 vm_object_offset_t offset,
4686 upl_size_t size,
4687 upl_t *upl_ptr,
4688 upl_page_info_array_t user_page_list,
4689 unsigned int *page_list_count,
4690 int cntrl_flags)
4691 {
4692 vm_page_t dst_page;
4693 vm_object_offset_t dst_offset = offset;
4694 upl_size_t xfer_size = size;
4695 upl_t upl = NULL;
4696 unsigned int entry;
4697 wpl_array_t lite_list = NULL;
4698 int page_field_size;
4699 int delayed_unlock = 0;
4700 int no_zero_fill = FALSE;
4701 vm_page_t alias_page = NULL;
4702 kern_return_t ret;
4703 vm_prot_t prot;
4704
4705
4706 if (cntrl_flags & ~UPL_VALID_FLAGS) {
4707 /*
4708 * For forward compatibility's sake,
4709 * reject any unknown flag.
4710 */
4711 return KERN_INVALID_VALUE;
4712 }
4713 if (vm_lopage_poolsize == 0)
4714 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4715
4716 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4717 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4718 return KERN_INVALID_VALUE;
4719
4720 if (object->phys_contiguous) {
4721 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4722 return KERN_INVALID_ADDRESS;
4723
4724 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4725 return KERN_INVALID_ADDRESS;
4726 }
4727 }
4728
4729 if (cntrl_flags & UPL_ENCRYPT) {
4730 /*
4731 * ENCRYPTED SWAP:
4732 * The paging path doesn't use this interface,
4733 * so we don't support the UPL_ENCRYPT flag
4734 * here. We won't encrypt the pages.
4735 */
4736 assert(! (cntrl_flags & UPL_ENCRYPT));
4737 }
4738
4739 if (cntrl_flags & UPL_NOZEROFILL)
4740 no_zero_fill = TRUE;
4741
4742 if (cntrl_flags & UPL_COPYOUT_FROM)
4743 prot = VM_PROT_READ;
4744 else
4745 prot = VM_PROT_READ | VM_PROT_WRITE;
4746
4747 if(((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
4748 size = MAX_UPL_TRANSFER * page_size;
4749 }
4750
4751 if(cntrl_flags & UPL_SET_INTERNAL)
4752 if(page_list_count != NULL)
4753 *page_list_count = MAX_UPL_TRANSFER;
4754 if(((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4755 ((page_list_count != NULL) && (*page_list_count != 0)
4756 && *page_list_count < (size/page_size)))
4757 return KERN_INVALID_ARGUMENT;
4758
4759 if((!object->internal) && (object->paging_offset != 0))
4760 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
4761
4762 if(object->phys_contiguous) {
4763 /* No paging operations are possible against this memory */
4764 /* and so no need for map object, ever */
4765 cntrl_flags |= UPL_SET_LITE;
4766 }
4767
4768 if(upl_ptr) {
4769 if(cntrl_flags & UPL_SET_INTERNAL) {
4770 if(cntrl_flags & UPL_SET_LITE) {
4771 upl = upl_create(
4772 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
4773 size);
4774 user_page_list = (upl_page_info_t *)
4775 (((uintptr_t)upl) + sizeof(struct upl));
4776 lite_list = (wpl_array_t)
4777 (((uintptr_t)user_page_list) +
4778 ((size/PAGE_SIZE) *
4779 sizeof(upl_page_info_t)));
4780 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4781 page_field_size =
4782 (page_field_size + 3) & 0xFFFFFFFC;
4783 bzero((char *)lite_list, page_field_size);
4784 upl->flags =
4785 UPL_LITE | UPL_INTERNAL | UPL_IO_WIRE;
4786 } else {
4787 upl = upl_create(UPL_CREATE_INTERNAL, size);
4788 user_page_list = (upl_page_info_t *)
4789 (((uintptr_t)upl)
4790 + sizeof(struct upl));
4791 upl->flags = UPL_INTERNAL | UPL_IO_WIRE;
4792 }
4793 } else {
4794 if(cntrl_flags & UPL_SET_LITE) {
4795 upl = upl_create(UPL_CREATE_LITE, size);
4796 lite_list = (wpl_array_t)
4797 (((uintptr_t)upl) + sizeof(struct upl));
4798 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4799 page_field_size =
4800 (page_field_size + 3) & 0xFFFFFFFC;
4801 bzero((char *)lite_list, page_field_size);
4802 upl->flags = UPL_LITE | UPL_IO_WIRE;
4803 } else {
4804 upl = upl_create(UPL_CREATE_EXTERNAL, size);
4805 upl->flags = UPL_IO_WIRE;
4806 }
4807 }
4808
4809 if(object->phys_contiguous) {
4810 upl->map_object = object;
4811 /* don't need any shadow mappings for this one */
4812 /* since it is already I/O memory */
4813 upl->flags |= UPL_DEVICE_MEMORY;
4814
4815 vm_object_lock(object);
4816 vm_object_paging_begin(object);
4817 vm_object_unlock(object);
4818
4819 /* paging in progress also protects the paging_offset */
4820 upl->offset = offset + object->paging_offset;
4821 upl->size = size;
4822 *upl_ptr = upl;
4823 if(user_page_list) {
4824 user_page_list[0].phys_addr =
4825 (offset + object->shadow_offset)>>PAGE_SHIFT;
4826 user_page_list[0].device = TRUE;
4827 }
4828 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4829
4830 if(page_list_count != NULL) {
4831 if (upl->flags & UPL_INTERNAL) {
4832 *page_list_count = 0;
4833 } else {
4834 *page_list_count = 1;
4835 }
4836 }
4837 return KERN_SUCCESS;
4838 }
4839 if(user_page_list)
4840 user_page_list[0].device = FALSE;
4841
4842 if(cntrl_flags & UPL_SET_LITE) {
4843 upl->map_object = object;
4844 } else {
4845 upl->map_object = vm_object_allocate(size);
4846 vm_object_lock(upl->map_object);
4847 upl->map_object->shadow = object;
4848 upl->map_object->pageout = TRUE;
4849 upl->map_object->can_persist = FALSE;
4850 upl->map_object->copy_strategy =
4851 MEMORY_OBJECT_COPY_NONE;
4852 upl->map_object->shadow_offset = offset;
4853 upl->map_object->wimg_bits = object->wimg_bits;
4854 vm_object_unlock(upl->map_object);
4855 }
4856 }
4857 vm_object_lock(object);
4858 vm_object_paging_begin(object);
4859
4860 if (!object->phys_contiguous) {
4861 /* Protect user space from future COW operations */
4862 object->true_share = TRUE;
4863 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4864 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4865 }
4866
4867 /* we can lock the upl offset now that paging_in_progress is set */
4868 if(upl_ptr) {
4869 upl->size = size;
4870 upl->offset = offset + object->paging_offset;
4871 *upl_ptr = upl;
4872 #ifdef UPL_DEBUG
4873 queue_enter(&object->uplq, upl, upl_t, uplq);
4874 #endif /* UPL_DEBUG */
4875 }
4876
4877 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4878 /*
4879 * The user requested that access to the pages in this UPL
4880 * be blocked until the UPL is committed or aborted.
4881 */
4882 upl->flags |= UPL_ACCESS_BLOCKED;
4883 }
4884
4885 entry = 0;
4886 while (xfer_size) {
4887 if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
4888 if (delayed_unlock) {
4889 delayed_unlock = 0;
4890 vm_page_unlock_queues();
4891 }
4892 vm_object_unlock(object);
4893 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4894 vm_object_lock(object);
4895 }
4896 dst_page = vm_page_lookup(object, dst_offset);
4897
4898 /*
4899 * ENCRYPTED SWAP:
4900 * If the page is encrypted, we need to decrypt it,
4901 * so force a soft page fault.
4902 */
4903 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4904 (dst_page->encrypted) ||
4905 (dst_page->unusual && (dst_page->error ||
4906 dst_page->restart ||
4907 dst_page->absent ||
4908 dst_page->fictitious ||
4909 (prot & dst_page->page_lock)))) {
4910 vm_fault_return_t result;
4911 do {
4912 vm_page_t top_page;
4913 kern_return_t error_code;
4914 int interruptible;
4915
4916 vm_object_offset_t lo_offset = offset;
4917 vm_object_offset_t hi_offset = offset + size;
4918
4919
4920 if (delayed_unlock) {
4921 delayed_unlock = 0;
4922 vm_page_unlock_queues();
4923 }
4924
4925 if(cntrl_flags & UPL_SET_INTERRUPTIBLE) {
4926 interruptible = THREAD_ABORTSAFE;
4927 } else {
4928 interruptible = THREAD_UNINT;
4929 }
4930
4931 result = vm_fault_page(object, dst_offset,
4932 prot | VM_PROT_WRITE, FALSE,
4933 interruptible,
4934 lo_offset, hi_offset,
4935 VM_BEHAVIOR_SEQUENTIAL,
4936 &prot, &dst_page, &top_page,
4937 (int *)0,
4938 &error_code, no_zero_fill, FALSE, NULL, 0);
4939
4940 switch(result) {
4941 case VM_FAULT_SUCCESS:
4942
4943 PAGE_WAKEUP_DONE(dst_page);
4944
4945 /*
4946 * Release paging references and
4947 * top-level placeholder page, if any.
4948 */
4949
4950 if(top_page != VM_PAGE_NULL) {
4951 vm_object_t local_object;
4952 local_object =
4953 top_page->object;
4954 if(top_page->object
4955 != dst_page->object) {
4956 vm_object_lock(
4957 local_object);
4958 VM_PAGE_FREE(top_page);
4959 vm_object_paging_end(
4960 local_object);
4961 vm_object_unlock(
4962 local_object);
4963 } else {
4964 VM_PAGE_FREE(top_page);
4965 vm_object_paging_end(
4966 local_object);
4967 }
4968 }
4969
4970 break;
4971
4972
4973 case VM_FAULT_RETRY:
4974 vm_object_lock(object);
4975 vm_object_paging_begin(object);
4976 break;
4977
4978 case VM_FAULT_FICTITIOUS_SHORTAGE:
4979 vm_page_more_fictitious();
4980 vm_object_lock(object);
4981 vm_object_paging_begin(object);
4982 break;
4983
4984 case VM_FAULT_MEMORY_SHORTAGE:
4985 if (vm_page_wait(interruptible)) {
4986 vm_object_lock(object);
4987 vm_object_paging_begin(object);
4988 break;
4989 }
4990 /* fall thru */
4991
4992 case VM_FAULT_INTERRUPTED:
4993 error_code = MACH_SEND_INTERRUPTED;
4994 case VM_FAULT_MEMORY_ERROR:
4995 ret = (error_code ? error_code:
4996 KERN_MEMORY_ERROR);
4997 vm_object_lock(object);
4998
4999 goto return_err;
5000 }
5001 } while ((result != VM_FAULT_SUCCESS)
5002 || (result == VM_FAULT_INTERRUPTED));
5003 }
5004
5005 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
5006 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
5007 vm_page_t low_page;
5008 int refmod;
5009
5010 /*
5011 * support devices that can't DMA above 32 bits
5012 * by substituting pages from a pool of low address
5013 * memory for any pages we find above the 4G mark.
5014 * We can't substitute if the page is already wired because
5015 * we don't know whether that physical address has been
5016 * handed out to some other 64 bit capable DMA device to use
5017 */
5018 if (dst_page->wire_count) {
5019 ret = KERN_PROTECTION_FAILURE;
5020 goto return_err;
5021 }
5022 if (delayed_unlock) {
5023 delayed_unlock = 0;
5024 vm_page_unlock_queues();
5025 }
5026 low_page = vm_page_grablo();
5027
5028 if (low_page == VM_PAGE_NULL) {
5029 ret = KERN_RESOURCE_SHORTAGE;
5030 goto return_err;
5031 }
5032 /*
5033 * from here until the vm_page_replace completes
5034 * we mustn't drop the object lock... we don't
5035 * want anyone refaulting this page in and using
5036 * it after we disconnect it... we want the fault
5037 * to find the new page being substituted.
5038 */
5039 refmod = pmap_disconnect(dst_page->phys_page);
5040
5041 vm_page_copy(dst_page, low_page);
5042
5043 low_page->reference = dst_page->reference;
5044 low_page->dirty = dst_page->dirty;
5045
5046 if (refmod & VM_MEM_REFERENCED)
5047 low_page->reference = TRUE;
5048 if (refmod & VM_MEM_MODIFIED)
5049 low_page->dirty = TRUE;
5050
5051 vm_page_lock_queues();
5052 vm_page_replace(low_page, object, dst_offset);
5053 /*
5054 * keep the queue lock since we're going to
5055 * need it immediately
5056 */
5057 delayed_unlock = 1;
5058
5059 dst_page = low_page;
5060 /*
5061 * vm_page_grablo returned the page marked
5062 * BUSY... we don't need a PAGE_WAKEUP_DONE
5063 * here, because we've never dropped the object lock
5064 */
5065 dst_page->busy = FALSE;
5066 }
5067 if (delayed_unlock == 0)
5068 vm_page_lock_queues();
5069 vm_page_wire(dst_page);
5070
5071 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5072 /*
5073 * Mark the page "busy" to block any future page fault
5074 * on this page. We'll also remove the mapping
5075 * of all these pages before leaving this routine.
5076 */
5077 assert(!dst_page->fictitious);
5078 dst_page->busy = TRUE;
5079 }
5080
5081 if (upl_ptr) {
5082 if (cntrl_flags & UPL_SET_LITE) {
5083 int pg_num;
5084 pg_num = (dst_offset-offset)/PAGE_SIZE;
5085 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5086 } else {
5087 /*
5088 * Convert the fictitious page to a
5089 * private shadow of the real page.
5090 */
5091 assert(alias_page->fictitious);
5092 alias_page->fictitious = FALSE;
5093 alias_page->private = TRUE;
5094 alias_page->pageout = TRUE;
5095 alias_page->phys_page = dst_page->phys_page;
5096 vm_page_wire(alias_page);
5097
5098 vm_page_insert(alias_page,
5099 upl->map_object, size - xfer_size);
5100 assert(!alias_page->wanted);
5101 alias_page->busy = FALSE;
5102 alias_page->absent = FALSE;
5103 }
5104
5105 /* expect the page to be used */
5106 dst_page->reference = TRUE;
5107
5108 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5109 dst_page->dirty = TRUE;
5110 alias_page = NULL;
5111
5112 if (dst_page->phys_page > upl->highest_page)
5113 upl->highest_page = dst_page->phys_page;
5114
5115 if (user_page_list) {
5116 user_page_list[entry].phys_addr
5117 = dst_page->phys_page;
5118 user_page_list[entry].dirty =
5119 dst_page->dirty;
5120 user_page_list[entry].pageout =
5121 dst_page->pageout;
5122 user_page_list[entry].absent =
5123 dst_page->absent;
5124 user_page_list[entry].precious =
5125 dst_page->precious;
5126 }
5127 }
5128 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
5129 delayed_unlock = 0;
5130 vm_page_unlock_queues();
5131 }
5132 entry++;
5133 dst_offset += PAGE_SIZE_64;
5134 xfer_size -= PAGE_SIZE;
5135 }
5136 if (delayed_unlock)
5137 vm_page_unlock_queues();
5138
5139 if (upl->flags & UPL_INTERNAL) {
5140 if(page_list_count != NULL)
5141 *page_list_count = 0;
5142 } else if (page_list_count != NULL &&
5143 *page_list_count > entry) {
5144 *page_list_count = entry;
5145 }
5146
5147 if (alias_page != NULL) {
5148 vm_page_lock_queues();
5149 vm_page_free(alias_page);
5150 vm_page_unlock_queues();
5151 }
5152
5153 vm_object_unlock(object);
5154
5155 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5156 /*
5157 * We've marked all the pages "busy" so that future
5158 * page faults will block.
5159 * Now remove the mapping for these pages, so that they
5160 * can't be accessed without causing a page fault.
5161 */
5162 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5163 PMAP_NULL, 0, VM_PROT_NONE);
5164 }
5165
5166 return KERN_SUCCESS;
5167
5168
5169 return_err:
5170 if (delayed_unlock)
5171 vm_page_unlock_queues();
5172
5173 for (; offset < dst_offset; offset += PAGE_SIZE) {
5174 dst_page = vm_page_lookup(object, offset);
5175
5176 if (dst_page == VM_PAGE_NULL)
5177 panic("vm_object_iopl_request: Wired pages missing.\n");
5178 vm_page_lock_queues();
5179 vm_page_unwire(dst_page);
5180 vm_page_unlock_queues();
5181 VM_STAT(reactivations++);
5182 }
5183 vm_object_paging_end(object);
5184 vm_object_unlock(object);
5185 upl_destroy(upl);
5186
5187 return ret;
5188 }
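
/*
 * A minimal sketch of the I/O-wire path (hypothetical caller,
 * illustration only, kept under #if 0): wire a page-aligned range of
 * "object" for a device transfer, then commit the range when the
 * transfer completes.  The fixed-size page list and the names
 * "wire_for_device_io" / "EXAMPLE_IO_PAGES" are made up for the
 * example; a real caller sizes the list to its transfer.
 */
#if 0
#define EXAMPLE_IO_PAGES	8

static kern_return_t
wire_for_device_io(
	vm_object_t		object,
	vm_object_offset_t	offset,
	upl_size_t		size)	/* at most EXAMPLE_IO_PAGES pages */
{
	upl_t		upl = NULL;
	upl_page_info_t	pl[EXAMPLE_IO_PAGES];
	unsigned int	count = EXAMPLE_IO_PAGES;
	boolean_t	empty;
	kern_return_t	kr;

	kr = vm_object_iopl_request(object, offset, size,
				    &upl, pl, &count,
				    UPL_SET_IO_WIRE | UPL_SET_LITE);
	if (kr != KERN_SUCCESS)
		return kr;

	/* ... the device DMAs to/from the wired pages described by pl ... */

	return upl_commit_range(upl, (upl_offset_t) 0, size, 0,
				pl, count, &empty);
}
#endif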
5189
5190
5191 kern_return_t
5192 upl_transpose(
5193 upl_t upl1,
5194 upl_t upl2)
5195 {
5196 kern_return_t retval;
5197 boolean_t upls_locked;
5198 vm_object_t object1, object2;
5199
5200 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5201 return KERN_INVALID_ARGUMENT;
5202 }
5203
5204 upls_locked = FALSE;
5205
5206 /*
5207 * Since we need to lock both UPLs at the same time,
5208 * avoid deadlocks by always taking locks in the same order.
5209 */
5210 if (upl1 < upl2) {
5211 upl_lock(upl1);
5212 upl_lock(upl2);
5213 } else {
5214 upl_lock(upl2);
5215 upl_lock(upl1);
5216 }
5217 upls_locked = TRUE; /* the UPLs will need to be unlocked */
5218
5219 object1 = upl1->map_object;
5220 object2 = upl2->map_object;
5221
5222 if (upl1->offset != 0 || upl2->offset != 0 ||
5223 upl1->size != upl2->size) {
5224 /*
5225 * We deal only with full objects, not subsets.
5226 * That's because we exchange the entire backing store info
5227 * for the objects: pager, resident pages, etc... We can't do
5228 * only part of it.
5229 */
5230 retval = KERN_INVALID_VALUE;
5231 goto done;
5232 }
5233
5234 /*
5235 * Transpose the VM objects' backing store.
5236 */
5237 retval = vm_object_transpose(object1, object2,
5238 (vm_object_size_t) upl1->size);
5239
5240 if (retval == KERN_SUCCESS) {
5241 /*
5242 * Make each UPL point to the correct VM object, i.e. the
5243 * object holding the pages that the UPL refers to...
5244 */
5245 upl1->map_object = object2;
5246 upl2->map_object = object1;
5247 }
5248
5249 done:
5250 /*
5251 * Cleanup.
5252 */
5253 if (upls_locked) {
5254 upl_unlock(upl1);
5255 upl_unlock(upl2);
5256 upls_locked = FALSE;
5257 }
5258
5259 return retval;
5260 }
5261
5262 /*
5263 * ENCRYPTED SWAP:
5264 *
5265 * Rationale: the user might have some encrypted data on disk (via
5266 * FileVault or any other mechanism). That data is then decrypted in
5267 * memory, which is safe as long as the machine is secure. But that
5268 * decrypted data in memory could be paged out to disk by the default
5269 * pager. The data would then be stored on disk in clear (not encrypted)
5270 * and it could be accessed by anyone who gets physical access to the
5271 * disk (if the laptop or the disk gets stolen for example). This weakens
5272 * the security offered by FileVault.
5273 *
5274 * Solution: the default pager will optionally request that all the
5275 * pages it gathers for pageout be encrypted, via the UPL interfaces,
5276 * before it sends this UPL to disk via the vnode_pageout() path.
5277 *
5278 * Notes:
5279 *
5280 * To avoid disrupting the VM LRU algorithms, we want to keep the
5281 * clean-in-place mechanisms, which allow us to send some extra pages to
5282 * swap (clustering) without actually removing them from the user's
5283 * address space. We don't want the user to unknowingly access encrypted
5284 * data, so we have to actually remove the encrypted pages from the page
5285 * table. When the user accesses the data, the hardware will fail to
5286 * locate the virtual page in its page table and will trigger a page
5287 * fault. We can then decrypt the page and enter it in the page table
5288 * again. Whenever we allow the user to access the contents of a page,
5289 * we have to make sure it's not encrypted.
5290 *
5291 *
5292 */
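/*
 * In terms of the routines below, the flow is roughly: on the pageout
 * side, upl_encrypt() walks the UPL and calls vm_page_encrypt() on each
 * page after the pages have been unmapped from user space; on the
 * pagein side, the page fault path finds "page->encrypted" set and
 * calls vm_page_decrypt() before the page is entered in any pmap.
 */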
5293 /*
5294 * ENCRYPTED SWAP:
5295 * Reserve of virtual addresses in the kernel address space.
5296 * We need to map the physical pages in the kernel, so that we
5297 * can call the encryption/decryption routines with a kernel
5298 * virtual address. We keep this pool of pre-allocated kernel
5299 * virtual addresses so that we don't have to scan the kernel's
5300 * virtual address space each time we need to encrypt or decrypt
5301 * a physical page.
5302 * It would be nice to be able to encrypt and decrypt in physical
5303 * mode but that might not always be more efficient...
5304 */
5305 decl_simple_lock_data(,vm_paging_lock)
5306 #define VM_PAGING_NUM_PAGES 64
5307 vm_map_offset_t vm_paging_base_address = 0;
5308 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5309 int vm_paging_max_index = 0;
5310 unsigned long vm_paging_no_kernel_page = 0;
5311 unsigned long vm_paging_objects_mapped = 0;
5312 unsigned long vm_paging_pages_mapped = 0;
5313 unsigned long vm_paging_objects_mapped_slow = 0;
5314 unsigned long vm_paging_pages_mapped_slow = 0;
5315
5316 /*
5317 * ENCRYPTED SWAP:
5318 * vm_paging_map_object:
5319 * Maps part of a VM object's pages in the kernel
5320 * virtual address space, using the pre-allocated
5321 * kernel virtual addresses, if possible.
5322 * Context:
5323 * The VM object is locked. This lock will get
5324 * dropped and re-acquired though.
5325 */
5326 kern_return_t
5327 vm_paging_map_object(
5328 vm_map_offset_t *address,
5329 vm_page_t page,
5330 vm_object_t object,
5331 vm_object_offset_t offset,
5332 vm_map_size_t *size)
5333 {
5334 kern_return_t kr;
5335 vm_map_offset_t page_map_offset;
5336 vm_map_size_t map_size;
5337 vm_object_offset_t object_offset;
5338 #ifdef __ppc__
5339 int i;
5340 vm_map_entry_t map_entry;
5341 #endif /* __ppc__ */
5342
5343
5344 #ifdef __ppc__
5345 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5346 /*
5347 * Optimization for the PowerPC.
5348 * Use one of the pre-allocated kernel virtual addresses
5349 * and just enter the VM page in the kernel address space
5350 * at that virtual address.
5351 */
5352 vm_object_unlock(object);
5353 simple_lock(&vm_paging_lock);
5354
5355 if (vm_paging_base_address == 0) {
5356 /*
5357 * Initialize our pool of pre-allocated kernel
5358 * virtual addresses.
5359 */
5360 simple_unlock(&vm_paging_lock);
5361 page_map_offset = 0;
5362 kr = vm_map_find_space(kernel_map,
5363 &page_map_offset,
5364 VM_PAGING_NUM_PAGES * PAGE_SIZE,
5365 0,
5366 0,
5367 &map_entry);
5368 if (kr != KERN_SUCCESS) {
5369 panic("vm_paging_map_object: "
5370 "kernel_map full\n");
5371 }
5372 map_entry->object.vm_object = kernel_object;
5373 map_entry->offset =
5374 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5375 vm_object_reference(kernel_object);
5376 vm_map_unlock(kernel_map);
5377
5378 simple_lock(&vm_paging_lock);
5379 if (vm_paging_base_address != 0) {
5380 /* someone raced us and won: undo */
5381 simple_unlock(&vm_paging_lock);
5382 kr = vm_map_remove(kernel_map,
5383 page_map_offset,
5384 page_map_offset +
5385 (VM_PAGING_NUM_PAGES
5386 * PAGE_SIZE),
5387 VM_MAP_NO_FLAGS);
5388 assert(kr == KERN_SUCCESS);
5389 simple_lock(&vm_paging_lock);
5390 } else {
5391 vm_paging_base_address = page_map_offset;
5392 }
5393 }
5394
5395 /*
5396 * Try and find an available kernel virtual address
5397 * from our pre-allocated pool.
5398 */
5399 page_map_offset = 0;
5400 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5401 if (vm_paging_page_inuse[i] == FALSE) {
5402 page_map_offset = vm_paging_base_address +
5403 (i * PAGE_SIZE);
5404 break;
5405 }
5406 }
5407
5408 if (page_map_offset != 0) {
5409 /*
5410 * We found a kernel virtual address;
5411 * map the physical page to that virtual address.
5412 */
5413 if (i > vm_paging_max_index) {
5414 vm_paging_max_index = i;
5415 }
5416 vm_paging_page_inuse[i] = TRUE;
5417 simple_unlock(&vm_paging_lock);
5418 pmap_map_block(kernel_pmap,
5419 page_map_offset,
5420 page->phys_page,
5421 1, /* Size is number of 4k pages */
5422 VM_PROT_DEFAULT,
5423 ((int) page->object->wimg_bits &
5424 VM_WIMG_MASK),
5425 0);
5426 vm_paging_objects_mapped++;
5427 vm_paging_pages_mapped++;
5428 *address = page_map_offset;
5429 vm_object_lock(object);
5430
5431 /* all done and mapped, ready to use ! */
5432 return KERN_SUCCESS;
5433 }
5434
5435 /*
5436 * We ran out of pre-allocated kernel virtual
5437 * addresses. Just map the page in the kernel
5438 * the slow and regular way.
5439 */
5440 vm_paging_no_kernel_page++;
5441 simple_unlock(&vm_paging_lock);
5442 vm_object_lock(object);
5443 }
5444 #endif /* __ppc__ */
5445
5446 object_offset = vm_object_trunc_page(offset);
5447 map_size = vm_map_round_page(*size);
5448
5449 /*
5450 * Try and map the required range of the object
5451 * in the kernel_map
5452 */
5453
5454 /* don't go beyond the object's end... */
5455 if (object_offset >= object->size) {
5456 map_size = 0;
5457 } else if (map_size > object->size - offset) {
5458 map_size = object->size - offset;
5459 }
5460
5461 vm_object_reference_locked(object); /* for the map entry */
5462 vm_object_unlock(object);
5463
5464 kr = vm_map_enter(kernel_map,
5465 address,
5466 map_size,
5467 0,
5468 VM_FLAGS_ANYWHERE,
5469 object,
5470 object_offset,
5471 FALSE,
5472 VM_PROT_DEFAULT,
5473 VM_PROT_ALL,
5474 VM_INHERIT_NONE);
5475 if (kr != KERN_SUCCESS) {
5476 *address = 0;
5477 *size = 0;
5478 vm_object_deallocate(object); /* for the map entry */
5479 return kr;
5480 }
5481
5482 *size = map_size;
5483
5484 /*
5485 * Enter the mapped pages in the page table now.
5486 */
5487 vm_object_lock(object);
5488 for (page_map_offset = 0;
5489 map_size != 0;
5490 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5491 unsigned int cache_attr;
5492
5493 page = vm_page_lookup(object, offset + page_map_offset);
5494 if (page == VM_PAGE_NULL) {
5495 panic("vm_paging_map_object: no page !?");
5496 }
5497 if (page->no_isync == TRUE) {
5498 pmap_sync_page_data_phys(page->phys_page);
5499 }
5500 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5501
5502 PMAP_ENTER(kernel_pmap,
5503 *address + page_map_offset,
5504 page,
5505 VM_PROT_DEFAULT,
5506 cache_attr,
5507 FALSE);
5508 }
5509
5510 vm_paging_objects_mapped_slow++;
5511 vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5512
5513 return KERN_SUCCESS;
5514 }
5515
5516 /*
5517 * ENCRYPTED SWAP:
5518 * vm_paging_unmap_object:
5519 * Unmaps part of a VM object's pages from the kernel
5520 * virtual address space.
5521 * Context:
5522 * The VM object is locked. This lock will get
5523 * dropped and re-acquired though.
5524 */
5525 void
5526 vm_paging_unmap_object(
5527 vm_object_t object,
5528 vm_map_offset_t start,
5529 vm_map_offset_t end)
5530 {
5531 kern_return_t kr;
5532 #ifdef __ppc__
5533 int i;
5534 #endif /* __ppc__ */
5535
5536 if ((vm_paging_base_address == 0) ||
5537 ((start < vm_paging_base_address) ||
5538 (end > (vm_paging_base_address
5539 + (VM_PAGING_NUM_PAGES * PAGE_SIZE))))) {
5540 /*
5541 * We didn't use our pre-allocated pool of
5542 * kernel virtual address. Deallocate the
5543 * virtual memory.
5544 */
5545 if (object != VM_OBJECT_NULL) {
5546 vm_object_unlock(object);
5547 }
5548 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5549 if (object != VM_OBJECT_NULL) {
5550 vm_object_lock(object);
5551 }
5552 assert(kr == KERN_SUCCESS);
5553 } else {
5554 /*
5555 * We used a kernel virtual address from our
5556 * pre-allocated pool. Put it back in the pool
5557 * for next time.
5558 */
5559 #ifdef __ppc__
5560 assert(end - start == PAGE_SIZE);
5561 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5562
5563 /* undo the pmap mapping */
5564 mapping_remove(kernel_pmap, start);
5565
5566 simple_lock(&vm_paging_lock);
5567 vm_paging_page_inuse[i] = FALSE;
5568 simple_unlock(&vm_paging_lock);
5569 #endif /* __ppc__ */
5570 }
5571 }
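
/*
 * A minimal sketch of how the two routines above pair up (hypothetical
 * caller, illustration only, kept under #if 0): map one busy page into
 * the kernel, operate on its contents, then unmap it.  The object must
 * be locked on entry, as documented above; "touch_one_page" is a
 * made-up name.
 */
#if 0
static void
touch_one_page(
	vm_object_t	object,
	vm_page_t	page)
{
	vm_map_offset_t	kernel_addr = 0;
	vm_map_size_t	map_size = PAGE_SIZE;

	if (vm_paging_map_object(&kernel_addr, page, object,
				 page->offset, &map_size) != KERN_SUCCESS)
		return;

	/* ... access the page's contents via (void *) kernel_addr ... */

	vm_paging_unmap_object(object, kernel_addr,
			       kernel_addr + map_size);
}
#endif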
5572
5573 /*
5574 * Encryption data.
5575 * "iv" is the "initial vector". Ideally, we want to
5576 * have a different one for each page we encrypt, so that
5577 * crackers can't find encryption patterns too easily.
5578 */
5579 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
5580 boolean_t swap_crypt_ctx_initialized = FALSE;
5581 aes_32t swap_crypt_key[8]; /* big enough for a 256-bit key */
5582 aes_ctx swap_crypt_ctx;
5583 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5584
5585 #if DEBUG
5586 boolean_t swap_crypt_ctx_tested = FALSE;
5587 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5588 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5589 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5590 #endif /* DEBUG */
5591
5592 extern u_long random(void);
5593
5594 /*
5595 * Initialize the encryption context: key and key size.
5596 */
5597 void swap_crypt_ctx_initialize(void); /* forward */
5598 void
5599 swap_crypt_ctx_initialize(void)
5600 {
5601 unsigned int i;
5602
5603 /*
5604 * No need for locking to protect swap_crypt_ctx_initialized
5605 * because the first use of encryption will come from the
5606 * pageout thread (we won't pagein before there's been a pageout)
5607 * and there's only one pageout thread.
5608 */
5609 if (swap_crypt_ctx_initialized == FALSE) {
5610 for (i = 0;
5611 i < (sizeof (swap_crypt_key) /
5612 sizeof (swap_crypt_key[0]));
5613 i++) {
5614 swap_crypt_key[i] = random();
5615 }
5616 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5617 SWAP_CRYPT_AES_KEY_SIZE,
5618 &swap_crypt_ctx.encrypt);
5619 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5620 SWAP_CRYPT_AES_KEY_SIZE,
5621 &swap_crypt_ctx.decrypt);
5622 swap_crypt_ctx_initialized = TRUE;
5623 }
5624
5625 #if DEBUG
5626 /*
5627 * Validate the encryption algorithms.
5628 */
5629 if (swap_crypt_ctx_tested == FALSE) {
5630 /* initialize */
5631 for (i = 0; i < 4096; i++) {
5632 swap_crypt_test_page_ref[i] = (char) i;
5633 }
5634 /* encrypt */
5635 aes_encrypt_cbc(swap_crypt_test_page_ref,
5636 swap_crypt_null_iv,
5637 PAGE_SIZE / AES_BLOCK_SIZE,
5638 swap_crypt_test_page_encrypt,
5639 &swap_crypt_ctx.encrypt);
5640 /* decrypt */
5641 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5642 swap_crypt_null_iv,
5643 PAGE_SIZE / AES_BLOCK_SIZE,
5644 swap_crypt_test_page_decrypt,
5645 &swap_crypt_ctx.decrypt);
5646 /* compare result with original */
5647 for (i = 0; i < 4096; i ++) {
5648 if (swap_crypt_test_page_decrypt[i] !=
5649 swap_crypt_test_page_ref[i]) {
5650 panic("encryption test failed");
5651 }
5652 }
5653
5654 /* encrypt again */
5655 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5656 swap_crypt_null_iv,
5657 PAGE_SIZE / AES_BLOCK_SIZE,
5658 swap_crypt_test_page_decrypt,
5659 &swap_crypt_ctx.encrypt);
5660 /* decrypt in place */
5661 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5662 swap_crypt_null_iv,
5663 PAGE_SIZE / AES_BLOCK_SIZE,
5664 swap_crypt_test_page_decrypt,
5665 &swap_crypt_ctx.decrypt);
5666 for (i = 0; i < 4096; i ++) {
5667 if (swap_crypt_test_page_decrypt[i] !=
5668 swap_crypt_test_page_ref[i]) {
5669 panic("in place encryption test failed");
5670 }
5671 }
5672
5673 swap_crypt_ctx_tested = TRUE;
5674 }
5675 #endif /* DEBUG */
5676 }
5677
5678 /*
5679 * ENCRYPTED SWAP:
5680 * vm_page_encrypt:
5681 * Encrypt the given page, for secure paging.
5682 * The page might already be mapped at kernel virtual
5683 * address "kernel_mapping_offset". Otherwise, we need
5684 * to map it.
5685 *
5686 * Context:
5687 * The page's object is locked, but this lock will be released
5688 * and re-acquired.
5689 * The page is busy and not accessible by users (not entered in any pmap).
5690 */
5691 void
5692 vm_page_encrypt(
5693 vm_page_t page,
5694 vm_map_offset_t kernel_mapping_offset)
5695 {
5696 int clear_refmod = 0;
5697 kern_return_t kr;
5698 boolean_t page_was_referenced;
5699 boolean_t page_was_modified;
5700 vm_map_size_t kernel_mapping_size;
5701 vm_offset_t kernel_vaddr;
5702 union {
5703 unsigned char aes_iv[AES_BLOCK_SIZE];
5704 struct {
5705 memory_object_t pager_object;
5706 vm_object_offset_t paging_offset;
5707 } vm;
5708 } encrypt_iv;
5709
5710 if (! vm_pages_encrypted) {
5711 vm_pages_encrypted = TRUE;
5712 }
5713
5714 assert(page->busy);
5715 assert(page->dirty || page->precious);
5716
5717 if (page->encrypted) {
5718 /*
5719 * Already encrypted: no need to do it again.
5720 */
5721 vm_page_encrypt_already_encrypted_counter++;
5722 return;
5723 }
5724 ASSERT_PAGE_DECRYPTED(page);
5725
5726 /*
5727 * Gather the "reference" and "modified" status of the page.
5728 * We'll restore these values after the encryption, so that
5729 * the encryption is transparent to the rest of the system
5730 * and doesn't impact the VM's LRU logic.
5731 */
5732 page_was_referenced =
5733 (page->reference || pmap_is_referenced(page->phys_page));
5734 page_was_modified =
5735 (page->dirty || pmap_is_modified(page->phys_page));
5736
5737 if (kernel_mapping_offset == 0) {
5738 /*
5739 * The page hasn't already been mapped in kernel space
5740 * by the caller. Map it now, so that we can access
5741 * its contents and encrypt them.
5742 */
5743 kernel_mapping_size = PAGE_SIZE;
5744 kr = vm_paging_map_object(&kernel_mapping_offset,
5745 page,
5746 page->object,
5747 page->offset,
5748 &kernel_mapping_size);
5749 if (kr != KERN_SUCCESS) {
5750 panic("vm_page_encrypt: "
5751 "could not map page in kernel: 0x%x\n",
5752 kr);
5753 }
5754 } else {
5755 kernel_mapping_size = 0;
5756 }
5757 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5758
5759 if (swap_crypt_ctx_initialized == FALSE) {
5760 swap_crypt_ctx_initialize();
5761 }
5762 assert(swap_crypt_ctx_initialized);
5763
5764 /*
5765 * Prepare an "initial vector" for the encryption.
5766 * We use the "pager" and the "paging_offset" for that
5767 * page to obfuscate the encrypted data a bit more and
5768 * prevent crackers from finding patterns that they could
5769 * use to break the key.
5770 */
5771 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5772 encrypt_iv.vm.pager_object = page->object->pager;
5773 encrypt_iv.vm.paging_offset =
5774 page->object->paging_offset + page->offset;
5775
5776 vm_object_unlock(page->object);
5777
5778 /* encrypt the "initial vector" */
5779 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5780 swap_crypt_null_iv,
5781 1,
5782 &encrypt_iv.aes_iv[0],
5783 &swap_crypt_ctx.encrypt);
5784
5785 /*
5786 * Encrypt the page.
5787 */
5788 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5789 &encrypt_iv.aes_iv[0],
5790 PAGE_SIZE / AES_BLOCK_SIZE,
5791 (unsigned char *) kernel_vaddr,
5792 &swap_crypt_ctx.encrypt);
5793
5794 vm_page_encrypt_counter++;
5795
5796 vm_object_lock(page->object);
5797
5798 /*
5799 * Unmap the page from the kernel's address space,
5800 * if we had to map it ourselves. Otherwise, let
5801 * the caller undo the mapping if needed.
5802 */
5803 if (kernel_mapping_size != 0) {
5804 vm_paging_unmap_object(page->object,
5805 kernel_mapping_offset,
5806 kernel_mapping_offset + kernel_mapping_size);
5807 }
5808
5809 /*
5810 * Restore the "reference" and "modified" bits.
5811 * This should clean up any impact the encryption had
5812 * on them.
5813 */
5814 if (! page_was_referenced) {
5815 clear_refmod |= VM_MEM_REFERENCED;
5816 page->reference = FALSE;
5817 }
5818 if (! page_was_modified) {
5819 clear_refmod |= VM_MEM_MODIFIED;
5820 page->dirty = FALSE;
5821 }
5822 if (clear_refmod)
5823 pmap_clear_refmod(page->phys_page, clear_refmod);
5824
5825 page->encrypted = TRUE;
5826 }
5827
5828 /*
5829 * ENCRYPTED SWAP:
5830 * vm_page_decrypt:
5831 * Decrypt the given page.
5832 * The page might already be mapped at kernel virtual
5833 * address "kernel_mapping_offset". Otherwise, we need
5834 * to map it.
5835 *
5836 * Context:
5837 * The page's VM object is locked but will be unlocked and relocked.
5838 * The page is busy and not accessible by users (not entered in any pmap).
5839 */
5840 void
5841 vm_page_decrypt(
5842 vm_page_t page,
5843 vm_map_offset_t kernel_mapping_offset)
5844 {
5845 int clear_refmod = 0;
5846 kern_return_t kr;
5847 vm_map_size_t kernel_mapping_size;
5848 vm_offset_t kernel_vaddr;
5849 boolean_t page_was_referenced;
5850 union {
5851 unsigned char aes_iv[AES_BLOCK_SIZE];
5852 struct {
5853 memory_object_t pager_object;
5854 vm_object_offset_t paging_offset;
5855 } vm;
5856 } decrypt_iv;
5857
5858 assert(page->busy);
5859 assert(page->encrypted);
5860
5861 /*
5862 * Gather the "reference" status of the page.
5863 * We'll restore its value after the decryption, so that
5864 * the decryption is transparent to the rest of the system
5865 * and doesn't impact the VM's LRU logic.
5866 */
5867 page_was_referenced =
5868 (page->reference || pmap_is_referenced(page->phys_page));
5869
5870 if (kernel_mapping_offset == 0) {
5871 /*
5872 * The page hasn't already been mapped in kernel space
5873 * by the caller. Map it now, so that we can access
5874 * its contents and decrypt them.
5875 */
5876 kernel_mapping_size = PAGE_SIZE;
5877 kr = vm_paging_map_object(&kernel_mapping_offset,
5878 page,
5879 page->object,
5880 page->offset,
5881 &kernel_mapping_size);
5882 if (kr != KERN_SUCCESS) {
5883 panic("vm_page_decrypt: "
5884 "could not map page in kernel: 0x%x\n");
5885 }
5886 } else {
5887 kernel_mapping_size = 0;
5888 }
5889 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5890
5891 assert(swap_crypt_ctx_initialized);
5892
5893 /*
5894 * Prepare an "initial vector" for the decryption.
5895 * It has to be the same as the "initial vector" we
5896 * used to encrypt that page.
5897 */
5898 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5899 decrypt_iv.vm.pager_object = page->object->pager;
5900 decrypt_iv.vm.paging_offset =
5901 page->object->paging_offset + page->offset;
5902
5903 vm_object_unlock(page->object);
5904
5905 /* encrypt the "initial vector" */
5906 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5907 swap_crypt_null_iv,
5908 1,
5909 &decrypt_iv.aes_iv[0],
5910 &swap_crypt_ctx.encrypt);
5911
5912 /*
5913 * Decrypt the page.
5914 */
5915 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5916 &decrypt_iv.aes_iv[0],
5917 PAGE_SIZE / AES_BLOCK_SIZE,
5918 (unsigned char *) kernel_vaddr,
5919 &swap_crypt_ctx.decrypt);
5920 vm_page_decrypt_counter++;
5921
5922 vm_object_lock(page->object);
5923
5924 /*
5925 * Unmap the page from the kernel's address space,
5926 * if we had to map it ourselves. Otherwise, let
5927 * the caller undo the mapping if needed.
5928 */
5929 if (kernel_mapping_size != 0) {
5930 vm_paging_unmap_object(page->object,
5931 kernel_vaddr,
5932 kernel_vaddr + PAGE_SIZE);
5933 }
5934
5935 /*
5936 * After decryption, the page is actually clean.
5937 * It was encrypted as part of paging, which "cleans"
5938 * the "dirty" pages.
5939 * No one could access it after it was encrypted
5940 * and the decryption doesn't count.
5941 */
5942 page->dirty = FALSE;
5943 clear_refmod = VM_MEM_MODIFIED;
5944
5945 /* restore the "reference" bit */
5946 if (! page_was_referenced) {
5947 page->reference = FALSE;
5948 clear_refmod |= VM_MEM_REFERENCED;
5949 }
5950 pmap_clear_refmod(page->phys_page, clear_refmod);
5951
5952 page->encrypted = FALSE;
5953
5954 /*
5955 * We've just modified the page's contents via the data cache and part
5956 * of the new contents might still be in the cache and not yet in RAM.
5957 * Since the page is now available and might get gathered in a UPL to
5958 * be part of a DMA transfer from a driver that expects the memory to
5959 * be coherent at this point, we have to flush the data cache.
5960 */
5961 pmap_sync_page_attributes_phys(page->phys_page);
5962 /*
5963 * Since the page is not mapped yet, some code might assume that it
5964 * doesn't need to invalidate the instruction cache when writing to
5965 * that page. That code relies on "no_isync" being set, so that the
5966 * caches get synchronized when the page is first mapped. So we need
5967 * to set "no_isync" here too, despite the fact that we just
5968 * synchronized the caches above...
5969 */
5970 page->no_isync = TRUE;
5971 }
5972
5973 unsigned long upl_encrypt_upls = 0;
5974 unsigned long upl_encrypt_pages = 0;
5975
5976 /*
5977 * ENCRYPTED SWAP:
5978 *
5979 * upl_encrypt:
5980 * Encrypts all the pages in the UPL, within the specified range.
5981 *
5982 */
5983 void
5984 upl_encrypt(
5985 upl_t upl,
5986 upl_offset_t crypt_offset,
5987 upl_size_t crypt_size)
5988 {
5989 upl_size_t upl_size;
5990 upl_offset_t upl_offset;
5991 vm_object_t upl_object;
5992 vm_page_t page;
5993 vm_object_t shadow_object;
5994 vm_object_offset_t shadow_offset;
5995 vm_object_offset_t paging_offset;
5996 vm_object_offset_t base_offset;
5997
5998 upl_encrypt_upls++;
5999 upl_encrypt_pages += crypt_size / PAGE_SIZE;
6000
6001 upl_lock(upl);
6002
6003 upl_object = upl->map_object;
6004 upl_offset = upl->offset;
6005 upl_size = upl->size;
6006
6007 upl_unlock(upl);
6008
6009 vm_object_lock(upl_object);
6010
6011 /*
6012 * Find the VM object that contains the actual pages.
6013 */
6014 if (upl_object->pageout) {
6015 shadow_object = upl_object->shadow;
6016 /*
6017 * The offset in the shadow object is actually also
6018 * accounted for in upl->offset. It possibly shouldn't be
6019 * this way, but for now don't account for it twice.
6020 */
6021 shadow_offset = 0;
6022 assert(upl_object->paging_offset == 0); /* XXX ? */
6023 vm_object_lock(shadow_object);
6024 } else {
6025 shadow_object = upl_object;
6026 shadow_offset = 0;
6027 }
6028
6029 paging_offset = shadow_object->paging_offset;
6030 vm_object_paging_begin(shadow_object);
6031
6032 if (shadow_object != upl_object) {
6033 vm_object_unlock(shadow_object);
6034 }
6035 vm_object_unlock(upl_object);
6036
6037 base_offset = shadow_offset;
6038 base_offset += upl_offset;
6039 base_offset += crypt_offset;
6040 base_offset -= paging_offset;
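/*
 * To spell out the arithmetic above: a page at "crypt_offset" within
 * the UPL sits at (upl->offset + crypt_offset) relative to the pager,
 * and the shadow object indexes its pages by object offset, i.e. the
 * pager-relative offset minus the object's paging_offset (plus any
 * shadow_offset, which is zero here).  "base_offset" is therefore the
 * shadow-object offset of the first page to encrypt.
 */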
6041 /*
6042 * Unmap the pages, so that nobody can continue accessing them while
6043 * they're encrypted. After that point, all accesses to these pages
6044 * will cause a page fault and block while the page is being encrypted
6045 * (busy). After the encryption completes, any access will cause a
6046 * page fault and the page gets decrypted at that time.
6047 */
6048 assert(crypt_offset + crypt_size <= upl_size);
6049 vm_object_pmap_protect(shadow_object,
6050 base_offset,
6051 (vm_object_size_t)crypt_size,
6052 PMAP_NULL,
6053 0,
6054 VM_PROT_NONE);
6055
6056 /* XXX FBDP could the object have changed significantly here ? */
6057 vm_object_lock(shadow_object);
6058
6059 for (upl_offset = 0;
6060 upl_offset < crypt_size;
6061 upl_offset += PAGE_SIZE) {
6062 page = vm_page_lookup(shadow_object,
6063 base_offset + upl_offset);
6064 if (page == VM_PAGE_NULL) {
6065 panic("upl_encrypt: "
6066 "no page for (obj=%p,off=%lld+%d)!\n",
6067 shadow_object,
6068 base_offset,
6069 upl_offset);
6070 }
6071 vm_page_encrypt(page, 0);
6072 }
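/*
 * Passing 0 as the kernel mapping offset presumably follows the same
 * convention as in vm_page_decrypt() above: vm_page_encrypt() must
 * set up (and tear down) its own kernel mapping for each page, since
 * the pages are not mapped into the kernel here.
 */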
6073
6074 vm_object_paging_end(shadow_object);
6075 vm_object_unlock(shadow_object);
6076 }
6077
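/*
 * Callers use this to locate the UPL's internal page-info array,
 * which is laid out immediately after the upl structure itself, so
 * the offset is simply sizeof(struct upl).
 */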
6078 vm_size_t
6079 upl_get_internal_pagelist_offset(void)
6080 {
6081 return sizeof(struct upl);
6082 }
6083
6084 void
6085 upl_clear_dirty(
6086 upl_t upl,
6087 boolean_t value)
6088 {
6089 if (value) {
6090 upl->flags |= UPL_CLEAR_DIRTY;
6091 } else {
6092 upl->flags &= ~UPL_CLEAR_DIRTY;
6093 }
6094 }
6095
6096
6097 #ifdef MACH_BSD
6098
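/*
 * Thin accessors exported to BSD code: they wrap the UPL_*() page-info
 * macros so callers that don't pull in the UPL internals can still
 * query per-page state.
 */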
6099 boolean_t upl_page_present(upl_page_info_t *upl, int index)
6100 {
6101 return(UPL_PAGE_PRESENT(upl, index));
6102 }
6103 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
6104 {
6105 return(UPL_DIRTY_PAGE(upl, index));
6106 }
6107 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
6108 {
6109 return(UPL_VALID_PAGE(upl, index));
6110 }
6111 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
6112 {
6113 return(UPL_PHYS_PAGE(upl, index));
6114 }
6115
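/*
 * Debugging aid: walk the inactive, zero-fill and active page queues,
 * count the dirty, pageout and precious pages on each, and print the
 * totals (the "IN Q" line covers the inactive and zero-fill queues
 * combined, "AC Q" the active queue).
 */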
6116 void
6117 vm_countdirtypages(void)
6118 {
6119 vm_page_t m;
6120 int dpages;
6121 int pgopages;
6122 int precpages;
6123
6124
6125 dpages=0;
6126 pgopages=0;
6127 precpages=0;
6128
6129 vm_page_lock_queues();
6130 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6131 do {
6132 if (m == (vm_page_t) 0) break;
6133
6134 if (m->dirty) dpages++;
6135 if (m->pageout) pgopages++;
6136 if (m->precious) precpages++;
6137
6138 assert(m->object != kernel_object);
6139 m = (vm_page_t) queue_next(&m->pageq);
6140 if (m == (vm_page_t) 0) break;
6141
6142 } while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m));
6143 vm_page_unlock_queues();
6144
6145 vm_page_lock_queues();
6146 m = (vm_page_t) queue_first(&vm_page_queue_zf);
6147 do {
6148 if (m == (vm_page_t) 0) break;
6149
6150 if (m->dirty) dpages++;
6151 if (m->pageout) pgopages++;
6152 if (m->precious) precpages++;
6153
6154 assert(m->object != kernel_object);
6155 m = (vm_page_t) queue_next(&m->pageq);
6156 if (m == (vm_page_t) 0) break;
6157
6158 } while (!queue_end(&vm_page_queue_zf, (queue_entry_t) m));
6159 vm_page_unlock_queues();
6160
6161 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6162
6163 dpages=0;
6164 pgopages=0;
6165 precpages=0;
6166
6167 vm_page_lock_queues();
6168 m = (vm_page_t) queue_first(&vm_page_queue_active);
6169
6170 do {
6171 if (m == (vm_page_t) 0) break;
6172 if (m->dirty) dpages++;
6173 if (m->pageout) pgopages++;
6174 if (m->precious) precpages++;
6175
6176 assert(m->object != kernel_object);
6177 m = (vm_page_t) queue_next(&m->pageq);
6178 if (m == (vm_page_t) 0) break;
6179
6180 } while (!queue_end(&vm_page_queue_active, (queue_entry_t) m));
6181 vm_page_unlock_queues();
6182
6183 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6184
6185 }
6186 #endif /* MACH_BSD */
6187
6188 ppnum_t upl_get_highest_page(
6189 upl_t upl)
6190 {
6191 return upl->highest_page;
6192 }
6193
6194 #ifdef UPL_DEBUG
6195 kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6196 {
6197 upl->ubc_alias1 = alias1;
6198 upl->ubc_alias2 = alias2;
6199 return KERN_SUCCESS;
6200 }
6201 int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6202 {
6203 if(al)
6204 *al = upl->ubc_alias1;
6205 if(al2)
6206 *al2 = upl->ubc_alias2;
6207 return KERN_SUCCESS;
6208 }
6209 #endif /* UPL_DEBUG */
6210
6211
6212
6213 #if MACH_KDB
6214 #include <ddb/db_output.h>
6215 #include <ddb/db_print.h>
6216 #include <vm/vm_print.h>
6217
6218 #define printf kdbprintf
6219 void db_pageout(void);
6220
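/*
 * Kernel debugger (ddb) helpers: db_vm() prints the basic VM page
 * counts and paging targets, db_pageout() the pageout daemon's
 * statistics counters.
 */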
6221 void
6222 db_vm(void)
6223 {
6224
6225 iprintf("VM Statistics:\n");
6226 db_indent += 2;
6227 iprintf("pages:\n");
6228 db_indent += 2;
6229 iprintf("activ %5d inact %5d free %5d",
6230 vm_page_active_count, vm_page_inactive_count,
6231 vm_page_free_count);
6232 printf(" wire %5d gobbl %5d\n",
6233 vm_page_wire_count, vm_page_gobble_count);
6234 db_indent -= 2;
6235 iprintf("target:\n");
6236 db_indent += 2;
6237 iprintf("min %5d inact %5d free %5d",
6238 vm_page_free_min, vm_page_inactive_target,
6239 vm_page_free_target);
6240 printf(" resrv %5d\n", vm_page_free_reserved);
6241 db_indent -= 2;
6242 iprintf("pause:\n");
6243 db_pageout();
6244 db_indent -= 2;
6245 }
6246
6247 #if MACH_COUNTERS
6248 extern int c_laundry_pages_freed;
6249 #endif /* MACH_COUNTERS */
6250
6251 void
6252 db_pageout(void)
6253 {
6254 iprintf("Pageout Statistics:\n");
6255 db_indent += 2;
6256 iprintf("active %5d inactv %5d\n",
6257 vm_pageout_active, vm_pageout_inactive);
6258 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
6259 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6260 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6261 iprintf("used %5d clean %5d dirty %5d\n",
6262 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6263 vm_pageout_inactive_dirty);
6264 #if MACH_COUNTERS
6265 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6266 #endif /* MACH_COUNTERS */
6267 #if MACH_CLUSTER_STATS
6268 iprintf("Cluster Statistics:\n");
6269 db_indent += 2;
6270 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
6271 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6272 vm_pageout_cluster_collisions);
6273 iprintf("clusters %5d conversions %5d\n",
6274 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6275 db_indent -= 2;
6276 iprintf("Target Statistics:\n");
6277 db_indent += 2;
6278 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
6279 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6280 vm_pageout_target_page_freed);
6281 db_indent -= 2;
6282 #endif /* MACH_CLUSTER_STATS */
6283 db_indent -= 2;
6284 }
6285
6286 #endif /* MACH_KDB */