1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67
68 #include <debug.h>
69 #include <mach_pagemap.h>
70 #include <mach_cluster_stats.h>
71 #include <mach_kdb.h>
72 #include <advisory_pageout.h>
73
74 #include <mach/mach_types.h>
75 #include <mach/memory_object.h>
76 #include <mach/memory_object_default.h>
77 #include <mach/memory_object_control_server.h>
78 #include <mach/mach_host_server.h>
79 #include <mach/upl.h>
80 #include <mach/vm_map.h>
81 #include <mach/vm_param.h>
82 #include <mach/vm_statistics.h>
83
84 #include <kern/kern_types.h>
85 #include <kern/counters.h>
86 #include <kern/host_statistics.h>
87 #include <kern/machine.h>
88 #include <kern/misc_protos.h>
89 #include <kern/thread.h>
90 #include <kern/xpr.h>
91 #include <kern/kalloc.h>
92
93 #include <machine/vm_tuning.h>
94
95 #include <vm/pmap.h>
96 #include <vm/vm_fault.h>
97 #include <vm/vm_map.h>
98 #include <vm/vm_object.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/vm_protos.h> /* must be last */
102
103 /*
104 * ENCRYPTED SWAP:
105 */
106 #include <../bsd/crypto/aes/aes.h>
107
108 extern ipc_port_t memory_manager_default;
109
110
111 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE
112 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 10000 /* maximum iterations of the active queue to move pages to inactive */
113 #endif
114
115 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
116 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
117 #endif
118
119 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
120 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
121 #endif
122
123 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
124 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
125 #endif
126
127 #ifndef VM_PAGE_LAUNDRY_MAX
128 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
129 #endif /* VM_PAGE_LAUNDRY_MAX */
130
131 #ifndef VM_PAGEOUT_BURST_WAIT
132 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
133 #endif /* VM_PAGEOUT_BURST_WAIT */
134
135 #ifndef VM_PAGEOUT_EMPTY_WAIT
136 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
137 #endif /* VM_PAGEOUT_EMPTY_WAIT */
138
139 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
140 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
141 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
142
143 #ifndef VM_PAGEOUT_IDLE_WAIT
144 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
145 #endif /* VM_PAGEOUT_IDLE_WAIT */
146
147
148 /*
149 * To obtain a reasonable LRU approximation, the inactive queue
150 * needs to be large enough to give pages on it a chance to be
151 * referenced a second time. This macro defines the fraction
152 * of active+inactive pages that should be inactive.
153 * The pageout daemon uses it to update vm_page_inactive_target.
154 *
155 * If vm_page_free_count falls below vm_page_free_target and
156 * vm_page_inactive_count is below vm_page_inactive_target,
157 * then the pageout daemon starts running.
158 */
159
160 #ifndef VM_PAGE_INACTIVE_TARGET
161 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
162 #endif /* VM_PAGE_INACTIVE_TARGET */
163
164 /*
165 * Once the pageout daemon starts running, it keeps going
166 * until vm_page_free_count meets or exceeds vm_page_free_target.
167 */
168
169 #ifndef VM_PAGE_FREE_TARGET
170 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
171 #endif /* VM_PAGE_FREE_TARGET */
172
173 /*
174 * The pageout daemon always starts running once vm_page_free_count
175 * falls below vm_page_free_min.
176 */
177
178 #ifndef VM_PAGE_FREE_MIN
179 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
180 #endif /* VM_PAGE_FREE_MIN */
181
182 /*
183 * When vm_page_free_count falls below vm_page_free_reserved,
184 * only vm-privileged threads can allocate pages. vm-privilege
185 * allows the pageout daemon and default pager (and any other
186 * associated threads needed for default pageout) to continue
187 * operation by dipping into the reserved pool of pages.
188 */
189
190 #ifndef VM_PAGE_FREE_RESERVED
191 #define VM_PAGE_FREE_RESERVED(n) \
192 ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
193 #endif /* VM_PAGE_FREE_RESERVED */
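/*
 * Illustrative example only (not normative): with VM_PAGE_LAUNDRY_MAX
 * at its default of 16, a hypothetical reservation argument n == 100,
 * and roughly 100000 free pages at startup:
 *
 *	VM_PAGE_FREE_RESERVED(100)	= (6 * 16) + 100	= 196
 *	free_after_reserve		= 100000 - 196		= 99804
 *	VM_PAGE_FREE_MIN(99804)		= 10 + 99804 / 100	= 1008
 *	VM_PAGE_FREE_TARGET(99804)	= 15 + 99804 / 80	= 1262
 *
 * vm_page_free_reserve() below adds the reserved count back into the
 * min/target values so the reserved pool stays available to
 * vm-privileged threads.
 */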
194
195
196 /*
197 * must hold the page queues lock to
198 * manipulate this structure
199 */
200 struct vm_pageout_queue {
201 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
202 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
203 unsigned int pgo_maxlaundry;
204
205 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
206 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
207 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
208 :0;
209 };
210
211 #define VM_PAGE_Q_THROTTLED(q) \
212 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
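/*
 * Usage sketch (for exposition; see vm_pageout_scan() for the real
 * checks): a dirty page is only handed to vm_pageout_cluster() when the
 * queue it targets is not throttled, e.g.
 *
 *	if (object->internal ? VM_PAGE_Q_THROTTLED(iq)
 *			     : VM_PAGE_Q_THROTTLED(eq))
 *		... requeue the page and try another one ...
 *
 * pgo_laundry is raised in vm_pageout_cluster() and lowered in
 * vm_pageout_throttle_up(), both under the page queues lock.
 */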
213
214
215 /*
216 * Exported variable used to broadcast the activation of the pageout scan.
217 * The Working Set code uses this to throttle its use of pmap removes. In this
218 * way, code which runs within memory in an uncontested context does
219 * not keep encountering soft faults.
220 */
221
222 unsigned int vm_pageout_scan_event_counter = 0;
223
224 /*
225 * Forward declarations for internal routines.
226 */
227
228 static void vm_pageout_garbage_collect(int);
229 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
230 static void vm_pageout_iothread_external(void);
231 static void vm_pageout_iothread_internal(void);
232 static void vm_pageout_queue_steal(vm_page_t);
233
234 extern void vm_pageout_continue(void);
235 extern void vm_pageout_scan(void);
236
237 unsigned int vm_pageout_reserved_internal = 0;
238 unsigned int vm_pageout_reserved_really = 0;
239
240 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
241 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
242 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
243 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
244 unsigned int vm_pageout_deadlock_relief = 0;
245 unsigned int vm_pageout_inactive_relief = 0;
246 unsigned int vm_pageout_burst_active_throttle = 0;
247 unsigned int vm_pageout_burst_inactive_throttle = 0;
248
249 /*
250 * Protection against zero fill flushing live working sets derived
251 * from existing backing store and files
252 */
253 unsigned int vm_accellerate_zf_pageout_trigger = 400;
254 unsigned int vm_zf_iterator;
255 unsigned int vm_zf_iterator_count = 40;
256 unsigned int last_page_zf;
257 unsigned int vm_zf_count = 0;
258
259 /*
260 * These variables record the pageout daemon's actions:
261 * how many pages it looks at and what happens to those pages.
262 * No locking needed because only one thread modifies the variables.
263 */
264
265 unsigned int vm_pageout_active = 0; /* debugging */
266 unsigned int vm_pageout_inactive = 0; /* debugging */
267 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
268 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
269 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
270 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
271 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
272 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
273 unsigned int vm_pageout_inactive_used = 0; /* debugging */
274 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
275 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
276 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
277 unsigned int vm_pageout_purged_objects = 0; /* debugging */
278 unsigned int vm_stat_discard = 0; /* debugging */
279 unsigned int vm_stat_discard_sent = 0; /* debugging */
280 unsigned int vm_stat_discard_failure = 0; /* debugging */
281 unsigned int vm_stat_discard_throttle = 0; /* debugging */
282
283 unsigned int vm_pageout_scan_active_throttled = 0;
284 unsigned int vm_pageout_scan_inactive_throttled = 0;
285 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
286 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
287 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
288 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
289 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
290 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
291 /*
292 * Backing store throttle when BS is exhausted
293 */
294 unsigned int vm_backing_store_low = 0;
295
296 unsigned int vm_pageout_out_of_line = 0;
297 unsigned int vm_pageout_in_place = 0;
298
299 /*
300 * ENCRYPTED SWAP:
301 * counters and statistics...
302 */
303 unsigned long vm_page_decrypt_counter = 0;
304 unsigned long vm_page_decrypt_for_upl_counter = 0;
305 unsigned long vm_page_encrypt_counter = 0;
306 unsigned long vm_page_encrypt_abort_counter = 0;
307 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
308 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
309
310
311 struct vm_pageout_queue vm_pageout_queue_internal;
312 struct vm_pageout_queue vm_pageout_queue_external;
313
314
315 /*
316 * Routine: vm_backing_store_disable
317 * Purpose:
318 * Suspend non-privileged threads wishing to extend
319 * backing store when we are low on backing store
320 * (Synchronized by caller)
321 */
322 void
323 vm_backing_store_disable(
324 boolean_t disable)
325 {
326 if(disable) {
327 vm_backing_store_low = 1;
328 } else {
329 if(vm_backing_store_low) {
330 vm_backing_store_low = 0;
331 thread_wakeup((event_t) &vm_backing_store_low);
332 }
333 }
334 }
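/*
 * Note: the wakeup above pairs with waits taken elsewhere in the VM
 * system by non-privileged threads that observed vm_backing_store_low
 * set and blocked on &vm_backing_store_low; those call sites live
 * outside this file.
 */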
335
336
337 /*
338 * Routine: vm_pageout_object_allocate
339 * Purpose:
340 * Allocate an object for use as out-of-line memory in a
341 * data_return/data_initialize message.
342 * The page must be in an unlocked object.
343 *
344 * If the page belongs to a trusted pager, cleaning in place
345 * will be used, which utilizes a special "pageout object"
346 * containing private alias pages for the real page frames.
347 * Untrusted pagers use normal out-of-line memory.
348 */
349 vm_object_t
350 vm_pageout_object_allocate(
351 vm_page_t m,
352 vm_size_t size,
353 vm_object_offset_t offset)
354 {
355 vm_object_t object = m->object;
356 vm_object_t new_object;
357
358 assert(object->pager_ready);
359
360 new_object = vm_object_allocate(size);
361
362 if (object->pager_trusted) {
363 assert (offset < object->size);
364
365 vm_object_lock(new_object);
366 new_object->pageout = TRUE;
367 new_object->shadow = object;
368 new_object->can_persist = FALSE;
369 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
370 new_object->shadow_offset = offset;
371 vm_object_unlock(new_object);
372
373 /*
374 * Take a paging reference on the object. This will be dropped
375 * in vm_pageout_object_terminate()
376 */
377 vm_object_lock(object);
378 vm_object_paging_begin(object);
379 vm_page_lock_queues();
380 vm_page_unlock_queues();
381 vm_object_unlock(object);
382
383 vm_pageout_in_place++;
384 } else
385 vm_pageout_out_of_line++;
386 return(new_object);
387 }
388
389 #if MACH_CLUSTER_STATS
390 unsigned long vm_pageout_cluster_dirtied = 0;
391 unsigned long vm_pageout_cluster_cleaned = 0;
392 unsigned long vm_pageout_cluster_collisions = 0;
393 unsigned long vm_pageout_cluster_clusters = 0;
394 unsigned long vm_pageout_cluster_conversions = 0;
395 unsigned long vm_pageout_target_collisions = 0;
396 unsigned long vm_pageout_target_page_dirtied = 0;
397 unsigned long vm_pageout_target_page_freed = 0;
398 #define CLUSTER_STAT(clause) clause
399 #else /* MACH_CLUSTER_STATS */
400 #define CLUSTER_STAT(clause)
401 #endif /* MACH_CLUSTER_STATS */
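/*
 * CLUSTER_STAT(x) expands to 'x' only when MACH_CLUSTER_STATS is
 * configured, so statements such as
 *	CLUSTER_STAT(vm_pageout_cluster_dirtied++;)
 * disappear entirely from non-stats builds.
 */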
402
403 /*
404 * Routine: vm_pageout_object_terminate
405 * Purpose:
406 * Destroy the pageout_object allocated by
407 * vm_pageout_object_allocate(), and perform all of the
408 * required cleanup actions.
409 *
410 * In/Out conditions:
411 * The object must be locked, and will be returned locked.
412 */
413 void
414 vm_pageout_object_terminate(
415 vm_object_t object)
416 {
417 vm_object_t shadow_object;
418 boolean_t shadow_internal;
419
420 /*
421 * Deal with the deallocation (last reference) of a pageout object
422 * (used for cleaning-in-place) by dropping the paging references/
423 * freeing pages in the original object.
424 */
425
426 assert(object->pageout);
427 shadow_object = object->shadow;
428 vm_object_lock(shadow_object);
429 shadow_internal = shadow_object->internal;
430
431 while (!queue_empty(&object->memq)) {
432 vm_page_t p, m;
433 vm_object_offset_t offset;
434
435 p = (vm_page_t) queue_first(&object->memq);
436
437 assert(p->private);
438 assert(p->pageout);
439 p->pageout = FALSE;
440 assert(!p->cleaning);
441
442 offset = p->offset;
443 VM_PAGE_FREE(p);
444 p = VM_PAGE_NULL;
445
446 m = vm_page_lookup(shadow_object,
447 offset + object->shadow_offset);
448
449 if(m == VM_PAGE_NULL)
450 continue;
451 assert(m->cleaning);
452 /* used as a trigger on upl_commit etc to recognize the */
453 /* pageout daemon's subsequent desire to pageout a cleaning */
454 /* page. When the bit is set, the upl commit code will */
455 /* respect the pageout bit in the target page over the */
456 /* caller's page list indication */
457 m->dump_cleaning = FALSE;
458
459 /*
460 * Account for the paging reference taken when
461 * m->cleaning was set on this page.
462 */
463 vm_object_paging_end(shadow_object);
464 assert((m->dirty) || (m->precious) ||
465 (m->busy && m->cleaning));
466
467 /*
468 * Handle the trusted pager throttle.
469 * Also decrement the burst throttle (if external).
470 */
471 vm_page_lock_queues();
472 if (m->laundry) {
473 vm_pageout_throttle_up(m);
474 }
475
476 /*
477 * Handle the "target" page(s). These pages are to be freed if
478 * successfully cleaned. Target pages are always busy, and are
479 * wired exactly once. The initial target pages are not mapped,
480 * (so cannot be referenced or modified) but converted target
481 * pages may have been modified between the selection as an
482 * adjacent page and conversion to a target.
483 */
484 if (m->pageout) {
485 assert(m->busy);
486 assert(m->wire_count == 1);
487 m->cleaning = FALSE;
488 m->pageout = FALSE;
489 #if MACH_CLUSTER_STATS
490 if (m->wanted) vm_pageout_target_collisions++;
491 #endif
492 /*
493 * Revoke all access to the page. Since the object is
494 * locked, and the page is busy, this prevents the page
495 * from being dirtied after the pmap_disconnect() call
496 * returns.
497 *
498 * Since the page is left "dirty" but "not modified", we
499 * can detect whether the page was redirtied during
500 * pageout by checking the modify state.
501 */
502 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
503 m->dirty = TRUE;
504 else
505 m->dirty = FALSE;
506
507 if (m->dirty) {
508 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
509 vm_page_unwire(m);/* reactivates */
510 VM_STAT(reactivations++);
511 PAGE_WAKEUP_DONE(m);
512 } else {
513 CLUSTER_STAT(vm_pageout_target_page_freed++;)
514 vm_page_free(m);/* clears busy, etc. */
515 }
516 vm_page_unlock_queues();
517 continue;
518 }
519 /*
520 * Handle the "adjacent" pages. These pages were cleaned in
521 * place, and should be left alone.
522 * If the page was referenced, make it active; otherwise
523 * deactivate it.
524 */
525 if (!m->active && !m->inactive && !m->private) {
526 if (m->reference)
527 vm_page_activate(m);
528 else
529 vm_page_deactivate(m);
530 }
531 if((m->busy) && (m->cleaning)) {
532
533 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
534 m->busy = FALSE;
535
536 /* We do not re-set m->dirty ! */
537 /* The page was busy so no extraneous activity */
538 /* could have occurred. COPY_INTO is a read into the */
539 /* new pages. CLEAN_IN_PLACE does actually write */
540 /* out the pages but handling outside of this code */
541 /* will take care of resetting dirty. We clear the */
542 /* modify bit, however, for the Programmed I/O case. */
543 pmap_clear_modify(m->phys_page);
544 if(m->absent) {
545 m->absent = FALSE;
546 if(shadow_object->absent_count == 1)
547 vm_object_absent_release(shadow_object);
548 else
549 shadow_object->absent_count--;
550 }
551 m->overwriting = FALSE;
552 } else if (m->overwriting) {
553 /* alternate request page list, write to page_list */
554 /* case. Occurs when the original page was wired */
555 /* at the time of the list request */
556 assert(m->wire_count != 0);
557 vm_page_unwire(m);/* reactivates */
558 m->overwriting = FALSE;
559 } else {
560 /*
561 * Set the dirty state according to whether or not the page was
562 * modified during the pageout. Note that we purposefully do
563 * NOT call pmap_clear_modify since the page is still mapped.
564 * If the page were to be dirtied between the 2 calls, this
565 * fact would be lost. This code is only necessary to
566 * maintain statistics, since the pmap module is always
567 * consulted if m->dirty is false.
568 */
569 #if MACH_CLUSTER_STATS
570 m->dirty = pmap_is_modified(m->phys_page);
571
572 if (m->dirty) vm_pageout_cluster_dirtied++;
573 else vm_pageout_cluster_cleaned++;
574 if (m->wanted) vm_pageout_cluster_collisions++;
575 #else
576 m->dirty = 0;
577 #endif
578 }
579 m->cleaning = FALSE;
580
581 /*
582 * Wakeup any thread waiting for the page to be un-cleaning.
583 */
584 PAGE_WAKEUP(m);
585 vm_page_unlock_queues();
586 }
587 /*
588 * Account for the paging reference taken in vm_paging_object_allocate.
589 */
590 vm_object_paging_end(shadow_object);
591 vm_object_unlock(shadow_object);
592
593 assert(object->ref_count == 0);
594 assert(object->paging_in_progress == 0);
595 assert(object->resident_page_count == 0);
596 return;
597 }
598
599 /*
600 * Routine: vm_pageout_setup
601 * Purpose:
602 * Set up a page for pageout (clean & flush).
603 *
604 * Move the page to a new object, as part of which it will be
605 * sent to its memory manager in a memory_object_data_write or
606 * memory_object_initialize message.
607 *
608 * The "new_object" and "new_offset" arguments
609 * indicate where the page should be moved.
610 *
611 * In/Out conditions:
612 * The page in question must not be on any pageout queues,
613 * and must be busy. The object to which it belongs
614 * must be unlocked, and the caller must hold a paging
615 * reference to it. The new_object must not be locked.
616 *
617 * This routine returns a pointer to a place-holder page,
618 * inserted at the same offset, to block out-of-order
619 * requests for the page. The place-holder page must
620 * be freed after the data_write or initialize message
621 * has been sent.
622 *
623 * The original page is put on a paging queue and marked
624 * not busy on exit.
625 */
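/*
 * Caller protocol, in outline (a sketch only; the authoritative callers
 * live in the memory-object paths outside this routine):
 *
 *	holding_page = vm_pageout_setup(m, new_object, new_offset);
 *	... send memory_object_data_write/initialize naming new_object ...
 *	if (holding_page != VM_PAGE_NULL)
 *		VM_PAGE_FREE(holding_page);
 *
 * A trusted pager cleans in place, so VM_PAGE_NULL is returned and no
 * place-holder needs to be freed.
 */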
626 vm_page_t
627 vm_pageout_setup(
628 register vm_page_t m,
629 register vm_object_t new_object,
630 vm_object_offset_t new_offset)
631 {
632 register vm_object_t old_object = m->object;
633 vm_object_offset_t paging_offset;
634 vm_object_offset_t offset;
635 register vm_page_t holding_page;
636 register vm_page_t new_m;
637 boolean_t need_to_wire = FALSE;
638
639
640 XPR(XPR_VM_PAGEOUT,
641 "vm_pageout_setup, obj 0x%X off 0x%X page 0x%X new obj 0x%X offset 0x%X\n",
642 (integer_t)m->object, (integer_t)m->offset,
643 (integer_t)m, (integer_t)new_object,
644 (integer_t)new_offset);
645 assert(m && m->busy && !m->absent && !m->fictitious && !m->error &&
646 !m->restart);
647
648 assert(m->dirty || m->precious);
649
650 /*
651 * Create a place-holder page where the old one was, to prevent
652 * attempted pageins of this page while we're unlocked.
653 */
654 VM_PAGE_GRAB_FICTITIOUS(holding_page);
655
656 vm_object_lock(old_object);
657
658 offset = m->offset;
659 paging_offset = offset + old_object->paging_offset;
660
661 if (old_object->pager_trusted) {
662 /*
663 * This pager is trusted, so we can clean this page
664 * in place. Leave it in the old object, and mark it
665 * cleaning & pageout.
666 */
667 new_m = holding_page;
668 holding_page = VM_PAGE_NULL;
669
670 /*
671 * Set up new page to be private shadow of real page.
672 */
673 new_m->phys_page = m->phys_page;
674 new_m->fictitious = FALSE;
675 new_m->pageout = TRUE;
676
677 /*
678 * Mark real page as cleaning (indicating that we hold a
679 * paging reference to be released via m_o_d_r_c) and
680 * pageout (indicating that the page should be freed
681 * when the pageout completes).
682 */
683 pmap_clear_modify(m->phys_page);
684 vm_page_lock_queues();
685 new_m->private = TRUE;
686 vm_page_wire(new_m);
687 m->cleaning = TRUE;
688 m->pageout = TRUE;
689
690 vm_page_wire(m);
691 assert(m->wire_count == 1);
692 vm_page_unlock_queues();
693
694 m->dirty = TRUE;
695 m->precious = FALSE;
696 m->page_lock = VM_PROT_NONE;
697 m->unusual = FALSE;
698 m->unlock_request = VM_PROT_NONE;
699 } else {
700 /*
701 * Cannot clean in place, so rip the old page out of the
702 * object, and stick the holding page in. Set new_m to the
703 * page in the new object.
704 */
705 vm_page_lock_queues();
706 VM_PAGE_QUEUES_REMOVE(m);
707 vm_page_remove(m);
708
709 vm_page_insert(holding_page, old_object, offset);
710 vm_page_unlock_queues();
711
712 m->dirty = TRUE;
713 m->precious = FALSE;
714 new_m = m;
715 new_m->page_lock = VM_PROT_NONE;
716 new_m->unlock_request = VM_PROT_NONE;
717
718 if (old_object->internal)
719 need_to_wire = TRUE;
720 }
721 /*
722 * Record that this page has been written out
723 */
724 #if MACH_PAGEMAP
725 vm_external_state_set(old_object->existence_map, offset);
726 #endif /* MACH_PAGEMAP */
727
728 vm_object_unlock(old_object);
729
730 vm_object_lock(new_object);
731
732 /*
733 * Put the page into the new object. If it is not wired
734 * (i.e., it's the real page) it will be activated.
735 */
736
737 vm_page_lock_queues();
738 vm_page_insert(new_m, new_object, new_offset);
739 if (need_to_wire)
740 vm_page_wire(new_m);
741 else
742 vm_page_activate(new_m);
743 PAGE_WAKEUP_DONE(new_m);
744 vm_page_unlock_queues();
745
746 vm_object_unlock(new_object);
747
748 /*
749 * Return the placeholder page to simplify cleanup.
750 */
751 return (holding_page);
752 }
753
754 /*
755 * Routine: vm_pageclean_setup
756 *
757 * Purpose: setup a page to be cleaned (made non-dirty), but not
758 * necessarily flushed from the VM page cache.
759 * This is accomplished by cleaning in place.
760 *
761 * The page must not be busy, and the object and page
762 * queues must be locked.
763 *
764 */
765 void
766 vm_pageclean_setup(
767 vm_page_t m,
768 vm_page_t new_m,
769 vm_object_t new_object,
770 vm_object_offset_t new_offset)
771 {
772 vm_object_t old_object = m->object;
773 assert(!m->busy);
774 assert(!m->cleaning);
775
776 XPR(XPR_VM_PAGEOUT,
777 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
778 (integer_t)old_object, m->offset, (integer_t)m,
779 (integer_t)new_m, new_offset);
780
781 pmap_clear_modify(m->phys_page);
782 vm_object_paging_begin(old_object);
783
784 /*
785 * Record that this page has been written out
786 */
787 #if MACH_PAGEMAP
788 vm_external_state_set(old_object->existence_map, m->offset);
789 #endif /*MACH_PAGEMAP*/
790
791 /*
792 * Mark original page as cleaning in place.
793 */
794 m->cleaning = TRUE;
795 m->dirty = TRUE;
796 m->precious = FALSE;
797
798 /*
799 * Convert the fictitious page to a private shadow of
800 * the real page.
801 */
802 assert(new_m->fictitious);
803 new_m->fictitious = FALSE;
804 new_m->private = TRUE;
805 new_m->pageout = TRUE;
806 new_m->phys_page = m->phys_page;
807 vm_page_wire(new_m);
808
809 vm_page_insert(new_m, new_object, new_offset);
810 assert(!new_m->wanted);
811 new_m->busy = FALSE;
812 }
813
814 void
815 vm_pageclean_copy(
816 vm_page_t m,
817 vm_page_t new_m,
818 vm_object_t new_object,
819 vm_object_offset_t new_offset)
820 {
821 XPR(XPR_VM_PAGEOUT,
822 "vm_pageclean_copy, page 0x%X new_m 0x%X new_obj 0x%X offset 0x%X\n",
823 m, new_m, new_object, new_offset, 0);
824
825 assert((!m->busy) && (!m->cleaning));
826
827 assert(!new_m->private && !new_m->fictitious);
828
829 pmap_clear_modify(m->phys_page);
830
831 m->busy = TRUE;
832 vm_object_paging_begin(m->object);
833 vm_page_unlock_queues();
834 vm_object_unlock(m->object);
835
836 /*
837 * Copy the original page to the new page.
838 */
839 vm_page_copy(m, new_m);
840
841 /*
842 * Mark the old page as clean. A request to pmap_is_modified
843 * will get the right answer.
844 */
845 vm_object_lock(m->object);
846 m->dirty = FALSE;
847
848 vm_object_paging_end(m->object);
849
850 vm_page_lock_queues();
851 if (!m->active && !m->inactive)
852 vm_page_activate(m);
853 PAGE_WAKEUP_DONE(m);
854
855 vm_page_insert(new_m, new_object, new_offset);
856 vm_page_activate(new_m);
857 new_m->busy = FALSE; /* No other thread can be waiting */
858 }
859
860
861 /*
862 * Routine: vm_pageout_initialize_page
863 * Purpose:
864 * Causes the specified page to be initialized in
865 * the appropriate memory object. This routine is used to push
866 * pages into a copy-object when they are modified in the
867 * permanent object.
868 *
869 * The page is moved to a temporary object and paged out.
870 *
871 * In/out conditions:
872 * The page in question must not be on any pageout queues.
873 * The object to which it belongs must be locked.
874 * The page must be busy, but not hold a paging reference.
875 *
876 * Implementation:
877 * Move this page to a completely new object.
878 */
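/*
 * Rough sequence (mirrors the body below): take a paging reference on
 * the object, mark the page busy/cleaning/pageout and wire it, drop the
 * object lock, then push the data to the object's pager via
 * memory_object_data_initialize().
 */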
879 void
880 vm_pageout_initialize_page(
881 vm_page_t m)
882 {
883 vm_object_t object;
884 vm_object_offset_t paging_offset;
885 vm_page_t holding_page;
886
887
888 XPR(XPR_VM_PAGEOUT,
889 "vm_pageout_initialize_page, page 0x%X\n",
890 (integer_t)m, 0, 0, 0, 0);
891 assert(m->busy);
892
893 /*
894 * Verify that we really want to clean this page
895 */
896 assert(!m->absent);
897 assert(!m->error);
898 assert(m->dirty);
899
900 /*
901 * Create a paging reference to let us play with the object.
902 */
903 object = m->object;
904 paging_offset = m->offset + object->paging_offset;
905 vm_object_paging_begin(object);
906 if (m->absent || m->error || m->restart ||
907 (!m->dirty && !m->precious)) {
908 VM_PAGE_FREE(m);
909 panic("reservation without pageout?"); /* alan */
910 vm_object_unlock(object);
911 return;
912 }
913
914 /* set the page for future call to vm_fault_list_request */
915 holding_page = NULL;
916 vm_page_lock_queues();
917 pmap_clear_modify(m->phys_page);
918 m->dirty = TRUE;
919 m->busy = TRUE;
920 m->list_req_pending = TRUE;
921 m->cleaning = TRUE;
922 m->pageout = TRUE;
923 vm_page_wire(m);
924 vm_page_unlock_queues();
925 vm_object_unlock(object);
926
927 /*
928 * Write the data to its pager.
929 * Note that the data is passed by naming the new object,
930 * not a virtual address; the pager interface has been
931 * manipulated to use the "internal memory" data type.
932 * [The object reference from its allocation is donated
933 * to the eventual recipient.]
934 */
935 memory_object_data_initialize(object->pager,
936 paging_offset,
937 PAGE_SIZE);
938
939 vm_object_lock(object);
940 }
941
942 #if MACH_CLUSTER_STATS
943 #define MAXCLUSTERPAGES 16
944 struct {
945 unsigned long pages_in_cluster;
946 unsigned long pages_at_higher_offsets;
947 unsigned long pages_at_lower_offsets;
948 } cluster_stats[MAXCLUSTERPAGES];
949 #endif /* MACH_CLUSTER_STATS */
950
951 boolean_t allow_clustered_pageouts = FALSE;
952
953 /*
954 * vm_pageout_cluster:
955 *
956 * Given a page, queue it to the appropriate I/O thread,
957 * which will page it out and attempt to clean adjacent pages
958 * in the same operation.
959 *
960 * The page must be busy, and the object and queues locked. We will take a
961 * paging reference to prevent deallocation or collapse when we
962 * release the object lock back at the call site. The I/O thread
963 * is responsible for consuming this reference
964 *
965 * The page must not be on any pageout queue.
966 */
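/*
 * Hand-off summary (derived from the body below): the already-busy page
 * is wired, marked cleaning/pageout/laundry, appended to either the
 * internal or external vm_pageout_queue, and the matching iothread is
 * woken if it was idle.  vm_pageout_iothread_continue() later dequeues
 * the page and calls memory_object_data_return() on the owning object's
 * pager.
 */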
967
968 void
969 vm_pageout_cluster(vm_page_t m)
970 {
971 vm_object_t object = m->object;
972 struct vm_pageout_queue *q;
973
974
975 XPR(XPR_VM_PAGEOUT,
976 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
977 (integer_t)object, m->offset, (integer_t)m, 0, 0);
978
979 /*
980 * Only a certain kind of page is appreciated here.
981 */
982 assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
983 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
984
985 /*
986 * protect the object from collapse -
987 * locking in the object's paging_offset.
988 */
989 vm_object_paging_begin(object);
990
991 /*
992 * set the page for future call to vm_fault_list_request
993 * page should already be marked busy
994 */
995 vm_page_wire(m);
996 m->list_req_pending = TRUE;
997 m->cleaning = TRUE;
998 m->pageout = TRUE;
999 m->laundry = TRUE;
1000
1001 if (object->internal == TRUE)
1002 q = &vm_pageout_queue_internal;
1003 else
1004 q = &vm_pageout_queue_external;
1005 q->pgo_laundry++;
1006
1007 m->pageout_queue = TRUE;
1008 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
1009
1010 if (q->pgo_idle == TRUE) {
1011 q->pgo_idle = FALSE;
1012 thread_wakeup((event_t) &q->pgo_pending);
1013 }
1014 }
1015
1016
1017 unsigned long vm_pageout_throttle_up_count = 0;
1018
1019 /*
1020 * A page is back from laundry. See if there are some pages waiting to
1021 * go to laundry and if we can let some of them go now.
1022 *
1023 * Object and page queues must be locked.
1024 */
1025 void
1026 vm_pageout_throttle_up(
1027 vm_page_t m)
1028 {
1029 struct vm_pageout_queue *q;
1030
1031 vm_pageout_throttle_up_count++;
1032
1033 assert(m->laundry);
1034 assert(m->object != VM_OBJECT_NULL);
1035 assert(m->object != kernel_object);
1036
1037 if (m->object->internal == TRUE)
1038 q = &vm_pageout_queue_internal;
1039 else
1040 q = &vm_pageout_queue_external;
1041
1042 m->laundry = FALSE;
1043 q->pgo_laundry--;
1044
1045 if (q->pgo_throttled == TRUE) {
1046 q->pgo_throttled = FALSE;
1047 thread_wakeup((event_t) &q->pgo_laundry);
1048 }
1049 }
1050
1051
1052 /*
1053 * vm_pageout_scan does the dirty work for the pageout daemon.
1054 * It returns with vm_page_queue_free_lock held and
1055 * vm_page_free_wanted == 0.
1056 */
1057
1058 #define DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
1059
1060 #define FCS_IDLE 0
1061 #define FCS_DELAYED 1
1062 #define FCS_DEADLOCK_DETECTED 2
1063
1064 struct flow_control {
1065 int state;
1066 mach_timespec_t ts;
1067 };
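/*
 * Flow-control summary (a restatement of the state machine coded in
 * vm_pageout_scan() below), used when the default-pager queue stays
 * throttled:
 *
 *	FCS_IDLE		arm a timer of vm_pageout_deadlock_wait
 *				milliseconds and move to FCS_DELAYED
 *	FCS_DELAYED		if the timer expires with the queue still
 *				throttled, assume a deadlock and move to
 *				FCS_DEADLOCK_DETECTED
 *	FCS_DEADLOCK_DETECTED	force vm_pageout_deadlock_target pages
 *				through, then re-arm the timer
 */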
1068
1069 void
1070 vm_pageout_scan(void)
1071 {
1072 unsigned int loop_count = 0;
1073 unsigned int inactive_burst_count = 0;
1074 unsigned int active_burst_count = 0;
1075 vm_page_t local_freeq = 0;
1076 int local_freed = 0;
1077 int delayed_unlock = 0;
1078 int need_internal_inactive = 0;
1079 int refmod_state = 0;
1080 int vm_pageout_deadlock_target = 0;
1081 struct vm_pageout_queue *iq;
1082 struct vm_pageout_queue *eq;
1083 struct flow_control flow_control;
1084 boolean_t active_throttled = FALSE;
1085 boolean_t inactive_throttled = FALSE;
1086 mach_timespec_t ts;
1087 unsigned int msecs = 0;
1088 vm_object_t object;
1089
1090
1091 flow_control.state = FCS_IDLE;
1092 iq = &vm_pageout_queue_internal;
1093 eq = &vm_pageout_queue_external;
1094
1095 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1096
1097 /*???*/ /*
1098 * We want to gradually dribble pages from the active queue
1099 * to the inactive queue. If we let the inactive queue get
1100 * very small, and then suddenly dump many pages into it,
1101 * those pages won't get a sufficient chance to be referenced
1102 * before we start taking them from the inactive queue.
1103 *
1104 * We must limit the rate at which we send pages to the pagers.
1105 * data_write messages consume memory, for message buffers and
1106 * for map-copy objects. If we get too far ahead of the pagers,
1107 * we can potentially run out of memory.
1108 *
1109 * We can use the laundry count to limit directly the number
1110 * of pages outstanding to the default pager. A similar
1111 * strategy for external pagers doesn't work, because
1112 * external pagers don't have to deallocate the pages sent them,
1113 * and because we might have to send pages to external pagers
1114 * even if they aren't processing writes. So we also
1115 * use a burst count to limit writes to external pagers.
1116 *
1117 * When memory is very tight, we can't rely on external pagers to
1118 * clean pages. They probably aren't running, because they
1119 * aren't vm-privileged. If we kept sending dirty pages to them,
1120 * we could exhaust the free list.
1121 */
1122 vm_page_lock_queues();
1123 delayed_unlock = 1;
1124
1125
1126 Restart:
1127 /*
1128 * Recalculate vm_page_inactivate_target.
1129 */
1130 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1131 vm_page_inactive_count);
1132 object = NULL;
1133
1134 for (;;) {
1135 vm_page_t m;
1136
1137 if (delayed_unlock == 0)
1138 vm_page_lock_queues();
1139
1140 active_burst_count = vm_page_active_count;
1141
1142 if (active_burst_count > vm_pageout_burst_active_throttle)
1143 active_burst_count = vm_pageout_burst_active_throttle;
1144
1145 /*
1146 * Move pages from active to inactive.
1147 */
1148 while ((need_internal_inactive ||
1149 vm_page_inactive_count < vm_page_inactive_target) &&
1150 !queue_empty(&vm_page_queue_active) &&
1151 ((active_burst_count--) > 0)) {
1152
1153 vm_pageout_active++;
1154
1155 m = (vm_page_t) queue_first(&vm_page_queue_active);
1156
1157 assert(m->active && !m->inactive);
1158 assert(!m->laundry);
1159 assert(m->object != kernel_object);
1160
1161 /*
1162 * Try to lock object; since we've already got the
1163 * page queues lock, we can only 'try' for this one.
1164 * if the 'try' fails, we need to do a mutex_pause
1165 * to allow the owner of the object lock a chance to
1166 * run... otherwise, we're likely to trip over this
1167 * object in the same state as we work our way through
1168 * the queue... clumps of pages associated with the same
1169 * object are fairly typical on the inactive and active queues
1170 */
1171 if (m->object != object) {
1172 if (object != NULL) {
1173 vm_object_unlock(object);
1174 object = NULL;
1175 }
1176 if (!vm_object_lock_try(m->object)) {
1177 /*
1178 * move page to end of active queue and continue
1179 */
1180 queue_remove(&vm_page_queue_active, m,
1181 vm_page_t, pageq);
1182 queue_enter(&vm_page_queue_active, m,
1183 vm_page_t, pageq);
1184
1185 goto done_with_activepage;
1186 }
1187 object = m->object;
1188 }
1189 /*
1190 * if the page is BUSY, then we pull it
1191 * off the active queue and leave it alone.
1192 * when BUSY is cleared, it will get stuck
1193 * back on the appropriate queue
1194 */
1195 if (m->busy) {
1196 queue_remove(&vm_page_queue_active, m,
1197 vm_page_t, pageq);
1198 m->pageq.next = NULL;
1199 m->pageq.prev = NULL;
1200
1201 if (!m->fictitious)
1202 vm_page_active_count--;
1203 m->active = FALSE;
1204
1205 goto done_with_activepage;
1206 }
1207 if (need_internal_inactive) {
1208 /*
1209 * If we're unable to make forward progress
1210 * with the current set of pages on the
1211 * inactive queue due to busy objects or
1212 * throttled pageout queues, then
1213 * move a page that is already clean
1214 * or belongs to a pageout queue that
1215 * isn't currently throttled
1216 */
1217 active_throttled = FALSE;
1218
1219 if (object->internal) {
1220 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1221 active_throttled = TRUE;
1222 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1223 active_throttled = TRUE;
1224 }
1225 if (active_throttled == TRUE) {
1226 if (!m->dirty) {
1227 refmod_state = pmap_get_refmod(m->phys_page);
1228
1229 if (refmod_state & VM_MEM_REFERENCED)
1230 m->reference = TRUE;
1231 if (refmod_state & VM_MEM_MODIFIED)
1232 m->dirty = TRUE;
1233 }
1234 if (m->dirty || m->precious) {
1235 /*
1236 * page is dirty and targets a THROTTLED queue
1237 * so all we can do is move it back to the
1238 * end of the active queue to get it out
1239 * of the way
1240 */
1241 queue_remove(&vm_page_queue_active, m,
1242 vm_page_t, pageq);
1243 queue_enter(&vm_page_queue_active, m,
1244 vm_page_t, pageq);
1245
1246 vm_pageout_scan_active_throttled++;
1247
1248 goto done_with_activepage;
1249 }
1250 }
1251 vm_pageout_scan_active_throttle_success++;
1252 need_internal_inactive--;
1253 }
1254 /*
1255 * Deactivate the page while holding the object
1256 * locked, so we know the page is still not busy.
1257 * This should prevent races between pmap_enter
1258 * and pmap_clear_reference. The page might be
1259 * absent or fictitious, but vm_page_deactivate
1260 * can handle that.
1261 */
1262 vm_page_deactivate(m);
1263 done_with_activepage:
1264 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1265
1266 if (object != NULL) {
1267 vm_object_unlock(object);
1268 object = NULL;
1269 }
1270 if (local_freeq) {
1271 vm_page_free_list(local_freeq);
1272
1273 local_freeq = 0;
1274 local_freed = 0;
1275 }
1276 delayed_unlock = 0;
1277 vm_page_unlock_queues();
1278
1279 mutex_pause();
1280 vm_page_lock_queues();
1281 /*
1282 * continue the while loop processing
1283 * the active queue... need to hold
1284 * the page queues lock
1285 */
1286 continue;
1287 }
1288 }
1289
1290
1291
1292 /**********************************************************************
1293 * above this point we're playing with the active queue
1294 * below this point we're playing with the throttling mechanisms
1295 * and the inactive queue
1296 **********************************************************************/
1297
1298
1299
1300 /*
1301 * We are done if we have met our target *and*
1302 * nobody is still waiting for a page.
1303 */
1304 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1305 if (object != NULL) {
1306 vm_object_unlock(object);
1307 object = NULL;
1308 }
1309 if (local_freeq) {
1310 vm_page_free_list(local_freeq);
1311
1312 local_freeq = 0;
1313 local_freed = 0;
1314 }
1315 mutex_lock(&vm_page_queue_free_lock);
1316
1317 if ((vm_page_free_count >= vm_page_free_target) &&
1318 (vm_page_free_wanted == 0)) {
1319
1320 vm_page_unlock_queues();
1321
1322 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1323 return;
1324 }
1325 mutex_unlock(&vm_page_queue_free_lock);
1326 }
1327
1328
1329 /*
1330 * Sometimes we have to pause:
1331 * 1) No inactive pages - nothing to do.
1332 * 2) Flow control - default pageout queue is full
1333 * 3) Loop control - no acceptable pages found on the inactive queue
1334 * within the last vm_pageout_burst_inactive_throttle iterations
1335 */
1336 if ((queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf))) {
1337 vm_pageout_scan_empty_throttle++;
1338 msecs = vm_pageout_empty_wait;
1339 goto vm_pageout_scan_delay;
1340
1341 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1342 vm_pageout_scan_burst_throttle++;
1343 msecs = vm_pageout_burst_wait;
1344 goto vm_pageout_scan_delay;
1345
1346 } else if (VM_PAGE_Q_THROTTLED(iq)) {
1347
1348 switch (flow_control.state) {
1349
1350 case FCS_IDLE:
1351 reset_deadlock_timer:
1352 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1353 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1354 clock_get_system_nanotime(
1355 &flow_control.ts.tv_sec,
1356 (uint32_t *) &flow_control.ts.tv_nsec);
1357 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1358
1359 flow_control.state = FCS_DELAYED;
1360 msecs = vm_pageout_deadlock_wait;
1361
1362 break;
1363
1364 case FCS_DELAYED:
1365 clock_get_system_nanotime(
1366 &ts.tv_sec,
1367 (uint32_t *) &ts.tv_nsec);
1368
1369 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1370 /*
1371 * the pageout thread for the default pager is potentially
1372 * deadlocked since the
1373 * default pager queue has been throttled for more than the
1374 * allowable time... we need to move some clean pages or dirty
1375 * pages belonging to the external pagers if they aren't throttled
1376 * vm_page_free_wanted represents the number of threads currently
1377 * blocked waiting for pages... we'll move one page for each of
1378 * these plus a fixed amount to break the logjam... once we're done
1379 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1380 * with a new timeout target since we have no way of knowing
1381 * whether we've broken the deadlock except through observation
1382 * of the queue associated with the default pager... we need to
1383 * stop moving pages and allow the system to run to see what
1384 * state it settles into.
1385 */
1386 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted;
1387 vm_pageout_scan_deadlock_detected++;
1388 flow_control.state = FCS_DEADLOCK_DETECTED;
1389
1390 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1391 goto consider_inactive;
1392 }
1393 /*
1394 * just resniff instead of trying
1395 * to compute a new delay time... we're going to be
1396 * awakened immediately upon a laundry completion,
1397 * so we won't wait any longer than necessary
1398 */
1399 msecs = vm_pageout_idle_wait;
1400 break;
1401
1402 case FCS_DEADLOCK_DETECTED:
1403 if (vm_pageout_deadlock_target)
1404 goto consider_inactive;
1405 goto reset_deadlock_timer;
1406
1407 }
1408 vm_pageout_scan_throttle++;
1409 iq->pgo_throttled = TRUE;
1410 vm_pageout_scan_delay:
1411 if (object != NULL) {
1412 vm_object_unlock(object);
1413 object = NULL;
1414 }
1415 if (local_freeq) {
1416 vm_page_free_list(local_freeq);
1417
1418 local_freeq = 0;
1419 local_freed = 0;
1420 }
1421 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1422
1423 counter(c_vm_pageout_scan_block++);
1424
1425 vm_page_unlock_queues();
1426
1427 thread_block(THREAD_CONTINUE_NULL);
1428
1429 vm_page_lock_queues();
1430 delayed_unlock = 1;
1431
1432 iq->pgo_throttled = FALSE;
1433
1434 if (loop_count >= vm_page_inactive_count) {
1435 if (VM_PAGE_Q_THROTTLED(eq) || VM_PAGE_Q_THROTTLED(iq)) {
1436 /*
1437 * Make sure we move enough "appropriate"
1438 * pages to the inactive queue before trying
1439 * again.
1440 */
1441 need_internal_inactive = vm_pageout_inactive_relief;
1442 }
1443 loop_count = 0;
1444 }
1445 inactive_burst_count = 0;
1446
1447 goto Restart;
1448 /*NOTREACHED*/
1449 }
1450
1451
1452 flow_control.state = FCS_IDLE;
1453 consider_inactive:
1454 loop_count++;
1455 inactive_burst_count++;
1456 vm_pageout_inactive++;
1457
1458 if (!queue_empty(&vm_page_queue_inactive)) {
1459 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1460
1461 if (m->clustered && (m->no_isync == TRUE)) {
1462 goto use_this_page;
1463 }
1464 }
1465 if (vm_zf_count < vm_accellerate_zf_pageout_trigger) {
1466 vm_zf_iterator = 0;
1467 } else {
1468 last_page_zf = 0;
1469 if((vm_zf_iterator+=1) >= vm_zf_iterator_count) {
1470 vm_zf_iterator = 0;
1471 }
1472 }
1473 if (queue_empty(&vm_page_queue_zf) ||
1474 (((last_page_zf) || (vm_zf_iterator == 0)) &&
1475 !queue_empty(&vm_page_queue_inactive))) {
1476 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1477 last_page_zf = 0;
1478 } else {
1479 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1480 last_page_zf = 1;
1481 }
1482 use_this_page:
1483 assert(!m->active && m->inactive);
1484 assert(!m->laundry);
1485 assert(m->object != kernel_object);
1486
1487 /*
1488 * Try to lock object; since we've already got the
1489 * page queues lock, we can only 'try' for this one.
1490 * if the 'try' fails, we need to do a mutex_pause
1491 * to allow the owner of the object lock a chance to
1492 * run... otherwise, we're likely to trip over this
1493 * object in the same state as we work our way through
1494 * the queue... clumps of pages associated with the same
1495 * object are fairly typical on the inactive and active queues
1496 */
1497 if (m->object != object) {
1498 if (object != NULL) {
1499 vm_object_unlock(object);
1500 object = NULL;
1501 }
1502 if (!vm_object_lock_try(m->object)) {
1503 /*
1504 * Move page to end and continue.
1505 * Don't re-issue ticket
1506 */
1507 if (m->zero_fill) {
1508 queue_remove(&vm_page_queue_zf, m,
1509 vm_page_t, pageq);
1510 queue_enter(&vm_page_queue_zf, m,
1511 vm_page_t, pageq);
1512 } else {
1513 queue_remove(&vm_page_queue_inactive, m,
1514 vm_page_t, pageq);
1515 queue_enter(&vm_page_queue_inactive, m,
1516 vm_page_t, pageq);
1517 }
1518 vm_pageout_inactive_nolock++;
1519
1520 /*
1521 * force us to dump any collected free pages
1522 * and to pause before moving on
1523 */
1524 delayed_unlock = DELAYED_UNLOCK_LIMIT + 1;
1525
1526 goto done_with_inactivepage;
1527 }
1528 object = m->object;
1529 }
1530 /*
1531 * If the page belongs to a purgable object with no pending copies
1532 * against it, then we reap all of the pages in the object
1533 * and note that the object has been "emptied". It'll be up to the
1534 * application to discover this and recreate its contents if desired.
1535 */
1536 if ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE ||
1537 object->purgable == VM_OBJECT_PURGABLE_EMPTY) &&
1538 object->copy == VM_OBJECT_NULL) {
1539
1540 (void) vm_object_purge(object);
1541 vm_pageout_purged_objects++;
1542 /*
1543 * we've just taken all of the pages from this object,
1544 * so drop the lock now since we're not going to find
1545 * any more pages belonging to it anytime soon
1546 */
1547 vm_object_unlock(object);
1548 object = NULL;
1549
1550 inactive_burst_count = 0;
1551
1552 goto done_with_inactivepage;
1553 }
1554
1555 /*
1556 * Paging out pages of external objects which
1557 * are currently being created must be avoided.
1558 * The pager may itself claim memory, leading to a
1559 * possible deadlock between it and the pageout thread,
1560 * if such pages are finally chosen. The remaining assumption
1561 * is that there will finally be enough available pages in the
1562 * inactive pool to page out in order to satisfy all memory
1563 * claimed by the thread which concurrently creates the pager.
1564 */
1565 if (!object->pager_initialized && object->pager_created) {
1566 /*
1567 * Move page to end and continue, hoping that
1568 * there will be enough other inactive pages to
1569 * page out so that the thread which currently
1570 * initializes the pager will succeed.
1571 * Don't re-grant the ticket; the page should
1572 * be pulled from the queue and paged out whenever
1573 * one of its logically adjacent fellows is
1574 * targeted.
1575 */
1576 if (m->zero_fill) {
1577 queue_remove(&vm_page_queue_zf, m,
1578 vm_page_t, pageq);
1579 queue_enter(&vm_page_queue_zf, m,
1580 vm_page_t, pageq);
1581 last_page_zf = 1;
1582 vm_zf_iterator = vm_zf_iterator_count - 1;
1583 } else {
1584 queue_remove(&vm_page_queue_inactive, m,
1585 vm_page_t, pageq);
1586 queue_enter(&vm_page_queue_inactive, m,
1587 vm_page_t, pageq);
1588 last_page_zf = 0;
1589 vm_zf_iterator = 1;
1590 }
1591 vm_pageout_inactive_avoid++;
1592
1593 goto done_with_inactivepage;
1594 }
1595 /*
1596 * Remove the page from the inactive list.
1597 */
1598 if (m->zero_fill) {
1599 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1600 } else {
1601 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1602 }
1603 m->pageq.next = NULL;
1604 m->pageq.prev = NULL;
1605 m->inactive = FALSE;
1606 if (!m->fictitious)
1607 vm_page_inactive_count--;
1608
1609 if (m->busy || !object->alive) {
1610 /*
1611 * Somebody is already playing with this page.
1612 * Leave it off the pageout queues.
1613 */
1614 vm_pageout_inactive_busy++;
1615
1616 goto done_with_inactivepage;
1617 }
1618
1619 /*
1620 * If it's absent or in error, we can reclaim the page.
1621 */
1622
1623 if (m->absent || m->error) {
1624 vm_pageout_inactive_absent++;
1625 reclaim_page:
1626 if (vm_pageout_deadlock_target) {
1627 vm_pageout_scan_inactive_throttle_success++;
1628 vm_pageout_deadlock_target--;
1629 }
1630 if (m->tabled)
1631 vm_page_remove(m); /* clears tabled, object, offset */
1632 if (m->absent)
1633 vm_object_absent_release(object);
1634
1635 assert(m->pageq.next == NULL &&
1636 m->pageq.prev == NULL);
1637 m->pageq.next = (queue_entry_t)local_freeq;
1638 local_freeq = m;
1639 local_freed++;
1640
1641 inactive_burst_count = 0;
1642
1643 goto done_with_inactivepage;
1644 }
1645
1646 assert(!m->private);
1647 assert(!m->fictitious);
1648
1649 /*
1650 * If already cleaning this page in place, convert from
1651 * "adjacent" to "target". We can leave the page mapped,
1652 * and vm_pageout_object_terminate will determine whether
1653 * to free or reactivate.
1654 */
1655
1656 if (m->cleaning) {
1657 m->busy = TRUE;
1658 m->pageout = TRUE;
1659 m->dump_cleaning = TRUE;
1660 vm_page_wire(m);
1661
1662 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1663
1664 inactive_burst_count = 0;
1665
1666 goto done_with_inactivepage;
1667 }
1668
1669 /*
1670 * If it's being used, reactivate.
1671 * (Fictitious pages are either busy or absent.)
1672 */
1673 if ( (!m->reference) ) {
1674 refmod_state = pmap_get_refmod(m->phys_page);
1675
1676 if (refmod_state & VM_MEM_REFERENCED)
1677 m->reference = TRUE;
1678 if (refmod_state & VM_MEM_MODIFIED)
1679 m->dirty = TRUE;
1680 }
1681 if (m->reference) {
1682 was_referenced:
1683 vm_page_activate(m);
1684 VM_STAT(reactivations++);
1685
1686 vm_pageout_inactive_used++;
1687 last_page_zf = 0;
1688 inactive_burst_count = 0;
1689
1690 goto done_with_inactivepage;
1691 }
1692
1693 XPR(XPR_VM_PAGEOUT,
1694 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1695 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1696
1697 /*
1698 * we've got a candidate page to steal...
1699 *
1700 * m->dirty is up to date courtesy of the
1701 * preceding check for m->reference... if
1702 * we get here, then m->reference had to be
1703 * FALSE which means we did a pmap_get_refmod
1704 * and updated both m->reference and m->dirty
1705 *
1706 * if it's dirty or precious we need to
1707 * see if the target queue is throttled...
1708 * if it is, we need to skip over it by moving it back
1709 * to the end of the inactive queue
1710 */
1711 inactive_throttled = FALSE;
1712
1713 if (m->dirty || m->precious) {
1714 if (object->internal) {
1715 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1716 inactive_throttled = TRUE;
1717 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1718 inactive_throttled = TRUE;
1719 }
1720 }
1721 if (inactive_throttled == TRUE) {
1722 if (m->zero_fill) {
1723 queue_enter(&vm_page_queue_zf, m,
1724 vm_page_t, pageq);
1725 } else {
1726 queue_enter(&vm_page_queue_inactive, m,
1727 vm_page_t, pageq);
1728 }
1729 if (!m->fictitious)
1730 vm_page_inactive_count++;
1731 m->inactive = TRUE;
1732
1733 vm_pageout_scan_inactive_throttled++;
1734
1735 goto done_with_inactivepage;
1736 }
1737 /*
1738 * we've got a page that we can steal...
1739 * eliminate all mappings and make sure
1740 * we have the up-to-date modified state
1741 * first take the page BUSY, so that no new
1742 * mappings can be made
1743 */
1744 m->busy = TRUE;
1745
1746 /*
1747 * if we need to do a pmap_disconnect then we
1748 * need to re-evaluate m->dirty since the pmap_disconnect
1749 * provides the true state atomically... the
1750 * page was still mapped up to the pmap_disconnect
1751 * and may have been dirtied at the last microsecond
1752 *
1753 * we also check for the page being referenced 'late'
1754 * if it was, we first need to do a WAKEUP_DONE on it
1755 * since we already set m->busy = TRUE, before
1756 * going off to reactivate it
1757 *
1758 * if we don't need the pmap_disconnect, then
1759 * m->dirty is up to date courtesy of the
1760 * earlier check for m->reference... if
1761 * we get here, then m->reference had to be
1762 * FALSE which means we did a pmap_get_refmod
1763 * and updated both m->reference and m->dirty...
1764 */
1765 if (m->no_isync == FALSE) {
1766 refmod_state = pmap_disconnect(m->phys_page);
1767
1768 if (refmod_state & VM_MEM_MODIFIED)
1769 m->dirty = TRUE;
1770 if (refmod_state & VM_MEM_REFERENCED) {
1771 m->reference = TRUE;
1772
1773 PAGE_WAKEUP_DONE(m);
1774 goto was_referenced;
1775 }
1776 }
1777 /*
1778 * If it's clean and not precious, we can free the page.
1779 */
1780 if (!m->dirty && !m->precious) {
1781 vm_pageout_inactive_clean++;
1782 goto reclaim_page;
1783 }
1784 vm_pageout_cluster(m);
1785
1786 vm_pageout_inactive_dirty++;
1787
1788 inactive_burst_count = 0;
1789
1790 done_with_inactivepage:
1791 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1792
1793 if (object != NULL) {
1794 vm_object_unlock(object);
1795 object = NULL;
1796 }
1797 if (local_freeq) {
1798 vm_page_free_list(local_freeq);
1799
1800 local_freeq = 0;
1801 local_freed = 0;
1802 }
1803 delayed_unlock = 0;
1804 vm_page_unlock_queues();
1805 mutex_pause();
1806 }
1807 /*
1808 * back to top of pageout scan loop
1809 */
1810 }
1811 }
1812
1813
1814 int vm_page_free_count_init;
1815
1816 void
1817 vm_page_free_reserve(
1818 int pages)
1819 {
1820 int free_after_reserve;
1821
1822 vm_page_free_reserved += pages;
1823
1824 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1825
1826 vm_page_free_min = vm_page_free_reserved +
1827 VM_PAGE_FREE_MIN(free_after_reserve);
1828
1829 vm_page_free_target = vm_page_free_reserved +
1830 VM_PAGE_FREE_TARGET(free_after_reserve);
1831
1832 if (vm_page_free_target < vm_page_free_min + 5)
1833 vm_page_free_target = vm_page_free_min + 5;
1834 }
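/*
 * Continuing the numeric illustration from the macro definitions above:
 * with vm_page_free_reserved raised to 196 pages and 99804 pages left
 * after the reserve, vm_page_free_min becomes 196 + 1008 = 1204 and
 * vm_page_free_target becomes 196 + 1262 = 1458, comfortably above
 * min + 5, so the final clamp does not fire.  Example numbers only.
 */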
1835
1836 /*
1837 * vm_pageout is the high level pageout daemon.
1838 */
1839
1840 void
1841 vm_pageout_continue(void)
1842 {
1843 vm_pageout_scan_event_counter++;
1844 vm_pageout_scan();
1845 /* we hold vm_page_queue_free_lock now */
1846 assert(vm_page_free_wanted == 0);
1847 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
1848 mutex_unlock(&vm_page_queue_free_lock);
1849
1850 counter(c_vm_pageout_block++);
1851 thread_block((thread_continue_t)vm_pageout_continue);
1852 /*NOTREACHED*/
1853 }
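/*
 * Note the continuation style: vm_pageout_continue() never returns.  It
 * waits on &vm_page_free_wanted and names itself as the continuation,
 * so every wakeup re-runs vm_pageout_scan() on a fresh stack.
 */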
1854
1855
1856 /*
1857 * must be called with the
1858 * queues and object locks held
1859 */
1860 static void
1861 vm_pageout_queue_steal(vm_page_t m)
1862 {
1863 struct vm_pageout_queue *q;
1864
1865 if (m->object->internal == TRUE)
1866 q = &vm_pageout_queue_internal;
1867 else
1868 q = &vm_pageout_queue_external;
1869
1870 m->laundry = FALSE;
1871 m->pageout_queue = FALSE;
1872 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
1873
1874 m->pageq.next = NULL;
1875 m->pageq.prev = NULL;
1876
1877 vm_object_paging_end(m->object);
1878
1879 q->pgo_laundry--;
1880 }
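/*
 * Caller-side sketch of the contract stated above: the page queues lock
 * and the page's object lock are already held, and the page is known to
 * be sitting on one of the two pageout queues.  This mirrors the
 * UPL_FOR_PAGEOUT path in vm_object_upl_request() later in this file;
 * the snippet is illustrative only and is not compiled.
 */
#if 0
vm_page_t dst_page;	/* a page buddied into a pageout UPL (hypothetical) */

vm_page_lock_queues();
if (dst_page->pageout_queue == TRUE)
	/* pageout_scan already queued it; pull it back off the queue */
	vm_pageout_queue_steal(dst_page);
vm_page_unlock_queues();
#endif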
1881
1882
1883 #ifdef FAKE_DEADLOCK
1884
1885 #define FAKE_COUNT 5000
1886
1887 int internal_count = 0;
1888 int fake_deadlock = 0;
1889
1890 #endif
1891
1892 static void
1893 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
1894 {
1895 vm_page_t m = NULL;
1896 vm_object_t object;
1897 boolean_t need_wakeup;
1898
1899 vm_page_lock_queues();
1900
1901 while ( !queue_empty(&q->pgo_pending) ) {
1902
1903 q->pgo_busy = TRUE;
1904 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
1905 m->pageout_queue = FALSE;
1906 vm_page_unlock_queues();
1907
1908 m->pageq.next = NULL;
1909 m->pageq.prev = NULL;
1910 #ifdef FAKE_DEADLOCK
1911 if (q == &vm_pageout_queue_internal) {
1912 vm_offset_t addr;
1913 int pg_count;
1914
1915 internal_count++;
1916
1917 if (internal_count == FAKE_COUNT) {
1918
1919 pg_count = vm_page_free_count + vm_page_free_reserved;
1920
1921 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
1922 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
1923 }
1924 internal_count = 0;
1925 fake_deadlock++;
1926 }
1927 }
1928 #endif
1929 object = m->object;
1930
1931 if (!object->pager_initialized) {
1932 vm_object_lock(object);
1933
1934 /*
1935 * If there is no memory object for the page, create
1936 * one and hand it to the default pager.
1937 */
1938
1939 if (!object->pager_initialized)
1940 vm_object_collapse(object,
1941 (vm_object_offset_t) 0,
1942 TRUE);
1943 if (!object->pager_initialized)
1944 vm_object_pager_create(object);
1945 if (!object->pager_initialized) {
1946 /*
1947 * Still no pager for the object.
1948 * Reactivate the page.
1949 *
1950 * Should only happen if there is no
1951 * default pager.
1952 */
1953 m->list_req_pending = FALSE;
1954 m->cleaning = FALSE;
1955 m->pageout = FALSE;
1956 vm_page_unwire(m);
1957
1958 vm_pageout_throttle_up(m);
1959
1960 vm_page_lock_queues();
1961 vm_pageout_dirty_no_pager++;
1962 vm_page_activate(m);
1963 vm_page_unlock_queues();
1964
1965 /*
1966 * And we are done with it.
1967 */
1968 PAGE_WAKEUP_DONE(m);
1969
1970 vm_object_paging_end(object);
1971 vm_object_unlock(object);
1972
1973 vm_page_lock_queues();
1974 continue;
1975 } else if (object->pager == MEMORY_OBJECT_NULL) {
1976 /*
1977 * This pager has been destroyed by either
1978 * memory_object_destroy or vm_object_destroy, and
1979 * so there is nowhere for the page to go.
1980 * Just free the page... VM_PAGE_FREE takes
1981 * care of cleaning up all the state...
1982 * including doing the vm_pageout_throttle_up
1983 */
1984 VM_PAGE_FREE(m);
1985
1986 vm_object_paging_end(object);
1987 vm_object_unlock(object);
1988
1989 vm_page_lock_queues();
1990 continue;
1991 }
1992 vm_object_unlock(object);
1993 }
1994 /*
1995 * we expect the paging_in_progress reference to have
1996 * already been taken on the object before it was added
1997 * to the appropriate pageout I/O queue... this will
1998 * keep the object from being terminated and/or the
1999 * paging_offset from changing until the I/O has
2000 * completed... therefore no need to lock the object to
2001 * pull the paging_offset from it.
2002 *
2003 * Send the data to the pager.
2004 * any pageout clustering happens there
2005 */
2006 memory_object_data_return(object->pager,
2007 m->offset + object->paging_offset,
2008 PAGE_SIZE,
2009 NULL,
2010 NULL,
2011 FALSE,
2012 FALSE,
2013 0);
2014
2015 vm_object_lock(object);
2016 vm_object_paging_end(object);
2017 vm_object_unlock(object);
2018
2019 vm_page_lock_queues();
2020 }
2021 assert_wait((event_t) q, THREAD_UNINT);
2022
2023
2024 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2025 q->pgo_throttled = FALSE;
2026 need_wakeup = TRUE;
2027 } else
2028 need_wakeup = FALSE;
2029
2030 q->pgo_busy = FALSE;
2031 q->pgo_idle = TRUE;
2032 vm_page_unlock_queues();
2033
2034 if (need_wakeup == TRUE)
2035 thread_wakeup((event_t) &q->pgo_laundry);
2036
2037 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2038 /*NOTREACHED*/
2039 }
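/*
 * Enqueue-side sketch of the paging_in_progress contract described in
 * the comment above memory_object_data_return(): whoever queues a page
 * onto q->pgo_pending must already hold a paging reference on its
 * object, which the iothread balances with vm_object_paging_end() once
 * the data has been handed to the pager.  The real enqueue path is
 * vm_pageout_cluster(), which is outside this excerpt; 'm' and 'q'
 * below stand for the page and its target queue, and the snippet is
 * illustrative only and is not compiled.
 */
#if 0
vm_object_t object = m->object;

vm_object_lock(object);
vm_object_paging_begin(object);	/* keeps the object and paging_offset stable */
vm_object_unlock(object);

vm_page_lock_queues();
/* ... set m->pageout_queue, add m to q->pgo_pending, bump q->pgo_laundry ... */
vm_page_unlock_queues();
#endif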
2040
2041
2042 static void
2043 vm_pageout_iothread_external(void)
2044 {
2045
2046 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2047 /*NOTREACHED*/
2048 }
2049
2050
2051 static void
2052 vm_pageout_iothread_internal(void)
2053 {
2054 thread_t self = current_thread();
2055
2056 self->options |= TH_OPT_VMPRIV;
2057
2058 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2059 /*NOTREACHED*/
2060 }
2061
2062 static void
2063 vm_pageout_garbage_collect(int collect)
2064 {
2065 if (collect) {
2066 stack_collect();
2067
2068 /*
2069 * consider_zone_gc should be last, because the other operations
2070 * might return memory to zones.
2071 */
2072 consider_machine_collect();
2073 consider_zone_gc();
2074
2075 consider_machine_adjust();
2076 }
2077
2078 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2079
2080 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2081 /*NOTREACHED*/
2082 }
2083
2084
2085
2086 void
2087 vm_pageout(void)
2088 {
2089 thread_t self = current_thread();
2090 thread_t thread;
2091 kern_return_t result;
2092 spl_t s;
2093
2094 /*
2095 * Set thread privileges.
2096 */
2097 s = splsched();
2098 thread_lock(self);
2099 self->priority = BASEPRI_PREEMPT - 1;
2100 set_sched_pri(self, self->priority);
2101 thread_unlock(self);
2102 splx(s);
2103
2104 /*
2105 * Initialize some paging parameters.
2106 */
2107
2108 if (vm_pageout_idle_wait == 0)
2109 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2110
2111 if (vm_pageout_burst_wait == 0)
2112 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2113
2114 if (vm_pageout_empty_wait == 0)
2115 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2116
2117 if (vm_pageout_deadlock_wait == 0)
2118 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2119
2120 if (vm_pageout_deadlock_relief == 0)
2121 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2122
2123 if (vm_pageout_inactive_relief == 0)
2124 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2125
2126 if (vm_pageout_burst_active_throttle == 0)
2127 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2128
2129 if (vm_pageout_burst_inactive_throttle == 0)
2130 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2131
2132 /*
2133 * Set kernel task to low backing store privileged
2134 * status
2135 */
2136 task_lock(kernel_task);
2137 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2138 task_unlock(kernel_task);
2139
2140 vm_page_free_count_init = vm_page_free_count;
2141 vm_zf_iterator = 0;
2142 /*
2143 * even if we've already called vm_page_free_reserve,
2144 * call it again here to ensure that the targets are
2145 * accurately calculated (it uses vm_page_free_count_init)
2146 * calling it with an arg of 0 will not change the reserve
2147 * but will re-calculate free_min and free_target
2148 */
2149 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2150 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2151 } else
2152 vm_page_free_reserve(0);
2153
2154
2155 queue_init(&vm_pageout_queue_external.pgo_pending);
2156 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2157 vm_pageout_queue_external.pgo_laundry = 0;
2158 vm_pageout_queue_external.pgo_idle = FALSE;
2159 vm_pageout_queue_external.pgo_busy = FALSE;
2160 vm_pageout_queue_external.pgo_throttled = FALSE;
2161
2162 queue_init(&vm_pageout_queue_internal.pgo_pending);
2163 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2164 vm_pageout_queue_internal.pgo_laundry = 0;
2165 vm_pageout_queue_internal.pgo_idle = FALSE;
2166 vm_pageout_queue_internal.pgo_busy = FALSE;
2167 vm_pageout_queue_internal.pgo_throttled = FALSE;
2168
2169
2170 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &thread);
2171 if (result != KERN_SUCCESS)
2172 panic("vm_pageout_iothread_internal: create failed");
2173
2174 thread_deallocate(thread);
2175
2176
2177 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_PREEMPT - 1, &thread);
2178 if (result != KERN_SUCCESS)
2179 panic("vm_pageout_iothread_external: create failed");
2180
2181 thread_deallocate(thread);
2182
2183
2184 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_PREEMPT - 2, &thread);
2185 if (result != KERN_SUCCESS)
2186 panic("vm_pageout_garbage_collect: create failed");
2187
2188 thread_deallocate(thread);
2189
2190 vm_object_reaper_init();
2191
2192 vm_pageout_continue();
2193 /*NOTREACHED*/
2194 }
2195
2196
2197 static upl_t
2198 upl_create(
2199 int flags,
2200 upl_size_t size)
2201 {
2202 upl_t upl;
2203 int page_field_size; /* bit field in word size buf */
2204
2205 page_field_size = 0;
2206 if (flags & UPL_CREATE_LITE) {
2207 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2208 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2209 }
2210 if(flags & UPL_CREATE_INTERNAL) {
2211 upl = (upl_t)kalloc(sizeof(struct upl)
2212 + (sizeof(struct upl_page_info)*(size/PAGE_SIZE))
2213 + page_field_size);
2214 } else {
2215 upl = (upl_t)kalloc(sizeof(struct upl) + page_field_size);
2216 }
2217 upl->flags = 0;
2218 upl->src_object = NULL;
2219 upl->kaddr = (vm_offset_t)0;
2220 upl->size = 0;
2221 upl->map_object = NULL;
2222 upl->ref_count = 1;
2223 upl->highest_page = 0;
2224 upl_lock_init(upl);
2225 #ifdef UPL_DEBUG
2226 upl->ubc_alias1 = 0;
2227 upl->ubc_alias2 = 0;
2228 #endif /* UPL_DEBUG */
2229 return(upl);
2230 }
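/*
 * Layout of the single kalloc() above for an INTERNAL | LITE upl; the
 * rest of this file recovers the trailing regions with the pointer
 * arithmetic shown below (see vm_object_upl_request() and
 * vm_map_enter_upl()).  Sketch only -- not compiled.
 *
 *	+------------------------+  <- (uintptr_t)upl
 *	| struct upl             |
 *	+------------------------+  <- user_page_list  (UPL_INTERNAL)
 *	| upl_page_info_t[n]     |     n == size / PAGE_SIZE
 *	+------------------------+  <- lite_list       (UPL_LITE)
 *	| 1 bit per page, padded |     to a 4-byte multiple
 *	+------------------------+
 */
#if 0
upl_page_info_t	*user_page_list;
wpl_array_t	lite_list;
int		pg_num = 0;	/* example page index within the upl */

user_page_list = (upl_page_info_t *)(((uintptr_t)upl) + sizeof(struct upl));
lite_list = (wpl_array_t)(((uintptr_t)user_page_list) +
		((size / PAGE_SIZE) * sizeof(upl_page_info_t)));

/* page pg_num is marked present in the lite bitmap with: */
lite_list[pg_num >> 5] |= 1 << (pg_num & 31);
#endif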
2231
2232 static void
2233 upl_destroy(
2234 upl_t upl)
2235 {
2236 int page_field_size; /* bit field in word size buf */
2237
2238 #ifdef UPL_DEBUG
2239 {
2240 upl_t upl_ele;
2241 vm_object_t object;
2242 if (upl->map_object->pageout) {
2243 object = upl->map_object->shadow;
2244 } else {
2245 object = upl->map_object;
2246 }
2247 vm_object_lock(object);
2248 queue_iterate(&object->uplq, upl_ele, upl_t, uplq) {
2249 if(upl_ele == upl) {
2250 queue_remove(&object->uplq,
2251 upl_ele, upl_t, uplq);
2252 break;
2253 }
2254 }
2255 vm_object_unlock(object);
2256 }
2257 #endif /* UPL_DEBUG */
2258 /* drop a reference on the map_object whether or */
2259 /* not a pageout object is inserted */
2260 if(upl->map_object->pageout)
2261 vm_object_deallocate(upl->map_object);
2262
2263 page_field_size = 0;
2264 if (upl->flags & UPL_LITE) {
2265 page_field_size = ((upl->size/PAGE_SIZE) + 7) >> 3;
2266 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2267 }
2268 if(upl->flags & UPL_INTERNAL) {
2269 kfree(upl,
2270 sizeof(struct upl) +
2271 (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE))
2272 + page_field_size);
2273 } else {
2274 kfree(upl, sizeof(struct upl) + page_field_size);
2275 }
2276 }
2277
2278 void uc_upl_dealloc(upl_t upl);
2279 __private_extern__ void
2280 uc_upl_dealloc(
2281 upl_t upl)
2282 {
2283 upl->ref_count -= 1;
2284 if(upl->ref_count == 0) {
2285 upl_destroy(upl);
2286 }
2287 }
2288
2289 void
2290 upl_deallocate(
2291 upl_t upl)
2292 {
2293
2294 upl->ref_count -= 1;
2295 if(upl->ref_count == 0) {
2296 upl_destroy(upl);
2297 }
2298 }
2299
2300 /*
2301 * Statistics about UPL enforcement of copy-on-write obligations.
2302 */
2303 unsigned long upl_cow = 0;
2304 unsigned long upl_cow_again = 0;
2305 unsigned long upl_cow_contiguous = 0;
2306 unsigned long upl_cow_pages = 0;
2307 unsigned long upl_cow_again_pages = 0;
2308 unsigned long upl_cow_contiguous_pages = 0;
2309
2310 /*
2311 * Routine: vm_object_upl_request
2312 * Purpose:
2313 * Cause the population of a portion of a vm_object.
2314 * Depending on the nature of the request, the pages
2315 * returned may contain valid data or be uninitialized.
2316 * A page list structure, listing the physical pages,
2317 * will be returned upon request.
2318 * This function is called by the file system or any other
2319 * supplier of backing store to a pager.
2320 * IMPORTANT NOTE: The caller must still respect the relationship
2321 * between the vm_object and its backing memory object. The
2322 * caller MUST NOT substitute changes in the backing file
2323 * without first doing a memory_object_lock_request on the
2324 * target range unless it is known that the pages are not
2325 * shared with another entity at the pager level.
2326 * Copy_in_to:
2327 * if a page list structure is present,
2328 * return the mapped physical pages; where a
2329 * page is not present, return a non-initialized
2330 * one. If the no_sync bit is turned on, don't
2331 * call the pager unlock to synchronize with other
2332 * possible copies of the page. Leave pages busy
2333 * in the original object, if a page list structure
2334 * was specified. When a commit of the page list
2335 * pages is done, the dirty bit will be set for each one.
2336 * Copy_out_from:
2337 * If a page list structure is present, return
2338 * all mapped pages. Where a page does not exist,
2339 * map a zero-filled one. Leave pages busy in
2340 * the original object. If a page list structure
2341 * is not specified, this call is a no-op.
2342 *
2343 * Note: access of default pager objects has a rather interesting
2344 * twist. The caller of this routine, presumably the file system
2345 * page cache handling code, will never actually make a request
2346 * against a default pager backed object. Only the default
2347 * pager will make requests on backing store related vm_objects.
2348 * In this way the default pager can maintain the relationship
2349 * between backing store files (abstract memory objects) and
2350 * the vm_objects (cache objects) they support.
2351 *
2352 */
2353
2354 __private_extern__ kern_return_t
2355 vm_object_upl_request(
2356 vm_object_t object,
2357 vm_object_offset_t offset,
2358 upl_size_t size,
2359 upl_t *upl_ptr,
2360 upl_page_info_array_t user_page_list,
2361 unsigned int *page_list_count,
2362 int cntrl_flags)
2363 {
2364 vm_page_t dst_page = VM_PAGE_NULL;
2365 vm_object_offset_t dst_offset = offset;
2366 upl_size_t xfer_size = size;
2367 boolean_t do_m_lock = FALSE;
2368 boolean_t dirty;
2369 boolean_t hw_dirty;
2370 upl_t upl = NULL;
2371 unsigned int entry;
2372 #if MACH_CLUSTER_STATS
2373 boolean_t encountered_lrp = FALSE;
2374 #endif
2375 vm_page_t alias_page = NULL;
2376 int page_ticket;
2377 int refmod_state;
2378 wpl_array_t lite_list = NULL;
2379 vm_object_t last_copy_object;
2380
2381
2382 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2383 /*
2384 * For forward compatibility's sake,
2385 * reject any unknown flag.
2386 */
2387 return KERN_INVALID_VALUE;
2388 }
2389
2390 page_ticket = (cntrl_flags & UPL_PAGE_TICKET_MASK)
2391 >> UPL_PAGE_TICKET_SHIFT;
2392
2393 if(((size/PAGE_SIZE) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
2394 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2395 }
2396
2397 if(cntrl_flags & UPL_SET_INTERNAL)
2398 if(page_list_count != NULL)
2399 *page_list_count = MAX_UPL_TRANSFER;
2400
2401 if((!object->internal) && (object->paging_offset != 0))
2402 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2403
2404 if((cntrl_flags & UPL_COPYOUT_FROM) && (upl_ptr == NULL)) {
2405 return KERN_SUCCESS;
2406 }
2407
2408 vm_object_lock(object);
2409 vm_object_paging_begin(object);
2410 vm_object_unlock(object);
2411
2412 if(upl_ptr) {
2413 if(cntrl_flags & UPL_SET_INTERNAL) {
2414 if(cntrl_flags & UPL_SET_LITE) {
2415 uintptr_t page_field_size;
2416 upl = upl_create(
2417 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
2418 size);
2419 user_page_list = (upl_page_info_t *)
2420 (((uintptr_t)upl) + sizeof(struct upl));
2421 lite_list = (wpl_array_t)
2422 (((uintptr_t)user_page_list) +
2423 ((size/PAGE_SIZE) *
2424 sizeof(upl_page_info_t)));
2425 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2426 page_field_size =
2427 (page_field_size + 3) & 0xFFFFFFFC;
2428 bzero((char *)lite_list, page_field_size);
2429 upl->flags =
2430 UPL_LITE | UPL_INTERNAL;
2431 } else {
2432 upl = upl_create(UPL_CREATE_INTERNAL, size);
2433 user_page_list = (upl_page_info_t *)
2434 (((uintptr_t)upl) + sizeof(struct upl));
2435 upl->flags = UPL_INTERNAL;
2436 }
2437 } else {
2438 if(cntrl_flags & UPL_SET_LITE) {
2439 uintptr_t page_field_size;
2440 upl = upl_create(UPL_CREATE_LITE, size);
2441 lite_list = (wpl_array_t)
2442 (((uintptr_t)upl) + sizeof(struct upl));
2443 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2444 page_field_size =
2445 (page_field_size + 3) & 0xFFFFFFFC;
2446 bzero((char *)lite_list, page_field_size);
2447 upl->flags = UPL_LITE;
2448 } else {
2449 upl = upl_create(UPL_CREATE_EXTERNAL, size);
2450 upl->flags = 0;
2451 }
2452 }
2453
2454 if (object->phys_contiguous) {
2455 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2456 object->copy != VM_OBJECT_NULL) {
2457 /* Honor copy-on-write obligations */
2458
2459 /*
2460 * XXX FBDP
2461 * We could still have a race...
2462 * A is here building the UPL for a write().
2463 * A pushes the pages to the current copy
2464 * object.
2465 * A returns the UPL to the caller.
2466 * B comes along and establishes another
2467 * private mapping on this object, inserting
2468 * a new copy object between the original
2469 * object and the old copy object.
2470 * B reads a page and gets the original contents
2471 * from the original object.
2472 * A modifies the page in the original object.
2473 * B reads the page again and sees A's changes,
2474 * which is wrong...
2475 *
2476 * The problem is that the pages are not
2477 * marked "busy" in the original object, so
2478 * nothing prevents B from reading it
2479 * before A's changes are completed.
2480 *
2481 * The "paging_in_progress" might protect us
2482 * from the insertion of a new copy object
2483 * though... To be verified.
2484 */
2485 vm_object_lock_request(object,
2486 offset,
2487 size,
2488 FALSE,
2489 MEMORY_OBJECT_COPY_SYNC,
2490 VM_PROT_NO_CHANGE);
2491 upl_cow_contiguous++;
2492 upl_cow_contiguous_pages += size >> PAGE_SHIFT;
2493 }
2494
2495 upl->map_object = object;
2496 /* don't need any shadow mappings for this one */
2497 /* since it is already I/O memory */
2498 upl->flags |= UPL_DEVICE_MEMORY;
2499
2500
2501 /* paging_in_progress protects paging_offset */
2502 upl->offset = offset + object->paging_offset;
2503 upl->size = size;
2504 *upl_ptr = upl;
2505 if(user_page_list) {
2506 user_page_list[0].phys_addr =
2507 (offset + object->shadow_offset)>>PAGE_SHIFT;
2508 user_page_list[0].device = TRUE;
2509 }
2510 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
2511
2512 if(page_list_count != NULL) {
2513 if (upl->flags & UPL_INTERNAL) {
2514 *page_list_count = 0;
2515 } else {
2516 *page_list_count = 1;
2517 }
2518 }
2519
2520 return KERN_SUCCESS;
2521 }
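/*
 * Note: the phys_contiguous path above returns before the per-page
 * loops below are ever reached -- a UPL_DEVICE_MEMORY upl describes at
 * most one page_list entry, its lite bitmap (if allocated) is never
 * filled in, and upl_commit_range() later treats it as a zero-length
 * transfer.
 */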
2522
2523 if(user_page_list)
2524 user_page_list[0].device = FALSE;
2525
2526 if(cntrl_flags & UPL_SET_LITE) {
2527 upl->map_object = object;
2528 } else {
2529 upl->map_object = vm_object_allocate(size);
2530 /*
2531 * No need to lock the new object: nobody else knows
2532 * about it yet, so it's all ours so far.
2533 */
2534 upl->map_object->shadow = object;
2535 upl->map_object->pageout = TRUE;
2536 upl->map_object->can_persist = FALSE;
2537 upl->map_object->copy_strategy =
2538 MEMORY_OBJECT_COPY_NONE;
2539 upl->map_object->shadow_offset = offset;
2540 upl->map_object->wimg_bits = object->wimg_bits;
2541 }
2542
2543 }
2544 if (!(cntrl_flags & UPL_SET_LITE)) {
2545 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2546 }
2547
2548 /*
2549 * ENCRYPTED SWAP:
2550 * Just mark the UPL as "encrypted" here.
2551 * We'll actually encrypt the pages later,
2552 * in upl_encrypt(), when the caller has
2553 * selected which pages need to go to swap.
2554 */
2555 if (cntrl_flags & UPL_ENCRYPT) {
2556 upl->flags |= UPL_ENCRYPTED;
2557 }
2558 if (cntrl_flags & UPL_FOR_PAGEOUT) {
2559 upl->flags |= UPL_PAGEOUT;
2560 }
2561 vm_object_lock(object);
2562
2563 /* we can lock in the paging_offset once paging_in_progress is set */
2564 if(upl_ptr) {
2565 upl->size = size;
2566 upl->offset = offset + object->paging_offset;
2567 *upl_ptr = upl;
2568 #ifdef UPL_DEBUG
2569 queue_enter(&object->uplq, upl, upl_t, uplq);
2570 #endif /* UPL_DEBUG */
2571 }
2572
2573 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2574 object->copy != VM_OBJECT_NULL) {
2575 /* Honor copy-on-write obligations */
2576
2577 /*
2578 * The caller is gathering these pages and
2579 * might modify their contents. We need to
2580 * make sure that the copy object has its own
2581 * private copies of these pages before we let
2582 * the caller modify them.
2583 */
2584 vm_object_update(object,
2585 offset,
2586 size,
2587 NULL,
2588 NULL,
2589 FALSE, /* should_return */
2590 MEMORY_OBJECT_COPY_SYNC,
2591 VM_PROT_NO_CHANGE);
2592 upl_cow++;
2593 upl_cow_pages += size >> PAGE_SHIFT;
2594
2595 }
2596 /* remember which copy object we synchronized with */
2597 last_copy_object = object->copy;
2598
2599 entry = 0;
2600 if(cntrl_flags & UPL_COPYOUT_FROM) {
2601 upl->flags |= UPL_PAGE_SYNC_DONE;
2602
2603 while (xfer_size) {
2604 if((alias_page == NULL) &&
2605 !(cntrl_flags & UPL_SET_LITE)) {
2606 vm_object_unlock(object);
2607 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2608 vm_object_lock(object);
2609 }
2610 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2611 dst_page->fictitious ||
2612 dst_page->absent ||
2613 dst_page->error ||
2614 (dst_page->wire_count && !dst_page->pageout) ||
2615
2616 ((!dst_page->inactive) && (cntrl_flags & UPL_FOR_PAGEOUT) &&
2617 (dst_page->page_ticket != page_ticket) &&
2618 ((dst_page->page_ticket+1) != page_ticket)) ) {
2619
2620 if (user_page_list)
2621 user_page_list[entry].phys_addr = 0;
2622 } else {
2623 /*
2624 * grab this up front...
2625 * a high percentage of the time we're going to
2626 * need the hardware modification state a bit later
2627 * anyway... so we can eliminate an extra call into
2628 * the pmap layer by grabbing it here and recording it
2629 */
2630 refmod_state = pmap_get_refmod(dst_page->phys_page);
2631
2632 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2633 /*
2634 * we're only asking for DIRTY pages to be returned
2635 */
2636
2637 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2638 /*
2639 * if we were the page stolen by vm_pageout_scan to be
2640 * cleaned (as opposed to a buddy being clustered in),
2641 * or this request is not being driven by a PAGEOUT cluster,
2642 * then we only need to check for the page being dirty or
2643 * precious to decide whether to return it
2644 */
2645 if (dst_page->dirty || dst_page->precious ||
2646 (refmod_state & VM_MEM_MODIFIED)) {
2647 goto check_busy;
2648 }
2649 }
2650 /*
2651 * this is a request for a PAGEOUT cluster and this page
2652 * is merely along for the ride as a 'buddy'... not only
2653 * does it have to be dirty to be returned, but it also
2654 * can't have been referenced recently... note that we've
2655 * already filtered above based on whether this page is
2656 * currently on the inactive queue or it meets the page
2657 * ticket (generation count) check
2658 */
2659 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2660 ((refmod_state & VM_MEM_MODIFIED) ||
2661 dst_page->dirty || dst_page->precious) ) {
2662 goto check_busy;
2663 }
2664 /*
2665 * if we reach here, we're not to return
2666 * the page... go on to the next one
2667 */
2668 if (user_page_list)
2669 user_page_list[entry].phys_addr = 0;
2670 entry++;
2671 dst_offset += PAGE_SIZE_64;
2672 xfer_size -= PAGE_SIZE;
2673 continue;
2674 }
2675 check_busy:
2676 if(dst_page->busy &&
2677 (!(dst_page->list_req_pending &&
2678 dst_page->pageout))) {
2679 if(cntrl_flags & UPL_NOBLOCK) {
2680 if(user_page_list) {
2681 user_page_list[entry].phys_addr = 0;
2682 }
2683 entry++;
2684 dst_offset += PAGE_SIZE_64;
2685 xfer_size -= PAGE_SIZE;
2686 continue;
2687 }
2688 /*
2689 * someone else is playing with the
2690 * page. We will have to wait.
2691 */
2692 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2693 continue;
2694 }
2695 /* Someone else already cleaning the page? */
2696 if((dst_page->cleaning || dst_page->absent ||
2697 dst_page->wire_count != 0) &&
2698 !dst_page->list_req_pending) {
2699 if(user_page_list) {
2700 user_page_list[entry].phys_addr = 0;
2701 }
2702 entry++;
2703 dst_offset += PAGE_SIZE_64;
2704 xfer_size -= PAGE_SIZE;
2705 continue;
2706 }
2707 /* eliminate all mappings from the */
2708 /* original object and its progeny */
2709
2710 vm_page_lock_queues();
2711
2712 if (dst_page->pageout_queue == TRUE)
2713 /*
2714 * we've buddied up a page for a clustered pageout
2715 * that has already been moved to the pageout
2716 * queue by pageout_scan... we need to remove
2717 * it from the queue and drop the laundry count
2718 * on that queue
2719 */
2720 vm_pageout_queue_steal(dst_page);
2721 #if MACH_CLUSTER_STATS
2722 /* pageout statistics gathering. count */
2723 /* all the pages we will page out that */
2724 /* were not counted in the initial */
2725 /* vm_pageout_scan work */
2726 if(dst_page->list_req_pending)
2727 encountered_lrp = TRUE;
2728 if((dst_page->dirty ||
2729 (dst_page->object->internal &&
2730 dst_page->precious)) &&
2731 (dst_page->list_req_pending
2732 == FALSE)) {
2733 if(encountered_lrp) {
2734 CLUSTER_STAT
2735 (pages_at_higher_offsets++;)
2736 } else {
2737 CLUSTER_STAT
2738 (pages_at_lower_offsets++;)
2739 }
2740 }
2741 #endif
2742 /* Turn off busy indication on pending */
2743 /* pageout. Note: we can only get here */
2744 /* in the request pending case. */
2745 dst_page->list_req_pending = FALSE;
2746 dst_page->busy = FALSE;
2747 dst_page->cleaning = FALSE;
2748
2749 hw_dirty = refmod_state & VM_MEM_MODIFIED;
2750 dirty = hw_dirty ? TRUE : dst_page->dirty;
2751
2752 if(cntrl_flags & UPL_SET_LITE) {
2753 int pg_num;
2754 pg_num = (dst_offset-offset)/PAGE_SIZE;
2755 lite_list[pg_num>>5] |=
2756 1 << (pg_num & 31);
2757 if (hw_dirty)
2758 pmap_clear_modify(dst_page->phys_page);
2759 /*
2760 * Record that this page has been
2761 * written out
2762 */
2763 #if MACH_PAGEMAP
2764 vm_external_state_set(
2765 object->existence_map,
2766 dst_page->offset);
2767 #endif /*MACH_PAGEMAP*/
2768
2769 /*
2770 * Mark original page as cleaning
2771 * in place.
2772 */
2773 dst_page->cleaning = TRUE;
2774 dst_page->dirty = TRUE;
2775 dst_page->precious = FALSE;
2776 } else {
2777 /* use pageclean setup, it is more */
2778 /* convenient even for the pageout */
2779 /* cases here */
2780
2781 vm_object_lock(upl->map_object);
2782 vm_pageclean_setup(dst_page,
2783 alias_page, upl->map_object,
2784 size - xfer_size);
2785 vm_object_unlock(upl->map_object);
2786
2787 alias_page->absent = FALSE;
2788 alias_page = NULL;
2789 }
2790
2791 if(!dirty) {
2792 dst_page->dirty = FALSE;
2793 dst_page->precious = TRUE;
2794 }
2795
2796 if(dst_page->pageout)
2797 dst_page->busy = TRUE;
2798
2799 if ( (cntrl_flags & UPL_ENCRYPT) ) {
2800 /*
2801 * ENCRYPTED SWAP:
2802 * We want to deny access to the target page
2803 * because its contents are about to be
2804 * encrypted and the user would be very
2805 * confused to see encrypted data instead
2806 * of their data.
2807 */
2808 dst_page->busy = TRUE;
2809 }
2810 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2811 /*
2812 * deny access to the target page
2813 * while it is being worked on
2814 */
2815 if ((!dst_page->pageout) &&
2816 (dst_page->wire_count == 0)) {
2817 dst_page->busy = TRUE;
2818 dst_page->pageout = TRUE;
2819 vm_page_wire(dst_page);
2820 }
2821 }
2822
2823 if (dst_page->phys_page > upl->highest_page)
2824 upl->highest_page = dst_page->phys_page;
2825
2826 if(user_page_list) {
2827 user_page_list[entry].phys_addr
2828 = dst_page->phys_page;
2829 user_page_list[entry].dirty =
2830 dst_page->dirty;
2831 user_page_list[entry].pageout =
2832 dst_page->pageout;
2833 user_page_list[entry].absent =
2834 dst_page->absent;
2835 user_page_list[entry].precious =
2836 dst_page->precious;
2837 }
2838 vm_page_unlock_queues();
2839
2840 /*
2841 * ENCRYPTED SWAP:
2842 * The caller is gathering this page and might
2843 * access its contents later on. Decrypt the
2844 * page before adding it to the UPL, so that
2845 * the caller never sees encrypted data.
2846 */
2847 if (! (cntrl_flags & UPL_ENCRYPT) &&
2848 dst_page->encrypted) {
2849 assert(dst_page->busy);
2850
2851 vm_page_decrypt(dst_page, 0);
2852 vm_page_decrypt_for_upl_counter++;
2853
2854 /*
2855 * Retry this page, since anything
2856 * could have changed while we were
2857 * decrypting.
2858 */
2859 continue;
2860 }
2861 }
2862 entry++;
2863 dst_offset += PAGE_SIZE_64;
2864 xfer_size -= PAGE_SIZE;
2865 }
2866 } else {
2867 while (xfer_size) {
2868 if((alias_page == NULL) &&
2869 !(cntrl_flags & UPL_SET_LITE)) {
2870 vm_object_unlock(object);
2871 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2872 vm_object_lock(object);
2873 }
2874
2875 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2876 object->copy != last_copy_object) {
2877 /* Honor copy-on-write obligations */
2878
2879 /*
2880 * The copy object has changed since we
2881 * last synchronized for copy-on-write.
2882 * Another copy object might have been
2883 * inserted while we released the object's
2884 * lock. Since someone could have seen the
2885 * original contents of the remaining pages
2886 * through that new object, we have to
2887 * synchronize with it again for the remaining
2888 * pages only. The previous pages are "busy"
2889 * so they can not be seen through the new
2890 * mapping. The new mapping will see our
2891 * upcoming changes for those previous pages,
2892 * but that's OK since they couldn't see what
2893 * was there before. It's just a race anyway
2894 * and there's no guarantee of consistency or
2895 * atomicity. We just don't want new mappings
2896 * to see both the *before* and *after* pages.
2897 */
2898 if (object->copy != VM_OBJECT_NULL) {
2899 vm_object_update(
2900 object,
2901 dst_offset,/* current offset */
2902 xfer_size, /* remaining size */
2903 NULL,
2904 NULL,
2905 FALSE, /* should_return */
2906 MEMORY_OBJECT_COPY_SYNC,
2907 VM_PROT_NO_CHANGE);
2908 upl_cow_again++;
2909 upl_cow_again_pages +=
2910 xfer_size >> PAGE_SHIFT;
2911 }
2912 /* remember the copy object we synced with */
2913 last_copy_object = object->copy;
2914 }
2915
2916 dst_page = vm_page_lookup(object, dst_offset);
2917
2918 if(dst_page != VM_PAGE_NULL) {
2919 if((cntrl_flags & UPL_RET_ONLY_ABSENT) &&
2920 !((dst_page->list_req_pending)
2921 && (dst_page->absent))) {
2922 /* we are doing extended range */
2923 /* requests. we want to grab */
2924 /* pages around some which are */
2925 /* already present. */
2926 if(user_page_list) {
2927 user_page_list[entry].phys_addr = 0;
2928 }
2929 entry++;
2930 dst_offset += PAGE_SIZE_64;
2931 xfer_size -= PAGE_SIZE;
2932 continue;
2933 }
2934 if((dst_page->cleaning) &&
2935 !(dst_page->list_req_pending)) {
2936 /*someone else is writing to the */
2937 /* page. We will have to wait. */
2938 PAGE_SLEEP(object,dst_page,THREAD_UNINT);
2939 continue;
2940 }
2941 if ((dst_page->fictitious &&
2942 dst_page->list_req_pending)) {
2943 /* dump the fictitious page */
2944 dst_page->list_req_pending = FALSE;
2945 dst_page->clustered = FALSE;
2946
2947 vm_page_lock_queues();
2948 vm_page_free(dst_page);
2949 vm_page_unlock_queues();
2950
2951 dst_page = NULL;
2952 } else if ((dst_page->absent &&
2953 dst_page->list_req_pending)) {
2954 /* the default_pager case */
2955 dst_page->list_req_pending = FALSE;
2956 dst_page->busy = FALSE;
2957 }
2958 }
2959 if(dst_page == VM_PAGE_NULL) {
2960 if(object->private) {
2961 /*
2962 * This is a nasty wrinkle for users
2963 * of upl who encounter device or
2964 * private memory however, it is
2965 * unavoidable, only a fault can
2966 * reslove the actual backing
2967 * physical page by asking the
2968 * backing device.
2969 */
2970 if(user_page_list) {
2971 user_page_list[entry].phys_addr = 0;
2972 }
2973 entry++;
2974 dst_offset += PAGE_SIZE_64;
2975 xfer_size -= PAGE_SIZE;
2976 continue;
2977 }
2978 /* need to allocate a page */
2979 dst_page = vm_page_alloc(object, dst_offset);
2980 if (dst_page == VM_PAGE_NULL) {
2981 vm_object_unlock(object);
2982 VM_PAGE_WAIT();
2983 vm_object_lock(object);
2984 continue;
2985 }
2986 dst_page->busy = FALSE;
2987 #if 0
2988 if(cntrl_flags & UPL_NO_SYNC) {
2989 dst_page->page_lock = 0;
2990 dst_page->unlock_request = 0;
2991 }
2992 #endif
2993 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
2994 /*
2995 * if UPL_RET_ONLY_ABSENT was specified,
2996 * then we're definitely setting up a
2997 * upl for a clustered read/pagein
2998 * operation... mark the pages as clustered
2999 * so vm_fault can correctly attribute them
3000 * to the 'pagein' bucket the first time
3001 * a fault happens on them
3002 */
3003 dst_page->clustered = TRUE;
3004 }
3005 dst_page->absent = TRUE;
3006 object->absent_count++;
3007 }
3008 #if 1
3009 if(cntrl_flags & UPL_NO_SYNC) {
3010 dst_page->page_lock = 0;
3011 dst_page->unlock_request = 0;
3012 }
3013 #endif /* 1 */
3014
3015 /*
3016 * ENCRYPTED SWAP:
3017 */
3018 if (cntrl_flags & UPL_ENCRYPT) {
3019 /*
3020 * The page is going to be encrypted when we
3021 * get it from the pager, so mark it so.
3022 */
3023 dst_page->encrypted = TRUE;
3024 } else {
3025 /*
3026 * Otherwise, the page will not contain
3027 * encrypted data.
3028 */
3029 dst_page->encrypted = FALSE;
3030 }
3031
3032 dst_page->overwriting = TRUE;
3033 if(dst_page->fictitious) {
3034 panic("need corner case for fictitious page");
3035 }
3036 if(dst_page->page_lock) {
3037 do_m_lock = TRUE;
3038 }
3039 if(upl_ptr) {
3040
3041 /* eliminate all mappings from the */
3042 /* original object and its progeny */
3043
3044 if(dst_page->busy) {
3045 /*someone else is playing with the */
3046 /* page. We will have to wait. */
3047 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3048 continue;
3049 }
3050 vm_page_lock_queues();
3051
3052 if( !(cntrl_flags & UPL_FILE_IO))
3053 hw_dirty = pmap_disconnect(dst_page->phys_page) & VM_MEM_MODIFIED;
3054 else
3055 hw_dirty = pmap_get_refmod(dst_page->phys_page) & VM_MEM_MODIFIED;
3056 dirty = hw_dirty ? TRUE : dst_page->dirty;
3057
3058 if(cntrl_flags & UPL_SET_LITE) {
3059 int pg_num;
3060 pg_num = (dst_offset-offset)/PAGE_SIZE;
3061 lite_list[pg_num>>5] |=
3062 1 << (pg_num & 31);
3063 if (hw_dirty)
3064 pmap_clear_modify(dst_page->phys_page);
3065 /*
3066 * Record that this page has been
3067 * written out
3068 */
3069 #if MACH_PAGEMAP
3070 vm_external_state_set(
3071 object->existence_map,
3072 dst_page->offset);
3073 #endif /*MACH_PAGEMAP*/
3074
3075 /*
3076 * Mark original page as cleaning
3077 * in place.
3078 */
3079 dst_page->cleaning = TRUE;
3080 dst_page->dirty = TRUE;
3081 dst_page->precious = FALSE;
3082 } else {
3083 /* use pageclean setup, it is more */
3084 /* convenient even for the pageout */
3085 /* cases here */
3086 vm_object_lock(upl->map_object);
3087 vm_pageclean_setup(dst_page,
3088 alias_page, upl->map_object,
3089 size - xfer_size);
3090 vm_object_unlock(upl->map_object);
3091
3092 alias_page->absent = FALSE;
3093 alias_page = NULL;
3094 }
3095
3096 if(cntrl_flags & UPL_CLEAN_IN_PLACE) {
3097 /* clean in place for read implies */
3098 /* that a write will be done on all */
3099 /* the pages that are dirty before */
3100 /* a upl commit is done. The caller */
3101 /* is obligated to preserve the */
3102 /* contents of all pages marked */
3103 /* dirty. */
3104 upl->flags |= UPL_CLEAR_DIRTY;
3105 }
3106
3107 if(!dirty) {
3108 dst_page->dirty = FALSE;
3109 dst_page->precious = TRUE;
3110 }
3111
3112 if (dst_page->wire_count == 0) {
3113 /* deny access to the target page while */
3114 /* it is being worked on */
3115 dst_page->busy = TRUE;
3116 } else {
3117 vm_page_wire(dst_page);
3118 }
3119 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
3120 /*
3121 * expect the page not to be used
3122 * since it's coming in as part
3123 * of a cluster and could be
3124 * speculative... pages that
3125 * are 'consumed' will get a
3126 * hardware reference
3127 */
3128 dst_page->reference = FALSE;
3129 } else {
3130 /*
3131 * expect the page to be used
3132 */
3133 dst_page->reference = TRUE;
3134 }
3135 dst_page->precious =
3136 (cntrl_flags & UPL_PRECIOUS)
3137 ? TRUE : FALSE;
3138
3139 if (dst_page->phys_page > upl->highest_page)
3140 upl->highest_page = dst_page->phys_page;
3141
3142 if(user_page_list) {
3143 user_page_list[entry].phys_addr
3144 = dst_page->phys_page;
3145 user_page_list[entry].dirty =
3146 dst_page->dirty;
3147 user_page_list[entry].pageout =
3148 dst_page->pageout;
3149 user_page_list[entry].absent =
3150 dst_page->absent;
3151 user_page_list[entry].precious =
3152 dst_page->precious;
3153 }
3154 vm_page_unlock_queues();
3155 }
3156 entry++;
3157 dst_offset += PAGE_SIZE_64;
3158 xfer_size -= PAGE_SIZE;
3159 }
3160 }
3161
3162 if (upl->flags & UPL_INTERNAL) {
3163 if(page_list_count != NULL)
3164 *page_list_count = 0;
3165 } else if ((page_list_count != NULL) &&
3166 (*page_list_count > entry)) {
3167 *page_list_count = entry;
3168 }
3169
3170 if(alias_page != NULL) {
3171 vm_page_lock_queues();
3172 vm_page_free(alias_page);
3173 vm_page_unlock_queues();
3174 }
3175
3176 if(do_m_lock) {
3177 vm_prot_t access_required;
3178 /* call back all associated pages from other users of the pager */
3179 /* all future updates will be on data which is based on the */
3180 /* changes we are going to make here. Note: it is assumed that */
3181 /* we already hold copies of the data so we will not be seeing */
3182 /* an avalanche of incoming data from the pager */
3183 access_required = (cntrl_flags & UPL_COPYOUT_FROM)
3184 ? VM_PROT_READ : VM_PROT_WRITE;
3185 while (TRUE) {
3186 kern_return_t rc;
3187
3188 if(!object->pager_ready) {
3189 wait_result_t wait_result;
3190
3191 wait_result = vm_object_sleep(object,
3192 VM_OBJECT_EVENT_PAGER_READY,
3193 THREAD_UNINT);
3194 if (wait_result != THREAD_AWAKENED) {
3195 vm_object_unlock(object);
3196 return KERN_FAILURE;
3197 }
3198 continue;
3199 }
3200
3201 vm_object_unlock(object);
3202 rc = memory_object_data_unlock(
3203 object->pager,
3204 dst_offset + object->paging_offset,
3205 size,
3206 access_required);
3207 if (rc != KERN_SUCCESS && rc != MACH_SEND_INTERRUPTED)
3208 return KERN_FAILURE;
3209 vm_object_lock(object);
3210
3211 if (rc == KERN_SUCCESS)
3212 break;
3213 }
3214
3215 /* let's wait on the last page requested */
3216 /* NOTE: we will have to update lock completed routine to signal */
3217 if(dst_page != VM_PAGE_NULL &&
3218 (access_required & dst_page->page_lock) != access_required) {
3219 PAGE_ASSERT_WAIT(dst_page, THREAD_UNINT);
3220 vm_object_unlock(object);
3221 thread_block(THREAD_CONTINUE_NULL);
3222 return KERN_SUCCESS;
3223 }
3224 }
3225
3226 vm_object_unlock(object);
3227 return KERN_SUCCESS;
3228 }
3229
3230 /* JMM - Backward compatibility for now */
3231 kern_return_t
3232 vm_fault_list_request( /* forward */
3233 memory_object_control_t control,
3234 vm_object_offset_t offset,
3235 upl_size_t size,
3236 upl_t *upl_ptr,
3237 upl_page_info_t **user_page_list_ptr,
3238 int page_list_count,
3239 int cntrl_flags);
3240 kern_return_t
3241 vm_fault_list_request(
3242 memory_object_control_t control,
3243 vm_object_offset_t offset,
3244 upl_size_t size,
3245 upl_t *upl_ptr,
3246 upl_page_info_t **user_page_list_ptr,
3247 int page_list_count,
3248 int cntrl_flags)
3249 {
3250 unsigned int local_list_count;
3251 upl_page_info_t *user_page_list;
3252 kern_return_t kr;
3253
3254 if (user_page_list_ptr != NULL) {
3255 local_list_count = page_list_count;
3256 user_page_list = *user_page_list_ptr;
3257 } else {
3258 local_list_count = 0;
3259 user_page_list = NULL;
3260 }
3261 kr = memory_object_upl_request(control,
3262 offset,
3263 size,
3264 upl_ptr,
3265 user_page_list,
3266 &local_list_count,
3267 cntrl_flags);
3268
3269 if(kr != KERN_SUCCESS)
3270 return kr;
3271
3272 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3273 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3274 }
3275
3276 return KERN_SUCCESS;
3277 }
3278
3279
3280
3281 /*
3282 * Routine: vm_object_super_upl_request
3283 * Purpose:
3284 * Cause the population of a portion of a vm_object
3285 * in much the same way as memory_object_upl_request.
3286 * Depending on the nature of the request, the pages
3287 * returned may contain valid data or be uninitialized.
3288 * However, the region may be expanded up to the super
3289 * cluster size provided.
3290 */
3291
3292 __private_extern__ kern_return_t
3293 vm_object_super_upl_request(
3294 vm_object_t object,
3295 vm_object_offset_t offset,
3296 upl_size_t size,
3297 upl_size_t super_cluster,
3298 upl_t *upl,
3299 upl_page_info_t *user_page_list,
3300 unsigned int *page_list_count,
3301 int cntrl_flags)
3302 {
3303 vm_page_t target_page;
3304 int ticket;
3305
3306
3307 if(object->paging_offset > offset)
3308 return KERN_FAILURE;
3309
3310 assert(object->paging_in_progress);
3311 offset = offset - object->paging_offset;
3312
3313 if(cntrl_flags & UPL_FOR_PAGEOUT) {
3314
3315 vm_object_lock(object);
3316
3317 if((target_page = vm_page_lookup(object, offset))
3318 != VM_PAGE_NULL) {
3319 ticket = target_page->page_ticket;
3320 cntrl_flags = cntrl_flags & ~(int)UPL_PAGE_TICKET_MASK;
3321 cntrl_flags = cntrl_flags |
3322 ((ticket << UPL_PAGE_TICKET_SHIFT)
3323 & UPL_PAGE_TICKET_MASK);
3324 }
3325 vm_object_unlock(object);
3326 }
3327
3328 if (super_cluster > size) {
3329
3330 vm_object_offset_t base_offset;
3331 upl_size_t super_size;
3332
3333 base_offset = (offset &
3334 ~((vm_object_offset_t) super_cluster - 1));
3335 super_size = (offset+size) > (base_offset + super_cluster) ?
3336 super_cluster<<1 : super_cluster;
3337 super_size = ((base_offset + super_size) > object->size) ?
3338 (object->size - base_offset) : super_size;
3339 if(offset > (base_offset + super_size))
3340 panic("vm_object_super_upl_request: Missed target pageout"
3341 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3342 offset, base_offset, super_size, super_cluster,
3343 size, object->paging_offset);
3344 /*
3345 * apparently there is a case where the vm requests a
3346 * page to be written out whose offset is beyond the
3347 * object size
3348 */
3349 if((offset + size) > (base_offset + super_size))
3350 super_size = (offset + size) - base_offset;
3351
3352 offset = base_offset;
3353 size = super_size;
3354 }
3355 return vm_object_upl_request(object, offset, size,
3356 upl, user_page_list, page_list_count,
3357 cntrl_flags);
3358 }
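/*
 * Worked example of the super-cluster expansion above (illustrative
 * numbers only; assumes the object is large enough that the
 * object->size clamp does not apply):
 *
 *	super_cluster = 0x10000, size = 0x1000, offset = 0x1f000
 *	  base_offset = 0x1f000 & ~0xffff          = 0x10000
 *	  offset+size = 0x20000, not > 0x20000     -> super_size = 0x10000
 *	  request becomes offset 0x10000, size 0x10000
 *
 *	same, but size = 0x2000
 *	  offset+size = 0x21000 > 0x20000          -> super_size = 0x20000
 *	  request becomes offset 0x10000, size 0x20000
 */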
3359
3360
3361 kern_return_t
3362 vm_map_create_upl(
3363 vm_map_t map,
3364 vm_map_address_t offset,
3365 upl_size_t *upl_size,
3366 upl_t *upl,
3367 upl_page_info_array_t page_list,
3368 unsigned int *count,
3369 int *flags)
3370 {
3371 vm_map_entry_t entry;
3372 int caller_flags;
3373 int force_data_sync;
3374 int sync_cow_data;
3375 vm_object_t local_object;
3376 vm_map_offset_t local_offset;
3377 vm_map_offset_t local_start;
3378 kern_return_t ret;
3379
3380 caller_flags = *flags;
3381
3382 if (caller_flags & ~UPL_VALID_FLAGS) {
3383 /*
3384 * For forward compatibility's sake,
3385 * reject any unknown flag.
3386 */
3387 return KERN_INVALID_VALUE;
3388 }
3389
3390 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3391 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3392
3393 if(upl == NULL)
3394 return KERN_INVALID_ARGUMENT;
3395
3396
3397 REDISCOVER_ENTRY:
3398 vm_map_lock(map);
3399 if (vm_map_lookup_entry(map, offset, &entry)) {
3400 if (entry->object.vm_object == VM_OBJECT_NULL ||
3401 !entry->object.vm_object->phys_contiguous) {
3402 if((*upl_size/page_size) > MAX_UPL_TRANSFER) {
3403 *upl_size = MAX_UPL_TRANSFER * page_size;
3404 }
3405 }
3406 if((entry->vme_end - offset) < *upl_size) {
3407 *upl_size = entry->vme_end - offset;
3408 }
3409 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3410 if (entry->object.vm_object == VM_OBJECT_NULL) {
3411 *flags = 0;
3412 } else if (entry->object.vm_object->private) {
3413 *flags = UPL_DEV_MEMORY;
3414 if (entry->object.vm_object->phys_contiguous) {
3415 *flags |= UPL_PHYS_CONTIG;
3416 }
3417 } else {
3418 *flags = 0;
3419 }
3420 vm_map_unlock(map);
3421 return KERN_SUCCESS;
3422 }
3423 /*
3424 * Create an object if necessary.
3425 */
3426 if (entry->object.vm_object == VM_OBJECT_NULL) {
3427 entry->object.vm_object = vm_object_allocate(
3428 (vm_size_t)(entry->vme_end - entry->vme_start));
3429 entry->offset = 0;
3430 }
3431 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3432 if (!(entry->protection & VM_PROT_WRITE)) {
3433 vm_map_unlock(map);
3434 return KERN_PROTECTION_FAILURE;
3435 }
3436 if (entry->needs_copy) {
3437 vm_map_t local_map;
3438 vm_object_t object;
3439 vm_map_offset_t offset_hi;
3440 vm_map_offset_t offset_lo;
3441 vm_object_offset_t new_offset;
3442 vm_prot_t prot;
3443 boolean_t wired;
3444 vm_behavior_t behavior;
3445 vm_map_version_t version;
3446 vm_map_t real_map;
3447
3448 local_map = map;
3449 vm_map_lock_write_to_read(map);
3450 if(vm_map_lookup_locked(&local_map,
3451 offset, VM_PROT_WRITE,
3452 &version, &object,
3453 &new_offset, &prot, &wired,
3454 &behavior, &offset_lo,
3455 &offset_hi, &real_map)) {
3456 vm_map_unlock(local_map);
3457 return KERN_FAILURE;
3458 }
3459 if (real_map != map) {
3460 vm_map_unlock(real_map);
3461 }
3462 vm_object_unlock(object);
3463 vm_map_unlock(local_map);
3464
3465 goto REDISCOVER_ENTRY;
3466 }
3467 }
3468 if (entry->is_sub_map) {
3469 vm_map_t submap;
3470
3471 submap = entry->object.sub_map;
3472 local_start = entry->vme_start;
3473 local_offset = entry->offset;
3474 vm_map_reference(submap);
3475 vm_map_unlock(map);
3476
3477 ret = (vm_map_create_upl(submap,
3478 local_offset + (offset - local_start),
3479 upl_size, upl, page_list, count,
3480 flags));
3481
3482 vm_map_deallocate(submap);
3483 return ret;
3484 }
3485
3486 if (sync_cow_data) {
3487 if (entry->object.vm_object->shadow
3488 || entry->object.vm_object->copy) {
3489
3490 local_object = entry->object.vm_object;
3491 local_start = entry->vme_start;
3492 local_offset = entry->offset;
3493 vm_object_reference(local_object);
3494 vm_map_unlock(map);
3495
3496 if (entry->object.vm_object->shadow &&
3497 entry->object.vm_object->copy) {
3498 vm_object_lock_request(
3499 local_object->shadow,
3500 (vm_object_offset_t)
3501 ((offset - local_start) +
3502 local_offset) +
3503 local_object->shadow_offset,
3504 *upl_size, FALSE,
3505 MEMORY_OBJECT_DATA_SYNC,
3506 VM_PROT_NO_CHANGE);
3507 }
3508 sync_cow_data = FALSE;
3509 vm_object_deallocate(local_object);
3510 goto REDISCOVER_ENTRY;
3511 }
3512 }
3513
3514 if (force_data_sync) {
3515
3516 local_object = entry->object.vm_object;
3517 local_start = entry->vme_start;
3518 local_offset = entry->offset;
3519 vm_object_reference(local_object);
3520 vm_map_unlock(map);
3521
3522 vm_object_lock_request(
3523 local_object,
3524 (vm_object_offset_t)
3525 ((offset - local_start) + local_offset),
3526 (vm_object_size_t)*upl_size, FALSE,
3527 MEMORY_OBJECT_DATA_SYNC,
3528 VM_PROT_NO_CHANGE);
3529 force_data_sync = FALSE;
3530 vm_object_deallocate(local_object);
3531 goto REDISCOVER_ENTRY;
3532 }
3533
3534 if(!(entry->object.vm_object->private)) {
3535 if(*upl_size > (MAX_UPL_TRANSFER*PAGE_SIZE))
3536 *upl_size = (MAX_UPL_TRANSFER*PAGE_SIZE);
3537 if(entry->object.vm_object->phys_contiguous) {
3538 *flags = UPL_PHYS_CONTIG;
3539 } else {
3540 *flags = 0;
3541 }
3542 } else {
3543 *flags = UPL_DEV_MEMORY | UPL_PHYS_CONTIG;
3544 }
3545 local_object = entry->object.vm_object;
3546 local_offset = entry->offset;
3547 local_start = entry->vme_start;
3548 vm_object_reference(local_object);
3549 vm_map_unlock(map);
3550 if(caller_flags & UPL_SET_IO_WIRE) {
3551 ret = (vm_object_iopl_request(local_object,
3552 (vm_object_offset_t)
3553 ((offset - local_start)
3554 + local_offset),
3555 *upl_size,
3556 upl,
3557 page_list,
3558 count,
3559 caller_flags));
3560 } else {
3561 ret = (vm_object_upl_request(local_object,
3562 (vm_object_offset_t)
3563 ((offset - local_start)
3564 + local_offset),
3565 *upl_size,
3566 upl,
3567 page_list,
3568 count,
3569 caller_flags));
3570 }
3571 vm_object_deallocate(local_object);
3572 return(ret);
3573 }
3574
3575 vm_map_unlock(map);
3576 return(KERN_FAILURE);
3577
3578 }
3579
3580 /*
3581 * Internal routine to enter a UPL into a VM map.
3582 *
3583 * JMM - This should just be doable through the standard
3584 * vm_map_enter() API.
3585 */
3586 kern_return_t
3587 vm_map_enter_upl(
3588 vm_map_t map,
3589 upl_t upl,
3590 vm_map_offset_t *dst_addr)
3591 {
3592 vm_map_size_t size;
3593 vm_object_offset_t offset;
3594 vm_map_offset_t addr;
3595 vm_page_t m;
3596 kern_return_t kr;
3597
3598 if (upl == UPL_NULL)
3599 return KERN_INVALID_ARGUMENT;
3600
3601 upl_lock(upl);
3602
3603 /* check to see if already mapped */
3604 if(UPL_PAGE_LIST_MAPPED & upl->flags) {
3605 upl_unlock(upl);
3606 return KERN_FAILURE;
3607 }
3608
3609 if((!(upl->map_object->pageout)) &&
3610 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3611 (upl->map_object->phys_contiguous))) {
3612 vm_object_t object;
3613 vm_page_t alias_page;
3614 vm_object_offset_t new_offset;
3615 int pg_num;
3616 wpl_array_t lite_list;
3617
3618 if(upl->flags & UPL_INTERNAL) {
3619 lite_list = (wpl_array_t)
3620 ((((uintptr_t)upl) + sizeof(struct upl))
3621 + ((upl->size/PAGE_SIZE)
3622 * sizeof(upl_page_info_t)));
3623 } else {
3624 lite_list = (wpl_array_t)
3625 (((uintptr_t)upl) + sizeof(struct upl));
3626 }
3627 object = upl->map_object;
3628 upl->map_object = vm_object_allocate(upl->size);
3629 vm_object_lock(upl->map_object);
3630 upl->map_object->shadow = object;
3631 upl->map_object->pageout = TRUE;
3632 upl->map_object->can_persist = FALSE;
3633 upl->map_object->copy_strategy =
3634 MEMORY_OBJECT_COPY_NONE;
3635 upl->map_object->shadow_offset =
3636 upl->offset - object->paging_offset;
3637 upl->map_object->wimg_bits = object->wimg_bits;
3638 offset = upl->map_object->shadow_offset;
3639 new_offset = 0;
3640 size = upl->size;
3641
3642 vm_object_lock(object);
3643
3644 while(size) {
3645 pg_num = (new_offset)/PAGE_SIZE;
3646 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3647 vm_object_unlock(object);
3648 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3649 vm_object_lock(object);
3650 m = vm_page_lookup(object, offset);
3651 if (m == VM_PAGE_NULL) {
3652 panic("vm_upl_map: page missing\n");
3653 }
3654
3655 vm_object_paging_begin(object);
3656
3657 /*
3658 * Convert the fictitious page to a private
3659 * shadow of the real page.
3660 */
3661 assert(alias_page->fictitious);
3662 alias_page->fictitious = FALSE;
3663 alias_page->private = TRUE;
3664 alias_page->pageout = TRUE;
3665 alias_page->phys_page = m->phys_page;
3666
3667 vm_page_lock_queues();
3668 vm_page_wire(alias_page);
3669 vm_page_unlock_queues();
3670
3671 /*
3672 * ENCRYPTED SWAP:
3673 * The virtual page ("m") has to be wired in some way
3674 * here or its physical page ("m->phys_page") could
3675 * be recycled at any time.
3676 * Assuming this is enforced by the caller, we can't
3677 * get an encrypted page here. Since the encryption
3678 * key depends on the VM page's "pager" object and
3679 * the "paging_offset", we couldn't handle 2 pageable
3680 * VM pages (with different pagers and paging_offsets)
3681 * sharing the same physical page: we could end up
3682 * encrypting with one key (via one VM page) and
3683 * decrypting with another key (via the alias VM page).
3684 */
3685 ASSERT_PAGE_DECRYPTED(m);
3686
3687 vm_page_insert(alias_page,
3688 upl->map_object, new_offset);
3689 assert(!alias_page->wanted);
3690 alias_page->busy = FALSE;
3691 alias_page->absent = FALSE;
3692 }
3693
3694 size -= PAGE_SIZE;
3695 offset += PAGE_SIZE_64;
3696 new_offset += PAGE_SIZE_64;
3697 }
3698 vm_object_unlock(object);
3699 vm_object_unlock(upl->map_object);
3700 }
3701 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3702 offset = upl->offset - upl->map_object->paging_offset;
3703 else
3704 offset = 0;
3705
3706 size = upl->size;
3707
3708 vm_object_lock(upl->map_object);
3709 upl->map_object->ref_count++;
3710 vm_object_res_reference(upl->map_object);
3711 vm_object_unlock(upl->map_object);
3712
3713 *dst_addr = 0;
3714
3715
3716 /* NEED A UPL_MAP ALIAS */
3717 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3718 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3719 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3720
3721 if (kr != KERN_SUCCESS) {
3722 upl_unlock(upl);
3723 return(kr);
3724 }
3725
3726 vm_object_lock(upl->map_object);
3727
3728 for(addr=*dst_addr; size > 0; size-=PAGE_SIZE,addr+=PAGE_SIZE) {
3729 m = vm_page_lookup(upl->map_object, offset);
3730 if(m) {
3731 unsigned int cache_attr;
3732 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3733
3734 PMAP_ENTER(map->pmap, addr,
3735 m, VM_PROT_ALL,
3736 cache_attr, TRUE);
3737 }
3738 offset+=PAGE_SIZE_64;
3739 }
3740 vm_object_unlock(upl->map_object);
3741
3742 upl->ref_count++; /* hold a reference for the mapping */
3743 upl->flags |= UPL_PAGE_LIST_MAPPED;
3744 upl->kaddr = *dst_addr;
3745 upl_unlock(upl);
3746 return KERN_SUCCESS;
3747 }
3748
3749 /*
3750 * Internal routine to remove a UPL mapping from a VM map.
3751 *
3752 * XXX - This should just be doable through a standard
3753 * vm_map_remove() operation. Otherwise, implicit clean-up
3754 * of the target map won't be able to correctly remove
3755 * these (and release the reference on the UPL). Having
3756 * to do this means we can't map these into user-space
3757 * maps yet.
3758 */
3759 kern_return_t
3760 vm_map_remove_upl(
3761 vm_map_t map,
3762 upl_t upl)
3763 {
3764 vm_address_t addr;
3765 upl_size_t size;
3766
3767 if (upl == UPL_NULL)
3768 return KERN_INVALID_ARGUMENT;
3769
3770 upl_lock(upl);
3771 if(upl->flags & UPL_PAGE_LIST_MAPPED) {
3772 addr = upl->kaddr;
3773 size = upl->size;
3774 assert(upl->ref_count > 1);
3775 upl->ref_count--; /* removing mapping ref */
3776 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3777 upl->kaddr = (vm_offset_t) 0;
3778 upl_unlock(upl);
3779
3780 vm_map_remove( map,
3781 vm_map_trunc_page(addr),
3782 vm_map_round_page(addr + size),
3783 VM_MAP_NO_FLAGS);
3784 return KERN_SUCCESS;
3785 }
3786 upl_unlock(upl);
3787 return KERN_FAILURE;
3788 }
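/*
 * Hedged usage sketch of the UPL lifecycle implemented by the routines
 * above and by upl_commit_range()/upl_deallocate() below: create a UPL
 * over part of a map, optionally window it into the kernel map, then
 * commit the pages and drop the creation reference.  Error handling,
 * the UPL_SET_* options and the commit flags all depend on the caller
 * and are elided; 'map' and 'user_addr' are hypothetical.  Not compiled.
 */
#if 0
upl_t			upl;
upl_size_t		upl_size = 8 * PAGE_SIZE;
upl_page_info_t		page_list[8];
unsigned int		count = 8;
int			flags = UPL_COPYOUT_FROM;	/* plus UPL_SET_* as needed */
vm_map_offset_t		kaddr;
boolean_t		empty;

if (vm_map_create_upl(map, user_addr, &upl_size, &upl,
		      page_list, &count, &flags) != KERN_SUCCESS)
	return;

if (vm_map_enter_upl(kernel_map, upl, &kaddr) == KERN_SUCCESS) {
	/* ... access the pages through kaddr ... */
	vm_map_remove_upl(kernel_map, upl);	/* drops the mapping reference */
}
upl_commit_range(upl, 0, upl_size, 0, page_list, count, &empty);
upl_deallocate(upl);				/* drops the upl_create() reference */
#endif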
3789
3790 kern_return_t
3791 upl_commit_range(
3792 upl_t upl,
3793 upl_offset_t offset,
3794 upl_size_t size,
3795 int flags,
3796 upl_page_info_t *page_list,
3797 mach_msg_type_number_t count,
3798 boolean_t *empty)
3799 {
3800 upl_size_t xfer_size = size;
3801 vm_object_t shadow_object;
3802 vm_object_t object = upl->map_object;
3803 vm_object_offset_t target_offset;
3804 int entry;
3805 wpl_array_t lite_list;
3806 int occupied;
3807 int delayed_unlock = 0;
3808 int clear_refmod = 0;
3809 boolean_t shadow_internal;
3810
3811 *empty = FALSE;
3812
3813 if (upl == UPL_NULL)
3814 return KERN_INVALID_ARGUMENT;
3815
3816
3817 if (count == 0)
3818 page_list = NULL;
3819
3820 if (object->pageout) {
3821 shadow_object = object->shadow;
3822 } else {
3823 shadow_object = object;
3824 }
3825
3826 upl_lock(upl);
3827
3828 if (upl->flags & UPL_ACCESS_BLOCKED) {
3829 /*
3830 * We used this UPL to block access to the pages by marking
3831 * them "busy". Now we need to clear the "busy" bit to allow
3832 * access to these pages again.
3833 */
3834 flags |= UPL_COMMIT_ALLOW_ACCESS;
3835 }
3836
3837 if (upl->flags & UPL_CLEAR_DIRTY)
3838 flags |= UPL_COMMIT_CLEAR_DIRTY;
3839
3840 if (upl->flags & UPL_DEVICE_MEMORY) {
3841 xfer_size = 0;
3842 } else if ((offset + size) > upl->size) {
3843 upl_unlock(upl);
3844 return KERN_FAILURE;
3845 }
3846
3847 if (upl->flags & UPL_INTERNAL) {
3848 lite_list = (wpl_array_t)
3849 ((((uintptr_t)upl) + sizeof(struct upl))
3850 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3851 } else {
3852 lite_list = (wpl_array_t)
3853 (((uintptr_t)upl) + sizeof(struct upl));
3854 }
3855 if (object != shadow_object)
3856 vm_object_lock(object);
3857 vm_object_lock(shadow_object);
3858
3859 shadow_internal = shadow_object->internal;
3860
3861 entry = offset/PAGE_SIZE;
3862 target_offset = (vm_object_offset_t)offset;
3863
3864 while (xfer_size) {
3865 vm_page_t t,m;
3866 upl_page_info_t *p;
3867
3868 m = VM_PAGE_NULL;
3869
3870 if (upl->flags & UPL_LITE) {
3871 int pg_num;
3872
3873 pg_num = target_offset/PAGE_SIZE;
3874
3875 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3876 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3877 m = vm_page_lookup(shadow_object,
3878 target_offset + (upl->offset -
3879 shadow_object->paging_offset));
3880 }
3881 }
3882 if (object->pageout) {
3883 if ((t = vm_page_lookup(object, target_offset)) != NULL) {
3884 t->pageout = FALSE;
3885
3886 if (delayed_unlock) {
3887 delayed_unlock = 0;
3888 vm_page_unlock_queues();
3889 }
3890 VM_PAGE_FREE(t);
3891
3892 if (m == NULL) {
3893 m = vm_page_lookup(
3894 shadow_object,
3895 target_offset +
3896 object->shadow_offset);
3897 }
3898 if (m != VM_PAGE_NULL)
3899 vm_object_paging_end(m->object);
3900 }
3901 }
3902 if (m != VM_PAGE_NULL) {
3903
3904 clear_refmod = 0;
3905
3906 if (upl->flags & UPL_IO_WIRE) {
3907
3908 if (delayed_unlock == 0)
3909 vm_page_lock_queues();
3910
3911 vm_page_unwire(m);
3912
3913 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
3914 delayed_unlock = 0;
3915 vm_page_unlock_queues();
3916 }
3917 if (page_list) {
3918 page_list[entry].phys_addr = 0;
3919 }
3920 if (flags & UPL_COMMIT_SET_DIRTY) {
3921 m->dirty = TRUE;
3922 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3923 m->dirty = FALSE;
3924 clear_refmod |= VM_MEM_MODIFIED;
3925 }
3926 if (flags & UPL_COMMIT_INACTIVATE) {
3927 m->reference = FALSE;
3928 clear_refmod |= VM_MEM_REFERENCED;
3929 vm_page_deactivate(m);
3930 }
3931 if (clear_refmod)
3932 pmap_clear_refmod(m->phys_page, clear_refmod);
3933
3934 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3935 /*
3936 * We blocked access to the pages in this UPL.
3937 * Clear the "busy" bit and wake up any waiter
3938 * for this page.
3939 */
3940 PAGE_WAKEUP_DONE(m);
3941 }
3942
3943 target_offset += PAGE_SIZE_64;
3944 xfer_size -= PAGE_SIZE;
3945 entry++;
3946 continue;
3947 }
3948 if (delayed_unlock == 0)
3949 vm_page_lock_queues();
3950 /*
3951 * make sure to clear the hardware
3952 * modify or reference bits before
3953 * releasing the BUSY bit on this page
3954 * otherwise we risk losing a legitimate
3955 * change of state
3956 */
3957 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3958 m->dirty = FALSE;
3959 clear_refmod |= VM_MEM_MODIFIED;
3960 }
3961 if (flags & UPL_COMMIT_INACTIVATE)
3962 clear_refmod |= VM_MEM_REFERENCED;
3963
3964 if (clear_refmod)
3965 pmap_clear_refmod(m->phys_page, clear_refmod);
3966
3967 if (page_list) {
3968 p = &(page_list[entry]);
3969 if(p->phys_addr && p->pageout && !m->pageout) {
3970 m->busy = TRUE;
3971 m->pageout = TRUE;
3972 vm_page_wire(m);
3973 } else if (page_list[entry].phys_addr &&
3974 !p->pageout && m->pageout &&
3975 !m->dump_cleaning) {
3976 m->pageout = FALSE;
3977 m->absent = FALSE;
3978 m->overwriting = FALSE;
3979 vm_page_unwire(m);
3980 PAGE_WAKEUP_DONE(m);
3981 }
3982 page_list[entry].phys_addr = 0;
3983 }
3984 m->dump_cleaning = FALSE;
3985 if(m->laundry) {
3986 vm_pageout_throttle_up(m);
3987 }
3988 if(m->pageout) {
3989 m->cleaning = FALSE;
3990 m->pageout = FALSE;
3991 #if MACH_CLUSTER_STATS
3992 if (m->wanted) vm_pageout_target_collisions++;
3993 #endif
3994 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
3995 m->dirty = TRUE;
3996 else
3997 m->dirty = FALSE;
3998
3999 if(m->dirty) {
4000 vm_page_unwire(m);/* reactivates */
4001
4002 if (upl->flags & UPL_PAGEOUT) {
4003 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4004 VM_STAT(reactivations++);
4005 }
4006 PAGE_WAKEUP_DONE(m);
4007 } else {
4008 vm_page_free(m);/* clears busy, etc. */
4009
4010 if (upl->flags & UPL_PAGEOUT) {
4011 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4012
4013 if (page_list[entry].dirty)
4014 VM_STAT(pageouts++);
4015 }
4016 }
4017 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4018 delayed_unlock = 0;
4019 vm_page_unlock_queues();
4020 }
4021 target_offset += PAGE_SIZE_64;
4022 xfer_size -= PAGE_SIZE;
4023 entry++;
4024 continue;
4025 }
4026 #if MACH_CLUSTER_STATS
4027 m->dirty = pmap_is_modified(m->phys_page);
4028
4029 if (m->dirty) vm_pageout_cluster_dirtied++;
4030 else vm_pageout_cluster_cleaned++;
4031 if (m->wanted) vm_pageout_cluster_collisions++;
4032 #else
4033 m->dirty = 0;
4034 #endif
4035
4036 if((m->busy) && (m->cleaning)) {
4037 /* the request_page_list case */
4038 if(m->absent) {
4039 m->absent = FALSE;
4040 if(shadow_object->absent_count == 1)
4041 vm_object_absent_release(shadow_object);
4042 else
4043 shadow_object->absent_count--;
4044 }
4045 m->overwriting = FALSE;
4046 m->busy = FALSE;
4047 m->dirty = FALSE;
4048 } else if (m->overwriting) {
4049 /* alternate request page list, write to
4050 * page_list case. Occurs when the original
4051 * page was wired at the time of the list
4052 * request */
4053 assert(m->wire_count != 0);
4054 vm_page_unwire(m);/* reactivates */
4055 m->overwriting = FALSE;
4056 }
4057 m->cleaning = FALSE;
4058
4059 /*
4060 * It is part of the semantics of COPYOUT_FROM UPLs that a commit
4061 * implies a cache sync between the vm page and the backing store;
4062 * this can be used to strip the precious bit as well as clean.
4063 */
4064 if (upl->flags & UPL_PAGE_SYNC_DONE)
4065 m->precious = FALSE;
4066
4067 if (flags & UPL_COMMIT_SET_DIRTY)
4068 m->dirty = TRUE;
4069
4070 if (flags & UPL_COMMIT_INACTIVATE) {
4071 m->reference = FALSE;
4072 vm_page_deactivate(m);
4073 } else if (!m->active && !m->inactive) {
4074 if (m->reference)
4075 vm_page_activate(m);
4076 else
4077 vm_page_deactivate(m);
4078 }
4079
4080 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4081 /*
4082 * We blocked access to the pages in this UPL.
4083 * Clear the "busy" bit on this page before we
4084 * wake up any waiter.
4085 */
4086 m->busy = FALSE;
4087 }
4088
4089 /*
4090 * Wakeup any thread waiting for the page to be un-cleaning.
4091 */
4092 PAGE_WAKEUP(m);
4093
4094 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4095 delayed_unlock = 0;
4096 vm_page_unlock_queues();
4097 }
4098 }
4099 target_offset += PAGE_SIZE_64;
4100 xfer_size -= PAGE_SIZE;
4101 entry++;
4102 }
4103 if (delayed_unlock)
4104 vm_page_unlock_queues();
4105
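/*
 * Decide whether the UPL still has any pages under its control:
 * device-memory UPLs never do, lite UPLs do if any bit is still set
 * in the lite list, and otherwise the map object's resident page
 * queue tells us.
 */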
4106 occupied = 1;
4107
4108 if (upl->flags & UPL_DEVICE_MEMORY) {
4109 occupied = 0;
4110 } else if (upl->flags & UPL_LITE) {
4111 int pg_num;
4112 int i;
4113 pg_num = upl->size/PAGE_SIZE;
4114 pg_num = (pg_num + 31) >> 5;
4115 occupied = 0;
4116 for(i= 0; i<pg_num; i++) {
4117 if(lite_list[i] != 0) {
4118 occupied = 1;
4119 break;
4120 }
4121 }
4122 } else {
4123 if(queue_empty(&upl->map_object->memq)) {
4124 occupied = 0;
4125 }
4126 }
4127
4128 if(occupied == 0) {
4129 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4130 *empty = TRUE;
4131 }
4132 if(object == shadow_object)
4133 vm_object_paging_end(shadow_object);
4134 }
4135 vm_object_unlock(shadow_object);
4136 if (object != shadow_object)
4137 vm_object_unlock(object);
4138 upl_unlock(upl);
4139
4140 return KERN_SUCCESS;
4141 }
4142
4143 kern_return_t
4144 upl_abort_range(
4145 upl_t upl,
4146 upl_offset_t offset,
4147 upl_size_t size,
4148 int error,
4149 boolean_t *empty)
4150 {
4151 upl_size_t xfer_size = size;
4152 vm_object_t shadow_object;
4153 vm_object_t object = upl->map_object;
4154 vm_object_offset_t target_offset;
4155 int entry;
4156 wpl_array_t lite_list;
4157 int occupied;
4158 boolean_t shadow_internal;
4159
4160 *empty = FALSE;
4161
4162 if (upl == UPL_NULL)
4163 return KERN_INVALID_ARGUMENT;
4164
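/*
 * For I/O-wired UPLs an abort of a range is handled as a plain
 * commit of that range: upl_commit_range() simply unwires and
 * releases the pages.
 */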
4165 if (upl->flags & UPL_IO_WIRE) {
4166 return upl_commit_range(upl,
4167 offset, size, 0,
4168 NULL, 0, empty);
4169 }
4170
4171 if(object->pageout) {
4172 shadow_object = object->shadow;
4173 } else {
4174 shadow_object = object;
4175 }
4176
4177 upl_lock(upl);
4178 if(upl->flags & UPL_DEVICE_MEMORY) {
4179 xfer_size = 0;
4180 } else if ((offset + size) > upl->size) {
4181 upl_unlock(upl);
4182 return KERN_FAILURE;
4183 }
4184 if (object != shadow_object)
4185 vm_object_lock(object);
4186 vm_object_lock(shadow_object);
4187
4188 shadow_internal = shadow_object->internal;
4189
4190 if(upl->flags & UPL_INTERNAL) {
4191 lite_list = (wpl_array_t)
4192 ((((uintptr_t)upl) + sizeof(struct upl))
4193 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4194 } else {
4195 lite_list = (wpl_array_t)
4196 (((uintptr_t)upl) + sizeof(struct upl));
4197 }
4198
4199 entry = offset/PAGE_SIZE;
4200 target_offset = (vm_object_offset_t)offset;
4201 while(xfer_size) {
4202 vm_page_t t,m;
4203
4204 m = VM_PAGE_NULL;
4205 if(upl->flags & UPL_LITE) {
4206 int pg_num;
4207 pg_num = target_offset/PAGE_SIZE;
4208 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4209 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4210 m = vm_page_lookup(shadow_object,
4211 target_offset + (upl->offset -
4212 shadow_object->paging_offset));
4213 }
4214 }
4215 if(object->pageout) {
4216 if ((t = vm_page_lookup(object, target_offset))
4217 != NULL) {
4218 t->pageout = FALSE;
4219 VM_PAGE_FREE(t);
4220 if(m == NULL) {
4221 m = vm_page_lookup(
4222 shadow_object,
4223 target_offset +
4224 object->shadow_offset);
4225 }
4226 if(m != VM_PAGE_NULL)
4227 vm_object_paging_end(m->object);
4228 }
4229 }
4230 if(m != VM_PAGE_NULL) {
4231 vm_page_lock_queues();
4232 if(m->absent) {
4233 boolean_t must_free = TRUE;
4234
4235 /* COPYOUT = FALSE case */
4236 /* check for error conditions which must */
4237 /* be passed back to the page's customer */
4238 if(error & UPL_ABORT_RESTART) {
4239 m->restart = TRUE;
4240 m->absent = FALSE;
4241 vm_object_absent_release(m->object);
4242 m->page_error = KERN_MEMORY_ERROR;
4243 m->error = TRUE;
4244 must_free = FALSE;
4245 } else if(error & UPL_ABORT_UNAVAILABLE) {
4246 m->restart = FALSE;
4247 m->unusual = TRUE;
4248 must_free = FALSE;
4249 } else if(error & UPL_ABORT_ERROR) {
4250 m->restart = FALSE;
4251 m->absent = FALSE;
4252 vm_object_absent_release(m->object);
4253 m->page_error = KERN_MEMORY_ERROR;
4254 m->error = TRUE;
4255 must_free = FALSE;
4256 }
4257
4258 /*
4259 * ENCRYPTED SWAP:
4260 * If the page was already encrypted,
4261 * we don't really need to decrypt it
4262 * now. It will get decrypted later,
4263 * on demand, as soon as someone needs
4264 * to access its contents.
4265 */
4266
4267 m->cleaning = FALSE;
4268 m->overwriting = FALSE;
4269 PAGE_WAKEUP_DONE(m);
4270
4271 if (must_free == TRUE) {
4272 vm_page_free(m);
4273 } else {
4274 vm_page_activate(m);
4275 }
4276 vm_page_unlock_queues();
4277
4278 target_offset += PAGE_SIZE_64;
4279 xfer_size -= PAGE_SIZE;
4280 entry++;
4281 continue;
4282 }
4283 /*
4284 * Handle the trusted pager throttle.
4285 */
4286 if (m->laundry) {
4287 vm_pageout_throttle_up(m);
4288 }
4289 if(m->pageout) {
4290 assert(m->busy);
4291 assert(m->wire_count == 1);
4292 m->pageout = FALSE;
4293 vm_page_unwire(m);
4294 }
4295 m->dump_cleaning = FALSE;
4296 m->cleaning = FALSE;
4297 m->overwriting = FALSE;
4298 #if MACH_PAGEMAP
4299 vm_external_state_clr(
4300 m->object->existence_map, m->offset);
4301 #endif /* MACH_PAGEMAP */
4302 if(error & UPL_ABORT_DUMP_PAGES) {
4303 vm_page_free(m);
4304 pmap_disconnect(m->phys_page);
4305 } else {
4306 PAGE_WAKEUP_DONE(m);
4307 }
4308 vm_page_unlock_queues();
4309 }
4310 target_offset += PAGE_SIZE_64;
4311 xfer_size -= PAGE_SIZE;
4312 entry++;
4313 }
4314 occupied = 1;
4315 if (upl->flags & UPL_DEVICE_MEMORY) {
4316 occupied = 0;
4317 } else if (upl->flags & UPL_LITE) {
4318 int pg_num;
4319 int i;
4320 pg_num = upl->size/PAGE_SIZE;
4321 pg_num = (pg_num + 31) >> 5;
4322 occupied = 0;
4323 for(i= 0; i<pg_num; i++) {
4324 if(lite_list[i] != 0) {
4325 occupied = 1;
4326 break;
4327 }
4328 }
4329 } else {
4330 if(queue_empty(&upl->map_object->memq)) {
4331 occupied = 0;
4332 }
4333 }
4334
4335 if(occupied == 0) {
4336 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4337 *empty = TRUE;
4338 }
4339 if(object == shadow_object)
4340 vm_object_paging_end(shadow_object);
4341 }
4342 vm_object_unlock(shadow_object);
4343 if (object != shadow_object)
4344 vm_object_unlock(object);
4345
4346 upl_unlock(upl);
4347
4348 return KERN_SUCCESS;
4349 }
4350
4351 kern_return_t
4352 upl_abort(
4353 upl_t upl,
4354 int error)
4355 {
4356 vm_object_t object = NULL;
4357 vm_object_t shadow_object = NULL;
4358 vm_object_offset_t offset;
4359 vm_object_offset_t shadow_offset;
4360 vm_object_offset_t target_offset;
4361 upl_size_t i;
4362 wpl_array_t lite_list;
4363 vm_page_t t,m;
4364 int occupied;
4365 boolean_t shadow_internal;
4366
4367 if (upl == UPL_NULL)
4368 return KERN_INVALID_ARGUMENT;
4369
4370 if (upl->flags & UPL_IO_WIRE) {
4371 boolean_t empty;
4372 return upl_commit_range(upl,
4373 0, upl->size, 0,
4374 NULL, 0, &empty);
4375 }
4376
4377 upl_lock(upl);
4378 if(upl->flags & UPL_DEVICE_MEMORY) {
4379 upl_unlock(upl);
4380 return KERN_SUCCESS;
4381 }
4382
4383 object = upl->map_object;
4384
4385 if (object == NULL) {
4386 panic("upl_abort: upl object is not backed by an object");
4387 upl_unlock(upl);
4388 return KERN_INVALID_ARGUMENT;
4389 }
4390
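/*
 * Pick the object that really holds the pages and the offset that
 * translates UPL-relative offsets into that object: for a pageout
 * (map) object that is its shadow and shadow_offset, otherwise the
 * object itself with the UPL's offset adjusted by its paging_offset.
 */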
4391 if(object->pageout) {
4392 shadow_object = object->shadow;
4393 shadow_offset = object->shadow_offset;
4394 } else {
4395 shadow_object = object;
4396 shadow_offset = upl->offset - object->paging_offset;
4397 }
4398
4399 if(upl->flags & UPL_INTERNAL) {
4400 lite_list = (wpl_array_t)
4401 ((((uintptr_t)upl) + sizeof(struct upl))
4402 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4403 } else {
4404 lite_list = (wpl_array_t)
4405 (((uintptr_t)upl) + sizeof(struct upl));
4406 }
4407 offset = 0;
4408
4409 if (object != shadow_object)
4410 vm_object_lock(object);
4411 vm_object_lock(shadow_object);
4412
4413 shadow_internal = shadow_object->internal;
4414
4415 for(i = 0; i<(upl->size); i+=PAGE_SIZE, offset += PAGE_SIZE_64) {
4416 m = VM_PAGE_NULL;
4417 target_offset = offset + shadow_offset;
4418 if(upl->flags & UPL_LITE) {
4419 int pg_num;
4420 pg_num = offset/PAGE_SIZE;
4421 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4422 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4423 m = vm_page_lookup(
4424 shadow_object, target_offset);
4425 }
4426 }
4427 if(object->pageout) {
4428 if ((t = vm_page_lookup(object, offset)) != NULL) {
4429 t->pageout = FALSE;
4430 VM_PAGE_FREE(t);
4431 if(m == NULL) {
4432 m = vm_page_lookup(
4433 shadow_object, target_offset);
4434 }
4435 if(m != VM_PAGE_NULL)
4436 vm_object_paging_end(m->object);
4437 }
4438 }
4439 if(m != VM_PAGE_NULL) {
4440 vm_page_lock_queues();
4441 if(m->absent) {
4442 boolean_t must_free = TRUE;
4443
4444 /* COPYOUT = FALSE case */
4445 /* check for error conditions which must */
4446 /* be passed back to the page's customer */
4447 if(error & UPL_ABORT_RESTART) {
4448 m->restart = TRUE;
4449 m->absent = FALSE;
4450 vm_object_absent_release(m->object);
4451 m->page_error = KERN_MEMORY_ERROR;
4452 m->error = TRUE;
4453 must_free = FALSE;
4454 } else if(error & UPL_ABORT_UNAVAILABLE) {
4455 m->restart = FALSE;
4456 m->unusual = TRUE;
4457 must_free = FALSE;
4458 } else if(error & UPL_ABORT_ERROR) {
4459 m->restart = FALSE;
4460 m->absent = FALSE;
4461 vm_object_absent_release(m->object);
4462 m->page_error = KERN_MEMORY_ERROR;
4463 m->error = TRUE;
4464 must_free = FALSE;
4465 }
4466
4467 /*
4468 * ENCRYPTED SWAP:
4469 * If the page was already encrypted,
4470 * we don't really need to decrypt it
4471 * now. It will get decrypted later,
4472 * on demand, as soon as someone needs
4473 * to access its contents.
4474 */
4475
4476 m->cleaning = FALSE;
4477 m->overwriting = FALSE;
4478 PAGE_WAKEUP_DONE(m);
4479
4480 if (must_free == TRUE) {
4481 vm_page_free(m);
4482 } else {
4483 vm_page_activate(m);
4484 }
4485 vm_page_unlock_queues();
4486 continue;
4487 }
4488 /*
4489 * Handle the trusted pager throttle.
4490 */
4491 if (m->laundry) {
4492 vm_pageout_throttle_up(m);
4493 }
4494 if(m->pageout) {
4495 assert(m->busy);
4496 assert(m->wire_count == 1);
4497 m->pageout = FALSE;
4498 vm_page_unwire(m);
4499 }
4500 m->dump_cleaning = FALSE;
4501 m->cleaning = FALSE;
4502 m->overwriting = FALSE;
4503 #if MACH_PAGEMAP
4504 vm_external_state_clr(
4505 m->object->existence_map, m->offset);
4506 #endif /* MACH_PAGEMAP */
4507 if(error & UPL_ABORT_DUMP_PAGES) {
4508 vm_page_free(m);
4509 pmap_disconnect(m->phys_page);
4510 } else {
4511 PAGE_WAKEUP_DONE(m);
4512 }
4513 vm_page_unlock_queues();
4514 }
4515 }
4516 occupied = 1;
4517 if (upl->flags & UPL_DEVICE_MEMORY) {
4518 occupied = 0;
4519 } else if (upl->flags & UPL_LITE) {
4520 int pg_num;
4521 int j;
4522 pg_num = upl->size/PAGE_SIZE;
4523 pg_num = (pg_num + 31) >> 5;
4524 occupied = 0;
4525 for(j= 0; j<pg_num; j++) {
4526 if(lite_list[j] != 0) {
4527 occupied = 1;
4528 break;
4529 }
4530 }
4531 } else {
4532 if(queue_empty(&upl->map_object->memq)) {
4533 occupied = 0;
4534 }
4535 }
4536
4537 if(occupied == 0) {
4538 if(object == shadow_object)
4539 vm_object_paging_end(shadow_object);
4540 }
4541 vm_object_unlock(shadow_object);
4542 if (object != shadow_object)
4543 vm_object_unlock(object);
4544
4545 upl_unlock(upl);
4546 return KERN_SUCCESS;
4547 }
4548
4549 /* an option on commit should be wire */
4550 kern_return_t
4551 upl_commit(
4552 upl_t upl,
4553 upl_page_info_t *page_list,
4554 mach_msg_type_number_t count)
4555 {
4556 if (upl == UPL_NULL)
4557 return KERN_INVALID_ARGUMENT;
4558
4559 if(upl->flags & (UPL_LITE | UPL_IO_WIRE)) {
4560 boolean_t empty;
4561 return upl_commit_range(upl, 0, upl->size, 0,
4562 page_list, count, &empty);
4563 }
4564
4565 if (count == 0)
4566 page_list = NULL;
4567
4568 upl_lock(upl);
4569 if (upl->flags & UPL_DEVICE_MEMORY)
4570 page_list = NULL;
4571
4572 if (upl->flags & UPL_ENCRYPTED) {
4573 /*
4574 * ENCRYPTED SWAP:
4575 * This UPL was encrypted, but we don't need
4576 * to decrypt here. We'll decrypt each page
4577 * later, on demand, as soon as someone needs
4578 * to access the page's contents.
4579 */
4580 }
4581
4582 if ((upl->flags & UPL_CLEAR_DIRTY) ||
4583 (upl->flags & UPL_PAGE_SYNC_DONE) || page_list) {
4584 vm_object_t shadow_object = upl->map_object->shadow;
4585 vm_object_t object = upl->map_object;
4586 vm_object_offset_t target_offset;
4587 upl_size_t xfer_end;
4588 int entry;
4589
4590 vm_page_t t, m;
4591 upl_page_info_t *p;
4592
4593 if (object != shadow_object)
4594 vm_object_lock(object);
4595 vm_object_lock(shadow_object);
4596
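/*
 * Walk the whole UPL in shadow-object coordinates, from shadow_offset
 * to shadow_offset + upl->size.  Pages are looked up in the map
 * object first to see whether they belong to this UPL.
 */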
4597 entry = 0;
4598 target_offset = object->shadow_offset;
4599 xfer_end = upl->size + object->shadow_offset;
4600
4601 while(target_offset < xfer_end) {
4602
4603 if ((t = vm_page_lookup(object,
4604 target_offset - object->shadow_offset))
4605 == NULL) {
4606 target_offset += PAGE_SIZE_64;
4607 entry++;
4608 continue;
4609 }
4610
4611 m = vm_page_lookup(shadow_object, target_offset);
4612 if(m != VM_PAGE_NULL) {
4613 /*
4614 * ENCRYPTED SWAP:
4615 * If this page was encrypted, we
4616 * don't need to decrypt it here.
4617 * We'll decrypt it later, on demand,
4618 * as soon as someone needs to access
4619 * its contents.
4620 */
4621
4622 if (upl->flags & UPL_CLEAR_DIRTY) {
4623 pmap_clear_modify(m->phys_page);
4624 m->dirty = FALSE;
4625 }
4626 /*
4627 * It is part of the semantics of COPYOUT_FROM UPLs that a
4628 * commit implies a cache sync between the vm page and the
4629 * backing store; this can be used to strip the precious bit
4630 * as well as clean.
4631 */
4632 if (upl->flags & UPL_PAGE_SYNC_DONE)
4633 m->precious = FALSE;
4634
4635 if(page_list) {
4636 p = &(page_list[entry]);
4637 if(page_list[entry].phys_addr &&
4638 p->pageout && !m->pageout) {
4639 vm_page_lock_queues();
4640 m->busy = TRUE;
4641 m->pageout = TRUE;
4642 vm_page_wire(m);
4643 vm_page_unlock_queues();
4644 } else if (page_list[entry].phys_addr &&
4645 !p->pageout && m->pageout &&
4646 !m->dump_cleaning) {
4647 vm_page_lock_queues();
4648 m->pageout = FALSE;
4649 m->absent = FALSE;
4650 m->overwriting = FALSE;
4651 vm_page_unwire(m);
4652 PAGE_WAKEUP_DONE(m);
4653 vm_page_unlock_queues();
4654 }
4655 page_list[entry].phys_addr = 0;
4656 }
4657 }
4658 target_offset += PAGE_SIZE_64;
4659 entry++;
4660 }
4661 vm_object_unlock(shadow_object);
4662 if (object != shadow_object)
4663 vm_object_unlock(object);
4664
4665 }
4666 if (upl->flags & UPL_DEVICE_MEMORY) {
4667 vm_object_lock(upl->map_object->shadow);
4668 if(upl->map_object == upl->map_object->shadow)
4669 vm_object_paging_end(upl->map_object->shadow);
4670 vm_object_unlock(upl->map_object->shadow);
4671 }
4672 upl_unlock(upl);
4673 return KERN_SUCCESS;
4674 }
4675
4676
4677
4678 kern_return_t
4679 vm_object_iopl_request(
4680 vm_object_t object,
4681 vm_object_offset_t offset,
4682 upl_size_t size,
4683 upl_t *upl_ptr,
4684 upl_page_info_array_t user_page_list,
4685 unsigned int *page_list_count,
4686 int cntrl_flags)
4687 {
4688 vm_page_t dst_page;
4689 vm_object_offset_t dst_offset = offset;
4690 upl_size_t xfer_size = size;
4691 upl_t upl = NULL;
4692 unsigned int entry;
4693 wpl_array_t lite_list = NULL;
4694 int page_field_size;
4695 int delayed_unlock = 0;
4696 int no_zero_fill = FALSE;
4697 vm_page_t alias_page = NULL;
4698 kern_return_t ret;
4699 vm_prot_t prot;
4700
4701
4702 if (cntrl_flags & ~UPL_VALID_FLAGS) {
4703 /*
4704 * For forward compatibility's sake,
4705 * reject any unknown flag.
4706 */
4707 return KERN_INVALID_VALUE;
4708 }
4709 if (vm_lopage_poolsize == 0)
4710 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4711
4712 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4713 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4714 return KERN_INVALID_VALUE;
4715
4716 if (object->phys_contiguous) {
4717 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4718 return KERN_INVALID_ADDRESS;
4719
4720 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4721 return KERN_INVALID_ADDRESS;
4722 }
4723 }
4724
4725 if (cntrl_flags & UPL_ENCRYPT) {
4726 /*
4727 * ENCRYPTED SWAP:
4728 * The paging path doesn't use this interface,
4729 * so we don't support the UPL_ENCRYPT flag
4730 * here. We won't encrypt the pages.
4731 */
4732 assert(! (cntrl_flags & UPL_ENCRYPT));
4733 }
4734
4735 if (cntrl_flags & UPL_NOZEROFILL)
4736 no_zero_fill = TRUE;
4737
4738 if (cntrl_flags & UPL_COPYOUT_FROM)
4739 prot = VM_PROT_READ;
4740 else
4741 prot = VM_PROT_READ | VM_PROT_WRITE;
4742
4743 if(((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
4744 size = MAX_UPL_TRANSFER * page_size;
4745 }
4746
4747 if(cntrl_flags & UPL_SET_INTERNAL)
4748 if(page_list_count != NULL)
4749 *page_list_count = MAX_UPL_TRANSFER;
4750 if(((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4751 ((page_list_count != NULL) && (*page_list_count != 0)
4752 && *page_list_count < (size/page_size)))
4753 return KERN_INVALID_ARGUMENT;
4754
4755 if((!object->internal) && (object->paging_offset != 0))
4756 panic("vm_object_upl_request: external object with non-zero paging offset\n");
4757
4758 if(object->phys_contiguous) {
4759 /* No paging operations are possible against this memory */
4760 /* and so no need for map object, ever */
4761 cntrl_flags |= UPL_SET_LITE;
4762 }
4763
4764 if(upl_ptr) {
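/*
 * Create the UPL with the layout the caller asked for: internal UPLs
 * embed the upl_page_info array right after struct upl, and lite
 * UPLs carry a per-page bitmap (the lite list) after whatever else
 * is embedded.
 */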
4765 if(cntrl_flags & UPL_SET_INTERNAL) {
4766 if(cntrl_flags & UPL_SET_LITE) {
4767 upl = upl_create(
4768 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
4769 size);
4770 user_page_list = (upl_page_info_t *)
4771 (((uintptr_t)upl) + sizeof(struct upl));
4772 lite_list = (wpl_array_t)
4773 (((uintptr_t)user_page_list) +
4774 ((size/PAGE_SIZE) *
4775 sizeof(upl_page_info_t)));
4776 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4777 page_field_size =
4778 (page_field_size + 3) & 0xFFFFFFFC;
4779 bzero((char *)lite_list, page_field_size);
4780 upl->flags =
4781 UPL_LITE | UPL_INTERNAL | UPL_IO_WIRE;
4782 } else {
4783 upl = upl_create(UPL_CREATE_INTERNAL, size);
4784 user_page_list = (upl_page_info_t *)
4785 (((uintptr_t)upl)
4786 + sizeof(struct upl));
4787 upl->flags = UPL_INTERNAL | UPL_IO_WIRE;
4788 }
4789 } else {
4790 if(cntrl_flags & UPL_SET_LITE) {
4791 upl = upl_create(UPL_CREATE_LITE, size);
4792 lite_list = (wpl_array_t)
4793 (((uintptr_t)upl) + sizeof(struct upl));
4794 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4795 page_field_size =
4796 (page_field_size + 3) & 0xFFFFFFFC;
4797 bzero((char *)lite_list, page_field_size);
4798 upl->flags = UPL_LITE | UPL_IO_WIRE;
4799 } else {
4800 upl = upl_create(UPL_CREATE_EXTERNAL, size);
4801 upl->flags = UPL_IO_WIRE;
4802 }
4803 }
4804
4805 if(object->phys_contiguous) {
4806 upl->map_object = object;
4807 /* don't need any shadow mappings for this one */
4808 /* since it is already I/O memory */
4809 upl->flags |= UPL_DEVICE_MEMORY;
4810
4811 vm_object_lock(object);
4812 vm_object_paging_begin(object);
4813 vm_object_unlock(object);
4814
4815 /* paging in progress also protects the paging_offset */
4816 upl->offset = offset + object->paging_offset;
4817 upl->size = size;
4818 *upl_ptr = upl;
4819 if(user_page_list) {
4820 user_page_list[0].phys_addr =
4821 (offset + object->shadow_offset)>>PAGE_SHIFT;
4822 user_page_list[0].device = TRUE;
4823 }
4824 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4825
4826 if(page_list_count != NULL) {
4827 if (upl->flags & UPL_INTERNAL) {
4828 *page_list_count = 0;
4829 } else {
4830 *page_list_count = 1;
4831 }
4832 }
4833 return KERN_SUCCESS;
4834 }
4835 if(user_page_list)
4836 user_page_list[0].device = FALSE;
4837
4838 if(cntrl_flags & UPL_SET_LITE) {
4839 upl->map_object = object;
4840 } else {
4841 upl->map_object = vm_object_allocate(size);
4842 vm_object_lock(upl->map_object);
4843 upl->map_object->shadow = object;
4844 upl->map_object->pageout = TRUE;
4845 upl->map_object->can_persist = FALSE;
4846 upl->map_object->copy_strategy =
4847 MEMORY_OBJECT_COPY_NONE;
4848 upl->map_object->shadow_offset = offset;
4849 upl->map_object->wimg_bits = object->wimg_bits;
4850 vm_object_unlock(upl->map_object);
4851 }
4852 }
4853 vm_object_lock(object);
4854 vm_object_paging_begin(object);
4855
4856 if (!object->phys_contiguous) {
4857 /* Protect user space from future COW operations */
4858 object->true_share = TRUE;
4859 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4860 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4861 }
4862
4863 /* we can lock the upl offset now that paging_in_progress is set */
4864 if(upl_ptr) {
4865 upl->size = size;
4866 upl->offset = offset + object->paging_offset;
4867 *upl_ptr = upl;
4868 #ifdef UPL_DEBUG
4869 queue_enter(&object->uplq, upl, upl_t, uplq);
4870 #endif /* UPL_DEBUG */
4871 }
4872
4873 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4874 /*
4875 * The user requested that access to the pages in this UPL
4876 * be blocked until the UPL is committed or aborted.
4877 */
4878 upl->flags |= UPL_ACCESS_BLOCKED;
4879 }
4880
4881 entry = 0;
4882 while (xfer_size) {
4883 if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
4884 if (delayed_unlock) {
4885 delayed_unlock = 0;
4886 vm_page_unlock_queues();
4887 }
4888 vm_object_unlock(object);
4889 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4890 vm_object_lock(object);
4891 }
4892 dst_page = vm_page_lookup(object, dst_offset);
4893
4894 /*
4895 * ENCRYPTED SWAP:
4896 * If the page is encrypted, we need to decrypt it,
4897 * so force a soft page fault.
4898 */
4899 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4900 (dst_page->encrypted) ||
4901 (dst_page->unusual && (dst_page->error ||
4902 dst_page->restart ||
4903 dst_page->absent ||
4904 dst_page->fictitious ||
4905 (prot & dst_page->page_lock)))) {
4906 vm_fault_return_t result;
4907 do {
4908 vm_page_t top_page;
4909 kern_return_t error_code;
4910 int interruptible;
4911
4912 vm_object_offset_t lo_offset = offset;
4913 vm_object_offset_t hi_offset = offset + size;
4914
4915
4916 if (delayed_unlock) {
4917 delayed_unlock = 0;
4918 vm_page_unlock_queues();
4919 }
4920
4921 if(cntrl_flags & UPL_SET_INTERRUPTIBLE) {
4922 interruptible = THREAD_ABORTSAFE;
4923 } else {
4924 interruptible = THREAD_UNINT;
4925 }
4926
4927 result = vm_fault_page(object, dst_offset,
4928 prot | VM_PROT_WRITE, FALSE,
4929 interruptible,
4930 lo_offset, hi_offset,
4931 VM_BEHAVIOR_SEQUENTIAL,
4932 &prot, &dst_page, &top_page,
4933 (int *)0,
4934 &error_code, no_zero_fill, FALSE, NULL, 0);
4935
4936 switch(result) {
4937 case VM_FAULT_SUCCESS:
4938
4939 PAGE_WAKEUP_DONE(dst_page);
4940
4941 /*
4942 * Release paging references and
4943 * top-level placeholder page, if any.
4944 */
4945
4946 if(top_page != VM_PAGE_NULL) {
4947 vm_object_t local_object;
4948 local_object =
4949 top_page->object;
4950 if(top_page->object
4951 != dst_page->object) {
4952 vm_object_lock(
4953 local_object);
4954 VM_PAGE_FREE(top_page);
4955 vm_object_paging_end(
4956 local_object);
4957 vm_object_unlock(
4958 local_object);
4959 } else {
4960 VM_PAGE_FREE(top_page);
4961 vm_object_paging_end(
4962 local_object);
4963 }
4964 }
4965
4966 break;
4967
4968
4969 case VM_FAULT_RETRY:
4970 vm_object_lock(object);
4971 vm_object_paging_begin(object);
4972 break;
4973
4974 case VM_FAULT_FICTITIOUS_SHORTAGE:
4975 vm_page_more_fictitious();
4976 vm_object_lock(object);
4977 vm_object_paging_begin(object);
4978 break;
4979
4980 case VM_FAULT_MEMORY_SHORTAGE:
4981 if (vm_page_wait(interruptible)) {
4982 vm_object_lock(object);
4983 vm_object_paging_begin(object);
4984 break;
4985 }
4986 /* fall thru */
4987
4988 case VM_FAULT_INTERRUPTED:
4989 error_code = MACH_SEND_INTERRUPTED;
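/* fall through to the memory-error case */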
4990 case VM_FAULT_MEMORY_ERROR:
4991 ret = (error_code ? error_code:
4992 KERN_MEMORY_ERROR);
4993 vm_object_lock(object);
4994
4995 goto return_err;
4996 }
4997 } while ((result != VM_FAULT_SUCCESS)
4998 || (result == VM_FAULT_INTERRUPTED));
4999 }
5000
5001 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
5002 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
5003 vm_page_t low_page;
5004 int refmod;
5005
5006 /*
5007 * support devices that can't DMA above 32 bits
5008 * by substituting pages from a pool of low address
5009 * memory for any pages we find above the 4G mark
5010 * can't substitute if the page is already wired because
5011 * we don't know whether that physical address has been
5012 * handed out to some other 64 bit capable DMA device to use
5013 */
5014 if (dst_page->wire_count) {
5015 ret = KERN_PROTECTION_FAILURE;
5016 goto return_err;
5017 }
5018 if (delayed_unlock) {
5019 delayed_unlock = 0;
5020 vm_page_unlock_queues();
5021 }
5022 low_page = vm_page_grablo();
5023
5024 if (low_page == VM_PAGE_NULL) {
5025 ret = KERN_RESOURCE_SHORTAGE;
5026 goto return_err;
5027 }
5028 /*
5029 * from here until the vm_page_replace completes
5030 * we mustn't drop the object lock... we don't
5031 * want anyone refaulting this page in and using
5032 * it after we disconnect it... we want the fault
5033 * to find the new page being substituted.
5034 */
5035 refmod = pmap_disconnect(dst_page->phys_page);
5036
5037 vm_page_copy(dst_page, low_page);
5038
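/*
 * Carry the reference/modified state over to the replacement page,
 * both from the page fields and from the pmap bits gathered by the
 * disconnect, so the page-replacement state is preserved.
 */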
5039 low_page->reference = dst_page->reference;
5040 low_page->dirty = dst_page->dirty;
5041
5042 if (refmod & VM_MEM_REFERENCED)
5043 low_page->reference = TRUE;
5044 if (refmod & VM_MEM_MODIFIED)
5045 low_page->dirty = TRUE;
5046
5047 vm_page_lock_queues();
5048 vm_page_replace(low_page, object, dst_offset);
5049 /*
5050 * keep the queue lock since we're going to
5051 * need it immediately
5052 */
5053 delayed_unlock = 1;
5054
5055 dst_page = low_page;
5056 /*
5057 * vm_page_grablo returned the page marked
5058 * BUSY... we don't need a PAGE_WAKEUP_DONE
5059 * here, because we've never dropped the object lock
5060 */
5061 dst_page->busy = FALSE;
5062 }
5063 if (delayed_unlock == 0)
5064 vm_page_lock_queues();
5065 vm_page_wire(dst_page);
5066
5067 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5068 /*
5069 * Mark the page "busy" to block any future page fault
5070 * on this page. We'll also remove the mapping
5071 * of all these pages before leaving this routine.
5072 */
5073 assert(!dst_page->fictitious);
5074 dst_page->busy = TRUE;
5075 }
5076
5077 if (upl_ptr) {
5078 if (cntrl_flags & UPL_SET_LITE) {
5079 int pg_num;
5080 pg_num = (dst_offset-offset)/PAGE_SIZE;
5081 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5082 } else {
5083 /*
5084 * Convert the fictitious page to a
5085 * private shadow of the real page.
5086 */
5087 assert(alias_page->fictitious);
5088 alias_page->fictitious = FALSE;
5089 alias_page->private = TRUE;
5090 alias_page->pageout = TRUE;
5091 alias_page->phys_page = dst_page->phys_page;
5092 vm_page_wire(alias_page);
5093
5094 vm_page_insert(alias_page,
5095 upl->map_object, size - xfer_size);
5096 assert(!alias_page->wanted);
5097 alias_page->busy = FALSE;
5098 alias_page->absent = FALSE;
5099 }
5100
5101 /* expect the page to be used */
5102 dst_page->reference = TRUE;
5103
5104 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5105 dst_page->dirty = TRUE;
5106 alias_page = NULL;
5107
5108 if (dst_page->phys_page > upl->highest_page)
5109 upl->highest_page = dst_page->phys_page;
5110
5111 if (user_page_list) {
5112 user_page_list[entry].phys_addr
5113 = dst_page->phys_page;
5114 user_page_list[entry].dirty =
5115 dst_page->dirty;
5116 user_page_list[entry].pageout =
5117 dst_page->pageout;
5118 user_page_list[entry].absent =
5119 dst_page->absent;
5120 user_page_list[entry].precious =
5121 dst_page->precious;
5122 }
5123 }
5124 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
5125 delayed_unlock = 0;
5126 vm_page_unlock_queues();
5127 }
5128 entry++;
5129 dst_offset += PAGE_SIZE_64;
5130 xfer_size -= PAGE_SIZE;
5131 }
5132 if (delayed_unlock)
5133 vm_page_unlock_queues();
5134
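/*
 * Report how many page-list entries were filled in: internal UPLs
 * carry their own embedded list, so the caller's count is zeroed;
 * otherwise the caller's count is trimmed down to the number of
 * entries actually populated.
 */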
5135 if (upl->flags & UPL_INTERNAL) {
5136 if(page_list_count != NULL)
5137 *page_list_count = 0;
5138 } else if (page_list_count != NULL &&
5139 *page_list_count > entry) {
5140 *page_list_count = entry;
5141 }
5142
5143 if (alias_page != NULL) {
5144 vm_page_lock_queues();
5145 vm_page_free(alias_page);
5146 vm_page_unlock_queues();
5147 }
5148
5149 vm_object_unlock(object);
5150
5151 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5152 /*
5153 * We've marked all the pages "busy" so that future
5154 * page faults will block.
5155 * Now remove the mapping for these pages, so that they
5156 * can't be accessed without causing a page fault.
5157 */
5158 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5159 PMAP_NULL, 0, VM_PROT_NONE);
5160 }
5161
5162 return KERN_SUCCESS;
5163
5164
5165 return_err:
5166 if (delayed_unlock)
5167 vm_page_unlock_queues();
5168
5169 for (; offset < dst_offset; offset += PAGE_SIZE) {
5170 dst_page = vm_page_lookup(object, offset);
5171
5172 if (dst_page == VM_PAGE_NULL)
5173 panic("vm_object_iopl_request: Wired pages missing. \n");
5174 vm_page_lock_queues();
5175 vm_page_unwire(dst_page);
5176 vm_page_unlock_queues();
5177 VM_STAT(reactivations++);
5178 }
5179 vm_object_paging_end(object);
5180 vm_object_unlock(object);
5181 upl_destroy(upl);
5182
5183 return ret;
5184 }
5185
5186
5187 kern_return_t
5188 upl_transpose(
5189 upl_t upl1,
5190 upl_t upl2)
5191 {
5192 kern_return_t retval;
5193 boolean_t upls_locked;
5194 vm_object_t object1, object2;
5195
5196 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5197 return KERN_INVALID_ARGUMENT;
5198 }
5199
5200 upls_locked = FALSE;
5201
5202 /*
5203 * Since we need to lock both UPLs at the same time,
5204 * avoid deadlocks by always taking locks in the same order.
5205 */
5206 if (upl1 < upl2) {
5207 upl_lock(upl1);
5208 upl_lock(upl2);
5209 } else {
5210 upl_lock(upl2);
5211 upl_lock(upl1);
5212 }
5213 upls_locked = TRUE; /* the UPLs will need to be unlocked */
5214
5215 object1 = upl1->map_object;
5216 object2 = upl2->map_object;
5217
5218 if (upl1->offset != 0 || upl2->offset != 0 ||
5219 upl1->size != upl2->size) {
5220 /*
5221 * We deal only with full objects, not subsets.
5222 * That's because we exchange the entire backing store info
5223 * for the objects: pager, resident pages, etc... We can't do
5224 * only part of it.
5225 */
5226 retval = KERN_INVALID_VALUE;
5227 goto done;
5228 }
5229
5230 /*
5231 * Transpose the VM objects' backing store.
5232 */
5233 retval = vm_object_transpose(object1, object2,
5234 (vm_object_size_t) upl1->size);
5235
5236 if (retval == KERN_SUCCESS) {
5237 /*
5238 * Make each UPL point to the correct VM object, i.e. the
5239 * object holding the pages that the UPL refers to...
5240 */
5241 upl1->map_object = object2;
5242 upl2->map_object = object1;
5243 }
5244
5245 done:
5246 /*
5247 * Cleanup.
5248 */
5249 if (upls_locked) {
5250 upl_unlock(upl1);
5251 upl_unlock(upl2);
5252 upls_locked = FALSE;
5253 }
5254
5255 return retval;
5256 }
5257
5258 /*
5259 * ENCRYPTED SWAP:
5260 *
5261 * Rationale: the user might have some encrypted data on disk (via
5262 * FileVault or any other mechanism). That data is then decrypted in
5263 * memory, which is safe as long as the machine is secure. But that
5264 * decrypted data in memory could be paged out to disk by the default
5265 * pager. The data would then be stored on disk in clear (not encrypted)
5266 * and it could be accessed by anyone who gets physical access to the
5267 * disk (if the laptop or the disk gets stolen for example). This weakens
5268 * the security offered by FileVault.
5269 *
5270 * Solution: the default pager will optionally request that all the
5271 * pages it gathers for pageout be encrypted, via the UPL interfaces,
5272 * before it sends this UPL to disk via the vnode_pageout() path.
5273 *
5274 * Notes:
5275 *
5276 * To avoid disrupting the VM LRU algorithms, we want to keep the
5277 * clean-in-place mechanisms, which allow us to send some extra pages to
5278 * swap (clustering) without actually removing them from the user's
5279 * address space. We don't want the user to unknowingly access encrypted
5280 * data, so we have to actually remove the encrypted pages from the page
5281 * table. When the user accesses the data, the hardware will fail to
5282 * locate the virtual page in its page table and will trigger a page
5283 * fault. We can then decrypt the page and enter it in the page table
5284 * again. Whenever we allow the user to access the contents of a page,
5285 * we have to make sure it's not encrypted.
5286 *
5287 *
5288 */
5289 /*
5290 * ENCRYPTED SWAP:
5291 * Reserve of virtual addresses in the kernel address space.
5292 * We need to map the physical pages in the kernel, so that we
5293 * can call the encryption/decryption routines with a kernel
5294 * virtual address. We keep this pool of pre-allocated kernel
5295 * virtual addresses so that we don't have to scan the kernel's
5296 * virtual address space each time we need to encrypt or decrypt
5297 * a physical page.
5298 * It would be nice to be able to encrypt and decrypt in physical
5299 * mode but that might not always be more efficient...
5300 */
5301 decl_simple_lock_data(,vm_paging_lock)
5302 #define VM_PAGING_NUM_PAGES 64
5303 vm_map_offset_t vm_paging_base_address = 0;
5304 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5305 int vm_paging_max_index = 0;
5306 unsigned long vm_paging_no_kernel_page = 0;
5307 unsigned long vm_paging_objects_mapped = 0;
5308 unsigned long vm_paging_pages_mapped = 0;
5309 unsigned long vm_paging_objects_mapped_slow = 0;
5310 unsigned long vm_paging_pages_mapped_slow = 0;
5311
5312 /*
5313 * ENCRYPTED SWAP:
5314 * vm_paging_map_object:
5315 * Maps part of a VM object's pages in the kernel
5316 * virtual address space, using the pre-allocated
5317 * kernel virtual addresses, if possible.
5318 * Context:
5319 * The VM object is locked. This lock will get
5320 * dropped and re-acquired though.
5321 */
5322 kern_return_t
5323 vm_paging_map_object(
5324 vm_map_offset_t *address,
5325 vm_page_t page,
5326 vm_object_t object,
5327 vm_object_offset_t offset,
5328 vm_map_size_t *size)
5329 {
5330 kern_return_t kr;
5331 vm_map_offset_t page_map_offset;
5332 vm_map_size_t map_size;
5333 vm_object_offset_t object_offset;
5334 int i;
5335 vm_map_entry_t map_entry;
5336
5337
5338 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5339 /*
5340 * Use one of the pre-allocated kernel virtual addresses
5341 * and just enter the VM page in the kernel address space
5342 * at that virtual address.
5343 */
5344 vm_object_unlock(object);
5345 simple_lock(&vm_paging_lock);
5346
5347 if (vm_paging_base_address == 0) {
5348 /*
5349 * Initialize our pool of pre-allocated kernel
5350 * virtual addresses.
5351 */
5352 simple_unlock(&vm_paging_lock);
5353 page_map_offset = 0;
5354 kr = vm_map_find_space(kernel_map,
5355 &page_map_offset,
5356 VM_PAGING_NUM_PAGES * PAGE_SIZE,
5357 0,
5358 0,
5359 &map_entry);
5360 if (kr != KERN_SUCCESS) {
5361 panic("vm_paging_map_object: "
5362 "kernel_map full\n");
5363 }
5364 map_entry->object.vm_object = kernel_object;
5365 map_entry->offset =
5366 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5367 vm_object_reference(kernel_object);
5368 vm_map_unlock(kernel_map);
5369
5370 simple_lock(&vm_paging_lock);
5371 if (vm_paging_base_address != 0) {
5372 /* someone raced us and won: undo */
5373 simple_unlock(&vm_paging_lock);
5374 kr = vm_map_remove(kernel_map,
5375 page_map_offset,
5376 page_map_offset +
5377 (VM_PAGING_NUM_PAGES
5378 * PAGE_SIZE),
5379 VM_MAP_NO_FLAGS);
5380 assert(kr == KERN_SUCCESS);
5381 simple_lock(&vm_paging_lock);
5382 } else {
5383 vm_paging_base_address = page_map_offset;
5384 }
5385 }
5386
5387 /*
5388 * Try and find an available kernel virtual address
5389 * from our pre-allocated pool.
5390 */
5391 page_map_offset = 0;
5392 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5393 if (vm_paging_page_inuse[i] == FALSE) {
5394 page_map_offset = vm_paging_base_address +
5395 (i * PAGE_SIZE);
5396 break;
5397 }
5398 }
5399
5400 if (page_map_offset != 0) {
5401 /*
5402 * We found a kernel virtual address;
5403 * map the physical page to that virtual address.
5404 */
5405 if (i > vm_paging_max_index) {
5406 vm_paging_max_index = i;
5407 }
5408 vm_paging_page_inuse[i] = TRUE;
5409 simple_unlock(&vm_paging_lock);
5410 if (page->no_isync == TRUE) {
5411 pmap_sync_page_data_phys(page->phys_page);
5412 }
5413 assert(pmap_verify_free(page->phys_page));
5414 PMAP_ENTER(kernel_pmap,
5415 page_map_offset,
5416 page,
5417 VM_PROT_DEFAULT,
5418 ((int) page->object->wimg_bits &
5419 VM_WIMG_MASK),
5420 TRUE);
5421 vm_paging_objects_mapped++;
5422 vm_paging_pages_mapped++;
5423 *address = page_map_offset;
5424 vm_object_lock(object);
5425
5426 /* all done and mapped, ready to use ! */
5427 return KERN_SUCCESS;
5428 }
5429
5430 /*
5431 * We ran out of pre-allocated kernel virtual
5432 * addresses. Just map the page in the kernel
5433 * the slow and regular way.
5434 */
5435 vm_paging_no_kernel_page++;
5436 simple_unlock(&vm_paging_lock);
5437 vm_object_lock(object);
5438 }
5439
5440 object_offset = vm_object_trunc_page(offset);
5441 map_size = vm_map_round_page(*size);
5442
5443 /*
5444 * Try and map the required range of the object
5445 * in the kernel_map
5446 */
5447
5448 /* don't go beyond the object's end... */
5449 if (object_offset >= object->size) {
5450 map_size = 0;
5451 } else if (map_size > object->size - offset) {
5452 map_size = object->size - offset;
5453 }
5454
5455 vm_object_reference_locked(object); /* for the map entry */
5456 vm_object_unlock(object);
5457
5458 kr = vm_map_enter(kernel_map,
5459 address,
5460 map_size,
5461 0,
5462 VM_FLAGS_ANYWHERE,
5463 object,
5464 object_offset,
5465 FALSE,
5466 VM_PROT_DEFAULT,
5467 VM_PROT_ALL,
5468 VM_INHERIT_NONE);
5469 if (kr != KERN_SUCCESS) {
5470 *address = 0;
5471 *size = 0;
5472 vm_object_deallocate(object); /* for the map entry */
5473 return kr;
5474 }
5475
5476 *size = map_size;
5477
5478 /*
5479 * Enter the mapped pages in the page table now.
5480 */
5481 vm_object_lock(object);
5482 for (page_map_offset = 0;
5483 map_size != 0;
5484 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5485 unsigned int cache_attr;
5486
5487 page = vm_page_lookup(object, offset + page_map_offset);
5488 if (page == VM_PAGE_NULL) {
5489 panic("vm_paging_map_object: no page !?");
5490 }
5491 if (page->no_isync == TRUE) {
5492 pmap_sync_page_data_phys(page->phys_page);
5493 }
5494 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5495
5496 assert(pmap_verify_free(page->phys_page));
5497 PMAP_ENTER(kernel_pmap,
5498 *address + page_map_offset,
5499 page,
5500 VM_PROT_DEFAULT,
5501 cache_attr,
5502 TRUE);
5503 }
5504
5505 vm_paging_objects_mapped_slow++;
5506 vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5507
5508 return KERN_SUCCESS;
5509 }
5510
5511 /*
5512 * ENCRYPTED SWAP:
5513 * vm_paging_unmap_object:
5514 * Unmaps part of a VM object's pages from the kernel
5515 * virtual address space.
5516 * Context:
5517 * The VM object is locked. This lock will get
5518 * dropped and re-acquired though.
5519 */
5520 void
5521 vm_paging_unmap_object(
5522 vm_object_t object,
5523 vm_map_offset_t start,
5524 vm_map_offset_t end)
5525 {
5526 kern_return_t kr;
5527 int i;
5528
5529 if ((vm_paging_base_address == 0) ||
5530 (start < vm_paging_base_address) ||
5531 (end > (vm_paging_base_address
5532 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
5533 /*
5534 * We didn't use our pre-allocated pool of
5535 * kernel virtual address. Deallocate the
5536 * virtual memory.
5537 */
5538 if (object != VM_OBJECT_NULL) {
5539 vm_object_unlock(object);
5540 }
5541 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5542 if (object != VM_OBJECT_NULL) {
5543 vm_object_lock(object);
5544 }
5545 assert(kr == KERN_SUCCESS);
5546 } else {
5547 /*
5548 * We used a kernel virtual address from our
5549 * pre-allocated pool. Put it back in the pool
5550 * for next time.
5551 */
5552 assert(end - start == PAGE_SIZE);
5553 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5554
5555 /* undo the pmap mapping */
5556 pmap_remove(kernel_pmap, start, end);
5557
5558 simple_lock(&vm_paging_lock);
5559 vm_paging_page_inuse[i] = FALSE;
5560 simple_unlock(&vm_paging_lock);
5561 }
5562 }
5563
5564 /*
5565 * Encryption data.
5566 * "iv" is the "initial vector". Ideally, we want to
5567 * have a different one for each page we encrypt, so that
5568 * crackers can't find encryption patterns too easily.
5569 */
5570 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
5571 boolean_t swap_crypt_ctx_initialized = FALSE;
5572 aes_32t swap_crypt_key[8]; /* big enough for a 256 key */
5573 aes_ctx swap_crypt_ctx;
5574 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5575
5576 #if DEBUG
5577 boolean_t swap_crypt_ctx_tested = FALSE;
5578 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5579 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5580 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5581 #endif /* DEBUG */
5582
5583 extern u_long random(void);
5584
5585 /*
5586 * Initialize the encryption context: key and key size.
5587 */
5588 void swap_crypt_ctx_initialize(void); /* forward */
5589 void
5590 swap_crypt_ctx_initialize(void)
5591 {
5592 unsigned int i;
5593
5594 /*
5595 * No need for locking to protect swap_crypt_ctx_initialized
5596 * because the first use of encryption will come from the
5597 * pageout thread (we won't pagein before there's been a pageout)
5598 * and there's only one pageout thread.
5599 */
5600 if (swap_crypt_ctx_initialized == FALSE) {
5601 for (i = 0;
5602 i < (sizeof (swap_crypt_key) /
5603 sizeof (swap_crypt_key[0]));
5604 i++) {
5605 swap_crypt_key[i] = random();
5606 }
5607 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5608 SWAP_CRYPT_AES_KEY_SIZE,
5609 &swap_crypt_ctx.encrypt);
5610 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5611 SWAP_CRYPT_AES_KEY_SIZE,
5612 &swap_crypt_ctx.decrypt);
5613 swap_crypt_ctx_initialized = TRUE;
5614 }
5615
5616 #if DEBUG
5617 /*
5618 * Validate the encryption algorithms.
5619 */
5620 if (swap_crypt_ctx_tested == FALSE) {
5621 /* initialize */
5622 for (i = 0; i < 4096; i++) {
5623 swap_crypt_test_page_ref[i] = (char) i;
5624 }
5625 /* encrypt */
5626 aes_encrypt_cbc(swap_crypt_test_page_ref,
5627 swap_crypt_null_iv,
5628 PAGE_SIZE / AES_BLOCK_SIZE,
5629 swap_crypt_test_page_encrypt,
5630 &swap_crypt_ctx.encrypt);
5631 /* decrypt */
5632 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5633 swap_crypt_null_iv,
5634 PAGE_SIZE / AES_BLOCK_SIZE,
5635 swap_crypt_test_page_decrypt,
5636 &swap_crypt_ctx.decrypt);
5637 /* compare result with original */
5638 for (i = 0; i < 4096; i ++) {
5639 if (swap_crypt_test_page_decrypt[i] !=
5640 swap_crypt_test_page_ref[i]) {
5641 panic("encryption test failed");
5642 }
5643 }
5644
5645 /* encrypt again */
5646 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5647 swap_crypt_null_iv,
5648 PAGE_SIZE / AES_BLOCK_SIZE,
5649 swap_crypt_test_page_decrypt,
5650 &swap_crypt_ctx.encrypt);
5651 /* decrypt in place */
5652 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5653 swap_crypt_null_iv,
5654 PAGE_SIZE / AES_BLOCK_SIZE,
5655 swap_crypt_test_page_decrypt,
5656 &swap_crypt_ctx.decrypt);
5657 for (i = 0; i < 4096; i ++) {
5658 if (swap_crypt_test_page_decrypt[i] !=
5659 swap_crypt_test_page_ref[i]) {
5660 panic("in place encryption test failed");
5661 }
5662 }
5663
5664 swap_crypt_ctx_tested = TRUE;
5665 }
5666 #endif /* DEBUG */
5667 }
5668
5669 /*
5670 * ENCRYPTED SWAP:
5671 * vm_page_encrypt:
5672 * Encrypt the given page, for secure paging.
5673 * The page might already be mapped at kernel virtual
5674 * address "kernel_mapping_offset". Otherwise, we need
5675 * to map it.
5676 *
5677 * Context:
5678 * The page's object is locked, but this lock will be released
5679 * and re-acquired.
5680 * The page is busy and not accessible by users (not entered in any pmap).
5681 */
5682 void
5683 vm_page_encrypt(
5684 vm_page_t page,
5685 vm_map_offset_t kernel_mapping_offset)
5686 {
5687 int clear_refmod = 0;
5688 kern_return_t kr;
5689 boolean_t page_was_referenced;
5690 boolean_t page_was_modified;
5691 vm_map_size_t kernel_mapping_size;
5692 vm_offset_t kernel_vaddr;
5693 union {
5694 unsigned char aes_iv[AES_BLOCK_SIZE];
5695 struct {
5696 memory_object_t pager_object;
5697 vm_object_offset_t paging_offset;
5698 } vm;
5699 } encrypt_iv;
5700
5701 if (! vm_pages_encrypted) {
5702 vm_pages_encrypted = TRUE;
5703 }
5704
5705 assert(page->busy);
5706 assert(page->dirty || page->precious);
5707
5708 if (page->encrypted) {
5709 /*
5710 * Already encrypted: no need to do it again.
5711 */
5712 vm_page_encrypt_already_encrypted_counter++;
5713 return;
5714 }
5715 ASSERT_PAGE_DECRYPTED(page);
5716
5717 /*
5718 * Gather the "reference" and "modified" status of the page.
5719 * We'll restore these values after the encryption, so that
5720 * the encryption is transparent to the rest of the system
5721 * and doesn't impact the VM's LRU logic.
5722 */
5723 page_was_referenced =
5724 (page->reference || pmap_is_referenced(page->phys_page));
5725 page_was_modified =
5726 (page->dirty || pmap_is_modified(page->phys_page));
5727
5728 if (kernel_mapping_offset == 0) {
5729 /*
5730 * The page hasn't already been mapped in kernel space
5731 * by the caller. Map it now, so that we can access
5732 * its contents and encrypt them.
5733 */
5734 kernel_mapping_size = PAGE_SIZE;
5735 kr = vm_paging_map_object(&kernel_mapping_offset,
5736 page,
5737 page->object,
5738 page->offset,
5739 &kernel_mapping_size);
5740 if (kr != KERN_SUCCESS) {
5741 panic("vm_page_encrypt: "
5742 "could not map page in kernel: 0x%x\n",
5743 kr);
5744 }
5745 } else {
5746 kernel_mapping_size = 0;
5747 }
5748 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5749
5750 if (swap_crypt_ctx_initialized == FALSE) {
5751 swap_crypt_ctx_initialize();
5752 }
5753 assert(swap_crypt_ctx_initialized);
5754
5755 /*
5756 * Prepare an "initial vector" for the encryption.
5757 * We use the "pager" and the "paging_offset" for that
5758 * page to obfuscate the encrypted data a bit more and
5759 * prevent crackers from finding patterns that they could
5760 * use to break the key.
5761 */
5762 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5763 encrypt_iv.vm.pager_object = page->object->pager;
5764 encrypt_iv.vm.paging_offset =
5765 page->object->paging_offset + page->offset;
5766
5767 vm_object_unlock(page->object);
5768
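/*
 * The pager/offset block assembled above is itself run through
 * AES-CBC (with the fixed null IV) so that the per-page IV used for
 * the actual page encryption is not predictable plaintext.
 */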
5769 /* encrypt the "initial vector" */
5770 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5771 swap_crypt_null_iv,
5772 1,
5773 &encrypt_iv.aes_iv[0],
5774 &swap_crypt_ctx.encrypt);
5775
5776 /*
5777 * Encrypt the page.
5778 */
5779 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5780 &encrypt_iv.aes_iv[0],
5781 PAGE_SIZE / AES_BLOCK_SIZE,
5782 (unsigned char *) kernel_vaddr,
5783 &swap_crypt_ctx.encrypt);
5784
5785 vm_page_encrypt_counter++;
5786
5787 vm_object_lock(page->object);
5788
5789 /*
5790 * Unmap the page from the kernel's address space,
5791 * if we had to map it ourselves. Otherwise, let
5792 * the caller undo the mapping if needed.
5793 */
5794 if (kernel_mapping_size != 0) {
5795 vm_paging_unmap_object(page->object,
5796 kernel_mapping_offset,
5797 kernel_mapping_offset + kernel_mapping_size);
5798 }
5799
5800 /*
5801 * Restore the "reference" and "modified" bits.
5802 * This should clean up any impact the encryption had
5803 * on them.
5804 */
5805 if (! page_was_referenced) {
5806 clear_refmod |= VM_MEM_REFERENCED;
5807 page->reference = FALSE;
5808 }
5809 if (! page_was_modified) {
5810 clear_refmod |= VM_MEM_MODIFIED;
5811 page->dirty = FALSE;
5812 }
5813 if (clear_refmod)
5814 pmap_clear_refmod(page->phys_page, clear_refmod);
5815
5816 page->encrypted = TRUE;
5817 }
5818
5819 /*
5820 * ENCRYPTED SWAP:
5821 * vm_page_decrypt:
5822 * Decrypt the given page.
5823 * The page might already be mapped at kernel virtual
5824 * address "kernel_mapping_offset". Otherwise, we need
5825 * to map it.
5826 *
5827 * Context:
5828 * The page's VM object is locked but will be unlocked and relocked.
5829 * The page is busy and not accessible by users (not entered in any pmap).
5830 */
5831 void
5832 vm_page_decrypt(
5833 vm_page_t page,
5834 vm_map_offset_t kernel_mapping_offset)
5835 {
5836 int clear_refmod = 0;
5837 kern_return_t kr;
5838 vm_map_size_t kernel_mapping_size;
5839 vm_offset_t kernel_vaddr;
5840 boolean_t page_was_referenced;
5841 union {
5842 unsigned char aes_iv[AES_BLOCK_SIZE];
5843 struct {
5844 memory_object_t pager_object;
5845 vm_object_offset_t paging_offset;
5846 } vm;
5847 } decrypt_iv;
5848
5849 assert(page->busy);
5850 assert(page->encrypted);
5851
5852 /*
5853 * Gather the "reference" status of the page.
5854 * We'll restore its value after the decryption, so that
5855 * the decryption is transparent to the rest of the system
5856 * and doesn't impact the VM's LRU logic.
5857 */
5858 page_was_referenced =
5859 (page->reference || pmap_is_referenced(page->phys_page));
5860
5861 if (kernel_mapping_offset == 0) {
5862 /*
5863 * The page hasn't already been mapped in kernel space
5864 * by the caller. Map it now, so that we can access
5865 * its contents and decrypt them.
5866 */
5867 kernel_mapping_size = PAGE_SIZE;
5868 kr = vm_paging_map_object(&kernel_mapping_offset,
5869 page,
5870 page->object,
5871 page->offset,
5872 &kernel_mapping_size);
5873 if (kr != KERN_SUCCESS) {
5874 panic("vm_page_decrypt: "
5875 "could not map page in kernel: 0x%x\n");
5876 }
5877 } else {
5878 kernel_mapping_size = 0;
5879 }
5880 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5881
5882 assert(swap_crypt_ctx_initialized);
5883
5884 /*
5885 * Prepare an "initial vector" for the decryption.
5886 * It has to be the same as the "initial vector" we
5887 * used to encrypt that page.
5888 */
5889 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5890 decrypt_iv.vm.pager_object = page->object->pager;
5891 decrypt_iv.vm.paging_offset =
5892 page->object->paging_offset + page->offset;
5893
5894 vm_object_unlock(page->object);
5895
5896 /* encrypt the "initial vector" */
5897 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5898 swap_crypt_null_iv,
5899 1,
5900 &decrypt_iv.aes_iv[0],
5901 &swap_crypt_ctx.encrypt);
5902
5903 /*
5904 * Decrypt the page.
5905 */
5906 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5907 &decrypt_iv.aes_iv[0],
5908 PAGE_SIZE / AES_BLOCK_SIZE,
5909 (unsigned char *) kernel_vaddr,
5910 &swap_crypt_ctx.decrypt);
5911 vm_page_decrypt_counter++;
5912
5913 vm_object_lock(page->object);
5914
5915 /*
5916 * Unmap the page from the kernel's address space,
5917 * if we had to map it ourselves. Otherwise, let
5918 * the caller undo the mapping if needed.
5919 */
5920 if (kernel_mapping_size != 0) {
5921 vm_paging_unmap_object(page->object,
5922 kernel_vaddr,
5923 kernel_vaddr + PAGE_SIZE);
5924 }
5925
5926 /*
5927 * After decryption, the page is actually clean.
5928 * It was encrypted as part of paging, which "cleans"
5929 * the "dirty" pages.
5930 * No one could access it after it was encrypted
5931 * and the decryption doesn't count.
5932 */
5933 page->dirty = FALSE;
5934 clear_refmod = VM_MEM_MODIFIED;
5935
5936 /* restore the "reference" bit */
5937 if (! page_was_referenced) {
5938 page->reference = FALSE;
5939 clear_refmod |= VM_MEM_REFERENCED;
5940 }
5941 pmap_clear_refmod(page->phys_page, clear_refmod);
5942
5943 page->encrypted = FALSE;
5944
5945 /*
5946 * We've just modified the page's contents via the data cache and part
5947 * of the new contents might still be in the cache and not yet in RAM.
5948 * Since the page is now available and might get gathered in a UPL to
5949 * be part of a DMA transfer from a driver that expects the memory to
5950 * be coherent at this point, we have to flush the data cache.
5951 */
5952 pmap_sync_page_attributes_phys(page->phys_page);
5953 /*
5954 * Since the page is not mapped yet, some code might assume that it
5955 * doesn't need to invalidate the instruction cache when writing to
5956 * that page. That code relies on "no_isync" being set, so that the
5957 * caches get synchronized when the page is first mapped. So we need
5958 * to set "no_isync" here too, despite the fact that we just
5959 * synchronized the caches above...
5960 */
5961 page->no_isync = TRUE;
5962 }
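/*
 * ENCRYPTED SWAP:
 * Illustrative sketch (editorial addition, not compiled): how a fault
 * path might use vm_page_decrypt() when it encounters a page that was
 * encrypted by the pageout path.  The calling convention is assumed
 * from the body above: the caller holds the object lock, has marked
 * the page "busy", and passes 0 as the kernel mapping offset to let
 * vm_page_decrypt() map the page itself.
 */
#if 0	/* example only */
static void
example_decrypt_if_needed(vm_page_t m)
{
	assert(m->busy);
	if (m->encrypted) {
		/* maps, decrypts in place, then unmaps the page */
		vm_page_decrypt(m, 0);
	}
	assert(!m->encrypted);
}
#endif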
5963
5964 unsigned long upl_encrypt_upls = 0;
5965 unsigned long upl_encrypt_pages = 0;
5966
5967 /*
5968 * ENCRYPTED SWAP:
5969 *
5970 * upl_encrypt:
5971 * Encrypts all the pages in the UPL, within the specified range.
5972 *
5973 */
5974 void
5975 upl_encrypt(
5976 upl_t upl,
5977 upl_offset_t crypt_offset,
5978 upl_size_t crypt_size)
5979 {
5980 upl_size_t upl_size;
5981 upl_offset_t upl_offset;
5982 vm_object_t upl_object;
5983 vm_page_t page;
5984 vm_object_t shadow_object;
5985 vm_object_offset_t shadow_offset;
5986 vm_object_offset_t paging_offset;
5987 vm_object_offset_t base_offset;
5988
5989 upl_encrypt_upls++;
5990 upl_encrypt_pages += crypt_size / PAGE_SIZE;
5991
5992 upl_lock(upl);
5993
5994 upl_object = upl->map_object;
5995 upl_offset = upl->offset;
5996 upl_size = upl->size;
5997
5998 upl_unlock(upl);
5999
6000 vm_object_lock(upl_object);
6001
6002 /*
6003 * Find the VM object that contains the actual pages.
6004 */
6005 if (upl_object->pageout) {
6006 shadow_object = upl_object->shadow;
6007 /*
6008 * The offset in the shadow object is actually also
6009 * accounted for in upl->offset. It possibly shouldn't be
6010 * this way, but for now don't account for it twice.
6011 */
6012 shadow_offset = 0;
6013 assert(upl_object->paging_offset == 0); /* XXX ? */
6014 vm_object_lock(shadow_object);
6015 } else {
6016 shadow_object = upl_object;
6017 shadow_offset = 0;
6018 }
6019
6020 paging_offset = shadow_object->paging_offset;
6021 vm_object_paging_begin(shadow_object);
6022
6023 if (shadow_object != upl_object) {
6024 vm_object_unlock(shadow_object);
6025 }
6026 vm_object_unlock(upl_object);
6027
6028 base_offset = shadow_offset;
6029 base_offset += upl_offset;
6030 base_offset += crypt_offset;
6031 base_offset -= paging_offset;
6032 /*
6033 * Unmap the pages, so that nobody can continue accessing them while
6034 * they're encrypted. After that point, all accesses to these pages
6035 * will cause a page fault and block while the page is being encrypted
6036 * (busy). After the encryption completes, any access will cause a
6037 * page fault and the page gets decrypted at that time.
6038 */
6039 assert(crypt_offset + crypt_size <= upl_size);
6040 vm_object_pmap_protect(shadow_object,
6041 base_offset,
6042 (vm_object_size_t)crypt_size,
6043 PMAP_NULL,
6044 0,
6045 VM_PROT_NONE);
6046
6047 /* XXX FBDP could the object have changed significantly here ? */
6048 vm_object_lock(shadow_object);
6049
6050 for (upl_offset = 0;
6051 upl_offset < crypt_size;
6052 upl_offset += PAGE_SIZE) {
6053 page = vm_page_lookup(shadow_object,
6054 base_offset + upl_offset);
6055 if (page == VM_PAGE_NULL) {
6056 panic("upl_encrypt: "
6057 "no page for (obj=%p,off=%lld+%d)!\n",
6058 shadow_object,
6059 base_offset,
6060 upl_offset);
6061 }
6062 vm_page_encrypt(page, 0);
6063 }
6064
6065 vm_object_paging_end(shadow_object);
6066 vm_object_unlock(shadow_object);
6067 }
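/*
 * Illustrative sketch (editorial addition, not compiled): a pageout
 * path that wants its outgoing pages encrypted would call
 * upl_encrypt() on the UPL before handing it to the pager.  The UPL
 * below is assumed to have been created and populated by the caller;
 * encrypting the whole UPL is just one possible choice of range.
 */
#if 0	/* example only */
static void
example_encrypt_outgoing_upl(upl_t upl)
{
	/* encrypt every page covered by the UPL before it goes to disk */
	upl_encrypt(upl, (upl_offset_t) 0, upl->size);
}
#endif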
6068
6069 vm_size_t
6070 upl_get_internal_pagelist_offset(void)
6071 {
6072 return sizeof(struct upl);
6073 }
6074
6075 void
6076 upl_clear_dirty(
6077 upl_t upl,
6078 boolean_t value)
6079 {
6080 if (value) {
6081 upl->flags |= UPL_CLEAR_DIRTY;
6082 } else {
6083 upl->flags &= ~UPL_CLEAR_DIRTY;
6084 }
6085 }
6086
6087
6088 #ifdef MACH_BSD
6089
6090 boolean_t upl_page_present(upl_page_info_t *upl, int index)
6091 {
6092 return(UPL_PAGE_PRESENT(upl, index));
6093 }
6094 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
6095 {
6096 return(UPL_DIRTY_PAGE(upl, index));
6097 }
6098 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
6099 {
6100 return(UPL_VALID_PAGE(upl, index));
6101 }
6102 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
6103 {
6104 return(UPL_PHYS_PAGE(upl, index));
6105 }
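/*
 * Illustrative sketch (editorial addition, not compiled): the
 * accessors above are typically used to walk the upl_page_info_t
 * array that describes a UPL and act on each valid page.  "pl",
 * "page_count" and example_process_page() are hypothetical names
 * supplied here for illustration.
 */
#if 0	/* example only */
static void
example_walk_upl_pages(upl_page_info_t *pl, int page_count)
{
	int i;

	for (i = 0; i < page_count; i++) {
		if (!upl_valid_page(pl, i))
			continue;	/* nothing resident at this index */
		if (upl_dirty_page(pl, i))
			example_process_page(upl_phys_page(pl, i));	/* hypothetical helper */
	}
}
#endif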
6106
6107 void
6108 vm_countdirtypages(void)
6109 {
6110 vm_page_t m;
6111 int dpages;
6112 int pgopages;
6113 int precpages;
6114
6115
6116 dpages = 0;
6117 pgopages = 0;
6118 precpages = 0;
6119
6120 vm_page_lock_queues();
6121 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6122 do {
6123 if (m == (vm_page_t) 0) break;
6124
6125 if (m->dirty) dpages++;
6126 if (m->pageout) pgopages++;
6127 if (m->precious) precpages++;
6128
6129 assert(m->object != kernel_object);
6130 m = (vm_page_t) queue_next(&m->pageq);
6131 if (m == (vm_page_t) 0) break;
6132
6133 } while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m));
6134 vm_page_unlock_queues();
6135
6136 vm_page_lock_queues();
6137 m = (vm_page_t) queue_first(&vm_page_queue_zf);
6138 do {
6139 if (m == (vm_page_t) 0) break;
6140
6141 if (m->dirty) dpages++;
6142 if (m->pageout) pgopages++;
6143 if (m->precious) precpages++;
6144
6145 assert(m->object != kernel_object);
6146 m = (vm_page_t) queue_next(&m->pageq);
6147 if (m == (vm_page_t) 0) break;
6148
6149 } while (!queue_end(&vm_page_queue_zf, (queue_entry_t) m));
6150 vm_page_unlock_queues();
6151
6152 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6153
6154 dpages = 0;
6155 pgopages = 0;
6156 precpages = 0;
6157
6158 vm_page_lock_queues();
6159 m = (vm_page_t) queue_first(&vm_page_queue_active);
6160
6161 do {
6162 if (m == (vm_page_t) 0) break;
6163 if (m->dirty) dpages++;
6164 if (m->pageout) pgopages++;
6165 if (m->precious) precpages++;
6166
6167 assert(m->object != kernel_object);
6168 m = (vm_page_t) queue_next(&m->pageq);
6169 if (m == (vm_page_t) 0) break;
6170
6171 } while (!queue_end(&vm_page_queue_active, (queue_entry_t) m));
6172 vm_page_unlock_queues();
6173
6174 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6175
6176 }
6177 #endif /* MACH_BSD */
6178
6179 ppnum_t upl_get_highest_page(
6180 upl_t upl)
6181 {
6182 return upl->highest_page;
6183 }
6184
6185 #ifdef UPL_DEBUG
6186 kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6187 {
6188 upl->ubc_alias1 = alias1;
6189 upl->ubc_alias2 = alias2;
6190 return KERN_SUCCESS;
6191 }
6192 int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6193 {
6194 if(al)
6195 *al = upl->ubc_alias1;
6196 if(al2)
6197 *al2 = upl->ubc_alias2;
6198 return KERN_SUCCESS;
6199 }
6200 #endif /* UPL_DEBUG */
6201
6202
6203
6204 #if MACH_KDB
6205 #include <ddb/db_output.h>
6206 #include <ddb/db_print.h>
6207 #include <vm/vm_print.h>
6208
6209 #define printf kdbprintf
6210 void db_pageout(void);
6211
6212 void
6213 db_vm(void)
6214 {
6215
6216 iprintf("VM Statistics:\n");
6217 db_indent += 2;
6218 iprintf("pages:\n");
6219 db_indent += 2;
6220 iprintf("activ %5d inact %5d free %5d",
6221 vm_page_active_count, vm_page_inactive_count,
6222 vm_page_free_count);
6223 printf(" wire %5d gobbl %5d\n",
6224 vm_page_wire_count, vm_page_gobble_count);
6225 db_indent -= 2;
6226 iprintf("target:\n");
6227 db_indent += 2;
6228 iprintf("min %5d inact %5d free %5d",
6229 vm_page_free_min, vm_page_inactive_target,
6230 vm_page_free_target);
6231 printf(" resrv %5d\n", vm_page_free_reserved);
6232 db_indent -= 2;
6233 iprintf("pause:\n");
6234 db_pageout();
6235 db_indent -= 2;
6236 }
6237
6238 #if MACH_COUNTERS
6239 extern int c_laundry_pages_freed;
6240 #endif /* MACH_COUNTERS */
6241
6242 void
6243 db_pageout(void)
6244 {
6245 iprintf("Pageout Statistics:\n");
6246 db_indent += 2;
6247 iprintf("active %5d inactv %5d\n",
6248 vm_pageout_active, vm_pageout_inactive);
6249 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
6250 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6251 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6252 iprintf("used %5d clean %5d dirty %5d\n",
6253 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6254 vm_pageout_inactive_dirty);
6255 #if MACH_COUNTERS
6256 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6257 #endif /* MACH_COUNTERS */
6258 #if MACH_CLUSTER_STATS
6259 iprintf("Cluster Statistics:\n");
6260 db_indent += 2;
6261 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
6262 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6263 vm_pageout_cluster_collisions);
6264 iprintf("clusters %5d conversions %5d\n",
6265 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6266 db_indent -= 2;
6267 iprintf("Target Statistics:\n");
6268 db_indent += 2;
6269 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
6270 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6271 vm_pageout_target_page_freed);
6272 db_indent -= 2;
6273 #endif /* MACH_CLUSTER_STATS */
6274 db_indent -= 2;
6275 }
6276
6277 #endif /* MACH_KDB */