1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67
68 #include <debug.h>
69 #include <mach_pagemap.h>
70 #include <mach_cluster_stats.h>
71 #include <mach_kdb.h>
72 #include <advisory_pageout.h>
73
74 #include <mach/mach_types.h>
75 #include <mach/memory_object.h>
76 #include <mach/memory_object_default.h>
77 #include <mach/memory_object_control_server.h>
78 #include <mach/mach_host_server.h>
79 #include <mach/upl.h>
80 #include <mach/vm_map.h>
81 #include <mach/vm_param.h>
82 #include <mach/vm_statistics.h>
83
84 #include <kern/kern_types.h>
85 #include <kern/counters.h>
86 #include <kern/host_statistics.h>
87 #include <kern/machine.h>
88 #include <kern/misc_protos.h>
89 #include <kern/thread.h>
90 #include <kern/xpr.h>
91 #include <kern/kalloc.h>
92
93 #include <machine/vm_tuning.h>
94
95 #include <vm/pmap.h>
96 #include <vm/vm_fault.h>
97 #include <vm/vm_map.h>
98 #include <vm/vm_object.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/vm_protos.h> /* must be last */
102
103 /*
104 * ENCRYPTED SWAP:
105 */
106 #ifdef __ppc__
107 #include <ppc/mappings.h>
108 #endif /* __ppc__ */
109 #include <../bsd/crypto/aes/aes.h>
110
111 extern ipc_port_t memory_manager_default;
112
113
114 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE
115 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 10000 /* maximum iterations of the active queue to move pages to inactive */
116 #endif
117
118 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
119 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
120 #endif
121
122 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
123 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
124 #endif
125
126 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
127 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
128 #endif
129
130 #ifndef VM_PAGE_LAUNDRY_MAX
131 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
132 #endif /* VM_PAGE_LAUNDRY_MAX */
133
134 #ifndef VM_PAGEOUT_BURST_WAIT
135 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
136 #endif /* VM_PAGEOUT_BURST_WAIT */
137
138 #ifndef VM_PAGEOUT_EMPTY_WAIT
139 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
140 #endif /* VM_PAGEOUT_EMPTY_WAIT */
141
142 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
143 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
144 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
145
146 #ifndef VM_PAGEOUT_IDLE_WAIT
147 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
148 #endif /* VM_PAGEOUT_IDLE_WAIT */
149
150
151 /*
152 * To obtain a reasonable LRU approximation, the inactive queue
153 * needs to be large enough to give pages on it a chance to be
154 * referenced a second time. This macro defines the fraction
155 * of active+inactive pages that should be inactive.
156 * The pageout daemon uses it to update vm_page_inactive_target.
157 *
158 * If vm_page_free_count falls below vm_page_free_target and
159 * vm_page_inactive_count is below vm_page_inactive_target,
160 * then the pageout daemon starts running.
161 */
162
163 #ifndef VM_PAGE_INACTIVE_TARGET
164 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
165 #endif /* VM_PAGE_INACTIVE_TARGET */
166
167 /*
168 * Once the pageout daemon starts running, it keeps going
169 * until vm_page_free_count meets or exceeds vm_page_free_target.
170 */
171
172 #ifndef VM_PAGE_FREE_TARGET
173 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
174 #endif /* VM_PAGE_FREE_TARGET */
175
176 /*
177 * The pageout daemon always starts running once vm_page_free_count
178 * falls below vm_page_free_min.
179 */
180
181 #ifndef VM_PAGE_FREE_MIN
182 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
183 #endif /* VM_PAGE_FREE_MIN */
184
185 /*
186 * When vm_page_free_count falls below vm_page_free_reserved,
187 * only vm-privileged threads can allocate pages. vm-privilege
188 * allows the pageout daemon and default pager (and any other
189 * associated threads needed for default pageout) to continue
190 * operation by dipping into the reserved pool of pages.
191 */
192
193 #ifndef VM_PAGE_FREE_RESERVED
194 #define VM_PAGE_FREE_RESERVED(n) \
195 ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
196 #endif /* VM_PAGE_FREE_RESERVED */
197
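/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): how the threshold macros above relate for a hypothetical
 * pool of pages.  The function name and the page counts are made up
 * for illustration; the arithmetic simply applies the macros.
 */
#if 0
static void
vm_pageout_threshold_example(void)
{
	unsigned int free_after_reserve = 100000;	/* hypothetical pool */

	/* 15 + 100000/80  = 1265 pages before the daemon may stop */
	unsigned int free_target = VM_PAGE_FREE_TARGET(free_after_reserve);

	/* 10 + 100000/100 = 1010 pages; below this the daemon always runs */
	unsigned int free_min = VM_PAGE_FREE_MIN(free_after_reserve);

	/* 6 * 16 + 0 = 96 pages kept for vm-privileged threads */
	unsigned int free_reserved = VM_PAGE_FREE_RESERVED(0);

	/* one third of 300000 active+inactive pages should be inactive */
	unsigned int inactive_target = VM_PAGE_INACTIVE_TARGET(300000);

	(void)free_target; (void)free_min;
	(void)free_reserved; (void)inactive_target;
}
#endif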
198
199 /*
200 * must hold the page queues lock to
201 * manipulate this structure
202 */
203 struct vm_pageout_queue {
204 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
205 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
206 unsigned int pgo_maxlaundry;
207
208 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
209 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
210 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
211 :0;
212 };
213
214 #define VM_PAGE_Q_THROTTLED(q) \
215 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
216
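/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): the intended interplay between VM_PAGE_Q_THROTTLED() and
 * the pgo_ flags.  A producer such as vm_pageout_scan() checks the
 * throttle before adding laundry and asks for a wakeup when the
 * queue is full; the completion path (vm_pageout_throttle_up(),
 * later in this file) decrements pgo_laundry and delivers it.  The
 * function name is hypothetical, locking is elided, and the real
 * code uses a timed wait while holding the page queues lock.
 */
#if 0
static void
vm_pageout_queue_throttle_example(struct vm_pageout_queue *q)
{
	if (VM_PAGE_Q_THROTTLED(q)) {
		/* too much laundry in flight: request a wakeup and block */
		q->pgo_throttled = TRUE;
		assert_wait((event_t) &q->pgo_laundry, THREAD_INTERRUPTIBLE);
		thread_block(THREAD_CONTINUE_NULL);
	} else {
		/* room left on the queue: account for one more laundry page */
		q->pgo_laundry++;
	}
}
#endif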
217
218 /*
219 * Exported variable used to broadcast the activation of the pageout scan.
220 * Working Set uses this to throttle its use of pmap removes. In this
221 * way, code which runs within memory in an uncontested context does
222 * not keep encountering soft faults.
223 */
224
225 unsigned int vm_pageout_scan_event_counter = 0;
226
227 /*
228 * Forward declarations for internal routines.
229 */
230
231 static void vm_pageout_garbage_collect(int);
232 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
233 static void vm_pageout_iothread_external(void);
234 static void vm_pageout_iothread_internal(void);
235 static void vm_pageout_queue_steal(vm_page_t);
236
237 extern void vm_pageout_continue(void);
238 extern void vm_pageout_scan(void);
239
240 unsigned int vm_pageout_reserved_internal = 0;
241 unsigned int vm_pageout_reserved_really = 0;
242
243 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
244 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
245 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
246 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
247 unsigned int vm_pageout_deadlock_relief = 0;
248 unsigned int vm_pageout_inactive_relief = 0;
249 unsigned int vm_pageout_burst_active_throttle = 0;
250 unsigned int vm_pageout_burst_inactive_throttle = 0;
251
252 /*
253 * Protection against zero fill flushing live working sets derived
254 * from existing backing store and files
255 */
256 unsigned int vm_accellerate_zf_pageout_trigger = 400;
257 unsigned int vm_zf_iterator;
258 unsigned int vm_zf_iterator_count = 40;
259 unsigned int last_page_zf;
260 unsigned int vm_zf_count = 0;
261
262 /*
263 * These variables record the pageout daemon's actions:
264 * how many pages it looks at and what happens to those pages.
265 * No locking needed because only one thread modifies the variables.
266 */
267
268 unsigned int vm_pageout_active = 0; /* debugging */
269 unsigned int vm_pageout_inactive = 0; /* debugging */
270 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
271 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
272 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
273 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
274 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
275 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
276 unsigned int vm_pageout_inactive_used = 0; /* debugging */
277 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
278 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
279 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
280 unsigned int vm_pageout_purged_objects = 0; /* debugging */
281 unsigned int vm_stat_discard = 0; /* debugging */
282 unsigned int vm_stat_discard_sent = 0; /* debugging */
283 unsigned int vm_stat_discard_failure = 0; /* debugging */
284 unsigned int vm_stat_discard_throttle = 0; /* debugging */
285
286 unsigned int vm_pageout_scan_active_throttled = 0;
287 unsigned int vm_pageout_scan_inactive_throttled = 0;
288 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
289 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
290 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
291 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
292 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
293 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
294 /*
295 * Backing store throttle when BS is exhausted
296 */
297 unsigned int vm_backing_store_low = 0;
298
299 unsigned int vm_pageout_out_of_line = 0;
300 unsigned int vm_pageout_in_place = 0;
301
302 /*
303 * ENCRYPTED SWAP:
304 * counters and statistics...
305 */
306 unsigned long vm_page_decrypt_counter = 0;
307 unsigned long vm_page_decrypt_for_upl_counter = 0;
308 unsigned long vm_page_encrypt_counter = 0;
309 unsigned long vm_page_encrypt_abort_counter = 0;
310 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
311 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
312
313
314 struct vm_pageout_queue vm_pageout_queue_internal;
315 struct vm_pageout_queue vm_pageout_queue_external;
316
317
318 /*
319 * Routine: vm_backing_store_disable
320 * Purpose:
321 * Suspend non-privileged threads wishing to extend
322 * backing store when we are low on backing store
323 * (Synchronized by caller)
324 */
325 void
326 vm_backing_store_disable(
327 boolean_t disable)
328 {
329 if(disable) {
330 vm_backing_store_low = 1;
331 } else {
332 if(vm_backing_store_low) {
333 vm_backing_store_low = 0;
334 thread_wakeup((event_t) &vm_backing_store_low);
335 }
336 }
337 }
338
339
340 /*
341 * Routine: vm_pageout_object_allocate
342 * Purpose:
343 * Allocate an object for use as out-of-line memory in a
344 * data_return/data_initialize message.
345 * The page must be in an unlocked object.
346 *
347 * If the page belongs to a trusted pager, cleaning in place
348 * will be used, which utilizes a special "pageout object"
349 * containing private alias pages for the real page frames.
350 * Untrusted pagers use normal out-of-line memory.
351 */
352 vm_object_t
353 vm_pageout_object_allocate(
354 vm_page_t m,
355 vm_size_t size,
356 vm_object_offset_t offset)
357 {
358 vm_object_t object = m->object;
359 vm_object_t new_object;
360
361 assert(object->pager_ready);
362
363 new_object = vm_object_allocate(size);
364
365 if (object->pager_trusted) {
366 assert (offset < object->size);
367
368 vm_object_lock(new_object);
369 new_object->pageout = TRUE;
370 new_object->shadow = object;
371 new_object->can_persist = FALSE;
372 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
373 new_object->shadow_offset = offset;
374 vm_object_unlock(new_object);
375
376 /*
377 * Take a paging reference on the object. This will be dropped
378 * in vm_pageout_object_terminate()
379 */
380 vm_object_lock(object);
381 vm_object_paging_begin(object);
382 vm_page_lock_queues();
383 vm_page_unlock_queues();
384 vm_object_unlock(object);
385
386 vm_pageout_in_place++;
387 } else
388 vm_pageout_out_of_line++;
389 return(new_object);
390 }
391
392 #if MACH_CLUSTER_STATS
393 unsigned long vm_pageout_cluster_dirtied = 0;
394 unsigned long vm_pageout_cluster_cleaned = 0;
395 unsigned long vm_pageout_cluster_collisions = 0;
396 unsigned long vm_pageout_cluster_clusters = 0;
397 unsigned long vm_pageout_cluster_conversions = 0;
398 unsigned long vm_pageout_target_collisions = 0;
399 unsigned long vm_pageout_target_page_dirtied = 0;
400 unsigned long vm_pageout_target_page_freed = 0;
401 #define CLUSTER_STAT(clause) clause
402 #else /* MACH_CLUSTER_STATS */
403 #define CLUSTER_STAT(clause)
404 #endif /* MACH_CLUSTER_STATS */
405
406 /*
407 * Routine: vm_pageout_object_terminate
408 * Purpose:
409 * Destroy the pageout_object allocated by
410 * vm_pageout_object_allocate(), and perform all of the
411 * required cleanup actions.
412 *
413 * In/Out conditions:
414 * The object must be locked, and will be returned locked.
415 */
416 void
417 vm_pageout_object_terminate(
418 vm_object_t object)
419 {
420 vm_object_t shadow_object;
421 boolean_t shadow_internal;
422
423 /*
424 * Deal with the deallocation (last reference) of a pageout object
425 * (used for cleaning-in-place) by dropping the paging references/
426 * freeing pages in the original object.
427 */
428
429 assert(object->pageout);
430 shadow_object = object->shadow;
431 vm_object_lock(shadow_object);
432 shadow_internal = shadow_object->internal;
433
434 while (!queue_empty(&object->memq)) {
435 vm_page_t p, m;
436 vm_object_offset_t offset;
437
438 p = (vm_page_t) queue_first(&object->memq);
439
440 assert(p->private);
441 assert(p->pageout);
442 p->pageout = FALSE;
443 assert(!p->cleaning);
444
445 offset = p->offset;
446 VM_PAGE_FREE(p);
447 p = VM_PAGE_NULL;
448
449 m = vm_page_lookup(shadow_object,
450 offset + object->shadow_offset);
451
452 if(m == VM_PAGE_NULL)
453 continue;
454 assert(m->cleaning);
455 /* used as a trigger on upl_commit etc to recognize the */
456 /* pageout daemon's subsequent desire to pageout a cleaning */
457 /* page. When the bit is on the upl commit code will */
458 /* respect the pageout bit in the target page over the */
459 /* caller's page list indication */
460 m->dump_cleaning = FALSE;
461
462 /*
463 * Account for the paging reference taken when
464 * m->cleaning was set on this page.
465 */
466 vm_object_paging_end(shadow_object);
467 assert((m->dirty) || (m->precious) ||
468 (m->busy && m->cleaning));
469
470 /*
471 * Handle the trusted pager throttle.
472 * Also decrement the burst throttle (if external).
473 */
474 vm_page_lock_queues();
475 if (m->laundry) {
476 vm_pageout_throttle_up(m);
477 }
478
479 /*
480 * Handle the "target" page(s). These pages are to be freed if
481 * successfully cleaned. Target pages are always busy, and are
482 * wired exactly once. The initial target pages are not mapped,
483 * (so cannot be referenced or modified) but converted target
484 * pages may have been modified between the selection as an
485 * adjacent page and conversion to a target.
486 */
487 if (m->pageout) {
488 assert(m->busy);
489 assert(m->wire_count == 1);
490 m->cleaning = FALSE;
491 m->pageout = FALSE;
492 #if MACH_CLUSTER_STATS
493 if (m->wanted) vm_pageout_target_collisions++;
494 #endif
495 /*
496 * Revoke all access to the page. Since the object is
497 * locked, and the page is busy, this prevents the page
498 * from being dirtied after the pmap_disconnect() call
499 * returns.
500 *
501 * Since the page is left "dirty" but "not modified", we
502 * can detect whether the page was redirtied during
503 * pageout by checking the modify state.
504 */
505 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
506 m->dirty = TRUE;
507 else
508 m->dirty = FALSE;
509
510 if (m->dirty) {
511 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
512 vm_page_unwire(m);/* reactivates */
513 VM_STAT(reactivations++);
514 PAGE_WAKEUP_DONE(m);
515 } else {
516 CLUSTER_STAT(vm_pageout_target_page_freed++;)
517 vm_page_free(m);/* clears busy, etc. */
518 }
519 vm_page_unlock_queues();
520 continue;
521 }
522 /*
523 * Handle the "adjacent" pages. These pages were cleaned in
524 * place, and should be left alone.
525 * If the page is not already on a queue, reactivate it
526 * if it was referenced; otherwise deactivate it.
527 */
528 if (!m->active && !m->inactive && !m->private) {
529 if (m->reference)
530 vm_page_activate(m);
531 else
532 vm_page_deactivate(m);
533 }
534 if((m->busy) && (m->cleaning)) {
535
536 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
537 m->busy = FALSE;
538
539 /* We do not re-set m->dirty ! */
540 /* The page was busy so no extraneous activity */
541 /* could have occurred. COPY_INTO is a read into the */
542 /* new pages. CLEAN_IN_PLACE does actually write */
543 /* out the pages but handling outside of this code */
544 /* will take care of resetting dirty. We clear the */
545 /* modify however for the Programmed I/O case. */
546 pmap_clear_modify(m->phys_page);
547 if(m->absent) {
548 m->absent = FALSE;
549 if(shadow_object->absent_count == 1)
550 vm_object_absent_release(shadow_object);
551 else
552 shadow_object->absent_count--;
553 }
554 m->overwriting = FALSE;
555 } else if (m->overwriting) {
556 /* alternate request page list, write to page_list */
557 /* case. Occurs when the original page was wired */
558 /* at the time of the list request */
559 assert(m->wire_count != 0);
560 vm_page_unwire(m);/* reactivates */
561 m->overwriting = FALSE;
562 } else {
563 /*
564 * Set the dirty state according to whether or not the page was
565 * modified during the pageout. Note that we purposefully do
566 * NOT call pmap_clear_modify since the page is still mapped.
567 * If the page were to be dirtied between the 2 calls, this
568 * fact would be lost. This code is only necessary to
569 * maintain statistics, since the pmap module is always
570 * consulted if m->dirty is false.
571 */
572 #if MACH_CLUSTER_STATS
573 m->dirty = pmap_is_modified(m->phys_page);
574
575 if (m->dirty) vm_pageout_cluster_dirtied++;
576 else vm_pageout_cluster_cleaned++;
577 if (m->wanted) vm_pageout_cluster_collisions++;
578 #else
579 m->dirty = 0;
580 #endif
581 }
582 m->cleaning = FALSE;
583
584 /*
585 * Wakeup any thread waiting for the page to be un-cleaning.
586 */
587 PAGE_WAKEUP(m);
588 vm_page_unlock_queues();
589 }
590 /*
591 * Account for the paging reference taken in vm_pageout_object_allocate.
592 */
593 vm_object_paging_end(shadow_object);
594 vm_object_unlock(shadow_object);
595
596 assert(object->ref_count == 0);
597 assert(object->paging_in_progress == 0);
598 assert(object->resident_page_count == 0);
599 return;
600 }
601
602 /*
603 * Routine: vm_pageout_setup
604 * Purpose:
605 * Set up a page for pageout (clean & flush).
606 *
607 * Move the page to a new object, as part of which it will be
608 * sent to its memory manager in a memory_object_data_write or
609 * memory_object_initialize message.
610 *
611 * The "new_object" and "new_offset" arguments
612 * indicate where the page should be moved.
613 *
614 * In/Out conditions:
615 * The page in question must not be on any pageout queues,
616 * and must be busy. The object to which it belongs
617 * must be unlocked, and the caller must hold a paging
618 * reference to it. The new_object must not be locked.
619 *
620 * This routine returns a pointer to a place-holder page,
621 * inserted at the same offset, to block out-of-order
622 * requests for the page. The place-holder page must
623 * be freed after the data_write or initialize message
624 * has been sent.
625 *
626 * The original page is put on a paging queue and marked
627 * not busy on exit.
628 */
629 vm_page_t
630 vm_pageout_setup(
631 register vm_page_t m,
632 register vm_object_t new_object,
633 vm_object_offset_t new_offset)
634 {
635 register vm_object_t old_object = m->object;
636 vm_object_offset_t paging_offset;
637 vm_object_offset_t offset;
638 register vm_page_t holding_page;
639 register vm_page_t new_m;
640 boolean_t need_to_wire = FALSE;
641
642
643 XPR(XPR_VM_PAGEOUT,
644 "vm_pageout_setup, obj 0x%X off 0x%X page 0x%X new obj 0x%X offset 0x%X\n",
645 (integer_t)m->object, (integer_t)m->offset,
646 (integer_t)m, (integer_t)new_object,
647 (integer_t)new_offset);
648 assert(m && m->busy && !m->absent && !m->fictitious && !m->error &&
649 !m->restart);
650
651 assert(m->dirty || m->precious);
652
653 /*
654 * Create a place-holder page where the old one was, to prevent
655 * attempted pageins of this page while we're unlocked.
656 */
657 VM_PAGE_GRAB_FICTITIOUS(holding_page);
658
659 vm_object_lock(old_object);
660
661 offset = m->offset;
662 paging_offset = offset + old_object->paging_offset;
663
664 if (old_object->pager_trusted) {
665 /*
666 * This pager is trusted, so we can clean this page
667 * in place. Leave it in the old object, and mark it
668 * cleaning & pageout.
669 */
670 new_m = holding_page;
671 holding_page = VM_PAGE_NULL;
672
673 /*
674 * Set up new page to be private shadow of real page.
675 */
676 new_m->phys_page = m->phys_page;
677 new_m->fictitious = FALSE;
678 new_m->pageout = TRUE;
679
680 /*
681 * Mark real page as cleaning (indicating that we hold a
682 * paging reference to be released via m_o_d_r_c) and
683 * pageout (indicating that the page should be freed
684 * when the pageout completes).
685 */
686 pmap_clear_modify(m->phys_page);
687 vm_page_lock_queues();
688 new_m->private = TRUE;
689 vm_page_wire(new_m);
690 m->cleaning = TRUE;
691 m->pageout = TRUE;
692
693 vm_page_wire(m);
694 assert(m->wire_count == 1);
695 vm_page_unlock_queues();
696
697 m->dirty = TRUE;
698 m->precious = FALSE;
699 m->page_lock = VM_PROT_NONE;
700 m->unusual = FALSE;
701 m->unlock_request = VM_PROT_NONE;
702 } else {
703 /*
704 * Cannot clean in place, so rip the old page out of the
705 * object, and stick the holding page in. Set new_m to the
706 * page in the new object.
707 */
708 vm_page_lock_queues();
709 VM_PAGE_QUEUES_REMOVE(m);
710 vm_page_remove(m);
711
712 vm_page_insert(holding_page, old_object, offset);
713 vm_page_unlock_queues();
714
715 m->dirty = TRUE;
716 m->precious = FALSE;
717 new_m = m;
718 new_m->page_lock = VM_PROT_NONE;
719 new_m->unlock_request = VM_PROT_NONE;
720
721 if (old_object->internal)
722 need_to_wire = TRUE;
723 }
724 /*
725 * Record that this page has been written out
726 */
727 #if MACH_PAGEMAP
728 vm_external_state_set(old_object->existence_map, offset);
729 #endif /* MACH_PAGEMAP */
730
731 vm_object_unlock(old_object);
732
733 vm_object_lock(new_object);
734
735 /*
736 * Put the page into the new object. If it is not wired
737 * (i.e., if it is the real page), it will be activated.
738 */
739
740 vm_page_lock_queues();
741 vm_page_insert(new_m, new_object, new_offset);
742 if (need_to_wire)
743 vm_page_wire(new_m);
744 else
745 vm_page_activate(new_m);
746 PAGE_WAKEUP_DONE(new_m);
747 vm_page_unlock_queues();
748
749 vm_object_unlock(new_object);
750
751 /*
752 * Return the placeholder page to simplify cleanup.
753 */
754 return (holding_page);
755 }
756
757 /*
758 * Routine: vm_pageclean_setup
759 *
760 * Purpose: setup a page to be cleaned (made non-dirty), but not
761 * necessarily flushed from the VM page cache.
762 * This is accomplished by cleaning in place.
763 *
764 * The page must not be busy, and the object and page
765 * queues must be locked.
766 *
767 */
768 void
769 vm_pageclean_setup(
770 vm_page_t m,
771 vm_page_t new_m,
772 vm_object_t new_object,
773 vm_object_offset_t new_offset)
774 {
775 vm_object_t old_object = m->object;
776 assert(!m->busy);
777 assert(!m->cleaning);
778
779 XPR(XPR_VM_PAGEOUT,
780 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
781 (integer_t)old_object, m->offset, (integer_t)m,
782 (integer_t)new_m, new_offset);
783
784 pmap_clear_modify(m->phys_page);
785 vm_object_paging_begin(old_object);
786
787 /*
788 * Record that this page has been written out
789 */
790 #if MACH_PAGEMAP
791 vm_external_state_set(old_object->existence_map, m->offset);
792 #endif /*MACH_PAGEMAP*/
793
794 /*
795 * Mark original page as cleaning in place.
796 */
797 m->cleaning = TRUE;
798 m->dirty = TRUE;
799 m->precious = FALSE;
800
801 /*
802 * Convert the fictitious page to a private shadow of
803 * the real page.
804 */
805 assert(new_m->fictitious);
806 new_m->fictitious = FALSE;
807 new_m->private = TRUE;
808 new_m->pageout = TRUE;
809 new_m->phys_page = m->phys_page;
810 vm_page_wire(new_m);
811
812 vm_page_insert(new_m, new_object, new_offset);
813 assert(!new_m->wanted);
814 new_m->busy = FALSE;
815 }
816
817 void
818 vm_pageclean_copy(
819 vm_page_t m,
820 vm_page_t new_m,
821 vm_object_t new_object,
822 vm_object_offset_t new_offset)
823 {
824 XPR(XPR_VM_PAGEOUT,
825 "vm_pageclean_copy, page 0x%X new_m 0x%X new_obj 0x%X offset 0x%X\n",
826 m, new_m, new_object, new_offset, 0);
827
828 assert((!m->busy) && (!m->cleaning));
829
830 assert(!new_m->private && !new_m->fictitious);
831
832 pmap_clear_modify(m->phys_page);
833
834 m->busy = TRUE;
835 vm_object_paging_begin(m->object);
836 vm_page_unlock_queues();
837 vm_object_unlock(m->object);
838
839 /*
840 * Copy the original page to the new page.
841 */
842 vm_page_copy(m, new_m);
843
844 /*
845 * Mark the old page as clean. A request to pmap_is_modified
846 * will get the right answer.
847 */
848 vm_object_lock(m->object);
849 m->dirty = FALSE;
850
851 vm_object_paging_end(m->object);
852
853 vm_page_lock_queues();
854 if (!m->active && !m->inactive)
855 vm_page_activate(m);
856 PAGE_WAKEUP_DONE(m);
857
858 vm_page_insert(new_m, new_object, new_offset);
859 vm_page_activate(new_m);
860 new_m->busy = FALSE; /* No other thread can be waiting */
861 }
862
863
864 /*
865 * Routine: vm_pageout_initialize_page
866 * Purpose:
867 * Causes the specified page to be initialized in
868 * the appropriate memory object. This routine is used to push
869 * pages into a copy-object when they are modified in the
870 * permanent object.
871 *
872 * The page is moved to a temporary object and paged out.
873 *
874 * In/out conditions:
875 * The page in question must not be on any pageout queues.
876 * The object to which it belongs must be locked.
877 * The page must be busy, but not hold a paging reference.
878 *
879 * Implementation:
880 * Move this page to a completely new object.
881 */
882 void
883 vm_pageout_initialize_page(
884 vm_page_t m)
885 {
886 vm_object_t object;
887 vm_object_offset_t paging_offset;
888 vm_page_t holding_page;
889
890
891 XPR(XPR_VM_PAGEOUT,
892 "vm_pageout_initialize_page, page 0x%X\n",
893 (integer_t)m, 0, 0, 0, 0);
894 assert(m->busy);
895
896 /*
897 * Verify that we really want to clean this page
898 */
899 assert(!m->absent);
900 assert(!m->error);
901 assert(m->dirty);
902
903 /*
904 * Create a paging reference to let us play with the object.
905 */
906 object = m->object;
907 paging_offset = m->offset + object->paging_offset;
908 vm_object_paging_begin(object);
909 if (m->absent || m->error || m->restart ||
910 (!m->dirty && !m->precious)) {
911 VM_PAGE_FREE(m);
912 panic("reservation without pageout?"); /* alan */
913 vm_object_unlock(object);
914 return;
915 }
916
917 /* set the page for future call to vm_fault_list_request */
918 holding_page = NULL;
919 vm_page_lock_queues();
920 pmap_clear_modify(m->phys_page);
921 m->dirty = TRUE;
922 m->busy = TRUE;
923 m->list_req_pending = TRUE;
924 m->cleaning = TRUE;
925 m->pageout = TRUE;
926 vm_page_wire(m);
927 vm_page_unlock_queues();
928 vm_object_unlock(object);
929
930 /*
931 * Write the data to its pager.
932 * Note that the data is passed by naming the new object,
933 * not a virtual address; the pager interface has been
934 * manipulated to use the "internal memory" data type.
935 * [The object reference from its allocation is donated
936 * to the eventual recipient.]
937 */
938 memory_object_data_initialize(object->pager,
939 paging_offset,
940 PAGE_SIZE);
941
942 vm_object_lock(object);
943 }
944
945 #if MACH_CLUSTER_STATS
946 #define MAXCLUSTERPAGES 16
947 struct {
948 unsigned long pages_in_cluster;
949 unsigned long pages_at_higher_offsets;
950 unsigned long pages_at_lower_offsets;
951 } cluster_stats[MAXCLUSTERPAGES];
952 #endif /* MACH_CLUSTER_STATS */
953
954 boolean_t allow_clustered_pageouts = FALSE;
955
956 /*
957 * vm_pageout_cluster:
958 *
959 * Given a page, queue it to the appropriate I/O thread,
960 * which will page it out and attempt to clean adjacent pages
961 * in the same operation.
962 *
963 * The page must be busy, and the object and queues locked. We will take a
964 * paging reference to prevent deallocation or collapse when we
965 * release the object lock back at the call site. The I/O thread
966 * is responsible for consuming this reference
967 *
968 * The page must not be on any pageout queue.
969 */
970
971 void
972 vm_pageout_cluster(vm_page_t m)
973 {
974 vm_object_t object = m->object;
975 struct vm_pageout_queue *q;
976
977
978 XPR(XPR_VM_PAGEOUT,
979 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
980 (integer_t)object, m->offset, (integer_t)m, 0, 0);
981
982 /*
983 * Only a certain kind of page is appreciated here.
984 */
985 assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
986 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
987
988 /*
989 * protect the object from collapse -
990 * locking in the object's paging_offset.
991 */
992 vm_object_paging_begin(object);
993
994 /*
995 * set the page for future call to vm_fault_list_request
996 * page should already be marked busy
997 */
998 vm_page_wire(m);
999 m->list_req_pending = TRUE;
1000 m->cleaning = TRUE;
1001 m->pageout = TRUE;
1002 m->laundry = TRUE;
1003
1004 if (object->internal == TRUE)
1005 q = &vm_pageout_queue_internal;
1006 else
1007 q = &vm_pageout_queue_external;
1008 q->pgo_laundry++;
1009
1010 m->pageout_queue = TRUE;
1011 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
1012
1013 if (q->pgo_idle == TRUE) {
1014 q->pgo_idle = FALSE;
1015 thread_wakeup((event_t) &q->pgo_pending);
1016 }
1017 }
1018
1019
1020 unsigned long vm_pageout_throttle_up_count = 0;
1021
1022 /*
1023 * A page is back from laundry. See if there are some pages waiting to
1024 * go to laundry and if we can let some of them go now.
1025 *
1026 * Object and page queues must be locked.
1027 */
1028 void
1029 vm_pageout_throttle_up(
1030 vm_page_t m)
1031 {
1032 struct vm_pageout_queue *q;
1033
1034 vm_pageout_throttle_up_count++;
1035
1036 assert(m->laundry);
1037 assert(m->object != VM_OBJECT_NULL);
1038 assert(m->object != kernel_object);
1039
1040 if (m->object->internal == TRUE)
1041 q = &vm_pageout_queue_internal;
1042 else
1043 q = &vm_pageout_queue_external;
1044
1045 m->laundry = FALSE;
1046 q->pgo_laundry--;
1047
1048 if (q->pgo_throttled == TRUE) {
1049 q->pgo_throttled = FALSE;
1050 thread_wakeup((event_t) &q->pgo_laundry);
1051 }
1052 }
1053
1054
1055 /*
1056 * vm_pageout_scan does the dirty work for the pageout daemon.
1057 * It returns with vm_page_queue_free_lock held and
1058 * vm_page_free_wanted == 0.
1059 */
1060
1061 #define DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
1062
1063 #define FCS_IDLE 0
1064 #define FCS_DELAYED 1
1065 #define FCS_DEADLOCK_DETECTED 2
1066
1067 struct flow_control {
1068 int state;
1069 mach_timespec_t ts;
1070 };
1071
1072 extern kern_return_t sysclk_gettime(mach_timespec_t *);
1073
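/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): how the flow_control state machine arms and checks its
 * deadlock timer.  The millisecond wait is split into seconds and
 * nanoseconds, added to the current time, and later compared against
 * "now".  The function name is hypothetical; the constants and macro
 * calls mirror the FCS_IDLE/FCS_DELAYED handling in vm_pageout_scan()
 * below.
 */
#if 0
static boolean_t
vm_pageout_deadlock_timer_example(struct flow_control *fc, unsigned int msecs)
{
	mach_timespec_t ts, now;

	switch (fc->state) {
	case FCS_IDLE:
		/* arm the timer: deadline = now + msecs */
		ts.tv_sec  = msecs / 1000;
		ts.tv_nsec = (msecs % 1000) * 1000 * NSEC_PER_USEC;
		sysclk_gettime(&fc->ts);
		ADD_MACH_TIMESPEC(&fc->ts, &ts);
		fc->state = FCS_DELAYED;
		return FALSE;

	case FCS_DELAYED:
		/* deadline passed without laundry relief: suspect a deadlock */
		sysclk_gettime(&now);
		if (CMP_MACH_TIMESPEC(&now, &fc->ts) >= 0) {
			fc->state = FCS_DEADLOCK_DETECTED;
			return TRUE;
		}
		return FALSE;

	default:
		/* FCS_DEADLOCK_DETECTED: relief is handled by the caller */
		return TRUE;
	}
}
#endif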
1074
1075 void
1076 vm_pageout_scan(void)
1077 {
1078 unsigned int loop_count = 0;
1079 unsigned int inactive_burst_count = 0;
1080 unsigned int active_burst_count = 0;
1081 vm_page_t local_freeq = 0;
1082 int local_freed = 0;
1083 int delayed_unlock = 0;
1084 int need_internal_inactive = 0;
1085 int refmod_state = 0;
1086 int vm_pageout_deadlock_target = 0;
1087 struct vm_pageout_queue *iq;
1088 struct vm_pageout_queue *eq;
1089 struct flow_control flow_control;
1090 boolean_t active_throttled = FALSE;
1091 boolean_t inactive_throttled = FALSE;
1092 mach_timespec_t ts;
1093 unsigned int msecs = 0;
1094 vm_object_t object;
1095
1096
1097 flow_control.state = FCS_IDLE;
1098 iq = &vm_pageout_queue_internal;
1099 eq = &vm_pageout_queue_external;
1100
1101 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1102
1103 /*???*/ /*
1104 * We want to gradually dribble pages from the active queue
1105 * to the inactive queue. If we let the inactive queue get
1106 * very small, and then suddenly dump many pages into it,
1107 * those pages won't get a sufficient chance to be referenced
1108 * before we start taking them from the inactive queue.
1109 *
1110 * We must limit the rate at which we send pages to the pagers.
1111 * data_write messages consume memory, for message buffers and
1112 * for map-copy objects. If we get too far ahead of the pagers,
1113 * we can potentially run out of memory.
1114 *
1115 * We can use the laundry count to limit directly the number
1116 * of pages outstanding to the default pager. A similar
1117 * strategy for external pagers doesn't work, because
1118 * external pagers don't have to deallocate the pages sent them,
1119 * and because we might have to send pages to external pagers
1120 * even if they aren't processing writes. So we also
1121 * use a burst count to limit writes to external pagers.
1122 *
1123 * When memory is very tight, we can't rely on external pagers to
1124 * clean pages. They probably aren't running, because they
1125 * aren't vm-privileged. If we kept sending dirty pages to them,
1126 * we could exhaust the free list.
1127 */
1128 vm_page_lock_queues();
1129 delayed_unlock = 1;
1130
1131
1132 Restart:
1133 /*
1134 * Recalculate vm_page_inactive_target.
1135 */
1136 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1137 vm_page_inactive_count);
1138 object = NULL;
1139
1140 for (;;) {
1141 vm_page_t m;
1142
1143 if (delayed_unlock == 0)
1144 vm_page_lock_queues();
1145
1146 active_burst_count = vm_page_active_count;
1147
1148 if (active_burst_count > vm_pageout_burst_active_throttle)
1149 active_burst_count = vm_pageout_burst_active_throttle;
1150
1151 /*
1152 * Move pages from active to inactive.
1153 */
1154 while ((need_internal_inactive ||
1155 vm_page_inactive_count < vm_page_inactive_target) &&
1156 !queue_empty(&vm_page_queue_active) &&
1157 ((active_burst_count--) > 0)) {
1158
1159 vm_pageout_active++;
1160
1161 m = (vm_page_t) queue_first(&vm_page_queue_active);
1162
1163 assert(m->active && !m->inactive);
1164 assert(!m->laundry);
1165 assert(m->object != kernel_object);
1166
1167 /*
1168 * Try to lock object; since we've already got the
1169 * page queues lock, we can only 'try' for this one.
1170 * if the 'try' fails, we need to do a mutex_pause
1171 * to allow the owner of the object lock a chance to
1172 * run... otherwise, we're likely to trip over this
1173 * object in the same state as we work our way through
1174 * the queue... clumps of pages associated with the same
1175 * object are fairly typical on the inactive and active queues
1176 */
1177 if (m->object != object) {
1178 if (object != NULL) {
1179 vm_object_unlock(object);
1180 object = NULL;
1181 }
1182 if (!vm_object_lock_try(m->object)) {
1183 /*
1184 * move page to end of active queue and continue
1185 */
1186 queue_remove(&vm_page_queue_active, m,
1187 vm_page_t, pageq);
1188 queue_enter(&vm_page_queue_active, m,
1189 vm_page_t, pageq);
1190
1191 goto done_with_activepage;
1192 }
1193 object = m->object;
1194 }
1195 /*
1196 * if the page is BUSY, then we pull it
1197 * off the active queue and leave it alone.
1198 * when BUSY is cleared, it will get stuck
1199 * back on the appropriate queue
1200 */
1201 if (m->busy) {
1202 queue_remove(&vm_page_queue_active, m,
1203 vm_page_t, pageq);
1204 m->pageq.next = NULL;
1205 m->pageq.prev = NULL;
1206
1207 if (!m->fictitious)
1208 vm_page_active_count--;
1209 m->active = FALSE;
1210
1211 goto done_with_activepage;
1212 }
1213 if (need_internal_inactive) {
1214 /*
1215 * If we're unable to make forward progress
1216 * with the current set of pages on the
1217 * inactive queue due to busy objects or
1218 * throttled pageout queues, then
1219 * move a page that is already clean
1220 * or belongs to a pageout queue that
1221 * isn't currently throttled
1222 */
1223 active_throttled = FALSE;
1224
1225 if (object->internal) {
1226 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1227 active_throttled = TRUE;
1228 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1229 active_throttled = TRUE;
1230 }
1231 if (active_throttled == TRUE) {
1232 if (!m->dirty) {
1233 refmod_state = pmap_get_refmod(m->phys_page);
1234
1235 if (refmod_state & VM_MEM_REFERENCED)
1236 m->reference = TRUE;
1237 if (refmod_state & VM_MEM_MODIFIED)
1238 m->dirty = TRUE;
1239 }
1240 if (m->dirty || m->precious) {
1241 /*
1242 * page is dirty and targets a THROTTLED queue
1243 * so all we can do is move it back to the
1244 * end of the active queue to get it out
1245 * of the way
1246 */
1247 queue_remove(&vm_page_queue_active, m,
1248 vm_page_t, pageq);
1249 queue_enter(&vm_page_queue_active, m,
1250 vm_page_t, pageq);
1251
1252 vm_pageout_scan_active_throttled++;
1253
1254 goto done_with_activepage;
1255 }
1256 }
1257 vm_pageout_scan_active_throttle_success++;
1258 need_internal_inactive--;
1259 }
1260 /*
1261 * Deactivate the page while holding the object
1262 * locked, so we know the page is still not busy.
1263 * This should prevent races between pmap_enter
1264 * and pmap_clear_reference. The page might be
1265 * absent or fictitious, but vm_page_deactivate
1266 * can handle that.
1267 */
1268 vm_page_deactivate(m);
1269 done_with_activepage:
1270 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1271
1272 if (object != NULL) {
1273 vm_object_unlock(object);
1274 object = NULL;
1275 }
1276 if (local_freeq) {
1277 vm_page_free_list(local_freeq);
1278
1279 local_freeq = 0;
1280 local_freed = 0;
1281 }
1282 delayed_unlock = 0;
1283 vm_page_unlock_queues();
1284
1285 mutex_pause();
1286 vm_page_lock_queues();
1287 /*
1288 * continue the while loop processing
1289 * the active queue... need to hold
1290 * the page queues lock
1291 */
1292 continue;
1293 }
1294 }
1295
1296
1297
1298 /**********************************************************************
1299 * above this point we're playing with the active queue
1300 * below this point we're playing with the throttling mechanisms
1301 * and the inactive queue
1302 **********************************************************************/
1303
1304
1305
1306 /*
1307 * We are done if we have met our target *and*
1308 * nobody is still waiting for a page.
1309 */
1310 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1311 if (object != NULL) {
1312 vm_object_unlock(object);
1313 object = NULL;
1314 }
1315 if (local_freeq) {
1316 vm_page_free_list(local_freeq);
1317
1318 local_freeq = 0;
1319 local_freed = 0;
1320 }
1321 mutex_lock(&vm_page_queue_free_lock);
1322
1323 if ((vm_page_free_count >= vm_page_free_target) &&
1324 (vm_page_free_wanted == 0)) {
1325
1326 vm_page_unlock_queues();
1327
1328 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1329 return;
1330 }
1331 mutex_unlock(&vm_page_queue_free_lock);
1332 }
1333
1334
1335 /*
1336 * Sometimes we have to pause:
1337 * 1) No inactive pages - nothing to do.
1338 * 2) Flow control - default pageout queue is full
1339 * 3) Loop control - no acceptable pages found on the inactive queue
1340 * within the last vm_pageout_burst_inactive_throttle iterations
1341 */
1342 if ((queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf))) {
1343 vm_pageout_scan_empty_throttle++;
1344 msecs = vm_pageout_empty_wait;
1345 goto vm_pageout_scan_delay;
1346
1347 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1348 vm_pageout_scan_burst_throttle++;
1349 msecs = vm_pageout_burst_wait;
1350 goto vm_pageout_scan_delay;
1351
1352 } else if (VM_PAGE_Q_THROTTLED(iq)) {
1353
1354 switch (flow_control.state) {
1355
1356 case FCS_IDLE:
1357 reset_deadlock_timer:
1358 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1359 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1360 sysclk_gettime(&flow_control.ts);
1361 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1362
1363 flow_control.state = FCS_DELAYED;
1364 msecs = vm_pageout_deadlock_wait;
1365
1366 break;
1367
1368 case FCS_DELAYED:
1369 sysclk_gettime(&ts);
1370
1371 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1372 /*
1373 * the pageout thread for the default pager is potentially
1374 * deadlocked since the
1375 * default pager queue has been throttled for more than the
1376 * allowable time... we need to move some clean pages or dirty
1377 * pages belonging to the external pagers if they aren't throttled
1378 * vm_page_free_wanted represents the number of threads currently
1379 * blocked waiting for pages... we'll move one page for each of
1380 * these plus a fixed amount to break the logjam... once we're done
1381 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1382 * with a new timeout target since we have no way of knowing
1383 * whether we've broken the deadlock except through observation
1384 * of the queue associated with the default pager... we need to
1385 * stop moving pages and allow the system to run to see what
1386 * state it settles into.
1387 */
1388 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted;
1389 vm_pageout_scan_deadlock_detected++;
1390 flow_control.state = FCS_DEADLOCK_DETECTED;
1391
1392 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1393 goto consider_inactive;
1394 }
1395 /*
1396 * just resniff instead of trying
1397 * to compute a new delay time... we're going to be
1398 * awakened immediately upon a laundry completion,
1399 * so we won't wait any longer than necessary
1400 */
1401 msecs = vm_pageout_idle_wait;
1402 break;
1403
1404 case FCS_DEADLOCK_DETECTED:
1405 if (vm_pageout_deadlock_target)
1406 goto consider_inactive;
1407 goto reset_deadlock_timer;
1408
1409 }
1410 vm_pageout_scan_throttle++;
1411 iq->pgo_throttled = TRUE;
1412 vm_pageout_scan_delay:
1413 if (object != NULL) {
1414 vm_object_unlock(object);
1415 object = NULL;
1416 }
1417 if (local_freeq) {
1418 vm_page_free_list(local_freeq);
1419
1420 local_freeq = 0;
1421 local_freed = 0;
1422 }
1423 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1424
1425 counter(c_vm_pageout_scan_block++);
1426
1427 vm_page_unlock_queues();
1428
1429 thread_block(THREAD_CONTINUE_NULL);
1430
1431 vm_page_lock_queues();
1432 delayed_unlock = 1;
1433
1434 iq->pgo_throttled = FALSE;
1435
1436 if (loop_count >= vm_page_inactive_count) {
1437 if (VM_PAGE_Q_THROTTLED(eq) || VM_PAGE_Q_THROTTLED(iq)) {
1438 /*
1439 * Make sure we move enough "appropriate"
1440 * pages to the inactive queue before trying
1441 * again.
1442 */
1443 need_internal_inactive = vm_pageout_inactive_relief;
1444 }
1445 loop_count = 0;
1446 }
1447 inactive_burst_count = 0;
1448
1449 goto Restart;
1450 /*NOTREACHED*/
1451 }
1452
1453
1454 flow_control.state = FCS_IDLE;
1455 consider_inactive:
1456 loop_count++;
1457 inactive_burst_count++;
1458 vm_pageout_inactive++;
1459
1460 if (!queue_empty(&vm_page_queue_inactive)) {
1461 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1462
1463 if (m->clustered && (m->no_isync == TRUE)) {
1464 goto use_this_page;
1465 }
1466 }
1467 if (vm_zf_count < vm_accellerate_zf_pageout_trigger) {
1468 vm_zf_iterator = 0;
1469 } else {
1470 last_page_zf = 0;
1471 if((vm_zf_iterator+=1) >= vm_zf_iterator_count) {
1472 vm_zf_iterator = 0;
1473 }
1474 }
1475 if (queue_empty(&vm_page_queue_zf) ||
1476 (((last_page_zf) || (vm_zf_iterator == 0)) &&
1477 !queue_empty(&vm_page_queue_inactive))) {
1478 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1479 last_page_zf = 0;
1480 } else {
1481 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1482 last_page_zf = 1;
1483 }
1484 use_this_page:
1485 assert(!m->active && m->inactive);
1486 assert(!m->laundry);
1487 assert(m->object != kernel_object);
1488
1489 /*
1490 * Try to lock object; since we've already got the
1491 * page queues lock, we can only 'try' for this one.
1492 * if the 'try' fails, we need to do a mutex_pause
1493 * to allow the owner of the object lock a chance to
1494 * run... otherwise, we're likely to trip over this
1495 * object in the same state as we work our way through
1496 * the queue... clumps of pages associated with the same
1497 * object are fairly typical on the inactive and active queues
1498 */
1499 if (m->object != object) {
1500 if (object != NULL) {
1501 vm_object_unlock(object);
1502 object = NULL;
1503 }
1504 if (!vm_object_lock_try(m->object)) {
1505 /*
1506 * Move page to end and continue.
1507 * Don't re-issue ticket
1508 */
1509 if (m->zero_fill) {
1510 queue_remove(&vm_page_queue_zf, m,
1511 vm_page_t, pageq);
1512 queue_enter(&vm_page_queue_zf, m,
1513 vm_page_t, pageq);
1514 } else {
1515 queue_remove(&vm_page_queue_inactive, m,
1516 vm_page_t, pageq);
1517 queue_enter(&vm_page_queue_inactive, m,
1518 vm_page_t, pageq);
1519 }
1520 vm_pageout_inactive_nolock++;
1521
1522 /*
1523 * force us to dump any collected free pages
1524 * and to pause before moving on
1525 */
1526 delayed_unlock = DELAYED_UNLOCK_LIMIT + 1;
1527
1528 goto done_with_inactivepage;
1529 }
1530 object = m->object;
1531 }
1532 /*
1533 * If the page belongs to a purgable object with no pending copies
1534 * against it, then we reap all of the pages in the object
1535 * and note that the object has been "emptied". It'll be up to the
1536 * application to discover this and recreate its contents if desired.
1537 */
1538 if ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE ||
1539 object->purgable == VM_OBJECT_PURGABLE_EMPTY) &&
1540 object->copy == VM_OBJECT_NULL) {
1541
1542 (void) vm_object_purge(object);
1543 vm_pageout_purged_objects++;
1544 /*
1545 * we've just taken all of the pages from this object,
1546 * so drop the lock now since we're not going to find
1547 * any more pages belonging to it anytime soon
1548 */
1549 vm_object_unlock(object);
1550 object = NULL;
1551
1552 inactive_burst_count = 0;
1553
1554 goto done_with_inactivepage;
1555 }
1556
1557 /*
1558 * Paging out pages of external objects which
1559 * are currently being created must be avoided.
1560 * The pager may claim memory, thus leading to a
1561 * possible deadlock between it and the pageout thread,
1562 * if such pages are finally chosen. The remaining assumption
1563 * is that there will finally be enough available pages in the
1564 * inactive pool to page out in order to satisfy all memory
1565 * claimed by the thread which concurrently creates the pager.
1566 */
1567 if (!object->pager_initialized && object->pager_created) {
1568 /*
1569 * Move page to end and continue, hoping that
1570 * there will be enough other inactive pages to
1571 * page out so that the thread which currently
1572 * initializes the pager will succeed.
1573 * Don't re-grant the ticket; the page should be
1574 * pulled from the queue and paged out whenever
1575 * one of its logically adjacent fellows is
1576 * targeted.
1577 */
1578 if (m->zero_fill) {
1579 queue_remove(&vm_page_queue_zf, m,
1580 vm_page_t, pageq);
1581 queue_enter(&vm_page_queue_zf, m,
1582 vm_page_t, pageq);
1583 last_page_zf = 1;
1584 vm_zf_iterator = vm_zf_iterator_count - 1;
1585 } else {
1586 queue_remove(&vm_page_queue_inactive, m,
1587 vm_page_t, pageq);
1588 queue_enter(&vm_page_queue_inactive, m,
1589 vm_page_t, pageq);
1590 last_page_zf = 0;
1591 vm_zf_iterator = 1;
1592 }
1593 vm_pageout_inactive_avoid++;
1594
1595 goto done_with_inactivepage;
1596 }
1597 /*
1598 * Remove the page from the inactive list.
1599 */
1600 if (m->zero_fill) {
1601 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1602 } else {
1603 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1604 }
1605 m->pageq.next = NULL;
1606 m->pageq.prev = NULL;
1607 m->inactive = FALSE;
1608 if (!m->fictitious)
1609 vm_page_inactive_count--;
1610
1611 if (m->busy || !object->alive) {
1612 /*
1613 * Somebody is already playing with this page.
1614 * Leave it off the pageout queues.
1615 */
1616 vm_pageout_inactive_busy++;
1617
1618 goto done_with_inactivepage;
1619 }
1620
1621 /*
1622 * If it's absent or in error, we can reclaim the page.
1623 */
1624
1625 if (m->absent || m->error) {
1626 vm_pageout_inactive_absent++;
1627 reclaim_page:
1628 if (vm_pageout_deadlock_target) {
1629 vm_pageout_scan_inactive_throttle_success++;
1630 vm_pageout_deadlock_target--;
1631 }
1632 if (m->tabled)
1633 vm_page_remove(m); /* clears tabled, object, offset */
1634 if (m->absent)
1635 vm_object_absent_release(object);
1636
1637 assert(m->pageq.next == NULL &&
1638 m->pageq.prev == NULL);
1639 m->pageq.next = (queue_entry_t)local_freeq;
1640 local_freeq = m;
1641 local_freed++;
1642
1643 inactive_burst_count = 0;
1644
1645 goto done_with_inactivepage;
1646 }
1647
1648 assert(!m->private);
1649 assert(!m->fictitious);
1650
1651 /*
1652 * If already cleaning this page in place, convert from
1653 * "adjacent" to "target". We can leave the page mapped,
1654 * and vm_pageout_object_terminate will determine whether
1655 * to free or reactivate.
1656 */
1657
1658 if (m->cleaning) {
1659 m->busy = TRUE;
1660 m->pageout = TRUE;
1661 m->dump_cleaning = TRUE;
1662 vm_page_wire(m);
1663
1664 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1665
1666 inactive_burst_count = 0;
1667
1668 goto done_with_inactivepage;
1669 }
1670
1671 /*
1672 * If it's being used, reactivate.
1673 * (Fictitious pages are either busy or absent.)
1674 */
1675 if ( (!m->reference) ) {
1676 refmod_state = pmap_get_refmod(m->phys_page);
1677
1678 if (refmod_state & VM_MEM_REFERENCED)
1679 m->reference = TRUE;
1680 if (refmod_state & VM_MEM_MODIFIED)
1681 m->dirty = TRUE;
1682 }
1683 if (m->reference) {
1684 was_referenced:
1685 vm_page_activate(m);
1686 VM_STAT(reactivations++);
1687
1688 vm_pageout_inactive_used++;
1689 last_page_zf = 0;
1690 inactive_burst_count = 0;
1691
1692 goto done_with_inactivepage;
1693 }
1694
1695 XPR(XPR_VM_PAGEOUT,
1696 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1697 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1698
1699 /*
1700 * we've got a candidate page to steal...
1701 *
1702 * m->dirty is up to date courtesy of the
1703 * preceding check for m->reference... if
1704 * we get here, then m->reference had to be
1705 * FALSE which means we did a pmap_get_refmod
1706 * and updated both m->reference and m->dirty
1707 *
1708 * if it's dirty or precious we need to
1709 * see if the target queue is throttled
1710 * if it is, we need to skip over it by moving it back
1711 * to the end of the inactive queue
1712 */
1713 inactive_throttled = FALSE;
1714
1715 if (m->dirty || m->precious) {
1716 if (object->internal) {
1717 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1718 inactive_throttled = TRUE;
1719 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1720 inactive_throttled = TRUE;
1721 }
1722 }
1723 if (inactive_throttled == TRUE) {
1724 if (m->zero_fill) {
1725 queue_enter(&vm_page_queue_zf, m,
1726 vm_page_t, pageq);
1727 } else {
1728 queue_enter(&vm_page_queue_inactive, m,
1729 vm_page_t, pageq);
1730 }
1731 if (!m->fictitious)
1732 vm_page_inactive_count++;
1733 m->inactive = TRUE;
1734
1735 vm_pageout_scan_inactive_throttled++;
1736
1737 goto done_with_inactivepage;
1738 }
1739 /*
1740 * we've got a page that we can steal...
1741 * eliminate all mappings and make sure
1742 * we have the up-to-date modified state
1743 * first take the page BUSY, so that no new
1744 * mappings can be made
1745 */
1746 m->busy = TRUE;
1747
1748 /*
1749 * if we need to do a pmap_disconnect then we
1750 * need to re-evaluate m->dirty since the pmap_disconnect
1751 * provides the true state atomically... the
1752 * page was still mapped up to the pmap_disconnect
1753 * and may have been dirtied at the last microsecond
1754 *
1755 * we also check for the page being referenced 'late'
1756 * if it was, we first need to do a WAKEUP_DONE on it
1757 * since we already set m->busy = TRUE, before
1758 * going off to reactivate it
1759 *
1760 * if we don't need the pmap_disconnect, then
1761 * m->dirty is up to date courtesy of the
1762 * earlier check for m->reference... if
1763 * we get here, then m->reference had to be
1764 * FALSE which means we did a pmap_get_refmod
1765 * and updated both m->reference and m->dirty...
1766 */
1767 if (m->no_isync == FALSE) {
1768 refmod_state = pmap_disconnect(m->phys_page);
1769
1770 if (refmod_state & VM_MEM_MODIFIED)
1771 m->dirty = TRUE;
1772 if (refmod_state & VM_MEM_REFERENCED) {
1773 m->reference = TRUE;
1774
1775 PAGE_WAKEUP_DONE(m);
1776 goto was_referenced;
1777 }
1778 }
1779 /*
1780 * If it's clean and not precious, we can free the page.
1781 */
1782 if (!m->dirty && !m->precious) {
1783 vm_pageout_inactive_clean++;
1784 goto reclaim_page;
1785 }
1786 vm_pageout_cluster(m);
1787
1788 vm_pageout_inactive_dirty++;
1789
1790 inactive_burst_count = 0;
1791
1792 done_with_inactivepage:
1793 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1794
1795 if (object != NULL) {
1796 vm_object_unlock(object);
1797 object = NULL;
1798 }
1799 if (local_freeq) {
1800 vm_page_free_list(local_freeq);
1801
1802 local_freeq = 0;
1803 local_freed = 0;
1804 }
1805 delayed_unlock = 0;
1806 vm_page_unlock_queues();
1807 mutex_pause();
1808 }
1809 /*
1810 * back to top of pageout scan loop
1811 */
1812 }
1813 }
1814
1815
1816 int vm_page_free_count_init;
1817
1818 void
1819 vm_page_free_reserve(
1820 int pages)
1821 {
1822 int free_after_reserve;
1823
1824 vm_page_free_reserved += pages;
1825
1826 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1827
1828 vm_page_free_min = vm_page_free_reserved +
1829 VM_PAGE_FREE_MIN(free_after_reserve);
1830
1831 vm_page_free_target = vm_page_free_reserved +
1832 VM_PAGE_FREE_TARGET(free_after_reserve);
1833
1834 if (vm_page_free_target < vm_page_free_min + 5)
1835 vm_page_free_target = vm_page_free_min + 5;
1836 }
1837
1838 /*
1839 * vm_pageout is the high level pageout daemon.
1840 */
1841
1842 void
1843 vm_pageout_continue(void)
1844 {
1845 vm_pageout_scan_event_counter++;
1846 vm_pageout_scan();
1847 /* we hold vm_page_queue_free_lock now */
1848 assert(vm_page_free_wanted == 0);
1849 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
1850 mutex_unlock(&vm_page_queue_free_lock);
1851
1852 counter(c_vm_pageout_block++);
1853 thread_block((thread_continue_t)vm_pageout_continue);
1854 /*NOTREACHED*/
1855 }
1856
1857
1858 /*
1859 * must be called with the
1860 * queues and object locks held
1861 */
1862 static void
1863 vm_pageout_queue_steal(vm_page_t m)
1864 {
1865 struct vm_pageout_queue *q;
1866
1867 if (m->object->internal == TRUE)
1868 q = &vm_pageout_queue_internal;
1869 else
1870 q = &vm_pageout_queue_external;
1871
1872 m->laundry = FALSE;
1873 m->pageout_queue = FALSE;
1874 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
1875
1876 m->pageq.next = NULL;
1877 m->pageq.prev = NULL;
1878
1879 vm_object_paging_end(m->object);
1880
1881 q->pgo_laundry--;
1882 }
1883
1884
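/*
 * FAKE_DEADLOCK: debug aid... every FAKE_COUNT pages pushed by the
 * internal pageout thread, grab and release a kernel allocation
 * sized to exhaust the free pool, so that the pageout deadlock
 * handling can be exercised
 */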
1885 #ifdef FAKE_DEADLOCK
1886
1887 #define FAKE_COUNT 5000
1888
1889 int internal_count = 0;
1890 int fake_deadlock = 0;
1891
1892 #endif
1893
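/*
 * vm_pageout_iothread_continue:
 *
 * service loop for a pageout I/O thread... pull pages off the given
 * pageout queue, create a pager for the backing object if it doesn't
 * have one yet, and push each page to its pager via
 * memory_object_data_return; when the queue drains, go idle and
 * block waiting for more work
 */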
1894 static void
1895 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
1896 {
1897 vm_page_t m = NULL;
1898 vm_object_t object;
1899 boolean_t need_wakeup;
1900
1901 vm_page_lock_queues();
1902
1903 while ( !queue_empty(&q->pgo_pending) ) {
1904
1905 q->pgo_busy = TRUE;
1906 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
1907 m->pageout_queue = FALSE;
1908 vm_page_unlock_queues();
1909
1910 m->pageq.next = NULL;
1911 m->pageq.prev = NULL;
1912 #ifdef FAKE_DEADLOCK
1913 if (q == &vm_pageout_queue_internal) {
1914 vm_offset_t addr;
1915 int pg_count;
1916
1917 internal_count++;
1918
1919 if (internal_count == FAKE_COUNT) {
1920
1921 pg_count = vm_page_free_count + vm_page_free_reserved;
1922
1923 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
1924 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
1925 }
1926 internal_count = 0;
1927 fake_deadlock++;
1928 }
1929 }
1930 #endif
1931 object = m->object;
1932
1933 if (!object->pager_initialized) {
1934 vm_object_lock(object);
1935
1936 /*
1937 * If there is no memory object for the page, create
1938 * one and hand it to the default pager.
1939 */
1940
1941 if (!object->pager_initialized)
1942 vm_object_collapse(object, (vm_object_offset_t)0);
1943 if (!object->pager_initialized)
1944 vm_object_pager_create(object);
1945 if (!object->pager_initialized) {
1946 /*
1947 * Still no pager for the object.
1948 * Reactivate the page.
1949 *
1950 * Should only happen if there is no
1951 * default pager.
1952 */
1953 m->list_req_pending = FALSE;
1954 m->cleaning = FALSE;
1955 m->pageout = FALSE;
1956 vm_page_unwire(m);
1957
1958 vm_pageout_throttle_up(m);
1959
1960 vm_page_lock_queues();
1961 vm_pageout_dirty_no_pager++;
1962 vm_page_activate(m);
1963 vm_page_unlock_queues();
1964
1965 /*
1966 * And we are done with it.
1967 */
1968 PAGE_WAKEUP_DONE(m);
1969
1970 vm_object_paging_end(object);
1971 vm_object_unlock(object);
1972
1973 vm_page_lock_queues();
1974 continue;
1975 } else if (object->pager == MEMORY_OBJECT_NULL) {
1976 /*
1977 * This pager has been destroyed by either
1978 * memory_object_destroy or vm_object_destroy, and
1979 * so there is nowhere for the page to go.
1980 * Just free the page... VM_PAGE_FREE takes
1981 * care of cleaning up all the state...
1982 * including doing the vm_pageout_throttle_up
1983 */
1984 VM_PAGE_FREE(m);
1985
1986 vm_object_paging_end(object);
1987 vm_object_unlock(object);
1988
1989 vm_page_lock_queues();
1990 continue;
1991 }
1992 vm_object_unlock(object);
1993 }
1994 /*
1995 * we expect the paging_in_progress reference to have
1996 * already been taken on the object before it was added
1997 * to the appropriate pageout I/O queue... this will
1998 * keep the object from being terminated and/or the
1999 * paging_offset from changing until the I/O has
2000 * completed... therefore no need to lock the object to
2001 * pull the paging_offset from it.
2002 *
2003 * Send the data to the pager.
2004 * any pageout clustering happens there
2005 */
2006 memory_object_data_return(object->pager,
2007 m->offset + object->paging_offset,
2008 PAGE_SIZE,
2009 NULL,
2010 NULL,
2011 FALSE,
2012 FALSE,
2013 0);
2014
2015 vm_object_lock(object);
2016 vm_object_paging_end(object);
2017 vm_object_unlock(object);
2018
2019 vm_page_lock_queues();
2020 }
2021 assert_wait((event_t) q, THREAD_UNINT);
2022
2023
2024 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2025 q->pgo_throttled = FALSE;
2026 need_wakeup = TRUE;
2027 } else
2028 need_wakeup = FALSE;
2029
2030 q->pgo_busy = FALSE;
2031 q->pgo_idle = TRUE;
2032 vm_page_unlock_queues();
2033
2034 if (need_wakeup == TRUE)
2035 thread_wakeup((event_t) &q->pgo_laundry);
2036
2037 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2038 /*NOTREACHED*/
2039 }
2040
2041
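/*
 * continuation for the external (file-backed) pageout I/O thread
 */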
2042 static void
2043 vm_pageout_iothread_external(void)
2044 {
2045
2046 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2047 /*NOTREACHED*/
2048 }
2049
2050
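/*
 * continuation for the internal (anonymous) pageout I/O thread...
 * sets TH_OPT_VMPRIV (VM-privileged) on itself before entering the
 * common service loop
 */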
2051 static void
2052 vm_pageout_iothread_internal(void)
2053 {
2054 thread_t self = current_thread();
2055
2056 self->options |= TH_OPT_VMPRIV;
2057
2058 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2059 /*NOTREACHED*/
2060 }
2061
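/*
 * vm_pageout_garbage_collect:
 *
 * reclaim kernel metadata... collect unused kernel stacks, let the
 * machine layer and the zone allocator give back memory, then block
 * until woken via &vm_pageout_garbage_collect
 */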
2062 static void
2063 vm_pageout_garbage_collect(int collect)
2064 {
2065 if (collect) {
2066 stack_collect();
2067
2068 /*
2069 * consider_zone_gc should be last, because the other operations
2070 * might return memory to zones.
2071 */
2072 consider_machine_collect();
2073 consider_zone_gc();
2074
2075 consider_machine_adjust();
2076 }
2077
2078 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2079
2080 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2081 /*NOTREACHED*/
2082 }
2083
2084
2085
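/*
 * vm_pageout:
 *
 * startup for the pageout daemon... raise our scheduling priority,
 * initialize the pageout tunables and the internal/external I/O
 * queues, spawn the I/O and garbage-collection threads, then fall
 * into vm_pageout_continue and never return
 */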
2086 void
2087 vm_pageout(void)
2088 {
2089 thread_t self = current_thread();
2090 thread_t thread;
2091 kern_return_t result;
2092 spl_t s;
2093
2094 /*
2095 * Set thread privileges.
2096 */
2097 s = splsched();
2098 thread_lock(self);
2099 self->priority = BASEPRI_PREEMPT - 1;
2100 set_sched_pri(self, self->priority);
2101 thread_unlock(self);
2102 splx(s);
2103
2104 /*
2105 * Initialize some paging parameters.
2106 */
2107
2108 if (vm_pageout_idle_wait == 0)
2109 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2110
2111 if (vm_pageout_burst_wait == 0)
2112 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2113
2114 if (vm_pageout_empty_wait == 0)
2115 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2116
2117 if (vm_pageout_deadlock_wait == 0)
2118 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2119
2120 if (vm_pageout_deadlock_relief == 0)
2121 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2122
2123 if (vm_pageout_inactive_relief == 0)
2124 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2125
2126 if (vm_pageout_burst_active_throttle == 0)
2127 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2128
2129 if (vm_pageout_burst_inactive_throttle == 0)
2130 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2131
2132 /*
2133 * Set kernel task to low backing store privileged
2134 * status
2135 */
2136 task_lock(kernel_task);
2137 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2138 task_unlock(kernel_task);
2139
2140 vm_page_free_count_init = vm_page_free_count;
2141 vm_zf_iterator = 0;
2142 /*
2143 * even if we've already called vm_page_free_reserve,
2144 * call it again here to ensure that the targets are
2145 * accurately calculated (it uses vm_page_free_count_init)...
2146 * calling it with an arg of 0 will not change the reserve
2147 * but will re-calculate free_min and free_target
2148 */
2149 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2150 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2151 } else
2152 vm_page_free_reserve(0);
2153
2154
2155 queue_init(&vm_pageout_queue_external.pgo_pending);
2156 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2157 vm_pageout_queue_external.pgo_laundry = 0;
2158 vm_pageout_queue_external.pgo_idle = FALSE;
2159 vm_pageout_queue_external.pgo_busy = FALSE;
2160 vm_pageout_queue_external.pgo_throttled = FALSE;
2161
2162 queue_init(&vm_pageout_queue_internal.pgo_pending);
2163 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2164 vm_pageout_queue_internal.pgo_laundry = 0;
2165 vm_pageout_queue_internal.pgo_idle = FALSE;
2166 vm_pageout_queue_internal.pgo_busy = FALSE;
2167 vm_pageout_queue_internal.pgo_throttled = FALSE;
2168
2169
2170 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &thread);
2171 if (result != KERN_SUCCESS)
2172 panic("vm_pageout_iothread_internal: create failed");
2173
2174 thread_deallocate(thread);
2175
2176
2177 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_PREEMPT - 1, &thread);
2178 if (result != KERN_SUCCESS)
2179 panic("vm_pageout_iothread_external: create failed");
2180
2181 thread_deallocate(thread);
2182
2183
2184 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_PREEMPT - 2, &thread);
2185 if (result != KERN_SUCCESS)
2186 panic("vm_pageout_garbage_collect: create failed");
2187
2188 thread_deallocate(thread);
2189
2190 vm_object_reaper_init();
2191
2192
2193 vm_pageout_continue();
2194 /*NOTREACHED*/
2195 }
2196
2197
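/*
 * upl_create:
 *
 * allocate and initialize a upl... UPL_CREATE_INTERNAL appends an
 * inline upl_page_info array for (size / PAGE_SIZE) pages, and
 * UPL_CREATE_LITE appends a one-bit-per-page map rounded up to a
 * 4-byte boundary
 */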
2198 static upl_t
2199 upl_create(
2200 int flags,
2201 upl_size_t size)
2202 {
2203 upl_t upl;
2204 int page_field_size; /* bit field in word size buf */
2205
2206 page_field_size = 0;
2207 if (flags & UPL_CREATE_LITE) {
2208 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2209 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2210 }
2211 if(flags & UPL_CREATE_INTERNAL) {
2212 upl = (upl_t)kalloc(sizeof(struct upl)
2213 + (sizeof(struct upl_page_info)*(size/PAGE_SIZE))
2214 + page_field_size);
2215 } else {
2216 upl = (upl_t)kalloc(sizeof(struct upl) + page_field_size);
2217 }
2218 upl->flags = 0;
2219 upl->src_object = NULL;
2220 upl->kaddr = (vm_offset_t)0;
2221 upl->size = 0;
2222 upl->map_object = NULL;
2223 upl->ref_count = 1;
2224 upl_lock_init(upl);
2225 #ifdef UPL_DEBUG
2226 upl->ubc_alias1 = 0;
2227 upl->ubc_alias2 = 0;
2228 #endif /* UPL_DEBUG */
2229 return(upl);
2230 }
2231
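/*
 * upl_destroy:
 *
 * final teardown of a upl... unlink it from its object's uplq when
 * UPL_DEBUG is on, drop the reference on the map_object when a
 * pageout (shadow) object was inserted, and free the upl together
 * with any inline page list and lite map
 */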
2232 static void
2233 upl_destroy(
2234 upl_t upl)
2235 {
2236 int page_field_size; /* bit field in word size buf */
2237
2238 #ifdef UPL_DEBUG
2239 {
2240 upl_t upl_ele;
2241 vm_object_t object;
2242 if (upl->map_object->pageout) {
2243 object = upl->map_object->shadow;
2244 } else {
2245 object = upl->map_object;
2246 }
2247 vm_object_lock(object);
2248 queue_iterate(&object->uplq, upl_ele, upl_t, uplq) {
2249 if(upl_ele == upl) {
2250 queue_remove(&object->uplq,
2251 upl_ele, upl_t, uplq);
2252 break;
2253 }
2254 }
2255 vm_object_unlock(object);
2256 }
2257 #endif /* UPL_DEBUG */
2258 /* drop a reference on the map_object whether or */
2259 /* not a pageout object is inserted */
2260 if(upl->map_object->pageout)
2261 vm_object_deallocate(upl->map_object);
2262
2263 page_field_size = 0;
2264 if (upl->flags & UPL_LITE) {
2265 page_field_size = ((upl->size/PAGE_SIZE) + 7) >> 3;
2266 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2267 }
2268 if(upl->flags & UPL_INTERNAL) {
2269 kfree(upl,
2270 sizeof(struct upl) +
2271 (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE))
2272 + page_field_size);
2273 } else {
2274 kfree(upl, sizeof(struct upl) + page_field_size);
2275 }
2276 }
2277
2278 void uc_upl_dealloc(upl_t upl);
2279 __private_extern__ void
2280 uc_upl_dealloc(
2281 upl_t upl)
2282 {
2283 upl->ref_count -= 1;
2284 if(upl->ref_count == 0) {
2285 upl_destroy(upl);
2286 }
2287 }
2288
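/*
 * drop a caller reference on the upl and destroy it when the last
 * reference goes away
 */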
2289 void
2290 upl_deallocate(
2291 upl_t upl)
2292 {
2293
2294 upl->ref_count -= 1;
2295 if(upl->ref_count == 0) {
2296 upl_destroy(upl);
2297 }
2298 }
2299
2300 /*
2301 * Statistics about UPL enforcement of copy-on-write obligations.
2302 */
2303 unsigned long upl_cow = 0;
2304 unsigned long upl_cow_again = 0;
2305 unsigned long upl_cow_contiguous = 0;
2306 unsigned long upl_cow_pages = 0;
2307 unsigned long upl_cow_again_pages = 0;
2308 unsigned long upl_cow_contiguous_pages = 0;
2309
2310 /*
2311 * Routine: vm_object_upl_request
2312 * Purpose:
2313 * Cause the population of a portion of a vm_object.
2314 * Depending on the nature of the request, the pages
2315 * returned may contain valid data or be uninitialized.
2316 * A page list structure, listing the physical pages,
2317 * will be returned upon request.
2318 * This function is called by the file system or any other
2319 * supplier of backing store to a pager.
2320 * IMPORTANT NOTE: The caller must still respect the relationship
2321 * between the vm_object and its backing memory object. The
2322 * caller MUST NOT substitute changes in the backing file
2323 * without first doing a memory_object_lock_request on the
2324 * target range unless it is known that the pages are not
2325 * shared with another entity at the pager level.
2326 * Copy_in_to:
2327 * if a page list structure is present
2328 * return the mapped physical pages, where a
2329 * page is not present, return a non-initialized
2330 * one. If the no_sync bit is turned on, don't
2331 * call the pager unlock to synchronize with other
2332 * possible copies of the page. Leave pages busy
2333 * in the original object, if a page list structure
2334 * was specified. When a commit of the page list
2335 * pages is done, the dirty bit will be set for each one.
2336 * Copy_out_from:
2337 * If a page list structure is present, return
2338 * all mapped pages. Where a page does not exist
2339 * map a zero filled one. Leave pages busy in
2340 * the original object. If a page list structure
2341 * is not specified, this call is a no-op.
2342 *
2343 * Note: access of default pager objects has a rather interesting
2344 * twist. The caller of this routine, presumably the file system
2345 * page cache handling code, will never actually make a request
2346 * against a default pager backed object. Only the default
2347 * pager will make requests on backing store related vm_objects.
2348 * In this way the default pager can maintain the relationship
2349 * between backing store files (abstract memory objects) and
2350 * the vm_objects (cache objects) they support.
2351 *
2352 */
2353
2354 __private_extern__ kern_return_t
2355 vm_object_upl_request(
2356 vm_object_t object,
2357 vm_object_offset_t offset,
2358 upl_size_t size,
2359 upl_t *upl_ptr,
2360 upl_page_info_array_t user_page_list,
2361 unsigned int *page_list_count,
2362 int cntrl_flags)
2363 {
2364 vm_page_t dst_page = VM_PAGE_NULL;
2365 vm_object_offset_t dst_offset = offset;
2366 upl_size_t xfer_size = size;
2367 boolean_t do_m_lock = FALSE;
2368 boolean_t dirty;
2369 boolean_t hw_dirty;
2370 upl_t upl = NULL;
2371 unsigned int entry;
2372 #if MACH_CLUSTER_STATS
2373 boolean_t encountered_lrp = FALSE;
2374 #endif
2375 vm_page_t alias_page = NULL;
2376 int page_ticket;
2377 int refmod_state;
2378 wpl_array_t lite_list = NULL;
2379 vm_object_t last_copy_object;
2380
2381
2382 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2383 /*
2384 * For forward compatibility's sake,
2385 * reject any unknown flag.
2386 */
2387 return KERN_INVALID_VALUE;
2388 }
2389
2390 page_ticket = (cntrl_flags & UPL_PAGE_TICKET_MASK)
2391 >> UPL_PAGE_TICKET_SHIFT;
2392
2393 if(((size/PAGE_SIZE) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
2394 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2395 }
2396
2397 if(cntrl_flags & UPL_SET_INTERNAL)
2398 if(page_list_count != NULL)
2399 *page_list_count = MAX_UPL_TRANSFER;
2400
2401 if((!object->internal) && (object->paging_offset != 0))
2402 panic("vm_object_upl_request: vnode object with non-zero paging offset\n");
2403
2404 if((cntrl_flags & UPL_COPYOUT_FROM) && (upl_ptr == NULL)) {
2405 return KERN_SUCCESS;
2406 }
2407
2408 vm_object_lock(object);
2409 vm_object_paging_begin(object);
2410 vm_object_unlock(object);
2411
2412 if(upl_ptr) {
2413 if(cntrl_flags & UPL_SET_INTERNAL) {
2414 if(cntrl_flags & UPL_SET_LITE) {
2415 uintptr_t page_field_size;
2416 upl = upl_create(
2417 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
2418 size);
2419 user_page_list = (upl_page_info_t *)
2420 (((uintptr_t)upl) + sizeof(struct upl));
2421 lite_list = (wpl_array_t)
2422 (((uintptr_t)user_page_list) +
2423 ((size/PAGE_SIZE) *
2424 sizeof(upl_page_info_t)));
2425 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2426 page_field_size =
2427 (page_field_size + 3) & 0xFFFFFFFC;
2428 bzero((char *)lite_list, page_field_size);
2429 upl->flags =
2430 UPL_LITE | UPL_INTERNAL;
2431 } else {
2432 upl = upl_create(UPL_CREATE_INTERNAL, size);
2433 user_page_list = (upl_page_info_t *)
2434 (((uintptr_t)upl) + sizeof(struct upl));
2435 upl->flags = UPL_INTERNAL;
2436 }
2437 } else {
2438 if(cntrl_flags & UPL_SET_LITE) {
2439 uintptr_t page_field_size;
2440 upl = upl_create(UPL_CREATE_LITE, size);
2441 lite_list = (wpl_array_t)
2442 (((uintptr_t)upl) + sizeof(struct upl));
2443 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2444 page_field_size =
2445 (page_field_size + 3) & 0xFFFFFFFC;
2446 bzero((char *)lite_list, page_field_size);
2447 upl->flags = UPL_LITE;
2448 } else {
2449 upl = upl_create(UPL_CREATE_EXTERNAL, size);
2450 upl->flags = 0;
2451 }
2452 }
2453
2454 if (object->phys_contiguous) {
2455 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2456 object->copy != VM_OBJECT_NULL) {
2457 /* Honor copy-on-write obligations */
2458
2459 /*
2460 * XXX FBDP
2461 * We could still have a race...
2462 * A is here building the UPL for a write().
2463 * A pushes the pages to the current copy
2464 * object.
2465 * A returns the UPL to the caller.
2466 * B comes along and establishes another
2467 * private mapping on this object, inserting
2468 * a new copy object between the original
2469 * object and the old copy object.
2470 * B reads a page and gets the original contents
2471 * from the original object.
2472 * A modifies the page in the original object.
2473 * B reads the page again and sees A's changes,
2474 * which is wrong...
2475 *
2476 * The problem is that the pages are not
2477 * marked "busy" in the original object, so
2478 * nothing prevents B from reading it
2479 * before A's changes are completed.
2480 *
2481 * The "paging_in_progress" might protect us
2482 * from the insertion of a new copy object
2483 * though... To be verified.
2484 */
2485 vm_object_lock_request(object,
2486 offset,
2487 size,
2488 FALSE,
2489 MEMORY_OBJECT_COPY_SYNC,
2490 VM_PROT_NO_CHANGE);
2491 upl_cow_contiguous++;
2492 upl_cow_contiguous_pages += size >> PAGE_SHIFT;
2493 }
2494
2495 upl->map_object = object;
2496 /* don't need any shadow mappings for this one */
2497 /* since it is already I/O memory */
2498 upl->flags |= UPL_DEVICE_MEMORY;
2499
2500
2501 /* paging_in_progress protects paging_offset */
2502 upl->offset = offset + object->paging_offset;
2503 upl->size = size;
2504 *upl_ptr = upl;
2505 if(user_page_list) {
2506 user_page_list[0].phys_addr =
2507 (offset + object->shadow_offset)>>PAGE_SHIFT;
2508 user_page_list[0].device = TRUE;
2509 }
2510
2511 if(page_list_count != NULL) {
2512 if (upl->flags & UPL_INTERNAL) {
2513 *page_list_count = 0;
2514 } else {
2515 *page_list_count = 1;
2516 }
2517 }
2518
2519 return KERN_SUCCESS;
2520 }
2521
2522 if(user_page_list)
2523 user_page_list[0].device = FALSE;
2524
2525 if(cntrl_flags & UPL_SET_LITE) {
2526 upl->map_object = object;
2527 } else {
2528 upl->map_object = vm_object_allocate(size);
2529 /*
2530 * No need to lock the new object: nobody else knows
2531 * about it yet, so it's all ours so far.
2532 */
2533 upl->map_object->shadow = object;
2534 upl->map_object->pageout = TRUE;
2535 upl->map_object->can_persist = FALSE;
2536 upl->map_object->copy_strategy =
2537 MEMORY_OBJECT_COPY_NONE;
2538 upl->map_object->shadow_offset = offset;
2539 upl->map_object->wimg_bits = object->wimg_bits;
2540 }
2541
2542 }
2543 if (!(cntrl_flags & UPL_SET_LITE)) {
2544 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2545 }
2546
2547 /*
2548 * ENCRYPTED SWAP:
2549 * Just mark the UPL as "encrypted" here.
2550 * We'll actually encrypt the pages later,
2551 * in upl_encrypt(), when the caller has
2552 * selected which pages need to go to swap.
2553 */
2554 if (cntrl_flags & UPL_ENCRYPT) {
2555 upl->flags |= UPL_ENCRYPTED;
2556 }
2557 if (cntrl_flags & UPL_FOR_PAGEOUT) {
2558 upl->flags |= UPL_PAGEOUT;
2559 }
2560 vm_object_lock(object);
2561
2562 /* we can lock in the paging_offset once paging_in_progress is set */
2563 if(upl_ptr) {
2564 upl->size = size;
2565 upl->offset = offset + object->paging_offset;
2566 *upl_ptr = upl;
2567 #ifdef UPL_DEBUG
2568 queue_enter(&object->uplq, upl, upl_t, uplq);
2569 #endif /* UPL_DEBUG */
2570 }
2571
2572 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2573 object->copy != VM_OBJECT_NULL) {
2574 /* Honor copy-on-write obligations */
2575
2576 /*
2577 * The caller is gathering these pages and
2578 * might modify their contents. We need to
2579 * make sure that the copy object has its own
2580 * private copies of these pages before we let
2581 * the caller modify them.
2582 */
2583 vm_object_update(object,
2584 offset,
2585 size,
2586 NULL,
2587 NULL,
2588 FALSE, /* should_return */
2589 MEMORY_OBJECT_COPY_SYNC,
2590 VM_PROT_NO_CHANGE);
2591 upl_cow++;
2592 upl_cow_pages += size >> PAGE_SHIFT;
2593
2594 }
2595 /* remember which copy object we synchronized with */
2596 last_copy_object = object->copy;
2597
2598 entry = 0;
2599 if(cntrl_flags & UPL_COPYOUT_FROM) {
2600 upl->flags |= UPL_PAGE_SYNC_DONE;
2601
2602 while (xfer_size) {
2603 if((alias_page == NULL) &&
2604 !(cntrl_flags & UPL_SET_LITE)) {
2605 vm_object_unlock(object);
2606 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2607 vm_object_lock(object);
2608 }
2609 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2610 dst_page->fictitious ||
2611 dst_page->absent ||
2612 dst_page->error ||
2613 (dst_page->wire_count && !dst_page->pageout) ||
2614
2615 ((!dst_page->inactive) && (cntrl_flags & UPL_FOR_PAGEOUT) &&
2616 (dst_page->page_ticket != page_ticket) &&
2617 ((dst_page->page_ticket+1) != page_ticket)) ) {
2618
2619 if (user_page_list)
2620 user_page_list[entry].phys_addr = 0;
2621 } else {
2622 /*
2623 * grab this up front...
2624 * a high percentage of the time we're going to
2625 * need the hardware modification state a bit later
2626 * anyway... so we can eliminate an extra call into
2627 * the pmap layer by grabbing it here and recording it
2628 */
2629 refmod_state = pmap_get_refmod(dst_page->phys_page);
2630
2631 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2632 /*
2633 * we're only asking for DIRTY pages to be returned
2634 */
2635
2636 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2637 /*
2638 * if we were the page stolen by vm_pageout_scan to be
2639 * cleaned (as opposed to a buddy being clustered in)
2640 * or this request is not being driven by a PAGEOUT cluster,
2641 * then we only need to check for the page being dirty or
2642 * precious to decide whether to return it
2643 */
2644 if (dst_page->dirty || dst_page->precious ||
2645 (refmod_state & VM_MEM_MODIFIED)) {
2646 goto check_busy;
2647 }
2648 }
2649 /*
2650 * this is a request for a PAGEOUT cluster and this page
2651 * is merely along for the ride as a 'buddy'... not only
2652 * does it have to be dirty to be returned, but it also
2653 * can't have been referenced recently... note that we've
2654 * already filtered above based on whether this page is
2655 * currently on the inactive queue or it meets the page
2656 * ticket (generation count) check
2657 */
2658 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2659 ((refmod_state & VM_MEM_MODIFIED) ||
2660 dst_page->dirty || dst_page->precious) ) {
2661 goto check_busy;
2662 }
2663 /*
2664 * if we reach here, we're not to return
2665 * the page... go on to the next one
2666 */
2667 if (user_page_list)
2668 user_page_list[entry].phys_addr = 0;
2669 entry++;
2670 dst_offset += PAGE_SIZE_64;
2671 xfer_size -= PAGE_SIZE;
2672 continue;
2673 }
2674 check_busy:
2675 if(dst_page->busy &&
2676 (!(dst_page->list_req_pending &&
2677 dst_page->pageout))) {
2678 if(cntrl_flags & UPL_NOBLOCK) {
2679 if(user_page_list) {
2680 user_page_list[entry].phys_addr = 0;
2681 }
2682 entry++;
2683 dst_offset += PAGE_SIZE_64;
2684 xfer_size -= PAGE_SIZE;
2685 continue;
2686 }
2687 /*
2688 * someone else is playing with the
2689 * page. We will have to wait.
2690 */
2691 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2692 continue;
2693 }
2694 /* Someone else already cleaning the page? */
2695 if((dst_page->cleaning || dst_page->absent ||
2696 dst_page->wire_count != 0) &&
2697 !dst_page->list_req_pending) {
2698 if(user_page_list) {
2699 user_page_list[entry].phys_addr = 0;
2700 }
2701 entry++;
2702 dst_offset += PAGE_SIZE_64;
2703 xfer_size -= PAGE_SIZE;
2704 continue;
2705 }
2706 /* eliminate all mappings from the */
2707 /* original object and its progeny */
2708
2709 vm_page_lock_queues();
2710
2711 if (dst_page->pageout_queue == TRUE)
2712 /*
2713 * we've buddied up a page for a clustered pageout
2714 * that has already been moved to the pageout
2715 * queue by pageout_scan... we need to remove
2716 * it from the queue and drop the laundry count
2717 * on that queue
2718 */
2719 vm_pageout_queue_steal(dst_page);
2720 #if MACH_CLUSTER_STATS
2721 /* pageout statistics gathering. count */
2722 /* all the pages we will page out that */
2723 /* were not counted in the initial */
2724 /* vm_pageout_scan work */
2725 if(dst_page->list_req_pending)
2726 encountered_lrp = TRUE;
2727 if((dst_page->dirty ||
2728 (dst_page->object->internal &&
2729 dst_page->precious)) &&
2730 (dst_page->list_req_pending
2731 == FALSE)) {
2732 if(encountered_lrp) {
2733 CLUSTER_STAT
2734 (pages_at_higher_offsets++;)
2735 } else {
2736 CLUSTER_STAT
2737 (pages_at_lower_offsets++;)
2738 }
2739 }
2740 #endif
2741 /* Turn off busy indication on pending */
2742 /* pageout. Note: we can only get here */
2743 /* in the request pending case. */
2744 dst_page->list_req_pending = FALSE;
2745 dst_page->busy = FALSE;
2746 dst_page->cleaning = FALSE;
2747
2748 hw_dirty = refmod_state & VM_MEM_MODIFIED;
2749 dirty = hw_dirty ? TRUE : dst_page->dirty;
2750
2751 if(cntrl_flags & UPL_SET_LITE) {
2752 int pg_num;
2753 pg_num = (dst_offset-offset)/PAGE_SIZE;
2754 lite_list[pg_num>>5] |=
2755 1 << (pg_num & 31);
2756 if (hw_dirty)
2757 pmap_clear_modify(dst_page->phys_page);
2758 /*
2759 * Record that this page has been
2760 * written out
2761 */
2762 #if MACH_PAGEMAP
2763 vm_external_state_set(
2764 object->existence_map,
2765 dst_page->offset);
2766 #endif /*MACH_PAGEMAP*/
2767
2768 /*
2769 * Mark original page as cleaning
2770 * in place.
2771 */
2772 dst_page->cleaning = TRUE;
2773 dst_page->dirty = TRUE;
2774 dst_page->precious = FALSE;
2775 } else {
2776 /* use pageclean setup, it is more */
2777 /* convenient even for the pageout */
2778 /* cases here */
2779
2780 vm_object_lock(upl->map_object);
2781 vm_pageclean_setup(dst_page,
2782 alias_page, upl->map_object,
2783 size - xfer_size);
2784 vm_object_unlock(upl->map_object);
2785
2786 alias_page->absent = FALSE;
2787 alias_page = NULL;
2788 }
2789
2790 if(!dirty) {
2791 dst_page->dirty = FALSE;
2792 dst_page->precious = TRUE;
2793 }
2794
2795 if(dst_page->pageout)
2796 dst_page->busy = TRUE;
2797
2798 if ( (cntrl_flags & UPL_ENCRYPT) ) {
2799 /*
2800 * ENCRYPTED SWAP:
2801 * We want to deny access to the target page
2802 * because its contents are about to be
2803 * encrypted and the user would be very
2804 * confused to see encrypted data instead
2805 * of their data.
2806 */
2807 dst_page->busy = TRUE;
2808 }
2809 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2810 /*
2811 * deny access to the target page
2812 * while it is being worked on
2813 */
2814 if ((!dst_page->pageout) &&
2815 (dst_page->wire_count == 0)) {
2816 dst_page->busy = TRUE;
2817 dst_page->pageout = TRUE;
2818 vm_page_wire(dst_page);
2819 }
2820 }
2821
2822 if(user_page_list) {
2823 user_page_list[entry].phys_addr
2824 = dst_page->phys_page;
2825 user_page_list[entry].dirty =
2826 dst_page->dirty;
2827 user_page_list[entry].pageout =
2828 dst_page->pageout;
2829 user_page_list[entry].absent =
2830 dst_page->absent;
2831 user_page_list[entry].precious =
2832 dst_page->precious;
2833 }
2834 vm_page_unlock_queues();
2835
2836 /*
2837 * ENCRYPTED SWAP:
2838 * The caller is gathering this page and might
2839 * access its contents later on. Decrypt the
2840 * page before adding it to the UPL, so that
2841 * the caller never sees encrypted data.
2842 */
2843 if (! (cntrl_flags & UPL_ENCRYPT) &&
2844 dst_page->encrypted) {
2845 assert(dst_page->busy);
2846
2847 vm_page_decrypt(dst_page, 0);
2848 vm_page_decrypt_for_upl_counter++;
2849
2850 /*
2851 * Retry this page, since anything
2852 * could have changed while we were
2853 * decrypting.
2854 */
2855 continue;
2856 }
2857 }
2858 entry++;
2859 dst_offset += PAGE_SIZE_64;
2860 xfer_size -= PAGE_SIZE;
2861 }
2862 } else {
2863 while (xfer_size) {
2864 if((alias_page == NULL) &&
2865 !(cntrl_flags & UPL_SET_LITE)) {
2866 vm_object_unlock(object);
2867 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2868 vm_object_lock(object);
2869 }
2870
2871 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2872 object->copy != last_copy_object) {
2873 /* Honor copy-on-write obligations */
2874
2875 /*
2876 * The copy object has changed since we
2877 * last synchronized for copy-on-write.
2878 * Another copy object might have been
2879 * inserted while we released the object's
2880 * lock. Since someone could have seen the
2881 * original contents of the remaining pages
2882 * through that new object, we have to
2883 * synchronize with it again for the remaining
2884 * pages only. The previous pages are "busy"
2885 * so they can not be seen through the new
2886 * mapping. The new mapping will see our
2887 * upcoming changes for those previous pages,
2888 * but that's OK since they couldn't see what
2889 * was there before. It's just a race anyway
2890 * and there's no guarantee of consistency or
2891 * atomicity. We just don't want new mappings
2892 * to see both the *before* and *after* pages.
2893 */
2894 if (object->copy != VM_OBJECT_NULL) {
2895 vm_object_update(
2896 object,
2897 dst_offset,/* current offset */
2898 xfer_size, /* remaining size */
2899 NULL,
2900 NULL,
2901 FALSE, /* should_return */
2902 MEMORY_OBJECT_COPY_SYNC,
2903 VM_PROT_NO_CHANGE);
2904 upl_cow_again++;
2905 upl_cow_again_pages +=
2906 xfer_size >> PAGE_SHIFT;
2907 }
2908 /* remember the copy object we synced with */
2909 last_copy_object = object->copy;
2910 }
2911
2912 dst_page = vm_page_lookup(object, dst_offset);
2913
2914 if(dst_page != VM_PAGE_NULL) {
2915 if((cntrl_flags & UPL_RET_ONLY_ABSENT) &&
2916 !((dst_page->list_req_pending)
2917 && (dst_page->absent))) {
2918 /* we are doing extended range */
2919 /* requests. we want to grab */
2920 /* pages around some which are */
2921 /* already present. */
2922 if(user_page_list) {
2923 user_page_list[entry].phys_addr = 0;
2924 }
2925 entry++;
2926 dst_offset += PAGE_SIZE_64;
2927 xfer_size -= PAGE_SIZE;
2928 continue;
2929 }
2930 if((dst_page->cleaning) &&
2931 !(dst_page->list_req_pending)) {
2932 /*someone else is writing to the */
2933 /* page. We will have to wait. */
2934 PAGE_SLEEP(object,dst_page,THREAD_UNINT);
2935 continue;
2936 }
2937 if ((dst_page->fictitious &&
2938 dst_page->list_req_pending)) {
2939 /* dump the fictitious page */
2940 dst_page->list_req_pending = FALSE;
2941 dst_page->clustered = FALSE;
2942
2943 vm_page_lock_queues();
2944 vm_page_free(dst_page);
2945 vm_page_unlock_queues();
2946
2947 dst_page = NULL;
2948 } else if ((dst_page->absent &&
2949 dst_page->list_req_pending)) {
2950 /* the default_pager case */
2951 dst_page->list_req_pending = FALSE;
2952 dst_page->busy = FALSE;
2953 }
2954 }
2955 if(dst_page == VM_PAGE_NULL) {
2956 if(object->private) {
2957 /*
2958 * This is a nasty wrinkle for users
2959 * of upl who encounter device or
2960 * private memory; however, it is
2961 * unavoidable... only a fault can
2962 * resolve the actual backing
2963 * physical page by asking the
2964 * backing device.
2965 */
2966 if(user_page_list) {
2967 user_page_list[entry].phys_addr = 0;
2968 }
2969 entry++;
2970 dst_offset += PAGE_SIZE_64;
2971 xfer_size -= PAGE_SIZE;
2972 continue;
2973 }
2974 /* need to allocate a page */
2975 dst_page = vm_page_alloc(object, dst_offset);
2976 if (dst_page == VM_PAGE_NULL) {
2977 vm_object_unlock(object);
2978 VM_PAGE_WAIT();
2979 vm_object_lock(object);
2980 continue;
2981 }
2982 dst_page->busy = FALSE;
2983 #if 0
2984 if(cntrl_flags & UPL_NO_SYNC) {
2985 dst_page->page_lock = 0;
2986 dst_page->unlock_request = 0;
2987 }
2988 #endif
2989 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
2990 /*
2991 * if UPL_RET_ONLY_ABSENT was specified,
2992 * then we're definitely setting up a
2993 * upl for a clustered read/pagein
2994 * operation... mark the pages as clustered
2995 * so vm_fault can correctly attribute them
2996 * to the 'pagein' bucket the first time
2997 * a fault happens on them
2998 */
2999 dst_page->clustered = TRUE;
3000 }
3001 dst_page->absent = TRUE;
3002 object->absent_count++;
3003 }
3004 #if 1
3005 if(cntrl_flags & UPL_NO_SYNC) {
3006 dst_page->page_lock = 0;
3007 dst_page->unlock_request = 0;
3008 }
3009 #endif /* 1 */
3010
3011 /*
3012 * ENCRYPTED SWAP:
3013 */
3014 if (cntrl_flags & UPL_ENCRYPT) {
3015 /*
3016 * The page is going to be encrypted when we
3017 * get it from the pager, so mark it so.
3018 */
3019 dst_page->encrypted = TRUE;
3020 } else {
3021 /*
3022 * Otherwise, the page will not contain
3023 * encrypted data.
3024 */
3025 dst_page->encrypted = FALSE;
3026 }
3027
3028 dst_page->overwriting = TRUE;
3029 if(dst_page->fictitious) {
3030 panic("need corner case for fictitious page");
3031 }
3032 if(dst_page->page_lock) {
3033 do_m_lock = TRUE;
3034 }
3035 if(upl_ptr) {
3036
3037 /* eliminate all mappings from the */
3038 /* original object and its progeny */
3039
3040 if(dst_page->busy) {
3041 /*someone else is playing with the */
3042 /* page. We will have to wait. */
3043 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3044 continue;
3045 }
3046 vm_page_lock_queues();
3047
3048 if( !(cntrl_flags & UPL_FILE_IO))
3049 hw_dirty = pmap_disconnect(dst_page->phys_page) & VM_MEM_MODIFIED;
3050 else
3051 hw_dirty = pmap_get_refmod(dst_page->phys_page) & VM_MEM_MODIFIED;
3052 dirty = hw_dirty ? TRUE : dst_page->dirty;
3053
3054 if(cntrl_flags & UPL_SET_LITE) {
3055 int pg_num;
3056 pg_num = (dst_offset-offset)/PAGE_SIZE;
3057 lite_list[pg_num>>5] |=
3058 1 << (pg_num & 31);
3059 if (hw_dirty)
3060 pmap_clear_modify(dst_page->phys_page);
3061 /*
3062 * Record that this page has been
3063 * written out
3064 */
3065 #if MACH_PAGEMAP
3066 vm_external_state_set(
3067 object->existence_map,
3068 dst_page->offset);
3069 #endif /*MACH_PAGEMAP*/
3070
3071 /*
3072 * Mark original page as cleaning
3073 * in place.
3074 */
3075 dst_page->cleaning = TRUE;
3076 dst_page->dirty = TRUE;
3077 dst_page->precious = FALSE;
3078 } else {
3079 /* use pageclean setup, it is more */
3080 /* convenient even for the pageout */
3081 /* cases here */
3082 vm_object_lock(upl->map_object);
3083 vm_pageclean_setup(dst_page,
3084 alias_page, upl->map_object,
3085 size - xfer_size);
3086 vm_object_unlock(upl->map_object);
3087
3088 alias_page->absent = FALSE;
3089 alias_page = NULL;
3090 }
3091
3092 if(cntrl_flags & UPL_CLEAN_IN_PLACE) {
3093 /* clean in place for read implies */
3094 /* that a write will be done on all */
3095 /* the pages that are dirty before */
3096 /* a upl commit is done. The caller */
3097 /* is obligated to preserve the */
3098 /* contents of all pages marked */
3099 /* dirty. */
3100 upl->flags |= UPL_CLEAR_DIRTY;
3101 }
3102
3103 if(!dirty) {
3104 dst_page->dirty = FALSE;
3105 dst_page->precious = TRUE;
3106 }
3107
3108 if (dst_page->wire_count == 0) {
3109 /* deny access to the target page while */
3110 /* it is being worked on */
3111 dst_page->busy = TRUE;
3112 } else {
3113 vm_page_wire(dst_page);
3114 }
3115 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
3116 /*
3117 * expect the page not to be used
3118 * since it's coming in as part
3119 * of a cluster and could be
3120 * speculative... pages that
3121 * are 'consumed' will get a
3122 * hardware reference
3123 */
3124 dst_page->reference = FALSE;
3125 } else {
3126 /*
3127 * expect the page to be used
3128 */
3129 dst_page->reference = TRUE;
3130 }
3131 dst_page->precious =
3132 (cntrl_flags & UPL_PRECIOUS)
3133 ? TRUE : FALSE;
3134 if(user_page_list) {
3135 user_page_list[entry].phys_addr
3136 = dst_page->phys_page;
3137 user_page_list[entry].dirty =
3138 dst_page->dirty;
3139 user_page_list[entry].pageout =
3140 dst_page->pageout;
3141 user_page_list[entry].absent =
3142 dst_page->absent;
3143 user_page_list[entry].precious =
3144 dst_page->precious;
3145 }
3146 vm_page_unlock_queues();
3147 }
3148 entry++;
3149 dst_offset += PAGE_SIZE_64;
3150 xfer_size -= PAGE_SIZE;
3151 }
3152 }
3153
3154 if (upl->flags & UPL_INTERNAL) {
3155 if(page_list_count != NULL)
3156 *page_list_count = 0;
3157 } else if (page_list_count != NULL &&
3158 *page_list_count > entry) {
3159 *page_list_count = entry;
3160 }
3161
3162 if(alias_page != NULL) {
3163 vm_page_lock_queues();
3164 vm_page_free(alias_page);
3165 vm_page_unlock_queues();
3166 }
3167
3168 if(do_m_lock) {
3169 vm_prot_t access_required;
3170 /* call back all associated pages from other users of the pager */
3171 /* all future updates will be on data which is based on the */
3172 /* changes we are going to make here. Note: it is assumed that */
3173 /* we already hold copies of the data so we will not be seeing */
3174 /* an avalanche of incoming data from the pager */
3175 access_required = (cntrl_flags & UPL_COPYOUT_FROM)
3176 ? VM_PROT_READ : VM_PROT_WRITE;
3177 while (TRUE) {
3178 kern_return_t rc;
3179
3180 if(!object->pager_ready) {
3181 wait_result_t wait_result;
3182
3183 wait_result = vm_object_sleep(object,
3184 VM_OBJECT_EVENT_PAGER_READY,
3185 THREAD_UNINT);
3186 if (wait_result != THREAD_AWAKENED) {
3187 vm_object_unlock(object);
3188 return KERN_FAILURE;
3189 }
3190 continue;
3191 }
3192
3193 vm_object_unlock(object);
3194 rc = memory_object_data_unlock(
3195 object->pager,
3196 dst_offset + object->paging_offset,
3197 size,
3198 access_required);
3199 if (rc != KERN_SUCCESS && rc != MACH_SEND_INTERRUPTED)
3200 return KERN_FAILURE;
3201 vm_object_lock(object);
3202
3203 if (rc == KERN_SUCCESS)
3204 break;
3205 }
3206
3207 /* let's wait on the last page requested */
3208 /* NOTE: we will have to update lock completed routine to signal */
3209 if(dst_page != VM_PAGE_NULL &&
3210 (access_required & dst_page->page_lock) != access_required) {
3211 PAGE_ASSERT_WAIT(dst_page, THREAD_UNINT);
3212 vm_object_unlock(object);
3213 thread_block(THREAD_CONTINUE_NULL);
3214 return KERN_SUCCESS;
3215 }
3216 }
3217
3218 vm_object_unlock(object);
3219 return KERN_SUCCESS;
3220 }
3221
3222 /* JMM - Backward compatibility for now */
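/*
 * vm_fault_list_request:
 *
 * shim that maps the older page-list-pointer interface onto
 * memory_object_upl_request... for internal UPLs the caller's page
 * list pointer is redirected to the upl's inline array on return
 */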
3223 kern_return_t
3224 vm_fault_list_request( /* forward */
3225 memory_object_control_t control,
3226 vm_object_offset_t offset,
3227 upl_size_t size,
3228 upl_t *upl_ptr,
3229 upl_page_info_t **user_page_list_ptr,
3230 int page_list_count,
3231 int cntrl_flags);
3232 kern_return_t
3233 vm_fault_list_request(
3234 memory_object_control_t control,
3235 vm_object_offset_t offset,
3236 upl_size_t size,
3237 upl_t *upl_ptr,
3238 upl_page_info_t **user_page_list_ptr,
3239 int page_list_count,
3240 int cntrl_flags)
3241 {
3242 int local_list_count;
3243 upl_page_info_t *user_page_list;
3244 kern_return_t kr;
3245
3246 if (user_page_list_ptr != NULL) {
3247 local_list_count = page_list_count;
3248 user_page_list = *user_page_list_ptr;
3249 } else {
3250 local_list_count = 0;
3251 user_page_list = NULL;
3252 }
3253 kr = memory_object_upl_request(control,
3254 offset,
3255 size,
3256 upl_ptr,
3257 user_page_list,
3258 &local_list_count,
3259 cntrl_flags);
3260
3261 if(kr != KERN_SUCCESS)
3262 return kr;
3263
3264 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3265 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3266 }
3267
3268 return KERN_SUCCESS;
3269 }
3270
3271
3272
3273 /*
3274 * Routine: vm_object_super_upl_request
3275 * Purpose:
3276 * Cause the population of a portion of a vm_object
3277 * in much the same way as memory_object_upl_request.
3278 * Depending on the nature of the request, the pages
3279 * returned may contain valid data or be uninitialized.
3280 * However, the region may be expanded up to the super
3281 * cluster size provided.
3282 */
3283
3284 __private_extern__ kern_return_t
3285 vm_object_super_upl_request(
3286 vm_object_t object,
3287 vm_object_offset_t offset,
3288 upl_size_t size,
3289 upl_size_t super_cluster,
3290 upl_t *upl,
3291 upl_page_info_t *user_page_list,
3292 unsigned int *page_list_count,
3293 int cntrl_flags)
3294 {
3295 vm_page_t target_page;
3296 int ticket;
3297
3298
3299 if(object->paging_offset > offset)
3300 return KERN_FAILURE;
3301
3302 assert(object->paging_in_progress);
3303 offset = offset - object->paging_offset;
3304
3305 if(cntrl_flags & UPL_FOR_PAGEOUT) {
3306
3307 vm_object_lock(object);
3308
3309 if((target_page = vm_page_lookup(object, offset))
3310 != VM_PAGE_NULL) {
3311 ticket = target_page->page_ticket;
3312 cntrl_flags = cntrl_flags & ~(int)UPL_PAGE_TICKET_MASK;
3313 cntrl_flags = cntrl_flags |
3314 ((ticket << UPL_PAGE_TICKET_SHIFT)
3315 & UPL_PAGE_TICKET_MASK);
3316 }
3317 vm_object_unlock(object);
3318 }
3319
3320 if (super_cluster > size) {
3321
3322 vm_object_offset_t base_offset;
3323 upl_size_t super_size;
3324
3325 base_offset = (offset &
3326 ~((vm_object_offset_t) super_cluster - 1));
3327 super_size = (offset+size) > (base_offset + super_cluster) ?
3328 super_cluster<<1 : super_cluster;
3329 super_size = ((base_offset + super_size) > object->size) ?
3330 (object->size - base_offset) : super_size;
3331 if(offset > (base_offset + super_size))
3332 panic("vm_object_super_upl_request: Missed target pageout"
3333 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3334 offset, base_offset, super_size, super_cluster,
3335 size, object->paging_offset);
3336 /*
3337 * apparently there is a case where the vm requests a
3338 * page to be written out whose offset is beyond the
3339 * object size
3340 */
3341 if((offset + size) > (base_offset + super_size))
3342 super_size = (offset + size) - base_offset;
3343
3344 offset = base_offset;
3345 size = super_size;
3346 }
3347 return vm_object_upl_request(object, offset, size,
3348 upl, user_page_list, page_list_count,
3349 cntrl_flags);
3350 }
3351
3352
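/*
 * vm_map_create_upl:
 *
 * build a UPL against the object backing a range of 'map'... look up
 * the map entry for 'offset', clip *upl_size to the entry and to
 * MAX_UPL_TRANSFER, satisfy copy-on-write and data-sync obligations,
 * and then hand off to vm_object_upl_request (or
 * vm_object_iopl_request when UPL_SET_IO_WIRE is set)
 */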
3353 kern_return_t
3354 vm_map_create_upl(
3355 vm_map_t map,
3356 vm_map_address_t offset,
3357 upl_size_t *upl_size,
3358 upl_t *upl,
3359 upl_page_info_array_t page_list,
3360 unsigned int *count,
3361 int *flags)
3362 {
3363 vm_map_entry_t entry;
3364 int caller_flags;
3365 int force_data_sync;
3366 int sync_cow_data;
3367 vm_object_t local_object;
3368 vm_map_offset_t local_offset;
3369 vm_map_offset_t local_start;
3370 kern_return_t ret;
3371
3372 caller_flags = *flags;
3373
3374 if (caller_flags & ~UPL_VALID_FLAGS) {
3375 /*
3376 * For forward compatibility's sake,
3377 * reject any unknown flag.
3378 */
3379 return KERN_INVALID_VALUE;
3380 }
3381
3382 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3383 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3384
3385 if(upl == NULL)
3386 return KERN_INVALID_ARGUMENT;
3387
3388
3389 REDISCOVER_ENTRY:
3390 vm_map_lock(map);
3391 if (vm_map_lookup_entry(map, offset, &entry)) {
3392 if (entry->object.vm_object == VM_OBJECT_NULL ||
3393 !entry->object.vm_object->phys_contiguous) {
3394 if((*upl_size/page_size) > MAX_UPL_TRANSFER) {
3395 *upl_size = MAX_UPL_TRANSFER * page_size;
3396 }
3397 }
3398 if((entry->vme_end - offset) < *upl_size) {
3399 *upl_size = entry->vme_end - offset;
3400 }
3401 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3402 if (entry->object.vm_object == VM_OBJECT_NULL) {
3403 *flags = 0;
3404 } else if (entry->object.vm_object->private) {
3405 *flags = UPL_DEV_MEMORY;
3406 if (entry->object.vm_object->phys_contiguous) {
3407 *flags |= UPL_PHYS_CONTIG;
3408 }
3409 } else {
3410 *flags = 0;
3411 }
3412 vm_map_unlock(map);
3413 return KERN_SUCCESS;
3414 }
3415 /*
3416 * Create an object if necessary.
3417 */
3418 if (entry->object.vm_object == VM_OBJECT_NULL) {
3419 entry->object.vm_object = vm_object_allocate(
3420 (vm_size_t)(entry->vme_end - entry->vme_start));
3421 entry->offset = 0;
3422 }
3423 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3424 if (!(entry->protection & VM_PROT_WRITE)) {
3425 vm_map_unlock(map);
3426 return KERN_PROTECTION_FAILURE;
3427 }
3428 if (entry->needs_copy) {
3429 vm_map_t local_map;
3430 vm_object_t object;
3431 vm_map_offset_t offset_hi;
3432 vm_map_offset_t offset_lo;
3433 vm_object_offset_t new_offset;
3434 vm_prot_t prot;
3435 boolean_t wired;
3436 vm_behavior_t behavior;
3437 vm_map_version_t version;
3438 vm_map_t real_map;
3439
3440 local_map = map;
3441 vm_map_lock_write_to_read(map);
3442 if(vm_map_lookup_locked(&local_map,
3443 offset, VM_PROT_WRITE,
3444 &version, &object,
3445 &new_offset, &prot, &wired,
3446 &behavior, &offset_lo,
3447 &offset_hi, &real_map)) {
3448 vm_map_unlock(local_map);
3449 return KERN_FAILURE;
3450 }
3451 if (real_map != map) {
3452 vm_map_unlock(real_map);
3453 }
3454 vm_object_unlock(object);
3455 vm_map_unlock(local_map);
3456
3457 goto REDISCOVER_ENTRY;
3458 }
3459 }
3460 if (entry->is_sub_map) {
3461 vm_map_t submap;
3462
3463 submap = entry->object.sub_map;
3464 local_start = entry->vme_start;
3465 local_offset = entry->offset;
3466 vm_map_reference(submap);
3467 vm_map_unlock(map);
3468
3469 ret = (vm_map_create_upl(submap,
3470 local_offset + (offset - local_start),
3471 upl_size, upl, page_list, count,
3472 flags));
3473
3474 vm_map_deallocate(submap);
3475 return ret;
3476 }
3477
3478 if (sync_cow_data) {
3479 if (entry->object.vm_object->shadow
3480 || entry->object.vm_object->copy) {
3481
3482 local_object = entry->object.vm_object;
3483 local_start = entry->vme_start;
3484 local_offset = entry->offset;
3485 vm_object_reference(local_object);
3486 vm_map_unlock(map);
3487
3488 if (entry->object.vm_object->shadow &&
3489 entry->object.vm_object->copy) {
3490 vm_object_lock_request(
3491 local_object->shadow,
3492 (vm_object_offset_t)
3493 ((offset - local_start) +
3494 local_offset) +
3495 local_object->shadow_offset,
3496 *upl_size, FALSE,
3497 MEMORY_OBJECT_DATA_SYNC,
3498 VM_PROT_NO_CHANGE);
3499 }
3500 sync_cow_data = FALSE;
3501 vm_object_deallocate(local_object);
3502 goto REDISCOVER_ENTRY;
3503 }
3504 }
3505
3506 if (force_data_sync) {
3507
3508 local_object = entry->object.vm_object;
3509 local_start = entry->vme_start;
3510 local_offset = entry->offset;
3511 vm_object_reference(local_object);
3512 vm_map_unlock(map);
3513
3514 vm_object_lock_request(
3515 local_object,
3516 (vm_object_offset_t)
3517 ((offset - local_start) + local_offset),
3518 (vm_object_size_t)*upl_size, FALSE,
3519 MEMORY_OBJECT_DATA_SYNC,
3520 VM_PROT_NO_CHANGE);
3521 force_data_sync = FALSE;
3522 vm_object_deallocate(local_object);
3523 goto REDISCOVER_ENTRY;
3524 }
3525
3526 if(!(entry->object.vm_object->private)) {
3527 if(*upl_size > (MAX_UPL_TRANSFER*PAGE_SIZE))
3528 *upl_size = (MAX_UPL_TRANSFER*PAGE_SIZE);
3529 if(entry->object.vm_object->phys_contiguous) {
3530 *flags = UPL_PHYS_CONTIG;
3531 } else {
3532 *flags = 0;
3533 }
3534 } else {
3535 *flags = UPL_DEV_MEMORY | UPL_PHYS_CONTIG;
3536 }
3537 local_object = entry->object.vm_object;
3538 local_offset = entry->offset;
3539 local_start = entry->vme_start;
3540 vm_object_reference(local_object);
3541 vm_map_unlock(map);
3542 if(caller_flags & UPL_SET_IO_WIRE) {
3543 ret = (vm_object_iopl_request(local_object,
3544 (vm_object_offset_t)
3545 ((offset - local_start)
3546 + local_offset),
3547 *upl_size,
3548 upl,
3549 page_list,
3550 count,
3551 caller_flags));
3552 } else {
3553 ret = (vm_object_upl_request(local_object,
3554 (vm_object_offset_t)
3555 ((offset - local_start)
3556 + local_offset),
3557 *upl_size,
3558 upl,
3559 page_list,
3560 count,
3561 caller_flags));
3562 }
3563 vm_object_deallocate(local_object);
3564 return(ret);
3565 }
3566
3567 vm_map_unlock(map);
3568 return(KERN_FAILURE);
3569
3570 }
3571
3572 /*
3573 * Internal routine to enter a UPL into a VM map.
3574 *
3575 * JMM - This should just be doable through the standard
3576 * vm_map_enter() API.
3577 */
3578 kern_return_t
3579 vm_map_enter_upl(
3580 vm_map_t map,
3581 upl_t upl,
3582 vm_map_offset_t *dst_addr)
3583 {
3584 vm_map_size_t size;
3585 vm_object_offset_t offset;
3586 vm_map_offset_t addr;
3587 vm_page_t m;
3588 kern_return_t kr;
3589
3590 if (upl == UPL_NULL)
3591 return KERN_INVALID_ARGUMENT;
3592
3593 upl_lock(upl);
3594
3595 /* check to see if already mapped */
3596 if(UPL_PAGE_LIST_MAPPED & upl->flags) {
3597 upl_unlock(upl);
3598 return KERN_FAILURE;
3599 }
3600
3601 if((!(upl->map_object->pageout)) &&
3602 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3603 (upl->map_object->phys_contiguous))) {
3604 vm_object_t object;
3605 vm_page_t alias_page;
3606 vm_object_offset_t new_offset;
3607 int pg_num;
3608 wpl_array_t lite_list;
3609
3610 if(upl->flags & UPL_INTERNAL) {
3611 lite_list = (wpl_array_t)
3612 ((((uintptr_t)upl) + sizeof(struct upl))
3613 + ((upl->size/PAGE_SIZE)
3614 * sizeof(upl_page_info_t)));
3615 } else {
3616 lite_list = (wpl_array_t)
3617 (((uintptr_t)upl) + sizeof(struct upl));
3618 }
3619 object = upl->map_object;
3620 upl->map_object = vm_object_allocate(upl->size);
3621 vm_object_lock(upl->map_object);
3622 upl->map_object->shadow = object;
3623 upl->map_object->pageout = TRUE;
3624 upl->map_object->can_persist = FALSE;
3625 upl->map_object->copy_strategy =
3626 MEMORY_OBJECT_COPY_NONE;
3627 upl->map_object->shadow_offset =
3628 upl->offset - object->paging_offset;
3629 upl->map_object->wimg_bits = object->wimg_bits;
3630 offset = upl->map_object->shadow_offset;
3631 new_offset = 0;
3632 size = upl->size;
3633
3634 vm_object_lock(object);
3635
3636 while(size) {
3637 pg_num = (new_offset)/PAGE_SIZE;
3638 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3639 vm_object_unlock(object);
3640 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3641 vm_object_lock(object);
3642 m = vm_page_lookup(object, offset);
3643 if (m == VM_PAGE_NULL) {
3644 panic("vm_upl_map: page missing\n");
3645 }
3646
3647 vm_object_paging_begin(object);
3648
3649 /*
3650 * Convert the fictitious page to a private
3651 * shadow of the real page.
3652 */
3653 assert(alias_page->fictitious);
3654 alias_page->fictitious = FALSE;
3655 alias_page->private = TRUE;
3656 alias_page->pageout = TRUE;
3657 alias_page->phys_page = m->phys_page;
3658
3659 vm_page_lock_queues();
3660 vm_page_wire(alias_page);
3661 vm_page_unlock_queues();
3662
3663 /*
3664 * ENCRYPTED SWAP:
3665 * The virtual page ("m") has to be wired in some way
3666 * here or its physical page ("m->phys_page") could
3667 * be recycled at any time.
3668 * Assuming this is enforced by the caller, we can't
3669 * get an encrypted page here. Since the encryption
3670 * key depends on the VM page's "pager" object and
3671 * the "paging_offset", we couldn't handle 2 pageable
3672 * VM pages (with different pagers and paging_offsets)
3673 * sharing the same physical page: we could end up
3674 * encrypting with one key (via one VM page) and
3675 * decrypting with another key (via the alias VM page).
3676 */
3677 ASSERT_PAGE_DECRYPTED(m);
3678
3679 vm_page_insert(alias_page,
3680 upl->map_object, new_offset);
3681 assert(!alias_page->wanted);
3682 alias_page->busy = FALSE;
3683 alias_page->absent = FALSE;
3684 }
3685
3686 size -= PAGE_SIZE;
3687 offset += PAGE_SIZE_64;
3688 new_offset += PAGE_SIZE_64;
3689 }
3690 vm_object_unlock(object);
3691 vm_object_unlock(upl->map_object);
3692 }
3693 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3694 offset = upl->offset - upl->map_object->paging_offset;
3695 else
3696 offset = 0;
3697
3698 size = upl->size;
3699
3700 vm_object_lock(upl->map_object);
3701 upl->map_object->ref_count++;
3702 vm_object_res_reference(upl->map_object);
3703 vm_object_unlock(upl->map_object);
3704
3705 *dst_addr = 0;
3706
3707
3708 /* NEED A UPL_MAP ALIAS */
3709 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3710 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3711 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3712
3713 if (kr != KERN_SUCCESS) {
3714 upl_unlock(upl);
3715 return(kr);
3716 }
3717
3718 vm_object_lock(upl->map_object);
3719
3720 for(addr=*dst_addr; size > 0; size-=PAGE_SIZE,addr+=PAGE_SIZE) {
3721 m = vm_page_lookup(upl->map_object, offset);
3722 if(m) {
3723 unsigned int cache_attr;
3724 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3725
3726 PMAP_ENTER(map->pmap, addr,
3727 m, VM_PROT_ALL,
3728 cache_attr, TRUE);
3729 }
3730 offset+=PAGE_SIZE_64;
3731 }
3732 vm_object_unlock(upl->map_object);
3733
3734 upl->ref_count++; /* hold a reference for the mapping */
3735 upl->flags |= UPL_PAGE_LIST_MAPPED;
3736 upl->kaddr = *dst_addr;
3737 upl_unlock(upl);
3738 return KERN_SUCCESS;
3739 }
3740
3741 /*
3742 * Internal routine to remove a UPL mapping from a VM map.
3743 *
3744 * XXX - This should just be doable through a standard
3745 * vm_map_remove() operation. Otherwise, implicit clean-up
3746 * of the target map won't be able to correctly remove
3747 * these (and release the reference on the UPL). Having
3748 * to do this means we can't map these into user-space
3749 * maps yet.
3750 */
3751 kern_return_t
3752 vm_map_remove_upl(
3753 vm_map_t map,
3754 upl_t upl)
3755 {
3756 vm_address_t addr;
3757 upl_size_t size;
3758
3759 if (upl == UPL_NULL)
3760 return KERN_INVALID_ARGUMENT;
3761
3762 upl_lock(upl);
3763 if(upl->flags & UPL_PAGE_LIST_MAPPED) {
3764 addr = upl->kaddr;
3765 size = upl->size;
3766 assert(upl->ref_count > 1);
3767 upl->ref_count--; /* removing mapping ref */
3768 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3769 upl->kaddr = (vm_offset_t) 0;
3770 upl_unlock(upl);
3771
3772 vm_map_remove( map,
3773 vm_map_trunc_page(addr),
3774 vm_map_round_page(addr + size),
3775 VM_MAP_NO_FLAGS);
3776 return KERN_SUCCESS;
3777 }
3778 upl_unlock(upl);
3779 return KERN_FAILURE;
3780 }
3781
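/*
 * upl_commit_range:
 *
 * commit the pages covered by [offset, offset+size) in the upl back
 * to their object... clear the busy/cleaning state set up when the
 * upl was created, apply the UPL_COMMIT_* flags to the dirty and
 * reference state, and free or reactivate pages that were targets
 * of a pageout
 */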
3782 kern_return_t
3783 upl_commit_range(
3784 upl_t upl,
3785 upl_offset_t offset,
3786 upl_size_t size,
3787 int flags,
3788 upl_page_info_t *page_list,
3789 mach_msg_type_number_t count,
3790 boolean_t *empty)
3791 {
3792 upl_size_t xfer_size = size;
3793 vm_object_t shadow_object;
3794 vm_object_t object = upl->map_object;
3795 vm_object_offset_t target_offset;
3796 int entry;
3797 wpl_array_t lite_list;
3798 int occupied;
3799 int delayed_unlock = 0;
3800 int clear_refmod = 0;
3801 boolean_t shadow_internal;
3802
3803 *empty = FALSE;
3804
3805 if (upl == UPL_NULL)
3806 return KERN_INVALID_ARGUMENT;
3807
3808
3809 if (count == 0)
3810 page_list = NULL;
3811
3812 if (object->pageout) {
3813 shadow_object = object->shadow;
3814 } else {
3815 shadow_object = object;
3816 }
3817
3818 upl_lock(upl);
3819
3820 if (upl->flags & UPL_ACCESS_BLOCKED) {
3821 /*
3822 * We used this UPL to block access to the pages by marking
3823 * them "busy". Now we need to clear the "busy" bit to allow
3824 * access to these pages again.
3825 */
3826 flags |= UPL_COMMIT_ALLOW_ACCESS;
3827 }
3828
3829 if (upl->flags & UPL_CLEAR_DIRTY)
3830 flags |= UPL_COMMIT_CLEAR_DIRTY;
3831
3832 if (upl->flags & UPL_DEVICE_MEMORY) {
3833 xfer_size = 0;
3834 } else if ((offset + size) > upl->size) {
3835 upl_unlock(upl);
3836 return KERN_FAILURE;
3837 }
3838
3839 if (upl->flags & UPL_INTERNAL) {
3840 lite_list = (wpl_array_t)
3841 ((((uintptr_t)upl) + sizeof(struct upl))
3842 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3843 } else {
3844 lite_list = (wpl_array_t)
3845 (((uintptr_t)upl) + sizeof(struct upl));
3846 }
3847 if (object != shadow_object)
3848 vm_object_lock(object);
3849 vm_object_lock(shadow_object);
3850
3851 shadow_internal = shadow_object->internal;
3852
3853 entry = offset/PAGE_SIZE;
3854 target_offset = (vm_object_offset_t)offset;
3855
3856 while (xfer_size) {
3857 vm_page_t t,m;
3858 upl_page_info_t *p;
3859
3860 m = VM_PAGE_NULL;
3861
3862 if (upl->flags & UPL_LITE) {
3863 int pg_num;
3864
3865 pg_num = target_offset/PAGE_SIZE;
3866
3867 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3868 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3869 m = vm_page_lookup(shadow_object,
3870 target_offset + (upl->offset -
3871 shadow_object->paging_offset));
3872 }
3873 }
3874 if (object->pageout) {
3875 if ((t = vm_page_lookup(object, target_offset)) != NULL) {
3876 t->pageout = FALSE;
3877
3878 if (delayed_unlock) {
3879 delayed_unlock = 0;
3880 vm_page_unlock_queues();
3881 }
3882 VM_PAGE_FREE(t);
3883
3884 if (m == NULL) {
3885 m = vm_page_lookup(
3886 shadow_object,
3887 target_offset +
3888 object->shadow_offset);
3889 }
3890 if (m != VM_PAGE_NULL)
3891 vm_object_paging_end(m->object);
3892 }
3893 }
3894 if (m != VM_PAGE_NULL) {
3895
3896 clear_refmod = 0;
3897
3898 if (upl->flags & UPL_IO_WIRE) {
3899
3900 if (delayed_unlock == 0)
3901 vm_page_lock_queues();
3902
3903 vm_page_unwire(m);
3904
3905 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
3906 delayed_unlock = 0;
3907 vm_page_unlock_queues();
3908 }
3909 if (page_list) {
3910 page_list[entry].phys_addr = 0;
3911 }
3912 if (flags & UPL_COMMIT_SET_DIRTY) {
3913 m->dirty = TRUE;
3914 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3915 m->dirty = FALSE;
3916 clear_refmod |= VM_MEM_MODIFIED;
3917 }
3918 if (flags & UPL_COMMIT_INACTIVATE) {
3919 m->reference = FALSE;
3920 clear_refmod |= VM_MEM_REFERENCED;
3921 vm_page_deactivate(m);
3922 }
3923 if (clear_refmod)
3924 pmap_clear_refmod(m->phys_page, clear_refmod);
3925
3926 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3927 /*
3928 * We blocked access to the pages in this UPL.
3929 * Clear the "busy" bit and wake up any waiter
3930 * for this page.
3931 */
3932 PAGE_WAKEUP_DONE(m);
3933 }
3934
3935 target_offset += PAGE_SIZE_64;
3936 xfer_size -= PAGE_SIZE;
3937 entry++;
3938 continue;
3939 }
3940 if (delayed_unlock == 0)
3941 vm_page_lock_queues();
3942 /*
3943 * make sure to clear the hardware
3944 * modify or reference bits before
3945 * releasing the BUSY bit on this page
3946 * otherwise we risk losing a legitimate
3947 * change of state
3948 */
3949 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3950 m->dirty = FALSE;
3951 clear_refmod |= VM_MEM_MODIFIED;
3952 }
3953 if (flags & UPL_COMMIT_INACTIVATE)
3954 clear_refmod |= VM_MEM_REFERENCED;
3955
3956 if (clear_refmod)
3957 pmap_clear_refmod(m->phys_page, clear_refmod);
3958
3959 if (page_list) {
3960 p = &(page_list[entry]);
3961 if(p->phys_addr && p->pageout && !m->pageout) {
3962 m->busy = TRUE;
3963 m->pageout = TRUE;
3964 vm_page_wire(m);
3965 } else if (page_list[entry].phys_addr &&
3966 !p->pageout && m->pageout &&
3967 !m->dump_cleaning) {
3968 m->pageout = FALSE;
3969 m->absent = FALSE;
3970 m->overwriting = FALSE;
3971 vm_page_unwire(m);
3972 PAGE_WAKEUP_DONE(m);
3973 }
3974 page_list[entry].phys_addr = 0;
3975 }
3976 m->dump_cleaning = FALSE;
3977 if(m->laundry) {
3978 vm_pageout_throttle_up(m);
3979 }
3980 if(m->pageout) {
3981 m->cleaning = FALSE;
3982 m->pageout = FALSE;
3983 #if MACH_CLUSTER_STATS
3984 if (m->wanted) vm_pageout_target_collisions++;
3985 #endif
3986 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
3987 m->dirty = TRUE;
3988 else
3989 m->dirty = FALSE;
3990
3991 if(m->dirty) {
3992 vm_page_unwire(m);/* reactivates */
3993
3994 if (upl->flags & UPL_PAGEOUT) {
3995 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
3996 VM_STAT(reactivations++);
3997 }
3998 PAGE_WAKEUP_DONE(m);
3999 } else {
4000 vm_page_free(m);/* clears busy, etc. */
4001
4002 if (upl->flags & UPL_PAGEOUT) {
4003 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4004
4005 if (page_list[entry].dirty)
4006 VM_STAT(pageouts++);
4007 }
4008 }
4009 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4010 delayed_unlock = 0;
4011 vm_page_unlock_queues();
4012 }
4013 target_offset += PAGE_SIZE_64;
4014 xfer_size -= PAGE_SIZE;
4015 entry++;
4016 continue;
4017 }
4018 #if MACH_CLUSTER_STATS
4019 m->dirty = pmap_is_modified(m->phys_page);
4020
4021 if (m->dirty) vm_pageout_cluster_dirtied++;
4022 else vm_pageout_cluster_cleaned++;
4023 if (m->wanted) vm_pageout_cluster_collisions++;
4024 #else
4025 m->dirty = 0;
4026 #endif
4027
4028 if((m->busy) && (m->cleaning)) {
4029 /* the request_page_list case */
4030 if(m->absent) {
4031 m->absent = FALSE;
4032 if(shadow_object->absent_count == 1)
4033 vm_object_absent_release(shadow_object);
4034 else
4035 shadow_object->absent_count--;
4036 }
4037 m->overwriting = FALSE;
4038 m->busy = FALSE;
4039 m->dirty = FALSE;
4040 } else if (m->overwriting) {
4041 /* alternate request page list, write to
4042 * page_list case. Occurs when the original
4043 * page was wired at the time of the list
4044 * request */
4045 assert(m->wire_count != 0);
4046 vm_page_unwire(m);/* reactivates */
4047 m->overwriting = FALSE;
4048 }
4049 m->cleaning = FALSE;
4050
4051 /* It is part of the semantics of COPYOUT_FROM */
4052 /* UPLs that a commit implies a cache sync */
4053 /* between the vm page and the backing store; */
4054 /* this can be used to strip the precious bit */
4055 /* as well as clean */
4056 if (upl->flags & UPL_PAGE_SYNC_DONE)
4057 m->precious = FALSE;
4058
4059 if (flags & UPL_COMMIT_SET_DIRTY)
4060 m->dirty = TRUE;
4061
4062 if (flags & UPL_COMMIT_INACTIVATE) {
4063 m->reference = FALSE;
4064 vm_page_deactivate(m);
4065 } else if (!m->active && !m->inactive) {
4066 if (m->reference)
4067 vm_page_activate(m);
4068 else
4069 vm_page_deactivate(m);
4070 }
4071
4072 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4073 /*
4074 * We blocked access to the pages in this UPL.
4075 * Clear the "busy" bit on this page before we
4076 * wake up any waiter.
4077 */
4078 m->busy = FALSE;
4079 }
4080
4081 /*
4082 * Wake up any thread waiting for the page's "cleaning" state to be cleared.
4083 */
4084 PAGE_WAKEUP(m);
4085
4086 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4087 delayed_unlock = 0;
4088 vm_page_unlock_queues();
4089 }
4090 }
4091 target_offset += PAGE_SIZE_64;
4092 xfer_size -= PAGE_SIZE;
4093 entry++;
4094 }
4095 if (delayed_unlock)
4096 vm_page_unlock_queues();
4097
4098 occupied = 1;
4099
4100 if (upl->flags & UPL_DEVICE_MEMORY) {
4101 occupied = 0;
4102 } else if (upl->flags & UPL_LITE) {
4103 int pg_num;
4104 int i;
4105 pg_num = upl->size/PAGE_SIZE;
4106 pg_num = (pg_num + 31) >> 5;
4107 occupied = 0;
4108 for(i= 0; i<pg_num; i++) {
4109 if(lite_list[i] != 0) {
4110 occupied = 1;
4111 break;
4112 }
4113 }
4114 } else {
4115 if(queue_empty(&upl->map_object->memq)) {
4116 occupied = 0;
4117 }
4118 }
4119
4120 if(occupied == 0) {
4121 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4122 *empty = TRUE;
4123 }
4124 if(object == shadow_object)
4125 vm_object_paging_end(shadow_object);
4126 }
4127 vm_object_unlock(shadow_object);
4128 if (object != shadow_object)
4129 vm_object_unlock(object);
4130 upl_unlock(upl);
4131
4132 return KERN_SUCCESS;
4133 }
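/*
 * The "delayed_unlock" counter used above batches work under the page-queues
 * lock: the lock is taken lazily on the first page that needs it and only
 * dropped once DELAYED_UNLOCK_LIMIT pages have been processed, instead of
 * being cycled for every page.  A minimal sketch of the pattern (illustrative
 * only, not part of the build; the helper name is hypothetical):
 */
#if 0
static void
touch_pages_batched(vm_page_t *pages, int count)
{
	int	delayed_unlock = 0;
	int	i;

	for (i = 0; i < count; i++) {
		if (delayed_unlock == 0)
			vm_page_lock_queues();

		/* ... work on pages[i] that needs the page queues lock ... */

		if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
			delayed_unlock = 0;
			vm_page_unlock_queues();
		}
	}
	if (delayed_unlock)
		vm_page_unlock_queues();
}
#endif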
4134
4135 kern_return_t
4136 upl_abort_range(
4137 upl_t upl,
4138 upl_offset_t offset,
4139 upl_size_t size,
4140 int error,
4141 boolean_t *empty)
4142 {
4143 upl_size_t xfer_size = size;
4144 vm_object_t shadow_object;
4145 vm_object_t object = upl->map_object;
4146 vm_object_offset_t target_offset;
4147 int entry;
4148 wpl_array_t lite_list;
4149 int occupied;
4150 boolean_t shadow_internal;
4151
4152 *empty = FALSE;
4153
4154 if (upl == UPL_NULL)
4155 return KERN_INVALID_ARGUMENT;
4156
4157 if (upl->flags & UPL_IO_WIRE) {
4158 return upl_commit_range(upl,
4159 offset, size, 0,
4160 NULL, 0, empty);
4161 }
4162
4163 if(object->pageout) {
4164 shadow_object = object->shadow;
4165 } else {
4166 shadow_object = object;
4167 }
4168
4169 upl_lock(upl);
4170 if(upl->flags & UPL_DEVICE_MEMORY) {
4171 xfer_size = 0;
4172 } else if ((offset + size) > upl->size) {
4173 upl_unlock(upl);
4174 return KERN_FAILURE;
4175 }
4176 if (object != shadow_object)
4177 vm_object_lock(object);
4178 vm_object_lock(shadow_object);
4179
4180 shadow_internal = shadow_object->internal;
4181
4182 if(upl->flags & UPL_INTERNAL) {
4183 lite_list = (wpl_array_t)
4184 ((((uintptr_t)upl) + sizeof(struct upl))
4185 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4186 } else {
4187 lite_list = (wpl_array_t)
4188 (((uintptr_t)upl) + sizeof(struct upl));
4189 }
4190
4191 entry = offset/PAGE_SIZE;
4192 target_offset = (vm_object_offset_t)offset;
4193 while(xfer_size) {
4194 vm_page_t t,m;
4195
4196 m = VM_PAGE_NULL;
4197 if(upl->flags & UPL_LITE) {
4198 int pg_num;
4199 pg_num = target_offset/PAGE_SIZE;
4200 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4201 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4202 m = vm_page_lookup(shadow_object,
4203 target_offset + (upl->offset -
4204 shadow_object->paging_offset));
4205 }
4206 }
4207 if(object->pageout) {
4208 if ((t = vm_page_lookup(object, target_offset))
4209 != NULL) {
4210 t->pageout = FALSE;
4211 VM_PAGE_FREE(t);
4212 if(m == NULL) {
4213 m = vm_page_lookup(
4214 shadow_object,
4215 target_offset +
4216 object->shadow_offset);
4217 }
4218 if(m != VM_PAGE_NULL)
4219 vm_object_paging_end(m->object);
4220 }
4221 }
4222 if(m != VM_PAGE_NULL) {
4223 vm_page_lock_queues();
4224 if(m->absent) {
4225 boolean_t must_free = TRUE;
4226
4227 /* COPYOUT = FALSE case */
4228 /* check for error conditions which must */
4229 /* be passed back to the page's customer */
4230 if(error & UPL_ABORT_RESTART) {
4231 m->restart = TRUE;
4232 m->absent = FALSE;
4233 vm_object_absent_release(m->object);
4234 m->page_error = KERN_MEMORY_ERROR;
4235 m->error = TRUE;
4236 must_free = FALSE;
4237 } else if(error & UPL_ABORT_UNAVAILABLE) {
4238 m->restart = FALSE;
4239 m->unusual = TRUE;
4240 must_free = FALSE;
4241 } else if(error & UPL_ABORT_ERROR) {
4242 m->restart = FALSE;
4243 m->absent = FALSE;
4244 vm_object_absent_release(m->object);
4245 m->page_error = KERN_MEMORY_ERROR;
4246 m->error = TRUE;
4247 must_free = FALSE;
4248 }
4249
4250 /*
4251 * ENCRYPTED SWAP:
4252 * If the page was already encrypted,
4253 * we don't really need to decrypt it
4254 * now. It will get decrypted later,
4255 * on demand, as soon as someone needs
4256 * to access its contents.
4257 */
4258
4259 m->cleaning = FALSE;
4260 m->overwriting = FALSE;
4261 PAGE_WAKEUP_DONE(m);
4262
4263 if (must_free == TRUE) {
4264 vm_page_free(m);
4265 } else {
4266 vm_page_activate(m);
4267 }
4268 vm_page_unlock_queues();
4269
4270 target_offset += PAGE_SIZE_64;
4271 xfer_size -= PAGE_SIZE;
4272 entry++;
4273 continue;
4274 }
4275 /*
4276 * Handle the trusted pager throttle.
4277 */
4278 if (m->laundry) {
4279 vm_pageout_throttle_up(m);
4280 }
4281 if(m->pageout) {
4282 assert(m->busy);
4283 assert(m->wire_count == 1);
4284 m->pageout = FALSE;
4285 vm_page_unwire(m);
4286 }
4287 m->dump_cleaning = FALSE;
4288 m->cleaning = FALSE;
4289 m->overwriting = FALSE;
4290 #if MACH_PAGEMAP
4291 vm_external_state_clr(
4292 m->object->existence_map, m->offset);
4293 #endif /* MACH_PAGEMAP */
4294 if(error & UPL_ABORT_DUMP_PAGES) {
4295 vm_page_free(m);
4296 pmap_disconnect(m->phys_page);
4297 } else {
4298 PAGE_WAKEUP_DONE(m);
4299 }
4300 vm_page_unlock_queues();
4301 }
4302 target_offset += PAGE_SIZE_64;
4303 xfer_size -= PAGE_SIZE;
4304 entry++;
4305 }
4306 occupied = 1;
4307 if (upl->flags & UPL_DEVICE_MEMORY) {
4308 occupied = 0;
4309 } else if (upl->flags & UPL_LITE) {
4310 int pg_num;
4311 int i;
4312 pg_num = upl->size/PAGE_SIZE;
4313 pg_num = (pg_num + 31) >> 5;
4314 occupied = 0;
4315 for(i= 0; i<pg_num; i++) {
4316 if(lite_list[i] != 0) {
4317 occupied = 1;
4318 break;
4319 }
4320 }
4321 } else {
4322 if(queue_empty(&upl->map_object->memq)) {
4323 occupied = 0;
4324 }
4325 }
4326
4327 if(occupied == 0) {
4328 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4329 *empty = TRUE;
4330 }
4331 if(object == shadow_object)
4332 vm_object_paging_end(shadow_object);
4333 }
4334 vm_object_unlock(shadow_object);
4335 if (object != shadow_object)
4336 vm_object_unlock(object);
4337
4338 upl_unlock(upl);
4339
4340 return KERN_SUCCESS;
4341 }
4342
4343 kern_return_t
4344 upl_abort(
4345 upl_t upl,
4346 int error)
4347 {
4348 vm_object_t object = NULL;
4349 vm_object_t shadow_object = NULL;
4350 vm_object_offset_t offset;
4351 vm_object_offset_t shadow_offset;
4352 vm_object_offset_t target_offset;
4353 upl_size_t i;
4354 wpl_array_t lite_list;
4355 vm_page_t t,m;
4356 int occupied;
4357 boolean_t shadow_internal;
4358
4359 if (upl == UPL_NULL)
4360 return KERN_INVALID_ARGUMENT;
4361
4362 if (upl->flags & UPL_IO_WIRE) {
4363 boolean_t empty;
4364 return upl_commit_range(upl,
4365 0, upl->size, 0,
4366 NULL, 0, &empty);
4367 }
4368
4369 upl_lock(upl);
4370 if(upl->flags & UPL_DEVICE_MEMORY) {
4371 upl_unlock(upl);
4372 return KERN_SUCCESS;
4373 }
4374
4375 object = upl->map_object;
4376
4377 if (object == NULL) {
4378 panic("upl_abort: upl object is not backed by an object");
4379 upl_unlock(upl);
4380 return KERN_INVALID_ARGUMENT;
4381 }
4382
4383 if(object->pageout) {
4384 shadow_object = object->shadow;
4385 shadow_offset = object->shadow_offset;
4386 } else {
4387 shadow_object = object;
4388 shadow_offset = upl->offset - object->paging_offset;
4389 }
4390
4391 if(upl->flags & UPL_INTERNAL) {
4392 lite_list = (wpl_array_t)
4393 ((((uintptr_t)upl) + sizeof(struct upl))
4394 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4395 } else {
4396 lite_list = (wpl_array_t)
4397 (((uintptr_t)upl) + sizeof(struct upl));
4398 }
4399 offset = 0;
4400
4401 if (object != shadow_object)
4402 vm_object_lock(object);
4403 vm_object_lock(shadow_object);
4404
4405 shadow_internal = shadow_object->internal;
4406
4407 for(i = 0; i<(upl->size); i+=PAGE_SIZE, offset += PAGE_SIZE_64) {
4408 m = VM_PAGE_NULL;
4409 target_offset = offset + shadow_offset;
4410 if(upl->flags & UPL_LITE) {
4411 int pg_num;
4412 pg_num = offset/PAGE_SIZE;
4413 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4414 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4415 m = vm_page_lookup(
4416 shadow_object, target_offset);
4417 }
4418 }
4419 if(object->pageout) {
4420 if ((t = vm_page_lookup(object, offset)) != NULL) {
4421 t->pageout = FALSE;
4422 VM_PAGE_FREE(t);
4423 if(m == NULL) {
4424 m = vm_page_lookup(
4425 shadow_object, target_offset);
4426 }
4427 if(m != VM_PAGE_NULL)
4428 vm_object_paging_end(m->object);
4429 }
4430 }
4431 if(m != VM_PAGE_NULL) {
4432 vm_page_lock_queues();
4433 if(m->absent) {
4434 boolean_t must_free = TRUE;
4435
4436 /* COPYOUT = FALSE case */
4437 /* check for error conditions which must */
4438 /* be passed back to the page's customer */
4439 if(error & UPL_ABORT_RESTART) {
4440 m->restart = TRUE;
4441 m->absent = FALSE;
4442 vm_object_absent_release(m->object);
4443 m->page_error = KERN_MEMORY_ERROR;
4444 m->error = TRUE;
4445 must_free = FALSE;
4446 } else if(error & UPL_ABORT_UNAVAILABLE) {
4447 m->restart = FALSE;
4448 m->unusual = TRUE;
4449 must_free = FALSE;
4450 } else if(error & UPL_ABORT_ERROR) {
4451 m->restart = FALSE;
4452 m->absent = FALSE;
4453 vm_object_absent_release(m->object);
4454 m->page_error = KERN_MEMORY_ERROR;
4455 m->error = TRUE;
4456 must_free = FALSE;
4457 }
4458
4459 /*
4460 * ENCRYPTED SWAP:
4461 * If the page was already encrypted,
4462 * we don't really need to decrypt it
4463 * now. It will get decrypted later,
4464 * on demand, as soon as someone needs
4465 * to access its contents.
4466 */
4467
4468 m->cleaning = FALSE;
4469 m->overwriting = FALSE;
4470 PAGE_WAKEUP_DONE(m);
4471
4472 if (must_free == TRUE) {
4473 vm_page_free(m);
4474 } else {
4475 vm_page_activate(m);
4476 }
4477 vm_page_unlock_queues();
4478 continue;
4479 }
4480 /*
4481 * Handle the trusted pager throttle.
4482 */
4483 if (m->laundry) {
4484 vm_pageout_throttle_up(m);
4485 }
4486 if(m->pageout) {
4487 assert(m->busy);
4488 assert(m->wire_count == 1);
4489 m->pageout = FALSE;
4490 vm_page_unwire(m);
4491 }
4492 m->dump_cleaning = FALSE;
4493 m->cleaning = FALSE;
4494 m->overwriting = FALSE;
4495 #if MACH_PAGEMAP
4496 vm_external_state_clr(
4497 m->object->existence_map, m->offset);
4498 #endif /* MACH_PAGEMAP */
4499 if(error & UPL_ABORT_DUMP_PAGES) {
4500 vm_page_free(m);
4501 pmap_disconnect(m->phys_page);
4502 } else {
4503 PAGE_WAKEUP_DONE(m);
4504 }
4505 vm_page_unlock_queues();
4506 }
4507 }
4508 occupied = 1;
4509 if (upl->flags & UPL_DEVICE_MEMORY) {
4510 occupied = 0;
4511 } else if (upl->flags & UPL_LITE) {
4512 int pg_num;
4513 int j;
4514 pg_num = upl->size/PAGE_SIZE;
4515 pg_num = (pg_num + 31) >> 5;
4516 occupied = 0;
4517 for(j= 0; j<pg_num; j++) {
4518 if(lite_list[j] != 0) {
4519 occupied = 1;
4520 break;
4521 }
4522 }
4523 } else {
4524 if(queue_empty(&upl->map_object->memq)) {
4525 occupied = 0;
4526 }
4527 }
4528
4529 if(occupied == 0) {
4530 if(object == shadow_object)
4531 vm_object_paging_end(shadow_object);
4532 }
4533 vm_object_unlock(shadow_object);
4534 if (object != shadow_object)
4535 vm_object_unlock(object);
4536
4537 upl_unlock(upl);
4538 return KERN_SUCCESS;
4539 }
4540
4541 /* an option on commit should be wire */
4542 kern_return_t
4543 upl_commit(
4544 upl_t upl,
4545 upl_page_info_t *page_list,
4546 mach_msg_type_number_t count)
4547 {
4548 if (upl == UPL_NULL)
4549 return KERN_INVALID_ARGUMENT;
4550
4551 if(upl->flags & (UPL_LITE | UPL_IO_WIRE)) {
4552 boolean_t empty;
4553 return upl_commit_range(upl, 0, upl->size, 0,
4554 page_list, count, &empty);
4555 }
4556
4557 if (count == 0)
4558 page_list = NULL;
4559
4560 upl_lock(upl);
4561 if (upl->flags & UPL_DEVICE_MEMORY)
4562 page_list = NULL;
4563
4564 if (upl->flags & UPL_ENCRYPTED) {
4565 /*
4566 * ENCRYPTED SWAP:
4567 * This UPL was encrypted, but we don't need
4568 * to decrypt here. We'll decrypt each page
4569 * later, on demand, as soon as someone needs
4570 * to access the page's contents.
4571 */
4572 }
4573
4574 if ((upl->flags & UPL_CLEAR_DIRTY) ||
4575 (upl->flags & UPL_PAGE_SYNC_DONE) || page_list) {
4576 vm_object_t shadow_object = upl->map_object->shadow;
4577 vm_object_t object = upl->map_object;
4578 vm_object_offset_t target_offset;
4579 upl_size_t xfer_end;
4580 int entry;
4581
4582 vm_page_t t, m;
4583 upl_page_info_t *p;
4584
4585 if (object != shadow_object)
4586 vm_object_lock(object);
4587 vm_object_lock(shadow_object);
4588
4589 entry = 0;
4590 target_offset = object->shadow_offset;
4591 xfer_end = upl->size + object->shadow_offset;
4592
4593 while(target_offset < xfer_end) {
4594
4595 if ((t = vm_page_lookup(object,
4596 target_offset - object->shadow_offset))
4597 == NULL) {
4598 target_offset += PAGE_SIZE_64;
4599 entry++;
4600 continue;
4601 }
4602
4603 m = vm_page_lookup(shadow_object, target_offset);
4604 if(m != VM_PAGE_NULL) {
4605 /*
4606 * ENCRYPTED SWAP:
4607 * If this page was encrypted, we
4608 * don't need to decrypt it here.
4609 * We'll decrypt it later, on demand,
4610 * as soon as someone needs to access
4611 * its contents.
4612 */
4613
4614 if (upl->flags & UPL_CLEAR_DIRTY) {
4615 pmap_clear_modify(m->phys_page);
4616 m->dirty = FALSE;
4617 }
4618 /* It is part of the semantics of */
4619 /* COPYOUT_FROM UPLs that a commit */
4620 /* implies a cache sync between the */
4621 /* vm page and the backing store; */
4622 /* this can be used to strip the */
4623 /* precious bit as well as clean */
4624 if (upl->flags & UPL_PAGE_SYNC_DONE)
4625 m->precious = FALSE;
4626
4627 if(page_list) {
4628 p = &(page_list[entry]);
4629 if(page_list[entry].phys_addr &&
4630 p->pageout && !m->pageout) {
4631 vm_page_lock_queues();
4632 m->busy = TRUE;
4633 m->pageout = TRUE;
4634 vm_page_wire(m);
4635 vm_page_unlock_queues();
4636 } else if (page_list[entry].phys_addr &&
4637 !p->pageout && m->pageout &&
4638 !m->dump_cleaning) {
4639 vm_page_lock_queues();
4640 m->pageout = FALSE;
4641 m->absent = FALSE;
4642 m->overwriting = FALSE;
4643 vm_page_unwire(m);
4644 PAGE_WAKEUP_DONE(m);
4645 vm_page_unlock_queues();
4646 }
4647 page_list[entry].phys_addr = 0;
4648 }
4649 }
4650 target_offset += PAGE_SIZE_64;
4651 entry++;
4652 }
4653 vm_object_unlock(shadow_object);
4654 if (object != shadow_object)
4655 vm_object_unlock(object);
4656
4657 }
4658 if (upl->flags & UPL_DEVICE_MEMORY) {
4659 vm_object_lock(upl->map_object->shadow);
4660 if(upl->map_object == upl->map_object->shadow)
4661 vm_object_paging_end(upl->map_object->shadow);
4662 vm_object_unlock(upl->map_object->shadow);
4663 }
4664 upl_unlock(upl);
4665 return KERN_SUCCESS;
4666 }
4667
4668
4669
4670 kern_return_t
4671 vm_object_iopl_request(
4672 vm_object_t object,
4673 vm_object_offset_t offset,
4674 upl_size_t size,
4675 upl_t *upl_ptr,
4676 upl_page_info_array_t user_page_list,
4677 unsigned int *page_list_count,
4678 int cntrl_flags)
4679 {
4680 vm_page_t dst_page;
4681 vm_object_offset_t dst_offset = offset;
4682 upl_size_t xfer_size = size;
4683 upl_t upl = NULL;
4684 unsigned int entry;
4685 wpl_array_t lite_list = NULL;
4686 int page_field_size;
4687 int delayed_unlock = 0;
4688 int no_zero_fill = FALSE;
4689 vm_page_t alias_page = NULL;
4690 kern_return_t ret;
4691 vm_prot_t prot;
4692
4693
4694 if (cntrl_flags & ~UPL_VALID_FLAGS) {
4695 /*
4696 * For forward compatibility's sake,
4697 * reject any unknown flag.
4698 */
4699 return KERN_INVALID_VALUE;
4700 }
4701
4702 if (cntrl_flags & UPL_ENCRYPT) {
4703 /*
4704 * ENCRYPTED SWAP:
4705 * The paging path doesn't use this interface,
4706 * so we don't support the UPL_ENCRYPT flag
4707 * here. We won't encrypt the pages.
4708 */
4709 assert(! (cntrl_flags & UPL_ENCRYPT));
4710 }
4711
4712 if (cntrl_flags & UPL_NOZEROFILL)
4713 no_zero_fill = TRUE;
4714
4715 if (cntrl_flags & UPL_COPYOUT_FROM)
4716 prot = VM_PROT_READ;
4717 else
4718 prot = VM_PROT_READ | VM_PROT_WRITE;
4719
4720 if(((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
4721 size = MAX_UPL_TRANSFER * page_size;
4722 }
4723
4724 if(cntrl_flags & UPL_SET_INTERNAL)
4725 if(page_list_count != NULL)
4726 *page_list_count = MAX_UPL_TRANSFER;
4727 if(((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4728 ((page_list_count != NULL) && (*page_list_count != 0)
4729 && *page_list_count < (size/page_size)))
4730 return KERN_INVALID_ARGUMENT;
4731
4732 if((!object->internal) && (object->paging_offset != 0))
4733 panic("vm_object_upl_request: vnode object with non-zero paging offset\n");
4734
4735 if(object->phys_contiguous) {
4736 /* No paging operations are possible against this memory */
4737 /* and so no need for map object, ever */
4738 cntrl_flags |= UPL_SET_LITE;
4739 }
4740
4741 if(upl_ptr) {
4742 if(cntrl_flags & UPL_SET_INTERNAL) {
4743 if(cntrl_flags & UPL_SET_LITE) {
4744 upl = upl_create(
4745 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
4746 size);
4747 user_page_list = (upl_page_info_t *)
4748 (((uintptr_t)upl) + sizeof(struct upl));
4749 lite_list = (wpl_array_t)
4750 (((uintptr_t)user_page_list) +
4751 ((size/PAGE_SIZE) *
4752 sizeof(upl_page_info_t)));
4753 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4754 page_field_size =
4755 (page_field_size + 3) & 0xFFFFFFFC;
4756 bzero((char *)lite_list, page_field_size);
4757 upl->flags =
4758 UPL_LITE | UPL_INTERNAL | UPL_IO_WIRE;
4759 } else {
4760 upl = upl_create(UPL_CREATE_INTERNAL, size);
4761 user_page_list = (upl_page_info_t *)
4762 (((uintptr_t)upl)
4763 + sizeof(struct upl));
4764 upl->flags = UPL_INTERNAL | UPL_IO_WIRE;
4765 }
4766 } else {
4767 if(cntrl_flags & UPL_SET_LITE) {
4768 upl = upl_create(UPL_CREATE_LITE, size);
4769 lite_list = (wpl_array_t)
4770 (((uintptr_t)upl) + sizeof(struct upl));
4771 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4772 page_field_size =
4773 (page_field_size + 3) & 0xFFFFFFFC;
4774 bzero((char *)lite_list, page_field_size);
4775 upl->flags = UPL_LITE | UPL_IO_WIRE;
4776 } else {
4777 upl = upl_create(UPL_CREATE_EXTERNAL, size);
4778 upl->flags = UPL_IO_WIRE;
4779 }
4780 }
4781
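		/*
		 * Sizing note (illustrative): page_field_size above is the size
		 * of the "lite" bitmap in bytes -- one bit per page, rounded up
		 * to whole bytes and then to a 4-byte boundary.  For example, a
		 * 13-page request needs 13 bits -> (13 + 7) >> 3 = 2 bytes ->
		 * (2 + 3) & 0xFFFFFFFC = 4 bytes handed to bzero().  Equivalent
		 * helper (not part of the build, hypothetical name):
		 */
#if 0
		unsigned int
		lite_bitmap_bytes(unsigned int npages)
		{
			unsigned int bytes = (npages + 7) >> 3;	/* bits -> bytes */
			return (bytes + 3) & 0xFFFFFFFC;	/* 4-byte multiple */
		}
#endif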
4782 if(object->phys_contiguous) {
4783 upl->map_object = object;
4784 /* don't need any shadow mappings for this one */
4785 /* since it is already I/O memory */
4786 upl->flags |= UPL_DEVICE_MEMORY;
4787
4788 vm_object_lock(object);
4789 vm_object_paging_begin(object);
4790 vm_object_unlock(object);
4791
4792 /* paging in progress also protects the paging_offset */
4793 upl->offset = offset + object->paging_offset;
4794 upl->size = size;
4795 *upl_ptr = upl;
4796 if(user_page_list) {
4797 user_page_list[0].phys_addr =
4798 (offset + object->shadow_offset)>>PAGE_SHIFT;
4799 user_page_list[0].device = TRUE;
4800 }
4801
4802 if(page_list_count != NULL) {
4803 if (upl->flags & UPL_INTERNAL) {
4804 *page_list_count = 0;
4805 } else {
4806 *page_list_count = 1;
4807 }
4808 }
4809 return KERN_SUCCESS;
4810 }
4811 if(user_page_list)
4812 user_page_list[0].device = FALSE;
4813
4814 if(cntrl_flags & UPL_SET_LITE) {
4815 upl->map_object = object;
4816 } else {
4817 upl->map_object = vm_object_allocate(size);
4818 vm_object_lock(upl->map_object);
4819 upl->map_object->shadow = object;
4820 upl->map_object->pageout = TRUE;
4821 upl->map_object->can_persist = FALSE;
4822 upl->map_object->copy_strategy =
4823 MEMORY_OBJECT_COPY_NONE;
4824 upl->map_object->shadow_offset = offset;
4825 upl->map_object->wimg_bits = object->wimg_bits;
4826 vm_object_unlock(upl->map_object);
4827 }
4828 }
4829 vm_object_lock(object);
4830 vm_object_paging_begin(object);
4831
4832 if (!object->phys_contiguous) {
4833 /* Protect user space from future COW operations */
4834 object->true_share = TRUE;
4835 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4836 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4837 }
4838
4839 /* we can lock the upl offset now that paging_in_progress is set */
4840 if(upl_ptr) {
4841 upl->size = size;
4842 upl->offset = offset + object->paging_offset;
4843 *upl_ptr = upl;
4844 #ifdef UPL_DEBUG
4845 queue_enter(&object->uplq, upl, upl_t, uplq);
4846 #endif /* UPL_DEBUG */
4847 }
4848
4849 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4850 /*
4851 * The user requested that access to the pages in this UPL
4852 * be blocked until the UPL is committed or aborted.
4853 */
4854 upl->flags |= UPL_ACCESS_BLOCKED;
4855 }
4856
4857 entry = 0;
4858 while (xfer_size) {
4859 if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
4860 if (delayed_unlock) {
4861 delayed_unlock = 0;
4862 vm_page_unlock_queues();
4863 }
4864 vm_object_unlock(object);
4865 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4866 vm_object_lock(object);
4867 }
4868 dst_page = vm_page_lookup(object, dst_offset);
4869
4870 /*
4871 * ENCRYPTED SWAP:
4872 * If the page is encrypted, we need to decrypt it,
4873 * so force a soft page fault.
4874 */
4875 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4876 (dst_page->encrypted) ||
4877 (dst_page->unusual && (dst_page->error ||
4878 dst_page->restart ||
4879 dst_page->absent ||
4880 dst_page->fictitious ||
4881 (prot & dst_page->page_lock)))) {
4882 vm_fault_return_t result;
4883 do {
4884 vm_page_t top_page;
4885 kern_return_t error_code;
4886 int interruptible;
4887
4888 vm_object_offset_t lo_offset = offset;
4889 vm_object_offset_t hi_offset = offset + size;
4890
4891
4892 if (delayed_unlock) {
4893 delayed_unlock = 0;
4894 vm_page_unlock_queues();
4895 }
4896
4897 if(cntrl_flags & UPL_SET_INTERRUPTIBLE) {
4898 interruptible = THREAD_ABORTSAFE;
4899 } else {
4900 interruptible = THREAD_UNINT;
4901 }
4902
4903 result = vm_fault_page(object, dst_offset,
4904 prot | VM_PROT_WRITE, FALSE,
4905 interruptible,
4906 lo_offset, hi_offset,
4907 VM_BEHAVIOR_SEQUENTIAL,
4908 &prot, &dst_page, &top_page,
4909 (int *)0,
4910 &error_code, no_zero_fill, FALSE, NULL, 0);
4911
4912 switch(result) {
4913 case VM_FAULT_SUCCESS:
4914
4915 PAGE_WAKEUP_DONE(dst_page);
4916
4917 /*
4918 * Release paging references and
4919 * top-level placeholder page, if any.
4920 */
4921
4922 if(top_page != VM_PAGE_NULL) {
4923 vm_object_t local_object;
4924 local_object =
4925 top_page->object;
4926 if(top_page->object
4927 != dst_page->object) {
4928 vm_object_lock(
4929 local_object);
4930 VM_PAGE_FREE(top_page);
4931 vm_object_paging_end(
4932 local_object);
4933 vm_object_unlock(
4934 local_object);
4935 } else {
4936 VM_PAGE_FREE(top_page);
4937 vm_object_paging_end(
4938 local_object);
4939 }
4940 }
4941
4942 break;
4943
4944
4945 case VM_FAULT_RETRY:
4946 vm_object_lock(object);
4947 vm_object_paging_begin(object);
4948 break;
4949
4950 case VM_FAULT_FICTITIOUS_SHORTAGE:
4951 vm_page_more_fictitious();
4952 vm_object_lock(object);
4953 vm_object_paging_begin(object);
4954 break;
4955
4956 case VM_FAULT_MEMORY_SHORTAGE:
4957 if (vm_page_wait(interruptible)) {
4958 vm_object_lock(object);
4959 vm_object_paging_begin(object);
4960 break;
4961 }
4962 /* fall thru */
4963
4964 case VM_FAULT_INTERRUPTED:
4965 error_code = MACH_SEND_INTERRUPTED;
4966 case VM_FAULT_MEMORY_ERROR:
4967 ret = (error_code ? error_code:
4968 KERN_MEMORY_ERROR);
4969 vm_object_lock(object);
4970 for(; offset < dst_offset;
4971 offset += PAGE_SIZE) {
4972 dst_page = vm_page_lookup(
4973 object, offset);
4974 if(dst_page == VM_PAGE_NULL)
4975 panic("vm_object_iopl_request: Wired pages missing. \n");
4976 vm_page_lock_queues();
4977 vm_page_unwire(dst_page);
4978 vm_page_unlock_queues();
4979 VM_STAT(reactivations++);
4980 }
4981 vm_object_unlock(object);
4982 upl_destroy(upl);
4983 return ret;
4984 }
4985 } while ((result != VM_FAULT_SUCCESS)
4986 || (result == VM_FAULT_INTERRUPTED));
4987 }
4988 if (delayed_unlock == 0)
4989 vm_page_lock_queues();
4990 vm_page_wire(dst_page);
4991
4992 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4993 /*
4994 * Mark the page "busy" to block any future page fault
4995 * on this page. We'll also remove the mapping
4996 * of all these pages before leaving this routine.
4997 */
4998 assert(!dst_page->fictitious);
4999 dst_page->busy = TRUE;
5000 }
5001
5002 if (upl_ptr) {
5003 if (cntrl_flags & UPL_SET_LITE) {
5004 int pg_num;
5005 pg_num = (dst_offset-offset)/PAGE_SIZE;
5006 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5007 } else {
5008 /*
5009 * Convert the fictitious page to a
5010 * private shadow of the real page.
5011 */
5012 assert(alias_page->fictitious);
5013 alias_page->fictitious = FALSE;
5014 alias_page->private = TRUE;
5015 alias_page->pageout = TRUE;
5016 alias_page->phys_page = dst_page->phys_page;
5017 vm_page_wire(alias_page);
5018
5019 vm_page_insert(alias_page,
5020 upl->map_object, size - xfer_size);
5021 assert(!alias_page->wanted);
5022 alias_page->busy = FALSE;
5023 alias_page->absent = FALSE;
5024 }
5025
5026 /* expect the page to be used */
5027 dst_page->reference = TRUE;
5028
5029 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5030 dst_page->dirty = TRUE;
5031 alias_page = NULL;
5032
5033 if (user_page_list) {
5034 user_page_list[entry].phys_addr
5035 = dst_page->phys_page;
5036 user_page_list[entry].dirty =
5037 dst_page->dirty;
5038 user_page_list[entry].pageout =
5039 dst_page->pageout;
5040 user_page_list[entry].absent =
5041 dst_page->absent;
5042 user_page_list[entry].precious =
5043 dst_page->precious;
5044 }
5045 }
5046 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
5047 delayed_unlock = 0;
5048 vm_page_unlock_queues();
5049 }
5050 entry++;
5051 dst_offset += PAGE_SIZE_64;
5052 xfer_size -= PAGE_SIZE;
5053 }
5054 if (delayed_unlock)
5055 vm_page_unlock_queues();
5056
5057 if (upl->flags & UPL_INTERNAL) {
5058 if(page_list_count != NULL)
5059 *page_list_count = 0;
5060 } else if ((page_list_count != NULL) &&
5061 (*page_list_count > entry)) {
5062 *page_list_count = entry;
5063 }
5064
5065 if (alias_page != NULL) {
5066 vm_page_lock_queues();
5067 vm_page_free(alias_page);
5068 vm_page_unlock_queues();
5069 }
5070
5071 vm_object_unlock(object);
5072
5073 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5074 /*
5075 * We've marked all the pages "busy" so that future
5076 * page faults will block.
5077 * Now remove the mapping for these pages, so that they
5078 * can't be accessed without causing a page fault.
5079 */
5080 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5081 PMAP_NULL, 0, VM_PROT_NONE);
5082 }
5083
5084 return KERN_SUCCESS;
5085 }
5086
5087 kern_return_t
5088 upl_transpose(
5089 upl_t upl1,
5090 upl_t upl2)
5091 {
5092 kern_return_t retval;
5093 boolean_t upls_locked;
5094 vm_object_t object1, object2;
5095
5096 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5097 return KERN_INVALID_ARGUMENT;
5098 }
5099
5100 upls_locked = FALSE;
5101
5102 /*
5103 * Since we need to lock both UPLs at the same time,
5104 * avoid deadlocks by always taking locks in the same order.
5105 */
5106 if (upl1 < upl2) {
5107 upl_lock(upl1);
5108 upl_lock(upl2);
5109 } else {
5110 upl_lock(upl2);
5111 upl_lock(upl1);
5112 }
5113 upls_locked = TRUE; /* the UPLs will need to be unlocked */
5114
5115 object1 = upl1->map_object;
5116 object2 = upl2->map_object;
5117
5118 if (upl1->offset != 0 || upl2->offset != 0 ||
5119 upl1->size != upl2->size) {
5120 /*
5121 * We deal only with full objects, not subsets.
5122 * That's because we exchange the entire backing store info
5123 * for the objects: pager, resident pages, etc... We can't do
5124 * only part of it.
5125 */
5126 retval = KERN_INVALID_VALUE;
5127 goto done;
5128 }
5129
5130 /*
5131 * Transpose the VM objects' backing store.
5132 */
5133 retval = vm_object_transpose(object1, object2,
5134 (vm_object_size_t) upl1->size);
5135
5136 if (retval == KERN_SUCCESS) {
5137 /*
5138 * Make each UPL point to the correct VM object, i.e. the
5139 * object holding the pages that the UPL refers to...
5140 */
5141 upl1->map_object = object2;
5142 upl2->map_object = object1;
5143 }
5144
5145 done:
5146 /*
5147 * Cleanup.
5148 */
5149 if (upls_locked) {
5150 upl_unlock(upl1);
5151 upl_unlock(upl2);
5152 upls_locked = FALSE;
5153 }
5154
5155 return retval;
5156 }
5157
5158 /*
5159 * ENCRYPTED SWAP:
5160 *
5161 * Rationale: the user might have some encrypted data on disk (via
5162 * FileVault or any other mechanism). That data is then decrypted in
5163 * memory, which is safe as long as the machine is secure. But that
5164 * decrypted data in memory could be paged out to disk by the default
5165 * pager. The data would then be stored on disk in the clear (not encrypted)
5166 * and it could be accessed by anyone who gets physical access to the
5167 * disk (if the laptop or the disk gets stolen for example). This weakens
5168 * the security offered by FileVault.
5169 *
5170 * Solution: the default pager will optionally request that all the
5171 * pages it gathers for pageout be encrypted, via the UPL interfaces,
5172 * before it sends this UPL to disk via the vnode_pageout() path.
5173 *
5174 * Notes:
5175 *
5176 * To avoid disrupting the VM LRU algorithms, we want to keep the
5177 * clean-in-place mechanisms, which allow us to send some extra pages to
5178 * swap (clustering) without actually removing them from the user's
5179 * address space. We don't want the user to unknowingly access encrypted
5180 * data, so we have to actually remove the encrypted pages from the page
5181 * table. When the user accesses the data, the hardware will fail to
5182 * locate the virtual page in its page table and will trigger a page
5183 * fault. We can then decrypt the page and enter it in the page table
5184 * again. Whenever we allow the user to access the contents of a page,
5185 * we have to make sure it's not encrypted.
5186 *
5187 *
5188 */
5189 /*
5190 * ENCRYPTED SWAP:
5191 * Reserve of virtual addresses in the kernel address space.
5192 * We need to map the physical pages in the kernel, so that we
5193 * can call the encryption/decryption routines with a kernel
5194 * virtual address. We keep this pool of pre-allocated kernel
5195 * virtual addresses so that we don't have to scan the kernel's
5196 * virtual address space each time we need to encrypt or decrypt
5197 * a physical page.
5198 * It would be nice to be able to encrypt and decrypt in physical
5199 * mode but that might not always be more efficient...
5200 */
5201 decl_simple_lock_data(,vm_paging_lock)
5202 #define VM_PAGING_NUM_PAGES 64
5203 vm_map_offset_t vm_paging_base_address = 0;
5204 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5205 int vm_paging_max_index = 0;
5206 unsigned long vm_paging_no_kernel_page = 0;
5207 unsigned long vm_paging_objects_mapped = 0;
5208 unsigned long vm_paging_pages_mapped = 0;
5209 unsigned long vm_paging_objects_mapped_slow = 0;
5210 unsigned long vm_paging_pages_mapped_slow = 0;
5211
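/*
 * Illustrative sketch (not part of the build): the pool above is one
 * contiguous window of VM_PAGING_NUM_PAGES pages of kernel virtual space
 * starting at vm_paging_base_address, with one "in use" boolean per slot.
 * Slot index and kernel virtual address therefore convert with simple page
 * arithmetic, which is what the fast paths in vm_paging_map_object() and
 * vm_paging_unmap_object() below rely on (helper names are hypothetical):
 */
#if 0
static vm_map_offset_t
vm_paging_slot_to_addr(int i)
{
	return vm_paging_base_address + ((vm_map_offset_t)i * PAGE_SIZE);
}

static int
vm_paging_addr_to_slot(vm_map_offset_t addr)
{
	return (int)((addr - vm_paging_base_address) >> PAGE_SHIFT);
}
#endif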
5212 /*
5213 * ENCRYPTED SWAP:
5214 * vm_paging_map_object:
5215 * Maps part of a VM object's pages in the kernel
5216 * virtual address space, using the pre-allocated
5217 * kernel virtual addresses, if possible.
5218 * Context:
5219 * The VM object is locked. This lock will get
5220 * dropped and re-acquired though.
5221 */
5222 kern_return_t
5223 vm_paging_map_object(
5224 vm_map_offset_t *address,
5225 vm_page_t page,
5226 vm_object_t object,
5227 vm_object_offset_t offset,
5228 vm_map_size_t *size)
5229 {
5230 kern_return_t kr;
5231 vm_map_offset_t page_map_offset;
5232 vm_map_size_t map_size;
5233 vm_object_offset_t object_offset;
5234 #ifdef __ppc__
5235 int i;
5236 vm_map_entry_t map_entry;
5237 #endif /* __ppc__ */
5238
5239
5240 #ifdef __ppc__
5241 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5242 /*
5243 * Optimization for the PowerPC.
5244 * Use one of the pre-allocated kernel virtual addresses
5245 * and just enter the VM page in the kernel address space
5246 * at that virtual address.
5247 */
5248 vm_object_unlock(object);
5249 simple_lock(&vm_paging_lock);
5250
5251 if (vm_paging_base_address == 0) {
5252 /*
5253 * Initialize our pool of pre-allocated kernel
5254 * virtual addresses.
5255 */
5256 simple_unlock(&vm_paging_lock);
5257 page_map_offset = 0;
5258 kr = vm_map_find_space(kernel_map,
5259 &page_map_offset,
5260 VM_PAGING_NUM_PAGES * PAGE_SIZE,
5261 0,
5262 &map_entry);
5263 if (kr != KERN_SUCCESS) {
5264 panic("vm_paging_map_object: "
5265 "kernel_map full\n");
5266 }
5267 map_entry->object.vm_object = kernel_object;
5268 map_entry->offset =
5269 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5270 vm_object_reference(kernel_object);
5271 vm_map_unlock(kernel_map);
5272
5273 simple_lock(&vm_paging_lock);
5274 if (vm_paging_base_address != 0) {
5275 /* someone raced us and won: undo */
5276 simple_unlock(&vm_paging_lock);
5277 kr = vm_map_remove(kernel_map,
5278 page_map_offset,
5279 page_map_offset +
5280 (VM_PAGING_NUM_PAGES
5281 * PAGE_SIZE),
5282 VM_MAP_NO_FLAGS);
5283 assert(kr == KERN_SUCCESS);
5284 simple_lock(&vm_paging_lock);
5285 } else {
5286 vm_paging_base_address = page_map_offset;
5287 }
5288 }
5289
5290 /*
5291 * Try and find an available kernel virtual address
5292 * from our pre-allocated pool.
5293 */
5294 page_map_offset = 0;
5295 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5296 if (vm_paging_page_inuse[i] == FALSE) {
5297 page_map_offset = vm_paging_base_address +
5298 (i * PAGE_SIZE);
5299 break;
5300 }
5301 }
5302
5303 if (page_map_offset != 0) {
5304 /*
5305 * We found a kernel virtual address;
5306 * map the physical page to that virtual address.
5307 */
5308 if (i > vm_paging_max_index) {
5309 vm_paging_max_index = i;
5310 }
5311 vm_paging_page_inuse[i] = TRUE;
5312 simple_unlock(&vm_paging_lock);
5313 pmap_map_block(kernel_pmap,
5314 page_map_offset,
5315 page->phys_page,
5316 1, /* Size is number of 4k pages */
5317 VM_PROT_DEFAULT,
5318 ((int) page->object->wimg_bits &
5319 VM_WIMG_MASK),
5320 0);
5321 vm_paging_objects_mapped++;
5322 vm_paging_pages_mapped++;
5323 *address = page_map_offset;
5324 vm_object_lock(object);
5325
5326 /* all done and mapped, ready to use ! */
5327 return KERN_SUCCESS;
5328 }
5329
5330 /*
5331 * We ran out of pre-allocated kernel virtual
5332 * addresses. Just map the page in the kernel
5333 * the slow and regular way.
5334 */
5335 vm_paging_no_kernel_page++;
5336 simple_unlock(&vm_paging_lock);
5337 vm_object_lock(object);
5338 }
5339 #endif /* __ppc__ */
5340
5341 object_offset = vm_object_trunc_page(offset);
5342 map_size = vm_map_round_page(*size);
5343
5344 /*
5345 * Try and map the required range of the object
5346 * in the kernel_map
5347 */
5348
5349 /* don't go beyond the object's end... */
5350 if (object_offset >= object->size) {
5351 map_size = 0;
5352 } else if (map_size > object->size - offset) {
5353 map_size = object->size - offset;
5354 }
5355
5356 vm_object_reference_locked(object); /* for the map entry */
5357 vm_object_unlock(object);
5358
5359 kr = vm_map_enter(kernel_map,
5360 address,
5361 map_size,
5362 0,
5363 VM_FLAGS_ANYWHERE,
5364 object,
5365 object_offset,
5366 FALSE,
5367 VM_PROT_DEFAULT,
5368 VM_PROT_ALL,
5369 VM_INHERIT_NONE);
5370 if (kr != KERN_SUCCESS) {
5371 *address = 0;
5372 *size = 0;
5373 vm_object_deallocate(object); /* for the map entry */
5374 return kr;
5375 }
5376
5377 *size = map_size;
5378
5379 /*
5380 * Enter the mapped pages in the page table now.
5381 */
5382 vm_object_lock(object);
5383 for (page_map_offset = 0;
5384 map_size != 0;
5385 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5386 unsigned int cache_attr;
5387
5388 page = vm_page_lookup(object, offset + page_map_offset);
5389 if (page == VM_PAGE_NULL) {
5390 panic("vm_paging_map_object: no page !?");
5391 }
5392 if (page->no_isync == TRUE) {
5393 pmap_sync_page_data_phys(page->phys_page);
5394 }
5395 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5396
5397 PMAP_ENTER(kernel_pmap,
5398 *address + page_map_offset,
5399 page,
5400 VM_PROT_DEFAULT,
5401 cache_attr,
5402 FALSE);
5403 }
5404
5405 vm_paging_objects_mapped_slow++;
5406 vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5407
5408 return KERN_SUCCESS;
5409 }
5410
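/*
 * Usage sketch (illustrative, not part of the build): a caller typically maps
 * a single busy page, works on it through the returned kernel virtual
 * address, and then tears the mapping down again, the way vm_page_encrypt()
 * and vm_page_decrypt() below do.  Assumes "page" is busy and its object is
 * locked on entry:
 */
#if 0
	vm_map_offset_t	kva = 0;
	vm_map_size_t	map_size = PAGE_SIZE;
	kern_return_t	kr;

	kr = vm_paging_map_object(&kva, page, page->object,
				  page->offset, &map_size);
	if (kr == KERN_SUCCESS) {
		/* ... access the page through CAST_DOWN(vm_offset_t, kva) ... */
		vm_paging_unmap_object(page->object, kva, kva + map_size);
	}
#endif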
5411 /*
5412 * ENCRYPTED SWAP:
5413 * vm_paging_unmap_object:
5414 * Unmaps part of a VM object's pages from the kernel
5415 * virtual address space.
5416 * Context:
5417 * The VM object is locked. This lock will get
5418 * dropped and re-acquired though.
5419 */
5420 void
5421 vm_paging_unmap_object(
5422 vm_object_t object,
5423 vm_map_offset_t start,
5424 vm_map_offset_t end)
5425 {
5426 kern_return_t kr;
5427 #ifdef __ppc__
5428 int i;
5429 #endif /* __ppc__ */
5430
5431 if ((vm_paging_base_address == 0) ||
5432 (start < vm_paging_base_address) ||
5433 (end > (vm_paging_base_address
5434 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
5435 /*
5436 * We didn't use our pre-allocated pool of
5437 * kernel virtual addresses. Deallocate the
5438 * virtual memory.
5439 */
5440 if (object != VM_OBJECT_NULL) {
5441 vm_object_unlock(object);
5442 }
5443 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5444 if (object != VM_OBJECT_NULL) {
5445 vm_object_lock(object);
5446 }
5447 assert(kr == KERN_SUCCESS);
5448 } else {
5449 /*
5450 * We used a kernel virtual address from our
5451 * pre-allocated pool. Put it back in the pool
5452 * for next time.
5453 */
5454 #ifdef __ppc__
5455 assert(end - start == PAGE_SIZE);
5456 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5457
5458 /* undo the pmap mapping */
5459 mapping_remove(kernel_pmap, start);
5460
5461 simple_lock(&vm_paging_lock);
5462 vm_paging_page_inuse[i] = FALSE;
5463 simple_unlock(&vm_paging_lock);
5464 #endif /* __ppc__ */
5465 }
5466 }
5467
5468 /*
5469 * Encryption data.
5470 * "iv" is the "initial vector". Ideally, we want to
5471 * have a different one for each page we encrypt, so that
5472 * crackers can't find encryption patterns too easily.
5473 */
5474 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
5475 boolean_t swap_crypt_ctx_initialized = FALSE;
5476 aes_32t swap_crypt_key[8]; /* big enough for a 256 key */
5477 aes_ctx swap_crypt_ctx;
5478 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5479
5480 #if DEBUG
5481 boolean_t swap_crypt_ctx_tested = FALSE;
5482 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5483 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5484 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5485 #endif /* DEBUG */
5486
5487 extern u_long random(void);
5488
5489 /*
5490 * Initialize the encryption context: key and key size.
5491 */
5492 void swap_crypt_ctx_initialize(void); /* forward */
5493 void
5494 swap_crypt_ctx_initialize(void)
5495 {
5496 unsigned int i;
5497
5498 /*
5499 * No need for locking to protect swap_crypt_ctx_initialized
5500 * because the first use of encryption will come from the
5501 * pageout thread (we won't pagein before there's been a pageout)
5502 * and there's only one pageout thread.
5503 */
5504 if (swap_crypt_ctx_initialized == FALSE) {
5505 for (i = 0;
5506 i < (sizeof (swap_crypt_key) /
5507 sizeof (swap_crypt_key[0]));
5508 i++) {
5509 swap_crypt_key[i] = random();
5510 }
5511 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5512 SWAP_CRYPT_AES_KEY_SIZE,
5513 &swap_crypt_ctx.encrypt);
5514 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5515 SWAP_CRYPT_AES_KEY_SIZE,
5516 &swap_crypt_ctx.decrypt);
5517 swap_crypt_ctx_initialized = TRUE;
5518 }
5519
5520 #if DEBUG
5521 /*
5522 * Validate the encryption algorithms.
5523 */
5524 if (swap_crypt_ctx_tested == FALSE) {
5525 /* initialize */
5526 for (i = 0; i < 4096; i++) {
5527 swap_crypt_test_page_ref[i] = (char) i;
5528 }
5529 /* encrypt */
5530 aes_encrypt_cbc(swap_crypt_test_page_ref,
5531 swap_crypt_null_iv,
5532 PAGE_SIZE / AES_BLOCK_SIZE,
5533 swap_crypt_test_page_encrypt,
5534 &swap_crypt_ctx.encrypt);
5535 /* decrypt */
5536 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5537 swap_crypt_null_iv,
5538 PAGE_SIZE / AES_BLOCK_SIZE,
5539 swap_crypt_test_page_decrypt,
5540 &swap_crypt_ctx.decrypt);
5541 /* compare result with original */
5542 for (i = 0; i < 4096; i ++) {
5543 if (swap_crypt_test_page_decrypt[i] !=
5544 swap_crypt_test_page_ref[i]) {
5545 panic("encryption test failed");
5546 }
5547 }
5548
5549 /* encrypt again */
5550 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5551 swap_crypt_null_iv,
5552 PAGE_SIZE / AES_BLOCK_SIZE,
5553 swap_crypt_test_page_decrypt,
5554 &swap_crypt_ctx.encrypt);
5555 /* decrypt in place */
5556 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5557 swap_crypt_null_iv,
5558 PAGE_SIZE / AES_BLOCK_SIZE,
5559 swap_crypt_test_page_decrypt,
5560 &swap_crypt_ctx.decrypt);
5561 for (i = 0; i < 4096; i ++) {
5562 if (swap_crypt_test_page_decrypt[i] !=
5563 swap_crypt_test_page_ref[i]) {
5564 panic("in place encryption test failed");
5565 }
5566 }
5567
5568 swap_crypt_ctx_tested = TRUE;
5569 }
5570 #endif /* DEBUG */
5571 }
5572
5573 /*
5574 * ENCRYPTED SWAP:
5575 * vm_page_encrypt:
5576 * Encrypt the given page, for secure paging.
5577 * The page might already be mapped at kernel virtual
5578 * address "kernel_mapping_offset". Otherwise, we need
5579 * to map it.
5580 *
5581 * Context:
5582 * The page's object is locked, but this lock will be released
5583 * and re-acquired.
5584 * The page is busy and not accessible by users (not entered in any pmap).
5585 */
5586 void
5587 vm_page_encrypt(
5588 vm_page_t page,
5589 vm_map_offset_t kernel_mapping_offset)
5590 {
5591 int clear_refmod = 0;
5592 kern_return_t kr;
5593 boolean_t page_was_referenced;
5594 boolean_t page_was_modified;
5595 vm_map_size_t kernel_mapping_size;
5596 vm_offset_t kernel_vaddr;
5597 union {
5598 unsigned char aes_iv[AES_BLOCK_SIZE];
5599 struct {
5600 memory_object_t pager_object;
5601 vm_object_offset_t paging_offset;
5602 } vm;
5603 } encrypt_iv;
5604
5605 if (! vm_pages_encrypted) {
5606 vm_pages_encrypted = TRUE;
5607 }
5608
5609 assert(page->busy);
5610 assert(page->dirty || page->precious);
5611
5612 if (page->encrypted) {
5613 /*
5614 * Already encrypted: no need to do it again.
5615 */
5616 vm_page_encrypt_already_encrypted_counter++;
5617 return;
5618 }
5619 ASSERT_PAGE_DECRYPTED(page);
5620
5621 /*
5622 * Gather the "reference" and "modified" status of the page.
5623 * We'll restore these values after the encryption, so that
5624 * the encryption is transparent to the rest of the system
5625 * and doesn't impact the VM's LRU logic.
5626 */
5627 page_was_referenced =
5628 (page->reference || pmap_is_referenced(page->phys_page));
5629 page_was_modified =
5630 (page->dirty || pmap_is_modified(page->phys_page));
5631
5632 if (kernel_mapping_offset == 0) {
5633 /*
5634 * The page hasn't already been mapped in kernel space
5635 * by the caller. Map it now, so that we can access
5636 * its contents and encrypt them.
5637 */
5638 kernel_mapping_size = PAGE_SIZE;
5639 kr = vm_paging_map_object(&kernel_mapping_offset,
5640 page,
5641 page->object,
5642 page->offset,
5643 &kernel_mapping_size);
5644 if (kr != KERN_SUCCESS) {
5645 panic("vm_page_encrypt: "
5646 "could not map page in kernel: 0x%x\n",
5647 kr);
5648 }
5649 } else {
5650 kernel_mapping_size = 0;
5651 }
5652 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5653
5654 if (swap_crypt_ctx_initialized == FALSE) {
5655 swap_crypt_ctx_initialize();
5656 }
5657 assert(swap_crypt_ctx_initialized);
5658
5659 /*
5660 * Prepare an "initial vector" for the encryption.
5661 * We use the "pager" and the "paging_offset" for that
5662 * page to obfuscate the encrypted data a bit more and
5663 * prevent crackers from finding patterns that they could
5664 * use to break the key.
5665 */
5666 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5667 encrypt_iv.vm.pager_object = page->object->pager;
5668 encrypt_iv.vm.paging_offset =
5669 page->object->paging_offset + page->offset;
5670
5671 vm_object_unlock(page->object);
5672
5673 /* encrypt the "initial vector" */
5674 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5675 swap_crypt_null_iv,
5676 1,
5677 &encrypt_iv.aes_iv[0],
5678 &swap_crypt_ctx.encrypt);
5679
5680 /*
5681 * Encrypt the page.
5682 */
5683 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5684 &encrypt_iv.aes_iv[0],
5685 PAGE_SIZE / AES_BLOCK_SIZE,
5686 (unsigned char *) kernel_vaddr,
5687 &swap_crypt_ctx.encrypt);
5688
5689 vm_page_encrypt_counter++;
5690
5691 vm_object_lock(page->object);
5692
5693 /*
5694 * Unmap the page from the kernel's address space,
5695 * if we had to map it ourselves. Otherwise, let
5696 * the caller undo the mapping if needed.
5697 */
5698 if (kernel_mapping_size != 0) {
5699 vm_paging_unmap_object(page->object,
5700 kernel_mapping_offset,
5701 kernel_mapping_offset + kernel_mapping_size);
5702 }
5703
5704 /*
5705 * Restore the "reference" and "modified" bits.
5706 * This should clean up any impact the encryption had
5707 * on them.
5708 */
5709 if (! page_was_referenced) {
5710 clear_refmod |= VM_MEM_REFERENCED;
5711 page->reference = FALSE;
5712 }
5713 if (! page_was_modified) {
5714 clear_refmod |= VM_MEM_MODIFIED;
5715 page->dirty = FALSE;
5716 }
5717 if (clear_refmod)
5718 pmap_clear_refmod(page->phys_page, clear_refmod);
5719
5720 page->encrypted = TRUE;
5721 }
5722
5723 /*
5724 * ENCRYPTED SWAP:
5725 * vm_page_decrypt:
5726 * Decrypt the given page.
5727 * The page might already be mapped at kernel virtual
5728 * address "kernel_mapping_offset". Otherwise, we need
5729 * to map it.
5730 *
5731 * Context:
5732 * The page's VM object is locked but will be unlocked and relocked.
5733 * The page is busy and not accessible by users (not entered in any pmap).
5734 */
5735 void
5736 vm_page_decrypt(
5737 vm_page_t page,
5738 vm_map_offset_t kernel_mapping_offset)
5739 {
5740 int clear_refmod = 0;
5741 kern_return_t kr;
5742 vm_map_size_t kernel_mapping_size;
5743 vm_offset_t kernel_vaddr;
5744 boolean_t page_was_referenced;
5745 union {
5746 unsigned char aes_iv[AES_BLOCK_SIZE];
5747 struct {
5748 memory_object_t pager_object;
5749 vm_object_offset_t paging_offset;
5750 } vm;
5751 } decrypt_iv;
5752
5753 assert(page->busy);
5754 assert(page->encrypted);
5755
5756 /*
5757 * Gather the "reference" status of the page.
5758 * We'll restore its value after the decryption, so that
5759 * the decryption is transparent to the rest of the system
5760 * and doesn't impact the VM's LRU logic.
5761 */
5762 page_was_referenced =
5763 (page->reference || pmap_is_referenced(page->phys_page));
5764
5765 if (kernel_mapping_offset == 0) {
5766 /*
5767 * The page hasn't already been mapped in kernel space
5768 * by the caller. Map it now, so that we can access
5769 * its contents and decrypt them.
5770 */
5771 kernel_mapping_size = PAGE_SIZE;
5772 kr = vm_paging_map_object(&kernel_mapping_offset,
5773 page,
5774 page->object,
5775 page->offset,
5776 &kernel_mapping_size);
5777 if (kr != KERN_SUCCESS) {
5778 panic("vm_page_decrypt: "
5779 "could not map page in kernel: 0x%x\n");
5780 }
5781 } else {
5782 kernel_mapping_size = 0;
5783 }
5784 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5785
5786 assert(swap_crypt_ctx_initialized);
5787
5788 /*
5789 * Prepare an "initial vector" for the decryption.
5790 * It has to be the same as the "initial vector" we
5791 * used to encrypt that page.
5792 */
5793 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5794 decrypt_iv.vm.pager_object = page->object->pager;
5795 decrypt_iv.vm.paging_offset =
5796 page->object->paging_offset + page->offset;
5797
5798 vm_object_unlock(page->object);
5799
5800 /* encrypt the "initial vector" */
5801 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5802 swap_crypt_null_iv,
5803 1,
5804 &decrypt_iv.aes_iv[0],
5805 &swap_crypt_ctx.encrypt);
5806
5807 /*
5808 * Decrypt the page.
5809 */
5810 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5811 &decrypt_iv.aes_iv[0],
5812 PAGE_SIZE / AES_BLOCK_SIZE,
5813 (unsigned char *) kernel_vaddr,
5814 &swap_crypt_ctx.decrypt);
5815 vm_page_decrypt_counter++;
5816
5817 vm_object_lock(page->object);
5818
5819 /*
5820 * Unmap the page from the kernel's address space,
5821 * if we had to map it ourselves. Otherwise, let
5822 * the caller undo the mapping if needed.
5823 */
5824 if (kernel_mapping_size != 0) {
5825 vm_paging_unmap_object(page->object,
5826 kernel_vaddr,
5827 kernel_vaddr + PAGE_SIZE);
5828 }
5829
5830 /*
5831 * After decryption, the page is actually clean.
5832 * It was encrypted as part of paging, which "cleans"
5833 * the "dirty" pages.
5834 * No one could have accessed it after it was encrypted,
5835 * and the decryption itself doesn't count as an access.
5836 */
5837 page->dirty = FALSE;
5838 clear_refmod = VM_MEM_MODIFIED;
5839
5840 /* restore the "reference" bit */
5841 if (! page_was_referenced) {
5842 page->reference = FALSE;
5843 clear_refmod |= VM_MEM_REFERENCED;
5844 }
5845 pmap_clear_refmod(page->phys_page, clear_refmod);
5846
5847 page->encrypted = FALSE;
5848
5849 /*
5850 * We've just modified the page's contents via the data cache and part
5851 * of the new contents might still be in the cache and not yet in RAM.
5852 * Since the page is now available and might get gathered in a UPL to
5853 * be part of a DMA transfer from a driver that expects the memory to
5854 * be coherent at this point, we have to flush the data cache.
5855 */
5856 pmap_sync_page_data_phys(page->phys_page);
5857 /*
5858 * Since the page is not mapped yet, some code might assume that it
5859 * doesn't need to invalidate the instruction cache when writing to
5860 * that page. That code relies on "no_isync" being set, so that the
5861 * caches get synchronized when the page is first mapped. So we need
5862 * to set "no_isync" here too, despite the fact that we just
5863 * synchronized the caches above...
5864 */
5865 page->no_isync = TRUE;
5866 }
5867
5868 unsigned long upl_encrypt_upls = 0;
5869 unsigned long upl_encrypt_pages = 0;
5870
5871 /*
5872 * ENCRYPTED SWAP:
5873 *
5874 * upl_encrypt:
5875 * Encrypts all the pages in the UPL, within the specified range.
5876 *
5877 */
5878 void
5879 upl_encrypt(
5880 upl_t upl,
5881 upl_offset_t crypt_offset,
5882 upl_size_t crypt_size)
5883 {
5884 upl_size_t upl_size;
5885 upl_offset_t upl_offset;
5886 vm_object_t upl_object;
5887 vm_page_t page;
5888 vm_object_t shadow_object;
5889 vm_object_offset_t shadow_offset;
5890 vm_object_offset_t paging_offset;
5891 vm_object_offset_t base_offset;
5892
5893 upl_encrypt_upls++;
5894 upl_encrypt_pages += crypt_size / PAGE_SIZE;
5895
5896 upl_lock(upl);
5897
5898 upl_object = upl->map_object;
5899 upl_offset = upl->offset;
5900 upl_size = upl->size;
5901
5902 upl_unlock(upl);
5903
5904 vm_object_lock(upl_object);
5905
5906 /*
5907 * Find the VM object that contains the actual pages.
5908 */
5909 if (upl_object->pageout) {
5910 shadow_object = upl_object->shadow;
5911 /*
5912 * The offset in the shadow object is actually also
5913 * accounted for in upl->offset. It possibly shouldn't be
5914 * this way, but for now don't account for it twice.
5915 */
5916 shadow_offset = 0;
5917 assert(upl_object->paging_offset == 0); /* XXX ? */
5918 vm_object_lock(shadow_object);
5919 } else {
5920 shadow_object = upl_object;
5921 shadow_offset = 0;
5922 }
5923
5924 paging_offset = shadow_object->paging_offset;
5925 vm_object_paging_begin(shadow_object);
5926
5927 if (shadow_object != upl_object) {
5928 vm_object_unlock(shadow_object);
5929 }
5930 vm_object_unlock(upl_object);
5931
5932 base_offset = shadow_offset;
5933 base_offset += upl_offset;
5934 base_offset += crypt_offset;
5935 base_offset -= paging_offset;
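	/*
	 * At this point "base_offset" is the offset, within the shadow (real)
	 * VM object, of the first page to encrypt: upl->offset is expressed in
	 * the pager's namespace (paging_offset was added when the UPL was
	 * created), so subtracting paging_offset converts it back, and
	 * crypt_offset selects where in the UPL to start.
	 */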
5936 /*
5937 * Unmap the pages, so that nobody can continue accessing them while
5938 * they're encrypted. After that point, all accesses to these pages
5939 * will cause a page fault and block while the page is being encrypted
5940 * (busy). After the encryption completes, any access will cause a
5941 * page fault and the page gets decrypted at that time.
5942 */
5943 assert(crypt_offset + crypt_size <= upl_size);
5944 vm_object_pmap_protect(shadow_object,
5945 base_offset,
5946 (vm_object_size_t)crypt_size,
5947 PMAP_NULL,
5948 0,
5949 VM_PROT_NONE);
5950
5951 /* XXX FBDP could the object have changed significantly here ? */
5952 vm_object_lock(shadow_object);
5953
5954 for (upl_offset = 0;
5955 upl_offset < crypt_size;
5956 upl_offset += PAGE_SIZE) {
5957 page = vm_page_lookup(shadow_object,
5958 base_offset + upl_offset);
5959 if (page == VM_PAGE_NULL) {
5960 panic("upl_encrypt: "
5961 "no page for (obj=%p,off=%lld+%d)!\n",
5962 shadow_object,
5963 base_offset,
5964 upl_offset);
5965 }
5966 vm_page_encrypt(page, 0);
5967 }
5968
5969 vm_object_paging_end(shadow_object);
5970 vm_object_unlock(shadow_object);
5971 }
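/*
 * Illustrative sketch (not part of the build): how a pageout path
 * might use upl_encrypt() to encrypt an entire UPL before its pages
 * reach the backing store.  "example_encrypt_whole_upl" and
 * "should_encrypt_swap" are hypothetical names, not existing kernel
 * symbols.
 */
#if 0
static void
example_encrypt_whole_upl(
	upl_t		upl,
	boolean_t	should_encrypt_swap)
{
	if (should_encrypt_swap) {
		/* encrypt every page covered by this UPL */
		upl_encrypt(upl, 0, upl->size);
	}
}
#endif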
5972
5973 vm_size_t
5974 upl_get_internal_pagelist_offset(void)
5975 {
5976 return sizeof(struct upl);
5977 }
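/*
 * Illustrative sketch (not part of the build): locating the
 * upl_page_info_t array that immediately follows the upl header in an
 * internal UPL.  "example_internal_page_list" is a hypothetical name;
 * the UPL_GET_INTERNAL_PAGE_LIST() macro appears to rely on the same
 * computation.
 */
#if 0
static upl_page_info_t *
example_internal_page_list(upl_t some_upl)
{
	return (upl_page_info_t *)((vm_offset_t)some_upl +
	    upl_get_internal_pagelist_offset());
}
#endif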
5978
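/*
 * Note: both routines below manipulate the UPL_CLEAR_DIRTY flag,
 * which is presumably consulted when the UPL is committed to decide
 * whether the pages' dirty state should be cleared at that point.
 */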
5979 void
5980 upl_set_dirty(
5981 upl_t upl)
5982 {
5983 upl->flags |= UPL_CLEAR_DIRTY;
5984 }
5985
5986 void
5987 upl_clear_dirty(
5988 upl_t upl)
5989 {
5990 upl->flags &= ~UPL_CLEAR_DIRTY;
5991 }
5992
5993
5994 #ifdef MACH_BSD
5995
5996 boolean_t upl_page_present(upl_page_info_t *upl, int index)
5997 {
5998 return(UPL_PAGE_PRESENT(upl, index));
5999 }
6000 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
6001 {
6002 return(UPL_DIRTY_PAGE(upl, index));
6003 }
6004 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
6005 {
6006 return(UPL_VALID_PAGE(upl, index));
6007 }
6008 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
6009 {
6010 return(UPL_PHYS_PAGE(upl, index));
6011 }
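/*
 * Illustrative sketch (not part of the build): a BSD-side caller
 * walking a UPL's page list with the accessors above.
 * "example_report_dirty_pages", "pl" and "page_count" are
 * hypothetical; they would come from the caller's UPL setup.
 */
#if 0
static void
example_report_dirty_pages(upl_page_info_t *pl, int page_count)
{
	int i;

	for (i = 0; i < page_count; i++) {
		if (!upl_valid_page(pl, i))
			continue;
		if (upl_dirty_page(pl, i))
			printf("page %d (phys 0x%x) is dirty\n",
			    i, upl_phys_page(pl, i));
	}
}
#endif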
6012
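/*
 * Debug aid: walk the inactive and zero-fill queues (reported
 * together as "IN Q") and then the active queue ("AC Q"), counting
 * how many pages are dirty, being paged out, or precious.
 */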
6013 void
6014 vm_countdirtypages(void)
6015 {
6016 vm_page_t m;
6017 int dpages;
6018 int pgopages;
6019 int precpages;
6020
6021
6022 dpages = 0;
6023 pgopages = 0;
6024 precpages = 0;
6025 
6026 vm_page_lock_queues();
6027 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6028 do {
6029 if (m == (vm_page_t) 0) break;
6030 
6031 if (m->dirty) dpages++;
6032 if (m->pageout) pgopages++;
6033 if (m->precious) precpages++;
6034 
6035 assert(m->object != kernel_object);
6036 m = (vm_page_t) queue_next(&m->pageq);
6037 if (m == (vm_page_t) 0) break;
6038 
6039 } while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m));
6040 vm_page_unlock_queues();
6041
6042 vm_page_lock_queues();
6043 m = (vm_page_t) queue_first(&vm_page_queue_zf);
6044 do {
6045 if (m == (vm_page_t) 0) break;
6046 
6047 if (m->dirty) dpages++;
6048 if (m->pageout) pgopages++;
6049 if (m->precious) precpages++;
6050 
6051 assert(m->object != kernel_object);
6052 m = (vm_page_t) queue_next(&m->pageq);
6053 if (m == (vm_page_t) 0) break;
6054 
6055 } while (!queue_end(&vm_page_queue_zf, (queue_entry_t) m));
6056 vm_page_unlock_queues();
6057
6058 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6059
6060 dpages = 0;
6061 pgopages = 0;
6062 precpages = 0;
6063
6064 vm_page_lock_queues();
6065 m = (vm_page_t) queue_first(&vm_page_queue_active);
6066
6067 do {
6068 if (m == (vm_page_t) 0) break;
6069 if (m->dirty) dpages++;
6070 if (m->pageout) pgopages++;
6071 if (m->precious) precpages++;
6072 
6073 assert(m->object != kernel_object);
6074 m = (vm_page_t) queue_next(&m->pageq);
6075 if (m == (vm_page_t) 0) break;
6076 
6077 } while (!queue_end(&vm_page_queue_active, (queue_entry_t) m));
6078 vm_page_unlock_queues();
6079
6080 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6081
6082 }
6083 #endif /* MACH_BSD */
6084
6085 #ifdef UPL_DEBUG
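/*
 * Debug-only helpers: stash and retrieve two caller-supplied tags in
 * the UPL so that a UPL can be associated with the client that
 * created it when debugging.
 */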
6086 kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6087 {
6088 upl->ubc_alias1 = alias1;
6089 upl->ubc_alias2 = alias2;
6090 return KERN_SUCCESS;
6091 }
6092 int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6093 {
6094 if (al)
6095 *al = upl->ubc_alias1;
6096 if (al2)
6097 *al2 = upl->ubc_alias2;
6098 return KERN_SUCCESS;
6099 }
6100 #endif /* UPL_DEBUG */
6101
6102
6103
6104 #if MACH_KDB
6105 #include <ddb/db_output.h>
6106 #include <ddb/db_print.h>
6107 #include <vm/vm_print.h>
6108
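/* Route this section's printf() calls to the kernel debugger's output. */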
6109 #define printf kdbprintf
6110 void db_pageout(void);
6111
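/*
 * Kernel-debugger display routine: dump page counts, paging targets,
 * and (via db_pageout) pageout statistics, using db_indent for
 * nested formatting.
 */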
6112 void
6113 db_vm(void)
6114 {
6115
6116 iprintf("VM Statistics:\n");
6117 db_indent += 2;
6118 iprintf("pages:\n");
6119 db_indent += 2;
6120 iprintf("activ %5d inact %5d free %5d",
6121 vm_page_active_count, vm_page_inactive_count,
6122 vm_page_free_count);
6123 printf(" wire %5d gobbl %5d\n",
6124 vm_page_wire_count, vm_page_gobble_count);
6125 db_indent -= 2;
6126 iprintf("target:\n");
6127 db_indent += 2;
6128 iprintf("min %5d inact %5d free %5d",
6129 vm_page_free_min, vm_page_inactive_target,
6130 vm_page_free_target);
6131 printf(" resrv %5d\n", vm_page_free_reserved);
6132 db_indent -= 2;
6133 iprintf("pause:\n");
6134 db_pageout();
6135 db_indent -= 2;
6136 }
6137
6138 #if MACH_COUNTERS
6139 extern int c_laundry_pages_freed;
6140 #endif /* MACH_COUNTERS */
6141
6142 void
6143 db_pageout(void)
6144 {
6145 iprintf("Pageout Statistics:\n");
6146 db_indent += 2;
6147 iprintf("active %5d inactv %5d\n",
6148 vm_pageout_active, vm_pageout_inactive);
6149 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
6150 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6151 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6152 iprintf("used %5d clean %5d dirty %5d\n",
6153 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6154 vm_pageout_inactive_dirty);
6155 #if MACH_COUNTERS
6156 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6157 #endif /* MACH_COUNTERS */
6158 #if MACH_CLUSTER_STATS
6159 iprintf("Cluster Statistics:\n");
6160 db_indent += 2;
6161 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
6162 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6163 vm_pageout_cluster_collisions);
6164 iprintf("clusters %5d conversions %5d\n",
6165 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6166 db_indent -= 2;
6167 iprintf("Target Statistics:\n");
6168 db_indent += 2;
6169 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
6170 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6171 vm_pageout_target_page_freed);
6172 db_indent -= 2;
6173 #endif /* MACH_CLUSTER_STATS */
6174 db_indent -= 2;
6175 }
6176
6177 #endif /* MACH_KDB */