1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /*
23 * @OSF_COPYRIGHT@
24 */
25 /*
26 * Mach Operating System
27 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
28 * All Rights Reserved.
29 *
30 * Permission to use, copy, modify and distribute this software and its
31 * documentation is hereby granted, provided that both the copyright
32 * notice and this permission notice appear in all copies of the
33 * software, derivative works or modified versions, and any portions
34 * thereof, and that both notices appear in supporting documentation.
35 *
36 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
37 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
38 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
39 *
40 * Carnegie Mellon requests users of this software to return to
41 *
42 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
43 * School of Computer Science
44 * Carnegie Mellon University
45 * Pittsburgh PA 15213-3890
46 *
47 * any improvements or extensions that they make and grant Carnegie Mellon
48 * the rights to redistribute these changes.
49 */
50 /*
51 */
52 /*
53 * File: vm/vm_pageout.c
54 * Author: Avadis Tevanian, Jr., Michael Wayne Young
55 * Date: 1985
56 *
57 * The proverbial page-out daemon.
58 */
59
60 #include <stdint.h>
61
62 #include <debug.h>
63 #include <mach_pagemap.h>
64 #include <mach_cluster_stats.h>
65 #include <mach_kdb.h>
66 #include <advisory_pageout.h>
67
68 #include <mach/mach_types.h>
69 #include <mach/memory_object.h>
70 #include <mach/memory_object_default.h>
71 #include <mach/memory_object_control_server.h>
72 #include <mach/mach_host_server.h>
73 #include <mach/upl.h>
74 #include <mach/vm_map.h>
75 #include <mach/vm_param.h>
76 #include <mach/vm_statistics.h>
77
78 #include <kern/kern_types.h>
79 #include <kern/counters.h>
80 #include <kern/host_statistics.h>
81 #include <kern/machine.h>
82 #include <kern/misc_protos.h>
83 #include <kern/thread.h>
84 #include <kern/xpr.h>
85 #include <kern/kalloc.h>
86
87 #include <machine/vm_tuning.h>
88
89 #include <vm/pmap.h>
90 #include <vm/vm_fault.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_object.h>
93 #include <vm/vm_page.h>
94 #include <vm/vm_pageout.h>
95 #include <vm/vm_protos.h> /* must be last */
96
97 /*
98 * ENCRYPTED SWAP:
99 */
100 #ifdef __ppc__
101 #include <ppc/mappings.h>
102 #endif /* __ppc__ */
103 #include <../bsd/crypto/aes/aes.h>
104
105 extern ipc_port_t memory_manager_default;
106
107
108 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE
109 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 10000 /* maximum iterations of the active queue to move pages to inactive */
110 #endif
111
112 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
113 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
114 #endif
115
116 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
117 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
118 #endif
119
120 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
121 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
122 #endif
123
124 #ifndef VM_PAGE_LAUNDRY_MAX
125 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
126 #endif /* VM_PAGE_LAUNDRY_MAX */
127
128 #ifndef VM_PAGEOUT_BURST_WAIT
129 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
130 #endif /* VM_PAGEOUT_BURST_WAIT */
131
132 #ifndef VM_PAGEOUT_EMPTY_WAIT
133 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
134 #endif /* VM_PAGEOUT_EMPTY_WAIT */
135
136 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
137 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
138 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
139
140 #ifndef VM_PAGEOUT_IDLE_WAIT
141 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
142 #endif /* VM_PAGEOUT_IDLE_WAIT */
143
144
145 /*
146 * To obtain a reasonable LRU approximation, the inactive queue
147 * needs to be large enough to give pages on it a chance to be
148 * referenced a second time. This macro defines the fraction
149 * of active+inactive pages that should be inactive.
150 * The pageout daemon uses it to update vm_page_inactive_target.
151 *
152 * If vm_page_free_count falls below vm_page_free_target and
153 * vm_page_inactive_count is below vm_page_inactive_target,
154 * then the pageout daemon starts running.
155 */
156
157 #ifndef VM_PAGE_INACTIVE_TARGET
158 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
159 #endif /* VM_PAGE_INACTIVE_TARGET */
160
161 /*
162 * Once the pageout daemon starts running, it keeps going
163 * until vm_page_free_count meets or exceeds vm_page_free_target.
164 */
165
166 #ifndef VM_PAGE_FREE_TARGET
167 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
168 #endif /* VM_PAGE_FREE_TARGET */
169
170 /*
171 * The pageout daemon always starts running once vm_page_free_count
172 * falls below vm_page_free_min.
173 */
174
175 #ifndef VM_PAGE_FREE_MIN
176 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
177 #endif /* VM_PAGE_FREE_MIN */
178
179 /*
180 * When vm_page_free_count falls below vm_page_free_reserved,
181 * only vm-privileged threads can allocate pages. vm-privilege
182 * allows the pageout daemon and default pager (and any other
183 * associated threads needed for default pageout) to continue
184 * operation by dipping into the reserved pool of pages.
185 */
186
187 #ifndef VM_PAGE_FREE_RESERVED
188 #define VM_PAGE_FREE_RESERVED(n) \
189 ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
190 #endif /* VM_PAGE_FREE_RESERVED */
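
/*
 * Worked example (illustrative numbers only, not from any particular
 * configuration): with 90000 active+inactive pages,
 * VM_PAGE_INACTIVE_TARGET(90000) is 30000, i.e. roughly a third of the
 * pageable pages should sit on the inactive queue.  With 8000 pages free
 * after the reserve, VM_PAGE_FREE_TARGET(8000) is 15 + 8000/80 = 115 and
 * VM_PAGE_FREE_MIN(8000) is 10 + 8000/100 = 90.  With VM_PAGE_LAUNDRY_MAX
 * at 16, VM_PAGE_FREE_RESERVED(4) is 6*16 + 4 = 100.
 */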
191
192
193 /*
194 * must hold the page queues lock to
195 * manipulate this structure
196 */
197 struct vm_pageout_queue {
198 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
199 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
200 unsigned int pgo_maxlaundry;
201
202 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
203 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
204 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
205 :0;
206 };
207
208 #define VM_PAGE_Q_THROTTLED(q) \
209 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
210
211
212 /*
213 * Exported variable used to broadcast the activation of the pageout scan.
214 * The Working Set code uses this to throttle its use of pmap removes, so
215 * that code whose working set fits in memory when memory is not under
216 * contention does not keep encountering soft faults.
217 */
218
219 unsigned int vm_pageout_scan_event_counter = 0;
220
221 /*
222 * Forward declarations for internal routines.
223 */
224
225 static void vm_pageout_garbage_collect(int);
226 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
227 static void vm_pageout_iothread_external(void);
228 static void vm_pageout_iothread_internal(void);
229 static void vm_pageout_queue_steal(vm_page_t);
230
231 extern void vm_pageout_continue(void);
232 extern void vm_pageout_scan(void);
233
234 unsigned int vm_pageout_reserved_internal = 0;
235 unsigned int vm_pageout_reserved_really = 0;
236
237 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
238 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
239 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
240 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
241 unsigned int vm_pageout_deadlock_relief = 0;
242 unsigned int vm_pageout_inactive_relief = 0;
243 unsigned int vm_pageout_burst_active_throttle = 0;
244 unsigned int vm_pageout_burst_inactive_throttle = 0;
245
246 /*
247 * Protection against zero fill flushing live working sets derived
248 * from existing backing store and files
249 */
250 unsigned int vm_accellerate_zf_pageout_trigger = 400;
251 unsigned int vm_zf_iterator;
252 unsigned int vm_zf_iterator_count = 40;
253 unsigned int last_page_zf;
254 unsigned int vm_zf_count = 0;
255
256 /*
257 * These variables record the pageout daemon's actions:
258 * how many pages it looks at and what happens to those pages.
259 * No locking needed because only one thread modifies the variables.
260 */
261
262 unsigned int vm_pageout_active = 0; /* debugging */
263 unsigned int vm_pageout_inactive = 0; /* debugging */
264 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
265 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
266 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
267 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
268 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
269 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
270 unsigned int vm_pageout_inactive_used = 0; /* debugging */
271 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
272 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
273 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
274 unsigned int vm_pageout_purged_objects = 0; /* debugging */
275 unsigned int vm_stat_discard = 0; /* debugging */
276 unsigned int vm_stat_discard_sent = 0; /* debugging */
277 unsigned int vm_stat_discard_failure = 0; /* debugging */
278 unsigned int vm_stat_discard_throttle = 0; /* debugging */
279
280 unsigned int vm_pageout_scan_active_throttled = 0;
281 unsigned int vm_pageout_scan_inactive_throttled = 0;
282 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
283 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
284 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
285 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
286 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
287 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
288 /*
289 * Backing store throttle when BS is exhausted
290 */
291 unsigned int vm_backing_store_low = 0;
292
293 unsigned int vm_pageout_out_of_line = 0;
294 unsigned int vm_pageout_in_place = 0;
295
296 /*
297 * ENCRYPTED SWAP:
298 * counters and statistics...
299 */
300 unsigned long vm_page_decrypt_counter = 0;
301 unsigned long vm_page_decrypt_for_upl_counter = 0;
302 unsigned long vm_page_encrypt_counter = 0;
303 unsigned long vm_page_encrypt_abort_counter = 0;
304 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
305 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
306
307
308 struct vm_pageout_queue vm_pageout_queue_internal;
309 struct vm_pageout_queue vm_pageout_queue_external;
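
/*
 * A minimal sketch (not compiled) of how the two queues and the
 * VM_PAGE_Q_THROTTLED() macro above are typically consulted: pick the
 * internal or external queue based on the page's object, then test the
 * laundry count against the queue's limit.  The helper name is purely
 * illustrative; the page queues lock is assumed to be held by the caller.
 */
#if 0
static boolean_t
example_pageout_queue_is_throttled(vm_page_t m)
{
	struct vm_pageout_queue *q;

	if (m->object->internal == TRUE)
		q = &vm_pageout_queue_internal;
	else
		q = &vm_pageout_queue_external;

	return (VM_PAGE_Q_THROTTLED(q));
}
#endif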
310
311
312 /*
313 * Routine: vm_backing_store_disable
314 * Purpose:
315 * Suspend non-privileged threads wishing to extend
316 * backing store when we are low on backing store
317 * (Synchronized by caller)
318 */
319 void
320 vm_backing_store_disable(
321 boolean_t disable)
322 {
323 if(disable) {
324 vm_backing_store_low = 1;
325 } else {
326 if(vm_backing_store_low) {
327 vm_backing_store_low = 0;
328 thread_wakeup((event_t) &vm_backing_store_low);
329 }
330 }
331 }
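
/*
 * A hedged sketch (not compiled) of the waiting side that pairs with the
 * wakeup above: a non-privileged thread wanting to extend backing store
 * would sleep on &vm_backing_store_low until vm_backing_store_disable(FALSE)
 * wakes it.  The real waiters live elsewhere in the VM code and recheck the
 * flag under the appropriate lock; this only shows the event pairing.
 */
#if 0
static void
example_wait_for_backing_store(void)
{
	while (vm_backing_store_low) {
		assert_wait((event_t) &vm_backing_store_low, THREAD_UNINT);
		thread_block(THREAD_CONTINUE_NULL);
	}
}
#endif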
332
333
334 /*
335 * Routine: vm_pageout_object_allocate
336 * Purpose:
337 * Allocate an object for use as out-of-line memory in a
338 * data_return/data_initialize message.
339 * The page must be in an unlocked object.
340 *
341 * If the page belongs to a trusted pager, cleaning in place
342 * will be used, which utilizes a special "pageout object"
343 * containing private alias pages for the real page frames.
344 * Untrusted pagers use normal out-of-line memory.
345 */
346 vm_object_t
347 vm_pageout_object_allocate(
348 vm_page_t m,
349 vm_size_t size,
350 vm_object_offset_t offset)
351 {
352 vm_object_t object = m->object;
353 vm_object_t new_object;
354
355 assert(object->pager_ready);
356
357 new_object = vm_object_allocate(size);
358
359 if (object->pager_trusted) {
360 assert (offset < object->size);
361
362 vm_object_lock(new_object);
363 new_object->pageout = TRUE;
364 new_object->shadow = object;
365 new_object->can_persist = FALSE;
366 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
367 new_object->shadow_offset = offset;
368 vm_object_unlock(new_object);
369
370 /*
371 * Take a paging reference on the object. This will be dropped
372 * in vm_pageout_object_terminate()
373 */
374 vm_object_lock(object);
375 vm_object_paging_begin(object);
376 vm_page_lock_queues();
377 vm_page_unlock_queues();
378 vm_object_unlock(object);
379
380 vm_pageout_in_place++;
381 } else
382 vm_pageout_out_of_line++;
383 return(new_object);
384 }
385
386 #if MACH_CLUSTER_STATS
387 unsigned long vm_pageout_cluster_dirtied = 0;
388 unsigned long vm_pageout_cluster_cleaned = 0;
389 unsigned long vm_pageout_cluster_collisions = 0;
390 unsigned long vm_pageout_cluster_clusters = 0;
391 unsigned long vm_pageout_cluster_conversions = 0;
392 unsigned long vm_pageout_target_collisions = 0;
393 unsigned long vm_pageout_target_page_dirtied = 0;
394 unsigned long vm_pageout_target_page_freed = 0;
395 #define CLUSTER_STAT(clause) clause
396 #else /* MACH_CLUSTER_STATS */
397 #define CLUSTER_STAT(clause)
398 #endif /* MACH_CLUSTER_STATS */
399
400 /*
401 * Routine: vm_pageout_object_terminate
402 * Purpose:
403 * Destroy the pageout_object allocated by
404 * vm_pageout_object_allocate(), and perform all of the
405 * required cleanup actions.
406 *
407 * In/Out conditions:
408 * The object must be locked, and will be returned locked.
409 */
410 void
411 vm_pageout_object_terminate(
412 vm_object_t object)
413 {
414 vm_object_t shadow_object;
415 boolean_t shadow_internal;
416
417 /*
418 * Deal with the deallocation (last reference) of a pageout object
419 * (used for cleaning-in-place) by dropping the paging references/
420 * freeing pages in the original object.
421 */
422
423 assert(object->pageout);
424 shadow_object = object->shadow;
425 vm_object_lock(shadow_object);
426 shadow_internal = shadow_object->internal;
427
428 while (!queue_empty(&object->memq)) {
429 vm_page_t p, m;
430 vm_object_offset_t offset;
431
432 p = (vm_page_t) queue_first(&object->memq);
433
434 assert(p->private);
435 assert(p->pageout);
436 p->pageout = FALSE;
437 assert(!p->cleaning);
438
439 offset = p->offset;
440 VM_PAGE_FREE(p);
441 p = VM_PAGE_NULL;
442
443 m = vm_page_lookup(shadow_object,
444 offset + object->shadow_offset);
445
446 if(m == VM_PAGE_NULL)
447 continue;
448 assert(m->cleaning);
449 /* used as a trigger on upl_commit etc to recognize the */
450 /* pageout daemon's subsequent desire to pageout a cleaning */
451 /* page. When the bit is on the upl commit code will */
452 /* respect the pageout bit in the target page over the */
453 /* caller's page list indication */
454 m->dump_cleaning = FALSE;
455
456 /*
457 * Account for the paging reference taken when
458 * m->cleaning was set on this page.
459 */
460 vm_object_paging_end(shadow_object);
461 assert((m->dirty) || (m->precious) ||
462 (m->busy && m->cleaning));
463
464 /*
465 * Handle the trusted pager throttle.
466 * Also decrement the burst throttle (if external).
467 */
468 vm_page_lock_queues();
469 if (m->laundry) {
470 vm_pageout_throttle_up(m);
471 }
472
473 /*
474 * Handle the "target" page(s). These pages are to be freed if
475 * successfully cleaned. Target pages are always busy, and are
476 * wired exactly once. The initial target pages are not mapped,
477 * (so cannot be referenced or modified) but converted target
478 * pages may have been modified between the selection as an
479 * adjacent page and conversion to a target.
480 */
481 if (m->pageout) {
482 assert(m->busy);
483 assert(m->wire_count == 1);
484 m->cleaning = FALSE;
485 m->pageout = FALSE;
486 #if MACH_CLUSTER_STATS
487 if (m->wanted) vm_pageout_target_collisions++;
488 #endif
489 /*
490 * Revoke all access to the page. Since the object is
491 * locked, and the page is busy, this prevents the page
492 * from being dirtied after the pmap_disconnect() call
493 * returns.
494 *
495 * Since the page is left "dirty" but "not modified", we
496 * can detect whether the page was redirtied during
497 * pageout by checking the modify state.
498 */
499 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
500 m->dirty = TRUE;
501 else
502 m->dirty = FALSE;
503
504 if (m->dirty) {
505 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
506 vm_page_unwire(m);/* reactivates */
507 VM_STAT(reactivations++);
508 PAGE_WAKEUP_DONE(m);
509 } else {
510 CLUSTER_STAT(vm_pageout_target_page_freed++;)
511 vm_page_free(m);/* clears busy, etc. */
512 }
513 vm_page_unlock_queues();
514 continue;
515 }
516 /*
517 * Handle the "adjacent" pages. These pages were cleaned in
518 * place, and should be left alone.
519 * If the page is not already on a queue, activate it if it
520 * was referenced; otherwise deactivate it.
521 */
522 if (!m->active && !m->inactive && !m->private) {
523 if (m->reference)
524 vm_page_activate(m);
525 else
526 vm_page_deactivate(m);
527 }
528 if((m->busy) && (m->cleaning)) {
529
530 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
531 m->busy = FALSE;
532
533 /* We do not re-set m->dirty ! */
534 /* The page was busy so no extraneous activity */
535 /* could have occurred. COPY_INTO is a read into the */
536 /* new pages. CLEAN_IN_PLACE does actually write */
537 /* out the pages but handling outside of this code */
538 /* will take care of resetting dirty. We clear the */
539 /* modify however for the Programmed I/O case. */
540 pmap_clear_modify(m->phys_page);
541 if(m->absent) {
542 m->absent = FALSE;
543 if(shadow_object->absent_count == 1)
544 vm_object_absent_release(shadow_object);
545 else
546 shadow_object->absent_count--;
547 }
548 m->overwriting = FALSE;
549 } else if (m->overwriting) {
550 /* alternate request page list, write to page_list */
551 /* case. Occurs when the original page was wired */
552 /* at the time of the list request */
553 assert(m->wire_count != 0);
554 vm_page_unwire(m);/* reactivates */
555 m->overwriting = FALSE;
556 } else {
557 /*
558 * Set the dirty state according to whether or not the page was
559 * modified during the pageout. Note that we purposefully do
560 * NOT call pmap_clear_modify since the page is still mapped.
561 * If the page were to be dirtied between the 2 calls,
562 * this fact would be lost. This code is only necessary to
563 * maintain statistics, since the pmap module is always
564 * consulted if m->dirty is false.
565 */
566 #if MACH_CLUSTER_STATS
567 m->dirty = pmap_is_modified(m->phys_page);
568
569 if (m->dirty) vm_pageout_cluster_dirtied++;
570 else vm_pageout_cluster_cleaned++;
571 if (m->wanted) vm_pageout_cluster_collisions++;
572 #else
573 m->dirty = 0;
574 #endif
575 }
576 m->cleaning = FALSE;
577
578 /*
579 * Wakeup any thread waiting for the page to be un-cleaning.
580 */
581 PAGE_WAKEUP(m);
582 vm_page_unlock_queues();
583 }
584 /*
585 * Account for the paging reference taken in vm_paging_object_allocate.
586 */
587 vm_object_paging_end(shadow_object);
588 vm_object_unlock(shadow_object);
589
590 assert(object->ref_count == 0);
591 assert(object->paging_in_progress == 0);
592 assert(object->resident_page_count == 0);
593 return;
594 }
595
596 /*
597 * Routine: vm_pageout_setup
598 * Purpose:
599 * Set up a page for pageout (clean & flush).
600 *
601 * Move the page to a new object, as part of which it will be
602 * sent to its memory manager in a memory_object_data_write or
603 * memory_object_initialize message.
604 *
605 * The "new_object" and "new_offset" arguments
606 * indicate where the page should be moved.
607 *
608 * In/Out conditions:
609 * The page in question must not be on any pageout queues,
610 * and must be busy. The object to which it belongs
611 * must be unlocked, and the caller must hold a paging
612 * reference to it. The new_object must not be locked.
613 *
614 * This routine returns a pointer to a place-holder page,
615 * inserted at the same offset, to block out-of-order
616 * requests for the page. The place-holder page must
617 * be freed after the data_write or initialize message
618 * has been sent.
619 *
620 * The original page is put on a paging queue and marked
621 * not busy on exit.
622 */
623 vm_page_t
624 vm_pageout_setup(
625 register vm_page_t m,
626 register vm_object_t new_object,
627 vm_object_offset_t new_offset)
628 {
629 register vm_object_t old_object = m->object;
630 vm_object_offset_t paging_offset;
631 vm_object_offset_t offset;
632 register vm_page_t holding_page;
633 register vm_page_t new_m;
634 boolean_t need_to_wire = FALSE;
635
636
637 XPR(XPR_VM_PAGEOUT,
638 "vm_pageout_setup, obj 0x%X off 0x%X page 0x%X new obj 0x%X offset 0x%X\n",
639 (integer_t)m->object, (integer_t)m->offset,
640 (integer_t)m, (integer_t)new_object,
641 (integer_t)new_offset);
642 assert(m && m->busy && !m->absent && !m->fictitious && !m->error &&
643 !m->restart);
644
645 assert(m->dirty || m->precious);
646
647 /*
648 * Create a place-holder page where the old one was, to prevent
649 * attempted pageins of this page while we're unlocked.
650 */
651 VM_PAGE_GRAB_FICTITIOUS(holding_page);
652
653 vm_object_lock(old_object);
654
655 offset = m->offset;
656 paging_offset = offset + old_object->paging_offset;
657
658 if (old_object->pager_trusted) {
659 /*
660 * This pager is trusted, so we can clean this page
661 * in place. Leave it in the old object, and mark it
662 * cleaning & pageout.
663 */
664 new_m = holding_page;
665 holding_page = VM_PAGE_NULL;
666
667 /*
668 * Set up new page to be private shadow of real page.
669 */
670 new_m->phys_page = m->phys_page;
671 new_m->fictitious = FALSE;
672 new_m->pageout = TRUE;
673
674 /*
675 * Mark real page as cleaning (indicating that we hold a
676 * paging reference to be released via m_o_d_r_c) and
677 * pageout (indicating that the page should be freed
678 * when the pageout completes).
679 */
680 pmap_clear_modify(m->phys_page);
681 vm_page_lock_queues();
682 new_m->private = TRUE;
683 vm_page_wire(new_m);
684 m->cleaning = TRUE;
685 m->pageout = TRUE;
686
687 vm_page_wire(m);
688 assert(m->wire_count == 1);
689 vm_page_unlock_queues();
690
691 m->dirty = TRUE;
692 m->precious = FALSE;
693 m->page_lock = VM_PROT_NONE;
694 m->unusual = FALSE;
695 m->unlock_request = VM_PROT_NONE;
696 } else {
697 /*
698 * Cannot clean in place, so rip the old page out of the
699 * object, and stick the holding page in. Set new_m to the
700 * page in the new object.
701 */
702 vm_page_lock_queues();
703 VM_PAGE_QUEUES_REMOVE(m);
704 vm_page_remove(m);
705
706 vm_page_insert(holding_page, old_object, offset);
707 vm_page_unlock_queues();
708
709 m->dirty = TRUE;
710 m->precious = FALSE;
711 new_m = m;
712 new_m->page_lock = VM_PROT_NONE;
713 new_m->unlock_request = VM_PROT_NONE;
714
715 if (old_object->internal)
716 need_to_wire = TRUE;
717 }
718 /*
719 * Record that this page has been written out
720 */
721 #if MACH_PAGEMAP
722 vm_external_state_set(old_object->existence_map, offset);
723 #endif /* MACH_PAGEMAP */
724
725 vm_object_unlock(old_object);
726
727 vm_object_lock(new_object);
728
729 /*
730 * Put the page into the new object. If it is not wired
731 * (i.e., if it's the real page), it will be activated.
732 */
733
734 vm_page_lock_queues();
735 vm_page_insert(new_m, new_object, new_offset);
736 if (need_to_wire)
737 vm_page_wire(new_m);
738 else
739 vm_page_activate(new_m);
740 PAGE_WAKEUP_DONE(new_m);
741 vm_page_unlock_queues();
742
743 vm_object_unlock(new_object);
744
745 /*
746 * Return the placeholder page to simplify cleanup.
747 */
748 return (holding_page);
749 }
750
751 /*
752 * Routine: vm_pageclean_setup
753 *
754 * Purpose: setup a page to be cleaned (made non-dirty), but not
755 * necessarily flushed from the VM page cache.
756 * This is accomplished by cleaning in place.
757 *
758 * The page must not be busy, and the object and page
759 * queues must be locked.
760 *
761 */
762 void
763 vm_pageclean_setup(
764 vm_page_t m,
765 vm_page_t new_m,
766 vm_object_t new_object,
767 vm_object_offset_t new_offset)
768 {
769 vm_object_t old_object = m->object;
770 assert(!m->busy);
771 assert(!m->cleaning);
772
773 XPR(XPR_VM_PAGEOUT,
774 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
775 (integer_t)old_object, m->offset, (integer_t)m,
776 (integer_t)new_m, new_offset);
777
778 pmap_clear_modify(m->phys_page);
779 vm_object_paging_begin(old_object);
780
781 /*
782 * Record that this page has been written out
783 */
784 #if MACH_PAGEMAP
785 vm_external_state_set(old_object->existence_map, m->offset);
786 #endif /*MACH_PAGEMAP*/
787
788 /*
789 * Mark original page as cleaning in place.
790 */
791 m->cleaning = TRUE;
792 m->dirty = TRUE;
793 m->precious = FALSE;
794
795 /*
796 * Convert the fictitious page to a private shadow of
797 * the real page.
798 */
799 assert(new_m->fictitious);
800 new_m->fictitious = FALSE;
801 new_m->private = TRUE;
802 new_m->pageout = TRUE;
803 new_m->phys_page = m->phys_page;
804 vm_page_wire(new_m);
805
806 vm_page_insert(new_m, new_object, new_offset);
807 assert(!new_m->wanted);
808 new_m->busy = FALSE;
809 }
810
811 void
812 vm_pageclean_copy(
813 vm_page_t m,
814 vm_page_t new_m,
815 vm_object_t new_object,
816 vm_object_offset_t new_offset)
817 {
818 XPR(XPR_VM_PAGEOUT,
819 "vm_pageclean_copy, page 0x%X new_m 0x%X new_obj 0x%X offset 0x%X\n",
820 m, new_m, new_object, new_offset, 0);
821
822 assert((!m->busy) && (!m->cleaning));
823
824 assert(!new_m->private && !new_m->fictitious);
825
826 pmap_clear_modify(m->phys_page);
827
828 m->busy = TRUE;
829 vm_object_paging_begin(m->object);
830 vm_page_unlock_queues();
831 vm_object_unlock(m->object);
832
833 /*
834 * Copy the original page to the new page.
835 */
836 vm_page_copy(m, new_m);
837
838 /*
839 * Mark the old page as clean. A request to pmap_is_modified
840 * will get the right answer.
841 */
842 vm_object_lock(m->object);
843 m->dirty = FALSE;
844
845 vm_object_paging_end(m->object);
846
847 vm_page_lock_queues();
848 if (!m->active && !m->inactive)
849 vm_page_activate(m);
850 PAGE_WAKEUP_DONE(m);
851
852 vm_page_insert(new_m, new_object, new_offset);
853 vm_page_activate(new_m);
854 new_m->busy = FALSE; /* No other thread can be waiting */
855 }
856
857
858 /*
859 * Routine: vm_pageout_initialize_page
860 * Purpose:
861 * Causes the specified page to be initialized in
862 * the appropriate memory object. This routine is used to push
863 * pages into a copy-object when they are modified in the
864 * permanent object.
865 *
866 * The page is moved to a temporary object and paged out.
867 *
868 * In/out conditions:
869 * The page in question must not be on any pageout queues.
870 * The object to which it belongs must be locked.
871 * The page must be busy, but not hold a paging reference.
872 *
873 * Implementation:
874 * Move this page to a completely new object.
875 */
876 void
877 vm_pageout_initialize_page(
878 vm_page_t m)
879 {
880 vm_object_t object;
881 vm_object_offset_t paging_offset;
882 vm_page_t holding_page;
883
884
885 XPR(XPR_VM_PAGEOUT,
886 "vm_pageout_initialize_page, page 0x%X\n",
887 (integer_t)m, 0, 0, 0, 0);
888 assert(m->busy);
889
890 /*
891 * Verify that we really want to clean this page
892 */
893 assert(!m->absent);
894 assert(!m->error);
895 assert(m->dirty);
896
897 /*
898 * Create a paging reference to let us play with the object.
899 */
900 object = m->object;
901 paging_offset = m->offset + object->paging_offset;
902 vm_object_paging_begin(object);
903 if (m->absent || m->error || m->restart ||
904 (!m->dirty && !m->precious)) {
905 VM_PAGE_FREE(m);
906 panic("reservation without pageout?"); /* alan */
907 vm_object_unlock(object);
908 return;
909 }
910
911 /* set the page for future call to vm_fault_list_request */
912 holding_page = NULL;
913 vm_page_lock_queues();
914 pmap_clear_modify(m->phys_page);
915 m->dirty = TRUE;
916 m->busy = TRUE;
917 m->list_req_pending = TRUE;
918 m->cleaning = TRUE;
919 m->pageout = TRUE;
920 vm_page_wire(m);
921 vm_page_unlock_queues();
922 vm_object_unlock(object);
923
924 /*
925 * Write the data to its pager.
926 * Note that the data is passed by naming the new object,
927 * not a virtual address; the pager interface has been
928 * manipulated to use the "internal memory" data type.
929 * [The object reference from its allocation is donated
930 * to the eventual recipient.]
931 */
932 memory_object_data_initialize(object->pager,
933 paging_offset,
934 PAGE_SIZE);
935
936 vm_object_lock(object);
937 }
938
939 #if MACH_CLUSTER_STATS
940 #define MAXCLUSTERPAGES 16
941 struct {
942 unsigned long pages_in_cluster;
943 unsigned long pages_at_higher_offsets;
944 unsigned long pages_at_lower_offsets;
945 } cluster_stats[MAXCLUSTERPAGES];
946 #endif /* MACH_CLUSTER_STATS */
947
948 boolean_t allow_clustered_pageouts = FALSE;
949
950 /*
951 * vm_pageout_cluster:
952 *
953 * Given a page, queue it to the appropriate I/O thread,
954 * which will page it out and attempt to clean adjacent pages
955 * in the same operation.
956 *
957 * The page must be busy, and the object and queues locked. We will take a
958 * paging reference to prevent deallocation or collapse when we
959 * release the object lock back at the call site. The I/O thread
960 * is responsible for consuming this reference
961 *
962 * The page must not be on any pageout queue.
963 */
964
965 void
966 vm_pageout_cluster(vm_page_t m)
967 {
968 vm_object_t object = m->object;
969 struct vm_pageout_queue *q;
970
971
972 XPR(XPR_VM_PAGEOUT,
973 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
974 (integer_t)object, m->offset, (integer_t)m, 0, 0);
975
976 /*
977 * Only a certain kind of page is appreciated here.
978 */
979 assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
980 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
981
982 /*
983 * protect the object from collapse -
984 * locking in the object's paging_offset.
985 */
986 vm_object_paging_begin(object);
987
988 /*
989 * set the page for future call to vm_fault_list_request
990 * page should already be marked busy
991 */
992 vm_page_wire(m);
993 m->list_req_pending = TRUE;
994 m->cleaning = TRUE;
995 m->pageout = TRUE;
996 m->laundry = TRUE;
997
998 if (object->internal == TRUE)
999 q = &vm_pageout_queue_internal;
1000 else
1001 q = &vm_pageout_queue_external;
1002 q->pgo_laundry++;
1003
1004 m->pageout_queue = TRUE;
1005 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
1006
1007 if (q->pgo_idle == TRUE) {
1008 q->pgo_idle = FALSE;
1009 thread_wakeup((event_t) &q->pgo_pending);
1010 }
1011 }
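
/*
 * A minimal caller-side sketch (not compiled) of the contract described
 * above: the object and the page queues are locked, the page is busy and
 * dirty (or precious) and off the pageout queues, and vm_pageout_cluster()
 * takes its own paging reference before the caller drops the locks.  The
 * function name is illustrative only.
 */
#if 0
static void
example_hand_page_to_iothread(vm_page_t m)
{
	vm_object_t object = m->object;

	vm_object_lock(object);
	vm_page_lock_queues();

	assert(m->busy && (m->dirty || m->precious));

	vm_pageout_cluster(m);		/* queues m and wakes the iothread */

	vm_page_unlock_queues();
	vm_object_unlock(object);
}
#endif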
1012
1013
1014 unsigned long vm_pageout_throttle_up_count = 0;
1015
1016 /*
1017 * A page is back from laundry. See if there are some pages waiting to
1018 * go to laundry and if we can let some of them go now.
1019 *
1020 * Object and page queues must be locked.
1021 */
1022 void
1023 vm_pageout_throttle_up(
1024 vm_page_t m)
1025 {
1026 struct vm_pageout_queue *q;
1027
1028 vm_pageout_throttle_up_count++;
1029
1030 assert(m->laundry);
1031 assert(m->object != VM_OBJECT_NULL);
1032 assert(m->object != kernel_object);
1033
1034 if (m->object->internal == TRUE)
1035 q = &vm_pageout_queue_internal;
1036 else
1037 q = &vm_pageout_queue_external;
1038
1039 m->laundry = FALSE;
1040 q->pgo_laundry--;
1041
1042 if (q->pgo_throttled == TRUE) {
1043 q->pgo_throttled = FALSE;
1044 thread_wakeup((event_t) &q->pgo_laundry);
1045 }
1046 }
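
/*
 * For reference, a sketch of the waiting side that this wakeup pairs with
 * (taken from vm_pageout_scan() below; not compiled here): when the internal
 * queue is full, the scan flags it as throttled and sleeps on pgo_laundry
 * until vm_pageout_throttle_up() brings the count back down.
 */
#if 0
	iq->pgo_throttled = TRUE;
	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE,
	                    msecs, 1000*NSEC_PER_USEC);
	vm_page_unlock_queues();
	thread_block(THREAD_CONTINUE_NULL);
	vm_page_lock_queues();
	iq->pgo_throttled = FALSE;
#endif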
1047
1048
1049 /*
1050 * vm_pageout_scan does the dirty work for the pageout daemon.
1051 * It returns with vm_page_queue_free_lock held and
1052 * vm_page_free_wanted == 0.
1053 */
1054
1055 #define DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
1056
1057 #define FCS_IDLE 0
1058 #define FCS_DELAYED 1
1059 #define FCS_DEADLOCK_DETECTED 2
1060
1061 struct flow_control {
1062 int state;
1063 mach_timespec_t ts;
1064 };
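
/*
 * A sketch of how vm_pageout_scan() below drives flow_control.state while
 * the default pager's (internal) queue is throttled:
 *
 *	FCS_IDLE		arm a timer vm_pageout_deadlock_wait msecs out
 *				and move to FCS_DELAYED
 *	FCS_DELAYED		if the timer expires with the queue still
 *				throttled, assume a potential deadlock, set
 *				vm_pageout_deadlock_target and move to
 *				FCS_DEADLOCK_DETECTED
 *	FCS_DEADLOCK_DETECTED	move clean pages (or pages bound for the
 *				external pagers) until the target drains,
 *				then re-arm the timer (back to FCS_DELAYED)
 *
 * Once the internal queue is no longer throttled, the state returns to
 * FCS_IDLE.
 */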
1065
1066 void
1067 vm_pageout_scan(void)
1068 {
1069 unsigned int loop_count = 0;
1070 unsigned int inactive_burst_count = 0;
1071 unsigned int active_burst_count = 0;
1072 vm_page_t local_freeq = 0;
1073 int local_freed = 0;
1074 int delayed_unlock = 0;
1075 int need_internal_inactive = 0;
1076 int refmod_state = 0;
1077 int vm_pageout_deadlock_target = 0;
1078 struct vm_pageout_queue *iq;
1079 struct vm_pageout_queue *eq;
1080 struct flow_control flow_control;
1081 boolean_t active_throttled = FALSE;
1082 boolean_t inactive_throttled = FALSE;
1083 mach_timespec_t ts;
1084 unsigned int msecs = 0;
1085 vm_object_t object;
1086
1087
1088 flow_control.state = FCS_IDLE;
1089 iq = &vm_pageout_queue_internal;
1090 eq = &vm_pageout_queue_external;
1091
1092 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1093
1094 /*???*/ /*
1095 * We want to gradually dribble pages from the active queue
1096 * to the inactive queue. If we let the inactive queue get
1097 * very small, and then suddenly dump many pages into it,
1098 * those pages won't get a sufficient chance to be referenced
1099 * before we start taking them from the inactive queue.
1100 *
1101 * We must limit the rate at which we send pages to the pagers.
1102 * data_write messages consume memory, for message buffers and
1103 * for map-copy objects. If we get too far ahead of the pagers,
1104 * we can potentially run out of memory.
1105 *
1106 * We can use the laundry count to limit directly the number
1107 * of pages outstanding to the default pager. A similar
1108 * strategy for external pagers doesn't work, because
1109 * external pagers don't have to deallocate the pages sent them,
1110 * and because we might have to send pages to external pagers
1111 * even if they aren't processing writes. So we also
1112 * use a burst count to limit writes to external pagers.
1113 *
1114 * When memory is very tight, we can't rely on external pagers to
1115 * clean pages. They probably aren't running, because they
1116 * aren't vm-privileged. If we kept sending dirty pages to them,
1117 * we could exhaust the free list.
1118 */
1119 vm_page_lock_queues();
1120 delayed_unlock = 1;
1121
1122
1123 Restart:
1124 /*
1125 * Recalculate vm_page_inactive_target.
1126 */
1127 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1128 vm_page_inactive_count);
1129 object = NULL;
1130
1131 for (;;) {
1132 vm_page_t m;
1133
1134 if (delayed_unlock == 0)
1135 vm_page_lock_queues();
1136
1137 active_burst_count = vm_page_active_count;
1138
1139 if (active_burst_count > vm_pageout_burst_active_throttle)
1140 active_burst_count = vm_pageout_burst_active_throttle;
1141
1142 /*
1143 * Move pages from active to inactive.
1144 */
1145 while ((need_internal_inactive ||
1146 vm_page_inactive_count < vm_page_inactive_target) &&
1147 !queue_empty(&vm_page_queue_active) &&
1148 ((active_burst_count--) > 0)) {
1149
1150 vm_pageout_active++;
1151
1152 m = (vm_page_t) queue_first(&vm_page_queue_active);
1153
1154 assert(m->active && !m->inactive);
1155 assert(!m->laundry);
1156 assert(m->object != kernel_object);
1157
1158 /*
1159 * Try to lock object; since we've already got the
1160 * page queues lock, we can only 'try' for this one.
1161 * if the 'try' fails, we need to do a mutex_pause
1162 * to allow the owner of the object lock a chance to
1163 * run... otherwise, we're likely to trip over this
1164 * object in the same state as we work our way through
1165 * the queue... clumps of pages associated with the same
1166 * object are fairly typical on the inactive and active queues
1167 */
1168 if (m->object != object) {
1169 if (object != NULL) {
1170 vm_object_unlock(object);
1171 object = NULL;
1172 }
1173 if (!vm_object_lock_try(m->object)) {
1174 /*
1175 * move page to end of active queue and continue
1176 */
1177 queue_remove(&vm_page_queue_active, m,
1178 vm_page_t, pageq);
1179 queue_enter(&vm_page_queue_active, m,
1180 vm_page_t, pageq);
1181
1182 goto done_with_activepage;
1183 }
1184 object = m->object;
1185 }
1186 /*
1187 * if the page is BUSY, then we pull it
1188 * off the active queue and leave it alone.
1189 * when BUSY is cleared, it will get stuck
1190 * back on the appropriate queue
1191 */
1192 if (m->busy) {
1193 queue_remove(&vm_page_queue_active, m,
1194 vm_page_t, pageq);
1195 m->pageq.next = NULL;
1196 m->pageq.prev = NULL;
1197
1198 if (!m->fictitious)
1199 vm_page_active_count--;
1200 m->active = FALSE;
1201
1202 goto done_with_activepage;
1203 }
1204 if (need_internal_inactive) {
1205 /*
1206 * If we're unable to make forward progress
1207 * with the current set of pages on the
1208 * inactive queue due to busy objects or
1209 * throttled pageout queues, then
1210 * move a page that is already clean
1211 * or belongs to a pageout queue that
1212 * isn't currently throttled
1213 */
1214 active_throttled = FALSE;
1215
1216 if (object->internal) {
1217 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1218 active_throttled = TRUE;
1219 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1220 active_throttled = TRUE;
1221 }
1222 if (active_throttled == TRUE) {
1223 if (!m->dirty) {
1224 refmod_state = pmap_get_refmod(m->phys_page);
1225
1226 if (refmod_state & VM_MEM_REFERENCED)
1227 m->reference = TRUE;
1228 if (refmod_state & VM_MEM_MODIFIED)
1229 m->dirty = TRUE;
1230 }
1231 if (m->dirty || m->precious) {
1232 /*
1233 * page is dirty and targets a THROTTLED queue
1234 * so all we can do is move it back to the
1235 * end of the active queue to get it out
1236 * of the way
1237 */
1238 queue_remove(&vm_page_queue_active, m,
1239 vm_page_t, pageq);
1240 queue_enter(&vm_page_queue_active, m,
1241 vm_page_t, pageq);
1242
1243 vm_pageout_scan_active_throttled++;
1244
1245 goto done_with_activepage;
1246 }
1247 }
1248 vm_pageout_scan_active_throttle_success++;
1249 need_internal_inactive--;
1250 }
1251 /*
1252 * Deactivate the page while holding the object
1253 * locked, so we know the page is still not busy.
1254 * This should prevent races between pmap_enter
1255 * and pmap_clear_reference. The page might be
1256 * absent or fictitious, but vm_page_deactivate
1257 * can handle that.
1258 */
1259 vm_page_deactivate(m);
1260 done_with_activepage:
1261 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1262
1263 if (object != NULL) {
1264 vm_object_unlock(object);
1265 object = NULL;
1266 }
1267 if (local_freeq) {
1268 vm_page_free_list(local_freeq);
1269
1270 local_freeq = 0;
1271 local_freed = 0;
1272 }
1273 delayed_unlock = 0;
1274 vm_page_unlock_queues();
1275
1276 mutex_pause();
1277 vm_page_lock_queues();
1278 /*
1279 * continue the while loop processing
1280 * the active queue... need to hold
1281 * the page queues lock
1282 */
1283 continue;
1284 }
1285 }
1286
1287
1288
1289 /**********************************************************************
1290 * above this point we're playing with the active queue
1291 * below this point we're playing with the throttling mechanisms
1292 * and the inactive queue
1293 **********************************************************************/
1294
1295
1296
1297 /*
1298 * We are done if we have met our target *and*
1299 * nobody is still waiting for a page.
1300 */
1301 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1302 if (object != NULL) {
1303 vm_object_unlock(object);
1304 object = NULL;
1305 }
1306 if (local_freeq) {
1307 vm_page_free_list(local_freeq);
1308
1309 local_freeq = 0;
1310 local_freed = 0;
1311 }
1312 mutex_lock(&vm_page_queue_free_lock);
1313
1314 if ((vm_page_free_count >= vm_page_free_target) &&
1315 (vm_page_free_wanted == 0)) {
1316
1317 vm_page_unlock_queues();
1318
1319 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1320 return;
1321 }
1322 mutex_unlock(&vm_page_queue_free_lock);
1323 }
1324
1325
1326 /*
1327 * Sometimes we have to pause:
1328 * 1) No inactive pages - nothing to do.
1329 * 2) Flow control - default pageout queue is full
1330 * 3) Loop control - no acceptable pages found on the inactive queue
1331 * within the last vm_pageout_burst_inactive_throttle iterations
1332 */
1333 if ((queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf))) {
1334 vm_pageout_scan_empty_throttle++;
1335 msecs = vm_pageout_empty_wait;
1336 goto vm_pageout_scan_delay;
1337
1338 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1339 vm_pageout_scan_burst_throttle++;
1340 msecs = vm_pageout_burst_wait;
1341 goto vm_pageout_scan_delay;
1342
1343 } else if (VM_PAGE_Q_THROTTLED(iq)) {
1344
1345 switch (flow_control.state) {
1346
1347 case FCS_IDLE:
1348 reset_deadlock_timer:
1349 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1350 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1351 clock_get_system_nanotime(
1352 &flow_control.ts.tv_sec,
1353 (uint32_t *) &flow_control.ts.tv_nsec);
1354 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1355
1356 flow_control.state = FCS_DELAYED;
1357 msecs = vm_pageout_deadlock_wait;
1358
1359 break;
1360
1361 case FCS_DELAYED:
1362 clock_get_system_nanotime(
1363 &ts.tv_sec,
1364 (uint32_t *) &ts.tv_nsec);
1365
1366 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1367 /*
1368 * the pageout thread for the default pager is potentially
1369 * deadlocked since the
1370 * default pager queue has been throttled for more than the
1371 * allowable time... we need to move some clean pages or dirty
1372 * pages belonging to the external pagers if they aren't throttled
1373 * vm_page_free_wanted represents the number of threads currently
1374 * blocked waiting for pages... we'll move one page for each of
1375 * these plus a fixed amount to break the logjam... once we're done
1376 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1377 * with a new timeout target since we have no way of knowing
1378 * whether we've broken the deadlock except through observation
1379 * of the queue associated with the default pager... we need to
1380 * stop moving pages and allow the system to run to see what
1381 * state it settles into.
1382 */
1383 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted;
1384 vm_pageout_scan_deadlock_detected++;
1385 flow_control.state = FCS_DEADLOCK_DETECTED;
1386
1387 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1388 goto consider_inactive;
1389 }
1390 /*
1391 * just resniff instead of trying
1392 * to compute a new delay time... we're going to be
1393 * awakened immediately upon a laundry completion,
1394 * so we won't wait any longer than necessary
1395 */
1396 msecs = vm_pageout_idle_wait;
1397 break;
1398
1399 case FCS_DEADLOCK_DETECTED:
1400 if (vm_pageout_deadlock_target)
1401 goto consider_inactive;
1402 goto reset_deadlock_timer;
1403
1404 }
1405 vm_pageout_scan_throttle++;
1406 iq->pgo_throttled = TRUE;
1407 vm_pageout_scan_delay:
1408 if (object != NULL) {
1409 vm_object_unlock(object);
1410 object = NULL;
1411 }
1412 if (local_freeq) {
1413 vm_page_free_list(local_freeq);
1414
1415 local_freeq = 0;
1416 local_freed = 0;
1417 }
1418 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1419
1420 counter(c_vm_pageout_scan_block++);
1421
1422 vm_page_unlock_queues();
1423
1424 thread_block(THREAD_CONTINUE_NULL);
1425
1426 vm_page_lock_queues();
1427 delayed_unlock = 1;
1428
1429 iq->pgo_throttled = FALSE;
1430
1431 if (loop_count >= vm_page_inactive_count) {
1432 if (VM_PAGE_Q_THROTTLED(eq) || VM_PAGE_Q_THROTTLED(iq)) {
1433 /*
1434 * Make sure we move enough "appropriate"
1435 * pages to the inactive queue before trying
1436 * again.
1437 */
1438 need_internal_inactive = vm_pageout_inactive_relief;
1439 }
1440 loop_count = 0;
1441 }
1442 inactive_burst_count = 0;
1443
1444 goto Restart;
1445 /*NOTREACHED*/
1446 }
1447
1448
1449 flow_control.state = FCS_IDLE;
1450 consider_inactive:
1451 loop_count++;
1452 inactive_burst_count++;
1453 vm_pageout_inactive++;
1454
1455 if (!queue_empty(&vm_page_queue_inactive)) {
1456 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1457
1458 if (m->clustered && (m->no_isync == TRUE)) {
1459 goto use_this_page;
1460 }
1461 }
1462 if (vm_zf_count < vm_accellerate_zf_pageout_trigger) {
1463 vm_zf_iterator = 0;
1464 } else {
1465 last_page_zf = 0;
1466 if((vm_zf_iterator+=1) >= vm_zf_iterator_count) {
1467 vm_zf_iterator = 0;
1468 }
1469 }
1470 if (queue_empty(&vm_page_queue_zf) ||
1471 (((last_page_zf) || (vm_zf_iterator == 0)) &&
1472 !queue_empty(&vm_page_queue_inactive))) {
1473 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1474 last_page_zf = 0;
1475 } else {
1476 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1477 last_page_zf = 1;
1478 }
1479 use_this_page:
1480 assert(!m->active && m->inactive);
1481 assert(!m->laundry);
1482 assert(m->object != kernel_object);
1483
1484 /*
1485 * Try to lock object; since we've already got the
1486 * page queues lock, we can only 'try' for this one.
1487 * if the 'try' fails, we need to do a mutex_pause
1488 * to allow the owner of the object lock a chance to
1489 * run... otherwise, we're likely to trip over this
1490 * object in the same state as we work our way through
1491 * the queue... clumps of pages associated with the same
1492 * object are fairly typical on the inactive and active queues
1493 */
1494 if (m->object != object) {
1495 if (object != NULL) {
1496 vm_object_unlock(object);
1497 object = NULL;
1498 }
1499 if (!vm_object_lock_try(m->object)) {
1500 /*
1501 * Move page to end and continue.
1502 * Don't re-issue ticket
1503 */
1504 if (m->zero_fill) {
1505 queue_remove(&vm_page_queue_zf, m,
1506 vm_page_t, pageq);
1507 queue_enter(&vm_page_queue_zf, m,
1508 vm_page_t, pageq);
1509 } else {
1510 queue_remove(&vm_page_queue_inactive, m,
1511 vm_page_t, pageq);
1512 queue_enter(&vm_page_queue_inactive, m,
1513 vm_page_t, pageq);
1514 }
1515 vm_pageout_inactive_nolock++;
1516
1517 /*
1518 * force us to dump any collected free pages
1519 * and to pause before moving on
1520 */
1521 delayed_unlock = DELAYED_UNLOCK_LIMIT + 1;
1522
1523 goto done_with_inactivepage;
1524 }
1525 object = m->object;
1526 }
1527 /*
1528 * If the page belongs to a purgable object with no pending copies
1529 * against it, then we reap all of the pages in the object
1530 * and note that the object has been "emptied". It'll be up to the
1531 * application to discover this and recreate its contents if desired.
1532 */
1533 if ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE ||
1534 object->purgable == VM_OBJECT_PURGABLE_EMPTY) &&
1535 object->copy == VM_OBJECT_NULL) {
1536
1537 (void) vm_object_purge(object);
1538 vm_pageout_purged_objects++;
1539 /*
1540 * we've just taken all of the pages from this object,
1541 * so drop the lock now since we're not going to find
1542 * any more pages belonging to it anytime soon
1543 */
1544 vm_object_unlock(object);
1545 object = NULL;
1546
1547 inactive_burst_count = 0;
1548
1549 goto done_with_inactivepage;
1550 }
1551
1552 /*
1553 * Paging out pages of external objects which
1554 * are currently being created must be avoided.
1555 * The pager may need to allocate memory, thus leading to a
1556 * possible deadlock between it and the pageout thread,
1557 * if such pages are finally chosen. The remaining assumption
1558 * is that there will finally be enough available pages in the
1559 * inactive pool to page out in order to satisfy all memory
1560 * claimed by the thread which concurrently creates the pager.
1561 */
1562 if (!object->pager_initialized && object->pager_created) {
1563 /*
1564 * Move page to end and continue, hoping that
1565 * there will be enough other inactive pages to
1566 * page out so that the thread which currently
1567 * initializes the pager will succeed.
1568 * Don't re-grant the ticket; the page should be
1569 * pulled from the queue and paged out whenever
1570 * one of its logically adjacent fellows is
1571 * targeted.
1572 */
1573 if (m->zero_fill) {
1574 queue_remove(&vm_page_queue_zf, m,
1575 vm_page_t, pageq);
1576 queue_enter(&vm_page_queue_zf, m,
1577 vm_page_t, pageq);
1578 last_page_zf = 1;
1579 vm_zf_iterator = vm_zf_iterator_count - 1;
1580 } else {
1581 queue_remove(&vm_page_queue_inactive, m,
1582 vm_page_t, pageq);
1583 queue_enter(&vm_page_queue_inactive, m,
1584 vm_page_t, pageq);
1585 last_page_zf = 0;
1586 vm_zf_iterator = 1;
1587 }
1588 vm_pageout_inactive_avoid++;
1589
1590 goto done_with_inactivepage;
1591 }
1592 /*
1593 * Remove the page from the inactive list.
1594 */
1595 if (m->zero_fill) {
1596 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1597 } else {
1598 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1599 }
1600 m->pageq.next = NULL;
1601 m->pageq.prev = NULL;
1602 m->inactive = FALSE;
1603 if (!m->fictitious)
1604 vm_page_inactive_count--;
1605
1606 if (m->busy || !object->alive) {
1607 /*
1608 * Somebody is already playing with this page.
1609 * Leave it off the pageout queues.
1610 */
1611 vm_pageout_inactive_busy++;
1612
1613 goto done_with_inactivepage;
1614 }
1615
1616 /*
1617 * If it's absent or in error, we can reclaim the page.
1618 */
1619
1620 if (m->absent || m->error) {
1621 vm_pageout_inactive_absent++;
1622 reclaim_page:
1623 if (vm_pageout_deadlock_target) {
1624 vm_pageout_scan_inactive_throttle_success++;
1625 vm_pageout_deadlock_target--;
1626 }
1627 if (m->tabled)
1628 vm_page_remove(m); /* clears tabled, object, offset */
1629 if (m->absent)
1630 vm_object_absent_release(object);
1631
1632 assert(m->pageq.next == NULL &&
1633 m->pageq.prev == NULL);
1634 m->pageq.next = (queue_entry_t)local_freeq;
1635 local_freeq = m;
1636 local_freed++;
1637
1638 inactive_burst_count = 0;
1639
1640 goto done_with_inactivepage;
1641 }
1642
1643 assert(!m->private);
1644 assert(!m->fictitious);
1645
1646 /*
1647 * If already cleaning this page in place, convert from
1648 * "adjacent" to "target". We can leave the page mapped,
1649 * and vm_pageout_object_terminate will determine whether
1650 * to free or reactivate.
1651 */
1652
1653 if (m->cleaning) {
1654 m->busy = TRUE;
1655 m->pageout = TRUE;
1656 m->dump_cleaning = TRUE;
1657 vm_page_wire(m);
1658
1659 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1660
1661 inactive_burst_count = 0;
1662
1663 goto done_with_inactivepage;
1664 }
1665
1666 /*
1667 * If it's being used, reactivate.
1668 * (Fictitious pages are either busy or absent.)
1669 */
1670 if ( (!m->reference) ) {
1671 refmod_state = pmap_get_refmod(m->phys_page);
1672
1673 if (refmod_state & VM_MEM_REFERENCED)
1674 m->reference = TRUE;
1675 if (refmod_state & VM_MEM_MODIFIED)
1676 m->dirty = TRUE;
1677 }
1678 if (m->reference) {
1679 was_referenced:
1680 vm_page_activate(m);
1681 VM_STAT(reactivations++);
1682
1683 vm_pageout_inactive_used++;
1684 last_page_zf = 0;
1685 inactive_burst_count = 0;
1686
1687 goto done_with_inactivepage;
1688 }
1689
1690 XPR(XPR_VM_PAGEOUT,
1691 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1692 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1693
1694 /*
1695 * we've got a candidate page to steal...
1696 *
1697 * m->dirty is up to date courtesy of the
1698 * preceding check for m->reference... if
1699 * we get here, then m->reference had to be
1700 * FALSE which means we did a pmap_get_refmod
1701 * and updated both m->reference and m->dirty
1702 *
1703 * if it's dirty or precious we need to
1704 * see if the target queue is throttled;
1705 * if it is, we need to skip over it by moving it back
1706 * to the end of the inactive queue
1707 */
1708 inactive_throttled = FALSE;
1709
1710 if (m->dirty || m->precious) {
1711 if (object->internal) {
1712 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1713 inactive_throttled = TRUE;
1714 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1715 inactive_throttled = TRUE;
1716 }
1717 }
1718 if (inactive_throttled == TRUE) {
1719 if (m->zero_fill) {
1720 queue_enter(&vm_page_queue_zf, m,
1721 vm_page_t, pageq);
1722 } else {
1723 queue_enter(&vm_page_queue_inactive, m,
1724 vm_page_t, pageq);
1725 }
1726 if (!m->fictitious)
1727 vm_page_inactive_count++;
1728 m->inactive = TRUE;
1729
1730 vm_pageout_scan_inactive_throttled++;
1731
1732 goto done_with_inactivepage;
1733 }
1734 /*
1735 * we've got a page that we can steal...
1736 * eliminate all mappings and make sure
1737 * we have the up-to-date modified state
1738 * first take the page BUSY, so that no new
1739 * mappings can be made
1740 */
1741 m->busy = TRUE;
1742
1743 /*
1744 * if we need to do a pmap_disconnect then we
1745 * need to re-evaluate m->dirty since the pmap_disconnect
1746 * provides the true state atomically... the
1747 * page was still mapped up to the pmap_disconnect
1748 * and may have been dirtied at the last microsecond
1749 *
1750 * we also check for the page being referenced 'late'
1751 * if it was, we first need to do a WAKEUP_DONE on it
1752 * since we already set m->busy = TRUE, before
1753 * going off to reactivate it
1754 *
1755 * if we don't need the pmap_disconnect, then
1756 * m->dirty is up to date courtesy of the
1757 * earlier check for m->reference... if
1758 * we get here, then m->reference had to be
1759 * FALSE which means we did a pmap_get_refmod
1760 * and updated both m->reference and m->dirty...
1761 */
1762 if (m->no_isync == FALSE) {
1763 refmod_state = pmap_disconnect(m->phys_page);
1764
1765 if (refmod_state & VM_MEM_MODIFIED)
1766 m->dirty = TRUE;
1767 if (refmod_state & VM_MEM_REFERENCED) {
1768 m->reference = TRUE;
1769
1770 PAGE_WAKEUP_DONE(m);
1771 goto was_referenced;
1772 }
1773 }
1774 /*
1775 * If it's clean and not precious, we can free the page.
1776 */
1777 if (!m->dirty && !m->precious) {
1778 vm_pageout_inactive_clean++;
1779 goto reclaim_page;
1780 }
1781 vm_pageout_cluster(m);
1782
1783 vm_pageout_inactive_dirty++;
1784
1785 inactive_burst_count = 0;
1786
1787 done_with_inactivepage:
1788 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1789
1790 if (object != NULL) {
1791 vm_object_unlock(object);
1792 object = NULL;
1793 }
1794 if (local_freeq) {
1795 vm_page_free_list(local_freeq);
1796
1797 local_freeq = 0;
1798 local_freed = 0;
1799 }
1800 delayed_unlock = 0;
1801 vm_page_unlock_queues();
1802 mutex_pause();
1803 }
1804 /*
1805 * back to top of pageout scan loop
1806 */
1807 }
1808 }
1809
1810
1811 int vm_page_free_count_init;
1812
1813 void
1814 vm_page_free_reserve(
1815 int pages)
1816 {
1817 int free_after_reserve;
1818
1819 vm_page_free_reserved += pages;
1820
1821 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1822
1823 vm_page_free_min = vm_page_free_reserved +
1824 VM_PAGE_FREE_MIN(free_after_reserve);
1825
1826 vm_page_free_target = vm_page_free_reserved +
1827 VM_PAGE_FREE_TARGET(free_after_reserve);
1828
1829 if (vm_page_free_target < vm_page_free_min + 5)
1830 vm_page_free_target = vm_page_free_min + 5;
1831 }
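
/*
 * Worked example (illustrative numbers only): if vm_page_free_count_init is
 * 8100 and vm_page_free_reserved ends up at 100, then free_after_reserve is
 * 8000, so vm_page_free_min becomes 100 + (10 + 8000/100) = 190 and
 * vm_page_free_target becomes 100 + (15 + 8000/80) = 215, which already
 * satisfies the "target >= min + 5" floor enforced above.
 */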
1832
1833 /*
1834 * vm_pageout is the high level pageout daemon.
1835 */
1836
1837 void
1838 vm_pageout_continue(void)
1839 {
1840 vm_pageout_scan_event_counter++;
1841 vm_pageout_scan();
1842 /* we hold vm_page_queue_free_lock now */
1843 assert(vm_page_free_wanted == 0);
1844 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
1845 mutex_unlock(&vm_page_queue_free_lock);
1846
1847 counter(c_vm_pageout_block++);
1848 thread_block((thread_continue_t)vm_pageout_continue);
1849 /*NOTREACHED*/
1850 }
1851
1852
1853 /*
1854 * must be called with the
1855 * queues and object locks held
1856 */
1857 static void
1858 vm_pageout_queue_steal(vm_page_t m)
1859 {
1860 struct vm_pageout_queue *q;
1861
1862 if (m->object->internal == TRUE)
1863 q = &vm_pageout_queue_internal;
1864 else
1865 q = &vm_pageout_queue_external;
1866
1867 m->laundry = FALSE;
1868 m->pageout_queue = FALSE;
1869 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
1870
1871 m->pageq.next = NULL;
1872 m->pageq.prev = NULL;
1873
1874 vm_object_paging_end(m->object);
1875
1876 q->pgo_laundry--;
1877 }
1878
1879
1880 #ifdef FAKE_DEADLOCK
1881
1882 #define FAKE_COUNT 5000
1883
1884 int internal_count = 0;
1885 int fake_deadlock = 0;
1886
1887 #endif
1888
1889 static void
1890 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
1891 {
1892 vm_page_t m = NULL;
1893 vm_object_t object;
1894 boolean_t need_wakeup;
1895
1896 vm_page_lock_queues();
1897
1898 while ( !queue_empty(&q->pgo_pending) ) {
1899
1900 q->pgo_busy = TRUE;
1901 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
1902 m->pageout_queue = FALSE;
1903 vm_page_unlock_queues();
1904
1905 m->pageq.next = NULL;
1906 m->pageq.prev = NULL;
1907 #ifdef FAKE_DEADLOCK
1908 if (q == &vm_pageout_queue_internal) {
1909 vm_offset_t addr;
1910 int pg_count;
1911
1912 internal_count++;
1913
1914 if ((internal_count == FAKE_COUNT)) {
1915
1916 pg_count = vm_page_free_count + vm_page_free_reserved;
1917
1918 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
1919 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
1920 }
1921 internal_count = 0;
1922 fake_deadlock++;
1923 }
1924 }
1925 #endif
1926 object = m->object;
1927
1928 if (!object->pager_initialized) {
1929 vm_object_lock(object);
1930
1931 /*
1932 * If there is no memory object for the page, create
1933 * one and hand it to the default pager.
1934 */
1935
1936 if (!object->pager_initialized)
1937 vm_object_collapse(object,
1938 (vm_object_offset_t) 0,
1939 TRUE);
1940 if (!object->pager_initialized)
1941 vm_object_pager_create(object);
1942 if (!object->pager_initialized) {
1943 /*
1944 * Still no pager for the object.
1945 * Reactivate the page.
1946 *
1947 * Should only happen if there is no
1948 * default pager.
1949 */
1950 m->list_req_pending = FALSE;
1951 m->cleaning = FALSE;
1952 m->pageout = FALSE;
1953 vm_page_unwire(m);
1954
1955 vm_pageout_throttle_up(m);
1956
1957 vm_page_lock_queues();
1958 vm_pageout_dirty_no_pager++;
1959 vm_page_activate(m);
1960 vm_page_unlock_queues();
1961
1962 /*
1963 * And we are done with it.
1964 */
1965 PAGE_WAKEUP_DONE(m);
1966
1967 vm_object_paging_end(object);
1968 vm_object_unlock(object);
1969
1970 vm_page_lock_queues();
1971 continue;
1972 } else if (object->pager == MEMORY_OBJECT_NULL) {
1973 /*
1974 * This pager has been destroyed by either
1975 * memory_object_destroy or vm_object_destroy, and
1976 * so there is nowhere for the page to go.
1977 * Just free the page... VM_PAGE_FREE takes
1978 * care of cleaning up all the state...
1979 * including doing the vm_pageout_throttle_up
1980 */
1981 VM_PAGE_FREE(m);
1982
1983 vm_object_paging_end(object);
1984 vm_object_unlock(object);
1985
1986 vm_page_lock_queues();
1987 continue;
1988 }
1989 vm_object_unlock(object);
1990 }
1991 /*
1992 * we expect the paging_in_progress reference to have
1993 * already been taken on the object before it was added
1994 * to the appropriate pageout I/O queue... this will
1995 * keep the object from being terminated and/or the
1996 * paging_offset from changing until the I/O has
1997 * completed... therefore no need to lock the object to
1998 * pull the paging_offset from it.
1999 *
2000 * Send the data to the pager.
2001 * Any pageout clustering happens there.
2002 */
2003 memory_object_data_return(object->pager,
2004 m->offset + object->paging_offset,
2005 PAGE_SIZE,
2006 NULL,
2007 NULL,
2008 FALSE,
2009 FALSE,
2010 0);
2011
2012 vm_object_lock(object);
2013 vm_object_paging_end(object);
2014 vm_object_unlock(object);
2015
2016 vm_page_lock_queues();
2017 }
2018 assert_wait((event_t) q, THREAD_UNINT);
2019
2020
2021 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2022 q->pgo_throttled = FALSE;
2023 need_wakeup = TRUE;
2024 } else
2025 need_wakeup = FALSE;
2026
2027 q->pgo_busy = FALSE;
2028 q->pgo_idle = TRUE;
2029 vm_page_unlock_queues();
2030
2031 if (need_wakeup == TRUE)
2032 thread_wakeup((event_t) &q->pgo_laundry);
2033
2034 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2035 /*NOTREACHED*/
2036 }
2037
2038
2039 static void
2040 vm_pageout_iothread_external(void)
2041 {
2042
2043 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2044 /*NOTREACHED*/
2045 }
2046
2047
2048 static void
2049 vm_pageout_iothread_internal(void)
2050 {
2051 thread_t self = current_thread();
2052
2053 self->options |= TH_OPT_VMPRIV;
2054
2055 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2056 /*NOTREACHED*/
2057 }
2058
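/*
 * vm_pageout_garbage_collect:
 * reclaim memory held by kernel stacks, zones and machine-dependent
 * caches, then block until woken up to collect again.
 */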
2059 static void
2060 vm_pageout_garbage_collect(int collect)
2061 {
2062 if (collect) {
2063 stack_collect();
2064
2065 /*
2066 * consider_zone_gc should be last, because the other operations
2067 * might return memory to zones.
2068 */
2069 consider_machine_collect();
2070 consider_zone_gc();
2071
2072 consider_machine_adjust();
2073 }
2074
2075 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2076
2077 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2078 /*NOTREACHED*/
2079 }
2080
2081
2082
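/*
 * vm_pageout:
 * entry point for the pageout daemon... set up the scheduling
 * priority and paging parameters, initialize the pageout queues,
 * start the I/O and garbage collection threads, then run the
 * scan loop forever via vm_pageout_continue().
 */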
2083 void
2084 vm_pageout(void)
2085 {
2086 thread_t self = current_thread();
2087 thread_t thread;
2088 kern_return_t result;
2089 spl_t s;
2090
2091 /*
2092 * Set thread privileges.
2093 */
2094 s = splsched();
2095 thread_lock(self);
2096 self->priority = BASEPRI_PREEMPT - 1;
2097 set_sched_pri(self, self->priority);
2098 thread_unlock(self);
2099 splx(s);
2100
2101 /*
2102 * Initialize some paging parameters.
2103 */
2104
2105 if (vm_pageout_idle_wait == 0)
2106 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2107
2108 if (vm_pageout_burst_wait == 0)
2109 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2110
2111 if (vm_pageout_empty_wait == 0)
2112 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2113
2114 if (vm_pageout_deadlock_wait == 0)
2115 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2116
2117 if (vm_pageout_deadlock_relief == 0)
2118 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2119
2120 if (vm_pageout_inactive_relief == 0)
2121 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2122
2123 if (vm_pageout_burst_active_throttle == 0)
2124 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2125
2126 if (vm_pageout_burst_inactive_throttle == 0)
2127 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2128
2129 /*
2130 * Set kernel task to low backing store privileged
2131 * status
2132 */
2133 task_lock(kernel_task);
2134 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2135 task_unlock(kernel_task);
2136
2137 vm_page_free_count_init = vm_page_free_count;
2138 vm_zf_iterator = 0;
2139 /*
2140 * even if we've already called vm_page_free_reserve,
2141 * call it again here to ensure that the targets are
2142 * accurately calculated (it uses vm_page_free_count_init);
2143 * calling it with an arg of 0 will not change the reserve
2144 * but will re-calculate free_min and free_target
2145 */
2146 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2147 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2148 } else
2149 vm_page_free_reserve(0);
2150
2151
2152 queue_init(&vm_pageout_queue_external.pgo_pending);
2153 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2154 vm_pageout_queue_external.pgo_laundry = 0;
2155 vm_pageout_queue_external.pgo_idle = FALSE;
2156 vm_pageout_queue_external.pgo_busy = FALSE;
2157 vm_pageout_queue_external.pgo_throttled = FALSE;
2158
2159 queue_init(&vm_pageout_queue_internal.pgo_pending);
2160 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2161 vm_pageout_queue_internal.pgo_laundry = 0;
2162 vm_pageout_queue_internal.pgo_idle = FALSE;
2163 vm_pageout_queue_internal.pgo_busy = FALSE;
2164 vm_pageout_queue_internal.pgo_throttled = FALSE;
2165
2166
2167 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &thread);
2168 if (result != KERN_SUCCESS)
2169 panic("vm_pageout_iothread_internal: create failed");
2170
2171 thread_deallocate(thread);
2172
2173
2174 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_PREEMPT - 1, &thread);
2175 if (result != KERN_SUCCESS)
2176 panic("vm_pageout_iothread_external: create failed");
2177
2178 thread_deallocate(thread);
2179
2180
2181 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_PREEMPT - 2, &thread);
2182 if (result != KERN_SUCCESS)
2183 panic("vm_pageout_garbage_collect: create failed");
2184
2185 thread_deallocate(thread);
2186
2187
2188 vm_pageout_continue();
2189 /*NOTREACHED*/
2190 }
2191
2192
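/*
 * upl_create:
 * allocate and initialize a upl structure... for UPL_CREATE_INTERNAL
 * the page-info array is allocated inline with the upl, and for
 * UPL_CREATE_LITE additional room is reserved for the "lite" page
 * bitmap.
 */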
2193 static upl_t
2194 upl_create(
2195 int flags,
2196 upl_size_t size)
2197 {
2198 upl_t upl;
2199 int page_field_size; /* bit field in word size buf */
2200
2201 page_field_size = 0;
2202 if (flags & UPL_CREATE_LITE) {
2203 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2204 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2205 }
2206 if(flags & UPL_CREATE_INTERNAL) {
2207 upl = (upl_t)kalloc(sizeof(struct upl)
2208 + (sizeof(struct upl_page_info)*(size/PAGE_SIZE))
2209 + page_field_size);
2210 } else {
2211 upl = (upl_t)kalloc(sizeof(struct upl) + page_field_size);
2212 }
2213 upl->flags = 0;
2214 upl->src_object = NULL;
2215 upl->kaddr = (vm_offset_t)0;
2216 upl->size = 0;
2217 upl->map_object = NULL;
2218 upl->ref_count = 1;
2219 upl->highest_page = 0;
2220 upl_lock_init(upl);
2221 #ifdef UPL_DEBUG
2222 upl->ubc_alias1 = 0;
2223 upl->ubc_alias2 = 0;
2224 #endif /* UPL_DEBUG */
2225 return(upl);
2226 }
2227
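/*
 * upl_destroy:
 * tear down a upl... drop the reference on the map_object when a
 * pageout object was inserted, then free the upl along with any
 * inline page-info array and lite bitmap storage.
 */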
2228 static void
2229 upl_destroy(
2230 upl_t upl)
2231 {
2232 int page_field_size; /* bit field in word size buf */
2233
2234 #ifdef UPL_DEBUG
2235 {
2236 upl_t upl_ele;
2237 vm_object_t object;
2238 if (upl->map_object->pageout) {
2239 object = upl->map_object->shadow;
2240 } else {
2241 object = upl->map_object;
2242 }
2243 vm_object_lock(object);
2244 queue_iterate(&object->uplq, upl_ele, upl_t, uplq) {
2245 if(upl_ele == upl) {
2246 queue_remove(&object->uplq,
2247 upl_ele, upl_t, uplq);
2248 break;
2249 }
2250 }
2251 vm_object_unlock(object);
2252 }
2253 #endif /* UPL_DEBUG */
2254 /* drop a reference on the map_object whether or */
2255 /* not a pageout object is inserted */
2256 if(upl->map_object->pageout)
2257 vm_object_deallocate(upl->map_object);
2258
2259 page_field_size = 0;
2260 if (upl->flags & UPL_LITE) {
2261 page_field_size = ((upl->size/PAGE_SIZE) + 7) >> 3;
2262 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2263 }
2264 if(upl->flags & UPL_INTERNAL) {
2265 kfree(upl,
2266 sizeof(struct upl) +
2267 (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE))
2268 + page_field_size);
2269 } else {
2270 kfree(upl, sizeof(struct upl) + page_field_size);
2271 }
2272 }
2273
2274 void uc_upl_dealloc(upl_t upl);
2275 __private_extern__ void
2276 uc_upl_dealloc(
2277 upl_t upl)
2278 {
2279 upl->ref_count -= 1;
2280 if(upl->ref_count == 0) {
2281 upl_destroy(upl);
2282 }
2283 }
2284
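/*
 * upl_deallocate:
 * drop a reference on the upl and destroy it once the
 * reference count reaches zero.
 */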
2285 void
2286 upl_deallocate(
2287 upl_t upl)
2288 {
2289
2290 upl->ref_count -= 1;
2291 if(upl->ref_count == 0) {
2292 upl_destroy(upl);
2293 }
2294 }
2295
2296 /*
2297 * Statistics about UPL enforcement of copy-on-write obligations.
2298 */
2299 unsigned long upl_cow = 0;
2300 unsigned long upl_cow_again = 0;
2301 unsigned long upl_cow_contiguous = 0;
2302 unsigned long upl_cow_pages = 0;
2303 unsigned long upl_cow_again_pages = 0;
2304 unsigned long upl_cow_contiguous_pages = 0;
2305
2306 /*
2307 * Routine: vm_object_upl_request
2308 * Purpose:
2309 * Cause the population of a portion of a vm_object.
2310 * Depending on the nature of the request, the pages
2311 * returned may contain valid data or be uninitialized.
2312 * A page list structure, listing the physical pages,
2313 * will be returned upon request.
2314 * This function is called by the file system or any other
2315 * supplier of backing store to a pager.
2316 * IMPORTANT NOTE: The caller must still respect the relationship
2317 * between the vm_object and its backing memory object. The
2318 * caller MUST NOT substitute changes in the backing file
2319 * without first doing a memory_object_lock_request on the
2320 * target range unless it is known that the pages are not
2321 * shared with another entity at the pager level.
2322 * Copy_in_to:
2323 * if a page list structure is present
2324 * return the mapped physical pages, where a
2325 * page is not present, return a non-initialized
2326 * one. If the no_sync bit is turned on, don't
2327 * call the pager unlock to synchronize with other
2328 * possible copies of the page. Leave pages busy
2329 * in the original object, if a page list structure
2330 * was specified. When a commit of the page list
2331 * pages is done, the dirty bit will be set for each one.
2332 * Copy_out_from:
2333 * If a page list structure is present, return
2334 * all mapped pages. Where a page does not exist
2335 * map a zero filled one. Leave pages busy in
2336 * the original object. If a page list structure
2337 * is not specified, this call is a no-op.
2338 *
2339 * Note: access of default pager objects has a rather interesting
2340 * twist. The caller of this routine, presumably the file system
2341 * page cache handling code, will never actually make a request
2342 * against a default pager backed object. Only the default
2343 * pager will make requests on backing store related vm_objects.
2344 * In this way the default pager can maintain the relationship
2345 * between backing store files (abstract memory objects) and
2346 * the vm_objects (cache objects) they support.
2347 *
2348 */
2349
2350 __private_extern__ kern_return_t
2351 vm_object_upl_request(
2352 vm_object_t object,
2353 vm_object_offset_t offset,
2354 upl_size_t size,
2355 upl_t *upl_ptr,
2356 upl_page_info_array_t user_page_list,
2357 unsigned int *page_list_count,
2358 int cntrl_flags)
2359 {
2360 vm_page_t dst_page = VM_PAGE_NULL;
2361 vm_object_offset_t dst_offset = offset;
2362 upl_size_t xfer_size = size;
2363 boolean_t do_m_lock = FALSE;
2364 boolean_t dirty;
2365 boolean_t hw_dirty;
2366 upl_t upl = NULL;
2367 unsigned int entry;
2368 #if MACH_CLUSTER_STATS
2369 boolean_t encountered_lrp = FALSE;
2370 #endif
2371 vm_page_t alias_page = NULL;
2372 int page_ticket;
2373 int refmod_state;
2374 wpl_array_t lite_list = NULL;
2375 vm_object_t last_copy_object;
2376
2377
2378 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2379 /*
2380 * For forward compatibility's sake,
2381 * reject any unknown flag.
2382 */
2383 return KERN_INVALID_VALUE;
2384 }
2385
2386 page_ticket = (cntrl_flags & UPL_PAGE_TICKET_MASK)
2387 >> UPL_PAGE_TICKET_SHIFT;
2388
2389 if(((size/PAGE_SIZE) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
2390 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2391 }
2392
2393 if(cntrl_flags & UPL_SET_INTERNAL)
2394 if(page_list_count != NULL)
2395 *page_list_count = MAX_UPL_TRANSFER;
2396
2397 if((!object->internal) && (object->paging_offset != 0))
2398 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2399
2400 if((cntrl_flags & UPL_COPYOUT_FROM) && (upl_ptr == NULL)) {
2401 return KERN_SUCCESS;
2402 }
2403
2404 vm_object_lock(object);
2405 vm_object_paging_begin(object);
2406 vm_object_unlock(object);
2407
2408 if(upl_ptr) {
2409 if(cntrl_flags & UPL_SET_INTERNAL) {
2410 if(cntrl_flags & UPL_SET_LITE) {
2411 uintptr_t page_field_size;
2412 upl = upl_create(
2413 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
2414 size);
2415 user_page_list = (upl_page_info_t *)
2416 (((uintptr_t)upl) + sizeof(struct upl));
2417 lite_list = (wpl_array_t)
2418 (((uintptr_t)user_page_list) +
2419 ((size/PAGE_SIZE) *
2420 sizeof(upl_page_info_t)));
2421 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2422 page_field_size =
2423 (page_field_size + 3) & 0xFFFFFFFC;
2424 bzero((char *)lite_list, page_field_size);
2425 upl->flags =
2426 UPL_LITE | UPL_INTERNAL;
2427 } else {
2428 upl = upl_create(UPL_CREATE_INTERNAL, size);
2429 user_page_list = (upl_page_info_t *)
2430 (((uintptr_t)upl) + sizeof(struct upl));
2431 upl->flags = UPL_INTERNAL;
2432 }
2433 } else {
2434 if(cntrl_flags & UPL_SET_LITE) {
2435 uintptr_t page_field_size;
2436 upl = upl_create(UPL_CREATE_LITE, size);
2437 lite_list = (wpl_array_t)
2438 (((uintptr_t)upl) + sizeof(struct upl));
2439 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2440 page_field_size =
2441 (page_field_size + 3) & 0xFFFFFFFC;
2442 bzero((char *)lite_list, page_field_size);
2443 upl->flags = UPL_LITE;
2444 } else {
2445 upl = upl_create(UPL_CREATE_EXTERNAL, size);
2446 upl->flags = 0;
2447 }
2448 }
2449
2450 if (object->phys_contiguous) {
2451 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2452 object->copy != VM_OBJECT_NULL) {
2453 /* Honor copy-on-write obligations */
2454
2455 /*
2456 * XXX FBDP
2457 * We could still have a race...
2458 * A is here building the UPL for a write().
2459 * A pushes the pages to the current copy
2460 * object.
2461 * A returns the UPL to the caller.
2462 * B comes along and establishes another
2463 * private mapping on this object, inserting
2464 * a new copy object between the original
2465 * object and the old copy object.
2466 * B reads a page and gets the original contents
2467 * from the original object.
2468 * A modifies the page in the original object.
2469 * B reads the page again and sees A's changes,
2470 * which is wrong...
2471 *
2472 * The problem is that the pages are not
2473 * marked "busy" in the original object, so
2474 * nothing prevents B from reading it before
2475 * A's changes are completed.
2476 *
2477 * The "paging_in_progress" might protect us
2478 * from the insertion of a new copy object
2479 * though... To be verified.
2480 */
2481 vm_object_lock_request(object,
2482 offset,
2483 size,
2484 FALSE,
2485 MEMORY_OBJECT_COPY_SYNC,
2486 VM_PROT_NO_CHANGE);
2487 upl_cow_contiguous++;
2488 upl_cow_contiguous_pages += size >> PAGE_SHIFT;
2489 }
2490
2491 upl->map_object = object;
2492 /* don't need any shadow mappings for this one */
2493 /* since it is already I/O memory */
2494 upl->flags |= UPL_DEVICE_MEMORY;
2495
2496
2497 /* paging_in_progress protects paging_offset */
2498 upl->offset = offset + object->paging_offset;
2499 upl->size = size;
2500 *upl_ptr = upl;
2501 if(user_page_list) {
2502 user_page_list[0].phys_addr =
2503 (offset + object->shadow_offset)>>PAGE_SHIFT;
2504 user_page_list[0].device = TRUE;
2505 }
2506 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
2507
2508 if(page_list_count != NULL) {
2509 if (upl->flags & UPL_INTERNAL) {
2510 *page_list_count = 0;
2511 } else {
2512 *page_list_count = 1;
2513 }
2514 }
2515
2516 return KERN_SUCCESS;
2517 }
2518
2519 if(user_page_list)
2520 user_page_list[0].device = FALSE;
2521
2522 if(cntrl_flags & UPL_SET_LITE) {
2523 upl->map_object = object;
2524 } else {
2525 upl->map_object = vm_object_allocate(size);
2526 /*
2527 * No need to lock the new object: nobody else knows
2528 * about it yet, so it's all ours so far.
2529 */
2530 upl->map_object->shadow = object;
2531 upl->map_object->pageout = TRUE;
2532 upl->map_object->can_persist = FALSE;
2533 upl->map_object->copy_strategy =
2534 MEMORY_OBJECT_COPY_NONE;
2535 upl->map_object->shadow_offset = offset;
2536 upl->map_object->wimg_bits = object->wimg_bits;
2537 }
2538
2539 }
2540 if (!(cntrl_flags & UPL_SET_LITE)) {
2541 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2542 }
2543
2544 /*
2545 * ENCRYPTED SWAP:
2546 * Just mark the UPL as "encrypted" here.
2547 * We'll actually encrypt the pages later,
2548 * in upl_encrypt(), when the caller has
2549 * selected which pages need to go to swap.
2550 */
2551 if (cntrl_flags & UPL_ENCRYPT) {
2552 upl->flags |= UPL_ENCRYPTED;
2553 }
2554 if (cntrl_flags & UPL_FOR_PAGEOUT) {
2555 upl->flags |= UPL_PAGEOUT;
2556 }
2557 vm_object_lock(object);
2558
2559 /* we can lock in the paging_offset once paging_in_progress is set */
2560 if(upl_ptr) {
2561 upl->size = size;
2562 upl->offset = offset + object->paging_offset;
2563 *upl_ptr = upl;
2564 #ifdef UPL_DEBUG
2565 queue_enter(&object->uplq, upl, upl_t, uplq);
2566 #endif /* UPL_DEBUG */
2567 }
2568
2569 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2570 object->copy != VM_OBJECT_NULL) {
2571 /* Honor copy-on-write obligations */
2572
2573 /*
2574 * The caller is gathering these pages and
2575 * might modify their contents. We need to
2576 * make sure that the copy object has its own
2577 * private copies of these pages before we let
2578 * the caller modify them.
2579 */
2580 vm_object_update(object,
2581 offset,
2582 size,
2583 NULL,
2584 NULL,
2585 FALSE, /* should_return */
2586 MEMORY_OBJECT_COPY_SYNC,
2587 VM_PROT_NO_CHANGE);
2588 upl_cow++;
2589 upl_cow_pages += size >> PAGE_SHIFT;
2590
2591 }
2592 /* remember which copy object we synchronized with */
2593 last_copy_object = object->copy;
2594
2595 entry = 0;
2596 if(cntrl_flags & UPL_COPYOUT_FROM) {
2597 upl->flags |= UPL_PAGE_SYNC_DONE;
2598
2599 while (xfer_size) {
2600 if((alias_page == NULL) &&
2601 !(cntrl_flags & UPL_SET_LITE)) {
2602 vm_object_unlock(object);
2603 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2604 vm_object_lock(object);
2605 }
2606 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2607 dst_page->fictitious ||
2608 dst_page->absent ||
2609 dst_page->error ||
2610 (dst_page->wire_count && !dst_page->pageout) ||
2611
2612 ((!dst_page->inactive) && (cntrl_flags & UPL_FOR_PAGEOUT) &&
2613 (dst_page->page_ticket != page_ticket) &&
2614 ((dst_page->page_ticket+1) != page_ticket)) ) {
2615
2616 if (user_page_list)
2617 user_page_list[entry].phys_addr = 0;
2618 } else {
2619 /*
2620 * grab this up front...
2621 * a high percentage of the time we're going to
2622 * need the hardware modification state a bit later
2623 * anyway... so we can eliminate an extra call into
2624 * the pmap layer by grabbing it here and recording it
2625 */
2626 refmod_state = pmap_get_refmod(dst_page->phys_page);
2627
2628 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2629 /*
2630 * we're only asking for DIRTY pages to be returned
2631 */
2632
2633 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2634 /*
2635 * if we were the page stolen by vm_pageout_scan to be
2636 * cleaned (as opposed to a buddy being clustered in),
2637 * or this request is not being driven by a PAGEOUT cluster,
2638 * then we only need to check for the page being dirty or
2639 * precious to decide whether to return it
2640 */
2641 if (dst_page->dirty || dst_page->precious ||
2642 (refmod_state & VM_MEM_MODIFIED)) {
2643 goto check_busy;
2644 }
2645 }
2646 /*
2647 * this is a request for a PAGEOUT cluster and this page
2648 * is merely along for the ride as a 'buddy'... not only
2649 * does it have to be dirty to be returned, but it also
2650 * can't have been referenced recently... note that we've
2651 * already filtered above based on whether this page is
2652 * currently on the inactive queue or it meets the page
2653 * ticket (generation count) check
2654 */
2655 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2656 ((refmod_state & VM_MEM_MODIFIED) ||
2657 dst_page->dirty || dst_page->precious) ) {
2658 goto check_busy;
2659 }
2660 /*
2661 * if we reach here, we're not to return
2662 * the page... go on to the next one
2663 */
2664 if (user_page_list)
2665 user_page_list[entry].phys_addr = 0;
2666 entry++;
2667 dst_offset += PAGE_SIZE_64;
2668 xfer_size -= PAGE_SIZE;
2669 continue;
2670 }
2671 check_busy:
2672 if(dst_page->busy &&
2673 (!(dst_page->list_req_pending &&
2674 dst_page->pageout))) {
2675 if(cntrl_flags & UPL_NOBLOCK) {
2676 if(user_page_list) {
2677 user_page_list[entry].phys_addr = 0;
2678 }
2679 entry++;
2680 dst_offset += PAGE_SIZE_64;
2681 xfer_size -= PAGE_SIZE;
2682 continue;
2683 }
2684 /*
2685 * someone else is playing with the
2686 * page. We will have to wait.
2687 */
2688 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2689 continue;
2690 }
2691 /* Someone else already cleaning the page? */
2692 if((dst_page->cleaning || dst_page->absent ||
2693 dst_page->wire_count != 0) &&
2694 !dst_page->list_req_pending) {
2695 if(user_page_list) {
2696 user_page_list[entry].phys_addr = 0;
2697 }
2698 entry++;
2699 dst_offset += PAGE_SIZE_64;
2700 xfer_size -= PAGE_SIZE;
2701 continue;
2702 }
2703 /* eliminate all mappings from the */
2704 /* original object and its progeny */
2705
2706 vm_page_lock_queues();
2707
2708 if (dst_page->pageout_queue == TRUE)
2709 /*
2710 * we've buddied up a page for a clustered pageout
2711 * that has already been moved to the pageout
2712 * queue by pageout_scan... we need to remove
2713 * it from the queue and drop the laundry count
2714 * on that queue
2715 */
2716 vm_pageout_queue_steal(dst_page);
2717 #if MACH_CLUSTER_STATS
2718 /* pageout statistics gathering. count */
2719 /* all the pages we will page out that */
2720 /* were not counted in the initial */
2721 /* vm_pageout_scan work */
2722 if(dst_page->list_req_pending)
2723 encountered_lrp = TRUE;
2724 if((dst_page->dirty ||
2725 (dst_page->object->internal &&
2726 dst_page->precious)) &&
2727 (dst_page->list_req_pending
2728 == FALSE)) {
2729 if(encountered_lrp) {
2730 CLUSTER_STAT
2731 (pages_at_higher_offsets++;)
2732 } else {
2733 CLUSTER_STAT
2734 (pages_at_lower_offsets++;)
2735 }
2736 }
2737 #endif
2738 /* Turn off busy indication on pending */
2739 /* pageout. Note: we can only get here */
2740 /* in the request pending case. */
2741 dst_page->list_req_pending = FALSE;
2742 dst_page->busy = FALSE;
2743 dst_page->cleaning = FALSE;
2744
2745 hw_dirty = refmod_state & VM_MEM_MODIFIED;
2746 dirty = hw_dirty ? TRUE : dst_page->dirty;
2747
2748 if(cntrl_flags & UPL_SET_LITE) {
2749 int pg_num;
2750 pg_num = (dst_offset-offset)/PAGE_SIZE;
2751 lite_list[pg_num>>5] |=
2752 1 << (pg_num & 31);
2753 if (hw_dirty)
2754 pmap_clear_modify(dst_page->phys_page);
2755 /*
2756 * Record that this page has been
2757 * written out
2758 */
2759 #if MACH_PAGEMAP
2760 vm_external_state_set(
2761 object->existence_map,
2762 dst_page->offset);
2763 #endif /*MACH_PAGEMAP*/
2764
2765 /*
2766 * Mark original page as cleaning
2767 * in place.
2768 */
2769 dst_page->cleaning = TRUE;
2770 dst_page->dirty = TRUE;
2771 dst_page->precious = FALSE;
2772 } else {
2773 /* use pageclean setup, it is more */
2774 /* convenient even for the pageout */
2775 /* cases here */
2776
2777 vm_object_lock(upl->map_object);
2778 vm_pageclean_setup(dst_page,
2779 alias_page, upl->map_object,
2780 size - xfer_size);
2781 vm_object_unlock(upl->map_object);
2782
2783 alias_page->absent = FALSE;
2784 alias_page = NULL;
2785 }
2786
2787 if(!dirty) {
2788 dst_page->dirty = FALSE;
2789 dst_page->precious = TRUE;
2790 }
2791
2792 if(dst_page->pageout)
2793 dst_page->busy = TRUE;
2794
2795 if ( (cntrl_flags & UPL_ENCRYPT) ) {
2796 /*
2797 * ENCRYPTED SWAP:
2798 * We want to deny access to the target page
2799 * because its contents are about to be
2800 * encrypted and the user would be very
2801 * confused to see encrypted data instead
2802 * of their data.
2803 */
2804 dst_page->busy = TRUE;
2805 }
2806 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2807 /*
2808 * deny access to the target page
2809 * while it is being worked on
2810 */
2811 if ((!dst_page->pageout) &&
2812 (dst_page->wire_count == 0)) {
2813 dst_page->busy = TRUE;
2814 dst_page->pageout = TRUE;
2815 vm_page_wire(dst_page);
2816 }
2817 }
2818
2819 if (dst_page->phys_page > upl->highest_page)
2820 upl->highest_page = dst_page->phys_page;
2821
2822 if(user_page_list) {
2823 user_page_list[entry].phys_addr
2824 = dst_page->phys_page;
2825 user_page_list[entry].dirty =
2826 dst_page->dirty;
2827 user_page_list[entry].pageout =
2828 dst_page->pageout;
2829 user_page_list[entry].absent =
2830 dst_page->absent;
2831 user_page_list[entry].precious =
2832 dst_page->precious;
2833 }
2834 vm_page_unlock_queues();
2835
2836 /*
2837 * ENCRYPTED SWAP:
2838 * The caller is gathering this page and might
2839 * access its contents later on. Decrypt the
2840 * page before adding it to the UPL, so that
2841 * the caller never sees encrypted data.
2842 */
2843 if (! (cntrl_flags & UPL_ENCRYPT) &&
2844 dst_page->encrypted) {
2845 assert(dst_page->busy);
2846
2847 vm_page_decrypt(dst_page, 0);
2848 vm_page_decrypt_for_upl_counter++;
2849
2850 /*
2851 * Retry this page, since anything
2852 * could have changed while we were
2853 * decrypting.
2854 */
2855 continue;
2856 }
2857 }
2858 entry++;
2859 dst_offset += PAGE_SIZE_64;
2860 xfer_size -= PAGE_SIZE;
2861 }
2862 } else {
2863 while (xfer_size) {
2864 if((alias_page == NULL) &&
2865 !(cntrl_flags & UPL_SET_LITE)) {
2866 vm_object_unlock(object);
2867 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2868 vm_object_lock(object);
2869 }
2870
2871 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2872 object->copy != last_copy_object) {
2873 /* Honor copy-on-write obligations */
2874
2875 /*
2876 * The copy object has changed since we
2877 * last synchronized for copy-on-write.
2878 * Another copy object might have been
2879 * inserted while we released the object's
2880 * lock. Since someone could have seen the
2881 * original contents of the remaining pages
2882 * through that new object, we have to
2883 * synchronize with it again for the remaining
2884 * pages only. The previous pages are "busy"
2885 * so they can not be seen through the new
2886 * mapping. The new mapping will see our
2887 * upcoming changes for those previous pages,
2888 * but that's OK since they couldn't see what
2889 * was there before. It's just a race anyway
2890 * and there's no guarantee of consistency or
2891 * atomicity. We just don't want new mappings
2892 * to see both the *before* and *after* pages.
2893 */
2894 if (object->copy != VM_OBJECT_NULL) {
2895 vm_object_update(
2896 object,
2897 dst_offset,/* current offset */
2898 xfer_size, /* remaining size */
2899 NULL,
2900 NULL,
2901 FALSE, /* should_return */
2902 MEMORY_OBJECT_COPY_SYNC,
2903 VM_PROT_NO_CHANGE);
2904 upl_cow_again++;
2905 upl_cow_again_pages +=
2906 xfer_size >> PAGE_SHIFT;
2907 }
2908 /* remember the copy object we synced with */
2909 last_copy_object = object->copy;
2910 }
2911
2912 dst_page = vm_page_lookup(object, dst_offset);
2913
2914 if(dst_page != VM_PAGE_NULL) {
2915 if((cntrl_flags & UPL_RET_ONLY_ABSENT) &&
2916 !((dst_page->list_req_pending)
2917 && (dst_page->absent))) {
2918 /* we are doing extended range */
2919 /* requests. we want to grab */
2920 /* pages around some which are */
2921 /* already present. */
2922 if(user_page_list) {
2923 user_page_list[entry].phys_addr = 0;
2924 }
2925 entry++;
2926 dst_offset += PAGE_SIZE_64;
2927 xfer_size -= PAGE_SIZE;
2928 continue;
2929 }
2930 if((dst_page->cleaning) &&
2931 !(dst_page->list_req_pending)) {
2932 /*someone else is writing to the */
2933 /* page. We will have to wait. */
2934 PAGE_SLEEP(object,dst_page,THREAD_UNINT);
2935 continue;
2936 }
2937 if ((dst_page->fictitious &&
2938 dst_page->list_req_pending)) {
2939 /* dump the fictitious page */
2940 dst_page->list_req_pending = FALSE;
2941 dst_page->clustered = FALSE;
2942
2943 vm_page_lock_queues();
2944 vm_page_free(dst_page);
2945 vm_page_unlock_queues();
2946
2947 dst_page = NULL;
2948 } else if ((dst_page->absent &&
2949 dst_page->list_req_pending)) {
2950 /* the default_pager case */
2951 dst_page->list_req_pending = FALSE;
2952 dst_page->busy = FALSE;
2953 }
2954 }
2955 if(dst_page == VM_PAGE_NULL) {
2956 if(object->private) {
2957 /*
2958 * This is a nasty wrinkle for users
2959 * of upl who encounter device or
2960 * private memory; however, it is
2961 * unavoidable: only a fault can
2962 * resolve the actual backing
2963 * physical page by asking the
2964 * backing device.
2965 */
2966 if(user_page_list) {
2967 user_page_list[entry].phys_addr = 0;
2968 }
2969 entry++;
2970 dst_offset += PAGE_SIZE_64;
2971 xfer_size -= PAGE_SIZE;
2972 continue;
2973 }
2974 /* need to allocate a page */
2975 dst_page = vm_page_alloc(object, dst_offset);
2976 if (dst_page == VM_PAGE_NULL) {
2977 vm_object_unlock(object);
2978 VM_PAGE_WAIT();
2979 vm_object_lock(object);
2980 continue;
2981 }
2982 dst_page->busy = FALSE;
2983 #if 0
2984 if(cntrl_flags & UPL_NO_SYNC) {
2985 dst_page->page_lock = 0;
2986 dst_page->unlock_request = 0;
2987 }
2988 #endif
2989 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
2990 /*
2991 * if UPL_RET_ONLY_ABSENT was specified,
2992 * then we're definitely setting up a
2993 * upl for a clustered read/pagein
2994 * operation... mark the pages as clustered
2995 * so vm_fault can correctly attribute them
2996 * to the 'pagein' bucket the first time
2997 * a fault happens on them
2998 */
2999 dst_page->clustered = TRUE;
3000 }
3001 dst_page->absent = TRUE;
3002 object->absent_count++;
3003 }
3004 #if 1
3005 if(cntrl_flags & UPL_NO_SYNC) {
3006 dst_page->page_lock = 0;
3007 dst_page->unlock_request = 0;
3008 }
3009 #endif /* 1 */
3010
3011 /*
3012 * ENCRYPTED SWAP:
3013 */
3014 if (cntrl_flags & UPL_ENCRYPT) {
3015 /*
3016 * The page is going to be encrypted when we
3017 * get it from the pager, so mark it so.
3018 */
3019 dst_page->encrypted = TRUE;
3020 } else {
3021 /*
3022 * Otherwise, the page will not contain
3023 * encrypted data.
3024 */
3025 dst_page->encrypted = FALSE;
3026 }
3027
3028 dst_page->overwriting = TRUE;
3029 if(dst_page->fictitious) {
3030 panic("need corner case for fictitious page");
3031 }
3032 if(dst_page->page_lock) {
3033 do_m_lock = TRUE;
3034 }
3035 if(upl_ptr) {
3036
3037 /* eliminate all mappings from the */
3038 /* original object and its progeny */
3039
3040 if(dst_page->busy) {
3041 /*someone else is playing with the */
3042 /* page. We will have to wait. */
3043 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3044 continue;
3045 }
3046 vm_page_lock_queues();
3047
3048 if( !(cntrl_flags & UPL_FILE_IO))
3049 hw_dirty = pmap_disconnect(dst_page->phys_page) & VM_MEM_MODIFIED;
3050 else
3051 hw_dirty = pmap_get_refmod(dst_page->phys_page) & VM_MEM_MODIFIED;
3052 dirty = hw_dirty ? TRUE : dst_page->dirty;
3053
3054 if(cntrl_flags & UPL_SET_LITE) {
3055 int pg_num;
3056 pg_num = (dst_offset-offset)/PAGE_SIZE;
3057 lite_list[pg_num>>5] |=
3058 1 << (pg_num & 31);
3059 if (hw_dirty)
3060 pmap_clear_modify(dst_page->phys_page);
3061 /*
3062 * Record that this page has been
3063 * written out
3064 */
3065 #if MACH_PAGEMAP
3066 vm_external_state_set(
3067 object->existence_map,
3068 dst_page->offset);
3069 #endif /*MACH_PAGEMAP*/
3070
3071 /*
3072 * Mark original page as cleaning
3073 * in place.
3074 */
3075 dst_page->cleaning = TRUE;
3076 dst_page->dirty = TRUE;
3077 dst_page->precious = FALSE;
3078 } else {
3079 /* use pageclean setup, it is more */
3080 /* convenient even for the pageout */
3081 /* cases here */
3082 vm_object_lock(upl->map_object);
3083 vm_pageclean_setup(dst_page,
3084 alias_page, upl->map_object,
3085 size - xfer_size);
3086 vm_object_unlock(upl->map_object);
3087
3088 alias_page->absent = FALSE;
3089 alias_page = NULL;
3090 }
3091
3092 if(cntrl_flags & UPL_CLEAN_IN_PLACE) {
3093 /* clean in place for read implies */
3094 /* that a write will be done on all */
3095 /* the pages that are dirty before */
3096 /* a upl commit is done. The caller */
3097 /* is obligated to preserve the */
3098 /* contents of all pages marked */
3099 /* dirty. */
3100 upl->flags |= UPL_CLEAR_DIRTY;
3101 }
3102
3103 if(!dirty) {
3104 dst_page->dirty = FALSE;
3105 dst_page->precious = TRUE;
3106 }
3107
3108 if (dst_page->wire_count == 0) {
3109 /* deny access to the target page while */
3110 /* it is being worked on */
3111 dst_page->busy = TRUE;
3112 } else {
3113 vm_page_wire(dst_page);
3114 }
3115 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
3116 /*
3117 * expect the page not to be used
3118 * since it's coming in as part
3119 * of a cluster and could be
3120 * speculative... pages that
3121 * are 'consumed' will get a
3122 * hardware reference
3123 */
3124 dst_page->reference = FALSE;
3125 } else {
3126 /*
3127 * expect the page to be used
3128 */
3129 dst_page->reference = TRUE;
3130 }
3131 dst_page->precious =
3132 (cntrl_flags & UPL_PRECIOUS)
3133 ? TRUE : FALSE;
3134
3135 if (dst_page->phys_page > upl->highest_page)
3136 upl->highest_page = dst_page->phys_page;
3137
3138 if(user_page_list) {
3139 user_page_list[entry].phys_addr
3140 = dst_page->phys_page;
3141 user_page_list[entry].dirty =
3142 dst_page->dirty;
3143 user_page_list[entry].pageout =
3144 dst_page->pageout;
3145 user_page_list[entry].absent =
3146 dst_page->absent;
3147 user_page_list[entry].precious =
3148 dst_page->precious;
3149 }
3150 vm_page_unlock_queues();
3151 }
3152 entry++;
3153 dst_offset += PAGE_SIZE_64;
3154 xfer_size -= PAGE_SIZE;
3155 }
3156 }
3157
3158 if (upl->flags & UPL_INTERNAL) {
3159 if(page_list_count != NULL)
3160 *page_list_count = 0;
3161 } else if (*page_list_count > entry) {
3162 if(page_list_count != NULL)
3163 *page_list_count = entry;
3164 }
3165
3166 if(alias_page != NULL) {
3167 vm_page_lock_queues();
3168 vm_page_free(alias_page);
3169 vm_page_unlock_queues();
3170 }
3171
3172 if(do_m_lock) {
3173 vm_prot_t access_required;
3174 /* call back all associated pages from other users of the pager */
3175 /* all future updates will be on data which is based on the */
3176 /* changes we are going to make here. Note: it is assumed that */
3177 /* we already hold copies of the data so we will not be seeing */
3178 /* an avalanche of incoming data from the pager */
3179 access_required = (cntrl_flags & UPL_COPYOUT_FROM)
3180 ? VM_PROT_READ : VM_PROT_WRITE;
3181 while (TRUE) {
3182 kern_return_t rc;
3183
3184 if(!object->pager_ready) {
3185 wait_result_t wait_result;
3186
3187 wait_result = vm_object_sleep(object,
3188 VM_OBJECT_EVENT_PAGER_READY,
3189 THREAD_UNINT);
3190 if (wait_result != THREAD_AWAKENED) {
3191 vm_object_unlock(object);
3192 return KERN_FAILURE;
3193 }
3194 continue;
3195 }
3196
3197 vm_object_unlock(object);
3198 rc = memory_object_data_unlock(
3199 object->pager,
3200 dst_offset + object->paging_offset,
3201 size,
3202 access_required);
3203 if (rc != KERN_SUCCESS && rc != MACH_SEND_INTERRUPTED)
3204 return KERN_FAILURE;
3205 vm_object_lock(object);
3206
3207 if (rc == KERN_SUCCESS)
3208 break;
3209 }
3210
3211 /* let's wait on the last page requested */
3212 /* NOTE: we will have to update lock completed routine to signal */
3213 if(dst_page != VM_PAGE_NULL &&
3214 (access_required & dst_page->page_lock) != access_required) {
3215 PAGE_ASSERT_WAIT(dst_page, THREAD_UNINT);
3216 vm_object_unlock(object);
3217 thread_block(THREAD_CONTINUE_NULL);
3218 return KERN_SUCCESS;
3219 }
3220 }
3221
3222 vm_object_unlock(object);
3223 return KERN_SUCCESS;
3224 }
3225
3226 /* JMM - Backward compatibility for now */
3227 kern_return_t
3228 vm_fault_list_request( /* forward */
3229 memory_object_control_t control,
3230 vm_object_offset_t offset,
3231 upl_size_t size,
3232 upl_t *upl_ptr,
3233 upl_page_info_t **user_page_list_ptr,
3234 int page_list_count,
3235 int cntrl_flags);
3236 kern_return_t
3237 vm_fault_list_request(
3238 memory_object_control_t control,
3239 vm_object_offset_t offset,
3240 upl_size_t size,
3241 upl_t *upl_ptr,
3242 upl_page_info_t **user_page_list_ptr,
3243 int page_list_count,
3244 int cntrl_flags)
3245 {
3246 unsigned int local_list_count;
3247 upl_page_info_t *user_page_list;
3248 kern_return_t kr;
3249
3250 if (user_page_list_ptr != NULL) {
3251 local_list_count = page_list_count;
3252 user_page_list = *user_page_list_ptr;
3253 } else {
3254 local_list_count = 0;
3255 user_page_list = NULL;
3256 }
3257 kr = memory_object_upl_request(control,
3258 offset,
3259 size,
3260 upl_ptr,
3261 user_page_list,
3262 &local_list_count,
3263 cntrl_flags);
3264
3265 if(kr != KERN_SUCCESS)
3266 return kr;
3267
3268 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3269 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3270 }
3271
3272 return KERN_SUCCESS;
3273 }
3274
3275
3276
3277 /*
3278 * Routine: vm_object_super_upl_request
3279 * Purpose:
3280 * Cause the population of a portion of a vm_object
3281 * in much the same way as memory_object_upl_request.
3282 * Depending on the nature of the request, the pages
3283 * returned may contain valid data or be uninitialized.
3284 * However, the region may be expanded up to the super
3285 * cluster size provided.
3286 */
3287
3288 __private_extern__ kern_return_t
3289 vm_object_super_upl_request(
3290 vm_object_t object,
3291 vm_object_offset_t offset,
3292 upl_size_t size,
3293 upl_size_t super_cluster,
3294 upl_t *upl,
3295 upl_page_info_t *user_page_list,
3296 unsigned int *page_list_count,
3297 int cntrl_flags)
3298 {
3299 vm_page_t target_page;
3300 int ticket;
3301
3302
3303 if(object->paging_offset > offset)
3304 return KERN_FAILURE;
3305
3306 assert(object->paging_in_progress);
3307 offset = offset - object->paging_offset;
3308
3309 if(cntrl_flags & UPL_FOR_PAGEOUT) {
3310
3311 vm_object_lock(object);
3312
3313 if((target_page = vm_page_lookup(object, offset))
3314 != VM_PAGE_NULL) {
3315 ticket = target_page->page_ticket;
3316 cntrl_flags = cntrl_flags & ~(int)UPL_PAGE_TICKET_MASK;
3317 cntrl_flags = cntrl_flags |
3318 ((ticket << UPL_PAGE_TICKET_SHIFT)
3319 & UPL_PAGE_TICKET_MASK);
3320 }
3321 vm_object_unlock(object);
3322 }
3323
3324 if (super_cluster > size) {
3325
3326 vm_object_offset_t base_offset;
3327 upl_size_t super_size;
3328
3329 base_offset = (offset &
3330 ~((vm_object_offset_t) super_cluster - 1));
3331 super_size = (offset+size) > (base_offset + super_cluster) ?
3332 super_cluster<<1 : super_cluster;
3333 super_size = ((base_offset + super_size) > object->size) ?
3334 (object->size - base_offset) : super_size;
3335 if(offset > (base_offset + super_size))
3336 panic("vm_object_super_upl_request: Missed target pageout"
3337 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3338 offset, base_offset, super_size, super_cluster,
3339 size, object->paging_offset);
3340 /*
3341 * apparently there is a case where the vm requests a
3342 * page to be written out whose offset is beyond the
3343 * object size
3344 */
3345 if((offset + size) > (base_offset + super_size))
3346 super_size = (offset + size) - base_offset;
3347
3348 offset = base_offset;
3349 size = super_size;
3350 }
3351 return vm_object_upl_request(object, offset, size,
3352 upl, user_page_list, page_list_count,
3353 cntrl_flags);
3354 }
3355
3356
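/*
 * vm_map_create_upl:
 * resolve the map entry backing [offset, offset + *upl_size) in the
 * given map and build a UPL against the underlying VM object,
 * recursing through submaps and synchronizing with any shadow or
 * copy objects first when the caller flags require it.
 */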
3357 kern_return_t
3358 vm_map_create_upl(
3359 vm_map_t map,
3360 vm_map_address_t offset,
3361 upl_size_t *upl_size,
3362 upl_t *upl,
3363 upl_page_info_array_t page_list,
3364 unsigned int *count,
3365 int *flags)
3366 {
3367 vm_map_entry_t entry;
3368 int caller_flags;
3369 int force_data_sync;
3370 int sync_cow_data;
3371 vm_object_t local_object;
3372 vm_map_offset_t local_offset;
3373 vm_map_offset_t local_start;
3374 kern_return_t ret;
3375
3376 caller_flags = *flags;
3377
3378 if (caller_flags & ~UPL_VALID_FLAGS) {
3379 /*
3380 * For forward compatibility's sake,
3381 * reject any unknown flag.
3382 */
3383 return KERN_INVALID_VALUE;
3384 }
3385
3386 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3387 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3388
3389 if(upl == NULL)
3390 return KERN_INVALID_ARGUMENT;
3391
3392
3393 REDISCOVER_ENTRY:
3394 vm_map_lock(map);
3395 if (vm_map_lookup_entry(map, offset, &entry)) {
3396 if (entry->object.vm_object == VM_OBJECT_NULL ||
3397 !entry->object.vm_object->phys_contiguous) {
3398 if((*upl_size/page_size) > MAX_UPL_TRANSFER) {
3399 *upl_size = MAX_UPL_TRANSFER * page_size;
3400 }
3401 }
3402 if((entry->vme_end - offset) < *upl_size) {
3403 *upl_size = entry->vme_end - offset;
3404 }
3405 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3406 if (entry->object.vm_object == VM_OBJECT_NULL) {
3407 *flags = 0;
3408 } else if (entry->object.vm_object->private) {
3409 *flags = UPL_DEV_MEMORY;
3410 if (entry->object.vm_object->phys_contiguous) {
3411 *flags |= UPL_PHYS_CONTIG;
3412 }
3413 } else {
3414 *flags = 0;
3415 }
3416 vm_map_unlock(map);
3417 return KERN_SUCCESS;
3418 }
3419 /*
3420 * Create an object if necessary.
3421 */
3422 if (entry->object.vm_object == VM_OBJECT_NULL) {
3423 entry->object.vm_object = vm_object_allocate(
3424 (vm_size_t)(entry->vme_end - entry->vme_start));
3425 entry->offset = 0;
3426 }
3427 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3428 if (!(entry->protection & VM_PROT_WRITE)) {
3429 vm_map_unlock(map);
3430 return KERN_PROTECTION_FAILURE;
3431 }
3432 if (entry->needs_copy) {
3433 vm_map_t local_map;
3434 vm_object_t object;
3435 vm_map_offset_t offset_hi;
3436 vm_map_offset_t offset_lo;
3437 vm_object_offset_t new_offset;
3438 vm_prot_t prot;
3439 boolean_t wired;
3440 vm_behavior_t behavior;
3441 vm_map_version_t version;
3442 vm_map_t real_map;
3443
3444 local_map = map;
3445 vm_map_lock_write_to_read(map);
3446 if(vm_map_lookup_locked(&local_map,
3447 offset, VM_PROT_WRITE,
3448 &version, &object,
3449 &new_offset, &prot, &wired,
3450 &behavior, &offset_lo,
3451 &offset_hi, &real_map)) {
3452 vm_map_unlock(local_map);
3453 return KERN_FAILURE;
3454 }
3455 if (real_map != map) {
3456 vm_map_unlock(real_map);
3457 }
3458 vm_object_unlock(object);
3459 vm_map_unlock(local_map);
3460
3461 goto REDISCOVER_ENTRY;
3462 }
3463 }
3464 if (entry->is_sub_map) {
3465 vm_map_t submap;
3466
3467 submap = entry->object.sub_map;
3468 local_start = entry->vme_start;
3469 local_offset = entry->offset;
3470 vm_map_reference(submap);
3471 vm_map_unlock(map);
3472
3473 ret = (vm_map_create_upl(submap,
3474 local_offset + (offset - local_start),
3475 upl_size, upl, page_list, count,
3476 flags));
3477
3478 vm_map_deallocate(submap);
3479 return ret;
3480 }
3481
3482 if (sync_cow_data) {
3483 if (entry->object.vm_object->shadow
3484 || entry->object.vm_object->copy) {
3485
3486 local_object = entry->object.vm_object;
3487 local_start = entry->vme_start;
3488 local_offset = entry->offset;
3489 vm_object_reference(local_object);
3490 vm_map_unlock(map);
3491
3492 if (entry->object.vm_object->shadow &&
3493 entry->object.vm_object->copy) {
3494 vm_object_lock_request(
3495 local_object->shadow,
3496 (vm_object_offset_t)
3497 ((offset - local_start) +
3498 local_offset) +
3499 local_object->shadow_offset,
3500 *upl_size, FALSE,
3501 MEMORY_OBJECT_DATA_SYNC,
3502 VM_PROT_NO_CHANGE);
3503 }
3504 sync_cow_data = FALSE;
3505 vm_object_deallocate(local_object);
3506 goto REDISCOVER_ENTRY;
3507 }
3508 }
3509
3510 if (force_data_sync) {
3511
3512 local_object = entry->object.vm_object;
3513 local_start = entry->vme_start;
3514 local_offset = entry->offset;
3515 vm_object_reference(local_object);
3516 vm_map_unlock(map);
3517
3518 vm_object_lock_request(
3519 local_object,
3520 (vm_object_offset_t)
3521 ((offset - local_start) + local_offset),
3522 (vm_object_size_t)*upl_size, FALSE,
3523 MEMORY_OBJECT_DATA_SYNC,
3524 VM_PROT_NO_CHANGE);
3525 force_data_sync = FALSE;
3526 vm_object_deallocate(local_object);
3527 goto REDISCOVER_ENTRY;
3528 }
3529
3530 if(!(entry->object.vm_object->private)) {
3531 if(*upl_size > (MAX_UPL_TRANSFER*PAGE_SIZE))
3532 *upl_size = (MAX_UPL_TRANSFER*PAGE_SIZE);
3533 if(entry->object.vm_object->phys_contiguous) {
3534 *flags = UPL_PHYS_CONTIG;
3535 } else {
3536 *flags = 0;
3537 }
3538 } else {
3539 *flags = UPL_DEV_MEMORY | UPL_PHYS_CONTIG;
3540 }
3541 local_object = entry->object.vm_object;
3542 local_offset = entry->offset;
3543 local_start = entry->vme_start;
3544 vm_object_reference(local_object);
3545 vm_map_unlock(map);
3546 if(caller_flags & UPL_SET_IO_WIRE) {
3547 ret = (vm_object_iopl_request(local_object,
3548 (vm_object_offset_t)
3549 ((offset - local_start)
3550 + local_offset),
3551 *upl_size,
3552 upl,
3553 page_list,
3554 count,
3555 caller_flags));
3556 } else {
3557 ret = (vm_object_upl_request(local_object,
3558 (vm_object_offset_t)
3559 ((offset - local_start)
3560 + local_offset),
3561 *upl_size,
3562 upl,
3563 page_list,
3564 count,
3565 caller_flags));
3566 }
3567 vm_object_deallocate(local_object);
3568 return(ret);
3569 }
3570
3571 vm_map_unlock(map);
3572 return(KERN_FAILURE);
3573
3574 }
3575
3576 /*
3577 * Internal routine to enter a UPL into a VM map.
3578 *
3579 * JMM - This should just be doable through the standard
3580 * vm_map_enter() API.
3581 */
3582 kern_return_t
3583 vm_map_enter_upl(
3584 vm_map_t map,
3585 upl_t upl,
3586 vm_map_offset_t *dst_addr)
3587 {
3588 vm_map_size_t size;
3589 vm_object_offset_t offset;
3590 vm_map_offset_t addr;
3591 vm_page_t m;
3592 kern_return_t kr;
3593
3594 if (upl == UPL_NULL)
3595 return KERN_INVALID_ARGUMENT;
3596
3597 upl_lock(upl);
3598
3599 /* check to see if already mapped */
3600 if(UPL_PAGE_LIST_MAPPED & upl->flags) {
3601 upl_unlock(upl);
3602 return KERN_FAILURE;
3603 }
3604
3605 if((!(upl->map_object->pageout)) &&
3606 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3607 (upl->map_object->phys_contiguous))) {
3608 vm_object_t object;
3609 vm_page_t alias_page;
3610 vm_object_offset_t new_offset;
3611 int pg_num;
3612 wpl_array_t lite_list;
3613
3614 if(upl->flags & UPL_INTERNAL) {
3615 lite_list = (wpl_array_t)
3616 ((((uintptr_t)upl) + sizeof(struct upl))
3617 + ((upl->size/PAGE_SIZE)
3618 * sizeof(upl_page_info_t)));
3619 } else {
3620 lite_list = (wpl_array_t)
3621 (((uintptr_t)upl) + sizeof(struct upl));
3622 }
3623 object = upl->map_object;
3624 upl->map_object = vm_object_allocate(upl->size);
3625 vm_object_lock(upl->map_object);
3626 upl->map_object->shadow = object;
3627 upl->map_object->pageout = TRUE;
3628 upl->map_object->can_persist = FALSE;
3629 upl->map_object->copy_strategy =
3630 MEMORY_OBJECT_COPY_NONE;
3631 upl->map_object->shadow_offset =
3632 upl->offset - object->paging_offset;
3633 upl->map_object->wimg_bits = object->wimg_bits;
3634 offset = upl->map_object->shadow_offset;
3635 new_offset = 0;
3636 size = upl->size;
3637
3638 vm_object_lock(object);
3639
3640 while(size) {
3641 pg_num = (new_offset)/PAGE_SIZE;
3642 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3643 vm_object_unlock(object);
3644 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3645 vm_object_lock(object);
3646 m = vm_page_lookup(object, offset);
3647 if (m == VM_PAGE_NULL) {
3648 panic("vm_upl_map: page missing\n");
3649 }
3650
3651 vm_object_paging_begin(object);
3652
3653 /*
3654 * Convert the fictitious page to a private
3655 * shadow of the real page.
3656 */
3657 assert(alias_page->fictitious);
3658 alias_page->fictitious = FALSE;
3659 alias_page->private = TRUE;
3660 alias_page->pageout = TRUE;
3661 alias_page->phys_page = m->phys_page;
3662
3663 vm_page_lock_queues();
3664 vm_page_wire(alias_page);
3665 vm_page_unlock_queues();
3666
3667 /*
3668 * ENCRYPTED SWAP:
3669 * The virtual page ("m") has to be wired in some way
3670 * here or its physical page ("m->phys_page") could
3671 * be recycled at any time.
3672 * Assuming this is enforced by the caller, we can't
3673 * get an encrypted page here. Since the encryption
3674 * key depends on the VM page's "pager" object and
3675 * the "paging_offset", we couldn't handle 2 pageable
3676 * VM pages (with different pagers and paging_offsets)
3677 * sharing the same physical page: we could end up
3678 * encrypting with one key (via one VM page) and
3679 * decrypting with another key (via the alias VM page).
3680 */
3681 ASSERT_PAGE_DECRYPTED(m);
3682
3683 vm_page_insert(alias_page,
3684 upl->map_object, new_offset);
3685 assert(!alias_page->wanted);
3686 alias_page->busy = FALSE;
3687 alias_page->absent = FALSE;
3688 }
3689
3690 size -= PAGE_SIZE;
3691 offset += PAGE_SIZE_64;
3692 new_offset += PAGE_SIZE_64;
3693 }
3694 vm_object_unlock(object);
3695 vm_object_unlock(upl->map_object);
3696 }
3697 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3698 offset = upl->offset - upl->map_object->paging_offset;
3699 else
3700 offset = 0;
3701
3702 size = upl->size;
3703
3704 vm_object_lock(upl->map_object);
3705 upl->map_object->ref_count++;
3706 vm_object_res_reference(upl->map_object);
3707 vm_object_unlock(upl->map_object);
3708
3709 *dst_addr = 0;
3710
3711
3712 /* NEED A UPL_MAP ALIAS */
3713 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3714 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3715 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3716
3717 if (kr != KERN_SUCCESS) {
3718 upl_unlock(upl);
3719 return(kr);
3720 }
3721
3722 vm_object_lock(upl->map_object);
3723
3724 for(addr=*dst_addr; size > 0; size-=PAGE_SIZE,addr+=PAGE_SIZE) {
3725 m = vm_page_lookup(upl->map_object, offset);
3726 if(m) {
3727 unsigned int cache_attr;
3728 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3729
3730 PMAP_ENTER(map->pmap, addr,
3731 m, VM_PROT_ALL,
3732 cache_attr, TRUE);
3733 }
3734 offset+=PAGE_SIZE_64;
3735 }
3736 vm_object_unlock(upl->map_object);
3737
3738 upl->ref_count++; /* hold a reference for the mapping */
3739 upl->flags |= UPL_PAGE_LIST_MAPPED;
3740 upl->kaddr = *dst_addr;
3741 upl_unlock(upl);
3742 return KERN_SUCCESS;
3743 }
3744
3745 /*
3746 * Internal routine to remove a UPL mapping from a VM map.
3747 *
3748 * XXX - This should just be doable through a standard
3749 * vm_map_remove() operation. Otherwise, implicit clean-up
3750 * of the target map won't be able to correctly remove
3751 * these (and release the reference on the UPL). Having
3752 * to do this means we can't map these into user-space
3753 * maps yet.
3754 */
3755 kern_return_t
3756 vm_map_remove_upl(
3757 vm_map_t map,
3758 upl_t upl)
3759 {
3760 vm_address_t addr;
3761 upl_size_t size;
3762
3763 if (upl == UPL_NULL)
3764 return KERN_INVALID_ARGUMENT;
3765
3766 upl_lock(upl);
3767 if(upl->flags & UPL_PAGE_LIST_MAPPED) {
3768 addr = upl->kaddr;
3769 size = upl->size;
3770 assert(upl->ref_count > 1);
3771 upl->ref_count--; /* removing mapping ref */
3772 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3773 upl->kaddr = (vm_offset_t) 0;
3774 upl_unlock(upl);
3775
3776 vm_map_remove( map,
3777 vm_map_trunc_page(addr),
3778 vm_map_round_page(addr + size),
3779 VM_MAP_NO_FLAGS);
3780 return KERN_SUCCESS;
3781 }
3782 upl_unlock(upl);
3783 return KERN_FAILURE;
3784 }
3785
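/*
 * upl_commit_range:
 * commit a range of pages described by the upl back to their
 * VM object... per-page state (busy, wired, dirty, pageout) is
 * cleared or updated according to the commit flags and any
 * waiters on the pages are woken up.
 */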
3786 kern_return_t
3787 upl_commit_range(
3788 upl_t upl,
3789 upl_offset_t offset,
3790 upl_size_t size,
3791 int flags,
3792 upl_page_info_t *page_list,
3793 mach_msg_type_number_t count,
3794 boolean_t *empty)
3795 {
3796 upl_size_t xfer_size = size;
3797 vm_object_t shadow_object;
3798 vm_object_t object = upl->map_object;
3799 vm_object_offset_t target_offset;
3800 int entry;
3801 wpl_array_t lite_list;
3802 int occupied;
3803 int delayed_unlock = 0;
3804 int clear_refmod = 0;
3805 boolean_t shadow_internal;
3806
3807 *empty = FALSE;
3808
3809 if (upl == UPL_NULL)
3810 return KERN_INVALID_ARGUMENT;
3811
3812
3813 if (count == 0)
3814 page_list = NULL;
3815
3816 if (object->pageout) {
3817 shadow_object = object->shadow;
3818 } else {
3819 shadow_object = object;
3820 }
3821
3822 upl_lock(upl);
3823
3824 if (upl->flags & UPL_ACCESS_BLOCKED) {
3825 /*
3826 * We used this UPL to block access to the pages by marking
3827 * them "busy". Now we need to clear the "busy" bit to allow
3828 * access to these pages again.
3829 */
3830 flags |= UPL_COMMIT_ALLOW_ACCESS;
3831 }
3832
3833 if (upl->flags & UPL_CLEAR_DIRTY)
3834 flags |= UPL_COMMIT_CLEAR_DIRTY;
3835
3836 if (upl->flags & UPL_DEVICE_MEMORY) {
3837 xfer_size = 0;
3838 } else if ((offset + size) > upl->size) {
3839 upl_unlock(upl);
3840 return KERN_FAILURE;
3841 }
3842
3843 if (upl->flags & UPL_INTERNAL) {
3844 lite_list = (wpl_array_t)
3845 ((((uintptr_t)upl) + sizeof(struct upl))
3846 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3847 } else {
3848 lite_list = (wpl_array_t)
3849 (((uintptr_t)upl) + sizeof(struct upl));
3850 }
3851 if (object != shadow_object)
3852 vm_object_lock(object);
3853 vm_object_lock(shadow_object);
3854
3855 shadow_internal = shadow_object->internal;
3856
3857 entry = offset/PAGE_SIZE;
3858 target_offset = (vm_object_offset_t)offset;
3859
3860 while (xfer_size) {
3861 vm_page_t t,m;
3862 upl_page_info_t *p;
3863
3864 m = VM_PAGE_NULL;
3865
3866 if (upl->flags & UPL_LITE) {
3867 int pg_num;
3868
3869 pg_num = target_offset/PAGE_SIZE;
3870
3871 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3872 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3873 m = vm_page_lookup(shadow_object,
3874 target_offset + (upl->offset -
3875 shadow_object->paging_offset));
3876 }
3877 }
3878 if (object->pageout) {
3879 if ((t = vm_page_lookup(object, target_offset)) != NULL) {
3880 t->pageout = FALSE;
3881
3882 if (delayed_unlock) {
3883 delayed_unlock = 0;
3884 vm_page_unlock_queues();
3885 }
3886 VM_PAGE_FREE(t);
3887
3888 if (m == NULL) {
3889 m = vm_page_lookup(
3890 shadow_object,
3891 target_offset +
3892 object->shadow_offset);
3893 }
3894 if (m != VM_PAGE_NULL)
3895 vm_object_paging_end(m->object);
3896 }
3897 }
3898 if (m != VM_PAGE_NULL) {
3899
3900 clear_refmod = 0;
3901
3902 if (upl->flags & UPL_IO_WIRE) {
3903
3904 if (delayed_unlock == 0)
3905 vm_page_lock_queues();
3906
3907 vm_page_unwire(m);
3908
3909 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
3910 delayed_unlock = 0;
3911 vm_page_unlock_queues();
3912 }
3913 if (page_list) {
3914 page_list[entry].phys_addr = 0;
3915 }
3916 if (flags & UPL_COMMIT_SET_DIRTY) {
3917 m->dirty = TRUE;
3918 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3919 m->dirty = FALSE;
3920 clear_refmod |= VM_MEM_MODIFIED;
3921 }
3922 if (flags & UPL_COMMIT_INACTIVATE) {
3923 m->reference = FALSE;
3924 clear_refmod |= VM_MEM_REFERENCED;
3925 vm_page_deactivate(m);
3926 }
3927 if (clear_refmod)
3928 pmap_clear_refmod(m->phys_page, clear_refmod);
3929
3930 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3931 /*
3932 * We blocked access to the pages in this UPL.
3933 * Clear the "busy" bit and wake up any waiter
3934 * for this page.
3935 */
3936 PAGE_WAKEUP_DONE(m);
3937 }
3938
3939 target_offset += PAGE_SIZE_64;
3940 xfer_size -= PAGE_SIZE;
3941 entry++;
3942 continue;
3943 }
3944 if (delayed_unlock == 0)
3945 vm_page_lock_queues();
3946 /*
3947 * make sure to clear the hardware
3948 * modify or reference bits before
3949 * releasing the BUSY bit on this page;
3950 * otherwise we risk losing a legitimate
3951 * change of state
3952 */
3953 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3954 m->dirty = FALSE;
3955 clear_refmod |= VM_MEM_MODIFIED;
3956 }
3957 if (flags & UPL_COMMIT_INACTIVATE)
3958 clear_refmod |= VM_MEM_REFERENCED;
3959
3960 if (clear_refmod)
3961 pmap_clear_refmod(m->phys_page, clear_refmod);
3962
3963 if (page_list) {
3964 p = &(page_list[entry]);
3965 if(p->phys_addr && p->pageout && !m->pageout) {
3966 m->busy = TRUE;
3967 m->pageout = TRUE;
3968 vm_page_wire(m);
3969 } else if (page_list[entry].phys_addr &&
3970 !p->pageout && m->pageout &&
3971 !m->dump_cleaning) {
3972 m->pageout = FALSE;
3973 m->absent = FALSE;
3974 m->overwriting = FALSE;
3975 vm_page_unwire(m);
3976 PAGE_WAKEUP_DONE(m);
3977 }
3978 page_list[entry].phys_addr = 0;
3979 }
3980 m->dump_cleaning = FALSE;
3981 if(m->laundry) {
3982 vm_pageout_throttle_up(m);
3983 }
3984 if(m->pageout) {
3985 m->cleaning = FALSE;
3986 m->pageout = FALSE;
3987 #if MACH_CLUSTER_STATS
3988 if (m->wanted) vm_pageout_target_collisions++;
3989 #endif
3990 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
3991 m->dirty = TRUE;
3992 else
3993 m->dirty = FALSE;
3994
3995 if(m->dirty) {
3996 vm_page_unwire(m);/* reactivates */
3997
3998 if (upl->flags & UPL_PAGEOUT) {
3999 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4000 VM_STAT(reactivations++);
4001 }
4002 PAGE_WAKEUP_DONE(m);
4003 } else {
4004 vm_page_free(m);/* clears busy, etc. */
4005
4006 if (upl->flags & UPL_PAGEOUT) {
4007 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4008
4009 if (page_list[entry].dirty)
4010 VM_STAT(pageouts++);
4011 }
4012 }
4013 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4014 delayed_unlock = 0;
4015 vm_page_unlock_queues();
4016 }
4017 target_offset += PAGE_SIZE_64;
4018 xfer_size -= PAGE_SIZE;
4019 entry++;
4020 continue;
4021 }
4022 #if MACH_CLUSTER_STATS
4023 m->dirty = pmap_is_modified(m->phys_page);
4024
4025 if (m->dirty) vm_pageout_cluster_dirtied++;
4026 else vm_pageout_cluster_cleaned++;
4027 if (m->wanted) vm_pageout_cluster_collisions++;
4028 #else
4029 m->dirty = 0;
4030 #endif
4031
4032 if((m->busy) && (m->cleaning)) {
4033 /* the request_page_list case */
4034 if(m->absent) {
4035 m->absent = FALSE;
4036 if(shadow_object->absent_count == 1)
4037 vm_object_absent_release(shadow_object);
4038 else
4039 shadow_object->absent_count--;
4040 }
4041 m->overwriting = FALSE;
4042 m->busy = FALSE;
4043 m->dirty = FALSE;
4044 } else if (m->overwriting) {
4045 /* alternate request page list, write to
4046 * page_list case. Occurs when the original
4047 * page was wired at the time of the list
4048 * request */
4049 assert(m->wire_count != 0);
4050 vm_page_unwire(m);/* reactivates */
4051 m->overwriting = FALSE;
4052 }
4053 m->cleaning = FALSE;
4054
4055 /* It is part of the semantics of COPYOUT_FROM */
4056 /* UPLs that a commit implies a cache sync */
4057 /* between the vm page and the backing store; */
4058 /* this can be used to strip the precious bit */
4059 /* as well as clean */
4060 if (upl->flags & UPL_PAGE_SYNC_DONE)
4061 m->precious = FALSE;
4062
4063 if (flags & UPL_COMMIT_SET_DIRTY)
4064 m->dirty = TRUE;
4065
4066 if (flags & UPL_COMMIT_INACTIVATE) {
4067 m->reference = FALSE;
4068 vm_page_deactivate(m);
4069 } else if (!m->active && !m->inactive) {
4070 if (m->reference)
4071 vm_page_activate(m);
4072 else
4073 vm_page_deactivate(m);
4074 }
4075
4076 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4077 /*
4078 * We blocked access to the pages in this UPL.
4079 * Clear the "busy" bit on this page before we
4080 * wake up any waiter.
4081 */
4082 m->busy = FALSE;
4083 }
4084
4085 /*
4086 * Wakeup any thread waiting for the page to be un-cleaning.
4087 */
4088 PAGE_WAKEUP(m);
4089
4090 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4091 delayed_unlock = 0;
4092 vm_page_unlock_queues();
4093 }
4094 }
4095 target_offset += PAGE_SIZE_64;
4096 xfer_size -= PAGE_SIZE;
4097 entry++;
4098 }
4099 if (delayed_unlock)
4100 vm_page_unlock_queues();
4101
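/*
 * Work out whether any pages are still held by this UPL: device
 * memory never is, a lite UPL is empty once every bitmap word is
 * clear, and an object-backed UPL is empty once its map object's
 * page queue is.
 */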
4102 occupied = 1;
4103
4104 if (upl->flags & UPL_DEVICE_MEMORY) {
4105 occupied = 0;
4106 } else if (upl->flags & UPL_LITE) {
4107 int pg_num;
4108 int i;
4109 pg_num = upl->size/PAGE_SIZE;
4110 pg_num = (pg_num + 31) >> 5;
4111 occupied = 0;
4112 for(i= 0; i<pg_num; i++) {
4113 if(lite_list[i] != 0) {
4114 occupied = 1;
4115 break;
4116 }
4117 }
4118 } else {
4119 if(queue_empty(&upl->map_object->memq)) {
4120 occupied = 0;
4121 }
4122 }
4123
4124 if(occupied == 0) {
4125 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4126 *empty = TRUE;
4127 }
4128 if(object == shadow_object)
4129 vm_object_paging_end(shadow_object);
4130 }
4131 vm_object_unlock(shadow_object);
4132 if (object != shadow_object)
4133 vm_object_unlock(object);
4134 upl_unlock(upl);
4135
4136 return KERN_SUCCESS;
4137 }
4138
4139 kern_return_t
4140 upl_abort_range(
4141 upl_t upl,
4142 upl_offset_t offset,
4143 upl_size_t size,
4144 int error,
4145 boolean_t *empty)
4146 {
4147 upl_size_t xfer_size = size;
4148 vm_object_t shadow_object;
4149 vm_object_t object = upl->map_object;
4150 vm_object_offset_t target_offset;
4151 int entry;
4152 wpl_array_t lite_list;
4153 int occupied;
4154 boolean_t shadow_internal;
4155
4156 *empty = FALSE;
4157
4158 if (upl == UPL_NULL)
4159 return KERN_INVALID_ARGUMENT;
4160
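/*
 * An I/O-wired UPL has no pageout state to unwind, so aborting a
 * range of it reduces to committing that range, which unwires the
 * pages.
 */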
4161 if (upl->flags & UPL_IO_WIRE) {
4162 return upl_commit_range(upl,
4163 offset, size, 0,
4164 NULL, 0, empty);
4165 }
4166
4167 if(object->pageout) {
4168 shadow_object = object->shadow;
4169 } else {
4170 shadow_object = object;
4171 }
4172
4173 upl_lock(upl);
4174 if(upl->flags & UPL_DEVICE_MEMORY) {
4175 xfer_size = 0;
4176 } else if ((offset + size) > upl->size) {
4177 upl_unlock(upl);
4178 return KERN_FAILURE;
4179 }
4180 if (object != shadow_object)
4181 vm_object_lock(object);
4182 vm_object_lock(shadow_object);
4183
4184 shadow_internal = shadow_object->internal;
4185
4186 if(upl->flags & UPL_INTERNAL) {
4187 lite_list = (wpl_array_t)
4188 ((((uintptr_t)upl) + sizeof(struct upl))
4189 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4190 } else {
4191 lite_list = (wpl_array_t)
4192 (((uintptr_t)upl) + sizeof(struct upl));
4193 }
4194
4195 entry = offset/PAGE_SIZE;
4196 target_offset = (vm_object_offset_t)offset;
4197 while(xfer_size) {
4198 vm_page_t t,m;
4199
4200 m = VM_PAGE_NULL;
4201 if(upl->flags & UPL_LITE) {
4202 int pg_num;
4203 pg_num = target_offset/PAGE_SIZE;
4204 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4205 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4206 m = vm_page_lookup(shadow_object,
4207 target_offset + (upl->offset -
4208 shadow_object->paging_offset));
4209 }
4210 }
4211 if(object->pageout) {
4212 if ((t = vm_page_lookup(object, target_offset))
4213 != NULL) {
4214 t->pageout = FALSE;
4215 VM_PAGE_FREE(t);
4216 if(m == NULL) {
4217 m = vm_page_lookup(
4218 shadow_object,
4219 target_offset +
4220 object->shadow_offset);
4221 }
4222 if(m != VM_PAGE_NULL)
4223 vm_object_paging_end(m->object);
4224 }
4225 }
4226 if(m != VM_PAGE_NULL) {
4227 vm_page_lock_queues();
4228 if(m->absent) {
4229 boolean_t must_free = TRUE;
4230
4231 /* COPYOUT = FALSE case */
4232 /* check for error conditions which must */
4233 /* be passed back to the page's customer */
4234 if(error & UPL_ABORT_RESTART) {
4235 m->restart = TRUE;
4236 m->absent = FALSE;
4237 vm_object_absent_release(m->object);
4238 m->page_error = KERN_MEMORY_ERROR;
4239 m->error = TRUE;
4240 must_free = FALSE;
4241 } else if(error & UPL_ABORT_UNAVAILABLE) {
4242 m->restart = FALSE;
4243 m->unusual = TRUE;
4244 must_free = FALSE;
4245 } else if(error & UPL_ABORT_ERROR) {
4246 m->restart = FALSE;
4247 m->absent = FALSE;
4248 vm_object_absent_release(m->object);
4249 m->page_error = KERN_MEMORY_ERROR;
4250 m->error = TRUE;
4251 must_free = FALSE;
4252 }
4253
4254 /*
4255 * ENCRYPTED SWAP:
4256 * If the page was already encrypted,
4257 * we don't really need to decrypt it
4258 * now. It will get decrypted later,
4259 * on demand, as soon as someone needs
4260 * to access its contents.
4261 */
4262
4263 m->cleaning = FALSE;
4264 m->overwriting = FALSE;
4265 PAGE_WAKEUP_DONE(m);
4266
4267 if (must_free == TRUE) {
4268 vm_page_free(m);
4269 } else {
4270 vm_page_activate(m);
4271 }
4272 vm_page_unlock_queues();
4273
4274 target_offset += PAGE_SIZE_64;
4275 xfer_size -= PAGE_SIZE;
4276 entry++;
4277 continue;
4278 }
4279 /*
4280 * Handle the trusted pager throttle.
4281 */
4282 if (m->laundry) {
4283 vm_pageout_throttle_up(m);
4284 }
4285 if(m->pageout) {
4286 assert(m->busy);
4287 assert(m->wire_count == 1);
4288 m->pageout = FALSE;
4289 vm_page_unwire(m);
4290 }
4291 m->dump_cleaning = FALSE;
4292 m->cleaning = FALSE;
4293 m->overwriting = FALSE;
4294 #if MACH_PAGEMAP
4295 vm_external_state_clr(
4296 m->object->existence_map, m->offset);
4297 #endif /* MACH_PAGEMAP */
4298 if(error & UPL_ABORT_DUMP_PAGES) {
4299 vm_page_free(m);
4300 pmap_disconnect(m->phys_page);
4301 } else {
4302 PAGE_WAKEUP_DONE(m);
4303 }
4304 vm_page_unlock_queues();
4305 }
4306 target_offset += PAGE_SIZE_64;
4307 xfer_size -= PAGE_SIZE;
4308 entry++;
4309 }
4310 occupied = 1;
4311 if (upl->flags & UPL_DEVICE_MEMORY) {
4312 occupied = 0;
4313 } else if (upl->flags & UPL_LITE) {
4314 int pg_num;
4315 int i;
4316 pg_num = upl->size/PAGE_SIZE;
4317 pg_num = (pg_num + 31) >> 5;
4318 occupied = 0;
4319 for(i= 0; i<pg_num; i++) {
4320 if(lite_list[i] != 0) {
4321 occupied = 1;
4322 break;
4323 }
4324 }
4325 } else {
4326 if(queue_empty(&upl->map_object->memq)) {
4327 occupied = 0;
4328 }
4329 }
4330
4331 if(occupied == 0) {
4332 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4333 *empty = TRUE;
4334 }
4335 if(object == shadow_object)
4336 vm_object_paging_end(shadow_object);
4337 }
4338 vm_object_unlock(shadow_object);
4339 if (object != shadow_object)
4340 vm_object_unlock(object);
4341
4342 upl_unlock(upl);
4343
4344 return KERN_SUCCESS;
4345 }
4346
4347 kern_return_t
4348 upl_abort(
4349 upl_t upl,
4350 int error)
4351 {
4352 vm_object_t object = NULL;
4353 vm_object_t shadow_object = NULL;
4354 vm_object_offset_t offset;
4355 vm_object_offset_t shadow_offset;
4356 vm_object_offset_t target_offset;
4357 upl_size_t i;
4358 wpl_array_t lite_list;
4359 vm_page_t t,m;
4360 int occupied;
4361 boolean_t shadow_internal;
4362
4363 if (upl == UPL_NULL)
4364 return KERN_INVALID_ARGUMENT;
4365
4366 if (upl->flags & UPL_IO_WIRE) {
4367 boolean_t empty;
4368 return upl_commit_range(upl,
4369 0, upl->size, 0,
4370 NULL, 0, &empty);
4371 }
4372
4373 upl_lock(upl);
4374 if(upl->flags & UPL_DEVICE_MEMORY) {
4375 upl_unlock(upl);
4376 return KERN_SUCCESS;
4377 }
4378
4379 object = upl->map_object;
4380
4381 if (object == NULL) {
4382 panic("upl_abort: upl object is not backed by an object");
4383 upl_unlock(upl);
4384 return KERN_INVALID_ARGUMENT;
4385 }
4386
4387 if(object->pageout) {
4388 shadow_object = object->shadow;
4389 shadow_offset = object->shadow_offset;
4390 } else {
4391 shadow_object = object;
4392 shadow_offset = upl->offset - object->paging_offset;
4393 }
4394
4395 if(upl->flags & UPL_INTERNAL) {
4396 lite_list = (wpl_array_t)
4397 ((((uintptr_t)upl) + sizeof(struct upl))
4398 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4399 } else {
4400 lite_list = (wpl_array_t)
4401 (((uintptr_t)upl) + sizeof(struct upl));
4402 }
4403 offset = 0;
4404
4405 if (object != shadow_object)
4406 vm_object_lock(object);
4407 vm_object_lock(shadow_object);
4408
4409 shadow_internal = shadow_object->internal;
4410
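/*
 * Unlike upl_abort_range, this walks the entire UPL a page at a
 * time; the per-page handling below mirrors the range version.
 */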
4411 for(i = 0; i<(upl->size); i+=PAGE_SIZE, offset += PAGE_SIZE_64) {
4412 m = VM_PAGE_NULL;
4413 target_offset = offset + shadow_offset;
4414 if(upl->flags & UPL_LITE) {
4415 int pg_num;
4416 pg_num = offset/PAGE_SIZE;
4417 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4418 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4419 m = vm_page_lookup(
4420 shadow_object, target_offset);
4421 }
4422 }
4423 if(object->pageout) {
4424 if ((t = vm_page_lookup(object, offset)) != NULL) {
4425 t->pageout = FALSE;
4426 VM_PAGE_FREE(t);
4427 if(m == NULL) {
4428 m = vm_page_lookup(
4429 shadow_object, target_offset);
4430 }
4431 if(m != VM_PAGE_NULL)
4432 vm_object_paging_end(m->object);
4433 }
4434 }
4435 if(m != VM_PAGE_NULL) {
4436 vm_page_lock_queues();
4437 if(m->absent) {
4438 boolean_t must_free = TRUE;
4439
4440 /* COPYOUT = FALSE case */
4441 /* check for error conditions which must */
4442 /* be passed back to the page's customer */
4443 if(error & UPL_ABORT_RESTART) {
4444 m->restart = TRUE;
4445 m->absent = FALSE;
4446 vm_object_absent_release(m->object);
4447 m->page_error = KERN_MEMORY_ERROR;
4448 m->error = TRUE;
4449 must_free = FALSE;
4450 } else if(error & UPL_ABORT_UNAVAILABLE) {
4451 m->restart = FALSE;
4452 m->unusual = TRUE;
4453 must_free = FALSE;
4454 } else if(error & UPL_ABORT_ERROR) {
4455 m->restart = FALSE;
4456 m->absent = FALSE;
4457 vm_object_absent_release(m->object);
4458 m->page_error = KERN_MEMORY_ERROR;
4459 m->error = TRUE;
4460 must_free = FALSE;
4461 }
4462
4463 /*
4464 * ENCRYPTED SWAP:
4465 * If the page was already encrypted,
4466 * we don't really need to decrypt it
4467 * now. It will get decrypted later,
4468 * on demand, as soon as someone needs
4469 * to access its contents.
4470 */
4471
4472 m->cleaning = FALSE;
4473 m->overwriting = FALSE;
4474 PAGE_WAKEUP_DONE(m);
4475
4476 if (must_free == TRUE) {
4477 vm_page_free(m);
4478 } else {
4479 vm_page_activate(m);
4480 }
4481 vm_page_unlock_queues();
4482 continue;
4483 }
4484 /*
4485 * Handle the trusted pager throttle.
4486 */
4487 if (m->laundry) {
4488 vm_pageout_throttle_up(m);
4489 }
4490 if(m->pageout) {
4491 assert(m->busy);
4492 assert(m->wire_count == 1);
4493 m->pageout = FALSE;
4494 vm_page_unwire(m);
4495 }
4496 m->dump_cleaning = FALSE;
4497 m->cleaning = FALSE;
4498 m->overwriting = FALSE;
4499 #if MACH_PAGEMAP
4500 vm_external_state_clr(
4501 m->object->existence_map, m->offset);
4502 #endif /* MACH_PAGEMAP */
4503 if(error & UPL_ABORT_DUMP_PAGES) {
4504 vm_page_free(m);
4505 pmap_disconnect(m->phys_page);
4506 } else {
4507 PAGE_WAKEUP_DONE(m);
4508 }
4509 vm_page_unlock_queues();
4510 }
4511 }
4512 occupied = 1;
4513 if (upl->flags & UPL_DEVICE_MEMORY) {
4514 occupied = 0;
4515 } else if (upl->flags & UPL_LITE) {
4516 int pg_num;
4517 int j;
4518 pg_num = upl->size/PAGE_SIZE;
4519 pg_num = (pg_num + 31) >> 5;
4520 occupied = 0;
4521 for(j= 0; j<pg_num; j++) {
4522 if(lite_list[j] != 0) {
4523 occupied = 1;
4524 break;
4525 }
4526 }
4527 } else {
4528 if(queue_empty(&upl->map_object->memq)) {
4529 occupied = 0;
4530 }
4531 }
4532
4533 if(occupied == 0) {
4534 if(object == shadow_object)
4535 vm_object_paging_end(shadow_object);
4536 }
4537 vm_object_unlock(shadow_object);
4538 if (object != shadow_object)
4539 vm_object_unlock(object);
4540
4541 upl_unlock(upl);
4542 return KERN_SUCCESS;
4543 }
4544
4545 /* an option on commit should be wire */
4546 kern_return_t
4547 upl_commit(
4548 upl_t upl,
4549 upl_page_info_t *page_list,
4550 mach_msg_type_number_t count)
4551 {
4552 if (upl == UPL_NULL)
4553 return KERN_INVALID_ARGUMENT;
4554
4555 if(upl->flags & (UPL_LITE | UPL_IO_WIRE)) {
4556 boolean_t empty;
4557 return upl_commit_range(upl, 0, upl->size, 0,
4558 page_list, count, &empty);
4559 }
4560
4561 if (count == 0)
4562 page_list = NULL;
4563
4564 upl_lock(upl);
4565 if (upl->flags & UPL_DEVICE_MEMORY)
4566 page_list = NULL;
4567
4568 if (upl->flags & UPL_ENCRYPTED) {
4569 /*
4570 * ENCRYPTED SWAP:
4571 * This UPL was encrypted, but we don't need
4572 * to decrypt here. We'll decrypt each page
4573 * later, on demand, as soon as someone needs
4574 * to access the page's contents.
4575 */
4576 }
4577
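/*
 * Only walk the pages when there is per-page work to do: clearing
 * dirty bits, stripping the precious bit after a page sync, or
 * honoring a caller-supplied page list.
 */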
4578 if ((upl->flags & UPL_CLEAR_DIRTY) ||
4579 (upl->flags & UPL_PAGE_SYNC_DONE) || page_list) {
4580 vm_object_t shadow_object = upl->map_object->shadow;
4581 vm_object_t object = upl->map_object;
4582 vm_object_offset_t target_offset;
4583 upl_size_t xfer_end;
4584 int entry;
4585
4586 vm_page_t t, m;
4587 upl_page_info_t *p;
4588
4589 if (object != shadow_object)
4590 vm_object_lock(object);
4591 vm_object_lock(shadow_object);
4592
4593 entry = 0;
4594 target_offset = object->shadow_offset;
4595 xfer_end = upl->size + object->shadow_offset;
4596
4597 while(target_offset < xfer_end) {
4598
4599 if ((t = vm_page_lookup(object,
4600 target_offset - object->shadow_offset))
4601 == NULL) {
4602 target_offset += PAGE_SIZE_64;
4603 entry++;
4604 continue;
4605 }
4606
4607 m = vm_page_lookup(shadow_object, target_offset);
4608 if(m != VM_PAGE_NULL) {
4609 /*
4610 * ENCRYPTED SWAP:
4611 * If this page was encrypted, we
4612 * don't need to decrypt it here.
4613 * We'll decrypt it later, on demand,
4614 * as soon as someone needs to access
4615 * its contents.
4616 */
4617
4618 if (upl->flags & UPL_CLEAR_DIRTY) {
4619 pmap_clear_modify(m->phys_page);
4620 m->dirty = FALSE;
4621 }
4622 /* It is part of the semantics of */
4623 /* COPYOUT_FROM UPLs that a commit */
4624 /* implies a cache sync between the */
4625 /* vm page and the backing store; */
4626 /* this can be used to strip the */
4627 /* precious bit as well as clean */
4628 if (upl->flags & UPL_PAGE_SYNC_DONE)
4629 m->precious = FALSE;
4630
4631 if(page_list) {
4632 p = &(page_list[entry]);
4633 if(page_list[entry].phys_addr &&
4634 p->pageout && !m->pageout) {
4635 vm_page_lock_queues();
4636 m->busy = TRUE;
4637 m->pageout = TRUE;
4638 vm_page_wire(m);
4639 vm_page_unlock_queues();
4640 } else if (page_list[entry].phys_addr &&
4641 !p->pageout && m->pageout &&
4642 !m->dump_cleaning) {
4643 vm_page_lock_queues();
4644 m->pageout = FALSE;
4645 m->absent = FALSE;
4646 m->overwriting = FALSE;
4647 vm_page_unwire(m);
4648 PAGE_WAKEUP_DONE(m);
4649 vm_page_unlock_queues();
4650 }
4651 page_list[entry].phys_addr = 0;
4652 }
4653 }
4654 target_offset += PAGE_SIZE_64;
4655 entry++;
4656 }
4657 vm_object_unlock(shadow_object);
4658 if (object != shadow_object)
4659 vm_object_unlock(object);
4660
4661 }
4662 if (upl->flags & UPL_DEVICE_MEMORY) {
4663 vm_object_lock(upl->map_object->shadow);
4664 if(upl->map_object == upl->map_object->shadow)
4665 vm_object_paging_end(upl->map_object->shadow);
4666 vm_object_unlock(upl->map_object->shadow);
4667 }
4668 upl_unlock(upl);
4669 return KERN_SUCCESS;
4670 }
4671
4672
4673
4674 kern_return_t
4675 vm_object_iopl_request(
4676 vm_object_t object,
4677 vm_object_offset_t offset,
4678 upl_size_t size,
4679 upl_t *upl_ptr,
4680 upl_page_info_array_t user_page_list,
4681 unsigned int *page_list_count,
4682 int cntrl_flags)
4683 {
4684 vm_page_t dst_page;
4685 vm_object_offset_t dst_offset = offset;
4686 upl_size_t xfer_size = size;
4687 upl_t upl = NULL;
4688 unsigned int entry;
4689 wpl_array_t lite_list = NULL;
4690 int page_field_size;
4691 int delayed_unlock = 0;
4692 int no_zero_fill = FALSE;
4693 vm_page_t alias_page = NULL;
4694 kern_return_t ret;
4695 vm_prot_t prot;
4696
4697
4698 if (cntrl_flags & ~UPL_VALID_FLAGS) {
4699 /*
4700 * For forward compatibility's sake,
4701 * reject any unknown flag.
4702 */
4703 return KERN_INVALID_VALUE;
4704 }
4705 if (vm_lopage_poolsize == 0)
4706 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4707
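/*
 * UPL_NEED_32BIT_ADDR is only honored for wired lite UPLs.  For a
 * physically contiguous object the backing pages can't be
 * substituted, so the whole range must already sit below
 * max_valid_dma_address.
 */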
4708 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4709 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4710 return KERN_INVALID_VALUE;
4711
4712 if (object->phys_contiguous) {
4713 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4714 return KERN_INVALID_ADDRESS;
4715
4716 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4717 return KERN_INVALID_ADDRESS;
4718 }
4719 }
4720
4721 if (cntrl_flags & UPL_ENCRYPT) {
4722 /*
4723 * ENCRYPTED SWAP:
4724 * The paging path doesn't use this interface,
4725 * so we don't support the UPL_ENCRYPT flag
4726 * here. We won't encrypt the pages.
4727 */
4728 assert(! (cntrl_flags & UPL_ENCRYPT));
4729 }
4730
4731 if (cntrl_flags & UPL_NOZEROFILL)
4732 no_zero_fill = TRUE;
4733
4734 if (cntrl_flags & UPL_COPYOUT_FROM)
4735 prot = VM_PROT_READ;
4736 else
4737 prot = VM_PROT_READ | VM_PROT_WRITE;
4738
4739 if(((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
4740 size = MAX_UPL_TRANSFER * page_size;
4741 }
4742
4743 if(cntrl_flags & UPL_SET_INTERNAL)
4744 if(page_list_count != NULL)
4745 *page_list_count = MAX_UPL_TRANSFER;
4746 if(((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4747 ((page_list_count != NULL) && (*page_list_count != 0)
4748 && *page_list_count < (size/page_size)))
4749 return KERN_INVALID_ARGUMENT;
4750
4751 if((!object->internal) && (object->paging_offset != 0))
4752 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
4753
4754 if(object->phys_contiguous) {
4755 /* No paging operations are possible against this memory */
4756 /* and so no need for map object, ever */
4757 cntrl_flags |= UPL_SET_LITE;
4758 }
4759
4760 if(upl_ptr) {
4761 if(cntrl_flags & UPL_SET_INTERNAL) {
4762 if(cntrl_flags & UPL_SET_LITE) {
4763 upl = upl_create(
4764 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
4765 size);
4766 user_page_list = (upl_page_info_t *)
4767 (((uintptr_t)upl) + sizeof(struct upl));
4768 lite_list = (wpl_array_t)
4769 (((uintptr_t)user_page_list) +
4770 ((size/PAGE_SIZE) *
4771 sizeof(upl_page_info_t)));
4772 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4773 page_field_size =
4774 (page_field_size + 3) & 0xFFFFFFFC;
4775 bzero((char *)lite_list, page_field_size);
4776 upl->flags =
4777 UPL_LITE | UPL_INTERNAL | UPL_IO_WIRE;
4778 } else {
4779 upl = upl_create(UPL_CREATE_INTERNAL, size);
4780 user_page_list = (upl_page_info_t *)
4781 (((uintptr_t)upl)
4782 + sizeof(struct upl));
4783 upl->flags = UPL_INTERNAL | UPL_IO_WIRE;
4784 }
4785 } else {
4786 if(cntrl_flags & UPL_SET_LITE) {
4787 upl = upl_create(UPL_CREATE_LITE, size);
4788 lite_list = (wpl_array_t)
4789 (((uintptr_t)upl) + sizeof(struct upl));
4790 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4791 page_field_size =
4792 (page_field_size + 3) & 0xFFFFFFFC;
4793 bzero((char *)lite_list, page_field_size);
4794 upl->flags = UPL_LITE | UPL_IO_WIRE;
4795 } else {
4796 upl = upl_create(UPL_CREATE_EXTERNAL, size);
4797 upl->flags = UPL_IO_WIRE;
4798 }
4799 }
4800
4801 if(object->phys_contiguous) {
4802 upl->map_object = object;
4803 /* don't need any shadow mappings for this one */
4804 /* since it is already I/O memory */
4805 upl->flags |= UPL_DEVICE_MEMORY;
4806
4807 vm_object_lock(object);
4808 vm_object_paging_begin(object);
4809 vm_object_unlock(object);
4810
4811 /* paging in progress also protects the paging_offset */
4812 upl->offset = offset + object->paging_offset;
4813 upl->size = size;
4814 *upl_ptr = upl;
4815 if(user_page_list) {
4816 user_page_list[0].phys_addr =
4817 (offset + object->shadow_offset)>>PAGE_SHIFT;
4818 user_page_list[0].device = TRUE;
4819 }
4820 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4821
4822 if(page_list_count != NULL) {
4823 if (upl->flags & UPL_INTERNAL) {
4824 *page_list_count = 0;
4825 } else {
4826 *page_list_count = 1;
4827 }
4828 }
4829 return KERN_SUCCESS;
4830 }
4831 if(user_page_list)
4832 user_page_list[0].device = FALSE;
4833
4834 if(cntrl_flags & UPL_SET_LITE) {
4835 upl->map_object = object;
4836 } else {
4837 upl->map_object = vm_object_allocate(size);
4838 vm_object_lock(upl->map_object);
4839 upl->map_object->shadow = object;
4840 upl->map_object->pageout = TRUE;
4841 upl->map_object->can_persist = FALSE;
4842 upl->map_object->copy_strategy =
4843 MEMORY_OBJECT_COPY_NONE;
4844 upl->map_object->shadow_offset = offset;
4845 upl->map_object->wimg_bits = object->wimg_bits;
4846 vm_object_unlock(upl->map_object);
4847 }
4848 }
4849 vm_object_lock(object);
4850 vm_object_paging_begin(object);
4851
4852 if (!object->phys_contiguous) {
4853 /* Protect user space from future COW operations */
4854 object->true_share = TRUE;
4855 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4856 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4857 }
4858
4859 /* we can lock the upl offset now that paging_in_progress is set */
4860 if(upl_ptr) {
4861 upl->size = size;
4862 upl->offset = offset + object->paging_offset;
4863 *upl_ptr = upl;
4864 #ifdef UPL_DEBUG
4865 queue_enter(&object->uplq, upl, upl_t, uplq);
4866 #endif /* UPL_DEBUG */
4867 }
4868
4869 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4870 /*
4871 * The user requested that access to the pages in this UPL
4872 * be blocked until the UPL is committed or aborted.
4873 */
4874 upl->flags |= UPL_ACCESS_BLOCKED;
4875 }
4876
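/*
 * Main loop: for each page in the request, fault it in if it is
 * missing, encrypted or otherwise not usable, optionally swap in a
 * page from the low-memory pool for 32-bit DMA, wire it, and record
 * it in the lite bitmap (or the shadow object) and the caller's
 * page list.
 */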
4877 entry = 0;
4878 while (xfer_size) {
4879 if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
4880 if (delayed_unlock) {
4881 delayed_unlock = 0;
4882 vm_page_unlock_queues();
4883 }
4884 vm_object_unlock(object);
4885 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4886 vm_object_lock(object);
4887 }
4888 dst_page = vm_page_lookup(object, dst_offset);
4889
4890 /*
4891 * ENCRYPTED SWAP:
4892 * If the page is encrypted, we need to decrypt it,
4893 * so force a soft page fault.
4894 */
4895 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4896 (dst_page->encrypted) ||
4897 (dst_page->unusual && (dst_page->error ||
4898 dst_page->restart ||
4899 dst_page->absent ||
4900 dst_page->fictitious ||
4901 (prot & dst_page->page_lock)))) {
4902 vm_fault_return_t result;
4903 do {
4904 vm_page_t top_page;
4905 kern_return_t error_code;
4906 int interruptible;
4907
4908 vm_object_offset_t lo_offset = offset;
4909 vm_object_offset_t hi_offset = offset + size;
4910
4911
4912 if (delayed_unlock) {
4913 delayed_unlock = 0;
4914 vm_page_unlock_queues();
4915 }
4916
4917 if(cntrl_flags & UPL_SET_INTERRUPTIBLE) {
4918 interruptible = THREAD_ABORTSAFE;
4919 } else {
4920 interruptible = THREAD_UNINT;
4921 }
4922
4923 result = vm_fault_page(object, dst_offset,
4924 prot | VM_PROT_WRITE, FALSE,
4925 interruptible,
4926 lo_offset, hi_offset,
4927 VM_BEHAVIOR_SEQUENTIAL,
4928 &prot, &dst_page, &top_page,
4929 (int *)0,
4930 &error_code, no_zero_fill, FALSE, NULL, 0);
4931
4932 switch(result) {
4933 case VM_FAULT_SUCCESS:
4934
4935 PAGE_WAKEUP_DONE(dst_page);
4936
4937 /*
4938 * Release paging references and
4939 * top-level placeholder page, if any.
4940 */
4941
4942 if(top_page != VM_PAGE_NULL) {
4943 vm_object_t local_object;
4944 local_object =
4945 top_page->object;
4946 if(top_page->object
4947 != dst_page->object) {
4948 vm_object_lock(
4949 local_object);
4950 VM_PAGE_FREE(top_page);
4951 vm_object_paging_end(
4952 local_object);
4953 vm_object_unlock(
4954 local_object);
4955 } else {
4956 VM_PAGE_FREE(top_page);
4957 vm_object_paging_end(
4958 local_object);
4959 }
4960 }
4961
4962 break;
4963
4964
4965 case VM_FAULT_RETRY:
4966 vm_object_lock(object);
4967 vm_object_paging_begin(object);
4968 break;
4969
4970 case VM_FAULT_FICTITIOUS_SHORTAGE:
4971 vm_page_more_fictitious();
4972 vm_object_lock(object);
4973 vm_object_paging_begin(object);
4974 break;
4975
4976 case VM_FAULT_MEMORY_SHORTAGE:
4977 if (vm_page_wait(interruptible)) {
4978 vm_object_lock(object);
4979 vm_object_paging_begin(object);
4980 break;
4981 }
4982 /* fall thru */
4983
4984 case VM_FAULT_INTERRUPTED:
4985 error_code = MACH_SEND_INTERRUPTED;
4986 case VM_FAULT_MEMORY_ERROR:
4987 ret = (error_code ? error_code:
4988 KERN_MEMORY_ERROR);
4989 vm_object_lock(object);
4990
4991 goto return_err;
4992 }
4993 } while ((result != VM_FAULT_SUCCESS)
4994 || (result == VM_FAULT_INTERRUPTED));
4995 }
4996
4997 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
4998 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
4999 vm_page_t low_page;
5000 int refmod;
5001
5002 /*
5003 * support devices that can't DMA above 32 bits
5004 * by substituting pages from a pool of low-address
5005 * memory for any pages we find above the 4G mark.
5006 * We can't substitute if the page is already wired, because
5007 * we don't know whether that physical address has been
5008 * handed out to some other 64-bit-capable DMA device to use.
5009 */
5010 if (dst_page->wire_count) {
5011 ret = KERN_PROTECTION_FAILURE;
5012 goto return_err;
5013 }
5014 if (delayed_unlock) {
5015 delayed_unlock = 0;
5016 vm_page_unlock_queues();
5017 }
5018 low_page = vm_page_grablo();
5019
5020 if (low_page == VM_PAGE_NULL) {
5021 ret = KERN_RESOURCE_SHORTAGE;
5022 goto return_err;
5023 }
5024 /*
5025 * from here until the vm_page_replace completes
5026 * we mustn't drop the object lock... we don't
5027 * want anyone refaulting this page in and using
5028 * it after we disconnect it... we want the fault
5029 * to find the new page being substituted.
5030 */
5031 refmod = pmap_disconnect(dst_page->phys_page);
5032
5033 vm_page_copy(dst_page, low_page);
5034
5035 low_page->reference = dst_page->reference;
5036 low_page->dirty = dst_page->dirty;
5037
5038 if (refmod & VM_MEM_REFERENCED)
5039 low_page->reference = TRUE;
5040 if (refmod & VM_MEM_MODIFIED)
5041 low_page->dirty = TRUE;
5042
5043 vm_page_lock_queues();
5044 vm_page_replace(low_page, object, dst_offset);
5045 /*
5046 * keep the queue lock since we're going to
5047 * need it immediately
5048 */
5049 delayed_unlock = 1;
5050
5051 dst_page = low_page;
5052 /*
5053 * vm_page_grablo returned the page marked
5054 * BUSY... we don't need a PAGE_WAKEUP_DONE
5055 * here, because we've never dropped the object lock
5056 */
5057 dst_page->busy = FALSE;
5058 }
5059 if (delayed_unlock == 0)
5060 vm_page_lock_queues();
5061 vm_page_wire(dst_page);
5062
5063 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5064 /*
5065 * Mark the page "busy" to block any future page fault
5066 * on this page. We'll also remove the mapping
5067 * of all these pages before leaving this routine.
5068 */
5069 assert(!dst_page->fictitious);
5070 dst_page->busy = TRUE;
5071 }
5072
5073 if (upl_ptr) {
5074 if (cntrl_flags & UPL_SET_LITE) {
5075 int pg_num;
5076 pg_num = (dst_offset-offset)/PAGE_SIZE;
5077 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5078 } else {
5079 /*
5080 * Convert the fictitious page to a
5081 * private shadow of the real page.
5082 */
5083 assert(alias_page->fictitious);
5084 alias_page->fictitious = FALSE;
5085 alias_page->private = TRUE;
5086 alias_page->pageout = TRUE;
5087 alias_page->phys_page = dst_page->phys_page;
5088 vm_page_wire(alias_page);
5089
5090 vm_page_insert(alias_page,
5091 upl->map_object, size - xfer_size);
5092 assert(!alias_page->wanted);
5093 alias_page->busy = FALSE;
5094 alias_page->absent = FALSE;
5095 }
5096
5097 /* expect the page to be used */
5098 dst_page->reference = TRUE;
5099
5100 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5101 dst_page->dirty = TRUE;
5102 alias_page = NULL;
5103
5104 if (dst_page->phys_page > upl->highest_page)
5105 upl->highest_page = dst_page->phys_page;
5106
5107 if (user_page_list) {
5108 user_page_list[entry].phys_addr
5109 = dst_page->phys_page;
5110 user_page_list[entry].dirty =
5111 dst_page->dirty;
5112 user_page_list[entry].pageout =
5113 dst_page->pageout;
5114 user_page_list[entry].absent =
5115 dst_page->absent;
5116 user_page_list[entry].precious =
5117 dst_page->precious;
5118 }
5119 }
5120 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
5121 delayed_unlock = 0;
5122 vm_page_unlock_queues();
5123 }
5124 entry++;
5125 dst_offset += PAGE_SIZE_64;
5126 xfer_size -= PAGE_SIZE;
5127 }
5128 if (delayed_unlock)
5129 vm_page_unlock_queues();
5130
5131 if (upl->flags & UPL_INTERNAL) {
5132 if(page_list_count != NULL)
5133 *page_list_count = 0;
5134 } else if (*page_list_count > entry) {
5135 if(page_list_count != NULL)
5136 *page_list_count = entry;
5137 }
5138
5139 if (alias_page != NULL) {
5140 vm_page_lock_queues();
5141 vm_page_free(alias_page);
5142 vm_page_unlock_queues();
5143 }
5144
5145 vm_object_unlock(object);
5146
5147 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5148 /*
5149 * We've marked all the pages "busy" so that future
5150 * page faults will block.
5151 * Now remove the mapping for these pages, so that they
5152 * can't be accessed without causing a page fault.
5153 */
5154 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5155 PMAP_NULL, 0, VM_PROT_NONE);
5156 }
5157
5158 return KERN_SUCCESS;
5159
5160
5161 return_err:
5162 if (delayed_unlock)
5163 vm_page_unlock_queues();
5164
5165 for (; offset < dst_offset; offset += PAGE_SIZE) {
5166 dst_page = vm_page_lookup(object, offset);
5167
5168 if (dst_page == VM_PAGE_NULL)
5169 panic("vm_object_iopl_request: Wired pages missing. \n");
5170 vm_page_lock_queues();
5171 vm_page_unwire(dst_page);
5172 vm_page_unlock_queues();
5173 VM_STAT(reactivations++);
5174 }
5175 vm_object_paging_end(object);
5176 vm_object_unlock(object);
5177 upl_destroy(upl);
5178
5179 return ret;
5180 }
5181
5182
5183 kern_return_t
5184 upl_transpose(
5185 upl_t upl1,
5186 upl_t upl2)
5187 {
5188 kern_return_t retval;
5189 boolean_t upls_locked;
5190 vm_object_t object1, object2;
5191
5192 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5193 return KERN_INVALID_ARGUMENT;
5194 }
5195
5196 upls_locked = FALSE;
5197
5198 /*
5199 * Since we need to lock both UPLs at the same time,
5200 * avoid deadlocks by always taking locks in the same order.
5201 */
5202 if (upl1 < upl2) {
5203 upl_lock(upl1);
5204 upl_lock(upl2);
5205 } else {
5206 upl_lock(upl2);
5207 upl_lock(upl1);
5208 }
5209 upls_locked = TRUE; /* the UPLs will need to be unlocked */
5210
5211 object1 = upl1->map_object;
5212 object2 = upl2->map_object;
5213
5214 if (upl1->offset != 0 || upl2->offset != 0 ||
5215 upl1->size != upl2->size) {
5216 /*
5217 * We deal only with full objects, not subsets.
5218 * That's because we exchange the entire backing store info
5219 * for the objects: pager, resident pages, etc... We can't do
5220 * only part of it.
5221 */
5222 retval = KERN_INVALID_VALUE;
5223 goto done;
5224 }
5225
5226 /*
5227 * Transpose the VM objects' backing store.
5228 */
5229 retval = vm_object_transpose(object1, object2,
5230 (vm_object_size_t) upl1->size);
5231
5232 if (retval == KERN_SUCCESS) {
5233 /*
5234 * Make each UPL point to the correct VM object, i.e. the
5235 * object holding the pages that the UPL refers to...
5236 */
5237 upl1->map_object = object2;
5238 upl2->map_object = object1;
5239 }
5240
5241 done:
5242 /*
5243 * Cleanup.
5244 */
5245 if (upls_locked) {
5246 upl_unlock(upl1);
5247 upl_unlock(upl2);
5248 upls_locked = FALSE;
5249 }
5250
5251 return retval;
5252 }
5253
5254 /*
5255 * ENCRYPTED SWAP:
5256 *
5257 * Rationale: the user might have some encrypted data on disk (via
5258 * FileVault or any other mechanism). That data is then decrypted in
5259 * memory, which is safe as long as the machine is secure. But that
5260 * decrypted data in memory could be paged out to disk by the default
5261 * pager. The data would then be stored on disk in clear (not encrypted)
5262 * and it could be accessed by anyone who gets physical access to the
5263 * disk (if the laptop or the disk gets stolen for example). This weakens
5264 * the security offered by FileVault.
5265 *
5266 * Solution: the default pager will optionally request that all the
5267 * pages it gathers for pageout be encrypted, via the UPL interfaces,
5268 * before it sends this UPL to disk via the vnode_pageout() path.
5269 *
5270 * Notes:
5271 *
5272 * To avoid disrupting the VM LRU algorithms, we want to keep the
5273 * clean-in-place mechanisms, which allow us to send some extra pages to
5274 * swap (clustering) without actually removing them from the user's
5275 * address space. We don't want the user to unknowingly access encrypted
5276 * data, so we have to actually remove the encrypted pages from the page
5277 * table. When the user accesses the data, the hardware will fail to
5278 * locate the virtual page in its page table and will trigger a page
5279 * fault. We can then decrypt the page and enter it in the page table
5280 * again. Whenever we allow the user to access the contents of a page,
5281 * we have to make sure it's not encrypted.
5282 *
5283 *
5284 */
5285 /*
5286 * ENCRYPTED SWAP:
5287 * Reserve of virtual addresses in the kernel address space.
5288 * We need to map the physical pages in the kernel, so that we
5289 * can call the encryption/decryption routines with a kernel
5290 * virtual address. We keep this pool of pre-allocated kernel
5291 * virtual addresses so that we don't have to scan the kernel's
5292 * virtual address space each time we need to encrypt or decrypt
5293 * a physical page.
5294 * It would be nice to be able to encrypt and decrypt in physical
5295 * mode but that might not always be more efficient...
5296 */
5297 decl_simple_lock_data(,vm_paging_lock)
5298 #define VM_PAGING_NUM_PAGES 64
5299 vm_map_offset_t vm_paging_base_address = 0;
5300 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5301 int vm_paging_max_index = 0;
5302 unsigned long vm_paging_no_kernel_page = 0;
5303 unsigned long vm_paging_objects_mapped = 0;
5304 unsigned long vm_paging_pages_mapped = 0;
5305 unsigned long vm_paging_objects_mapped_slow = 0;
5306 unsigned long vm_paging_pages_mapped_slow = 0;
5307
5308 /*
5309 * ENCRYPTED SWAP:
5310 * vm_paging_map_object:
5311 * Maps part of a VM object's pages in the kernel
5312 * virtual address space, using the pre-allocated
5313 * kernel virtual addresses, if possible.
5314 * Context:
5315 * The VM object is locked. This lock will get
5316 * dropped and re-acquired though.
5317 */
5318 kern_return_t
5319 vm_paging_map_object(
5320 vm_map_offset_t *address,
5321 vm_page_t page,
5322 vm_object_t object,
5323 vm_object_offset_t offset,
5324 vm_map_size_t *size)
5325 {
5326 kern_return_t kr;
5327 vm_map_offset_t page_map_offset;
5328 vm_map_size_t map_size;
5329 vm_object_offset_t object_offset;
5330 #ifdef __ppc__
5331 int i;
5332 vm_map_entry_t map_entry;
5333 #endif /* __ppc__ */
5334
5335
5336 #ifdef __ppc__
5337 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5338 /*
5339 * Optimization for the PowerPC.
5340 * Use one of the pre-allocated kernel virtual addresses
5341 * and just enter the VM page in the kernel address space
5342 * at that virtual address.
5343 */
5344 vm_object_unlock(object);
5345 simple_lock(&vm_paging_lock);
5346
5347 if (vm_paging_base_address == 0) {
5348 /*
5349 * Initialize our pool of pre-allocated kernel
5350 * virtual addresses.
5351 */
5352 simple_unlock(&vm_paging_lock);
5353 page_map_offset = 0;
5354 kr = vm_map_find_space(kernel_map,
5355 &page_map_offset,
5356 VM_PAGING_NUM_PAGES * PAGE_SIZE,
5357 0,
5358 0,
5359 &map_entry);
5360 if (kr != KERN_SUCCESS) {
5361 panic("vm_paging_map_object: "
5362 "kernel_map full\n");
5363 }
5364 map_entry->object.vm_object = kernel_object;
5365 map_entry->offset =
5366 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5367 vm_object_reference(kernel_object);
5368 vm_map_unlock(kernel_map);
5369
5370 simple_lock(&vm_paging_lock);
5371 if (vm_paging_base_address != 0) {
5372 /* someone raced us and won: undo */
5373 simple_unlock(&vm_paging_lock);
5374 kr = vm_map_remove(kernel_map,
5375 page_map_offset,
5376 page_map_offset +
5377 (VM_PAGING_NUM_PAGES
5378 * PAGE_SIZE),
5379 VM_MAP_NO_FLAGS);
5380 assert(kr == KERN_SUCCESS);
5381 simple_lock(&vm_paging_lock);
5382 } else {
5383 vm_paging_base_address = page_map_offset;
5384 }
5385 }
5386
5387 /*
5388 * Try and find an available kernel virtual address
5389 * from our pre-allocated pool.
5390 */
5391 page_map_offset = 0;
5392 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5393 if (vm_paging_page_inuse[i] == FALSE) {
5394 page_map_offset = vm_paging_base_address +
5395 (i * PAGE_SIZE);
5396 break;
5397 }
5398 }
5399
5400 if (page_map_offset != 0) {
5401 /*
5402 * We found a kernel virtual address;
5403 * map the physical page to that virtual address.
5404 */
5405 if (i > vm_paging_max_index) {
5406 vm_paging_max_index = i;
5407 }
5408 vm_paging_page_inuse[i] = TRUE;
5409 simple_unlock(&vm_paging_lock);
5410 pmap_map_block(kernel_pmap,
5411 page_map_offset,
5412 page->phys_page,
5413 1, /* Size is number of 4k pages */
5414 VM_PROT_DEFAULT,
5415 ((int) page->object->wimg_bits &
5416 VM_WIMG_MASK),
5417 0);
5418 vm_paging_objects_mapped++;
5419 vm_paging_pages_mapped++;
5420 *address = page_map_offset;
5421 vm_object_lock(object);
5422
5423 /* all done and mapped, ready to use ! */
5424 return KERN_SUCCESS;
5425 }
5426
5427 /*
5428 * We ran out of pre-allocated kernel virtual
5429 * addresses. Just map the page in the kernel
5430 * the slow and regular way.
5431 */
5432 vm_paging_no_kernel_page++;
5433 simple_unlock(&vm_paging_lock);
5434 vm_object_lock(object);
5435 }
5436 #endif /* __ppc__ */
5437
5438 object_offset = vm_object_trunc_page(offset);
5439 map_size = vm_map_round_page(*size);
5440
5441 /*
5442 * Try and map the required range of the object
5443 * in the kernel_map
5444 */
5445
5446 /* don't go beyond the object's end... */
5447 if (object_offset >= object->size) {
5448 map_size = 0;
5449 } else if (map_size > object->size - offset) {
5450 map_size = object->size - offset;
5451 }
5452
5453 vm_object_reference_locked(object); /* for the map entry */
5454 vm_object_unlock(object);
5455
5456 kr = vm_map_enter(kernel_map,
5457 address,
5458 map_size,
5459 0,
5460 VM_FLAGS_ANYWHERE,
5461 object,
5462 object_offset,
5463 FALSE,
5464 VM_PROT_DEFAULT,
5465 VM_PROT_ALL,
5466 VM_INHERIT_NONE);
5467 if (kr != KERN_SUCCESS) {
5468 *address = 0;
5469 *size = 0;
5470 vm_object_deallocate(object); /* for the map entry */
5471 return kr;
5472 }
5473
5474 *size = map_size;
5475
5476 /*
5477 * Enter the mapped pages in the page table now.
5478 */
5479 vm_object_lock(object);
5480 for (page_map_offset = 0;
5481 map_size != 0;
5482 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5483 unsigned int cache_attr;
5484
5485 page = vm_page_lookup(object, offset + page_map_offset);
5486 if (page == VM_PAGE_NULL) {
5487 panic("vm_paging_map_object: no page !?");
5488 }
5489 if (page->no_isync == TRUE) {
5490 pmap_sync_page_data_phys(page->phys_page);
5491 }
5492 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5493
5494 PMAP_ENTER(kernel_pmap,
5495 *address + page_map_offset,
5496 page,
5497 VM_PROT_DEFAULT,
5498 cache_attr,
5499 FALSE);
5500 }
5501
5502 vm_paging_objects_mapped_slow++;
5503 vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5504
5505 return KERN_SUCCESS;
5506 }
5507
5508 /*
5509 * ENCRYPTED SWAP:
5510 * vm_paging_unmap_object:
5511 * Unmaps part of a VM object's pages from the kernel
5512 * virtual address space.
5513 * Context:
5514 * The VM object is locked. This lock will get
5515 * dropped and re-acquired though.
5516 */
5517 void
5518 vm_paging_unmap_object(
5519 vm_object_t object,
5520 vm_map_offset_t start,
5521 vm_map_offset_t end)
5522 {
5523 kern_return_t kr;
5524 #ifdef __ppc__
5525 int i;
5526 #endif /* __ppc__ */
5527
5528 if ((vm_paging_base_address == 0) ||
5529 ((start < vm_paging_base_address) ||
5530 (end > (vm_paging_base_address
5531 + (VM_PAGING_NUM_PAGES * PAGE_SIZE))))) {
5532 /*
5533 * We didn't use our pre-allocated pool of
5534 * kernel virtual address. Deallocate the
5535 * virtual memory.
5536 */
5537 if (object != VM_OBJECT_NULL) {
5538 vm_object_unlock(object);
5539 }
5540 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5541 if (object != VM_OBJECT_NULL) {
5542 vm_object_lock(object);
5543 }
5544 assert(kr == KERN_SUCCESS);
5545 } else {
5546 /*
5547 * We used a kernel virtual address from our
5548 * pre-allocated pool. Put it back in the pool
5549 * for next time.
5550 */
5551 #ifdef __ppc__
5552 assert(end - start == PAGE_SIZE);
5553 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5554
5555 /* undo the pmap mapping */
5556 mapping_remove(kernel_pmap, start);
5557
5558 simple_lock(&vm_paging_lock);
5559 vm_paging_page_inuse[i] = FALSE;
5560 simple_unlock(&vm_paging_lock);
5561 #endif /* __ppc__ */
5562 }
5563 }
5564
5565 /*
5566 * Encryption data.
5567 * "iv" is the "initial vector". Ideally, we want to
5568 * have a different one for each page we encrypt, so that
5569 * crackers can't find encryption patterns too easily.
5570 */
5571 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
5572 boolean_t swap_crypt_ctx_initialized = FALSE;
5573 aes_32t swap_crypt_key[8]; /* big enough for a 256 key */
5574 aes_ctx swap_crypt_ctx;
5575 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5576
5577 #if DEBUG
5578 boolean_t swap_crypt_ctx_tested = FALSE;
5579 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5580 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5581 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5582 #endif /* DEBUG */
5583
5584 extern u_long random(void);
5585
5586 /*
5587 * Initialize the encryption context: key and key size.
5588 */
5589 void swap_crypt_ctx_initialize(void); /* forward */
5590 void
5591 swap_crypt_ctx_initialize(void)
5592 {
5593 unsigned int i;
5594
5595 /*
5596 * No need for locking to protect swap_crypt_ctx_initialized
5597 * because the first use of encryption will come from the
5598 * pageout thread (we won't pagein before there's been a pageout)
5599 * and there's only one pageout thread.
5600 */
5601 if (swap_crypt_ctx_initialized == FALSE) {
5602 for (i = 0;
5603 i < (sizeof (swap_crypt_key) /
5604 sizeof (swap_crypt_key[0]));
5605 i++) {
5606 swap_crypt_key[i] = random();
5607 }
5608 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5609 SWAP_CRYPT_AES_KEY_SIZE,
5610 &swap_crypt_ctx.encrypt);
5611 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5612 SWAP_CRYPT_AES_KEY_SIZE,
5613 &swap_crypt_ctx.decrypt);
5614 swap_crypt_ctx_initialized = TRUE;
5615 }
5616
5617 #if DEBUG
5618 /*
5619 * Validate the encryption algorithms.
5620 */
5621 if (swap_crypt_ctx_tested == FALSE) {
5622 /* initialize */
5623 for (i = 0; i < 4096; i++) {
5624 swap_crypt_test_page_ref[i] = (char) i;
5625 }
5626 /* encrypt */
5627 aes_encrypt_cbc(swap_crypt_test_page_ref,
5628 swap_crypt_null_iv,
5629 PAGE_SIZE / AES_BLOCK_SIZE,
5630 swap_crypt_test_page_encrypt,
5631 &swap_crypt_ctx.encrypt);
5632 /* decrypt */
5633 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5634 swap_crypt_null_iv,
5635 PAGE_SIZE / AES_BLOCK_SIZE,
5636 swap_crypt_test_page_decrypt,
5637 &swap_crypt_ctx.decrypt);
5638 /* compare result with original */
5639 for (i = 0; i < 4096; i ++) {
5640 if (swap_crypt_test_page_decrypt[i] !=
5641 swap_crypt_test_page_ref[i]) {
5642 panic("encryption test failed");
5643 }
5644 }
5645
5646 /* encrypt again */
5647 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5648 swap_crypt_null_iv,
5649 PAGE_SIZE / AES_BLOCK_SIZE,
5650 swap_crypt_test_page_decrypt,
5651 &swap_crypt_ctx.encrypt);
5652 /* decrypt in place */
5653 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5654 swap_crypt_null_iv,
5655 PAGE_SIZE / AES_BLOCK_SIZE,
5656 swap_crypt_test_page_decrypt,
5657 &swap_crypt_ctx.decrypt);
5658 for (i = 0; i < 4096; i ++) {
5659 if (swap_crypt_test_page_decrypt[i] !=
5660 swap_crypt_test_page_ref[i]) {
5661 panic("in place encryption test failed");
5662 }
5663 }
5664
5665 swap_crypt_ctx_tested = TRUE;
5666 }
5667 #endif /* DEBUG */
5668 }
5669
5670 /*
5671 * ENCRYPTED SWAP:
5672 * vm_page_encrypt:
5673 * Encrypt the given page, for secure paging.
5674 * The page might already be mapped at kernel virtual
5675 * address "kernel_mapping_offset". Otherwise, we need
5676 * to map it.
5677 *
5678 * Context:
5679 * The page's object is locked, but this lock will be released
5680 * and re-acquired.
5681 * The page is busy and not accessible by users (not entered in any pmap).
5682 */
5683 void
5684 vm_page_encrypt(
5685 vm_page_t page,
5686 vm_map_offset_t kernel_mapping_offset)
5687 {
5688 int clear_refmod = 0;
5689 kern_return_t kr;
5690 boolean_t page_was_referenced;
5691 boolean_t page_was_modified;
5692 vm_map_size_t kernel_mapping_size;
5693 vm_offset_t kernel_vaddr;
5694 union {
5695 unsigned char aes_iv[AES_BLOCK_SIZE];
5696 struct {
5697 memory_object_t pager_object;
5698 vm_object_offset_t paging_offset;
5699 } vm;
5700 } encrypt_iv;
5701
5702 if (! vm_pages_encrypted) {
5703 vm_pages_encrypted = TRUE;
5704 }
5705
5706 assert(page->busy);
5707 assert(page->dirty || page->precious);
5708
5709 if (page->encrypted) {
5710 /*
5711 * Already encrypted: no need to do it again.
5712 */
5713 vm_page_encrypt_already_encrypted_counter++;
5714 return;
5715 }
5716 ASSERT_PAGE_DECRYPTED(page);
5717
5718 /*
5719 * Gather the "reference" and "modified" status of the page.
5720 * We'll restore these values after the encryption, so that
5721 * the encryption is transparent to the rest of the system
5722 * and doesn't impact the VM's LRU logic.
5723 */
5724 page_was_referenced =
5725 (page->reference || pmap_is_referenced(page->phys_page));
5726 page_was_modified =
5727 (page->dirty || pmap_is_modified(page->phys_page));
5728
5729 if (kernel_mapping_offset == 0) {
5730 /*
5731 * The page hasn't already been mapped in kernel space
5732 * by the caller. Map it now, so that we can access
5733 * its contents and encrypt them.
5734 */
5735 kernel_mapping_size = PAGE_SIZE;
5736 kr = vm_paging_map_object(&kernel_mapping_offset,
5737 page,
5738 page->object,
5739 page->offset,
5740 &kernel_mapping_size);
5741 if (kr != KERN_SUCCESS) {
5742 panic("vm_page_encrypt: "
5743 "could not map page in kernel: 0x%x\n",
5744 kr);
5745 }
5746 } else {
5747 kernel_mapping_size = 0;
5748 }
5749 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5750
5751 if (swap_crypt_ctx_initialized == FALSE) {
5752 swap_crypt_ctx_initialize();
5753 }
5754 assert(swap_crypt_ctx_initialized);
5755
5756 /*
5757 * Prepare an "initial vector" for the encryption.
5758 * We use the "pager" and the "paging_offset" for that
5759 * page to obfuscate the encrypted data a bit more and
5760 * prevent crackers from finding patterns that they could
5761 * use to break the key.
5762 */
5763 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5764 encrypt_iv.vm.pager_object = page->object->pager;
5765 encrypt_iv.vm.paging_offset =
5766 page->object->paging_offset + page->offset;
5767
5768 vm_object_unlock(page->object);
5769
5770 /* encrypt the "initial vector" */
5771 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5772 swap_crypt_null_iv,
5773 1,
5774 &encrypt_iv.aes_iv[0],
5775 &swap_crypt_ctx.encrypt);
5776
5777 /*
5778 * Encrypt the page.
5779 */
5780 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5781 &encrypt_iv.aes_iv[0],
5782 PAGE_SIZE / AES_BLOCK_SIZE,
5783 (unsigned char *) kernel_vaddr,
5784 &swap_crypt_ctx.encrypt);
5785
5786 vm_page_encrypt_counter++;
5787
5788 vm_object_lock(page->object);
5789
5790 /*
5791 * Unmap the page from the kernel's address space,
5792 * if we had to map it ourselves. Otherwise, let
5793 * the caller undo the mapping if needed.
5794 */
5795 if (kernel_mapping_size != 0) {
5796 vm_paging_unmap_object(page->object,
5797 kernel_mapping_offset,
5798 kernel_mapping_offset + kernel_mapping_size);
5799 }
5800
5801 /*
5802 * Restore the "reference" and "modified" bits.
5803 * This should clean up any impact the encryption had
5804 * on them.
5805 */
5806 if (! page_was_referenced) {
5807 clear_refmod |= VM_MEM_REFERENCED;
5808 page->reference = FALSE;
5809 }
5810 if (! page_was_modified) {
5811 clear_refmod |= VM_MEM_MODIFIED;
5812 page->dirty = FALSE;
5813 }
5814 if (clear_refmod)
5815 pmap_clear_refmod(page->phys_page, clear_refmod);
5816
5817 page->encrypted = TRUE;
5818 }
5819
5820 /*
5821 * ENCRYPTED SWAP:
5822 * vm_page_decrypt:
5823 * Decrypt the given page.
5824 * The page might already be mapped at kernel virtual
5825 * address "kernel_mapping_offset". Otherwise, we need
5826 * to map it.
5827 *
5828 * Context:
5829 * The page's VM object is locked but will be unlocked and relocked.
5830 * The page is busy and not accessible by users (not entered in any pmap).
5831 */
5832 void
5833 vm_page_decrypt(
5834 vm_page_t page,
5835 vm_map_offset_t kernel_mapping_offset)
5836 {
5837 int clear_refmod = 0;
5838 kern_return_t kr;
5839 vm_map_size_t kernel_mapping_size;
5840 vm_offset_t kernel_vaddr;
5841 boolean_t page_was_referenced;
5842 union {
5843 unsigned char aes_iv[AES_BLOCK_SIZE];
5844 struct {
5845 memory_object_t pager_object;
5846 vm_object_offset_t paging_offset;
5847 } vm;
5848 } decrypt_iv;
5849
5850 assert(page->busy);
5851 assert(page->encrypted);
5852
5853 /*
5854 * Gather the "reference" status of the page.
5855 * We'll restore its value after the decryption, so that
5856 * the decryption is transparent to the rest of the system
5857 * and doesn't impact the VM's LRU logic.
5858 */
5859 page_was_referenced =
5860 (page->reference || pmap_is_referenced(page->phys_page));
5861
5862 if (kernel_mapping_offset == 0) {
5863 /*
5864 * The page hasn't already been mapped in kernel space
5865 * by the caller. Map it now, so that we can access
5866 * its contents and decrypt them.
5867 */
5868 kernel_mapping_size = PAGE_SIZE;
5869 kr = vm_paging_map_object(&kernel_mapping_offset,
5870 page,
5871 page->object,
5872 page->offset,
5873 &kernel_mapping_size);
5874 if (kr != KERN_SUCCESS) {
5875 panic("vm_page_decrypt: "
5876 "could not map page in kernel: 0x%x\n");
5877 }
5878 } else {
5879 kernel_mapping_size = 0;
5880 }
5881 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5882
5883 assert(swap_crypt_ctx_initialized);
5884
5885 /*
5886 * Prepare an "initial vector" for the decryption.
5887 * It has to be the same as the "initial vector" we
5888 * used to encrypt that page.
5889 */
5890 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5891 decrypt_iv.vm.pager_object = page->object->pager;
5892 decrypt_iv.vm.paging_offset =
5893 page->object->paging_offset + page->offset;
5894
5895 vm_object_unlock(page->object);
5896
5897 /* encrypt the "initial vector" */
5898 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5899 swap_crypt_null_iv,
5900 1,
5901 &decrypt_iv.aes_iv[0],
5902 &swap_crypt_ctx.encrypt);
5903
5904 /*
5905 * Decrypt the page.
5906 */
5907 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5908 &decrypt_iv.aes_iv[0],
5909 PAGE_SIZE / AES_BLOCK_SIZE,
5910 (unsigned char *) kernel_vaddr,
5911 &swap_crypt_ctx.decrypt);
5912 vm_page_decrypt_counter++;
5913
5914 vm_object_lock(page->object);
5915
5916 /*
5917 * Unmap the page from the kernel's address space,
5918 * if we had to map it ourselves. Otherwise, let
5919 * the caller undo the mapping if needed.
5920 */
5921 if (kernel_mapping_size != 0) {
5922 vm_paging_unmap_object(page->object,
5923 kernel_vaddr,
5924 kernel_vaddr + PAGE_SIZE);
5925 }
5926
5927 /*
5928 * After decryption, the page is actually clean.
5929 * It was encrypted as part of paging, which "cleans"
5930 * the "dirty" pages.
5931 * No one could access it after it was encrypted
5932 * and the decryption doesn't count.
5933 */
5934 page->dirty = FALSE;
5935 clear_refmod = VM_MEM_MODIFIED;
5936
5937 /* restore the "reference" bit */
5938 if (! page_was_referenced) {
5939 page->reference = FALSE;
5940 clear_refmod |= VM_MEM_REFERENCED;
5941 }
5942 pmap_clear_refmod(page->phys_page, clear_refmod);
5943
5944 page->encrypted = FALSE;
5945
5946 /*
5947 * We've just modified the page's contents via the data cache and part
5948 * of the new contents might still be in the cache and not yet in RAM.
5949 * Since the page is now available and might get gathered in a UPL to
5950 * be part of a DMA transfer from a driver that expects the memory to
5951 * be coherent at this point, we have to flush the data cache.
5952 */
5953 pmap_sync_page_attributes_phys(page->phys_page);
5954 /*
5955 * Since the page is not mapped yet, some code might assume that it
5956 * doesn't need to invalidate the instruction cache when writing to
5957 * that page. That code relies on "no_isync" being set, so that the
5958 * caches get synchronized when the page is first mapped. So we need
5959 * to set "no_isync" here too, despite the fact that we just
5960 * synchronized the caches above...
5961 */
5962 page->no_isync = TRUE;
5963 }
5964
5965 unsigned long upl_encrypt_upls = 0;
5966 unsigned long upl_encrypt_pages = 0;
5967
5968 /*
5969 * ENCRYPTED SWAP:
5970 *
5971 * upl_encrypt:
5972 * Encrypts all the pages in the UPL within the specified range:
5973 * the pages are unmapped from all pmaps, then encrypted in place.
5974 */
5975 void
5976 upl_encrypt(
5977 upl_t upl,
5978 upl_offset_t crypt_offset,
5979 upl_size_t crypt_size)
5980 {
5981 upl_size_t upl_size;
5982 upl_offset_t upl_offset;
5983 vm_object_t upl_object;
5984 vm_page_t page;
5985 vm_object_t shadow_object;
5986 vm_object_offset_t shadow_offset;
5987 vm_object_offset_t paging_offset;
5988 vm_object_offset_t base_offset;
5989
5990 upl_encrypt_upls++;
5991 upl_encrypt_pages += crypt_size / PAGE_SIZE;
5992
5993 upl_lock(upl);
5994
5995 upl_object = upl->map_object;
5996 upl_offset = upl->offset;
5997 upl_size = upl->size;
5998
5999 upl_unlock(upl);
6000
6001 vm_object_lock(upl_object);
6002
6003 /*
6004 * Find the VM object that contains the actual pages.
6005 */
6006 if (upl_object->pageout) {
6007 shadow_object = upl_object->shadow;
6008 /*
6009 * The offset in the shadow object is actually also
6010 * accounted for in upl->offset. It possibly shouldn't be
6011 * this way, but for now don't account for it twice.
6012 */
6013 shadow_offset = 0;
6014 assert(upl_object->paging_offset == 0); /* XXX ? */
6015 vm_object_lock(shadow_object);
6016 } else {
6017 shadow_object = upl_object;
6018 shadow_offset = 0;
6019 }
6020
6021 paging_offset = shadow_object->paging_offset;
6022 vm_object_paging_begin(shadow_object);
6023
6024 if (shadow_object != upl_object) {
6025 vm_object_unlock(shadow_object);
6026 }
6027 vm_object_unlock(upl_object);
6028
6029 base_offset = shadow_offset;
6030 base_offset += upl_offset;
6031 base_offset += crypt_offset;
6032 base_offset -= paging_offset;
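/*
 * base_offset is now the offset, within the shadow object, of the first
 * page to encrypt: upl->offset includes the object's paging_offset, so
 * that amount is subtracted back out before the vm_page_lookup() calls
 * below.
 */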
6033 /*
6034 * Unmap the pages, so that nobody can continue accessing them while
6035 * they're encrypted. After that point, all accesses to these pages
6036 * will cause a page fault and block while the page is being encrypted
6037 * (busy). After the encryption completes, any access will cause a
6038 * page fault and the page gets decrypted at that time.
6039 */
6040 assert(crypt_offset + crypt_size <= upl_size);
6041 vm_object_pmap_protect(shadow_object,
6042 base_offset,
6043 (vm_object_size_t)crypt_size,
6044 PMAP_NULL,
6045 0,
6046 VM_PROT_NONE);
6047
6048 /* XXX FBDP could the object have changed significantly here ? */
6049 vm_object_lock(shadow_object);
6050
6051 for (upl_offset = 0;
6052 upl_offset < crypt_size;
6053 upl_offset += PAGE_SIZE) {
6054 page = vm_page_lookup(shadow_object,
6055 base_offset + upl_offset);
6056 if (page == VM_PAGE_NULL) {
6057 panic("upl_encrypt: "
6058 "no page for (obj=%p,off=%lld+%d)!\n",
6059 shadow_object,
6060 base_offset,
6061 upl_offset);
6062 }
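/*
 * Passing 0 as the kernel mapping offset asks vm_page_encrypt() to map
 * and unmap the page itself, the same way vm_page_decrypt() above does
 * when its caller doesn't provide a mapping.
 */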
6063 vm_page_encrypt(page, 0);
6064 }
6065
6066 vm_object_paging_end(shadow_object);
6067 vm_object_unlock(shadow_object);
6068 }
6069
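/*
 * The internal page-info array is laid out immediately after the upl
 * structure, so its offset within a UPL buffer is simply sizeof(struct upl).
 */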
6070 vm_size_t
6071 upl_get_internal_pagelist_offset(void)
6072 {
6073 return sizeof(struct upl);
6074 }
6075
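/*
 * Set or clear UPL_CLEAR_DIRTY on this UPL; the commit path consults
 * this flag when deciding whether to clear the pages' dirty state.
 */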
6076 void
6077 upl_clear_dirty(
6078 upl_t upl,
6079 boolean_t value)
6080 {
6081 if (value) {
6082 upl->flags |= UPL_CLEAR_DIRTY;
6083 } else {
6084 upl->flags &= ~UPL_CLEAR_DIRTY;
6085 }
6086 }
6087
6088
6089 #ifdef MACH_BSD
6090
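/*
 * Out-of-line accessors for the upl_page_info array, for BSD-side
 * callers that use these functions rather than the UPL_* macros.
 */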
6091 boolean_t upl_page_present(upl_page_info_t *upl, int index)
6092 {
6093 return(UPL_PAGE_PRESENT(upl, index));
6094 }
6095 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
6096 {
6097 return(UPL_DIRTY_PAGE(upl, index));
6098 }
6099 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
6100 {
6101 return(UPL_VALID_PAGE(upl, index));
6102 }
6103 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
6104 {
6105 return(UPL_PHYS_PAGE(upl, index));
6106 }
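/*
 * Illustrative sketch (not code from this file): a caller holding an
 * internal UPL covering "size" bytes could walk its page list with the
 * accessors above, roughly:
 *
 *	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
 *	int i;
 *
 *	for (i = 0; i < (int)(size / PAGE_SIZE); i++)
 *		if (upl_valid_page(pl, i) && upl_dirty_page(pl, i))
 *			process_page(upl_phys_page(pl, i));
 *
 * "process_page" is a hypothetical consumer; UPL_GET_INTERNAL_PAGE_LIST()
 * only applies to internally-allocated UPLs.
 */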
6107
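/*
 * Debugging aid: walk the inactive, zero-fill and active page queues and
 * print how many dirty, pageout and precious pages they hold.  The first
 * line of output combines the inactive and zero-fill queues; the second
 * covers the active queue.
 */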
6108 void
6109 vm_countdirtypages(void)
6110 {
6111 vm_page_t m;
6112 int dpages;
6113 int pgopages;
6114 int precpages;
6115
6116
6117 dpages = 0;
6118 pgopages = 0;
6119 precpages = 0;
6120
6121 vm_page_lock_queues();
6122 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6123 do {
6124 if (m == (vm_page_t) 0) break;
6125
6126 if (m->dirty) dpages++;
6127 if (m->pageout) pgopages++;
6128 if (m->precious) precpages++;
6129
6130 assert(m->object != kernel_object);
6131 m = (vm_page_t) queue_next(&m->pageq);
6132 if (m == (vm_page_t) 0) break;
6133
6134 } while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m));
6135 vm_page_unlock_queues();
6136
6137 vm_page_lock_queues();
6138 m = (vm_page_t) queue_first(&vm_page_queue_zf);
6139 do {
6140 if (m == (vm_page_t) 0) break;
6141
6142 if (m->dirty) dpages++;
6143 if (m->pageout) pgopages++;
6144 if (m->precious) precpages++;
6145
6146 assert(m->object != kernel_object);
6147 m = (vm_page_t) queue_next(&m->pageq);
6148 if (m == (vm_page_t) 0) break;
6149
6150 } while (!queue_end(&vm_page_queue_zf, (queue_entry_t) m));
6151 vm_page_unlock_queues();
6152
6153 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6154
6155 dpages = 0;
6156 pgopages = 0;
6157 precpages = 0;
6158
6159 vm_page_lock_queues();
6160 m = (vm_page_t) queue_first(&vm_page_queue_active);
6161
6162 do {
6163 if (m == (vm_page_t) 0) break;
6164 if (m->dirty) dpages++;
6165 if (m->pageout) pgopages++;
6166 if (m->precious) precpages++;
6167
6168 assert(m->object != kernel_object);
6169 m = (vm_page_t) queue_next(&m->pageq);
6170 if (m == (vm_page_t) 0) break;
6171
6172 } while (!queue_end(&vm_page_queue_active, (queue_entry_t) m));
6173 vm_page_unlock_queues();
6174
6175 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6176
6177 }
6178 #endif /* MACH_BSD */
6179
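/*
 * Highest physical page number of any page entered into this UPL;
 * callers can use it, for example, to check DMA addressability limits.
 */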
6180 ppnum_t upl_get_highest_page(
6181 upl_t upl)
6182 {
6183 return upl->highest_page;
6184 }
6185
6186 #ifdef UPL_DEBUG
6187 kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6188 {
6189 upl->ubc_alias1 = alias1;
6190 upl->ubc_alias2 = alias2;
6191 return KERN_SUCCESS;
6192 }
6193 int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6194 {
6195 if (al)
6196 *al = upl->ubc_alias1;
6197 if (al2)
6198 *al2 = upl->ubc_alias2;
6199 return KERN_SUCCESS;
6200 }
6201 #endif /* UPL_DEBUG */
6202
6203
6204
6205 #if MACH_KDB
6206 #include <ddb/db_output.h>
6207 #include <ddb/db_print.h>
6208 #include <vm/vm_print.h>
6209
6210 #define printf kdbprintf
6211 void db_pageout(void);
6212
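/*
 * Kernel debugger (ddb) helpers: db_vm() dumps basic page counts and
 * pageout targets, then calls db_pageout() for the pageout statistics.
 */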
6213 void
6214 db_vm(void)
6215 {
6216
6217 iprintf("VM Statistics:\n");
6218 db_indent += 2;
6219 iprintf("pages:\n");
6220 db_indent += 2;
6221 iprintf("activ %5d inact %5d free %5d",
6222 vm_page_active_count, vm_page_inactive_count,
6223 vm_page_free_count);
6224 printf(" wire %5d gobbl %5d\n",
6225 vm_page_wire_count, vm_page_gobble_count);
6226 db_indent -= 2;
6227 iprintf("target:\n");
6228 db_indent += 2;
6229 iprintf("min %5d inact %5d free %5d",
6230 vm_page_free_min, vm_page_inactive_target,
6231 vm_page_free_target);
6232 printf(" resrv %5d\n", vm_page_free_reserved);
6233 db_indent -= 2;
6234 iprintf("pause:\n");
6235 db_pageout();
6236 db_indent -= 2;
6237 }
6238
6239 #if MACH_COUNTERS
6240 extern int c_laundry_pages_freed;
6241 #endif /* MACH_COUNTERS */
6242
6243 void
6244 db_pageout(void)
6245 {
6246 iprintf("Pageout Statistics:\n");
6247 db_indent += 2;
6248 iprintf("active %5d inactv %5d\n",
6249 vm_pageout_active, vm_pageout_inactive);
6250 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
6251 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6252 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6253 iprintf("used %5d clean %5d dirty %5d\n",
6254 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6255 vm_pageout_inactive_dirty);
6256 #if MACH_COUNTERS
6257 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6258 #endif /* MACH_COUNTERS */
6259 #if MACH_CLUSTER_STATS
6260 iprintf("Cluster Statistics:\n");
6261 db_indent += 2;
6262 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
6263 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6264 vm_pageout_cluster_collisions);
6265 iprintf("clusters %5d conversions %5d\n",
6266 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6267 db_indent -= 2;
6268 iprintf("Target Statistics:\n");
6269 db_indent += 2;
6270 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
6271 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6272 vm_pageout_target_page_freed);
6273 db_indent -= 2;
6274 #endif /* MACH_CLUSTER_STATS */
6275 db_indent -= 2;
6276 }
6277
6278 #endif /* MACH_KDB */