1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /*
23 * @OSF_COPYRIGHT@
24 */
25 /*
26 * Mach Operating System
27 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
28 * All Rights Reserved.
29 *
30 * Permission to use, copy, modify and distribute this software and its
31 * documentation is hereby granted, provided that both the copyright
32 * notice and this permission notice appear in all copies of the
33 * software, derivative works or modified versions, and any portions
34 * thereof, and that both notices appear in supporting documentation.
35 *
36 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
37 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
38 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
39 *
40 * Carnegie Mellon requests users of this software to return to
41 *
42 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
43 * School of Computer Science
44 * Carnegie Mellon University
45 * Pittsburgh PA 15213-3890
46 *
47 * any improvements or extensions that they make and grant Carnegie Mellon
48 * the rights to redistribute these changes.
49 */
50 /*
51 */
52 /*
53 * File: vm/vm_pageout.c
54 * Author: Avadis Tevanian, Jr., Michael Wayne Young
55 * Date: 1985
56 *
57 * The proverbial page-out daemon.
58 */
59
60 #include <stdint.h>
61
62 #include <debug.h>
63 #include <mach_pagemap.h>
64 #include <mach_cluster_stats.h>
65 #include <mach_kdb.h>
66 #include <advisory_pageout.h>
67
68 #include <mach/mach_types.h>
69 #include <mach/memory_object.h>
70 #include <mach/memory_object_default.h>
71 #include <mach/memory_object_control_server.h>
72 #include <mach/mach_host_server.h>
73 #include <mach/upl.h>
74 #include <mach/vm_map.h>
75 #include <mach/vm_param.h>
76 #include <mach/vm_statistics.h>
77
78 #include <kern/kern_types.h>
79 #include <kern/counters.h>
80 #include <kern/host_statistics.h>
81 #include <kern/machine.h>
82 #include <kern/misc_protos.h>
83 #include <kern/thread.h>
84 #include <kern/xpr.h>
85 #include <kern/kalloc.h>
86
87 #include <machine/vm_tuning.h>
88
89 #include <vm/pmap.h>
90 #include <vm/vm_fault.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_object.h>
93 #include <vm/vm_page.h>
94 #include <vm/vm_pageout.h>
95 #include <vm/vm_protos.h> /* must be last */
96
97 /*
98 * ENCRYPTED SWAP:
99 */
100 #ifdef __ppc__
101 #include <ppc/mappings.h>
102 #endif /* __ppc__ */
103 #include <../bsd/crypto/aes/aes.h>
104
105 extern ipc_port_t memory_manager_default;
106
107
108 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE
109 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 10000 /* maximum iterations of the active queue to move pages to inactive */
110 #endif
111
112 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
113 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
114 #endif
115
116 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
117 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
118 #endif
119
120 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
121 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
122 #endif
123
124 #ifndef VM_PAGE_LAUNDRY_MAX
125 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
126 #endif /* VM_PAGE_LAUNDRY_MAX */
127
128 #ifndef VM_PAGEOUT_BURST_WAIT
129 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
130 #endif /* VM_PAGEOUT_BURST_WAIT */
131
132 #ifndef VM_PAGEOUT_EMPTY_WAIT
133 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
134 #endif /* VM_PAGEOUT_EMPTY_WAIT */
135
136 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
137 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
138 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
139
140 #ifndef VM_PAGEOUT_IDLE_WAIT
141 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
142 #endif /* VM_PAGEOUT_IDLE_WAIT */
143
144
145 /*
146 * To obtain a reasonable LRU approximation, the inactive queue
147 * needs to be large enough to give pages on it a chance to be
148 * referenced a second time. This macro defines the fraction
149 * of active+inactive pages that should be inactive.
150 * The pageout daemon uses it to update vm_page_inactive_target.
151 *
152 * If vm_page_free_count falls below vm_page_free_target and
153 * vm_page_inactive_count is below vm_page_inactive_target,
154 * then the pageout daemon starts running.
155 */
156
157 #ifndef VM_PAGE_INACTIVE_TARGET
158 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
159 #endif /* VM_PAGE_INACTIVE_TARGET */
160
161 /*
162 * Once the pageout daemon starts running, it keeps going
163 * until vm_page_free_count meets or exceeds vm_page_free_target.
164 */
165
166 #ifndef VM_PAGE_FREE_TARGET
167 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
168 #endif /* VM_PAGE_FREE_TARGET */
169
170 /*
171 * The pageout daemon always starts running once vm_page_free_count
172 * falls below vm_page_free_min.
173 */
174
175 #ifndef VM_PAGE_FREE_MIN
176 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
177 #endif /* VM_PAGE_FREE_MIN */
178
179 /*
180 * When vm_page_free_count falls below vm_page_free_reserved,
181 * only vm-privileged threads can allocate pages. vm-privilege
182 * allows the pageout daemon and default pager (and any other
183 * associated threads needed for default pageout) to continue
184 * operation by dipping into the reserved pool of pages.
185 */
186
187 #ifndef VM_PAGE_FREE_RESERVED
188 #define VM_PAGE_FREE_RESERVED(n) \
189 ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
190 #endif /* VM_PAGE_FREE_RESERVED */
191
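/*
 * A rough worked example of the tuning macros above, with purely
 * illustrative page counts and the default VM_PAGE_LAUNDRY_MAX of 16:
 *
 *	VM_PAGE_INACTIVE_TARGET(30000)	= 30000 / 3        = 10000 pages
 *	VM_PAGE_FREE_TARGET(8000)	= 15 + 8000 / 80   = 115 pages
 *	VM_PAGE_FREE_MIN(8000)		= 10 + 8000 / 100  = 90 pages
 *	VM_PAGE_FREE_RESERVED(n)	= 6 * 16 + n       = 96 + n pages
 *
 * The real arguments are supplied at run time (see vm_page_free_reserve()
 * below); the numbers here only show the shape of the curves.
 */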
192
193 /*
194 * must hold the page queues lock to
195 * manipulate this structure
196 */
197 struct vm_pageout_queue {
198 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
199 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
200 unsigned int pgo_maxlaundry;
201
202 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
203 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
204 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
205 :0;
206 };
207
208 #define VM_PAGE_Q_THROTTLED(q) \
209 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
210
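/*
 * A minimal sketch of the throttle handshake between vm_pageout_scan()
 * and the pager iothreads, using only the fields defined above (the real
 * code is in vm_pageout_scan() and vm_pageout_throttle_up()):
 *
 *	scan side, page queues locked:
 *		if (VM_PAGE_Q_THROTTLED(q)) {
 *			q->pgo_throttled = TRUE;
 *			... block waiting on &q->pgo_laundry ...
 *		}
 *
 *	completion side, when a laundry page comes back:
 *		q->pgo_laundry--;
 *		if (q->pgo_throttled == TRUE) {
 *			q->pgo_throttled = FALSE;
 *			thread_wakeup((event_t) &q->pgo_laundry);
 *		}
 */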
211
212 /*
213 * Exported variable used to broadcast the activation of the pageout scan.
214 * Working Set uses this to throttle its use of pmap removes. In this
215 * way, code which runs within memory in an uncontested context does
216 * not keep encountering soft faults.
217 */
218
219 unsigned int vm_pageout_scan_event_counter = 0;
220
221 /*
222 * Forward declarations for internal routines.
223 */
224
225 static void vm_pageout_garbage_collect(int);
226 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
227 static void vm_pageout_iothread_external(void);
228 static void vm_pageout_iothread_internal(void);
229 static void vm_pageout_queue_steal(vm_page_t);
230
231 extern void vm_pageout_continue(void);
232 extern void vm_pageout_scan(void);
233
234 unsigned int vm_pageout_reserved_internal = 0;
235 unsigned int vm_pageout_reserved_really = 0;
236
237 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
238 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
239 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
240 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
241 unsigned int vm_pageout_deadlock_relief = 0;
242 unsigned int vm_pageout_inactive_relief = 0;
243 unsigned int vm_pageout_burst_active_throttle = 0;
244 unsigned int vm_pageout_burst_inactive_throttle = 0;
245
246 /*
247 * Protection against zero fill flushing live working sets derived
248 * from existing backing store and files
249 */
250 unsigned int vm_accellerate_zf_pageout_trigger = 400;
251 unsigned int vm_zf_iterator;
252 unsigned int vm_zf_iterator_count = 40;
253 unsigned int last_page_zf;
254 unsigned int vm_zf_count = 0;
255
256 /*
257 * These variables record the pageout daemon's actions:
258 * how many pages it looks at and what happens to those pages.
259 * No locking needed because only one thread modifies the variables.
260 */
261
262 unsigned int vm_pageout_active = 0; /* debugging */
263 unsigned int vm_pageout_inactive = 0; /* debugging */
264 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
265 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
266 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
267 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
268 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
269 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
270 unsigned int vm_pageout_inactive_used = 0; /* debugging */
271 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
272 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
273 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
274 unsigned int vm_pageout_purged_objects = 0; /* debugging */
275 unsigned int vm_stat_discard = 0; /* debugging */
276 unsigned int vm_stat_discard_sent = 0; /* debugging */
277 unsigned int vm_stat_discard_failure = 0; /* debugging */
278 unsigned int vm_stat_discard_throttle = 0; /* debugging */
279
280 unsigned int vm_pageout_scan_active_throttled = 0;
281 unsigned int vm_pageout_scan_inactive_throttled = 0;
282 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
283 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
284 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
285 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
286 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
287 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
288 /*
289 * Backing store throttle when BS is exhausted
290 */
291 unsigned int vm_backing_store_low = 0;
292
293 unsigned int vm_pageout_out_of_line = 0;
294 unsigned int vm_pageout_in_place = 0;
295
296 /*
297 * ENCRYPTED SWAP:
298 * counters and statistics...
299 */
300 unsigned long vm_page_decrypt_counter = 0;
301 unsigned long vm_page_decrypt_for_upl_counter = 0;
302 unsigned long vm_page_encrypt_counter = 0;
303 unsigned long vm_page_encrypt_abort_counter = 0;
304 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
305 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
306
307
308 struct vm_pageout_queue vm_pageout_queue_internal;
309 struct vm_pageout_queue vm_pageout_queue_external;
310
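/*
 * A minimal sketch, as a hypothetical helper introduced here only for
 * illustration, of the queue-selection rule that vm_pageout_cluster(),
 * vm_pageout_throttle_up() and vm_pageout_queue_steal() all follow:
 * pages of internal (anonymous) objects are laundered through the
 * default pager's queue, everything else goes to the external queue.
 */
static __inline__ struct vm_pageout_queue *
vm_pageout_queue_for_object(vm_object_t object)
{
	return (object->internal == TRUE) ? &vm_pageout_queue_internal
					  : &vm_pageout_queue_external;
}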
311
312 /*
313 * Routine: vm_backing_store_disable
314 * Purpose:
315 * Suspend non-privileged threads wishing to extend
316 * backing store when we are low on backing store
317 * (Synchronized by caller)
318 */
319 void
320 vm_backing_store_disable(
321 boolean_t disable)
322 {
323 if(disable) {
324 vm_backing_store_low = 1;
325 } else {
326 if(vm_backing_store_low) {
327 vm_backing_store_low = 0;
328 thread_wakeup((event_t) &vm_backing_store_low);
329 }
330 }
331 }
332
333
334 /*
335 * Routine: vm_pageout_object_allocate
336 * Purpose:
337 * Allocate an object for use as out-of-line memory in a
338 * data_return/data_initialize message.
339 * The page must be in an unlocked object.
340 *
341 * If the page belongs to a trusted pager, cleaning in place
342 * will be used, which utilizes a special "pageout object"
343 * containing private alias pages for the real page frames.
344 * Untrusted pagers use normal out-of-line memory.
345 */
346 vm_object_t
347 vm_pageout_object_allocate(
348 vm_page_t m,
349 vm_size_t size,
350 vm_object_offset_t offset)
351 {
352 vm_object_t object = m->object;
353 vm_object_t new_object;
354
355 assert(object->pager_ready);
356
357 new_object = vm_object_allocate(size);
358
359 if (object->pager_trusted) {
360 assert (offset < object->size);
361
362 vm_object_lock(new_object);
363 new_object->pageout = TRUE;
364 new_object->shadow = object;
365 new_object->can_persist = FALSE;
366 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
367 new_object->shadow_offset = offset;
368 vm_object_unlock(new_object);
369
370 /*
371 * Take a paging reference on the object. This will be dropped
372 * in vm_pageout_object_terminate()
373 */
374 vm_object_lock(object);
375 vm_object_paging_begin(object);
376 vm_page_lock_queues();
377 vm_page_unlock_queues();
378 vm_object_unlock(object);
379
380 vm_pageout_in_place++;
381 } else
382 vm_pageout_out_of_line++;
383 return(new_object);
384 }
385
386 #if MACH_CLUSTER_STATS
387 unsigned long vm_pageout_cluster_dirtied = 0;
388 unsigned long vm_pageout_cluster_cleaned = 0;
389 unsigned long vm_pageout_cluster_collisions = 0;
390 unsigned long vm_pageout_cluster_clusters = 0;
391 unsigned long vm_pageout_cluster_conversions = 0;
392 unsigned long vm_pageout_target_collisions = 0;
393 unsigned long vm_pageout_target_page_dirtied = 0;
394 unsigned long vm_pageout_target_page_freed = 0;
395 #define CLUSTER_STAT(clause) clause
396 #else /* MACH_CLUSTER_STATS */
397 #define CLUSTER_STAT(clause)
398 #endif /* MACH_CLUSTER_STATS */
399
400 /*
401 * Routine: vm_pageout_object_terminate
402 * Purpose:
403 * Destroy the pageout_object allocated by
404 * vm_pageout_object_allocate(), and perform all of the
405 * required cleanup actions.
406 *
407 * In/Out conditions:
408 * The object must be locked, and will be returned locked.
409 */
410 void
411 vm_pageout_object_terminate(
412 vm_object_t object)
413 {
414 vm_object_t shadow_object;
415 boolean_t shadow_internal;
416
417 /*
418 * Deal with the deallocation (last reference) of a pageout object
419 * (used for cleaning-in-place) by dropping the paging references/
420 * freeing pages in the original object.
421 */
422
423 assert(object->pageout);
424 shadow_object = object->shadow;
425 vm_object_lock(shadow_object);
426 shadow_internal = shadow_object->internal;
427
428 while (!queue_empty(&object->memq)) {
429 vm_page_t p, m;
430 vm_object_offset_t offset;
431
432 p = (vm_page_t) queue_first(&object->memq);
433
434 assert(p->private);
435 assert(p->pageout);
436 p->pageout = FALSE;
437 assert(!p->cleaning);
438
439 offset = p->offset;
440 VM_PAGE_FREE(p);
441 p = VM_PAGE_NULL;
442
443 m = vm_page_lookup(shadow_object,
444 offset + object->shadow_offset);
445
446 if(m == VM_PAGE_NULL)
447 continue;
448 assert(m->cleaning);
449 /* used as a trigger on upl_commit etc to recognize the */
450 /* pageout daemon's subsequent desire to pageout a cleaning */
451 /* page. When the bit is on the upl commit code will */
452 /* respect the pageout bit in the target page over the */
453 /* caller's page list indication */
454 m->dump_cleaning = FALSE;
455
456 /*
457 * Account for the paging reference taken when
458 * m->cleaning was set on this page.
459 */
460 vm_object_paging_end(shadow_object);
461 assert((m->dirty) || (m->precious) ||
462 (m->busy && m->cleaning));
463
464 /*
465 * Handle the trusted pager throttle.
466 * Also decrement the burst throttle (if external).
467 */
468 vm_page_lock_queues();
469 if (m->laundry) {
470 vm_pageout_throttle_up(m);
471 }
472
473 /*
474 * Handle the "target" page(s). These pages are to be freed if
475 * successfully cleaned. Target pages are always busy, and are
476 * wired exactly once. The initial target pages are not mapped,
477 * (so cannot be referenced or modified) but converted target
478 * pages may have been modified between the selection as an
479 * adjacent page and conversion to a target.
480 */
481 if (m->pageout) {
482 assert(m->busy);
483 assert(m->wire_count == 1);
484 m->cleaning = FALSE;
485 m->pageout = FALSE;
486 #if MACH_CLUSTER_STATS
487 if (m->wanted) vm_pageout_target_collisions++;
488 #endif
489 /*
490 * Revoke all access to the page. Since the object is
491 * locked, and the page is busy, this prevents the page
492 * from being dirtied after the pmap_disconnect() call
493 * returns.
494 *
495 * Since the page is left "dirty" but "not modified", we
496 * can detect whether the page was redirtied during
497 * pageout by checking the modify state.
498 */
499 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
500 m->dirty = TRUE;
501 else
502 m->dirty = FALSE;
503
504 if (m->dirty) {
505 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
506 vm_page_unwire(m);/* reactivates */
507 VM_STAT(reactivations++);
508 PAGE_WAKEUP_DONE(m);
509 } else {
510 CLUSTER_STAT(vm_pageout_target_page_freed++;)
511 vm_page_free(m);/* clears busy, etc. */
512 }
513 vm_page_unlock_queues();
514 continue;
515 }
516 /*
517 * Handle the "adjacent" pages. These pages were cleaned in
518 * place, and should be left alone.
519 * If the page is on neither the active nor the inactive queue,
520 * reactivate it if it was referenced, otherwise deactivate it.
521 */
522 if (!m->active && !m->inactive && !m->private) {
523 if (m->reference)
524 vm_page_activate(m);
525 else
526 vm_page_deactivate(m);
527 }
528 if((m->busy) && (m->cleaning)) {
529
530 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
531 m->busy = FALSE;
532
533 /* We do not re-set m->dirty ! */
534 /* The page was busy so no extraneous activity */
535 /* could have occurred. COPY_INTO is a read into the */
536 /* new pages. CLEAN_IN_PLACE does actually write */
537 /* out the pages but handling outside of this code */
538 /* will take care of resetting dirty. We clear the */
539 /* modify bit, however, for the Programmed I/O case. */
540 pmap_clear_modify(m->phys_page);
541 if(m->absent) {
542 m->absent = FALSE;
543 if(shadow_object->absent_count == 1)
544 vm_object_absent_release(shadow_object);
545 else
546 shadow_object->absent_count--;
547 }
548 m->overwriting = FALSE;
549 } else if (m->overwriting) {
550 /* alternate request page list, write to page_list */
551 /* case. Occurs when the original page was wired */
552 /* at the time of the list request */
553 assert(m->wire_count != 0);
554 vm_page_unwire(m);/* reactivates */
555 m->overwriting = FALSE;
556 } else {
557 /*
558 * Set the dirty state according to whether or not the page was
559 * modified during the pageout. Note that we purposefully do
560 * NOT call pmap_clear_modify since the page is still mapped.
561 * If the page were to be dirtied between the 2 calls,
562 * this fact would be lost. This code is only necessary to
563 * maintain statistics, since the pmap module is always
564 * consulted if m->dirty is false.
565 */
566 #if MACH_CLUSTER_STATS
567 m->dirty = pmap_is_modified(m->phys_page);
568
569 if (m->dirty) vm_pageout_cluster_dirtied++;
570 else vm_pageout_cluster_cleaned++;
571 if (m->wanted) vm_pageout_cluster_collisions++;
572 #else
573 m->dirty = 0;
574 #endif
575 }
576 m->cleaning = FALSE;
577
578 /*
579 * Wakeup any thread waiting for the page to be un-cleaning.
580 */
581 PAGE_WAKEUP(m);
582 vm_page_unlock_queues();
583 }
584 /*
585 * Account for the paging reference taken in vm_pageout_object_allocate().
586 */
587 vm_object_paging_end(shadow_object);
588 vm_object_unlock(shadow_object);
589
590 assert(object->ref_count == 0);
591 assert(object->paging_in_progress == 0);
592 assert(object->resident_page_count == 0);
593 return;
594 }
595
596 /*
597 * Routine: vm_pageout_setup
598 * Purpose:
599 * Set up a page for pageout (clean & flush).
600 *
601 * Move the page to a new object, as part of which it will be
602 * sent to its memory manager in a memory_object_data_write or
603 * memory_object_initialize message.
604 *
605 * The "new_object" and "new_offset" arguments
606 * indicate where the page should be moved.
607 *
608 * In/Out conditions:
609 * The page in question must not be on any pageout queues,
610 * and must be busy. The object to which it belongs
611 * must be unlocked, and the caller must hold a paging
612 * reference to it. The new_object must not be locked.
613 *
614 * This routine returns a pointer to a place-holder page,
615 * inserted at the same offset, to block out-of-order
616 * requests for the page. The place-holder page must
617 * be freed after the data_write or initialize message
618 * has been sent.
619 *
620 * The original page is put on a paging queue and marked
621 * not busy on exit.
622 */
623 vm_page_t
624 vm_pageout_setup(
625 register vm_page_t m,
626 register vm_object_t new_object,
627 vm_object_offset_t new_offset)
628 {
629 register vm_object_t old_object = m->object;
630 vm_object_offset_t paging_offset;
631 vm_object_offset_t offset;
632 register vm_page_t holding_page;
633 register vm_page_t new_m;
634 boolean_t need_to_wire = FALSE;
635
636
637 XPR(XPR_VM_PAGEOUT,
638 "vm_pageout_setup, obj 0x%X off 0x%X page 0x%X new obj 0x%X offset 0x%X\n",
639 (integer_t)m->object, (integer_t)m->offset,
640 (integer_t)m, (integer_t)new_object,
641 (integer_t)new_offset);
642 assert(m && m->busy && !m->absent && !m->fictitious && !m->error &&
643 !m->restart);
644
645 assert(m->dirty || m->precious);
646
647 /*
648 * Create a place-holder page where the old one was, to prevent
649 * attempted pageins of this page while we're unlocked.
650 */
651 VM_PAGE_GRAB_FICTITIOUS(holding_page);
652
653 vm_object_lock(old_object);
654
655 offset = m->offset;
656 paging_offset = offset + old_object->paging_offset;
657
658 if (old_object->pager_trusted) {
659 /*
660 * This pager is trusted, so we can clean this page
661 * in place. Leave it in the old object, and mark it
662 * cleaning & pageout.
663 */
664 new_m = holding_page;
665 holding_page = VM_PAGE_NULL;
666
667 /*
668 * Set up new page to be private shadow of real page.
669 */
670 new_m->phys_page = m->phys_page;
671 new_m->fictitious = FALSE;
672 new_m->pageout = TRUE;
673
674 /*
675 * Mark real page as cleaning (indicating that we hold a
676 * paging reference to be released via m_o_d_r_c) and
677 * pageout (indicating that the page should be freed
678 * when the pageout completes).
679 */
680 pmap_clear_modify(m->phys_page);
681 vm_page_lock_queues();
682 new_m->private = TRUE;
683 vm_page_wire(new_m);
684 m->cleaning = TRUE;
685 m->pageout = TRUE;
686
687 vm_page_wire(m);
688 assert(m->wire_count == 1);
689 vm_page_unlock_queues();
690
691 m->dirty = TRUE;
692 m->precious = FALSE;
693 m->page_lock = VM_PROT_NONE;
694 m->unusual = FALSE;
695 m->unlock_request = VM_PROT_NONE;
696 } else {
697 /*
698 * Cannot clean in place, so rip the old page out of the
699 * object, and stick the holding page in. Set new_m to the
700 * page in the new object.
701 */
702 vm_page_lock_queues();
703 VM_PAGE_QUEUES_REMOVE(m);
704 vm_page_remove(m);
705
706 vm_page_insert(holding_page, old_object, offset);
707 vm_page_unlock_queues();
708
709 m->dirty = TRUE;
710 m->precious = FALSE;
711 new_m = m;
712 new_m->page_lock = VM_PROT_NONE;
713 new_m->unlock_request = VM_PROT_NONE;
714
715 if (old_object->internal)
716 need_to_wire = TRUE;
717 }
718 /*
719 * Record that this page has been written out
720 */
721 #if MACH_PAGEMAP
722 vm_external_state_set(old_object->existence_map, offset);
723 #endif /* MACH_PAGEMAP */
724
725 vm_object_unlock(old_object);
726
727 vm_object_lock(new_object);
728
729 /*
730 * Put the page into the new object. If it is not wired
731 * (i.e. if it's the real page) it will be activated.
732 */
733
734 vm_page_lock_queues();
735 vm_page_insert(new_m, new_object, new_offset);
736 if (need_to_wire)
737 vm_page_wire(new_m);
738 else
739 vm_page_activate(new_m);
740 PAGE_WAKEUP_DONE(new_m);
741 vm_page_unlock_queues();
742
743 vm_object_unlock(new_object);
744
745 /*
746 * Return the placeholder page to simplify cleanup.
747 */
748 return (holding_page);
749 }
750
751 /*
752 * Routine: vm_pageclean_setup
753 *
754 * Purpose: setup a page to be cleaned (made non-dirty), but not
755 * necessarily flushed from the VM page cache.
756 * This is accomplished by cleaning in place.
757 *
758 * The page must not be busy, and the object and page
759 * queues must be locked.
760 *
761 */
762 void
763 vm_pageclean_setup(
764 vm_page_t m,
765 vm_page_t new_m,
766 vm_object_t new_object,
767 vm_object_offset_t new_offset)
768 {
769 vm_object_t old_object = m->object;
770 assert(!m->busy);
771 assert(!m->cleaning);
772
773 XPR(XPR_VM_PAGEOUT,
774 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
775 (integer_t)old_object, m->offset, (integer_t)m,
776 (integer_t)new_m, new_offset);
777
778 pmap_clear_modify(m->phys_page);
779 vm_object_paging_begin(old_object);
780
781 /*
782 * Record that this page has been written out
783 */
784 #if MACH_PAGEMAP
785 vm_external_state_set(old_object->existence_map, m->offset);
786 #endif /*MACH_PAGEMAP*/
787
788 /*
789 * Mark original page as cleaning in place.
790 */
791 m->cleaning = TRUE;
792 m->dirty = TRUE;
793 m->precious = FALSE;
794
795 /*
796 * Convert the fictitious page to a private shadow of
797 * the real page.
798 */
799 assert(new_m->fictitious);
800 new_m->fictitious = FALSE;
801 new_m->private = TRUE;
802 new_m->pageout = TRUE;
803 new_m->phys_page = m->phys_page;
804 vm_page_wire(new_m);
805
806 vm_page_insert(new_m, new_object, new_offset);
807 assert(!new_m->wanted);
808 new_m->busy = FALSE;
809 }
810
811 void
812 vm_pageclean_copy(
813 vm_page_t m,
814 vm_page_t new_m,
815 vm_object_t new_object,
816 vm_object_offset_t new_offset)
817 {
818 XPR(XPR_VM_PAGEOUT,
819 "vm_pageclean_copy, page 0x%X new_m 0x%X new_obj 0x%X offset 0x%X\n",
820 m, new_m, new_object, new_offset, 0);
821
822 assert((!m->busy) && (!m->cleaning));
823
824 assert(!new_m->private && !new_m->fictitious);
825
826 pmap_clear_modify(m->phys_page);
827
828 m->busy = TRUE;
829 vm_object_paging_begin(m->object);
830 vm_page_unlock_queues();
831 vm_object_unlock(m->object);
832
833 /*
834 * Copy the original page to the new page.
835 */
836 vm_page_copy(m, new_m);
837
838 /*
839 * Mark the old page as clean. A request to pmap_is_modified
840 * will get the right answer.
841 */
842 vm_object_lock(m->object);
843 m->dirty = FALSE;
844
845 vm_object_paging_end(m->object);
846
847 vm_page_lock_queues();
848 if (!m->active && !m->inactive)
849 vm_page_activate(m);
850 PAGE_WAKEUP_DONE(m);
851
852 vm_page_insert(new_m, new_object, new_offset);
853 vm_page_activate(new_m);
854 new_m->busy = FALSE; /* No other thread can be waiting */
855 }
856
857
858 /*
859 * Routine: vm_pageout_initialize_page
860 * Purpose:
861 * Causes the specified page to be initialized in
862 * the appropriate memory object. This routine is used to push
863 * pages into a copy-object when they are modified in the
864 * permanent object.
865 *
866 * The page is moved to a temporary object and paged out.
867 *
868 * In/out conditions:
869 * The page in question must not be on any pageout queues.
870 * The object to which it belongs must be locked.
871 * The page must be busy, but not hold a paging reference.
872 *
873 * Implementation:
874 * Move this page to a completely new object.
875 */
876 void
877 vm_pageout_initialize_page(
878 vm_page_t m)
879 {
880 vm_object_t object;
881 vm_object_offset_t paging_offset;
882 vm_page_t holding_page;
883
884
885 XPR(XPR_VM_PAGEOUT,
886 "vm_pageout_initialize_page, page 0x%X\n",
887 (integer_t)m, 0, 0, 0, 0);
888 assert(m->busy);
889
890 /*
891 * Verify that we really want to clean this page
892 */
893 assert(!m->absent);
894 assert(!m->error);
895 assert(m->dirty);
896
897 /*
898 * Create a paging reference to let us play with the object.
899 */
900 object = m->object;
901 paging_offset = m->offset + object->paging_offset;
902 vm_object_paging_begin(object);
903 if (m->absent || m->error || m->restart ||
904 (!m->dirty && !m->precious)) {
905 VM_PAGE_FREE(m);
906 panic("reservation without pageout?"); /* alan */
907 vm_object_unlock(object);
908 return;
909 }
910
911 /* set the page for future call to vm_fault_list_request */
912 holding_page = NULL;
913 vm_page_lock_queues();
914 pmap_clear_modify(m->phys_page);
915 m->dirty = TRUE;
916 m->busy = TRUE;
917 m->list_req_pending = TRUE;
918 m->cleaning = TRUE;
919 m->pageout = TRUE;
920 vm_page_wire(m);
921 vm_page_unlock_queues();
922 vm_object_unlock(object);
923
924 /*
925 * Write the data to its pager.
926 * Note that the data is passed by naming the new object,
927 * not a virtual address; the pager interface has been
928 * manipulated to use the "internal memory" data type.
929 * [The object reference from its allocation is donated
930 * to the eventual recipient.]
931 */
932 memory_object_data_initialize(object->pager,
933 paging_offset,
934 PAGE_SIZE);
935
936 vm_object_lock(object);
937 }
938
939 #if MACH_CLUSTER_STATS
940 #define MAXCLUSTERPAGES 16
941 struct {
942 unsigned long pages_in_cluster;
943 unsigned long pages_at_higher_offsets;
944 unsigned long pages_at_lower_offsets;
945 } cluster_stats[MAXCLUSTERPAGES];
946 #endif /* MACH_CLUSTER_STATS */
947
948 boolean_t allow_clustered_pageouts = FALSE;
949
950 /*
951 * vm_pageout_cluster:
952 *
953 * Given a page, queue it to the appropriate I/O thread,
954 * which will page it out and attempt to clean adjacent pages
955 * in the same operation.
956 *
957 * The page must be busy, and the object and queues locked. We will take a
958 * paging reference to prevent deallocation or collapse when we
959 * release the object lock back at the call site. The I/O thread
960 * is responsible for consuming this reference
961 *
962 * The page must not be on any pageout queue.
963 */
964
965 void
966 vm_pageout_cluster(vm_page_t m)
967 {
968 vm_object_t object = m->object;
969 struct vm_pageout_queue *q;
970
971
972 XPR(XPR_VM_PAGEOUT,
973 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
974 (integer_t)object, m->offset, (integer_t)m, 0, 0);
975
976 /*
977 * Only a certain kind of page is appreciated here.
978 */
979 assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
980 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
981
982 /*
983 * protect the object from collapse -
984 * locking in the object's paging_offset.
985 */
986 vm_object_paging_begin(object);
987
988 /*
989 * set the page for future call to vm_fault_list_request
990 * page should already be marked busy
991 */
992 vm_page_wire(m);
993 m->list_req_pending = TRUE;
994 m->cleaning = TRUE;
995 m->pageout = TRUE;
996 m->laundry = TRUE;
997
998 if (object->internal == TRUE)
999 q = &vm_pageout_queue_internal;
1000 else
1001 q = &vm_pageout_queue_external;
1002 q->pgo_laundry++;
1003
1004 m->pageout_queue = TRUE;
1005 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
1006
1007 if (q->pgo_idle == TRUE) {
1008 q->pgo_idle = FALSE;
1009 thread_wakeup((event_t) &q->pgo_pending);
1010 }
1011 }
1012
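/*
 * A brief summary of the laundering path that starts here:
 * vm_pageout_cluster() marks the page (list_req_pending, cleaning,
 * pageout, laundry, pageout_queue, wired), bumps the queue's pgo_laundry
 * count and enqueues the page on pgo_pending.  The matching iothread
 * dequeues it in vm_pageout_iothread_continue() and hands it to
 * memory_object_data_return(); when the pageout completes, the laundry
 * count is dropped again via vm_pageout_throttle_up().
 */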
1013
1014 unsigned long vm_pageout_throttle_up_count = 0;
1015
1016 /*
1017 * A page is back from laundry. See if there are some pages waiting to
1018 * go to laundry and if we can let some of them go now.
1019 *
1020 * Object and page queues must be locked.
1021 */
1022 void
1023 vm_pageout_throttle_up(
1024 vm_page_t m)
1025 {
1026 struct vm_pageout_queue *q;
1027
1028 vm_pageout_throttle_up_count++;
1029
1030 assert(m->laundry);
1031 assert(m->object != VM_OBJECT_NULL);
1032 assert(m->object != kernel_object);
1033
1034 if (m->object->internal == TRUE)
1035 q = &vm_pageout_queue_internal;
1036 else
1037 q = &vm_pageout_queue_external;
1038
1039 m->laundry = FALSE;
1040 q->pgo_laundry--;
1041
1042 if (q->pgo_throttled == TRUE) {
1043 q->pgo_throttled = FALSE;
1044 thread_wakeup((event_t) &q->pgo_laundry);
1045 }
1046 }
1047
1048
1049 /*
1050 * vm_pageout_scan does the dirty work for the pageout daemon.
1051 * It returns with vm_page_queue_free_lock held and
1052 * vm_page_free_wanted == 0.
1053 */
1054
1055 #define DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
1056
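/*
 * A brief note on the batching below: vm_pageout_scan() holds the page
 * queues lock across up to DELAYED_UNLOCK_LIMIT pages, then drops it,
 * frees anything gathered on local_freeq and calls mutex_pause() so
 * other threads get a chance at the queues before the scan resumes.
 */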
1057 #define FCS_IDLE 0
1058 #define FCS_DELAYED 1
1059 #define FCS_DEADLOCK_DETECTED 2
1060
1061 struct flow_control {
1062 int state;
1063 mach_timespec_t ts;
1064 };
1065
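/*
 * A short sketch of how vm_pageout_scan() below drives these states while
 * the default pager's queue stays throttled:
 *
 *	FCS_IDLE		queue just became throttled: arm a timer for
 *				now + vm_pageout_deadlock_wait and move to
 *				FCS_DELAYED.
 *	FCS_DELAYED		still throttled: if the timer has expired,
 *				assume a deadlock, set vm_pageout_deadlock_target
 *				and move to FCS_DEADLOCK_DETECTED; otherwise
 *				wait vm_pageout_idle_wait and re-check.
 *	FCS_DEADLOCK_DETECTED	steal clean or external pages until the target
 *				drains, then re-arm the timer (back to
 *				FCS_DELAYED).
 *
 * Any pass on which the internal queue is no longer throttled drops the
 * state back to FCS_IDLE.
 */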
1066 extern kern_return_t sysclk_gettime(mach_timespec_t *);
1067
1068
1069 void
1070 vm_pageout_scan(void)
1071 {
1072 unsigned int loop_count = 0;
1073 unsigned int inactive_burst_count = 0;
1074 unsigned int active_burst_count = 0;
1075 vm_page_t local_freeq = 0;
1076 int local_freed = 0;
1077 int delayed_unlock = 0;
1078 int need_internal_inactive = 0;
1079 int refmod_state = 0;
1080 int vm_pageout_deadlock_target = 0;
1081 struct vm_pageout_queue *iq;
1082 struct vm_pageout_queue *eq;
1083 struct flow_control flow_control;
1084 boolean_t active_throttled = FALSE;
1085 boolean_t inactive_throttled = FALSE;
1086 mach_timespec_t ts;
1087 unsigned int msecs = 0;
1088 vm_object_t object;
1089
1090
1091 flow_control.state = FCS_IDLE;
1092 iq = &vm_pageout_queue_internal;
1093 eq = &vm_pageout_queue_external;
1094
1095 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1096
1097 /*???*/ /*
1098 * We want to gradually dribble pages from the active queue
1099 * to the inactive queue. If we let the inactive queue get
1100 * very small, and then suddenly dump many pages into it,
1101 * those pages won't get a sufficient chance to be referenced
1102 * before we start taking them from the inactive queue.
1103 *
1104 * We must limit the rate at which we send pages to the pagers.
1105 * data_write messages consume memory, for message buffers and
1106 * for map-copy objects. If we get too far ahead of the pagers,
1107 * we can potentially run out of memory.
1108 *
1109 * We can use the laundry count to limit directly the number
1110 * of pages outstanding to the default pager. A similar
1111 * strategy for external pagers doesn't work, because
1112 * external pagers don't have to deallocate the pages sent them,
1113 * and because we might have to send pages to external pagers
1114 * even if they aren't processing writes. So we also
1115 * use a burst count to limit writes to external pagers.
1116 *
1117 * When memory is very tight, we can't rely on external pagers to
1118 * clean pages. They probably aren't running, because they
1119 * aren't vm-privileged. If we kept sending dirty pages to them,
1120 * we could exhaust the free list.
1121 */
1122 vm_page_lock_queues();
1123 delayed_unlock = 1;
1124
1125
1126 Restart:
1127 /*
1128 * Recalculate vm_page_inactive_target.
1129 */
1130 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1131 vm_page_inactive_count);
1132 object = NULL;
1133
1134 for (;;) {
1135 vm_page_t m;
1136
1137 if (delayed_unlock == 0)
1138 vm_page_lock_queues();
1139
1140 active_burst_count = vm_page_active_count;
1141
1142 if (active_burst_count > vm_pageout_burst_active_throttle)
1143 active_burst_count = vm_pageout_burst_active_throttle;
1144
1145 /*
1146 * Move pages from active to inactive.
1147 */
1148 while ((need_internal_inactive ||
1149 vm_page_inactive_count < vm_page_inactive_target) &&
1150 !queue_empty(&vm_page_queue_active) &&
1151 ((active_burst_count--) > 0)) {
1152
1153 vm_pageout_active++;
1154
1155 m = (vm_page_t) queue_first(&vm_page_queue_active);
1156
1157 assert(m->active && !m->inactive);
1158 assert(!m->laundry);
1159 assert(m->object != kernel_object);
1160
1161 /*
1162 * Try to lock object; since we've already got the
1163 * page queues lock, we can only 'try' for this one.
1164 * if the 'try' fails, we need to do a mutex_pause
1165 * to allow the owner of the object lock a chance to
1166 * run... otherwise, we're likely to trip over this
1167 * object in the same state as we work our way through
1168 * the queue... clumps of pages associated with the same
1169 * object are fairly typical on the inactive and active queues
1170 */
1171 if (m->object != object) {
1172 if (object != NULL) {
1173 vm_object_unlock(object);
1174 object = NULL;
1175 }
1176 if (!vm_object_lock_try(m->object)) {
1177 /*
1178 * move page to end of active queue and continue
1179 */
1180 queue_remove(&vm_page_queue_active, m,
1181 vm_page_t, pageq);
1182 queue_enter(&vm_page_queue_active, m,
1183 vm_page_t, pageq);
1184
1185 goto done_with_activepage;
1186 }
1187 object = m->object;
1188 }
1189 /*
1190 * if the page is BUSY, then we pull it
1191 * off the active queue and leave it alone.
1192 * when BUSY is cleared, it will get stuck
1193 * back on the appropriate queue
1194 */
1195 if (m->busy) {
1196 queue_remove(&vm_page_queue_active, m,
1197 vm_page_t, pageq);
1198 m->pageq.next = NULL;
1199 m->pageq.prev = NULL;
1200
1201 if (!m->fictitious)
1202 vm_page_active_count--;
1203 m->active = FALSE;
1204
1205 goto done_with_activepage;
1206 }
1207 if (need_internal_inactive) {
1208 /*
1209 * If we're unable to make forward progress
1210 * with the current set of pages on the
1211 * inactive queue due to busy objects or
1212 * throttled pageout queues, then
1213 * move a page that is already clean
1214 * or belongs to a pageout queue that
1215 * isn't currently throttled
1216 */
1217 active_throttled = FALSE;
1218
1219 if (object->internal) {
1220 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1221 active_throttled = TRUE;
1222 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1223 active_throttled = TRUE;
1224 }
1225 if (active_throttled == TRUE) {
1226 if (!m->dirty) {
1227 refmod_state = pmap_get_refmod(m->phys_page);
1228
1229 if (refmod_state & VM_MEM_REFERENCED)
1230 m->reference = TRUE;
1231 if (refmod_state & VM_MEM_MODIFIED)
1232 m->dirty = TRUE;
1233 }
1234 if (m->dirty || m->precious) {
1235 /*
1236 * page is dirty and targets a THROTTLED queue
1237 * so all we can do is move it back to the
1238 * end of the active queue to get it out
1239 * of the way
1240 */
1241 queue_remove(&vm_page_queue_active, m,
1242 vm_page_t, pageq);
1243 queue_enter(&vm_page_queue_active, m,
1244 vm_page_t, pageq);
1245
1246 vm_pageout_scan_active_throttled++;
1247
1248 goto done_with_activepage;
1249 }
1250 }
1251 vm_pageout_scan_active_throttle_success++;
1252 need_internal_inactive--;
1253 }
1254 /*
1255 * Deactivate the page while holding the object
1256 * locked, so we know the page is still not busy.
1257 * This should prevent races between pmap_enter
1258 * and pmap_clear_reference. The page might be
1259 * absent or fictitious, but vm_page_deactivate
1260 * can handle that.
1261 */
1262 vm_page_deactivate(m);
1263 done_with_activepage:
1264 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1265
1266 if (object != NULL) {
1267 vm_object_unlock(object);
1268 object = NULL;
1269 }
1270 if (local_freeq) {
1271 vm_page_free_list(local_freeq);
1272
1273 local_freeq = 0;
1274 local_freed = 0;
1275 }
1276 delayed_unlock = 0;
1277 vm_page_unlock_queues();
1278
1279 mutex_pause();
1280 vm_page_lock_queues();
1281 /*
1282 * continue the while loop processing
1283 * the active queue... need to hold
1284 * the page queues lock
1285 */
1286 continue;
1287 }
1288 }
1289
1290
1291
1292 /**********************************************************************
1293 * above this point we're playing with the active queue
1294 * below this point we're playing with the throttling mechanisms
1295 * and the inactive queue
1296 **********************************************************************/
1297
1298
1299
1300 /*
1301 * We are done if we have met our target *and*
1302 * nobody is still waiting for a page.
1303 */
1304 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1305 if (object != NULL) {
1306 vm_object_unlock(object);
1307 object = NULL;
1308 }
1309 if (local_freeq) {
1310 vm_page_free_list(local_freeq);
1311
1312 local_freeq = 0;
1313 local_freed = 0;
1314 }
1315 mutex_lock(&vm_page_queue_free_lock);
1316
1317 if ((vm_page_free_count >= vm_page_free_target) &&
1318 (vm_page_free_wanted == 0)) {
1319
1320 vm_page_unlock_queues();
1321
1322 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1323 return;
1324 }
1325 mutex_unlock(&vm_page_queue_free_lock);
1326 }
1327
1328
1329 /*
1330 * Sometimes we have to pause:
1331 * 1) No inactive pages - nothing to do.
1332 * 2) Flow control - default pageout queue is full
1333 * 3) Loop control - no acceptable pages found on the inactive queue
1334 * within the last vm_pageout_burst_inactive_throttle iterations
1335 */
1336 if ((queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf))) {
1337 vm_pageout_scan_empty_throttle++;
1338 msecs = vm_pageout_empty_wait;
1339 goto vm_pageout_scan_delay;
1340
1341 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1342 vm_pageout_scan_burst_throttle++;
1343 msecs = vm_pageout_burst_wait;
1344 goto vm_pageout_scan_delay;
1345
1346 } else if (VM_PAGE_Q_THROTTLED(iq)) {
1347
1348 switch (flow_control.state) {
1349
1350 case FCS_IDLE:
1351 reset_deadlock_timer:
1352 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1353 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1354 sysclk_gettime(&flow_control.ts);
1355 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1356
1357 flow_control.state = FCS_DELAYED;
1358 msecs = vm_pageout_deadlock_wait;
1359
1360 break;
1361
1362 case FCS_DELAYED:
1363 sysclk_gettime(&ts);
1364
1365 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1366 /*
1367 * the pageout thread for the default pager is potentially
1368 * deadlocked since the
1369 * default pager queue has been throttled for more than the
1370 * allowable time... we need to move some clean pages or dirty
1371 * pages belonging to the external pagers if they aren't throttled...
1372 * vm_page_free_wanted represents the number of threads currently
1373 * blocked waiting for pages... we'll move one page for each of
1374 * these plus a fixed amount to break the logjam... once we're done
1375 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1376 * with a new timeout target since we have no way of knowing
1377 * whether we've broken the deadlock except through observation
1378 * of the queue associated with the default pager... we need to
1379 * stop moving pages and allow the system to run to see what
1380 * state it settles into.
1381 */
1382 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted;
1383 vm_pageout_scan_deadlock_detected++;
1384 flow_control.state = FCS_DEADLOCK_DETECTED;
1385
1386 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1387 goto consider_inactive;
1388 }
1389 /*
1390 * just resniff instead of trying
1391 * to compute a new delay time... we're going to be
1392 * awakened immediately upon a laundry completion,
1393 * so we won't wait any longer than necessary
1394 */
1395 msecs = vm_pageout_idle_wait;
1396 break;
1397
1398 case FCS_DEADLOCK_DETECTED:
1399 if (vm_pageout_deadlock_target)
1400 goto consider_inactive;
1401 goto reset_deadlock_timer;
1402
1403 }
1404 vm_pageout_scan_throttle++;
1405 iq->pgo_throttled = TRUE;
1406 vm_pageout_scan_delay:
1407 if (object != NULL) {
1408 vm_object_unlock(object);
1409 object = NULL;
1410 }
1411 if (local_freeq) {
1412 vm_page_free_list(local_freeq);
1413
1414 local_freeq = 0;
1415 local_freed = 0;
1416 }
1417 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1418
1419 counter(c_vm_pageout_scan_block++);
1420
1421 vm_page_unlock_queues();
1422
1423 thread_block(THREAD_CONTINUE_NULL);
1424
1425 vm_page_lock_queues();
1426 delayed_unlock = 1;
1427
1428 iq->pgo_throttled = FALSE;
1429
1430 if (loop_count >= vm_page_inactive_count) {
1431 if (VM_PAGE_Q_THROTTLED(eq) || VM_PAGE_Q_THROTTLED(iq)) {
1432 /*
1433 * Make sure we move enough "appropriate"
1434 * pages to the inactive queue before trying
1435 * again.
1436 */
1437 need_internal_inactive = vm_pageout_inactive_relief;
1438 }
1439 loop_count = 0;
1440 }
1441 inactive_burst_count = 0;
1442
1443 goto Restart;
1444 /*NOTREACHED*/
1445 }
1446
1447
1448 flow_control.state = FCS_IDLE;
1449 consider_inactive:
1450 loop_count++;
1451 inactive_burst_count++;
1452 vm_pageout_inactive++;
1453
1454 if (!queue_empty(&vm_page_queue_inactive)) {
1455 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1456
1457 if (m->clustered && (m->no_isync == TRUE)) {
1458 goto use_this_page;
1459 }
1460 }
1461 if (vm_zf_count < vm_accellerate_zf_pageout_trigger) {
1462 vm_zf_iterator = 0;
1463 } else {
1464 last_page_zf = 0;
1465 if((vm_zf_iterator+=1) >= vm_zf_iterator_count) {
1466 vm_zf_iterator = 0;
1467 }
1468 }
1469 if (queue_empty(&vm_page_queue_zf) ||
1470 (((last_page_zf) || (vm_zf_iterator == 0)) &&
1471 !queue_empty(&vm_page_queue_inactive))) {
1472 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1473 last_page_zf = 0;
1474 } else {
1475 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1476 last_page_zf = 1;
1477 }
1478 use_this_page:
1479 assert(!m->active && m->inactive);
1480 assert(!m->laundry);
1481 assert(m->object != kernel_object);
1482
1483 /*
1484 * Try to lock object; since we've already got the
1485 * page queues lock, we can only 'try' for this one.
1486 * if the 'try' fails, we need to do a mutex_pause
1487 * to allow the owner of the object lock a chance to
1488 * run... otherwise, we're likely to trip over this
1489 * object in the same state as we work our way through
1490 * the queue... clumps of pages associated with the same
1491 * object are fairly typical on the inactive and active queues
1492 */
1493 if (m->object != object) {
1494 if (object != NULL) {
1495 vm_object_unlock(object);
1496 object = NULL;
1497 }
1498 if (!vm_object_lock_try(m->object)) {
1499 /*
1500 * Move page to end and continue.
1501 * Don't re-issue ticket
1502 */
1503 if (m->zero_fill) {
1504 queue_remove(&vm_page_queue_zf, m,
1505 vm_page_t, pageq);
1506 queue_enter(&vm_page_queue_zf, m,
1507 vm_page_t, pageq);
1508 } else {
1509 queue_remove(&vm_page_queue_inactive, m,
1510 vm_page_t, pageq);
1511 queue_enter(&vm_page_queue_inactive, m,
1512 vm_page_t, pageq);
1513 }
1514 vm_pageout_inactive_nolock++;
1515
1516 /*
1517 * force us to dump any collected free pages
1518 * and to pause before moving on
1519 */
1520 delayed_unlock = DELAYED_UNLOCK_LIMIT + 1;
1521
1522 goto done_with_inactivepage;
1523 }
1524 object = m->object;
1525 }
1526 /*
1527 * If the page belongs to a purgable object with no pending copies
1528 * against it, then we reap all of the pages in the object
1529 * and note that the object has been "emptied". It'll be up to the
1530 * application to discover this and recreate its contents if desired.
1531 */
1532 if ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE ||
1533 object->purgable == VM_OBJECT_PURGABLE_EMPTY) &&
1534 object->copy == VM_OBJECT_NULL) {
1535
1536 (void) vm_object_purge(object);
1537 vm_pageout_purged_objects++;
1538 /*
1539 * we've just taken all of the pages from this object,
1540 * so drop the lock now since we're not going to find
1541 * any more pages belonging to it anytime soon
1542 */
1543 vm_object_unlock(object);
1544 object = NULL;
1545
1546 inactive_burst_count = 0;
1547
1548 goto done_with_inactivepage;
1549 }
1550
1551 /*
1552 * Paging out pages of external objects which
1553 * are currently being created must be avoided.
1554 * The pager may need to claim memory, thus leading to a
1555 * possible deadlock between it and the pageout thread,
1556 * if such pages are finally chosen. The remaining assumption
1557 * is that there will finally be enough available pages in the
1558 * inactive pool to page out in order to satisfy all memory
1559 * claimed by the thread which concurrently creates the pager.
1560 */
1561 if (!object->pager_initialized && object->pager_created) {
1562 /*
1563 * Move page to end and continue, hoping that
1564 * there will be enough other inactive pages to
1565 * page out so that the thread which currently
1566 * initializes the pager will succeed.
1567 * Don't re-grant the ticket; the page should be
1568 * pulled from the queue and paged out whenever
1569 * one of its logically adjacent fellows is
1570 * targeted.
1571 */
1572 if (m->zero_fill) {
1573 queue_remove(&vm_page_queue_zf, m,
1574 vm_page_t, pageq);
1575 queue_enter(&vm_page_queue_zf, m,
1576 vm_page_t, pageq);
1577 last_page_zf = 1;
1578 vm_zf_iterator = vm_zf_iterator_count - 1;
1579 } else {
1580 queue_remove(&vm_page_queue_inactive, m,
1581 vm_page_t, pageq);
1582 queue_enter(&vm_page_queue_inactive, m,
1583 vm_page_t, pageq);
1584 last_page_zf = 0;
1585 vm_zf_iterator = 1;
1586 }
1587 vm_pageout_inactive_avoid++;
1588
1589 goto done_with_inactivepage;
1590 }
1591 /*
1592 * Remove the page from the inactive list.
1593 */
1594 if (m->zero_fill) {
1595 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1596 } else {
1597 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1598 }
1599 m->pageq.next = NULL;
1600 m->pageq.prev = NULL;
1601 m->inactive = FALSE;
1602 if (!m->fictitious)
1603 vm_page_inactive_count--;
1604
1605 if (m->busy || !object->alive) {
1606 /*
1607 * Somebody is already playing with this page.
1608 * Leave it off the pageout queues.
1609 */
1610 vm_pageout_inactive_busy++;
1611
1612 goto done_with_inactivepage;
1613 }
1614
1615 /*
1616 * If it's absent or in error, we can reclaim the page.
1617 */
1618
1619 if (m->absent || m->error) {
1620 vm_pageout_inactive_absent++;
1621 reclaim_page:
1622 if (vm_pageout_deadlock_target) {
1623 vm_pageout_scan_inactive_throttle_success++;
1624 vm_pageout_deadlock_target--;
1625 }
1626 if (m->tabled)
1627 vm_page_remove(m); /* clears tabled, object, offset */
1628 if (m->absent)
1629 vm_object_absent_release(object);
1630
1631 assert(m->pageq.next == NULL &&
1632 m->pageq.prev == NULL);
1633 m->pageq.next = (queue_entry_t)local_freeq;
1634 local_freeq = m;
1635 local_freed++;
1636
1637 inactive_burst_count = 0;
1638
1639 goto done_with_inactivepage;
1640 }
1641
1642 assert(!m->private);
1643 assert(!m->fictitious);
1644
1645 /*
1646 * If already cleaning this page in place, convert from
1647 * "adjacent" to "target". We can leave the page mapped,
1648 * and vm_pageout_object_terminate will determine whether
1649 * to free or reactivate.
1650 */
1651
1652 if (m->cleaning) {
1653 m->busy = TRUE;
1654 m->pageout = TRUE;
1655 m->dump_cleaning = TRUE;
1656 vm_page_wire(m);
1657
1658 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1659
1660 inactive_burst_count = 0;
1661
1662 goto done_with_inactivepage;
1663 }
1664
1665 /*
1666 * If it's being used, reactivate.
1667 * (Fictitious pages are either busy or absent.)
1668 */
1669 if ( (!m->reference) ) {
1670 refmod_state = pmap_get_refmod(m->phys_page);
1671
1672 if (refmod_state & VM_MEM_REFERENCED)
1673 m->reference = TRUE;
1674 if (refmod_state & VM_MEM_MODIFIED)
1675 m->dirty = TRUE;
1676 }
1677 if (m->reference) {
1678 was_referenced:
1679 vm_page_activate(m);
1680 VM_STAT(reactivations++);
1681
1682 vm_pageout_inactive_used++;
1683 last_page_zf = 0;
1684 inactive_burst_count = 0;
1685
1686 goto done_with_inactivepage;
1687 }
1688
1689 XPR(XPR_VM_PAGEOUT,
1690 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1691 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1692
1693 /*
1694 * we've got a candidate page to steal...
1695 *
1696 * m->dirty is up to date courtesy of the
1697 * preceding check for m->reference... if
1698 * we get here, then m->reference had to be
1699 * FALSE which means we did a pmap_get_refmod
1700 * and updated both m->reference and m->dirty
1701 *
1702 * if it's dirty or precious we need to
1703 * see if the target queue is throttled...
1704 * if it is, we need to skip over it by moving it back
1705 * to the end of the inactive queue
1706 */
1707 inactive_throttled = FALSE;
1708
1709 if (m->dirty || m->precious) {
1710 if (object->internal) {
1711 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1712 inactive_throttled = TRUE;
1713 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1714 inactive_throttled = TRUE;
1715 }
1716 }
1717 if (inactive_throttled == TRUE) {
1718 if (m->zero_fill) {
1719 queue_enter(&vm_page_queue_zf, m,
1720 vm_page_t, pageq);
1721 } else {
1722 queue_enter(&vm_page_queue_inactive, m,
1723 vm_page_t, pageq);
1724 }
1725 if (!m->fictitious)
1726 vm_page_inactive_count++;
1727 m->inactive = TRUE;
1728
1729 vm_pageout_scan_inactive_throttled++;
1730
1731 goto done_with_inactivepage;
1732 }
1733 /*
1734 * we've got a page that we can steal...
1735 * eliminate all mappings and make sure
1736 * we have the up-to-date modified state
1737 * first take the page BUSY, so that no new
1738 * mappings can be made
1739 */
1740 m->busy = TRUE;
1741
1742 /*
1743 * if we need to do a pmap_disconnect then we
1744 * need to re-evaluate m->dirty since the pmap_disconnect
1745 * provides the true state atomically... the
1746 * page was still mapped up to the pmap_disconnect
1747 * and may have been dirtied at the last microsecond
1748 *
1749 * we also check for the page being referenced 'late'
1750 * if it was, we first need to do a WAKEUP_DONE on it
1751 * since we already set m->busy = TRUE, before
1752 * going off to reactivate it
1753 *
1754 * if we don't need the pmap_disconnect, then
1755 * m->dirty is up to date courtesy of the
1756 * earlier check for m->reference... if
1757 * we get here, then m->reference had to be
1758 * FALSE which means we did a pmap_get_refmod
1759 * and updated both m->reference and m->dirty...
1760 */
1761 if (m->no_isync == FALSE) {
1762 refmod_state = pmap_disconnect(m->phys_page);
1763
1764 if (refmod_state & VM_MEM_MODIFIED)
1765 m->dirty = TRUE;
1766 if (refmod_state & VM_MEM_REFERENCED) {
1767 m->reference = TRUE;
1768
1769 PAGE_WAKEUP_DONE(m);
1770 goto was_referenced;
1771 }
1772 }
1773 /*
1774 * If it's clean and not precious, we can free the page.
1775 */
1776 if (!m->dirty && !m->precious) {
1777 vm_pageout_inactive_clean++;
1778 goto reclaim_page;
1779 }
1780 vm_pageout_cluster(m);
1781
1782 vm_pageout_inactive_dirty++;
1783
1784 inactive_burst_count = 0;
1785
1786 done_with_inactivepage:
1787 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1788
1789 if (object != NULL) {
1790 vm_object_unlock(object);
1791 object = NULL;
1792 }
1793 if (local_freeq) {
1794 vm_page_free_list(local_freeq);
1795
1796 local_freeq = 0;
1797 local_freed = 0;
1798 }
1799 delayed_unlock = 0;
1800 vm_page_unlock_queues();
1801 mutex_pause();
1802 }
1803 /*
1804 * back to top of pageout scan loop
1805 */
1806 }
1807 }
1808
1809
1810 int vm_page_free_count_init;
1811
1812 void
1813 vm_page_free_reserve(
1814 int pages)
1815 {
1816 int free_after_reserve;
1817
1818 vm_page_free_reserved += pages;
1819
1820 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1821
1822 vm_page_free_min = vm_page_free_reserved +
1823 VM_PAGE_FREE_MIN(free_after_reserve);
1824
1825 vm_page_free_target = vm_page_free_reserved +
1826 VM_PAGE_FREE_TARGET(free_after_reserve);
1827
1828 if (vm_page_free_target < vm_page_free_min + 5)
1829 vm_page_free_target = vm_page_free_min + 5;
1830 }
1831
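/*
 * A rough worked example with purely illustrative numbers: if
 * vm_page_free_count_init is 100000 pages and the accumulated reserve
 * is 100 pages, then free_after_reserve = 99900 and
 *
 *	vm_page_free_min    = 100 + (10 + 99900 / 100) = 1109 pages
 *	vm_page_free_target = 100 + (15 + 99900 / 80)  = 1363 pages
 *
 * so the target already clears the "min + 5" floor enforced above.
 */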
1832 /*
1833 * vm_pageout is the high level pageout daemon.
1834 */
1835
1836 void
1837 vm_pageout_continue(void)
1838 {
1839 vm_pageout_scan_event_counter++;
1840 vm_pageout_scan();
1841 /* we hold vm_page_queue_free_lock now */
1842 assert(vm_page_free_wanted == 0);
1843 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
1844 mutex_unlock(&vm_page_queue_free_lock);
1845
1846 counter(c_vm_pageout_block++);
1847 thread_block((thread_continue_t)vm_pageout_continue);
1848 /*NOTREACHED*/
1849 }
1850
1851
1852 /*
1853 * must be called with the
1854 * queues and object locks held
1855 */
1856 static void
1857 vm_pageout_queue_steal(vm_page_t m)
1858 {
1859 struct vm_pageout_queue *q;
1860
1861 if (m->object->internal == TRUE)
1862 q = &vm_pageout_queue_internal;
1863 else
1864 q = &vm_pageout_queue_external;
1865
1866 m->laundry = FALSE;
1867 m->pageout_queue = FALSE;
1868 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
1869
1870 m->pageq.next = NULL;
1871 m->pageq.prev = NULL;
1872
1873 vm_object_paging_end(m->object);
1874
1875 q->pgo_laundry--;
1876 }
1877
1878
1879 #ifdef FAKE_DEADLOCK
1880
1881 #define FAKE_COUNT 5000
1882
1883 int internal_count = 0;
1884 int fake_deadlock = 0;
1885
1886 #endif
1887
1888 static void
1889 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
1890 {
1891 vm_page_t m = NULL;
1892 vm_object_t object;
1893 boolean_t need_wakeup;
1894
1895 vm_page_lock_queues();
1896
1897 while ( !queue_empty(&q->pgo_pending) ) {
1898
1899 q->pgo_busy = TRUE;
1900 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
1901 m->pageout_queue = FALSE;
1902 vm_page_unlock_queues();
1903
1904 m->pageq.next = NULL;
1905 m->pageq.prev = NULL;
1906 #ifdef FAKE_DEADLOCK
1907 if (q == &vm_pageout_queue_internal) {
1908 vm_offset_t addr;
1909 int pg_count;
1910
1911 internal_count++;
1912
1913 if ((internal_count == FAKE_COUNT)) {
1914
1915 pg_count = vm_page_free_count + vm_page_free_reserved;
1916
1917 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
1918 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
1919 }
1920 internal_count = 0;
1921 fake_deadlock++;
1922 }
1923 }
1924 #endif
1925 object = m->object;
1926
1927 if (!object->pager_initialized) {
1928 vm_object_lock(object);
1929
1930 /*
1931 * If there is no memory object for the page, create
1932 * one and hand it to the default pager.
1933 */
1934
1935 if (!object->pager_initialized)
1936 vm_object_collapse(object, (vm_object_offset_t)0);
1937 if (!object->pager_initialized)
1938 vm_object_pager_create(object);
1939 if (!object->pager_initialized) {
1940 /*
1941 * Still no pager for the object.
1942 * Reactivate the page.
1943 *
1944 * Should only happen if there is no
1945 * default pager.
1946 */
1947 m->list_req_pending = FALSE;
1948 m->cleaning = FALSE;
1949 m->pageout = FALSE;
1950 vm_page_unwire(m);
1951
1952 vm_pageout_throttle_up(m);
1953
1954 vm_page_lock_queues();
1955 vm_pageout_dirty_no_pager++;
1956 vm_page_activate(m);
1957 vm_page_unlock_queues();
1958
1959 /*
1960 * And we are done with it.
1961 */
1962 PAGE_WAKEUP_DONE(m);
1963
1964 vm_object_paging_end(object);
1965 vm_object_unlock(object);
1966
1967 vm_page_lock_queues();
1968 continue;
1969 } else if (object->pager == MEMORY_OBJECT_NULL) {
1970 /*
1971 * This pager has been destroyed by either
1972 * memory_object_destroy or vm_object_destroy, and
1973 * so there is nowhere for the page to go.
1974 * Just free the page... VM_PAGE_FREE takes
1975 * care of cleaning up all the state...
1976 * including doing the vm_pageout_throttle_up
1977 */
1978 VM_PAGE_FREE(m);
1979
1980 vm_object_paging_end(object);
1981 vm_object_unlock(object);
1982
1983 vm_page_lock_queues();
1984 continue;
1985 }
1986 vm_object_unlock(object);
1987 }
1988 /*
1989 * we expect the paging_in_progress reference to have
1990 * already been taken on the object before it was added
1991 * to the appropriate pageout I/O queue... this will
1992 * keep the object from being terminated and/or the
1993 * paging_offset from changing until the I/O has
1994 * completed... therefore no need to lock the object to
1995 * pull the paging_offset from it.
1996 *
1997 * Send the data to the pager.
1998 * Any pageout clustering happens there
1999 */
2000 memory_object_data_return(object->pager,
2001 m->offset + object->paging_offset,
2002 PAGE_SIZE,
2003 NULL,
2004 NULL,
2005 FALSE,
2006 FALSE,
2007 0);
2008
2009 vm_object_lock(object);
2010 vm_object_paging_end(object);
2011 vm_object_unlock(object);
2012
2013 vm_page_lock_queues();
2014 }
2015 assert_wait((event_t) q, THREAD_UNINT);
2016
2017
2018 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2019 q->pgo_throttled = FALSE;
2020 need_wakeup = TRUE;
2021 } else
2022 need_wakeup = FALSE;
2023
2024 q->pgo_busy = FALSE;
2025 q->pgo_idle = TRUE;
2026 vm_page_unlock_queues();
2027
2028 if (need_wakeup == TRUE)
2029 thread_wakeup((event_t) &q->pgo_laundry);
2030
2031 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2032 /*NOTREACHED*/
2033 }
2034
2035
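/*
 * Startup entry points for the two pageout I/O threads.  Each simply
 * enters vm_pageout_iothread_continue() on its queue; the internal
 * thread first sets TH_OPT_VMPRIV on itself.
 */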
2036 static void
2037 vm_pageout_iothread_external(void)
2038 {
2039
2040 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2041 /*NOTREACHED*/
2042 }
2043
2044
2045 static void
2046 vm_pageout_iothread_internal(void)
2047 {
2048 thread_t self = current_thread();
2049
2050 self->options |= TH_OPT_VMPRIV;
2051
2052 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2053 /*NOTREACHED*/
2054 }
2055
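/*
 *	vm_pageout_garbage_collect:
 *
 *	Kernel garbage-collection thread.  When woken with a non-zero
 *	argument it reclaims unused kernel stacks, runs the
 *	machine-dependent collectors and the zone garbage collector,
 *	then blocks on its own event with this routine as the
 *	continuation.
 */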
2056 static void
2057 vm_pageout_garbage_collect(int collect)
2058 {
2059 if (collect) {
2060 stack_collect();
2061
2062 /*
2063 * consider_zone_gc should be last, because the other operations
2064 * might return memory to zones.
2065 */
2066 consider_machine_collect();
2067 consider_zone_gc();
2068
2069 consider_machine_adjust();
2070 }
2071
2072 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2073
2074 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2075 /*NOTREACHED*/
2076 }
2077
2078
2079
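/*
 *	vm_pageout:
 *
 *	Entry point of the pageout daemon.  Raises the thread's
 *	priority, fills in any pageout tuning parameters still at zero,
 *	marks the kernel task backing-store privileged, sizes the free
 *	page reserve, initializes the external and internal pageout
 *	queues, starts the two I/O threads and the garbage-collection
 *	thread, and finally enters vm_pageout_continue().
 */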
2080 void
2081 vm_pageout(void)
2082 {
2083 thread_t self = current_thread();
2084 thread_t thread;
2085 kern_return_t result;
2086 spl_t s;
2087
2088 /*
2089 * Set thread privileges.
2090 */
2091 s = splsched();
2092 thread_lock(self);
2093 self->priority = BASEPRI_PREEMPT - 1;
2094 set_sched_pri(self, self->priority);
2095 thread_unlock(self);
2096 splx(s);
2097
2098 /*
2099 * Initialize some paging parameters.
2100 */
2101
2102 if (vm_pageout_idle_wait == 0)
2103 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2104
2105 if (vm_pageout_burst_wait == 0)
2106 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2107
2108 if (vm_pageout_empty_wait == 0)
2109 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2110
2111 if (vm_pageout_deadlock_wait == 0)
2112 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2113
2114 if (vm_pageout_deadlock_relief == 0)
2115 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2116
2117 if (vm_pageout_inactive_relief == 0)
2118 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2119
2120 if (vm_pageout_burst_active_throttle == 0)
2121 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2122
2123 if (vm_pageout_burst_inactive_throttle == 0)
2124 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2125
2126 /*
2127 * Set kernel task to low backing store privileged
2128 * status
2129 */
2130 task_lock(kernel_task);
2131 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2132 task_unlock(kernel_task);
2133
2134 vm_page_free_count_init = vm_page_free_count;
2135 vm_zf_iterator = 0;
2136 /*
2137 * even if we've already called vm_page_free_reserve,
2138 * call it again here to ensure that the targets are
2139 * accurately calculated (it uses vm_page_free_count_init);
2140 * calling it with an arg of 0 will not change the reserve
2141 * but will re-calculate free_min and free_target
2142 */
2143 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2144 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2145 } else
2146 vm_page_free_reserve(0);
2147
2148
2149 queue_init(&vm_pageout_queue_external.pgo_pending);
2150 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2151 vm_pageout_queue_external.pgo_laundry = 0;
2152 vm_pageout_queue_external.pgo_idle = FALSE;
2153 vm_pageout_queue_external.pgo_busy = FALSE;
2154 vm_pageout_queue_external.pgo_throttled = FALSE;
2155
2156 queue_init(&vm_pageout_queue_internal.pgo_pending);
2157 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2158 vm_pageout_queue_internal.pgo_laundry = 0;
2159 vm_pageout_queue_internal.pgo_idle = FALSE;
2160 vm_pageout_queue_internal.pgo_busy = FALSE;
2161 vm_pageout_queue_internal.pgo_throttled = FALSE;
2162
2163
2164 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &thread);
2165 if (result != KERN_SUCCESS)
2166 panic("vm_pageout_iothread_internal: create failed");
2167
2168 thread_deallocate(thread);
2169
2170
2171 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_PREEMPT - 1, &thread);
2172 if (result != KERN_SUCCESS)
2173 panic("vm_pageout_iothread_external: create failed");
2174
2175 thread_deallocate(thread);
2176
2177
2178 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_PREEMPT - 2, &thread);
2179 if (result != KERN_SUCCESS)
2180 panic("vm_pageout_garbage_collect: create failed");
2181
2182 thread_deallocate(thread);
2183
2184
2185 vm_pageout_continue();
2186 /*NOTREACHED*/
2187 }
2188
2189
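/*
 * Allocate and initialize a upl.  For UPL_CREATE_INTERNAL the
 * upl_page_info array is allocated inline, immediately after the upl
 * structure; for UPL_CREATE_LITE a bitmap with one bit per page
 * (rounded up to a 4-byte boundary) is appended as well.
 */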
2190 static upl_t
2191 upl_create(
2192 int flags,
2193 upl_size_t size)
2194 {
2195 upl_t upl;
2196 int page_field_size; /* bit field in word size buf */
2197
2198 page_field_size = 0;
2199 if (flags & UPL_CREATE_LITE) {
2200 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2201 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2202 }
2203 if(flags & UPL_CREATE_INTERNAL) {
2204 upl = (upl_t)kalloc(sizeof(struct upl)
2205 + (sizeof(struct upl_page_info)*(size/PAGE_SIZE))
2206 + page_field_size);
2207 } else {
2208 upl = (upl_t)kalloc(sizeof(struct upl) + page_field_size);
2209 }
2210 upl->flags = 0;
2211 upl->src_object = NULL;
2212 upl->kaddr = (vm_offset_t)0;
2213 upl->size = 0;
2214 upl->map_object = NULL;
2215 upl->ref_count = 1;
2216 upl_lock_init(upl);
2217 #ifdef UPL_DEBUG
2218 upl->ubc_alias1 = 0;
2219 upl->ubc_alias2 = 0;
2220 #endif /* UPL_DEBUG */
2221 return(upl);
2222 }
2223
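/*
 * Tear down a upl whose last reference has been dropped: unlink it
 * from its object's UPL queue (UPL_DEBUG only), release the reference
 * held on a pageout map_object, and free the upl along with any
 * inline page list and lite bitmap.
 */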
2224 static void
2225 upl_destroy(
2226 upl_t upl)
2227 {
2228 int page_field_size; /* bit field in word size buf */
2229
2230 #ifdef UPL_DEBUG
2231 {
2232 upl_t upl_ele;
2233 vm_object_t object;
2234 if (upl->map_object->pageout) {
2235 object = upl->map_object->shadow;
2236 } else {
2237 object = upl->map_object;
2238 }
2239 vm_object_lock(object);
2240 queue_iterate(&object->uplq, upl_ele, upl_t, uplq) {
2241 if(upl_ele == upl) {
2242 queue_remove(&object->uplq,
2243 upl_ele, upl_t, uplq);
2244 break;
2245 }
2246 }
2247 vm_object_unlock(object);
2248 }
2249 #endif /* UPL_DEBUG */
2250 /* drop a reference on the map_object whether or */
2251 /* not a pageout object is inserted */
2252 if(upl->map_object->pageout)
2253 vm_object_deallocate(upl->map_object);
2254
2255 page_field_size = 0;
2256 if (upl->flags & UPL_LITE) {
2257 page_field_size = ((upl->size/PAGE_SIZE) + 7) >> 3;
2258 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2259 }
2260 if(upl->flags & UPL_INTERNAL) {
2261 kfree(upl,
2262 sizeof(struct upl) +
2263 (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE))
2264 + page_field_size);
2265 } else {
2266 kfree(upl, sizeof(struct upl) + page_field_size);
2267 }
2268 }
2269
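/*
 * Drop a reference on a upl and destroy it when the count reaches
 * zero.  uc_upl_dealloc() and upl_deallocate() are identical in
 * effect.
 */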
2270 void uc_upl_dealloc(upl_t upl);
2271 __private_extern__ void
2272 uc_upl_dealloc(
2273 upl_t upl)
2274 {
2275 upl->ref_count -= 1;
2276 if(upl->ref_count == 0) {
2277 upl_destroy(upl);
2278 }
2279 }
2280
2281 void
2282 upl_deallocate(
2283 upl_t upl)
2284 {
2285
2286 upl->ref_count -= 1;
2287 if(upl->ref_count == 0) {
2288 upl_destroy(upl);
2289 }
2290 }
2291
2292 /*
2293 * Statistics about UPL enforcement of copy-on-write obligations.
2294 */
2295 unsigned long upl_cow = 0;
2296 unsigned long upl_cow_again = 0;
2297 unsigned long upl_cow_contiguous = 0;
2298 unsigned long upl_cow_pages = 0;
2299 unsigned long upl_cow_again_pages = 0;
2300 unsigned long upl_cow_contiguous_pages = 0;
2301
2302 /*
2303 * Routine: vm_object_upl_request
2304 * Purpose:
2305 * Cause the population of a portion of a vm_object.
2306 * Depending on the nature of the request, the pages
2307 * returned may contain valid data or be uninitialized.
2308 * A page list structure, listing the physical pages
2309 * will be returned upon request.
2310 * This function is called by the file system or any other
2311 * supplier of backing store to a pager.
2312 * IMPORTANT NOTE: The caller must still respect the relationship
2313 * between the vm_object and its backing memory object. The
2314 * caller MUST NOT substitute changes in the backing file
2315 * without first doing a memory_object_lock_request on the
2316 * target range unless it is known that the pages are not
2317 * shared with another entity at the pager level.
2318 * Copy_in_to:
2319 * if a page list structure is present
2320 * return the mapped physical pages, where a
2321 * page is not present, return a non-initialized
2322 * one. If the no_sync bit is turned on, don't
2323 * call the pager unlock to synchronize with other
2324 * possible copies of the page. Leave pages busy
2325 * in the original object, if a page list structure
2326 * was specified. When a commit of the page list
2327 * pages is done, the dirty bit will be set for each one.
2328 * Copy_out_from:
2329 * If a page list structure is present, return
2330 * all mapped pages. Where a page does not exist
2331 * map a zero filled one. Leave pages busy in
2332 * the original object. If a page list structure
2333 * is not specified, this call is a no-op.
2334 *
2335 * Note: access of default pager objects has a rather interesting
2336 * twist. The caller of this routine, presumably the file system
2337 * page cache handling code, will never actually make a request
2338 * against a default pager backed object. Only the default
2339 * pager will make requests on backing store related vm_objects.
2340 * In this way the default pager can maintain the relationship
2341 * between backing store files (abstract memory objects) and
2342 * the vm_objects (cache objects) they support.
2343 *
2344 */
2345
2346 __private_extern__ kern_return_t
2347 vm_object_upl_request(
2348 vm_object_t object,
2349 vm_object_offset_t offset,
2350 upl_size_t size,
2351 upl_t *upl_ptr,
2352 upl_page_info_array_t user_page_list,
2353 unsigned int *page_list_count,
2354 int cntrl_flags)
2355 {
2356 vm_page_t dst_page = VM_PAGE_NULL;
2357 vm_object_offset_t dst_offset = offset;
2358 upl_size_t xfer_size = size;
2359 boolean_t do_m_lock = FALSE;
2360 boolean_t dirty;
2361 boolean_t hw_dirty;
2362 upl_t upl = NULL;
2363 unsigned int entry;
2364 #if MACH_CLUSTER_STATS
2365 boolean_t encountered_lrp = FALSE;
2366 #endif
2367 vm_page_t alias_page = NULL;
2368 int page_ticket;
2369 int refmod_state;
2370 wpl_array_t lite_list = NULL;
2371 vm_object_t last_copy_object;
2372
2373
2374 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2375 /*
2376 * For forward compatibility's sake,
2377 * reject any unknown flag.
2378 */
2379 return KERN_INVALID_VALUE;
2380 }
2381
2382 page_ticket = (cntrl_flags & UPL_PAGE_TICKET_MASK)
2383 >> UPL_PAGE_TICKET_SHIFT;
2384
2385 if(((size/PAGE_SIZE) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
2386 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2387 }
2388
2389 if(cntrl_flags & UPL_SET_INTERNAL)
2390 if(page_list_count != NULL)
2391 *page_list_count = MAX_UPL_TRANSFER;
2392
2393 if((!object->internal) && (object->paging_offset != 0))
2394 panic("vm_object_upl_request: vnode object with non-zero paging offset\n");
2395
2396 if((cntrl_flags & UPL_COPYOUT_FROM) && (upl_ptr == NULL)) {
2397 return KERN_SUCCESS;
2398 }
2399
2400 vm_object_lock(object);
2401 vm_object_paging_begin(object);
2402 vm_object_unlock(object);
2403
2404 if(upl_ptr) {
2405 if(cntrl_flags & UPL_SET_INTERNAL) {
2406 if(cntrl_flags & UPL_SET_LITE) {
2407 uintptr_t page_field_size;
2408 upl = upl_create(
2409 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
2410 size);
2411 user_page_list = (upl_page_info_t *)
2412 (((uintptr_t)upl) + sizeof(struct upl));
2413 lite_list = (wpl_array_t)
2414 (((uintptr_t)user_page_list) +
2415 ((size/PAGE_SIZE) *
2416 sizeof(upl_page_info_t)));
2417 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2418 page_field_size =
2419 (page_field_size + 3) & 0xFFFFFFFC;
2420 bzero((char *)lite_list, page_field_size);
2421 upl->flags =
2422 UPL_LITE | UPL_INTERNAL;
2423 } else {
2424 upl = upl_create(UPL_CREATE_INTERNAL, size);
2425 user_page_list = (upl_page_info_t *)
2426 (((uintptr_t)upl) + sizeof(struct upl));
2427 upl->flags = UPL_INTERNAL;
2428 }
2429 } else {
2430 if(cntrl_flags & UPL_SET_LITE) {
2431 uintptr_t page_field_size;
2432 upl = upl_create(UPL_CREATE_LITE, size);
2433 lite_list = (wpl_array_t)
2434 (((uintptr_t)upl) + sizeof(struct upl));
2435 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2436 page_field_size =
2437 (page_field_size + 3) & 0xFFFFFFFC;
2438 bzero((char *)lite_list, page_field_size);
2439 upl->flags = UPL_LITE;
2440 } else {
2441 upl = upl_create(UPL_CREATE_EXTERNAL, size);
2442 upl->flags = 0;
2443 }
2444 }
2445
2446 if (object->phys_contiguous) {
2447 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2448 object->copy != VM_OBJECT_NULL) {
2449 /* Honor copy-on-write obligations */
2450
2451 /*
2452 * XXX FBDP
2453 * We could still have a race...
2454 * A is here building the UPL for a write().
2455 * A pushes the pages to the current copy
2456 * object.
2457 * A returns the UPL to the caller.
2458 * B comes along and establishes another
2459 * private mapping on this object, inserting
2460 * a new copy object between the original
2461 * object and the old copy object.
2462 * B reads a page and gets the original contents
2463 * from the original object.
2464 * A modifies the page in the original object.
2465 * B reads the page again and sees A's changes,
2466 * which is wrong...
2467 *
2468 * The problem is that the pages are not
2469 * marked "busy" in the original object, so
2470 * nothing prevents B from reading it
2471 * before A's changes are completed.
2472 *
2473 * The "paging_in_progress" might protect us
2474 * from the insertion of a new copy object
2475 * though... To be verified.
2476 */
2477 vm_object_lock_request(object,
2478 offset,
2479 size,
2480 FALSE,
2481 MEMORY_OBJECT_COPY_SYNC,
2482 VM_PROT_NO_CHANGE);
2483 upl_cow_contiguous++;
2484 upl_cow_contiguous_pages += size >> PAGE_SHIFT;
2485 }
2486
2487 upl->map_object = object;
2488 /* don't need any shadow mappings for this one */
2489 /* since it is already I/O memory */
2490 upl->flags |= UPL_DEVICE_MEMORY;
2491
2492
2493 /* paging_in_progress protects paging_offset */
2494 upl->offset = offset + object->paging_offset;
2495 upl->size = size;
2496 *upl_ptr = upl;
2497 if(user_page_list) {
2498 user_page_list[0].phys_addr =
2499 (offset + object->shadow_offset)>>PAGE_SHIFT;
2500 user_page_list[0].device = TRUE;
2501 }
2502
2503 if(page_list_count != NULL) {
2504 if (upl->flags & UPL_INTERNAL) {
2505 *page_list_count = 0;
2506 } else {
2507 *page_list_count = 1;
2508 }
2509 }
2510
2511 return KERN_SUCCESS;
2512 }
2513
2514 if(user_page_list)
2515 user_page_list[0].device = FALSE;
2516
2517 if(cntrl_flags & UPL_SET_LITE) {
2518 upl->map_object = object;
2519 } else {
2520 upl->map_object = vm_object_allocate(size);
2521 /*
2522 * No need to lock the new object: nobody else knows
2523 * about it yet, so it's all ours so far.
2524 */
2525 upl->map_object->shadow = object;
2526 upl->map_object->pageout = TRUE;
2527 upl->map_object->can_persist = FALSE;
2528 upl->map_object->copy_strategy =
2529 MEMORY_OBJECT_COPY_NONE;
2530 upl->map_object->shadow_offset = offset;
2531 upl->map_object->wimg_bits = object->wimg_bits;
2532 }
2533
2534 }
2535 if (!(cntrl_flags & UPL_SET_LITE)) {
2536 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2537 }
2538
2539 /*
2540 * ENCRYPTED SWAP:
2541 * Just mark the UPL as "encrypted" here.
2542 * We'll actually encrypt the pages later,
2543 * in upl_encrypt(), when the caller has
2544 * selected which pages need to go to swap.
2545 */
2546 if (cntrl_flags & UPL_ENCRYPT) {
2547 upl->flags |= UPL_ENCRYPTED;
2548 }
2549 if (cntrl_flags & UPL_FOR_PAGEOUT) {
2550 upl->flags |= UPL_PAGEOUT;
2551 }
2552 vm_object_lock(object);
2553
2554 /* we can lock in the paging_offset once paging_in_progress is set */
2555 if(upl_ptr) {
2556 upl->size = size;
2557 upl->offset = offset + object->paging_offset;
2558 *upl_ptr = upl;
2559 #ifdef UPL_DEBUG
2560 queue_enter(&object->uplq, upl, upl_t, uplq);
2561 #endif /* UPL_DEBUG */
2562 }
2563
2564 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2565 object->copy != VM_OBJECT_NULL) {
2566 /* Honor copy-on-write obligations */
2567
2568 /*
2569 * The caller is gathering these pages and
2570 * might modify their contents. We need to
2571 * make sure that the copy object has its own
2572 * private copies of these pages before we let
2573 * the caller modify them.
2574 */
2575 vm_object_update(object,
2576 offset,
2577 size,
2578 NULL,
2579 NULL,
2580 FALSE, /* should_return */
2581 MEMORY_OBJECT_COPY_SYNC,
2582 VM_PROT_NO_CHANGE);
2583 upl_cow++;
2584 upl_cow_pages += size >> PAGE_SHIFT;
2585
2586 }
2587 /* remember which copy object we synchronized with */
2588 last_copy_object = object->copy;
2589
2590 entry = 0;
2591 if(cntrl_flags & UPL_COPYOUT_FROM) {
2592 upl->flags |= UPL_PAGE_SYNC_DONE;
2593
2594 while (xfer_size) {
2595 if((alias_page == NULL) &&
2596 !(cntrl_flags & UPL_SET_LITE)) {
2597 vm_object_unlock(object);
2598 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2599 vm_object_lock(object);
2600 }
2601 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2602 dst_page->fictitious ||
2603 dst_page->absent ||
2604 dst_page->error ||
2605 (dst_page->wire_count && !dst_page->pageout) ||
2606
2607 ((!dst_page->inactive) && (cntrl_flags & UPL_FOR_PAGEOUT) &&
2608 (dst_page->page_ticket != page_ticket) &&
2609 ((dst_page->page_ticket+1) != page_ticket)) ) {
2610
2611 if (user_page_list)
2612 user_page_list[entry].phys_addr = 0;
2613 } else {
2614 /*
2615 * grab this up front...
2616 * a high percentage of the time we're going to
2617 * need the hardware modification state a bit later
2618 * anyway... so we can eliminate an extra call into
2619 * the pmap layer by grabbing it here and recording it
2620 */
2621 refmod_state = pmap_get_refmod(dst_page->phys_page);
2622
2623 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2624 /*
2625 * we're only asking for DIRTY pages to be returned
2626 */
2627
2628 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2629 /*
2630 * if this is the page stolen by vm_pageout_scan to be
2631 * cleaned (as opposed to a buddy being clustered in),
2632 * or this request is not being driven by a PAGEOUT cluster,
2633 * then we only need to check for the page being dirty or
2634 * precious to decide whether to return it
2635 */
2636 if (dst_page->dirty || dst_page->precious ||
2637 (refmod_state & VM_MEM_MODIFIED)) {
2638 goto check_busy;
2639 }
2640 }
2641 /*
2642 * this is a request for a PAGEOUT cluster and this page
2643 * is merely along for the ride as a 'buddy'... not only
2644 * does it have to be dirty to be returned, but it also
2645 * can't have been referenced recently... note that we've
2646 * already filtered above based on whether this page is
2647 * currently on the inactive queue or it meets the page
2648 * ticket (generation count) check
2649 */
2650 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2651 ((refmod_state & VM_MEM_MODIFIED) ||
2652 dst_page->dirty || dst_page->precious) ) {
2653 goto check_busy;
2654 }
2655 /*
2656 * if we reach here, we're not to return
2657 * the page... go on to the next one
2658 */
2659 if (user_page_list)
2660 user_page_list[entry].phys_addr = 0;
2661 entry++;
2662 dst_offset += PAGE_SIZE_64;
2663 xfer_size -= PAGE_SIZE;
2664 continue;
2665 }
2666 check_busy:
2667 if(dst_page->busy &&
2668 (!(dst_page->list_req_pending &&
2669 dst_page->pageout))) {
2670 if(cntrl_flags & UPL_NOBLOCK) {
2671 if(user_page_list) {
2672 user_page_list[entry].phys_addr = 0;
2673 }
2674 entry++;
2675 dst_offset += PAGE_SIZE_64;
2676 xfer_size -= PAGE_SIZE;
2677 continue;
2678 }
2679 /*
2680 * someone else is playing with the
2681 * page. We will have to wait.
2682 */
2683 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2684 continue;
2685 }
2686 /* Someone else already cleaning the page? */
2687 if((dst_page->cleaning || dst_page->absent ||
2688 dst_page->wire_count != 0) &&
2689 !dst_page->list_req_pending) {
2690 if(user_page_list) {
2691 user_page_list[entry].phys_addr = 0;
2692 }
2693 entry++;
2694 dst_offset += PAGE_SIZE_64;
2695 xfer_size -= PAGE_SIZE;
2696 continue;
2697 }
2698 /* eliminate all mappings from the */
2699 /* original object and its progeny */
2700
2701 vm_page_lock_queues();
2702
2703 if (dst_page->pageout_queue == TRUE)
2704 /*
2705 * we've buddied up a page for a clustered pageout
2706 * that has already been moved to the pageout
2707 * queue by pageout_scan... we need to remove
2708 * it from the queue and drop the laundry count
2709 * on that queue
2710 */
2711 vm_pageout_queue_steal(dst_page);
2712 #if MACH_CLUSTER_STATS
2713 /* pageout statistics gathering. count */
2714 /* all the pages we will page out that */
2715 /* were not counted in the initial */
2716 /* vm_pageout_scan work */
2717 if(dst_page->list_req_pending)
2718 encountered_lrp = TRUE;
2719 if((dst_page->dirty ||
2720 (dst_page->object->internal &&
2721 dst_page->precious)) &&
2722 (dst_page->list_req_pending
2723 == FALSE)) {
2724 if(encountered_lrp) {
2725 CLUSTER_STAT
2726 (pages_at_higher_offsets++;)
2727 } else {
2728 CLUSTER_STAT
2729 (pages_at_lower_offsets++;)
2730 }
2731 }
2732 #endif
2733 /* Turn off busy indication on pending */
2734 /* pageout. Note: we can only get here */
2735 /* in the request pending case. */
2736 dst_page->list_req_pending = FALSE;
2737 dst_page->busy = FALSE;
2738 dst_page->cleaning = FALSE;
2739
2740 hw_dirty = refmod_state & VM_MEM_MODIFIED;
2741 dirty = hw_dirty ? TRUE : dst_page->dirty;
2742
2743 if(cntrl_flags & UPL_SET_LITE) {
2744 int pg_num;
2745 pg_num = (dst_offset-offset)/PAGE_SIZE;
2746 lite_list[pg_num>>5] |=
2747 1 << (pg_num & 31);
2748 if (hw_dirty)
2749 pmap_clear_modify(dst_page->phys_page);
2750 /*
2751 * Record that this page has been
2752 * written out
2753 */
2754 #if MACH_PAGEMAP
2755 vm_external_state_set(
2756 object->existence_map,
2757 dst_page->offset);
2758 #endif /*MACH_PAGEMAP*/
2759
2760 /*
2761 * Mark original page as cleaning
2762 * in place.
2763 */
2764 dst_page->cleaning = TRUE;
2765 dst_page->dirty = TRUE;
2766 dst_page->precious = FALSE;
2767 } else {
2768 /* use pageclean setup, it is more */
2769 /* convenient even for the pageout */
2770 /* cases here */
2771
2772 vm_object_lock(upl->map_object);
2773 vm_pageclean_setup(dst_page,
2774 alias_page, upl->map_object,
2775 size - xfer_size);
2776 vm_object_unlock(upl->map_object);
2777
2778 alias_page->absent = FALSE;
2779 alias_page = NULL;
2780 }
2781
2782 if(!dirty) {
2783 dst_page->dirty = FALSE;
2784 dst_page->precious = TRUE;
2785 }
2786
2787 if(dst_page->pageout)
2788 dst_page->busy = TRUE;
2789
2790 if ( (cntrl_flags & UPL_ENCRYPT) ) {
2791 /*
2792 * ENCRYPTED SWAP:
2793 * We want to deny access to the target page
2794 * because its contents are about to be
2795 * encrypted and the user would be very
2796 * confused to see encrypted data instead
2797 * of their data.
2798 */
2799 dst_page->busy = TRUE;
2800 }
2801 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2802 /*
2803 * deny access to the target page
2804 * while it is being worked on
2805 */
2806 if ((!dst_page->pageout) &&
2807 (dst_page->wire_count == 0)) {
2808 dst_page->busy = TRUE;
2809 dst_page->pageout = TRUE;
2810 vm_page_wire(dst_page);
2811 }
2812 }
2813
2814 if(user_page_list) {
2815 user_page_list[entry].phys_addr
2816 = dst_page->phys_page;
2817 user_page_list[entry].dirty =
2818 dst_page->dirty;
2819 user_page_list[entry].pageout =
2820 dst_page->pageout;
2821 user_page_list[entry].absent =
2822 dst_page->absent;
2823 user_page_list[entry].precious =
2824 dst_page->precious;
2825 }
2826 vm_page_unlock_queues();
2827
2828 /*
2829 * ENCRYPTED SWAP:
2830 * The caller is gathering this page and might
2831 * access its contents later on. Decrypt the
2832 * page before adding it to the UPL, so that
2833 * the caller never sees encrypted data.
2834 */
2835 if (! (cntrl_flags & UPL_ENCRYPT) &&
2836 dst_page->encrypted) {
2837 assert(dst_page->busy);
2838
2839 vm_page_decrypt(dst_page, 0);
2840 vm_page_decrypt_for_upl_counter++;
2841
2842 /*
2843 * Retry this page, since anything
2844 * could have changed while we were
2845 * decrypting.
2846 */
2847 continue;
2848 }
2849 }
2850 entry++;
2851 dst_offset += PAGE_SIZE_64;
2852 xfer_size -= PAGE_SIZE;
2853 }
2854 } else {
2855 while (xfer_size) {
2856 if((alias_page == NULL) &&
2857 !(cntrl_flags & UPL_SET_LITE)) {
2858 vm_object_unlock(object);
2859 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2860 vm_object_lock(object);
2861 }
2862
2863 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2864 object->copy != last_copy_object) {
2865 /* Honor copy-on-write obligations */
2866
2867 /*
2868 * The copy object has changed since we
2869 * last synchronized for copy-on-write.
2870 * Another copy object might have been
2871 * inserted while we released the object's
2872 * lock. Since someone could have seen the
2873 * original contents of the remaining pages
2874 * through that new object, we have to
2875 * synchronize with it again for the remaining
2876 * pages only. The previous pages are "busy"
2877 * so they can not be seen through the new
2878 * mapping. The new mapping will see our
2879 * upcoming changes for those previous pages,
2880 * but that's OK since they couldn't see what
2881 * was there before. It's just a race anyway
2882 * and there's no guarantee of consistency or
2883 * atomicity. We just don't want new mappings
2884 * to see both the *before* and *after* pages.
2885 */
2886 if (object->copy != VM_OBJECT_NULL) {
2887 vm_object_update(
2888 object,
2889 dst_offset,/* current offset */
2890 xfer_size, /* remaining size */
2891 NULL,
2892 NULL,
2893 FALSE, /* should_return */
2894 MEMORY_OBJECT_COPY_SYNC,
2895 VM_PROT_NO_CHANGE);
2896 upl_cow_again++;
2897 upl_cow_again_pages +=
2898 xfer_size >> PAGE_SHIFT;
2899 }
2900 /* remember the copy object we synced with */
2901 last_copy_object = object->copy;
2902 }
2903
2904 dst_page = vm_page_lookup(object, dst_offset);
2905
2906 if(dst_page != VM_PAGE_NULL) {
2907 if((cntrl_flags & UPL_RET_ONLY_ABSENT) &&
2908 !((dst_page->list_req_pending)
2909 && (dst_page->absent))) {
2910 /* we are doing extended range */
2911 /* requests. we want to grab */
2912 /* pages around some which are */
2913 /* already present. */
2914 if(user_page_list) {
2915 user_page_list[entry].phys_addr = 0;
2916 }
2917 entry++;
2918 dst_offset += PAGE_SIZE_64;
2919 xfer_size -= PAGE_SIZE;
2920 continue;
2921 }
2922 if((dst_page->cleaning) &&
2923 !(dst_page->list_req_pending)) {
2924 /*someone else is writing to the */
2925 /* page. We will have to wait. */
2926 PAGE_SLEEP(object,dst_page,THREAD_UNINT);
2927 continue;
2928 }
2929 if ((dst_page->fictitious &&
2930 dst_page->list_req_pending)) {
2931 /* dump the fictitious page */
2932 dst_page->list_req_pending = FALSE;
2933 dst_page->clustered = FALSE;
2934
2935 vm_page_lock_queues();
2936 vm_page_free(dst_page);
2937 vm_page_unlock_queues();
2938
2939 dst_page = NULL;
2940 } else if ((dst_page->absent &&
2941 dst_page->list_req_pending)) {
2942 /* the default_pager case */
2943 dst_page->list_req_pending = FALSE;
2944 dst_page->busy = FALSE;
2945 }
2946 }
2947 if(dst_page == VM_PAGE_NULL) {
2948 if(object->private) {
2949 /*
2950 * This is a nasty wrinkle for users
2951 * of upl who encounter device or
2952 * private memory; however, it is
2953 * unavoidable, since only a fault can
2954 * resolve the actual backing
2955 * physical page by asking the
2956 * backing device.
2957 */
2958 if(user_page_list) {
2959 user_page_list[entry].phys_addr = 0;
2960 }
2961 entry++;
2962 dst_offset += PAGE_SIZE_64;
2963 xfer_size -= PAGE_SIZE;
2964 continue;
2965 }
2966 /* need to allocate a page */
2967 dst_page = vm_page_alloc(object, dst_offset);
2968 if (dst_page == VM_PAGE_NULL) {
2969 vm_object_unlock(object);
2970 VM_PAGE_WAIT();
2971 vm_object_lock(object);
2972 continue;
2973 }
2974 dst_page->busy = FALSE;
2975 #if 0
2976 if(cntrl_flags & UPL_NO_SYNC) {
2977 dst_page->page_lock = 0;
2978 dst_page->unlock_request = 0;
2979 }
2980 #endif
2981 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
2982 /*
2983 * if UPL_RET_ONLY_ABSENT was specified,
2984 * then we're definitely setting up a
2985 * upl for a clustered read/pagein
2986 * operation... mark the pages as clustered
2987 * so vm_fault can correctly attribute them
2988 * to the 'pagein' bucket the first time
2989 * a fault happens on them
2990 */
2991 dst_page->clustered = TRUE;
2992 }
2993 dst_page->absent = TRUE;
2994 object->absent_count++;
2995 }
2996 #if 1
2997 if(cntrl_flags & UPL_NO_SYNC) {
2998 dst_page->page_lock = 0;
2999 dst_page->unlock_request = 0;
3000 }
3001 #endif /* 1 */
3002
3003 /*
3004 * ENCRYPTED SWAP:
3005 */
3006 if (cntrl_flags & UPL_ENCRYPT) {
3007 /*
3008 * The page is going to be encrypted when we
3009 * get it from the pager, so mark it so.
3010 */
3011 dst_page->encrypted = TRUE;
3012 } else {
3013 /*
3014 * Otherwise, the page will not contain
3015 * encrypted data.
3016 */
3017 dst_page->encrypted = FALSE;
3018 }
3019
3020 dst_page->overwriting = TRUE;
3021 if(dst_page->fictitious) {
3022 panic("need corner case for fictitious page");
3023 }
3024 if(dst_page->page_lock) {
3025 do_m_lock = TRUE;
3026 }
3027 if(upl_ptr) {
3028
3029 /* eliminate all mappings from the */
3030 /* original object and its progeny */
3031
3032 if(dst_page->busy) {
3033 /*someone else is playing with the */
3034 /* page. We will have to wait. */
3035 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3036 continue;
3037 }
3038 vm_page_lock_queues();
3039
3040 if( !(cntrl_flags & UPL_FILE_IO))
3041 hw_dirty = pmap_disconnect(dst_page->phys_page) & VM_MEM_MODIFIED;
3042 else
3043 hw_dirty = pmap_get_refmod(dst_page->phys_page) & VM_MEM_MODIFIED;
3044 dirty = hw_dirty ? TRUE : dst_page->dirty;
3045
3046 if(cntrl_flags & UPL_SET_LITE) {
3047 int pg_num;
3048 pg_num = (dst_offset-offset)/PAGE_SIZE;
3049 lite_list[pg_num>>5] |=
3050 1 << (pg_num & 31);
3051 if (hw_dirty)
3052 pmap_clear_modify(dst_page->phys_page);
3053 /*
3054 * Record that this page has been
3055 * written out
3056 */
3057 #if MACH_PAGEMAP
3058 vm_external_state_set(
3059 object->existence_map,
3060 dst_page->offset);
3061 #endif /*MACH_PAGEMAP*/
3062
3063 /*
3064 * Mark original page as cleaning
3065 * in place.
3066 */
3067 dst_page->cleaning = TRUE;
3068 dst_page->dirty = TRUE;
3069 dst_page->precious = FALSE;
3070 } else {
3071 /* use pageclean setup, it is more */
3072 /* convenient even for the pageout */
3073 /* cases here */
3074 vm_object_lock(upl->map_object);
3075 vm_pageclean_setup(dst_page,
3076 alias_page, upl->map_object,
3077 size - xfer_size);
3078 vm_object_unlock(upl->map_object);
3079
3080 alias_page->absent = FALSE;
3081 alias_page = NULL;
3082 }
3083
3084 if(cntrl_flags & UPL_CLEAN_IN_PLACE) {
3085 /* clean in place for read implies */
3086 /* that a write will be done on all */
3087 /* the pages that are dirty before */
3088 /* a upl commit is done. The caller */
3089 /* is obligated to preserve the */
3090 /* contents of all pages marked */
3091 /* dirty. */
3092 upl->flags |= UPL_CLEAR_DIRTY;
3093 }
3094
3095 if(!dirty) {
3096 dst_page->dirty = FALSE;
3097 dst_page->precious = TRUE;
3098 }
3099
3100 if (dst_page->wire_count == 0) {
3101 /* deny access to the target page while */
3102 /* it is being worked on */
3103 dst_page->busy = TRUE;
3104 } else {
3105 vm_page_wire(dst_page);
3106 }
3107 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
3108 /*
3109 * expect the page not to be used
3110 * since it's coming in as part
3111 * of a cluster and could be
3112 * speculative... pages that
3113 * are 'consumed' will get a
3114 * hardware reference
3115 */
3116 dst_page->reference = FALSE;
3117 } else {
3118 /*
3119 * expect the page to be used
3120 */
3121 dst_page->reference = TRUE;
3122 }
3123 dst_page->precious =
3124 (cntrl_flags & UPL_PRECIOUS)
3125 ? TRUE : FALSE;
3126 if(user_page_list) {
3127 user_page_list[entry].phys_addr
3128 = dst_page->phys_page;
3129 user_page_list[entry].dirty =
3130 dst_page->dirty;
3131 user_page_list[entry].pageout =
3132 dst_page->pageout;
3133 user_page_list[entry].absent =
3134 dst_page->absent;
3135 user_page_list[entry].precious =
3136 dst_page->precious;
3137 }
3138 vm_page_unlock_queues();
3139 }
3140 entry++;
3141 dst_offset += PAGE_SIZE_64;
3142 xfer_size -= PAGE_SIZE;
3143 }
3144 }
3145
3146 if (upl->flags & UPL_INTERNAL) {
3147 if(page_list_count != NULL)
3148 *page_list_count = 0;
3149 } else if (page_list_count != NULL &&
3150            *page_list_count > entry) {
3151 *page_list_count = entry;
3152 }
3153
3154 if(alias_page != NULL) {
3155 vm_page_lock_queues();
3156 vm_page_free(alias_page);
3157 vm_page_unlock_queues();
3158 }
3159
3160 if(do_m_lock) {
3161 vm_prot_t access_required;
3162 /* call back all associated pages from other users of the pager */
3163 /* all future updates will be on data which is based on the */
3164 /* changes we are going to make here. Note: it is assumed that */
3165 /* we already hold copies of the data so we will not be seeing */
3166 /* an avalanche of incoming data from the pager */
3167 access_required = (cntrl_flags & UPL_COPYOUT_FROM)
3168 ? VM_PROT_READ : VM_PROT_WRITE;
3169 while (TRUE) {
3170 kern_return_t rc;
3171
3172 if(!object->pager_ready) {
3173 wait_result_t wait_result;
3174
3175 wait_result = vm_object_sleep(object,
3176 VM_OBJECT_EVENT_PAGER_READY,
3177 THREAD_UNINT);
3178 if (wait_result != THREAD_AWAKENED) {
3179 vm_object_unlock(object);
3180 return KERN_FAILURE;
3181 }
3182 continue;
3183 }
3184
3185 vm_object_unlock(object);
3186 rc = memory_object_data_unlock(
3187 object->pager,
3188 dst_offset + object->paging_offset,
3189 size,
3190 access_required);
3191 if (rc != KERN_SUCCESS && rc != MACH_SEND_INTERRUPTED)
3192 return KERN_FAILURE;
3193 vm_object_lock(object);
3194
3195 if (rc == KERN_SUCCESS)
3196 break;
3197 }
3198
3199 /* let's wait on the last page requested */
3200 /* NOTE: we will have to update lock completed routine to signal */
3201 if(dst_page != VM_PAGE_NULL &&
3202 (access_required & dst_page->page_lock) != access_required) {
3203 PAGE_ASSERT_WAIT(dst_page, THREAD_UNINT);
3204 vm_object_unlock(object);
3205 thread_block(THREAD_CONTINUE_NULL);
3206 return KERN_SUCCESS;
3207 }
3208 }
3209
3210 vm_object_unlock(object);
3211 return KERN_SUCCESS;
3212 }
3213
3214 /* JMM - Backward compatibility for now */
3215 kern_return_t
3216 vm_fault_list_request( /* forward */
3217 memory_object_control_t control,
3218 vm_object_offset_t offset,
3219 upl_size_t size,
3220 upl_t *upl_ptr,
3221 upl_page_info_t **user_page_list_ptr,
3222 int page_list_count,
3223 int cntrl_flags);
3224 kern_return_t
3225 vm_fault_list_request(
3226 memory_object_control_t control,
3227 vm_object_offset_t offset,
3228 upl_size_t size,
3229 upl_t *upl_ptr,
3230 upl_page_info_t **user_page_list_ptr,
3231 int page_list_count,
3232 int cntrl_flags)
3233 {
3234 int local_list_count;
3235 upl_page_info_t *user_page_list;
3236 kern_return_t kr;
3237
3238 if (user_page_list_ptr != NULL) {
3239 local_list_count = page_list_count;
3240 user_page_list = *user_page_list_ptr;
3241 } else {
3242 local_list_count = 0;
3243 user_page_list = NULL;
3244 }
3245 kr = memory_object_upl_request(control,
3246 offset,
3247 size,
3248 upl_ptr,
3249 user_page_list,
3250 &local_list_count,
3251 cntrl_flags);
3252
3253 if(kr != KERN_SUCCESS)
3254 return kr;
3255
3256 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3257 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3258 }
3259
3260 return KERN_SUCCESS;
3261 }
3262
3263
3264
3265 /*
3266 * Routine: vm_object_super_upl_request
3267 * Purpose:
3268 * Cause the population of a portion of a vm_object
3269 * in much the same way as memory_object_upl_request.
3270 * Depending on the nature of the request, the pages
3271 * returned may contain valid data or be uninitialized.
3272 * However, the region may be expanded up to the super
3273 * cluster size provided.
3274 */
3275
3276 __private_extern__ kern_return_t
3277 vm_object_super_upl_request(
3278 vm_object_t object,
3279 vm_object_offset_t offset,
3280 upl_size_t size,
3281 upl_size_t super_cluster,
3282 upl_t *upl,
3283 upl_page_info_t *user_page_list,
3284 unsigned int *page_list_count,
3285 int cntrl_flags)
3286 {
3287 vm_page_t target_page;
3288 int ticket;
3289
3290
3291 if(object->paging_offset > offset)
3292 return KERN_FAILURE;
3293
3294 assert(object->paging_in_progress);
3295 offset = offset - object->paging_offset;
3296
3297 if(cntrl_flags & UPL_FOR_PAGEOUT) {
3298
3299 vm_object_lock(object);
3300
3301 if((target_page = vm_page_lookup(object, offset))
3302 != VM_PAGE_NULL) {
3303 ticket = target_page->page_ticket;
3304 cntrl_flags = cntrl_flags & ~(int)UPL_PAGE_TICKET_MASK;
3305 cntrl_flags = cntrl_flags |
3306 ((ticket << UPL_PAGE_TICKET_SHIFT)
3307 & UPL_PAGE_TICKET_MASK);
3308 }
3309 vm_object_unlock(object);
3310 }
3311
3312 if (super_cluster > size) {
3313
3314 vm_object_offset_t base_offset;
3315 upl_size_t super_size;
3316
3317 base_offset = (offset &
3318 ~((vm_object_offset_t) super_cluster - 1));
3319 super_size = (offset+size) > (base_offset + super_cluster) ?
3320 super_cluster<<1 : super_cluster;
3321 super_size = ((base_offset + super_size) > object->size) ?
3322 (object->size - base_offset) : super_size;
3323 if(offset > (base_offset + super_size))
3324 panic("vm_object_super_upl_request: Missed target pageout"
3325 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3326 offset, base_offset, super_size, super_cluster,
3327 size, object->paging_offset);
3328 /*
3329 * apparently there is a case where the vm requests a
3330 * page to be written out whose offset is beyond the
3331 * object size
3332 */
3333 if((offset + size) > (base_offset + super_size))
3334 super_size = (offset + size) - base_offset;
3335
3336 offset = base_offset;
3337 size = super_size;
3338 }
3339 return vm_object_upl_request(object, offset, size,
3340 upl, user_page_list, page_list_count,
3341 cntrl_flags);
3342 }
3343
3344
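/*
 *	vm_map_create_upl:
 *
 *	Build a UPL for a range of a vm_map.  The map entry covering
 *	"offset" is located (recursing into submaps and forcing a
 *	copy-on-write resolution or data sync where required), the
 *	requested size is clipped to the entry and to MAX_UPL_TRANSFER,
 *	and the work is then handed to vm_object_iopl_request() or
 *	vm_object_upl_request() on the backing object.
 */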
3345 kern_return_t
3346 vm_map_create_upl(
3347 vm_map_t map,
3348 vm_map_address_t offset,
3349 upl_size_t *upl_size,
3350 upl_t *upl,
3351 upl_page_info_array_t page_list,
3352 unsigned int *count,
3353 int *flags)
3354 {
3355 vm_map_entry_t entry;
3356 int caller_flags;
3357 int force_data_sync;
3358 int sync_cow_data;
3359 vm_object_t local_object;
3360 vm_map_offset_t local_offset;
3361 vm_map_offset_t local_start;
3362 kern_return_t ret;
3363
3364 caller_flags = *flags;
3365
3366 if (caller_flags & ~UPL_VALID_FLAGS) {
3367 /*
3368 * For forward compatibility's sake,
3369 * reject any unknown flag.
3370 */
3371 return KERN_INVALID_VALUE;
3372 }
3373
3374 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3375 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3376
3377 if(upl == NULL)
3378 return KERN_INVALID_ARGUMENT;
3379
3380
3381 REDISCOVER_ENTRY:
3382 vm_map_lock(map);
3383 if (vm_map_lookup_entry(map, offset, &entry)) {
3384 if (entry->object.vm_object == VM_OBJECT_NULL ||
3385 !entry->object.vm_object->phys_contiguous) {
3386 if((*upl_size/page_size) > MAX_UPL_TRANSFER) {
3387 *upl_size = MAX_UPL_TRANSFER * page_size;
3388 }
3389 }
3390 if((entry->vme_end - offset) < *upl_size) {
3391 *upl_size = entry->vme_end - offset;
3392 }
3393 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3394 if (entry->object.vm_object == VM_OBJECT_NULL) {
3395 *flags = 0;
3396 } else if (entry->object.vm_object->private) {
3397 *flags = UPL_DEV_MEMORY;
3398 if (entry->object.vm_object->phys_contiguous) {
3399 *flags |= UPL_PHYS_CONTIG;
3400 }
3401 } else {
3402 *flags = 0;
3403 }
3404 vm_map_unlock(map);
3405 return KERN_SUCCESS;
3406 }
3407 /*
3408 * Create an object if necessary.
3409 */
3410 if (entry->object.vm_object == VM_OBJECT_NULL) {
3411 entry->object.vm_object = vm_object_allocate(
3412 (vm_size_t)(entry->vme_end - entry->vme_start));
3413 entry->offset = 0;
3414 }
3415 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3416 if (!(entry->protection & VM_PROT_WRITE)) {
3417 vm_map_unlock(map);
3418 return KERN_PROTECTION_FAILURE;
3419 }
3420 if (entry->needs_copy) {
3421 vm_map_t local_map;
3422 vm_object_t object;
3423 vm_map_offset_t offset_hi;
3424 vm_map_offset_t offset_lo;
3425 vm_object_offset_t new_offset;
3426 vm_prot_t prot;
3427 boolean_t wired;
3428 vm_behavior_t behavior;
3429 vm_map_version_t version;
3430 vm_map_t real_map;
3431
3432 local_map = map;
3433 vm_map_lock_write_to_read(map);
3434 if(vm_map_lookup_locked(&local_map,
3435 offset, VM_PROT_WRITE,
3436 &version, &object,
3437 &new_offset, &prot, &wired,
3438 &behavior, &offset_lo,
3439 &offset_hi, &real_map)) {
3440 vm_map_unlock(local_map);
3441 return KERN_FAILURE;
3442 }
3443 if (real_map != map) {
3444 vm_map_unlock(real_map);
3445 }
3446 vm_object_unlock(object);
3447 vm_map_unlock(local_map);
3448
3449 goto REDISCOVER_ENTRY;
3450 }
3451 }
3452 if (entry->is_sub_map) {
3453 vm_map_t submap;
3454
3455 submap = entry->object.sub_map;
3456 local_start = entry->vme_start;
3457 local_offset = entry->offset;
3458 vm_map_reference(submap);
3459 vm_map_unlock(map);
3460
3461 ret = (vm_map_create_upl(submap,
3462 local_offset + (offset - local_start),
3463 upl_size, upl, page_list, count,
3464 flags));
3465
3466 vm_map_deallocate(submap);
3467 return ret;
3468 }
3469
3470 if (sync_cow_data) {
3471 if (entry->object.vm_object->shadow
3472 || entry->object.vm_object->copy) {
3473
3474 local_object = entry->object.vm_object;
3475 local_start = entry->vme_start;
3476 local_offset = entry->offset;
3477 vm_object_reference(local_object);
3478 vm_map_unlock(map);
3479
3480 if (entry->object.vm_object->shadow &&
3481 entry->object.vm_object->copy) {
3482 vm_object_lock_request(
3483 local_object->shadow,
3484 (vm_object_offset_t)
3485 ((offset - local_start) +
3486 local_offset) +
3487 local_object->shadow_offset,
3488 *upl_size, FALSE,
3489 MEMORY_OBJECT_DATA_SYNC,
3490 VM_PROT_NO_CHANGE);
3491 }
3492 sync_cow_data = FALSE;
3493 vm_object_deallocate(local_object);
3494 goto REDISCOVER_ENTRY;
3495 }
3496 }
3497
3498 if (force_data_sync) {
3499
3500 local_object = entry->object.vm_object;
3501 local_start = entry->vme_start;
3502 local_offset = entry->offset;
3503 vm_object_reference(local_object);
3504 vm_map_unlock(map);
3505
3506 vm_object_lock_request(
3507 local_object,
3508 (vm_object_offset_t)
3509 ((offset - local_start) + local_offset),
3510 (vm_object_size_t)*upl_size, FALSE,
3511 MEMORY_OBJECT_DATA_SYNC,
3512 VM_PROT_NO_CHANGE);
3513 force_data_sync = FALSE;
3514 vm_object_deallocate(local_object);
3515 goto REDISCOVER_ENTRY;
3516 }
3517
3518 if(!(entry->object.vm_object->private)) {
3519 if(*upl_size > (MAX_UPL_TRANSFER*PAGE_SIZE))
3520 *upl_size = (MAX_UPL_TRANSFER*PAGE_SIZE);
3521 if(entry->object.vm_object->phys_contiguous) {
3522 *flags = UPL_PHYS_CONTIG;
3523 } else {
3524 *flags = 0;
3525 }
3526 } else {
3527 *flags = UPL_DEV_MEMORY | UPL_PHYS_CONTIG;
3528 }
3529 local_object = entry->object.vm_object;
3530 local_offset = entry->offset;
3531 local_start = entry->vme_start;
3532 vm_object_reference(local_object);
3533 vm_map_unlock(map);
3534 if(caller_flags & UPL_SET_IO_WIRE) {
3535 ret = (vm_object_iopl_request(local_object,
3536 (vm_object_offset_t)
3537 ((offset - local_start)
3538 + local_offset),
3539 *upl_size,
3540 upl,
3541 page_list,
3542 count,
3543 caller_flags));
3544 } else {
3545 ret = (vm_object_upl_request(local_object,
3546 (vm_object_offset_t)
3547 ((offset - local_start)
3548 + local_offset),
3549 *upl_size,
3550 upl,
3551 page_list,
3552 count,
3553 caller_flags));
3554 }
3555 vm_object_deallocate(local_object);
3556 return(ret);
3557 }
3558
3559 vm_map_unlock(map);
3560 return(KERN_FAILURE);
3561
3562 }
3563
3564 /*
3565 * Internal routine to enter a UPL into a VM map.
3566 *
3567 * JMM - This should just be doable through the standard
3568 * vm_map_enter() API.
3569 */
3570 kern_return_t
3571 vm_map_enter_upl(
3572 vm_map_t map,
3573 upl_t upl,
3574 vm_map_offset_t *dst_addr)
3575 {
3576 vm_map_size_t size;
3577 vm_object_offset_t offset;
3578 vm_map_offset_t addr;
3579 vm_page_t m;
3580 kern_return_t kr;
3581
3582 if (upl == UPL_NULL)
3583 return KERN_INVALID_ARGUMENT;
3584
3585 upl_lock(upl);
3586
3587 /* check to see if already mapped */
3588 if(UPL_PAGE_LIST_MAPPED & upl->flags) {
3589 upl_unlock(upl);
3590 return KERN_FAILURE;
3591 }
3592
3593 if((!(upl->map_object->pageout)) &&
3594 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3595 (upl->map_object->phys_contiguous))) {
3596 vm_object_t object;
3597 vm_page_t alias_page;
3598 vm_object_offset_t new_offset;
3599 int pg_num;
3600 wpl_array_t lite_list;
3601
3602 if(upl->flags & UPL_INTERNAL) {
3603 lite_list = (wpl_array_t)
3604 ((((uintptr_t)upl) + sizeof(struct upl))
3605 + ((upl->size/PAGE_SIZE)
3606 * sizeof(upl_page_info_t)));
3607 } else {
3608 lite_list = (wpl_array_t)
3609 (((uintptr_t)upl) + sizeof(struct upl));
3610 }
3611 object = upl->map_object;
3612 upl->map_object = vm_object_allocate(upl->size);
3613 vm_object_lock(upl->map_object);
3614 upl->map_object->shadow = object;
3615 upl->map_object->pageout = TRUE;
3616 upl->map_object->can_persist = FALSE;
3617 upl->map_object->copy_strategy =
3618 MEMORY_OBJECT_COPY_NONE;
3619 upl->map_object->shadow_offset =
3620 upl->offset - object->paging_offset;
3621 upl->map_object->wimg_bits = object->wimg_bits;
3622 offset = upl->map_object->shadow_offset;
3623 new_offset = 0;
3624 size = upl->size;
3625
3626 vm_object_lock(object);
3627
3628 while(size) {
3629 pg_num = (new_offset)/PAGE_SIZE;
3630 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3631 vm_object_unlock(object);
3632 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3633 vm_object_lock(object);
3634 m = vm_page_lookup(object, offset);
3635 if (m == VM_PAGE_NULL) {
3636 panic("vm_upl_map: page missing\n");
3637 }
3638
3639 vm_object_paging_begin(object);
3640
3641 /*
3642 * Convert the fictitious page to a private
3643 * shadow of the real page.
3644 */
3645 assert(alias_page->fictitious);
3646 alias_page->fictitious = FALSE;
3647 alias_page->private = TRUE;
3648 alias_page->pageout = TRUE;
3649 alias_page->phys_page = m->phys_page;
3650
3651 vm_page_lock_queues();
3652 vm_page_wire(alias_page);
3653 vm_page_unlock_queues();
3654
3655 /*
3656 * ENCRYPTED SWAP:
3657 * The virtual page ("m") has to be wired in some way
3658 * here or its physical page ("m->phys_page") could
3659 * be recycled at any time.
3660 * Assuming this is enforced by the caller, we can't
3661 * get an encrypted page here. Since the encryption
3662 * key depends on the VM page's "pager" object and
3663 * the "paging_offset", we couldn't handle 2 pageable
3664 * VM pages (with different pagers and paging_offsets)
3665 * sharing the same physical page: we could end up
3666 * encrypting with one key (via one VM page) and
3667 * decrypting with another key (via the alias VM page).
3668 */
3669 ASSERT_PAGE_DECRYPTED(m);
3670
3671 vm_page_insert(alias_page,
3672 upl->map_object, new_offset);
3673 assert(!alias_page->wanted);
3674 alias_page->busy = FALSE;
3675 alias_page->absent = FALSE;
3676 }
3677
3678 size -= PAGE_SIZE;
3679 offset += PAGE_SIZE_64;
3680 new_offset += PAGE_SIZE_64;
3681 }
3682 vm_object_unlock(object);
3683 vm_object_unlock(upl->map_object);
3684 }
3685 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3686 offset = upl->offset - upl->map_object->paging_offset;
3687 else
3688 offset = 0;
3689
3690 size = upl->size;
3691
3692 vm_object_lock(upl->map_object);
3693 upl->map_object->ref_count++;
3694 vm_object_res_reference(upl->map_object);
3695 vm_object_unlock(upl->map_object);
3696
3697 *dst_addr = 0;
3698
3699
3700 /* NEED A UPL_MAP ALIAS */
3701 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3702 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3703 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3704
3705 if (kr != KERN_SUCCESS) {
3706 upl_unlock(upl);
3707 return(kr);
3708 }
3709
3710 vm_object_lock(upl->map_object);
3711
3712 for(addr=*dst_addr; size > 0; size-=PAGE_SIZE,addr+=PAGE_SIZE) {
3713 m = vm_page_lookup(upl->map_object, offset);
3714 if(m) {
3715 unsigned int cache_attr;
3716 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3717
3718 PMAP_ENTER(map->pmap, addr,
3719 m, VM_PROT_ALL,
3720 cache_attr, TRUE);
3721 }
3722 offset+=PAGE_SIZE_64;
3723 }
3724 vm_object_unlock(upl->map_object);
3725
3726 upl->ref_count++; /* hold a reference for the mapping */
3727 upl->flags |= UPL_PAGE_LIST_MAPPED;
3728 upl->kaddr = *dst_addr;
3729 upl_unlock(upl);
3730 return KERN_SUCCESS;
3731 }
3732
3733 /*
3734 * Internal routine to remove a UPL mapping from a VM map.
3735 *
3736 * XXX - This should just be doable through a standard
3737 * vm_map_remove() operation. Otherwise, implicit clean-up
3738 * of the target map won't be able to correctly remove
3739 * these (and release the reference on the UPL). Having
3740 * to do this means we can't map these into user-space
3741 * maps yet.
3742 */
3743 kern_return_t
3744 vm_map_remove_upl(
3745 vm_map_t map,
3746 upl_t upl)
3747 {
3748 vm_address_t addr;
3749 upl_size_t size;
3750
3751 if (upl == UPL_NULL)
3752 return KERN_INVALID_ARGUMENT;
3753
3754 upl_lock(upl);
3755 if(upl->flags & UPL_PAGE_LIST_MAPPED) {
3756 addr = upl->kaddr;
3757 size = upl->size;
3758 assert(upl->ref_count > 1);
3759 upl->ref_count--; /* removing mapping ref */
3760 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3761 upl->kaddr = (vm_offset_t) 0;
3762 upl_unlock(upl);
3763
3764 vm_map_remove( map,
3765 vm_map_trunc_page(addr),
3766 vm_map_round_page(addr + size),
3767 VM_MAP_NO_FLAGS);
3768 return KERN_SUCCESS;
3769 }
3770 upl_unlock(upl);
3771 return KERN_FAILURE;
3772 }
3773
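/*
 *	upl_commit_range:
 *
 *	Commit a sub-range of a UPL back to the VM system.  Each page
 *	covered by the range has its lite-list bit cleared and its
 *	busy/cleaning/pageout state resolved: wired I/O pages are
 *	unwired, pageout pages are freed or reactivated depending on
 *	whether they were re-dirtied, and the reference and dirty bits
 *	are adjusted as requested by the commit flags.
 */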
3774 kern_return_t
3775 upl_commit_range(
3776 upl_t upl,
3777 upl_offset_t offset,
3778 upl_size_t size,
3779 int flags,
3780 upl_page_info_t *page_list,
3781 mach_msg_type_number_t count,
3782 boolean_t *empty)
3783 {
3784 upl_size_t xfer_size = size;
3785 vm_object_t shadow_object;
3786 vm_object_t object = upl->map_object;
3787 vm_object_offset_t target_offset;
3788 int entry;
3789 wpl_array_t lite_list;
3790 int occupied;
3791 int delayed_unlock = 0;
3792 int clear_refmod = 0;
3793 boolean_t shadow_internal;
3794
3795 *empty = FALSE;
3796
3797 if (upl == UPL_NULL)
3798 return KERN_INVALID_ARGUMENT;
3799
3800
3801 if (count == 0)
3802 page_list = NULL;
3803
3804 if (object->pageout) {
3805 shadow_object = object->shadow;
3806 } else {
3807 shadow_object = object;
3808 }
3809
3810 upl_lock(upl);
3811
3812 if (upl->flags & UPL_ACCESS_BLOCKED) {
3813 /*
3814 * We used this UPL to block access to the pages by marking
3815 * them "busy". Now we need to clear the "busy" bit to allow
3816 * access to these pages again.
3817 */
3818 flags |= UPL_COMMIT_ALLOW_ACCESS;
3819 }
3820
3821 if (upl->flags & UPL_CLEAR_DIRTY)
3822 flags |= UPL_COMMIT_CLEAR_DIRTY;
3823
3824 if (upl->flags & UPL_DEVICE_MEMORY) {
3825 xfer_size = 0;
3826 } else if ((offset + size) > upl->size) {
3827 upl_unlock(upl);
3828 return KERN_FAILURE;
3829 }
3830
3831 if (upl->flags & UPL_INTERNAL) {
3832 lite_list = (wpl_array_t)
3833 ((((uintptr_t)upl) + sizeof(struct upl))
3834 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3835 } else {
3836 lite_list = (wpl_array_t)
3837 (((uintptr_t)upl) + sizeof(struct upl));
3838 }
3839 if (object != shadow_object)
3840 vm_object_lock(object);
3841 vm_object_lock(shadow_object);
3842
3843 shadow_internal = shadow_object->internal;
3844
3845 entry = offset/PAGE_SIZE;
3846 target_offset = (vm_object_offset_t)offset;
3847
3848 while (xfer_size) {
3849 vm_page_t t,m;
3850 upl_page_info_t *p;
3851
3852 m = VM_PAGE_NULL;
3853
3854 if (upl->flags & UPL_LITE) {
3855 int pg_num;
3856
3857 pg_num = target_offset/PAGE_SIZE;
3858
3859 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3860 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3861 m = vm_page_lookup(shadow_object,
3862 target_offset + (upl->offset -
3863 shadow_object->paging_offset));
3864 }
3865 }
3866 if (object->pageout) {
3867 if ((t = vm_page_lookup(object, target_offset)) != NULL) {
3868 t->pageout = FALSE;
3869
3870 if (delayed_unlock) {
3871 delayed_unlock = 0;
3872 vm_page_unlock_queues();
3873 }
3874 VM_PAGE_FREE(t);
3875
3876 if (m == NULL) {
3877 m = vm_page_lookup(
3878 shadow_object,
3879 target_offset +
3880 object->shadow_offset);
3881 }
3882 if (m != VM_PAGE_NULL)
3883 vm_object_paging_end(m->object);
3884 }
3885 }
3886 if (m != VM_PAGE_NULL) {
3887
3888 clear_refmod = 0;
3889
3890 if (upl->flags & UPL_IO_WIRE) {
3891
3892 if (delayed_unlock == 0)
3893 vm_page_lock_queues();
3894
3895 vm_page_unwire(m);
3896
3897 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
3898 delayed_unlock = 0;
3899 vm_page_unlock_queues();
3900 }
3901 if (page_list) {
3902 page_list[entry].phys_addr = 0;
3903 }
3904 if (flags & UPL_COMMIT_SET_DIRTY) {
3905 m->dirty = TRUE;
3906 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3907 m->dirty = FALSE;
3908 clear_refmod |= VM_MEM_MODIFIED;
3909 }
3910 if (flags & UPL_COMMIT_INACTIVATE) {
3911 m->reference = FALSE;
3912 clear_refmod |= VM_MEM_REFERENCED;
3913 vm_page_deactivate(m);
3914 }
3915 if (clear_refmod)
3916 pmap_clear_refmod(m->phys_page, clear_refmod);
3917
3918 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3919 /*
3920 * We blocked access to the pages in this UPL.
3921 * Clear the "busy" bit and wake up any waiter
3922 * for this page.
3923 */
3924 PAGE_WAKEUP_DONE(m);
3925 }
3926
3927 target_offset += PAGE_SIZE_64;
3928 xfer_size -= PAGE_SIZE;
3929 entry++;
3930 continue;
3931 }
3932 if (delayed_unlock == 0)
3933 vm_page_lock_queues();
3934 /*
3935 * make sure to clear the hardware
3936 * modify or reference bits before
3937 * releasing the BUSY bit on this page
3938 * otherwise we risk losing a legitimate
3939 * change of state
3940 */
3941 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3942 m->dirty = FALSE;
3943 clear_refmod |= VM_MEM_MODIFIED;
3944 }
3945 if (flags & UPL_COMMIT_INACTIVATE)
3946 clear_refmod |= VM_MEM_REFERENCED;
3947
3948 if (clear_refmod)
3949 pmap_clear_refmod(m->phys_page, clear_refmod);
3950
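/*
 * Reconcile the caller's page list with the page's state: if the
 * caller newly marked this page for pageout, mark it busy and wire
 * it; if the caller dropped the pageout intent on a page we had set
 * up for pageout, undo that and wake any waiters.
 */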
3951 if (page_list) {
3952 p = &(page_list[entry]);
3953 if(p->phys_addr && p->pageout && !m->pageout) {
3954 m->busy = TRUE;
3955 m->pageout = TRUE;
3956 vm_page_wire(m);
3957 } else if (page_list[entry].phys_addr &&
3958 !p->pageout && m->pageout &&
3959 !m->dump_cleaning) {
3960 m->pageout = FALSE;
3961 m->absent = FALSE;
3962 m->overwriting = FALSE;
3963 vm_page_unwire(m);
3964 PAGE_WAKEUP_DONE(m);
3965 }
3966 page_list[entry].phys_addr = 0;
3967 }
3968 m->dump_cleaning = FALSE;
3969 if(m->laundry) {
3970 vm_pageout_throttle_up(m);
3971 }
3972 if(m->pageout) {
3973 m->cleaning = FALSE;
3974 m->pageout = FALSE;
3975 #if MACH_CLUSTER_STATS
3976 if (m->wanted) vm_pageout_target_collisions++;
3977 #endif
3978 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
3979 m->dirty = TRUE;
3980 else
3981 m->dirty = FALSE;
3982
3983 if(m->dirty) {
3984 vm_page_unwire(m);/* reactivates */
3985
3986 if (upl->flags & UPL_PAGEOUT) {
3987 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
3988 VM_STAT(reactivations++);
3989 }
3990 PAGE_WAKEUP_DONE(m);
3991 } else {
3992 vm_page_free(m);/* clears busy, etc. */
3993
3994 if (upl->flags & UPL_PAGEOUT) {
3995 CLUSTER_STAT(vm_pageout_target_page_freed++;)
3996
3997 if (page_list[entry].dirty)
3998 VM_STAT(pageouts++);
3999 }
4000 }
4001 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4002 delayed_unlock = 0;
4003 vm_page_unlock_queues();
4004 }
4005 target_offset += PAGE_SIZE_64;
4006 xfer_size -= PAGE_SIZE;
4007 entry++;
4008 continue;
4009 }
4010 #if MACH_CLUSTER_STATS
4011 m->dirty = pmap_is_modified(m->phys_page);
4012
4013 if (m->dirty) vm_pageout_cluster_dirtied++;
4014 else vm_pageout_cluster_cleaned++;
4015 if (m->wanted) vm_pageout_cluster_collisions++;
4016 #else
4017 m->dirty = 0;
4018 #endif
4019
4020 if((m->busy) && (m->cleaning)) {
4021 /* the request_page_list case */
4022 if(m->absent) {
4023 m->absent = FALSE;
4024 if(shadow_object->absent_count == 1)
4025 vm_object_absent_release(shadow_object);
4026 else
4027 shadow_object->absent_count--;
4028 }
4029 m->overwriting = FALSE;
4030 m->busy = FALSE;
4031 m->dirty = FALSE;
4032 } else if (m->overwriting) {
4033 /* alternate request page list, write to
4034 * page_list case. Occurs when the original
4035 * page was wired at the time of the list
4036 * request */
4037 assert(m->wire_count != 0);
4038 vm_page_unwire(m);/* reactivates */
4039 m->overwriting = FALSE;
4040 }
4041 m->cleaning = FALSE;
4042
4043 /*
4044  * It is part of the semantics of COPYOUT_FROM UPLs that a commit
4045  * implies a cache sync between the vm page and the backing store;
4046  * this can be used to strip the precious bit as well as to clean.
4047  */
4048 if (upl->flags & UPL_PAGE_SYNC_DONE)
4049 m->precious = FALSE;
4050
4051 if (flags & UPL_COMMIT_SET_DIRTY)
4052 m->dirty = TRUE;
4053
4054 if (flags & UPL_COMMIT_INACTIVATE) {
4055 m->reference = FALSE;
4056 vm_page_deactivate(m);
4057 } else if (!m->active && !m->inactive) {
4058 if (m->reference)
4059 vm_page_activate(m);
4060 else
4061 vm_page_deactivate(m);
4062 }
4063
4064 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4065 /*
4066 * We blocked access to the pages in this UPL.
4067 * Clear the "busy" bit on this page before we
4068 * wake up any waiter.
4069 */
4070 m->busy = FALSE;
4071 }
4072
4073 /*
4074 * Wake up any thread waiting for this page to finish cleaning.
4075 */
4076 PAGE_WAKEUP(m);
4077
4078 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4079 delayed_unlock = 0;
4080 vm_page_unlock_queues();
4081 }
4082 }
4083 target_offset += PAGE_SIZE_64;
4084 xfer_size -= PAGE_SIZE;
4085 entry++;
4086 }
4087 if (delayed_unlock)
4088 vm_page_unlock_queues();
4089
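/*
 * Decide whether the UPL still has pages outstanding: device memory
 * never does, a lite UPL is empty once its bitmap is all clear, and
 * a shadow UPL is empty once its map object's page queue is empty.
 */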
4090 occupied = 1;
4091
4092 if (upl->flags & UPL_DEVICE_MEMORY) {
4093 occupied = 0;
4094 } else if (upl->flags & UPL_LITE) {
4095 int pg_num;
4096 int i;
4097 pg_num = upl->size/PAGE_SIZE;
4098 pg_num = (pg_num + 31) >> 5;
4099 occupied = 0;
4100 for(i= 0; i<pg_num; i++) {
4101 if(lite_list[i] != 0) {
4102 occupied = 1;
4103 break;
4104 }
4105 }
4106 } else {
4107 if(queue_empty(&upl->map_object->memq)) {
4108 occupied = 0;
4109 }
4110 }
4111
4112 if(occupied == 0) {
4113 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4114 *empty = TRUE;
4115 }
4116 if(object == shadow_object)
4117 vm_object_paging_end(shadow_object);
4118 }
4119 vm_object_unlock(shadow_object);
4120 if (object != shadow_object)
4121 vm_object_unlock(object);
4122 upl_unlock(upl);
4123
4124 return KERN_SUCCESS;
4125 }
4126
4127 kern_return_t
4128 upl_abort_range(
4129 upl_t upl,
4130 upl_offset_t offset,
4131 upl_size_t size,
4132 int error,
4133 boolean_t *empty)
4134 {
4135 upl_size_t xfer_size = size;
4136 vm_object_t shadow_object;
4137 vm_object_t object = upl->map_object;
4138 vm_object_offset_t target_offset;
4139 int entry;
4140 wpl_array_t lite_list;
4141 int occupied;
4142 boolean_t shadow_internal;
4143
4144 *empty = FALSE;
4145
4146 if (upl == UPL_NULL)
4147 return KERN_INVALID_ARGUMENT;
4148
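/*
 * For I/O-wired UPLs an abort is handled as a commit: the pages just
 * need to be unwired, and the error argument is ignored.
 */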
4149 if (upl->flags & UPL_IO_WIRE) {
4150 return upl_commit_range(upl,
4151 offset, size, 0,
4152 NULL, 0, empty);
4153 }
4154
4155 if(object->pageout) {
4156 shadow_object = object->shadow;
4157 } else {
4158 shadow_object = object;
4159 }
4160
4161 upl_lock(upl);
4162 if(upl->flags & UPL_DEVICE_MEMORY) {
4163 xfer_size = 0;
4164 } else if ((offset + size) > upl->size) {
4165 upl_unlock(upl);
4166 return KERN_FAILURE;
4167 }
4168 if (object != shadow_object)
4169 vm_object_lock(object);
4170 vm_object_lock(shadow_object);
4171
4172 shadow_internal = shadow_object->internal;
4173
4174 if(upl->flags & UPL_INTERNAL) {
4175 lite_list = (wpl_array_t)
4176 ((((uintptr_t)upl) + sizeof(struct upl))
4177 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4178 } else {
4179 lite_list = (wpl_array_t)
4180 (((uintptr_t)upl) + sizeof(struct upl));
4181 }
4182
4183 entry = offset/PAGE_SIZE;
4184 target_offset = (vm_object_offset_t)offset;
4185 while(xfer_size) {
4186 vm_page_t t,m;
4187
4188 m = VM_PAGE_NULL;
4189 if(upl->flags & UPL_LITE) {
4190 int pg_num;
4191 pg_num = target_offset/PAGE_SIZE;
4192 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4193 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4194 m = vm_page_lookup(shadow_object,
4195 target_offset + (upl->offset -
4196 shadow_object->paging_offset));
4197 }
4198 }
4199 if(object->pageout) {
4200 if ((t = vm_page_lookup(object, target_offset))
4201 != NULL) {
4202 t->pageout = FALSE;
4203 VM_PAGE_FREE(t);
4204 if(m == NULL) {
4205 m = vm_page_lookup(
4206 shadow_object,
4207 target_offset +
4208 object->shadow_offset);
4209 }
4210 if(m != VM_PAGE_NULL)
4211 vm_object_paging_end(m->object);
4212 }
4213 }
4214 if(m != VM_PAGE_NULL) {
4215 vm_page_lock_queues();
4216 if(m->absent) {
4217 boolean_t must_free = TRUE;
4218
4219 /* COPYOUT = FALSE case */
4220 /* check for error conditions which must */
4221 /* be passed back to the page's customer */
4222 if(error & UPL_ABORT_RESTART) {
4223 m->restart = TRUE;
4224 m->absent = FALSE;
4225 vm_object_absent_release(m->object);
4226 m->page_error = KERN_MEMORY_ERROR;
4227 m->error = TRUE;
4228 must_free = FALSE;
4229 } else if(error & UPL_ABORT_UNAVAILABLE) {
4230 m->restart = FALSE;
4231 m->unusual = TRUE;
4232 must_free = FALSE;
4233 } else if(error & UPL_ABORT_ERROR) {
4234 m->restart = FALSE;
4235 m->absent = FALSE;
4236 vm_object_absent_release(m->object);
4237 m->page_error = KERN_MEMORY_ERROR;
4238 m->error = TRUE;
4239 must_free = FALSE;
4240 }
4241
4242 /*
4243 * ENCRYPTED SWAP:
4244 * If the page was already encrypted,
4245 * we don't really need to decrypt it
4246 * now. It will get decrypted later,
4247 * on demand, as soon as someone needs
4248 * to access its contents.
4249 */
4250
4251 m->cleaning = FALSE;
4252 m->overwriting = FALSE;
4253 PAGE_WAKEUP_DONE(m);
4254
4255 if (must_free == TRUE) {
4256 vm_page_free(m);
4257 } else {
4258 vm_page_activate(m);
4259 }
4260 vm_page_unlock_queues();
4261
4262 target_offset += PAGE_SIZE_64;
4263 xfer_size -= PAGE_SIZE;
4264 entry++;
4265 continue;
4266 }
4267 /*
4268 * Handle the trusted pager throttle.
4269 */
4270 if (m->laundry) {
4271 vm_pageout_throttle_up(m);
4272 }
4273 if(m->pageout) {
4274 assert(m->busy);
4275 assert(m->wire_count == 1);
4276 m->pageout = FALSE;
4277 vm_page_unwire(m);
4278 }
4279 m->dump_cleaning = FALSE;
4280 m->cleaning = FALSE;
4281 m->overwriting = FALSE;
4282 #if MACH_PAGEMAP
4283 vm_external_state_clr(
4284 m->object->existence_map, m->offset);
4285 #endif /* MACH_PAGEMAP */
4286 if(error & UPL_ABORT_DUMP_PAGES) {
4287 vm_page_free(m);
4288 pmap_disconnect(m->phys_page);
4289 } else {
4290 PAGE_WAKEUP_DONE(m);
4291 }
4292 vm_page_unlock_queues();
4293 }
4294 target_offset += PAGE_SIZE_64;
4295 xfer_size -= PAGE_SIZE;
4296 entry++;
4297 }
4298 occupied = 1;
4299 if (upl->flags & UPL_DEVICE_MEMORY) {
4300 occupied = 0;
4301 } else if (upl->flags & UPL_LITE) {
4302 int pg_num;
4303 int i;
4304 pg_num = upl->size/PAGE_SIZE;
4305 pg_num = (pg_num + 31) >> 5;
4306 occupied = 0;
4307 for(i= 0; i<pg_num; i++) {
4308 if(lite_list[i] != 0) {
4309 occupied = 1;
4310 break;
4311 }
4312 }
4313 } else {
4314 if(queue_empty(&upl->map_object->memq)) {
4315 occupied = 0;
4316 }
4317 }
4318
4319 if(occupied == 0) {
4320 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4321 *empty = TRUE;
4322 }
4323 if(object == shadow_object)
4324 vm_object_paging_end(shadow_object);
4325 }
4326 vm_object_unlock(shadow_object);
4327 if (object != shadow_object)
4328 vm_object_unlock(object);
4329
4330 upl_unlock(upl);
4331
4332 return KERN_SUCCESS;
4333 }
4334
4335 kern_return_t
4336 upl_abort(
4337 upl_t upl,
4338 int error)
4339 {
4340 vm_object_t object = NULL;
4341 vm_object_t shadow_object = NULL;
4342 vm_object_offset_t offset;
4343 vm_object_offset_t shadow_offset;
4344 vm_object_offset_t target_offset;
4345 upl_size_t i;
4346 wpl_array_t lite_list;
4347 vm_page_t t,m;
4348 int occupied;
4349 boolean_t shadow_internal;
4350
4351 if (upl == UPL_NULL)
4352 return KERN_INVALID_ARGUMENT;
4353
4354 if (upl->flags & UPL_IO_WIRE) {
4355 boolean_t empty;
4356 return upl_commit_range(upl,
4357 0, upl->size, 0,
4358 NULL, 0, &empty);
4359 }
4360
4361 upl_lock(upl);
4362 if(upl->flags & UPL_DEVICE_MEMORY) {
4363 upl_unlock(upl);
4364 return KERN_SUCCESS;
4365 }
4366
4367 object = upl->map_object;
4368
4369 if (object == NULL) {
4370 panic("upl_abort: upl object is not backed by an object");
4371 upl_unlock(upl);
4372 return KERN_INVALID_ARGUMENT;
4373 }
4374
4375 if(object->pageout) {
4376 shadow_object = object->shadow;
4377 shadow_offset = object->shadow_offset;
4378 } else {
4379 shadow_object = object;
4380 shadow_offset = upl->offset - object->paging_offset;
4381 }
4382
4383 if(upl->flags & UPL_INTERNAL) {
4384 lite_list = (wpl_array_t)
4385 ((((uintptr_t)upl) + sizeof(struct upl))
4386 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4387 } else {
4388 lite_list = (wpl_array_t)
4389 (((uintptr_t)upl) + sizeof(struct upl));
4390 }
4391 offset = 0;
4392
4393 if (object != shadow_object)
4394 vm_object_lock(object);
4395 vm_object_lock(shadow_object);
4396
4397 shadow_internal = shadow_object->internal;
4398
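/*
 * Unlike upl_abort_range(), this walks the entire UPL.
 */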
4399 for(i = 0; i<(upl->size); i+=PAGE_SIZE, offset += PAGE_SIZE_64) {
4400 m = VM_PAGE_NULL;
4401 target_offset = offset + shadow_offset;
4402 if(upl->flags & UPL_LITE) {
4403 int pg_num;
4404 pg_num = offset/PAGE_SIZE;
4405 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4406 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4407 m = vm_page_lookup(
4408 shadow_object, target_offset);
4409 }
4410 }
4411 if(object->pageout) {
4412 if ((t = vm_page_lookup(object, offset)) != NULL) {
4413 t->pageout = FALSE;
4414 VM_PAGE_FREE(t);
4415 if(m == NULL) {
4416 m = vm_page_lookup(
4417 shadow_object, target_offset);
4418 }
4419 if(m != VM_PAGE_NULL)
4420 vm_object_paging_end(m->object);
4421 }
4422 }
4423 if(m != VM_PAGE_NULL) {
4424 vm_page_lock_queues();
4425 if(m->absent) {
4426 boolean_t must_free = TRUE;
4427
4428 /* COPYOUT = FALSE case */
4429 /* check for error conditions which must */
4430 /* be passed back to the page's customer */
4431 if(error & UPL_ABORT_RESTART) {
4432 m->restart = TRUE;
4433 m->absent = FALSE;
4434 vm_object_absent_release(m->object);
4435 m->page_error = KERN_MEMORY_ERROR;
4436 m->error = TRUE;
4437 must_free = FALSE;
4438 } else if(error & UPL_ABORT_UNAVAILABLE) {
4439 m->restart = FALSE;
4440 m->unusual = TRUE;
4441 must_free = FALSE;
4442 } else if(error & UPL_ABORT_ERROR) {
4443 m->restart = FALSE;
4444 m->absent = FALSE;
4445 vm_object_absent_release(m->object);
4446 m->page_error = KERN_MEMORY_ERROR;
4447 m->error = TRUE;
4448 must_free = FALSE;
4449 }
4450
4451 /*
4452 * ENCRYPTED SWAP:
4453 * If the page was already encrypted,
4454 * we don't really need to decrypt it
4455 * now. It will get decrypted later,
4456 * on demand, as soon as someone needs
4457 * to access its contents.
4458 */
4459
4460 m->cleaning = FALSE;
4461 m->overwriting = FALSE;
4462 PAGE_WAKEUP_DONE(m);
4463
4464 if (must_free == TRUE) {
4465 vm_page_free(m);
4466 } else {
4467 vm_page_activate(m);
4468 }
4469 vm_page_unlock_queues();
4470 continue;
4471 }
4472 /*
4473 * Handle the trusted pager throttle.
4474 */
4475 if (m->laundry) {
4476 vm_pageout_throttle_up(m);
4477 }
4478 if(m->pageout) {
4479 assert(m->busy);
4480 assert(m->wire_count == 1);
4481 m->pageout = FALSE;
4482 vm_page_unwire(m);
4483 }
4484 m->dump_cleaning = FALSE;
4485 m->cleaning = FALSE;
4486 m->overwriting = FALSE;
4487 #if MACH_PAGEMAP
4488 vm_external_state_clr(
4489 m->object->existence_map, m->offset);
4490 #endif /* MACH_PAGEMAP */
4491 if(error & UPL_ABORT_DUMP_PAGES) {
4492 vm_page_free(m);
4493 pmap_disconnect(m->phys_page);
4494 } else {
4495 PAGE_WAKEUP_DONE(m);
4496 }
4497 vm_page_unlock_queues();
4498 }
4499 }
4500 occupied = 1;
4501 if (upl->flags & UPL_DEVICE_MEMORY) {
4502 occupied = 0;
4503 } else if (upl->flags & UPL_LITE) {
4504 int pg_num;
4505 int j;
4506 pg_num = upl->size/PAGE_SIZE;
4507 pg_num = (pg_num + 31) >> 5;
4508 occupied = 0;
4509 for(j= 0; j<pg_num; j++) {
4510 if(lite_list[j] != 0) {
4511 occupied = 1;
4512 break;
4513 }
4514 }
4515 } else {
4516 if(queue_empty(&upl->map_object->memq)) {
4517 occupied = 0;
4518 }
4519 }
4520
4521 if(occupied == 0) {
4522 if(object == shadow_object)
4523 vm_object_paging_end(shadow_object);
4524 }
4525 vm_object_unlock(shadow_object);
4526 if (object != shadow_object)
4527 vm_object_unlock(object);
4528
4529 upl_unlock(upl);
4530 return KERN_SUCCESS;
4531 }
4532
4533 /* an option on commit should be wire */
4534 kern_return_t
4535 upl_commit(
4536 upl_t upl,
4537 upl_page_info_t *page_list,
4538 mach_msg_type_number_t count)
4539 {
4540 if (upl == UPL_NULL)
4541 return KERN_INVALID_ARGUMENT;
4542
4543 if(upl->flags & (UPL_LITE | UPL_IO_WIRE)) {
4544 boolean_t empty;
4545 return upl_commit_range(upl, 0, upl->size, 0,
4546 page_list, count, &empty);
4547 }
4548
4549 if (count == 0)
4550 page_list = NULL;
4551
4552 upl_lock(upl);
4553 if (upl->flags & UPL_DEVICE_MEMORY)
4554 page_list = NULL;
4555
4556 if (upl->flags & UPL_ENCRYPTED) {
4557 /*
4558 * ENCRYPTED SWAP:
4559 * This UPL was encrypted, but we don't need
4560 * to decrypt here. We'll decrypt each page
4561 * later, on demand, as soon as someone needs
4562 * to access the page's contents.
4563 */
4564 }
4565
4566 if ((upl->flags & UPL_CLEAR_DIRTY) ||
4567 (upl->flags & UPL_PAGE_SYNC_DONE) || page_list) {
4568 vm_object_t shadow_object = upl->map_object->shadow;
4569 vm_object_t object = upl->map_object;
4570 vm_object_offset_t target_offset;
4571 upl_size_t xfer_end;
4572 int entry;
4573
4574 vm_page_t t, m;
4575 upl_page_info_t *p;
4576
4577 if (object != shadow_object)
4578 vm_object_lock(object);
4579 vm_object_lock(shadow_object);
4580
4581 entry = 0;
4582 target_offset = object->shadow_offset;
4583 xfer_end = upl->size + object->shadow_offset;
4584
4585 while(target_offset < xfer_end) {
4586
4587 if ((t = vm_page_lookup(object,
4588 target_offset - object->shadow_offset))
4589 == NULL) {
4590 target_offset += PAGE_SIZE_64;
4591 entry++;
4592 continue;
4593 }
4594
4595 m = vm_page_lookup(shadow_object, target_offset);
4596 if(m != VM_PAGE_NULL) {
4597 /*
4598 * ENCRYPTED SWAP:
4599 * If this page was encrypted, we
4600 * don't need to decrypt it here.
4601 * We'll decrypt it later, on demand,
4602 * as soon as someone needs to access
4603 * its contents.
4604 */
4605
4606 if (upl->flags & UPL_CLEAR_DIRTY) {
4607 pmap_clear_modify(m->phys_page);
4608 m->dirty = FALSE;
4609 }
4610 /*
4611  * It is part of the semantics of COPYOUT_FROM UPLs that a
4612  * commit implies a cache sync between the vm page and the
4613  * backing store; this can be used to strip the precious bit
4614  * as well as to clean.
4615  */
4616 if (upl->flags & UPL_PAGE_SYNC_DONE)
4617 m->precious = FALSE;
4618
4619 if(page_list) {
4620 p = &(page_list[entry]);
4621 if(page_list[entry].phys_addr &&
4622 p->pageout && !m->pageout) {
4623 vm_page_lock_queues();
4624 m->busy = TRUE;
4625 m->pageout = TRUE;
4626 vm_page_wire(m);
4627 vm_page_unlock_queues();
4628 } else if (page_list[entry].phys_addr &&
4629 !p->pageout && m->pageout &&
4630 !m->dump_cleaning) {
4631 vm_page_lock_queues();
4632 m->pageout = FALSE;
4633 m->absent = FALSE;
4634 m->overwriting = FALSE;
4635 vm_page_unwire(m);
4636 PAGE_WAKEUP_DONE(m);
4637 vm_page_unlock_queues();
4638 }
4639 page_list[entry].phys_addr = 0;
4640 }
4641 }
4642 target_offset += PAGE_SIZE_64;
4643 entry++;
4644 }
4645 vm_object_unlock(shadow_object);
4646 if (object != shadow_object)
4647 vm_object_unlock(object);
4648
4649 }
4650 if (upl->flags & UPL_DEVICE_MEMORY) {
4651 vm_object_lock(upl->map_object->shadow);
4652 if(upl->map_object == upl->map_object->shadow)
4653 vm_object_paging_end(upl->map_object->shadow);
4654 vm_object_unlock(upl->map_object->shadow);
4655 }
4656 upl_unlock(upl);
4657 return KERN_SUCCESS;
4658 }
4659
4660
4661
4662 kern_return_t
4663 vm_object_iopl_request(
4664 vm_object_t object,
4665 vm_object_offset_t offset,
4666 upl_size_t size,
4667 upl_t *upl_ptr,
4668 upl_page_info_array_t user_page_list,
4669 unsigned int *page_list_count,
4670 int cntrl_flags)
4671 {
4672 vm_page_t dst_page;
4673 vm_object_offset_t dst_offset = offset;
4674 upl_size_t xfer_size = size;
4675 upl_t upl = NULL;
4676 unsigned int entry;
4677 wpl_array_t lite_list = NULL;
4678 int page_field_size;
4679 int delayed_unlock = 0;
4680 int no_zero_fill = FALSE;
4681 vm_page_t alias_page = NULL;
4682 kern_return_t ret;
4683 vm_prot_t prot;
4684
4685
4686 if (cntrl_flags & ~UPL_VALID_FLAGS) {
4687 /*
4688 * For forward compatibility's sake,
4689 * reject any unknown flag.
4690 */
4691 return KERN_INVALID_VALUE;
4692 }
4693
4694 if (cntrl_flags & UPL_ENCRYPT) {
4695 /*
4696 * ENCRYPTED SWAP:
4697 * The paging path doesn't use this interface,
4698 * so we don't support the UPL_ENCRYPT flag
4699 * here. We won't encrypt the pages.
4700 */
4701 assert(! (cntrl_flags & UPL_ENCRYPT));
4702 }
4703
4704 if (cntrl_flags & UPL_NOZEROFILL)
4705 no_zero_fill = TRUE;
4706
4707 if (cntrl_flags & UPL_COPYOUT_FROM)
4708 prot = VM_PROT_READ;
4709 else
4710 prot = VM_PROT_READ | VM_PROT_WRITE;
4711
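/*
 * Clamp the request to MAX_UPL_TRANSFER pages, except for physically
 * contiguous objects, which are treated as device memory and are not
 * described page by page.
 */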
4712 if(((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
4713 size = MAX_UPL_TRANSFER * page_size;
4714 }
4715
4716 if(cntrl_flags & UPL_SET_INTERNAL)
4717 if(page_list_count != NULL)
4718 *page_list_count = MAX_UPL_TRANSFER;
4719 if(((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4720 ((page_list_count != NULL) && (*page_list_count != 0)
4721 && *page_list_count < (size/page_size)))
4722 return KERN_INVALID_ARGUMENT;
4723
4724 if((!object->internal) && (object->paging_offset != 0))
4725 panic("vm_object_upl_request: vnode object with non-zero paging offset\n");
4726
4727 if(object->phys_contiguous) {
4728 /* No paging operations are possible against this memory */
4729 /* and so no need for map object, ever */
4730 cntrl_flags |= UPL_SET_LITE;
4731 }
4732
4733 if(upl_ptr) {
4734 if(cntrl_flags & UPL_SET_INTERNAL) {
4735 if(cntrl_flags & UPL_SET_LITE) {
4736 upl = upl_create(
4737 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
4738 size);
4739 user_page_list = (upl_page_info_t *)
4740 (((uintptr_t)upl) + sizeof(struct upl));
4741 lite_list = (wpl_array_t)
4742 (((uintptr_t)user_page_list) +
4743 ((size/PAGE_SIZE) *
4744 sizeof(upl_page_info_t)));
4745 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4746 page_field_size =
4747 (page_field_size + 3) & 0xFFFFFFFC;
4748 bzero((char *)lite_list, page_field_size);
4749 upl->flags =
4750 UPL_LITE | UPL_INTERNAL | UPL_IO_WIRE;
4751 } else {
4752 upl = upl_create(UPL_CREATE_INTERNAL, size);
4753 user_page_list = (upl_page_info_t *)
4754 (((uintptr_t)upl)
4755 + sizeof(struct upl));
4756 upl->flags = UPL_INTERNAL | UPL_IO_WIRE;
4757 }
4758 } else {
4759 if(cntrl_flags & UPL_SET_LITE) {
4760 upl = upl_create(UPL_CREATE_LITE, size);
4761 lite_list = (wpl_array_t)
4762 (((uintptr_t)upl) + sizeof(struct upl));
4763 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4764 page_field_size =
4765 (page_field_size + 3) & 0xFFFFFFFC;
4766 bzero((char *)lite_list, page_field_size);
4767 upl->flags = UPL_LITE | UPL_IO_WIRE;
4768 } else {
4769 upl = upl_create(UPL_CREATE_EXTERNAL, size);
4770 upl->flags = UPL_IO_WIRE;
4771 }
4772 }
4773
4774 if(object->phys_contiguous) {
4775 upl->map_object = object;
4776 /* don't need any shadow mappings for this one */
4777 /* since it is already I/O memory */
4778 upl->flags |= UPL_DEVICE_MEMORY;
4779
4780 vm_object_lock(object);
4781 vm_object_paging_begin(object);
4782 vm_object_unlock(object);
4783
4784 /* paging in progress also protects the paging_offset */
4785 upl->offset = offset + object->paging_offset;
4786 upl->size = size;
4787 *upl_ptr = upl;
4788 if(user_page_list) {
4789 user_page_list[0].phys_addr =
4790 (offset + object->shadow_offset)>>PAGE_SHIFT;
4791 user_page_list[0].device = TRUE;
4792 }
4793
4794 if(page_list_count != NULL) {
4795 if (upl->flags & UPL_INTERNAL) {
4796 *page_list_count = 0;
4797 } else {
4798 *page_list_count = 1;
4799 }
4800 }
4801 return KERN_SUCCESS;
4802 }
4803 if(user_page_list)
4804 user_page_list[0].device = FALSE;
4805
4806 if(cntrl_flags & UPL_SET_LITE) {
4807 upl->map_object = object;
4808 } else {
4809 upl->map_object = vm_object_allocate(size);
4810 vm_object_lock(upl->map_object);
4811 upl->map_object->shadow = object;
4812 upl->map_object->pageout = TRUE;
4813 upl->map_object->can_persist = FALSE;
4814 upl->map_object->copy_strategy =
4815 MEMORY_OBJECT_COPY_NONE;
4816 upl->map_object->shadow_offset = offset;
4817 upl->map_object->wimg_bits = object->wimg_bits;
4818 vm_object_unlock(upl->map_object);
4819 }
4820 }
4821 vm_object_lock(object);
4822 vm_object_paging_begin(object);
4823
4824 if (!object->phys_contiguous) {
4825 /* Protect user space from future COW operations */
4826 object->true_share = TRUE;
4827 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4828 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4829 }
4830
4831 /* we can lock the upl offset now that paging_in_progress is set */
4832 if(upl_ptr) {
4833 upl->size = size;
4834 upl->offset = offset + object->paging_offset;
4835 *upl_ptr = upl;
4836 #ifdef UPL_DEBUG
4837 queue_enter(&object->uplq, upl, upl_t, uplq);
4838 #endif /* UPL_DEBUG */
4839 }
4840
4841 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4842 /*
4843 * The user requested that access to the pages in this UPL
4844 * be blocked until the UPL is committed or aborted.
4845 */
4846 upl->flags |= UPL_ACCESS_BLOCKED;
4847 }
4848
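/*
 * Walk the requested range: fault in any page that is missing, busy
 * or encrypted, wire it, and record it either in the lite bitmap or
 * through an alias page in the map object.
 */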
4849 entry = 0;
4850 while (xfer_size) {
4851 if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
4852 if (delayed_unlock) {
4853 delayed_unlock = 0;
4854 vm_page_unlock_queues();
4855 }
4856 vm_object_unlock(object);
4857 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4858 vm_object_lock(object);
4859 }
4860 dst_page = vm_page_lookup(object, dst_offset);
4861
4862 /*
4863 * ENCRYPTED SWAP:
4864 * If the page is encrypted, we need to decrypt it,
4865 * so force a soft page fault.
4866 */
4867 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4868 (dst_page->encrypted) ||
4869 (dst_page->unusual && (dst_page->error ||
4870 dst_page->restart ||
4871 dst_page->absent ||
4872 dst_page->fictitious ||
4873 (prot & dst_page->page_lock)))) {
4874 vm_fault_return_t result;
4875 do {
4876 vm_page_t top_page;
4877 kern_return_t error_code;
4878 int interruptible;
4879
4880 vm_object_offset_t lo_offset = offset;
4881 vm_object_offset_t hi_offset = offset + size;
4882
4883
4884 if (delayed_unlock) {
4885 delayed_unlock = 0;
4886 vm_page_unlock_queues();
4887 }
4888
4889 if(cntrl_flags & UPL_SET_INTERRUPTIBLE) {
4890 interruptible = THREAD_ABORTSAFE;
4891 } else {
4892 interruptible = THREAD_UNINT;
4893 }
4894
4895 result = vm_fault_page(object, dst_offset,
4896 prot | VM_PROT_WRITE, FALSE,
4897 interruptible,
4898 lo_offset, hi_offset,
4899 VM_BEHAVIOR_SEQUENTIAL,
4900 &prot, &dst_page, &top_page,
4901 (int *)0,
4902 &error_code, no_zero_fill, FALSE, NULL, 0);
4903
4904 switch(result) {
4905 case VM_FAULT_SUCCESS:
4906
4907 PAGE_WAKEUP_DONE(dst_page);
4908
4909 /*
4910 * Release paging references and
4911 * top-level placeholder page, if any.
4912 */
4913
4914 if(top_page != VM_PAGE_NULL) {
4915 vm_object_t local_object;
4916 local_object =
4917 top_page->object;
4918 if(top_page->object
4919 != dst_page->object) {
4920 vm_object_lock(
4921 local_object);
4922 VM_PAGE_FREE(top_page);
4923 vm_object_paging_end(
4924 local_object);
4925 vm_object_unlock(
4926 local_object);
4927 } else {
4928 VM_PAGE_FREE(top_page);
4929 vm_object_paging_end(
4930 local_object);
4931 }
4932 }
4933
4934 break;
4935
4936
4937 case VM_FAULT_RETRY:
4938 vm_object_lock(object);
4939 vm_object_paging_begin(object);
4940 break;
4941
4942 case VM_FAULT_FICTITIOUS_SHORTAGE:
4943 vm_page_more_fictitious();
4944 vm_object_lock(object);
4945 vm_object_paging_begin(object);
4946 break;
4947
4948 case VM_FAULT_MEMORY_SHORTAGE:
4949 if (vm_page_wait(interruptible)) {
4950 vm_object_lock(object);
4951 vm_object_paging_begin(object);
4952 break;
4953 }
4954 /* fall thru */
4955
4956 case VM_FAULT_INTERRUPTED:
4957 error_code = MACH_SEND_INTERRUPTED;
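/* fall thru */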
4958 case VM_FAULT_MEMORY_ERROR:
4959 ret = (error_code ? error_code:
4960 KERN_MEMORY_ERROR);
4961 vm_object_lock(object);
4962 for(; offset < dst_offset;
4963 offset += PAGE_SIZE) {
4964 dst_page = vm_page_lookup(
4965 object, offset);
4966 if(dst_page == VM_PAGE_NULL)
4967 panic("vm_object_iopl_request: Wired pages missing. \n");
4968 vm_page_lock_queues();
4969 vm_page_unwire(dst_page);
4970 vm_page_unlock_queues();
4971 VM_STAT(reactivations++);
4972 }
4973 vm_object_unlock(object);
4974 upl_destroy(upl);
4975 return ret;
4976 }
4977 } while ((result != VM_FAULT_SUCCESS)
4978 || (result == VM_FAULT_INTERRUPTED));
4979 }
4980 if (delayed_unlock == 0)
4981 vm_page_lock_queues();
4982 vm_page_wire(dst_page);
4983
4984 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4985 /*
4986 * Mark the page "busy" to block any future page fault
4987 * on this page. We'll also remove the mapping
4988 * of all these pages before leaving this routine.
4989 */
4990 assert(!dst_page->fictitious);
4991 dst_page->busy = TRUE;
4992 }
4993
4994 if (upl_ptr) {
4995 if (cntrl_flags & UPL_SET_LITE) {
4996 int pg_num;
4997 pg_num = (dst_offset-offset)/PAGE_SIZE;
4998 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
4999 } else {
5000 /*
5001 * Convert the fictitious page to a
5002 * private shadow of the real page.
5003 */
5004 assert(alias_page->fictitious);
5005 alias_page->fictitious = FALSE;
5006 alias_page->private = TRUE;
5007 alias_page->pageout = TRUE;
5008 alias_page->phys_page = dst_page->phys_page;
5009 vm_page_wire(alias_page);
5010
5011 vm_page_insert(alias_page,
5012 upl->map_object, size - xfer_size);
5013 assert(!alias_page->wanted);
5014 alias_page->busy = FALSE;
5015 alias_page->absent = FALSE;
5016 }
5017
5018 /* expect the page to be used */
5019 dst_page->reference = TRUE;
5020
5021 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5022 dst_page->dirty = TRUE;
5023 alias_page = NULL;
5024
5025 if (user_page_list) {
5026 user_page_list[entry].phys_addr
5027 = dst_page->phys_page;
5028 user_page_list[entry].dirty =
5029 dst_page->dirty;
5030 user_page_list[entry].pageout =
5031 dst_page->pageout;
5032 user_page_list[entry].absent =
5033 dst_page->absent;
5034 user_page_list[entry].precious =
5035 dst_page->precious;
5036 }
5037 }
5038 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
5039 delayed_unlock = 0;
5040 vm_page_unlock_queues();
5041 }
5042 entry++;
5043 dst_offset += PAGE_SIZE_64;
5044 xfer_size -= PAGE_SIZE;
5045 }
5046 if (delayed_unlock)
5047 vm_page_unlock_queues();
5048
5049 if (upl->flags & UPL_INTERNAL) {
5050 if(page_list_count != NULL)
5051 *page_list_count = 0;
5052 } else if (page_list_count != NULL &&
5053 *page_list_count > entry) {
5054 *page_list_count = entry;
5055 }
5056
5057 if (alias_page != NULL) {
5058 vm_page_lock_queues();
5059 vm_page_free(alias_page);
5060 vm_page_unlock_queues();
5061 }
5062
5063 vm_object_unlock(object);
5064
5065 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5066 /*
5067 * We've marked all the pages "busy" so that future
5068 * page faults will block.
5069 * Now remove the mapping for these pages, so that they
5070 * can't be accessed without causing a page fault.
5071 */
5072 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5073 PMAP_NULL, 0, VM_PROT_NONE);
5074 }
5075
5076 return KERN_SUCCESS;
5077 }
5078
5079 kern_return_t
5080 upl_transpose(
5081 upl_t upl1,
5082 upl_t upl2)
5083 {
5084 kern_return_t retval;
5085 boolean_t upls_locked;
5086 vm_object_t object1, object2;
5087
5088 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5089 return KERN_INVALID_ARGUMENT;
5090 }
5091
5092 upls_locked = FALSE;
5093
5094 /*
5095 * Since we need to lock both UPLs at the same time,
5096 * avoid deadlocks by always taking locks in the same order.
5097 */
5098 if (upl1 < upl2) {
5099 upl_lock(upl1);
5100 upl_lock(upl2);
5101 } else {
5102 upl_lock(upl2);
5103 upl_lock(upl1);
5104 }
5105 upls_locked = TRUE; /* the UPLs will need to be unlocked */
5106
5107 object1 = upl1->map_object;
5108 object2 = upl2->map_object;
5109
5110 if (upl1->offset != 0 || upl2->offset != 0 ||
5111 upl1->size != upl2->size) {
5112 /*
5113 * We deal only with full objects, not subsets.
5114 * That's because we exchange the entire backing store info
5115 * for the objects: pager, resident pages, etc... We can't do
5116 * only part of it.
5117 */
5118 retval = KERN_INVALID_VALUE;
5119 goto done;
5120 }
5121
5122 /*
5123 * Transpose the VM objects' backing store.
5124 */
5125 retval = vm_object_transpose(object1, object2,
5126 (vm_object_size_t) upl1->size);
5127
5128 if (retval == KERN_SUCCESS) {
5129 /*
5130 * Make each UPL point to the correct VM object, i.e. the
5131 * object holding the pages that the UPL refers to...
5132 */
5133 upl1->map_object = object2;
5134 upl2->map_object = object1;
5135 }
5136
5137 done:
5138 /*
5139 * Cleanup.
5140 */
5141 if (upls_locked) {
5142 upl_unlock(upl1);
5143 upl_unlock(upl2);
5144 upls_locked = FALSE;
5145 }
5146
5147 return retval;
5148 }
5149
5150 /*
5151 * ENCRYPTED SWAP:
5152 *
5153 * Rationale: the user might have some encrypted data on disk (via
5154 * FileVault or any other mechanism). That data is then decrypted in
5155 * memory, which is safe as long as the machine is secure. But that
5156 * decrypted data in memory could be paged out to disk by the default
5157 * pager. The data would then be stored on disk in clear (not encrypted)
5158 * and it could be accessed by anyone who gets physical access to the
5159 * disk (if the laptop or the disk gets stolen for example). This weakens
5160 * the security offered by FileVault.
5161 *
5162 * Solution: the default pager will optionally request that all the
5163 * pages it gathers for pageout be encrypted, via the UPL interfaces,
5164 * before it sends this UPL to disk via the vnode_pageout() path.
5165 *
5166 * Notes:
5167 *
5168 * To avoid disrupting the VM LRU algorithms, we want to keep the
5169 * clean-in-place mechanisms, which allow us to send some extra pages to
5170 * swap (clustering) without actually removing them from the user's
5171 * address space. We don't want the user to unknowingly access encrypted
5172 * data, so we have to actually remove the encrypted pages from the page
5173 * table. When the user accesses the data, the hardware will fail to
5174 * locate the virtual page in its page table and will trigger a page
5175 * fault. We can then decrypt the page and enter it in the page table
5176 * again. Whenever we allow the user to access the contents of a page,
5177 * we have to make sure it's not encrypted.
5178 *
5179 *
5180 */
5181 /*
5182 * ENCRYPTED SWAP:
5183 * Reserve of virtual addresses in the kernel address space.
5184 * We need to map the physical pages in the kernel, so that we
5185 * can call the encryption/decryption routines with a kernel
5186 * virtual address. We keep this pool of pre-allocated kernel
5187 * virtual addresses so that we don't have to scan the kernel's
5188 * virtual address space each time we need to encrypt or decrypt
5189 * a physical page.
5190 * It would be nice to be able to encrypt and decrypt in physical
5191 * mode but that might not always be more efficient...
5192 */
5193 decl_simple_lock_data(,vm_paging_lock)
5194 #define VM_PAGING_NUM_PAGES 64
5195 vm_map_offset_t vm_paging_base_address = 0;
5196 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5197 int vm_paging_max_index = 0;
5198 unsigned long vm_paging_no_kernel_page = 0;
5199 unsigned long vm_paging_objects_mapped = 0;
5200 unsigned long vm_paging_pages_mapped = 0;
5201 unsigned long vm_paging_objects_mapped_slow = 0;
5202 unsigned long vm_paging_pages_mapped_slow = 0;
5203
5204 /*
5205 * ENCRYPTED SWAP:
5206 * vm_paging_map_object:
5207 * Maps part of a VM object's pages in the kernel
5208 * virtual address space, using the pre-allocated
5209 * kernel virtual addresses, if possible.
5210 * Context:
5211 * The VM object is locked. This lock will get
5212 * dropped and re-acquired though.
5213 */
5214 kern_return_t
5215 vm_paging_map_object(
5216 vm_map_offset_t *address,
5217 vm_page_t page,
5218 vm_object_t object,
5219 vm_object_offset_t offset,
5220 vm_map_size_t *size)
5221 {
5222 kern_return_t kr;
5223 vm_map_offset_t page_map_offset;
5224 vm_map_size_t map_size;
5225 vm_object_offset_t object_offset;
5226 #ifdef __ppc__
5227 int i;
5228 vm_map_entry_t map_entry;
5229 #endif /* __ppc__ */
5230
5231
5232 #ifdef __ppc__
5233 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5234 /*
5235 * Optimization for the PowerPC.
5236 * Use one of the pre-allocated kernel virtual addresses
5237 * and just enter the VM page in the kernel address space
5238 * at that virtual address.
5239 */
5240 vm_object_unlock(object);
5241 simple_lock(&vm_paging_lock);
5242
5243 if (vm_paging_base_address == 0) {
5244 /*
5245 * Initialize our pool of pre-allocated kernel
5246 * virtual addresses.
5247 */
5248 simple_unlock(&vm_paging_lock);
5249 page_map_offset = 0;
5250 kr = vm_map_find_space(kernel_map,
5251 &page_map_offset,
5252 VM_PAGING_NUM_PAGES * PAGE_SIZE,
5253 0,
5254 &map_entry);
5255 if (kr != KERN_SUCCESS) {
5256 panic("vm_paging_map_object: "
5257 "kernel_map full\n");
5258 }
5259 map_entry->object.vm_object = kernel_object;
5260 map_entry->offset =
5261 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5262 vm_object_reference(kernel_object);
5263 vm_map_unlock(kernel_map);
5264
5265 simple_lock(&vm_paging_lock);
5266 if (vm_paging_base_address != 0) {
5267 /* someone raced us and won: undo */
5268 simple_unlock(&vm_paging_lock);
5269 kr = vm_map_remove(kernel_map,
5270 page_map_offset,
5271 page_map_offset +
5272 (VM_PAGING_NUM_PAGES
5273 * PAGE_SIZE),
5274 VM_MAP_NO_FLAGS);
5275 assert(kr == KERN_SUCCESS);
5276 simple_lock(&vm_paging_lock);
5277 } else {
5278 vm_paging_base_address = page_map_offset;
5279 }
5280 }
5281
5282 /*
5283 * Try and find an available kernel virtual address
5284 * from our pre-allocated pool.
5285 */
5286 page_map_offset = 0;
5287 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5288 if (vm_paging_page_inuse[i] == FALSE) {
5289 page_map_offset = vm_paging_base_address +
5290 (i * PAGE_SIZE);
5291 break;
5292 }
5293 }
5294
5295 if (page_map_offset != 0) {
5296 /*
5297 * We found a kernel virtual address;
5298 * map the physical page to that virtual address.
5299 */
5300 if (i > vm_paging_max_index) {
5301 vm_paging_max_index = i;
5302 }
5303 vm_paging_page_inuse[i] = TRUE;
5304 simple_unlock(&vm_paging_lock);
5305 pmap_map_block(kernel_pmap,
5306 page_map_offset,
5307 page->phys_page,
5308 1, /* Size is number of 4k pages */
5309 VM_PROT_DEFAULT,
5310 ((int) page->object->wimg_bits &
5311 VM_WIMG_MASK),
5312 0);
5313 vm_paging_objects_mapped++;
5314 vm_paging_pages_mapped++;
5315 *address = page_map_offset;
5316 vm_object_lock(object);
5317
5318 /* all done and mapped, ready to use ! */
5319 return KERN_SUCCESS;
5320 }
5321
5322 /*
5323 * We ran out of pre-allocated kernel virtual
5324 * addresses. Just map the page in the kernel
5325 * the slow and regular way.
5326 */
5327 vm_paging_no_kernel_page++;
5328 simple_unlock(&vm_paging_lock);
5329 vm_object_lock(object);
5330 }
5331 #endif /* __ppc__ */
5332
5333 object_offset = vm_object_trunc_page(offset);
5334 map_size = vm_map_round_page(*size);
5335
5336 /*
5337 * Try and map the required range of the object
5338 * in the kernel_map
5339 */
5340
5341 /* don't go beyond the object's end... */
5342 if (object_offset >= object->size) {
5343 map_size = 0;
5344 } else if (map_size > object->size - offset) {
5345 map_size = object->size - offset;
5346 }
5347
5348 vm_object_reference_locked(object); /* for the map entry */
5349 vm_object_unlock(object);
5350
5351 kr = vm_map_enter(kernel_map,
5352 address,
5353 map_size,
5354 0,
5355 VM_FLAGS_ANYWHERE,
5356 object,
5357 object_offset,
5358 FALSE,
5359 VM_PROT_DEFAULT,
5360 VM_PROT_ALL,
5361 VM_INHERIT_NONE);
5362 if (kr != KERN_SUCCESS) {
5363 *address = 0;
5364 *size = 0;
5365 vm_object_deallocate(object); /* for the map entry */
5366 return kr;
5367 }
5368
5369 *size = map_size;
5370
5371 /*
5372 * Enter the mapped pages in the page table now.
5373 */
5374 vm_object_lock(object);
5375 for (page_map_offset = 0;
5376 map_size != 0;
5377 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5378 unsigned int cache_attr;
5379
5380 page = vm_page_lookup(object, offset + page_map_offset);
5381 if (page == VM_PAGE_NULL) {
5382 panic("vm_paging_map_object: no page !?");
5383 }
5384 if (page->no_isync == TRUE) {
5385 pmap_sync_page_data_phys(page->phys_page);
5386 }
5387 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5388
5389 PMAP_ENTER(kernel_pmap,
5390 *address + page_map_offset,
5391 page,
5392 VM_PROT_DEFAULT,
5393 cache_attr,
5394 FALSE);
5395 }
5396
5397 vm_paging_objects_mapped_slow++;
5398 vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5399
5400 return KERN_SUCCESS;
5401 }
5402
5403 /*
5404 * ENCRYPTED SWAP:
5405 * vm_paging_unmap_object:
5406 * Unmaps part of a VM object's pages from the kernel
5407 * virtual address space.
5408 * Context:
5409 * The VM object is locked. This lock will get
5410 * dropped and re-acquired though.
5411 */
5412 void
5413 vm_paging_unmap_object(
5414 vm_object_t object,
5415 vm_map_offset_t start,
5416 vm_map_offset_t end)
5417 {
5418 kern_return_t kr;
5419 #ifdef __ppc__
5420 int i;
5421 #endif /* __ppc__ */
5422
5423 if ((vm_paging_base_address != 0) &&
5424 ((start < vm_paging_base_address) ||
5425 (end > (vm_paging_base_address
5426 + (VM_PAGING_NUM_PAGES * PAGE_SIZE))))) {
5427 /*
5428 * We didn't use our pre-allocated pool of
5429 * kernel virtual address. Deallocate the
5430 * virtual memory.
5431 */
5432 if (object != VM_OBJECT_NULL) {
5433 vm_object_unlock(object);
5434 }
5435 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5436 if (object != VM_OBJECT_NULL) {
5437 vm_object_lock(object);
5438 }
5439 assert(kr == KERN_SUCCESS);
5440 } else {
5441 /*
5442 * We used a kernel virtual address from our
5443 * pre-allocated pool. Put it back in the pool
5444 * for next time.
5445 */
5446 #ifdef __ppc__
5447 assert(end - start == PAGE_SIZE);
5448 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5449
5450 /* undo the pmap mapping */
5451 mapping_remove(kernel_pmap, start);
5452
5453 simple_lock(&vm_paging_lock);
5454 vm_paging_page_inuse[i] = FALSE;
5455 simple_unlock(&vm_paging_lock);
5456 #endif /* __ppc__ */
5457 }
5458 }
5459
5460 /*
5461 * Encryption data.
5462 * "iv" is the "initial vector". Ideally, we want to
5463 * have a different one for each page we encrypt, so that
5464 * crackers can't find encryption patterns too easily.
5465 */
5466 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
5467 boolean_t swap_crypt_ctx_initialized = FALSE;
5468 aes_32t swap_crypt_key[8]; /* big enough for a 256 key */
5469 aes_ctx swap_crypt_ctx;
5470 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5471
5472 #if DEBUG
5473 boolean_t swap_crypt_ctx_tested = FALSE;
5474 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5475 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5476 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5477 #endif /* DEBUG */
5478
5479 extern u_long random(void);
5480
5481 /*
5482 * Initialize the encryption context: key and key size.
5483 */
5484 void swap_crypt_ctx_initialize(void); /* forward */
5485 void
5486 swap_crypt_ctx_initialize(void)
5487 {
5488 unsigned int i;
5489
5490 /*
5491 * No need for locking to protect swap_crypt_ctx_initialized
5492 * because the first use of encryption will come from the
5493 * pageout thread (we won't pagein before there's been a pageout)
5494 * and there's only one pageout thread.
5495 */
5496 if (swap_crypt_ctx_initialized == FALSE) {
5497 for (i = 0;
5498 i < (sizeof (swap_crypt_key) /
5499 sizeof (swap_crypt_key[0]));
5500 i++) {
5501 swap_crypt_key[i] = random();
5502 }
5503 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5504 SWAP_CRYPT_AES_KEY_SIZE,
5505 &swap_crypt_ctx.encrypt);
5506 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5507 SWAP_CRYPT_AES_KEY_SIZE,
5508 &swap_crypt_ctx.decrypt);
5509 swap_crypt_ctx_initialized = TRUE;
5510 }
5511
5512 #if DEBUG
5513 /*
5514 * Validate the encryption algorithms.
5515 */
5516 if (swap_crypt_ctx_tested == FALSE) {
5517 /* initialize */
5518 for (i = 0; i < 4096; i++) {
5519 swap_crypt_test_page_ref[i] = (char) i;
5520 }
5521 /* encrypt */
5522 aes_encrypt_cbc(swap_crypt_test_page_ref,
5523 swap_crypt_null_iv,
5524 PAGE_SIZE / AES_BLOCK_SIZE,
5525 swap_crypt_test_page_encrypt,
5526 &swap_crypt_ctx.encrypt);
5527 /* decrypt */
5528 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5529 swap_crypt_null_iv,
5530 PAGE_SIZE / AES_BLOCK_SIZE,
5531 swap_crypt_test_page_decrypt,
5532 &swap_crypt_ctx.decrypt);
5533 /* compare result with original */
5534 for (i = 0; i < 4096; i ++) {
5535 if (swap_crypt_test_page_decrypt[i] !=
5536 swap_crypt_test_page_ref[i]) {
5537 panic("encryption test failed");
5538 }
5539 }
5540
5541 /* encrypt again */
5542 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5543 swap_crypt_null_iv,
5544 PAGE_SIZE / AES_BLOCK_SIZE,
5545 swap_crypt_test_page_decrypt,
5546 &swap_crypt_ctx.encrypt);
5547 /* decrypt in place */
5548 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5549 swap_crypt_null_iv,
5550 PAGE_SIZE / AES_BLOCK_SIZE,
5551 swap_crypt_test_page_decrypt,
5552 &swap_crypt_ctx.decrypt);
5553 for (i = 0; i < 4096; i ++) {
5554 if (swap_crypt_test_page_decrypt[i] !=
5555 swap_crypt_test_page_ref[i]) {
5556 panic("in place encryption test failed");
5557 }
5558 }
5559
5560 swap_crypt_ctx_tested = TRUE;
5561 }
5562 #endif /* DEBUG */
5563 }
5564
5565 /*
5566 * ENCRYPTED SWAP:
5567 * vm_page_encrypt:
5568 * Encrypt the given page, for secure paging.
5569 * The page might already be mapped at kernel virtual
5570 * address "kernel_mapping_offset". Otherwise, we need
5571 * to map it.
5572 *
5573 * Context:
5574 * The page's object is locked, but this lock will be released
5575 * and re-acquired.
5576 * The page is busy and not accessible by users (not entered in any pmap).
5577 */
5578 void
5579 vm_page_encrypt(
5580 vm_page_t page,
5581 vm_map_offset_t kernel_mapping_offset)
5582 {
5583 int clear_refmod = 0;
5584 kern_return_t kr;
5585 boolean_t page_was_referenced;
5586 boolean_t page_was_modified;
5587 vm_map_size_t kernel_mapping_size;
5588 vm_offset_t kernel_vaddr;
5589 union {
5590 unsigned char aes_iv[AES_BLOCK_SIZE];
5591 struct {
5592 memory_object_t pager_object;
5593 vm_object_offset_t paging_offset;
5594 } vm;
5595 } encrypt_iv;
5596
5597 if (! vm_pages_encrypted) {
5598 vm_pages_encrypted = TRUE;
5599 }
5600
5601 assert(page->busy);
5602 assert(page->dirty || page->precious);
5603
5604 if (page->encrypted) {
5605 /*
5606 * Already encrypted: no need to do it again.
5607 */
5608 vm_page_encrypt_already_encrypted_counter++;
5609 return;
5610 }
5611 ASSERT_PAGE_DECRYPTED(page);
5612
5613 /*
5614 * Gather the "reference" and "modified" status of the page.
5615 * We'll restore these values after the encryption, so that
5616 * the encryption is transparent to the rest of the system
5617 * and doesn't impact the VM's LRU logic.
5618 */
5619 page_was_referenced =
5620 (page->reference || pmap_is_referenced(page->phys_page));
5621 page_was_modified =
5622 (page->dirty || pmap_is_modified(page->phys_page));
5623
5624 if (kernel_mapping_offset == 0) {
5625 /*
5626 * The page hasn't already been mapped in kernel space
5627 * by the caller. Map it now, so that we can access
5628 * its contents and encrypt them.
5629 */
5630 kernel_mapping_size = PAGE_SIZE;
5631 kr = vm_paging_map_object(&kernel_mapping_offset,
5632 page,
5633 page->object,
5634 page->offset,
5635 &kernel_mapping_size);
5636 if (kr != KERN_SUCCESS) {
5637 panic("vm_page_encrypt: "
5638 "could not map page in kernel: 0x%x\n",
5639 kr);
5640 }
5641 } else {
5642 kernel_mapping_size = 0;
5643 }
5644 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5645
5646 if (swap_crypt_ctx_initialized == FALSE) {
5647 swap_crypt_ctx_initialize();
5648 }
5649 assert(swap_crypt_ctx_initialized);
5650
5651 /*
5652 * Prepare an "initial vector" for the encryption.
5653 * We use the "pager" and the "paging_offset" for that
5654 * page to obfuscate the encrypted data a bit more and
5655 * prevent crackers from finding patterns that they could
5656 * use to break the key.
5657 */
5658 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5659 encrypt_iv.vm.pager_object = page->object->pager;
5660 encrypt_iv.vm.paging_offset =
5661 page->object->paging_offset + page->offset;
5662
5663 vm_object_unlock(page->object);
5664
5665 /* encrypt the "initial vector" */
5666 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5667 swap_crypt_null_iv,
5668 1,
5669 &encrypt_iv.aes_iv[0],
5670 &swap_crypt_ctx.encrypt);
5671
5672 /*
5673 * Encrypt the page.
5674 */
5675 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5676 &encrypt_iv.aes_iv[0],
5677 PAGE_SIZE / AES_BLOCK_SIZE,
5678 (unsigned char *) kernel_vaddr,
5679 &swap_crypt_ctx.encrypt);
5680
5681 vm_page_encrypt_counter++;
5682
5683 vm_object_lock(page->object);
5684
5685 /*
5686 * Unmap the page from the kernel's address space,
5687 * if we had to map it ourselves. Otherwise, let
5688 * the caller undo the mapping if needed.
5689 */
5690 if (kernel_mapping_size != 0) {
5691 vm_paging_unmap_object(page->object,
5692 kernel_mapping_offset,
5693 kernel_mapping_offset + kernel_mapping_size);
5694 }
5695
5696 /*
5697 * Restore the "reference" and "modified" bits.
5698 * This should clean up any impact the encryption had
5699 * on them.
5700 */
5701 if (! page_was_referenced) {
5702 clear_refmod |= VM_MEM_REFERENCED;
5703 page->reference = FALSE;
5704 }
5705 if (! page_was_modified) {
5706 clear_refmod |= VM_MEM_MODIFIED;
5707 page->dirty = FALSE;
5708 }
5709 if (clear_refmod)
5710 pmap_clear_refmod(page->phys_page, clear_refmod);
5711
5712 page->encrypted = TRUE;
5713 }
5714
5715 /*
5716 * ENCRYPTED SWAP:
5717 * vm_page_decrypt:
5718 * Decrypt the given page.
5719 * The page might already be mapped at kernel virtual
5720 * address "kernel_mapping_offset". Otherwise, we need
5721 * to map it.
5722 *
5723 * Context:
5724 * The page's VM object is locked but will be unlocked and relocked.
5725 * The page is busy and not accessible by users (not entered in any pmap).
5726 */
5727 void
5728 vm_page_decrypt(
5729 vm_page_t page,
5730 vm_map_offset_t kernel_mapping_offset)
5731 {
5732 int clear_refmod = 0;
5733 kern_return_t kr;
5734 vm_map_size_t kernel_mapping_size;
5735 vm_offset_t kernel_vaddr;
5736 boolean_t page_was_referenced;
5737 union {
5738 unsigned char aes_iv[AES_BLOCK_SIZE];
5739 struct {
5740 memory_object_t pager_object;
5741 vm_object_offset_t paging_offset;
5742 } vm;
5743 } decrypt_iv;
5744
5745 assert(page->busy);
5746 assert(page->encrypted);
5747
5748 /*
5749 * Gather the "reference" status of the page.
5750 * We'll restore its value after the decryption, so that
5751 * the decryption is transparent to the rest of the system
5752 * and doesn't impact the VM's LRU logic.
5753 */
5754 page_was_referenced =
5755 (page->reference || pmap_is_referenced(page->phys_page));
5756
5757 if (kernel_mapping_offset == 0) {
5758 /*
5759 * The page hasn't already been mapped in kernel space
5760 * by the caller. Map it now, so that we can access
5761 * its contents and decrypt them.
5762 */
5763 kernel_mapping_size = PAGE_SIZE;
5764 kr = vm_paging_map_object(&kernel_mapping_offset,
5765 page,
5766 page->object,
5767 page->offset,
5768 &kernel_mapping_size);
5769 if (kr != KERN_SUCCESS) {
5770 panic("vm_page_decrypt: "
5771 "could not map page in kernel: 0x%x\n");
5772 }
5773 } else {
5774 kernel_mapping_size = 0;
5775 }
5776 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5777
5778 assert(swap_crypt_ctx_initialized);
5779
5780 /*
5781 * Prepare an "initial vector" for the decryption.
5782 * It has to be the same as the "initial vector" we
5783 * used to encrypt that page.
5784 */
5785 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5786 decrypt_iv.vm.pager_object = page->object->pager;
5787 decrypt_iv.vm.paging_offset =
5788 page->object->paging_offset + page->offset;
5789
5790 vm_object_unlock(page->object);
5791
5792 /* encrypt the "initial vector" */
5793 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5794 swap_crypt_null_iv,
5795 1,
5796 &decrypt_iv.aes_iv[0],
5797 &swap_crypt_ctx.encrypt);
5798
5799 /*
5800 * Decrypt the page.
5801 */
5802 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5803 &decrypt_iv.aes_iv[0],
5804 PAGE_SIZE / AES_BLOCK_SIZE,
5805 (unsigned char *) kernel_vaddr,
5806 &swap_crypt_ctx.decrypt);
5807 vm_page_decrypt_counter++;
5808
5809 vm_object_lock(page->object);
5810
5811 /*
5812 * Unmap the page from the kernel's address space,
5813 * if we had to map it ourselves. Otherwise, let
5814 * the caller undo the mapping if needed.
5815 */
5816 if (kernel_mapping_size != 0) {
5817 vm_paging_unmap_object(page->object,
5818 kernel_vaddr,
5819 kernel_vaddr + PAGE_SIZE);
5820 }
5821
5822 /*
5823 * After decryption, the page is actually clean.
5824 * It was encrypted as part of paging, which "cleans"
5825 * the "dirty" pages.
5826 * No one could access it after it was encrypted
5827 * and the decryption doesn't count.
5828 */
5829 page->dirty = FALSE;
5830 clear_refmod = VM_MEM_MODIFIED;
5831
5832 /* restore the "reference" bit */
5833 if (! page_was_referenced) {
5834 page->reference = FALSE;
5835 clear_refmod |= VM_MEM_REFERENCED;
5836 }
5837 pmap_clear_refmod(page->phys_page, clear_refmod);
5838
5839 page->encrypted = FALSE;
5840
5841 /*
5842 * We've just modified the page's contents via the data cache and part
5843 * of the new contents might still be in the cache and not yet in RAM.
5844 * Since the page is now available and might get gathered in a UPL to
5845 * be part of a DMA transfer from a driver that expects the memory to
5846 * be coherent at this point, we have to flush the data cache.
5847 */
5848 pmap_sync_page_data_phys(page->phys_page);
5849 /*
5850 * Since the page is not mapped yet, some code might assume that it
5851 * doesn't need to invalidate the instruction cache when writing to
5852 * that page. That code relies on "no_isync" being set, so that the
5853 * caches get synchronized when the page is first mapped. So we need
5854 * to set "no_isync" here too, despite the fact that we just
5855 * synchronized the caches above...
5856 */
5857 page->no_isync = TRUE;
5858 }
5859
5860 unsigned long upl_encrypt_upls = 0;
5861 unsigned long upl_encrypt_pages = 0;
5862
5863 /*
5864 * ENCRYPTED SWAP:
5865 *
5866 * upl_encrypt:
5867 * Encrypts all the pages in the UPL, within the specified range.
5868 *
5869 */
5870 void
5871 upl_encrypt(
5872 upl_t upl,
5873 upl_offset_t crypt_offset,
5874 upl_size_t crypt_size)
5875 {
5876 upl_size_t upl_size;
5877 upl_offset_t upl_offset;
5878 vm_object_t upl_object;
5879 vm_page_t page;
5880 vm_object_t shadow_object;
5881 vm_object_offset_t shadow_offset;
5882 vm_object_offset_t paging_offset;
5883 vm_object_offset_t base_offset;
5884
5885 upl_encrypt_upls++;
5886 upl_encrypt_pages += crypt_size / PAGE_SIZE;
5887
5888 upl_lock(upl);
5889
5890 upl_object = upl->map_object;
5891 upl_offset = upl->offset;
5892 upl_size = upl->size;
5893
5894 upl_unlock(upl);
5895
5896 vm_object_lock(upl_object);
5897
5898 /*
5899 * Find the VM object that contains the actual pages.
5900 */
5901 if (upl_object->pageout) {
5902 shadow_object = upl_object->shadow;
5903 /*
5904 * The offset in the shadow object is actually also
5905 * accounted for in upl->offset. It possibly shouldn't be
5906 * this way, but for now don't account for it twice.
5907 */
5908 shadow_offset = 0;
5909 assert(upl_object->paging_offset == 0); /* XXX ? */
5910 vm_object_lock(shadow_object);
5911 } else {
5912 shadow_object = upl_object;
5913 shadow_offset = 0;
5914 }
5915
5916 paging_offset = shadow_object->paging_offset;
5917 vm_object_paging_begin(shadow_object);
5918
5919 if (shadow_object != upl_object) {
5920 vm_object_unlock(shadow_object);
5921 }
5922 vm_object_unlock(upl_object);
5923
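/*
 * Translate the UPL-relative crypt_offset into an offset within the
 * shadow object: add the UPL's own offset, then back out the pager's
 * paging_offset.
 */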
5924 base_offset = shadow_offset;
5925 base_offset += upl_offset;
5926 base_offset += crypt_offset;
5927 base_offset -= paging_offset;
5928 /*
5929 * Unmap the pages, so that nobody can continue accessing them while
5930 * they're encrypted. After that point, all accesses to these pages
5931 * will cause a page fault and block while the page is being encrypted
5932 * (busy). After the encryption completes, any access will cause a
5933 * page fault and the page gets decrypted at that time.
5934 */
5935 assert(crypt_offset + crypt_size <= upl_size);
5936 vm_object_pmap_protect(shadow_object,
5937 base_offset,
5938 (vm_object_size_t)crypt_size,
5939 PMAP_NULL,
5940 0,
5941 VM_PROT_NONE);
5942
5943 /* XXX FBDP could the object have changed significantly here ? */
5944 vm_object_lock(shadow_object);
5945
5946 for (upl_offset = 0;
5947 upl_offset < crypt_size;
5948 upl_offset += PAGE_SIZE) {
5949 page = vm_page_lookup(shadow_object,
5950 base_offset + upl_offset);
5951 if (page == VM_PAGE_NULL) {
5952 panic("upl_encrypt: "
5953 "no page for (obj=%p,off=%lld+%d)!\n",
5954 shadow_object,
5955 base_offset,
5956 upl_offset);
5957 }
5958 vm_page_encrypt(page, 0);
5959 }
5960
5961 vm_object_paging_end(shadow_object);
5962 vm_object_unlock(shadow_object);
5963 }
5964
5965 vm_size_t
5966 upl_get_internal_pagelist_offset(void)
5967 {
5968 return sizeof(struct upl);
5969 }
5970
5971 void
5972 upl_set_dirty(
5973 upl_t upl)
5974 {
5975 upl->flags |= UPL_CLEAR_DIRTY;
5976 }
5977
5978 void
5979 upl_clear_dirty(
5980 upl_t upl)
5981 {
5982 upl->flags &= ~UPL_CLEAR_DIRTY;
5983 }
5984
5985
5986 #ifdef MACH_BSD
5987
5988 boolean_t upl_page_present(upl_page_info_t *upl, int index)
5989 {
5990 return(UPL_PAGE_PRESENT(upl, index));
5991 }
5992 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
5993 {
5994 return(UPL_DIRTY_PAGE(upl, index));
5995 }
5996 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
5997 {
5998 return(UPL_VALID_PAGE(upl, index));
5999 }
6000 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
6001 {
6002 return(UPL_PHYS_PAGE(upl, index));
6003 }
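
/*
 * Editorial sketch, not part of the original source: a BSD-side
 * consumer handed the upl_page_info_t list for a UPL covering "count"
 * pages might walk it through the accessors above rather than touching
 * the page-info fields directly.  The helper name is an assumption
 * made only for illustration.
 *
 *	int
 *	example_count_dirty_pages(upl_page_info_t *pl, int count)
 *	{
 *		int i, ndirty = 0;
 *
 *		for (i = 0; i < count; i++) {
 *			if (upl_valid_page(pl, i) && upl_dirty_page(pl, i))
 *				ndirty++;
 *		}
 *		return ndirty;
 *	}
 */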
6004
6005 void
6006 vm_countdirtypages(void)
6007 {
6008 vm_page_t m;
6009 int dpages;
6010 int pgopages;
6011 int precpages;
6012
6013
6014 dpages=0;
6015 pgopages=0;
6016 precpages=0;
6017
6018 vm_page_lock_queues();
6019 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6020 do {
6021 if (m == (vm_page_t) 0) break;
6022
6023 if (m->dirty) dpages++;
6024 if (m->pageout) pgopages++;
6025 if (m->precious) precpages++;
6026
6027 assert(m->object != kernel_object);
6028 m = (vm_page_t) queue_next(&m->pageq);
6029 if (m == (vm_page_t) 0) break;
6030
6031 } while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m));
6032 vm_page_unlock_queues();
6033
6034 vm_page_lock_queues();
6035 m = (vm_page_t) queue_first(&vm_page_queue_zf);
6036 do {
6037 if (m == (vm_page_t) 0) break;
6038
6039 if (m->dirty) dpages++;
6040 if (m->pageout) pgopages++;
6041 if (m->precious) precpages++;
6042
6043 assert(m->object != kernel_object);
6044 m = (vm_page_t) queue_next(&m->pageq);
6045 if (m == (vm_page_t) 0) break;
6046
6047 } while (!queue_end(&vm_page_queue_zf, (queue_entry_t) m));
6048 vm_page_unlock_queues();
6049
6050 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6051
6052 dpages=0;
6053 pgopages=0;
6054 precpages=0;
6055
6056 vm_page_lock_queues();
6057 m = (vm_page_t) queue_first(&vm_page_queue_active);
6058
6059 do {
6060 if (m == (vm_page_t) 0) break;
6061 if (m->dirty) dpages++;
6062 if (m->pageout) pgopages++;
6063 if (m->precious) precpages++;
6064
6065 assert(m->object != kernel_object);
6066 m = (vm_page_t) queue_next(&m->pageq);
6067 if (m == (vm_page_t) 0) break;
6068
6069 } while (!queue_end(&vm_page_queue_active, (queue_entry_t) m));
6070 vm_page_unlock_queues();
6071
6072 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6073
6074 }
6075 #endif /* MACH_BSD */
6076
6077 #ifdef UPL_DEBUG
6078 kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6079 {
6080 upl->ubc_alias1 = alias1;
6081 upl->ubc_alias2 = alias2;
6082 return KERN_SUCCESS;
6083 }
6084 int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6085 {
6086 if (al)
6087 *al = upl->ubc_alias1;
6088 if (al2)
6089 *al2 = upl->ubc_alias2;
6090 return KERN_SUCCESS;
6091 }
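
/*
 * Editorial sketch, not part of the original source: under UPL_DEBUG a
 * hypothetical creator of a UPL could tag it with two alias words so
 * that the originating call site can be recognized when the UPL is
 * later inspected; the particular values below are illustrative only.
 *
 *	upl_ubc_alias_set(upl, (unsigned int) __LINE__, 0);
 */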
6092 #endif /* UPL_DEBUG */
6093
6094
6095
6096 #if MACH_KDB
6097 #include <ddb/db_output.h>
6098 #include <ddb/db_print.h>
6099 #include <vm/vm_print.h>
6100
6101 #define printf kdbprintf
6102 void db_pageout(void);
6103
6104 void
6105 db_vm(void)
6106 {
6107
6108 iprintf("VM Statistics:\n");
6109 db_indent += 2;
6110 iprintf("pages:\n");
6111 db_indent += 2;
6112 iprintf("activ %5d inact %5d free %5d",
6113 vm_page_active_count, vm_page_inactive_count,
6114 vm_page_free_count);
6115 printf(" wire %5d gobbl %5d\n",
6116 vm_page_wire_count, vm_page_gobble_count);
6117 db_indent -= 2;
6118 iprintf("target:\n");
6119 db_indent += 2;
6120 iprintf("min %5d inact %5d free %5d",
6121 vm_page_free_min, vm_page_inactive_target,
6122 vm_page_free_target);
6123 printf(" resrv %5d\n", vm_page_free_reserved);
6124 db_indent -= 2;
6125 iprintf("pause:\n");
6126 db_pageout();
6127 db_indent -= 2;
6128 }
6129
6130 #if MACH_COUNTERS
6131 extern int c_laundry_pages_freed;
6132 #endif /* MACH_COUNTERS */
6133
6134 void
6135 db_pageout(void)
6136 {
6137 iprintf("Pageout Statistics:\n");
6138 db_indent += 2;
6139 iprintf("active %5d inactv %5d\n",
6140 vm_pageout_active, vm_pageout_inactive);
6141 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
6142 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6143 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6144 iprintf("used %5d clean %5d dirty %5d\n",
6145 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6146 vm_pageout_inactive_dirty);
6147 #if MACH_COUNTERS
6148 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6149 #endif /* MACH_COUNTERS */
6150 #if MACH_CLUSTER_STATS
6151 iprintf("Cluster Statistics:\n");
6152 db_indent += 2;
6153 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
6154 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6155 vm_pageout_cluster_collisions);
6156 iprintf("clusters %5d conversions %5d\n",
6157 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6158 db_indent -= 2;
6159 iprintf("Target Statistics:\n");
6160 db_indent += 2;
6161 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
6162 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6163 vm_pageout_target_page_freed);
6164 db_indent -= 2;
6165 #endif /* MACH_CLUSTER_STATS */
6166 db_indent -= 2;
6167 }
6168
6169 #endif /* MACH_KDB */