1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /*
24 * @OSF_COPYRIGHT@
25 */
26 /*
27 * Mach Operating System
28 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
29 * All Rights Reserved.
30 *
31 * Permission to use, copy, modify and distribute this software and its
32 * documentation is hereby granted, provided that both the copyright
33 * notice and this permission notice appear in all copies of the
34 * software, derivative works or modified versions, and any portions
35 * thereof, and that both notices appear in supporting documentation.
36 *
37 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
38 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
39 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
40 *
41 * Carnegie Mellon requests users of this software to return to
42 *
43 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
44 * School of Computer Science
45 * Carnegie Mellon University
46 * Pittsburgh PA 15213-3890
47 *
48 * any improvements or extensions that they make and grant Carnegie Mellon
49 * the rights to redistribute these changes.
50 */
51 /*
52 */
53 /*
54 * File: vm/vm_pageout.c
55 * Author: Avadis Tevanian, Jr., Michael Wayne Young
56 * Date: 1985
57 *
58 * The proverbial page-out daemon.
59 */
60
61 #include <stdint.h>
62
63 #include <debug.h>
64 #include <mach_pagemap.h>
65 #include <mach_cluster_stats.h>
66 #include <mach_kdb.h>
67 #include <advisory_pageout.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/memory_object.h>
71 #include <mach/memory_object_default.h>
72 #include <mach/memory_object_control_server.h>
73 #include <mach/mach_host_server.h>
74 #include <mach/upl.h>
75 #include <mach/vm_map.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_statistics.h>
78
79 #include <kern/kern_types.h>
80 #include <kern/counters.h>
81 #include <kern/host_statistics.h>
82 #include <kern/machine.h>
83 #include <kern/misc_protos.h>
84 #include <kern/thread.h>
85 #include <kern/xpr.h>
86 #include <kern/kalloc.h>
87
88 #include <machine/vm_tuning.h>
89
90 #include <vm/pmap.h>
91 #include <vm/vm_fault.h>
92 #include <vm/vm_map.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_page.h>
95 #include <vm/vm_pageout.h>
96 #include <vm/vm_protos.h> /* must be last */
97
98 /*
99 * ENCRYPTED SWAP:
100 */
101 #ifdef __ppc__
102 #include <ppc/mappings.h>
103 #endif /* __ppc__ */
104 #include <../bsd/crypto/aes/aes.h>
105
106 extern ipc_port_t memory_manager_default;
107
108
109 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE
110 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 10000 /* maximum iterations of the active queue to move pages to inactive */
111 #endif
112
113 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
114 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
115 #endif
116
117 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
118 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
119 #endif
120
121 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
122 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
123 #endif
124
125 #ifndef VM_PAGE_LAUNDRY_MAX
126 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
127 #endif /* VM_PAGE_LAUNDRY_MAX */
128
129 #ifndef VM_PAGEOUT_BURST_WAIT
130 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
131 #endif /* VM_PAGEOUT_BURST_WAIT */
132
133 #ifndef VM_PAGEOUT_EMPTY_WAIT
134 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
135 #endif /* VM_PAGEOUT_EMPTY_WAIT */
136
137 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
138 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
139 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
140
141 #ifndef VM_PAGEOUT_IDLE_WAIT
142 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
143 #endif /* VM_PAGEOUT_IDLE_WAIT */
144
145
146 /*
147 * To obtain a reasonable LRU approximation, the inactive queue
148 * needs to be large enough to give pages on it a chance to be
149 * referenced a second time. This macro defines the fraction
150 * of active+inactive pages that should be inactive.
151 * The pageout daemon uses it to update vm_page_inactive_target.
152 *
153 * If vm_page_free_count falls below vm_page_free_target and
154 * vm_page_inactive_count is below vm_page_inactive_target,
155 * then the pageout daemon starts running.
156 */
157
158 #ifndef VM_PAGE_INACTIVE_TARGET
159 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
160 #endif /* VM_PAGE_INACTIVE_TARGET */
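/*
 * For illustration: with 90,000 active+inactive pages,
 * VM_PAGE_INACTIVE_TARGET yields 90,000 / 3 = 30,000, i.e. the
 * daemon tries to keep roughly a third of those pages inactive.
 */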
161
162 /*
163 * Once the pageout daemon starts running, it keeps going
164 * until vm_page_free_count meets or exceeds vm_page_free_target.
165 */
166
167 #ifndef VM_PAGE_FREE_TARGET
168 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
169 #endif /* VM_PAGE_FREE_TARGET */
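/*
 * For illustration: with 8,000 unreserved pages,
 * VM_PAGE_FREE_TARGET yields 15 + 8,000 / 80 = 115 pages.
 */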
170
171 /*
172 * The pageout daemon always starts running once vm_page_free_count
173 * falls below vm_page_free_min.
174 */
175
176 #ifndef VM_PAGE_FREE_MIN
177 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
178 #endif /* VM_PAGE_FREE_MIN */
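/*
 * For illustration: with the same 8,000 unreserved pages,
 * VM_PAGE_FREE_MIN yields 10 + 8,000 / 100 = 90 pages.
 */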
179
180 /*
181 * When vm_page_free_count falls below vm_page_free_reserved,
182 * only vm-privileged threads can allocate pages. vm-privilege
183 * allows the pageout daemon and default pager (and any other
184 * associated threads needed for default pageout) to continue
185 * operation by dipping into the reserved pool of pages.
186 */
187
188 #ifndef VM_PAGE_FREE_RESERVED
189 #define VM_PAGE_FREE_RESERVED(n) \
190 ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
191 #endif /* VM_PAGE_FREE_RESERVED */
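/*
 * With the default VM_PAGE_LAUNDRY_MAX of 16, this works out to
 * 96 + n reserved pages for the given n.
 */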
192
193
194 /*
195 * must hold the page queues lock to
196 * manipulate this structure
197 */
198 struct vm_pageout_queue {
199 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
200 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
201 unsigned int pgo_maxlaundry;
202
203 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
204 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
205 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
206 :0;
207 };
208
209 #define VM_PAGE_Q_THROTTLED(q) \
210 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
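/*
 * pgo_laundry counts pages sitting on pgo_pending plus pages still in
 * flight at the pager: vm_pageout_cluster() increments it when a page
 * is queued and vm_pageout_throttle_up() decrements it when the page
 * comes back from laundry.  Once it reaches pgo_maxlaundry the queue
 * is considered throttled and vm_pageout_scan stops feeding it dirty
 * pages until the count drains.
 */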
211
212
213 /*
 214  * Exported variable used to broadcast the activation of the pageout scan.
 215  * The Working Set code uses this to throttle its use of pmap removes. In this
216 * way, code which runs within memory in an uncontested context does
217 * not keep encountering soft faults.
218 */
219
220 unsigned int vm_pageout_scan_event_counter = 0;
221
222 /*
223 * Forward declarations for internal routines.
224 */
225
226 static void vm_pageout_garbage_collect(int);
227 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
228 static void vm_pageout_iothread_external(void);
229 static void vm_pageout_iothread_internal(void);
230 static void vm_pageout_queue_steal(vm_page_t);
231
232 extern void vm_pageout_continue(void);
233 extern void vm_pageout_scan(void);
234
235 unsigned int vm_pageout_reserved_internal = 0;
236 unsigned int vm_pageout_reserved_really = 0;
237
238 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
239 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
240 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
241 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
242 unsigned int vm_pageout_deadlock_relief = 0;
243 unsigned int vm_pageout_inactive_relief = 0;
244 unsigned int vm_pageout_burst_active_throttle = 0;
245 unsigned int vm_pageout_burst_inactive_throttle = 0;
246
247 /*
248 * Protection against zero fill flushing live working sets derived
249 * from existing backing store and files
250 */
251 unsigned int vm_accellerate_zf_pageout_trigger = 400;
252 unsigned int vm_zf_iterator;
253 unsigned int vm_zf_iterator_count = 40;
254 unsigned int last_page_zf;
255 unsigned int vm_zf_count = 0;
256
257 /*
258 * These variables record the pageout daemon's actions:
259 * how many pages it looks at and what happens to those pages.
260 * No locking needed because only one thread modifies the variables.
261 */
262
263 unsigned int vm_pageout_active = 0; /* debugging */
264 unsigned int vm_pageout_inactive = 0; /* debugging */
265 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
266 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
267 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
268 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
269 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
270 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
271 unsigned int vm_pageout_inactive_used = 0; /* debugging */
272 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
273 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
274 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
275 unsigned int vm_pageout_purged_objects = 0; /* debugging */
276 unsigned int vm_stat_discard = 0; /* debugging */
277 unsigned int vm_stat_discard_sent = 0; /* debugging */
278 unsigned int vm_stat_discard_failure = 0; /* debugging */
279 unsigned int vm_stat_discard_throttle = 0; /* debugging */
280
281 unsigned int vm_pageout_scan_active_throttled = 0;
282 unsigned int vm_pageout_scan_inactive_throttled = 0;
283 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
284 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
285 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
286 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
287 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
288 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
289 /*
290 * Backing store throttle when BS is exhausted
291 */
292 unsigned int vm_backing_store_low = 0;
293
294 unsigned int vm_pageout_out_of_line = 0;
295 unsigned int vm_pageout_in_place = 0;
296
297 /*
298 * ENCRYPTED SWAP:
299 * counters and statistics...
300 */
301 unsigned long vm_page_decrypt_counter = 0;
302 unsigned long vm_page_decrypt_for_upl_counter = 0;
303 unsigned long vm_page_encrypt_counter = 0;
304 unsigned long vm_page_encrypt_abort_counter = 0;
305 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
306 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
307
308
309 struct vm_pageout_queue vm_pageout_queue_internal;
310 struct vm_pageout_queue vm_pageout_queue_external;
311
312
313 /*
314 * Routine: vm_backing_store_disable
315 * Purpose:
316 * Suspend non-privileged threads wishing to extend
317 * backing store when we are low on backing store
318 * (Synchronized by caller)
319 */
320 void
321 vm_backing_store_disable(
322 boolean_t disable)
323 {
324 if(disable) {
325 vm_backing_store_low = 1;
326 } else {
327 if(vm_backing_store_low) {
328 vm_backing_store_low = 0;
329 thread_wakeup((event_t) &vm_backing_store_low);
330 }
331 }
332 }
333
334
335 /*
336 * Routine: vm_pageout_object_allocate
337 * Purpose:
338 * Allocate an object for use as out-of-line memory in a
339 * data_return/data_initialize message.
340 * The page must be in an unlocked object.
341 *
342 * If the page belongs to a trusted pager, cleaning in place
343 * will be used, which utilizes a special "pageout object"
344 * containing private alias pages for the real page frames.
345 * Untrusted pagers use normal out-of-line memory.
346 */
347 vm_object_t
348 vm_pageout_object_allocate(
349 vm_page_t m,
350 vm_size_t size,
351 vm_object_offset_t offset)
352 {
353 vm_object_t object = m->object;
354 vm_object_t new_object;
355
356 assert(object->pager_ready);
357
358 new_object = vm_object_allocate(size);
359
360 if (object->pager_trusted) {
361 assert (offset < object->size);
362
363 vm_object_lock(new_object);
364 new_object->pageout = TRUE;
365 new_object->shadow = object;
366 new_object->can_persist = FALSE;
367 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
368 new_object->shadow_offset = offset;
369 vm_object_unlock(new_object);
370
371 /*
372 * Take a paging reference on the object. This will be dropped
373 * in vm_pageout_object_terminate()
374 */
375 vm_object_lock(object);
376 vm_object_paging_begin(object);
377 vm_page_lock_queues();
378 vm_page_unlock_queues();
379 vm_object_unlock(object);
380
381 vm_pageout_in_place++;
382 } else
383 vm_pageout_out_of_line++;
384 return(new_object);
385 }
386
387 #if MACH_CLUSTER_STATS
388 unsigned long vm_pageout_cluster_dirtied = 0;
389 unsigned long vm_pageout_cluster_cleaned = 0;
390 unsigned long vm_pageout_cluster_collisions = 0;
391 unsigned long vm_pageout_cluster_clusters = 0;
392 unsigned long vm_pageout_cluster_conversions = 0;
393 unsigned long vm_pageout_target_collisions = 0;
394 unsigned long vm_pageout_target_page_dirtied = 0;
395 unsigned long vm_pageout_target_page_freed = 0;
396 #define CLUSTER_STAT(clause) clause
397 #else /* MACH_CLUSTER_STATS */
398 #define CLUSTER_STAT(clause)
399 #endif /* MACH_CLUSTER_STATS */
400
401 /*
402 * Routine: vm_pageout_object_terminate
403 * Purpose:
404 * Destroy the pageout_object allocated by
405 * vm_pageout_object_allocate(), and perform all of the
406 * required cleanup actions.
407 *
408 * In/Out conditions:
409 * The object must be locked, and will be returned locked.
410 */
411 void
412 vm_pageout_object_terminate(
413 vm_object_t object)
414 {
415 vm_object_t shadow_object;
416 boolean_t shadow_internal;
417
418 /*
419 * Deal with the deallocation (last reference) of a pageout object
420 * (used for cleaning-in-place) by dropping the paging references/
421 * freeing pages in the original object.
422 */
423
424 assert(object->pageout);
425 shadow_object = object->shadow;
426 vm_object_lock(shadow_object);
427 shadow_internal = shadow_object->internal;
428
429 while (!queue_empty(&object->memq)) {
430 vm_page_t p, m;
431 vm_object_offset_t offset;
432
433 p = (vm_page_t) queue_first(&object->memq);
434
435 assert(p->private);
436 assert(p->pageout);
437 p->pageout = FALSE;
438 assert(!p->cleaning);
439
440 offset = p->offset;
441 VM_PAGE_FREE(p);
442 p = VM_PAGE_NULL;
443
444 m = vm_page_lookup(shadow_object,
445 offset + object->shadow_offset);
446
447 if(m == VM_PAGE_NULL)
448 continue;
449 assert(m->cleaning);
 450                  /* used as a trigger on upl_commit etc. to recognize the */
 451                  /* pageout daemon's subsequent desire to pageout a cleaning */
 452                  /* page. When the bit is on, the upl commit code will */
453 /* respect the pageout bit in the target page over the */
454 /* caller's page list indication */
455 m->dump_cleaning = FALSE;
456
457 /*
458 * Account for the paging reference taken when
459 * m->cleaning was set on this page.
460 */
461 vm_object_paging_end(shadow_object);
462 assert((m->dirty) || (m->precious) ||
463 (m->busy && m->cleaning));
464
465 /*
466 * Handle the trusted pager throttle.
467 * Also decrement the burst throttle (if external).
468 */
469 vm_page_lock_queues();
470 if (m->laundry) {
471 vm_pageout_throttle_up(m);
472 }
473
474 /*
475 * Handle the "target" page(s). These pages are to be freed if
476 * successfully cleaned. Target pages are always busy, and are
477 * wired exactly once. The initial target pages are not mapped,
478 * (so cannot be referenced or modified) but converted target
479 * pages may have been modified between the selection as an
480 * adjacent page and conversion to a target.
481 */
482 if (m->pageout) {
483 assert(m->busy);
484 assert(m->wire_count == 1);
485 m->cleaning = FALSE;
486 m->pageout = FALSE;
487 #if MACH_CLUSTER_STATS
488 if (m->wanted) vm_pageout_target_collisions++;
489 #endif
490 /*
491 * Revoke all access to the page. Since the object is
492 * locked, and the page is busy, this prevents the page
493 * from being dirtied after the pmap_disconnect() call
494 * returns.
495 *
 496                           * Since the page is left "dirty" but "not modified", we
497 * can detect whether the page was redirtied during
498 * pageout by checking the modify state.
499 */
500 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
501 m->dirty = TRUE;
502 else
503 m->dirty = FALSE;
504
505 if (m->dirty) {
506 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
507 vm_page_unwire(m);/* reactivates */
508 VM_STAT(reactivations++);
509 PAGE_WAKEUP_DONE(m);
510 } else {
511 CLUSTER_STAT(vm_pageout_target_page_freed++;)
512 vm_page_free(m);/* clears busy, etc. */
513 }
514 vm_page_unlock_queues();
515 continue;
516 }
517 /*
518 * Handle the "adjacent" pages. These pages were cleaned in
519 * place, and should be left alone.
 520                   * If the page has been referenced, then someone is using the
 521                   * page, so make it active; otherwise deactivate it.
522 */
523 if (!m->active && !m->inactive && !m->private) {
524 if (m->reference)
525 vm_page_activate(m);
526 else
527 vm_page_deactivate(m);
528 }
529 if((m->busy) && (m->cleaning)) {
530
531 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
532 m->busy = FALSE;
533
534 /* We do not re-set m->dirty ! */
535 /* The page was busy so no extraneous activity */
536 /* could have occurred. COPY_INTO is a read into the */
537 /* new pages. CLEAN_IN_PLACE does actually write */
538 /* out the pages but handling outside of this code */
 539                          /* will take care of resetting dirty. We do, however, clear */
 540                          /* the modify bit for the Programmed I/O case. */
541 pmap_clear_modify(m->phys_page);
542 if(m->absent) {
543 m->absent = FALSE;
544 if(shadow_object->absent_count == 1)
545 vm_object_absent_release(shadow_object);
546 else
547 shadow_object->absent_count--;
548 }
549 m->overwriting = FALSE;
550 } else if (m->overwriting) {
551 /* alternate request page list, write to page_list */
552 /* case. Occurs when the original page was wired */
553 /* at the time of the list request */
554 assert(m->wire_count != 0);
555 vm_page_unwire(m);/* reactivates */
556 m->overwriting = FALSE;
557 } else {
558 /*
559 * Set the dirty state according to whether or not the page was
560 * modified during the pageout. Note that we purposefully do
561 * NOT call pmap_clear_modify since the page is still mapped.
 562                           * If the page were to be dirtied between the 2 calls,
563 * this fact would be lost. This code is only necessary to
564 * maintain statistics, since the pmap module is always
565 * consulted if m->dirty is false.
566 */
567 #if MACH_CLUSTER_STATS
568 m->dirty = pmap_is_modified(m->phys_page);
569
570 if (m->dirty) vm_pageout_cluster_dirtied++;
571 else vm_pageout_cluster_cleaned++;
572 if (m->wanted) vm_pageout_cluster_collisions++;
573 #else
574 m->dirty = 0;
575 #endif
576 }
577 m->cleaning = FALSE;
578
579 /*
580 * Wakeup any thread waiting for the page to be un-cleaning.
581 */
582 PAGE_WAKEUP(m);
583 vm_page_unlock_queues();
584 }
585 /*
 586          * Account for the paging reference taken in vm_pageout_object_allocate().
587 */
588 vm_object_paging_end(shadow_object);
589 vm_object_unlock(shadow_object);
590
591 assert(object->ref_count == 0);
592 assert(object->paging_in_progress == 0);
593 assert(object->resident_page_count == 0);
594 return;
595 }
596
597 /*
598 * Routine: vm_pageout_setup
599 * Purpose:
600 * Set up a page for pageout (clean & flush).
601 *
602 * Move the page to a new object, as part of which it will be
603 * sent to its memory manager in a memory_object_data_write or
604 * memory_object_initialize message.
605 *
606 * The "new_object" and "new_offset" arguments
607 * indicate where the page should be moved.
608 *
609 * In/Out conditions:
610 * The page in question must not be on any pageout queues,
611 * and must be busy. The object to which it belongs
612 * must be unlocked, and the caller must hold a paging
613 * reference to it. The new_object must not be locked.
614 *
615 * This routine returns a pointer to a place-holder page,
616 * inserted at the same offset, to block out-of-order
617 * requests for the page. The place-holder page must
618 * be freed after the data_write or initialize message
619 * has been sent.
620 *
621 * The original page is put on a paging queue and marked
622 * not busy on exit.
623 */
624 vm_page_t
625 vm_pageout_setup(
626 register vm_page_t m,
627 register vm_object_t new_object,
628 vm_object_offset_t new_offset)
629 {
630 register vm_object_t old_object = m->object;
631 vm_object_offset_t paging_offset;
632 vm_object_offset_t offset;
633 register vm_page_t holding_page;
634 register vm_page_t new_m;
635 boolean_t need_to_wire = FALSE;
636
637
638 XPR(XPR_VM_PAGEOUT,
639 "vm_pageout_setup, obj 0x%X off 0x%X page 0x%X new obj 0x%X offset 0x%X\n",
640 (integer_t)m->object, (integer_t)m->offset,
641 (integer_t)m, (integer_t)new_object,
642 (integer_t)new_offset);
643 assert(m && m->busy && !m->absent && !m->fictitious && !m->error &&
644 !m->restart);
645
646 assert(m->dirty || m->precious);
647
648 /*
649 * Create a place-holder page where the old one was, to prevent
650 * attempted pageins of this page while we're unlocked.
651 */
652 VM_PAGE_GRAB_FICTITIOUS(holding_page);
653
654 vm_object_lock(old_object);
655
656 offset = m->offset;
657 paging_offset = offset + old_object->paging_offset;
658
659 if (old_object->pager_trusted) {
660 /*
661 * This pager is trusted, so we can clean this page
662 * in place. Leave it in the old object, and mark it
663 * cleaning & pageout.
664 */
665 new_m = holding_page;
666 holding_page = VM_PAGE_NULL;
667
668 /*
669 * Set up new page to be private shadow of real page.
670 */
671 new_m->phys_page = m->phys_page;
672 new_m->fictitious = FALSE;
673 new_m->pageout = TRUE;
674
675 /*
676 * Mark real page as cleaning (indicating that we hold a
677 * paging reference to be released via m_o_d_r_c) and
678 * pageout (indicating that the page should be freed
679 * when the pageout completes).
680 */
681 pmap_clear_modify(m->phys_page);
682 vm_page_lock_queues();
683 new_m->private = TRUE;
684 vm_page_wire(new_m);
685 m->cleaning = TRUE;
686 m->pageout = TRUE;
687
688 vm_page_wire(m);
689 assert(m->wire_count == 1);
690 vm_page_unlock_queues();
691
692 m->dirty = TRUE;
693 m->precious = FALSE;
694 m->page_lock = VM_PROT_NONE;
695 m->unusual = FALSE;
696 m->unlock_request = VM_PROT_NONE;
697 } else {
698 /*
699 * Cannot clean in place, so rip the old page out of the
700 * object, and stick the holding page in. Set new_m to the
701 * page in the new object.
702 */
703 vm_page_lock_queues();
704 VM_PAGE_QUEUES_REMOVE(m);
705 vm_page_remove(m);
706
707 vm_page_insert(holding_page, old_object, offset);
708 vm_page_unlock_queues();
709
710 m->dirty = TRUE;
711 m->precious = FALSE;
712 new_m = m;
713 new_m->page_lock = VM_PROT_NONE;
714 new_m->unlock_request = VM_PROT_NONE;
715
716 if (old_object->internal)
717 need_to_wire = TRUE;
718 }
719 /*
720 * Record that this page has been written out
721 */
722 #if MACH_PAGEMAP
723 vm_external_state_set(old_object->existence_map, offset);
724 #endif /* MACH_PAGEMAP */
725
726 vm_object_unlock(old_object);
727
728 vm_object_lock(new_object);
729
730 /*
 731          * Put the page into the new object. If it is not wired
732 * (if it's the real page) it will be activated.
733 */
734
735 vm_page_lock_queues();
736 vm_page_insert(new_m, new_object, new_offset);
737 if (need_to_wire)
738 vm_page_wire(new_m);
739 else
740 vm_page_activate(new_m);
741 PAGE_WAKEUP_DONE(new_m);
742 vm_page_unlock_queues();
743
744 vm_object_unlock(new_object);
745
746 /*
747 * Return the placeholder page to simplify cleanup.
748 */
749 return (holding_page);
750 }
751
752 /*
753 * Routine: vm_pageclean_setup
754 *
755 * Purpose: setup a page to be cleaned (made non-dirty), but not
756 * necessarily flushed from the VM page cache.
757 * This is accomplished by cleaning in place.
758 *
759 * The page must not be busy, and the object and page
760 * queues must be locked.
761 *
762 */
763 void
764 vm_pageclean_setup(
765 vm_page_t m,
766 vm_page_t new_m,
767 vm_object_t new_object,
768 vm_object_offset_t new_offset)
769 {
770 vm_object_t old_object = m->object;
771 assert(!m->busy);
772 assert(!m->cleaning);
773
774 XPR(XPR_VM_PAGEOUT,
775 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
776 (integer_t)old_object, m->offset, (integer_t)m,
777 (integer_t)new_m, new_offset);
778
779 pmap_clear_modify(m->phys_page);
780 vm_object_paging_begin(old_object);
781
782 /*
783 * Record that this page has been written out
784 */
785 #if MACH_PAGEMAP
786 vm_external_state_set(old_object->existence_map, m->offset);
787 #endif /*MACH_PAGEMAP*/
788
789 /*
790 * Mark original page as cleaning in place.
791 */
792 m->cleaning = TRUE;
793 m->dirty = TRUE;
794 m->precious = FALSE;
795
796 /*
797 * Convert the fictitious page to a private shadow of
798 * the real page.
799 */
800 assert(new_m->fictitious);
801 new_m->fictitious = FALSE;
802 new_m->private = TRUE;
803 new_m->pageout = TRUE;
804 new_m->phys_page = m->phys_page;
805 vm_page_wire(new_m);
806
807 vm_page_insert(new_m, new_object, new_offset);
808 assert(!new_m->wanted);
809 new_m->busy = FALSE;
810 }
811
812 void
813 vm_pageclean_copy(
814 vm_page_t m,
815 vm_page_t new_m,
816 vm_object_t new_object,
817 vm_object_offset_t new_offset)
818 {
819 XPR(XPR_VM_PAGEOUT,
820 "vm_pageclean_copy, page 0x%X new_m 0x%X new_obj 0x%X offset 0x%X\n",
821 m, new_m, new_object, new_offset, 0);
822
823 assert((!m->busy) && (!m->cleaning));
824
825 assert(!new_m->private && !new_m->fictitious);
826
827 pmap_clear_modify(m->phys_page);
828
829 m->busy = TRUE;
830 vm_object_paging_begin(m->object);
831 vm_page_unlock_queues();
832 vm_object_unlock(m->object);
833
834 /*
835 * Copy the original page to the new page.
836 */
837 vm_page_copy(m, new_m);
838
839 /*
840 * Mark the old page as clean. A request to pmap_is_modified
841 * will get the right answer.
842 */
843 vm_object_lock(m->object);
844 m->dirty = FALSE;
845
846 vm_object_paging_end(m->object);
847
848 vm_page_lock_queues();
849 if (!m->active && !m->inactive)
850 vm_page_activate(m);
851 PAGE_WAKEUP_DONE(m);
852
853 vm_page_insert(new_m, new_object, new_offset);
854 vm_page_activate(new_m);
855 new_m->busy = FALSE; /* No other thread can be waiting */
856 }
857
858
859 /*
860 * Routine: vm_pageout_initialize_page
861 * Purpose:
862 * Causes the specified page to be initialized in
863 * the appropriate memory object. This routine is used to push
864 * pages into a copy-object when they are modified in the
865 * permanent object.
866 *
867 * The page is moved to a temporary object and paged out.
868 *
869 * In/out conditions:
870 * The page in question must not be on any pageout queues.
871 * The object to which it belongs must be locked.
872 * The page must be busy, but not hold a paging reference.
873 *
874 * Implementation:
875 * Move this page to a completely new object.
876 */
877 void
878 vm_pageout_initialize_page(
879 vm_page_t m)
880 {
881 vm_object_t object;
882 vm_object_offset_t paging_offset;
883 vm_page_t holding_page;
884
885
886 XPR(XPR_VM_PAGEOUT,
887 "vm_pageout_initialize_page, page 0x%X\n",
888 (integer_t)m, 0, 0, 0, 0);
889 assert(m->busy);
890
891 /*
892 * Verify that we really want to clean this page
893 */
894 assert(!m->absent);
895 assert(!m->error);
896 assert(m->dirty);
897
898 /*
899 * Create a paging reference to let us play with the object.
900 */
901 object = m->object;
902 paging_offset = m->offset + object->paging_offset;
903 vm_object_paging_begin(object);
904 if (m->absent || m->error || m->restart ||
905 (!m->dirty && !m->precious)) {
906 VM_PAGE_FREE(m);
907 panic("reservation without pageout?"); /* alan */
908 vm_object_unlock(object);
909 return;
910 }
911
912 /* set the page for future call to vm_fault_list_request */
913 holding_page = NULL;
914 vm_page_lock_queues();
915 pmap_clear_modify(m->phys_page);
916 m->dirty = TRUE;
917 m->busy = TRUE;
918 m->list_req_pending = TRUE;
919 m->cleaning = TRUE;
920 m->pageout = TRUE;
921 vm_page_wire(m);
922 vm_page_unlock_queues();
923 vm_object_unlock(object);
924
925 /*
926 * Write the data to its pager.
927 * Note that the data is passed by naming the new object,
928 * not a virtual address; the pager interface has been
929 * manipulated to use the "internal memory" data type.
930 * [The object reference from its allocation is donated
931 * to the eventual recipient.]
932 */
933 memory_object_data_initialize(object->pager,
934 paging_offset,
935 PAGE_SIZE);
936
937 vm_object_lock(object);
938 }
939
940 #if MACH_CLUSTER_STATS
941 #define MAXCLUSTERPAGES 16
942 struct {
943 unsigned long pages_in_cluster;
944 unsigned long pages_at_higher_offsets;
945 unsigned long pages_at_lower_offsets;
946 } cluster_stats[MAXCLUSTERPAGES];
947 #endif /* MACH_CLUSTER_STATS */
948
949 boolean_t allow_clustered_pageouts = FALSE;
950
951 /*
952 * vm_pageout_cluster:
953 *
954 * Given a page, queue it to the appropriate I/O thread,
955 * which will page it out and attempt to clean adjacent pages
956 * in the same operation.
957 *
958 * The page must be busy, and the object and queues locked. We will take a
959 * paging reference to prevent deallocation or collapse when we
960 * release the object lock back at the call site. The I/O thread
961 * is responsible for consuming this reference
962 *
963 * The page must not be on any pageout queue.
964 */
965
966 void
967 vm_pageout_cluster(vm_page_t m)
968 {
969 vm_object_t object = m->object;
970 struct vm_pageout_queue *q;
971
972
973 XPR(XPR_VM_PAGEOUT,
974 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
975 (integer_t)object, m->offset, (integer_t)m, 0, 0);
976
977 /*
978 * Only a certain kind of page is appreciated here.
979 */
980 assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
981 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
982
983 /*
984 * protect the object from collapse -
985 * locking in the object's paging_offset.
986 */
987 vm_object_paging_begin(object);
988
989 /*
990 * set the page for future call to vm_fault_list_request
991 * page should already be marked busy
992 */
993 vm_page_wire(m);
994 m->list_req_pending = TRUE;
995 m->cleaning = TRUE;
996 m->pageout = TRUE;
997 m->laundry = TRUE;
998
999 if (object->internal == TRUE)
1000 q = &vm_pageout_queue_internal;
1001 else
1002 q = &vm_pageout_queue_external;
1003 q->pgo_laundry++;
1004
1005 m->pageout_queue = TRUE;
1006 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
1007
1008 if (q->pgo_idle == TRUE) {
1009 q->pgo_idle = FALSE;
1010 thread_wakeup((event_t) &q->pgo_pending);
1011 }
1012 }
1013
1014
1015 unsigned long vm_pageout_throttle_up_count = 0;
1016
1017 /*
1018 * A page is back from laundry. See if there are some pages waiting to
1019 * go to laundry and if we can let some of them go now.
1020 *
1021 * Object and page queues must be locked.
1022 */
1023 void
1024 vm_pageout_throttle_up(
1025 vm_page_t m)
1026 {
1027 struct vm_pageout_queue *q;
1028
1029 vm_pageout_throttle_up_count++;
1030
1031 assert(m->laundry);
1032 assert(m->object != VM_OBJECT_NULL);
1033 assert(m->object != kernel_object);
1034
1035 if (m->object->internal == TRUE)
1036 q = &vm_pageout_queue_internal;
1037 else
1038 q = &vm_pageout_queue_external;
1039
1040 m->laundry = FALSE;
1041 q->pgo_laundry--;
1042
1043 if (q->pgo_throttled == TRUE) {
1044 q->pgo_throttled = FALSE;
1045 thread_wakeup((event_t) &q->pgo_laundry);
1046 }
1047 }
1048
1049
1050 /*
1051 * vm_pageout_scan does the dirty work for the pageout daemon.
1052 * It returns with vm_page_queue_free_lock held and
1053 * vm_page_free_wanted == 0.
1054 */
1055
1056 #define DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
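/*
 * vm_pageout_scan() batches work under the page queues lock:
 * delayed_unlock counts pages handled since the lock was last
 * dropped, and once it passes DELAYED_UNLOCK_LIMIT the scan frees any
 * locally collected pages, drops the lock and mutex_pause()s so that
 * other threads contending for the lock can run.
 */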
1057
1058 #define FCS_IDLE 0
1059 #define FCS_DELAYED 1
1060 #define FCS_DEADLOCK_DETECTED 2
1061
1062 struct flow_control {
1063 int state;
1064 mach_timespec_t ts;
1065 };
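/*
 * State machine for throttling against the default pager: the scan
 * starts in FCS_IDLE; when the internal pageout queue is throttled it
 * moves to FCS_DELAYED and records a deadline vm_pageout_deadlock_wait
 * milliseconds out in ts.  If the queue is still throttled when that
 * deadline passes, the state becomes FCS_DEADLOCK_DETECTED and the
 * scan tries to relieve the suspected deadlock by reclaiming
 * vm_pageout_deadlock_target pages that can be freed without going
 * through the throttled queue, before re-arming the timer.
 */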
1066
1067 extern kern_return_t sysclk_gettime(mach_timespec_t *);
1068
1069
1070 void
1071 vm_pageout_scan(void)
1072 {
1073 unsigned int loop_count = 0;
1074 unsigned int inactive_burst_count = 0;
1075 unsigned int active_burst_count = 0;
1076 vm_page_t local_freeq = 0;
1077 int local_freed = 0;
1078 int delayed_unlock = 0;
1079 int need_internal_inactive = 0;
1080 int refmod_state = 0;
1081 int vm_pageout_deadlock_target = 0;
1082 struct vm_pageout_queue *iq;
1083 struct vm_pageout_queue *eq;
1084 struct flow_control flow_control;
1085 boolean_t active_throttled = FALSE;
1086 boolean_t inactive_throttled = FALSE;
1087 mach_timespec_t ts;
1088 unsigned int msecs = 0;
1089 vm_object_t object;
1090
1091
1092 flow_control.state = FCS_IDLE;
1093 iq = &vm_pageout_queue_internal;
1094 eq = &vm_pageout_queue_external;
1095
1096 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1097
1098 /*???*/ /*
1099 * We want to gradually dribble pages from the active queue
1100 * to the inactive queue. If we let the inactive queue get
1101 * very small, and then suddenly dump many pages into it,
1102 * those pages won't get a sufficient chance to be referenced
1103 * before we start taking them from the inactive queue.
1104 *
1105 * We must limit the rate at which we send pages to the pagers.
1106 * data_write messages consume memory, for message buffers and
1107 * for map-copy objects. If we get too far ahead of the pagers,
1108 * we can potentially run out of memory.
1109 *
1110 * We can use the laundry count to limit directly the number
1111 * of pages outstanding to the default pager. A similar
1112 * strategy for external pagers doesn't work, because
1113 * external pagers don't have to deallocate the pages sent them,
1114 * and because we might have to send pages to external pagers
1115 * even if they aren't processing writes. So we also
1116 * use a burst count to limit writes to external pagers.
1117 *
1118 * When memory is very tight, we can't rely on external pagers to
1119 * clean pages. They probably aren't running, because they
1120 * aren't vm-privileged. If we kept sending dirty pages to them,
1121 * we could exhaust the free list.
1122 */
1123 vm_page_lock_queues();
1124 delayed_unlock = 1;
1125
1126
1127 Restart:
1128 /*
1129 * Recalculate vm_page_inactivate_target.
1130 */
1131 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1132 vm_page_inactive_count);
1133 object = NULL;
1134
1135 for (;;) {
1136 vm_page_t m;
1137
1138 if (delayed_unlock == 0)
1139 vm_page_lock_queues();
1140
1141 active_burst_count = vm_page_active_count;
1142
1143 if (active_burst_count > vm_pageout_burst_active_throttle)
1144 active_burst_count = vm_pageout_burst_active_throttle;
1145
1146 /*
1147 * Move pages from active to inactive.
1148 */
1149 while ((need_internal_inactive ||
1150 vm_page_inactive_count < vm_page_inactive_target) &&
1151 !queue_empty(&vm_page_queue_active) &&
1152 ((active_burst_count--) > 0)) {
1153
1154 vm_pageout_active++;
1155
1156 m = (vm_page_t) queue_first(&vm_page_queue_active);
1157
1158 assert(m->active && !m->inactive);
1159 assert(!m->laundry);
1160 assert(m->object != kernel_object);
1161
1162 /*
1163 * Try to lock object; since we've already got the
1164 * page queues lock, we can only 'try' for this one.
1165 * if the 'try' fails, we need to do a mutex_pause
1166 * to allow the owner of the object lock a chance to
1167 * run... otherwise, we're likely to trip over this
1168 * object in the same state as we work our way through
1169 * the queue... clumps of pages associated with the same
1170 * object are fairly typical on the inactive and active queues
1171 */
1172 if (m->object != object) {
1173 if (object != NULL) {
1174 vm_object_unlock(object);
1175 object = NULL;
1176 }
1177 if (!vm_object_lock_try(m->object)) {
1178 /*
1179 * move page to end of active queue and continue
1180 */
1181 queue_remove(&vm_page_queue_active, m,
1182 vm_page_t, pageq);
1183 queue_enter(&vm_page_queue_active, m,
1184 vm_page_t, pageq);
1185
1186 goto done_with_activepage;
1187 }
1188 object = m->object;
1189 }
1190 /*
1191 * if the page is BUSY, then we pull it
1192 * off the active queue and leave it alone.
1193 * when BUSY is cleared, it will get stuck
1194 * back on the appropriate queue
1195 */
1196 if (m->busy) {
1197 queue_remove(&vm_page_queue_active, m,
1198 vm_page_t, pageq);
1199 m->pageq.next = NULL;
1200 m->pageq.prev = NULL;
1201
1202 if (!m->fictitious)
1203 vm_page_active_count--;
1204 m->active = FALSE;
1205
1206 goto done_with_activepage;
1207 }
1208 if (need_internal_inactive) {
1209 /*
1210 * If we're unable to make forward progress
1211 * with the current set of pages on the
1212 * inactive queue due to busy objects or
1213 * throttled pageout queues, then
1214 * move a page that is already clean
1215 * or belongs to a pageout queue that
1216 * isn't currently throttled
1217 */
1218 active_throttled = FALSE;
1219
1220 if (object->internal) {
1221 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1222 active_throttled = TRUE;
1223 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1224 active_throttled = TRUE;
1225 }
1226 if (active_throttled == TRUE) {
1227 if (!m->dirty) {
1228 refmod_state = pmap_get_refmod(m->phys_page);
1229
1230 if (refmod_state & VM_MEM_REFERENCED)
1231 m->reference = TRUE;
1232 if (refmod_state & VM_MEM_MODIFIED)
1233 m->dirty = TRUE;
1234 }
1235 if (m->dirty || m->precious) {
1236 /*
1237 * page is dirty and targets a THROTTLED queue
1238 * so all we can do is move it back to the
1239 * end of the active queue to get it out
1240 * of the way
1241 */
1242 queue_remove(&vm_page_queue_active, m,
1243 vm_page_t, pageq);
1244 queue_enter(&vm_page_queue_active, m,
1245 vm_page_t, pageq);
1246
1247 vm_pageout_scan_active_throttled++;
1248
1249 goto done_with_activepage;
1250 }
1251 }
1252 vm_pageout_scan_active_throttle_success++;
1253 need_internal_inactive--;
1254 }
1255 /*
1256 * Deactivate the page while holding the object
1257 * locked, so we know the page is still not busy.
1258 * This should prevent races between pmap_enter
1259 * and pmap_clear_reference. The page might be
1260 * absent or fictitious, but vm_page_deactivate
1261 * can handle that.
1262 */
1263 vm_page_deactivate(m);
1264 done_with_activepage:
1265 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1266
1267 if (object != NULL) {
1268 vm_object_unlock(object);
1269 object = NULL;
1270 }
1271 if (local_freeq) {
1272 vm_page_free_list(local_freeq);
1273
1274 local_freeq = 0;
1275 local_freed = 0;
1276 }
1277 delayed_unlock = 0;
1278 vm_page_unlock_queues();
1279
1280 mutex_pause();
1281 vm_page_lock_queues();
1282 /*
1283 * continue the while loop processing
1284 * the active queue... need to hold
1285 * the page queues lock
1286 */
1287 continue;
1288 }
1289 }
1290
1291
1292
1293 /**********************************************************************
1294 * above this point we're playing with the active queue
1295 * below this point we're playing with the throttling mechanisms
1296 * and the inactive queue
1297 **********************************************************************/
1298
1299
1300
1301 /*
1302 * We are done if we have met our target *and*
1303 * nobody is still waiting for a page.
1304 */
1305 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1306 if (object != NULL) {
1307 vm_object_unlock(object);
1308 object = NULL;
1309 }
1310 if (local_freeq) {
1311 vm_page_free_list(local_freeq);
1312
1313 local_freeq = 0;
1314 local_freed = 0;
1315 }
1316 mutex_lock(&vm_page_queue_free_lock);
1317
1318 if ((vm_page_free_count >= vm_page_free_target) &&
1319 (vm_page_free_wanted == 0)) {
1320
1321 vm_page_unlock_queues();
1322
1323 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1324 return;
1325 }
1326 mutex_unlock(&vm_page_queue_free_lock);
1327 }
1328
1329
1330 /*
1331 * Sometimes we have to pause:
1332 * 1) No inactive pages - nothing to do.
1333 * 2) Flow control - default pageout queue is full
1334 * 3) Loop control - no acceptable pages found on the inactive queue
1335 * within the last vm_pageout_burst_inactive_throttle iterations
1336 */
1337 if ((queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf))) {
1338 vm_pageout_scan_empty_throttle++;
1339 msecs = vm_pageout_empty_wait;
1340 goto vm_pageout_scan_delay;
1341
1342 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1343 vm_pageout_scan_burst_throttle++;
1344 msecs = vm_pageout_burst_wait;
1345 goto vm_pageout_scan_delay;
1346
1347 } else if (VM_PAGE_Q_THROTTLED(iq)) {
1348
1349 switch (flow_control.state) {
1350
1351 case FCS_IDLE:
1352 reset_deadlock_timer:
1353 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1354 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1355 sysclk_gettime(&flow_control.ts);
1356 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1357
1358 flow_control.state = FCS_DELAYED;
1359 msecs = vm_pageout_deadlock_wait;
1360
1361 break;
1362
1363 case FCS_DELAYED:
1364 sysclk_gettime(&ts);
1365
1366 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1367 /*
1368 * the pageout thread for the default pager is potentially
1369 * deadlocked since the
1370 * default pager queue has been throttled for more than the
1371 * allowable time... we need to move some clean pages or dirty
1372 * pages belonging to the external pagers if they aren't throttled
1373 * vm_page_free_wanted represents the number of threads currently
1374 * blocked waiting for pages... we'll move one page for each of
1375 * these plus a fixed amount to break the logjam... once we're done
1376                                  * moving this number of pages, we'll re-enter the FCS_DELAYED state
1377 * with a new timeout target since we have no way of knowing
1378 * whether we've broken the deadlock except through observation
1379 * of the queue associated with the default pager... we need to
1380                                  * stop moving pages and allow the system to run to see what
1381 * state it settles into.
1382 */
1383 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted;
1384 vm_pageout_scan_deadlock_detected++;
1385 flow_control.state = FCS_DEADLOCK_DETECTED;
1386
1387 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1388 goto consider_inactive;
1389 }
1390 /*
1391 * just resniff instead of trying
1392 * to compute a new delay time... we're going to be
1393 * awakened immediately upon a laundry completion,
1394 * so we won't wait any longer than necessary
1395 */
1396 msecs = vm_pageout_idle_wait;
1397 break;
1398
1399 case FCS_DEADLOCK_DETECTED:
1400 if (vm_pageout_deadlock_target)
1401 goto consider_inactive;
1402 goto reset_deadlock_timer;
1403
1404 }
1405 vm_pageout_scan_throttle++;
1406 iq->pgo_throttled = TRUE;
1407 vm_pageout_scan_delay:
1408 if (object != NULL) {
1409 vm_object_unlock(object);
1410 object = NULL;
1411 }
1412 if (local_freeq) {
1413 vm_page_free_list(local_freeq);
1414
1415 local_freeq = 0;
1416 local_freed = 0;
1417 }
1418 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1419
1420 counter(c_vm_pageout_scan_block++);
1421
1422 vm_page_unlock_queues();
1423
1424 thread_block(THREAD_CONTINUE_NULL);
1425
1426 vm_page_lock_queues();
1427 delayed_unlock = 1;
1428
1429 iq->pgo_throttled = FALSE;
1430
1431 if (loop_count >= vm_page_inactive_count) {
1432 if (VM_PAGE_Q_THROTTLED(eq) || VM_PAGE_Q_THROTTLED(iq)) {
1433 /*
1434 * Make sure we move enough "appropriate"
1435 * pages to the inactive queue before trying
1436 * again.
1437 */
1438 need_internal_inactive = vm_pageout_inactive_relief;
1439 }
1440 loop_count = 0;
1441 }
1442 inactive_burst_count = 0;
1443
1444 goto Restart;
1445 /*NOTREACHED*/
1446 }
1447
1448
1449 flow_control.state = FCS_IDLE;
1450 consider_inactive:
1451 loop_count++;
1452 inactive_burst_count++;
1453 vm_pageout_inactive++;
1454
1455 if (!queue_empty(&vm_page_queue_inactive)) {
1456 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1457
1458 if (m->clustered && (m->no_isync == TRUE)) {
1459 goto use_this_page;
1460 }
1461 }
1462 if (vm_zf_count < vm_accellerate_zf_pageout_trigger) {
1463 vm_zf_iterator = 0;
1464 } else {
1465 last_page_zf = 0;
1466 if((vm_zf_iterator+=1) >= vm_zf_iterator_count) {
1467 vm_zf_iterator = 0;
1468 }
1469 }
1470 if (queue_empty(&vm_page_queue_zf) ||
1471 (((last_page_zf) || (vm_zf_iterator == 0)) &&
1472 !queue_empty(&vm_page_queue_inactive))) {
1473 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1474 last_page_zf = 0;
1475 } else {
1476 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1477 last_page_zf = 1;
1478 }
1479 use_this_page:
1480 assert(!m->active && m->inactive);
1481 assert(!m->laundry);
1482 assert(m->object != kernel_object);
1483
1484 /*
1485           * Try to lock object; since we've already got the
1486 * page queues lock, we can only 'try' for this one.
1487 * if the 'try' fails, we need to do a mutex_pause
1488 * to allow the owner of the object lock a chance to
1489 * run... otherwise, we're likely to trip over this
1490 * object in the same state as we work our way through
1491 * the queue... clumps of pages associated with the same
1492 * object are fairly typical on the inactive and active queues
1493 */
1494 if (m->object != object) {
1495 if (object != NULL) {
1496 vm_object_unlock(object);
1497 object = NULL;
1498 }
1499 if (!vm_object_lock_try(m->object)) {
1500 /*
1501 * Move page to end and continue.
1502 * Don't re-issue ticket
1503 */
1504 if (m->zero_fill) {
1505 queue_remove(&vm_page_queue_zf, m,
1506 vm_page_t, pageq);
1507 queue_enter(&vm_page_queue_zf, m,
1508 vm_page_t, pageq);
1509 } else {
1510 queue_remove(&vm_page_queue_inactive, m,
1511 vm_page_t, pageq);
1512 queue_enter(&vm_page_queue_inactive, m,
1513 vm_page_t, pageq);
1514 }
1515 vm_pageout_inactive_nolock++;
1516
1517 /*
1518 * force us to dump any collected free pages
1519 * and to pause before moving on
1520 */
1521 delayed_unlock = DELAYED_UNLOCK_LIMIT + 1;
1522
1523 goto done_with_inactivepage;
1524 }
1525 object = m->object;
1526 }
1527 /*
1528 * If the page belongs to a purgable object with no pending copies
1529 * against it, then we reap all of the pages in the object
1530 * and note that the object has been "emptied". It'll be up to the
1531           * application to discover this and recreate its contents if desired.
1532 */
1533 if ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE ||
1534 object->purgable == VM_OBJECT_PURGABLE_EMPTY) &&
1535 object->copy == VM_OBJECT_NULL) {
1536
1537 (void) vm_object_purge(object);
1538 vm_pageout_purged_objects++;
1539 /*
1540 * we've just taken all of the pages from this object,
1541 * so drop the lock now since we're not going to find
1542 * any more pages belonging to it anytime soon
1543 */
1544 vm_object_unlock(object);
1545 object = NULL;
1546
1547 inactive_burst_count = 0;
1548
1549 goto done_with_inactivepage;
1550 }
1551
1552 /*
1553 * Paging out pages of external objects which
1554 * are currently being created must be avoided.
1555           * The pager may claim memory, thus leading to a
1556           * possible deadlock between it and the pageout thread,
1557           * if such pages are finally chosen. The remaining assumption
1558           * is that there will eventually be enough available pages in the
1559           * inactive pool to page out in order to satisfy all memory
1560 * claimed by the thread which concurrently creates the pager.
1561 */
1562 if (!object->pager_initialized && object->pager_created) {
1563 /*
1564 * Move page to end and continue, hoping that
1565 * there will be enough other inactive pages to
1566 * page out so that the thread which currently
1567 * initializes the pager will succeed.
1568 * Don't re-grant the ticket, the page should
1569                   * be pulled from the queue and paged out whenever
1570 * one of its logically adjacent fellows is
1571 * targeted.
1572 */
1573 if (m->zero_fill) {
1574 queue_remove(&vm_page_queue_zf, m,
1575 vm_page_t, pageq);
1576 queue_enter(&vm_page_queue_zf, m,
1577 vm_page_t, pageq);
1578 last_page_zf = 1;
1579 vm_zf_iterator = vm_zf_iterator_count - 1;
1580 } else {
1581 queue_remove(&vm_page_queue_inactive, m,
1582 vm_page_t, pageq);
1583 queue_enter(&vm_page_queue_inactive, m,
1584 vm_page_t, pageq);
1585 last_page_zf = 0;
1586 vm_zf_iterator = 1;
1587 }
1588 vm_pageout_inactive_avoid++;
1589
1590 goto done_with_inactivepage;
1591 }
1592 /*
1593 * Remove the page from the inactive list.
1594 */
1595 if (m->zero_fill) {
1596 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1597 } else {
1598 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1599 }
1600 m->pageq.next = NULL;
1601 m->pageq.prev = NULL;
1602 m->inactive = FALSE;
1603 if (!m->fictitious)
1604 vm_page_inactive_count--;
1605
1606 if (m->busy || !object->alive) {
1607 /*
1608 * Somebody is already playing with this page.
1609 * Leave it off the pageout queues.
1610 */
1611 vm_pageout_inactive_busy++;
1612
1613 goto done_with_inactivepage;
1614 }
1615
1616 /*
1617 * If it's absent or in error, we can reclaim the page.
1618 */
1619
1620 if (m->absent || m->error) {
1621 vm_pageout_inactive_absent++;
1622 reclaim_page:
1623 if (vm_pageout_deadlock_target) {
1624 vm_pageout_scan_inactive_throttle_success++;
1625 vm_pageout_deadlock_target--;
1626 }
1627 if (m->tabled)
1628 vm_page_remove(m); /* clears tabled, object, offset */
1629 if (m->absent)
1630 vm_object_absent_release(object);
1631
1632 assert(m->pageq.next == NULL &&
1633 m->pageq.prev == NULL);
1634 m->pageq.next = (queue_entry_t)local_freeq;
1635 local_freeq = m;
1636 local_freed++;
1637
1638 inactive_burst_count = 0;
1639
1640 goto done_with_inactivepage;
1641 }
1642
1643 assert(!m->private);
1644 assert(!m->fictitious);
1645
1646 /*
1647 * If already cleaning this page in place, convert from
1648 * "adjacent" to "target". We can leave the page mapped,
1649 * and vm_pageout_object_terminate will determine whether
1650 * to free or reactivate.
1651 */
1652
1653 if (m->cleaning) {
1654 m->busy = TRUE;
1655 m->pageout = TRUE;
1656 m->dump_cleaning = TRUE;
1657 vm_page_wire(m);
1658
1659 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1660
1661 inactive_burst_count = 0;
1662
1663 goto done_with_inactivepage;
1664 }
1665
1666 /*
1667 * If it's being used, reactivate.
1668 * (Fictitious pages are either busy or absent.)
1669 */
1670 if ( (!m->reference) ) {
1671 refmod_state = pmap_get_refmod(m->phys_page);
1672
1673 if (refmod_state & VM_MEM_REFERENCED)
1674 m->reference = TRUE;
1675 if (refmod_state & VM_MEM_MODIFIED)
1676 m->dirty = TRUE;
1677 }
1678 if (m->reference) {
1679 was_referenced:
1680 vm_page_activate(m);
1681 VM_STAT(reactivations++);
1682
1683 vm_pageout_inactive_used++;
1684 last_page_zf = 0;
1685 inactive_burst_count = 0;
1686
1687 goto done_with_inactivepage;
1688 }
1689
1690 XPR(XPR_VM_PAGEOUT,
1691 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1692 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1693
1694 /*
1695 * we've got a candidate page to steal...
1696 *
1697 * m->dirty is up to date courtesy of the
1698 * preceding check for m->reference... if
1699 * we get here, then m->reference had to be
1700 * FALSE which means we did a pmap_get_refmod
1701 * and updated both m->reference and m->dirty
1702 *
1703 * if it's dirty or precious we need to
1704           * see if the target queue is throttled;
1705           * if it is, we need to skip over it by moving it back
1706 * to the end of the inactive queue
1707 */
1708 inactive_throttled = FALSE;
1709
1710 if (m->dirty || m->precious) {
1711 if (object->internal) {
1712 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1713 inactive_throttled = TRUE;
1714 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1715 inactive_throttled = TRUE;
1716 }
1717 }
1718 if (inactive_throttled == TRUE) {
1719 if (m->zero_fill) {
1720 queue_enter(&vm_page_queue_zf, m,
1721 vm_page_t, pageq);
1722 } else {
1723 queue_enter(&vm_page_queue_inactive, m,
1724 vm_page_t, pageq);
1725 }
1726 if (!m->fictitious)
1727 vm_page_inactive_count++;
1728 m->inactive = TRUE;
1729
1730 vm_pageout_scan_inactive_throttled++;
1731
1732 goto done_with_inactivepage;
1733 }
1734 /*
1735 * we've got a page that we can steal...
1736 * eliminate all mappings and make sure
1737 * we have the up-to-date modified state
1738 * first take the page BUSY, so that no new
1739 * mappings can be made
1740 */
1741 m->busy = TRUE;
1742
1743 /*
1744 * if we need to do a pmap_disconnect then we
1745 * need to re-evaluate m->dirty since the pmap_disconnect
1746 * provides the true state atomically... the
1747 * page was still mapped up to the pmap_disconnect
1748 * and may have been dirtied at the last microsecond
1749 *
1750 * we also check for the page being referenced 'late'
1751 * if it was, we first need to do a WAKEUP_DONE on it
1752 * since we already set m->busy = TRUE, before
1753 * going off to reactivate it
1754 *
1755 * if we don't need the pmap_disconnect, then
1756 * m->dirty is up to date courtesy of the
1757 * earlier check for m->reference... if
1758 * we get here, then m->reference had to be
1759 * FALSE which means we did a pmap_get_refmod
1760 * and updated both m->reference and m->dirty...
1761 */
1762 if (m->no_isync == FALSE) {
1763 refmod_state = pmap_disconnect(m->phys_page);
1764
1765 if (refmod_state & VM_MEM_MODIFIED)
1766 m->dirty = TRUE;
1767 if (refmod_state & VM_MEM_REFERENCED) {
1768 m->reference = TRUE;
1769
1770 PAGE_WAKEUP_DONE(m);
1771 goto was_referenced;
1772 }
1773 }
1774 /*
1775 * If it's clean and not precious, we can free the page.
1776 */
1777 if (!m->dirty && !m->precious) {
1778 vm_pageout_inactive_clean++;
1779 goto reclaim_page;
1780 }
1781 vm_pageout_cluster(m);
1782
1783 vm_pageout_inactive_dirty++;
1784
1785 inactive_burst_count = 0;
1786
1787 done_with_inactivepage:
1788 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1789
1790 if (object != NULL) {
1791 vm_object_unlock(object);
1792 object = NULL;
1793 }
1794 if (local_freeq) {
1795 vm_page_free_list(local_freeq);
1796
1797 local_freeq = 0;
1798 local_freed = 0;
1799 }
1800 delayed_unlock = 0;
1801 vm_page_unlock_queues();
1802 mutex_pause();
1803 }
1804 /*
1805 * back to top of pageout scan loop
1806 */
1807 }
1808 }
1809
1810
1811 int vm_page_free_count_init;
1812
1813 void
1814 vm_page_free_reserve(
1815 int pages)
1816 {
1817 int free_after_reserve;
1818
1819 vm_page_free_reserved += pages;
1820
1821 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1822
1823 vm_page_free_min = vm_page_free_reserved +
1824 VM_PAGE_FREE_MIN(free_after_reserve);
1825
1826 vm_page_free_target = vm_page_free_reserved +
1827 VM_PAGE_FREE_TARGET(free_after_reserve);
1828
1829 if (vm_page_free_target < vm_page_free_min + 5)
1830 vm_page_free_target = vm_page_free_min + 5;
1831 }
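/*
 * For illustration: with vm_page_free_count_init at 100,000 pages and
 * a cumulative reserve of 100 pages, free_after_reserve is 99,900, so
 * vm_page_free_min becomes 100 + (10 + 99,900/100) = 1,109 and
 * vm_page_free_target becomes 100 + (15 + 99,900/80) = 1,363.
 */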
1832
1833 /*
1834 * vm_pageout is the high level pageout daemon.
1835 */
1836
1837 void
1838 vm_pageout_continue(void)
1839 {
1840 vm_pageout_scan_event_counter++;
1841 vm_pageout_scan();
1842 /* we hold vm_page_queue_free_lock now */
1843 assert(vm_page_free_wanted == 0);
1844 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
1845 mutex_unlock(&vm_page_queue_free_lock);
1846
1847 counter(c_vm_pageout_block++);
1848 thread_block((thread_continue_t)vm_pageout_continue);
1849 /*NOTREACHED*/
1850 }
1851
1852
1853 /*
1854 * must be called with the
1855 * queues and object locks held
1856 */
1857 static void
1858 vm_pageout_queue_steal(vm_page_t m)
1859 {
1860 struct vm_pageout_queue *q;
1861
1862 if (m->object->internal == TRUE)
1863 q = &vm_pageout_queue_internal;
1864 else
1865 q = &vm_pageout_queue_external;
1866
1867 m->laundry = FALSE;
1868 m->pageout_queue = FALSE;
1869 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
1870
1871 m->pageq.next = NULL;
1872 m->pageq.prev = NULL;
1873
1874 vm_object_paging_end(m->object);
1875
1876 q->pgo_laundry--;
1877 }
1878
1879
1880 #ifdef FAKE_DEADLOCK
1881
1882 #define FAKE_COUNT 5000
1883
1884 int internal_count = 0;
1885 int fake_deadlock = 0;
1886
1887 #endif
1888
1889 static void
1890 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
1891 {
1892 vm_page_t m = NULL;
1893 vm_object_t object;
1894 boolean_t need_wakeup;
1895
1896 vm_page_lock_queues();
1897
1898 while ( !queue_empty(&q->pgo_pending) ) {
1899
1900 q->pgo_busy = TRUE;
1901 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
1902 m->pageout_queue = FALSE;
1903 vm_page_unlock_queues();
1904
1905 m->pageq.next = NULL;
1906 m->pageq.prev = NULL;
1907 #ifdef FAKE_DEADLOCK
1908 if (q == &vm_pageout_queue_internal) {
1909 vm_offset_t addr;
1910 int pg_count;
1911
1912 internal_count++;
1913
1914 if ((internal_count == FAKE_COUNT)) {
1915
1916 pg_count = vm_page_free_count + vm_page_free_reserved;
1917
1918 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
1919 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
1920 }
1921 internal_count = 0;
1922 fake_deadlock++;
1923 }
1924 }
1925 #endif
1926 object = m->object;
1927
1928 if (!object->pager_initialized) {
1929 vm_object_lock(object);
1930
1931 /*
1932 * If there is no memory object for the page, create
1933 * one and hand it to the default pager.
1934 */
1935
1936 if (!object->pager_initialized)
1937 vm_object_collapse(object, (vm_object_offset_t)0);
1938 if (!object->pager_initialized)
1939 vm_object_pager_create(object);
1940 if (!object->pager_initialized) {
1941 /*
1942 * Still no pager for the object.
1943 * Reactivate the page.
1944 *
1945 * Should only happen if there is no
1946 * default pager.
1947 */
1948 m->list_req_pending = FALSE;
1949 m->cleaning = FALSE;
1950 m->pageout = FALSE;
1951 vm_page_unwire(m);
1952
1953 vm_pageout_throttle_up(m);
1954
1955 vm_page_lock_queues();
1956 vm_pageout_dirty_no_pager++;
1957 vm_page_activate(m);
1958 vm_page_unlock_queues();
1959
1960 /*
1961 * And we are done with it.
1962 */
1963 PAGE_WAKEUP_DONE(m);
1964
1965 vm_object_paging_end(object);
1966 vm_object_unlock(object);
1967
1968 vm_page_lock_queues();
1969 continue;
1970 } else if (object->pager == MEMORY_OBJECT_NULL) {
1971 /*
1972 * This pager has been destroyed by either
1973 * memory_object_destroy or vm_object_destroy, and
1974 * so there is nowhere for the page to go.
1975 * Just free the page... VM_PAGE_FREE takes
1976 * care of cleaning up all the state...
1977 * including doing the vm_pageout_throttle_up
1978 */
1979 VM_PAGE_FREE(m);
1980
1981 vm_object_paging_end(object);
1982 vm_object_unlock(object);
1983
1984 vm_page_lock_queues();
1985 continue;
1986 }
1987 vm_object_unlock(object);
1988 }
1989 /*
1990 * we expect the paging_in_progress reference to have
1991 * already been taken on the object before it was added
1992 * to the appropriate pageout I/O queue... this will
1993 * keep the object from being terminated and/or the
1994 * paging_offset from changing until the I/O has
1995 * completed... therefore no need to lock the object to
1996 * pull the paging_offset from it.
1997 *
1998 * Send the data to the pager.
1999 * any pageout clustering happens there
2000 */
2001 memory_object_data_return(object->pager,
2002 m->offset + object->paging_offset,
2003 PAGE_SIZE,
2004 NULL,
2005 NULL,
2006 FALSE,
2007 FALSE,
2008 0);
2009
2010 vm_object_lock(object);
2011 vm_object_paging_end(object);
2012 vm_object_unlock(object);
2013
2014 vm_page_lock_queues();
2015 }
2016 assert_wait((event_t) q, THREAD_UNINT);
2017
2018
2019 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2020 q->pgo_throttled = FALSE;
2021 need_wakeup = TRUE;
2022 } else
2023 need_wakeup = FALSE;
2024
2025 q->pgo_busy = FALSE;
2026 q->pgo_idle = TRUE;
2027 vm_page_unlock_queues();
2028
2029 if (need_wakeup == TRUE)
2030 thread_wakeup((event_t) &q->pgo_laundry);
2031
2032 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2033 /*NOTREACHED*/
2034 }
2035
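/*
 * vm_pageout_iothread_continue() drains its pageout queue and, once the
 * queue is no longer throttled, wakes any thread that throttled itself
 * waiting for room (q->pgo_throttled / q->pgo_laundry).  A minimal
 * user-space analogue of that throttled producer/consumer hand-shake,
 * with assumed names and a pthread condition variable (a sketch only):
 */
#if 0 /* illustrative sketch only */
#include <pthread.h>

struct sketch_queue {
	pthread_mutex_t	lock;
	pthread_cond_t	laundry_cv;	/* producers sleep here when throttled */
	int		laundry;	/* work items still in flight */
	int		max_laundry;	/* throttle threshold */
	int		throttled;	/* a producer is waiting for room */
};

static void
sketch_consumer_done_with_item(struct sketch_queue *q)
{
	pthread_mutex_lock(&q->lock);
	q->laundry--;
	if (q->throttled && q->laundry < q->max_laundry) {
		q->throttled = 0;
		pthread_cond_signal(&q->laundry_cv);	/* like thread_wakeup(&q->pgo_laundry) */
	}
	pthread_mutex_unlock(&q->lock);
}

static void
sketch_producer_throttle(struct sketch_queue *q)
{
	pthread_mutex_lock(&q->lock);
	while (q->laundry >= q->max_laundry) {		/* like VM_PAGE_Q_THROTTLED(q) */
		q->throttled = 1;
		pthread_cond_wait(&q->laundry_cv, &q->lock);
	}
	q->laundry++;					/* claim a slot */
	pthread_mutex_unlock(&q->lock);
}
#endif
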
2036
2037 static void
2038 vm_pageout_iothread_external(void)
2039 {
2040
2041 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2042 /*NOTREACHED*/
2043 }
2044
2045
2046 static void
2047 vm_pageout_iothread_internal(void)
2048 {
2049 thread_t self = current_thread();
2050
2051 self->options |= TH_OPT_VMPRIV;
2052
2053 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2054 /*NOTREACHED*/
2055 }
2056
2057 static void
2058 vm_pageout_garbage_collect(int collect)
2059 {
2060 if (collect) {
2061 stack_collect();
2062
2063 /*
2064 * consider_zone_gc should be last, because the other operations
2065 * might return memory to zones.
2066 */
2067 consider_machine_collect();
2068 consider_zone_gc();
2069
2070 consider_machine_adjust();
2071 }
2072
2073 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2074
2075 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2076 /*NOTREACHED*/
2077 }
2078
2079
2080
2081 void
2082 vm_pageout(void)
2083 {
2084 thread_t self = current_thread();
2085 thread_t thread;
2086 kern_return_t result;
2087 spl_t s;
2088
2089 /*
2090 * Set thread privileges.
2091 */
2092 s = splsched();
2093 thread_lock(self);
2094 self->priority = BASEPRI_PREEMPT - 1;
2095 set_sched_pri(self, self->priority);
2096 thread_unlock(self);
2097 splx(s);
2098
2099 /*
2100 * Initialize some paging parameters.
2101 */
2102
2103 if (vm_pageout_idle_wait == 0)
2104 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2105
2106 if (vm_pageout_burst_wait == 0)
2107 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2108
2109 if (vm_pageout_empty_wait == 0)
2110 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2111
2112 if (vm_pageout_deadlock_wait == 0)
2113 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2114
2115 if (vm_pageout_deadlock_relief == 0)
2116 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2117
2118 if (vm_pageout_inactive_relief == 0)
2119 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2120
2121 if (vm_pageout_burst_active_throttle == 0)
2122 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2123
2124 if (vm_pageout_burst_inactive_throttle == 0)
2125 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2126
2127 /*
2128 * Set kernel task to low backing store privileged
2129 * status
2130 */
2131 task_lock(kernel_task);
2132 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2133 task_unlock(kernel_task);
2134
2135 vm_page_free_count_init = vm_page_free_count;
2136 vm_zf_iterator = 0;
2137 /*
2138 * even if we've already called vm_page_free_reserve
2139 * call it again here to ensure that the targets are
2140 * accurately calculated (it uses vm_page_free_count_init)
2141 * calling it with an arg of 0 will not change the reserve
2142 * but will re-calculate free_min and free_target
2143 */
2144 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2145 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2146 } else
2147 vm_page_free_reserve(0);
2148
2149
2150 queue_init(&vm_pageout_queue_external.pgo_pending);
2151 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2152 vm_pageout_queue_external.pgo_laundry = 0;
2153 vm_pageout_queue_external.pgo_idle = FALSE;
2154 vm_pageout_queue_external.pgo_busy = FALSE;
2155 vm_pageout_queue_external.pgo_throttled = FALSE;
2156
2157 queue_init(&vm_pageout_queue_internal.pgo_pending);
2158 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2159 vm_pageout_queue_internal.pgo_laundry = 0;
2160 vm_pageout_queue_internal.pgo_idle = FALSE;
2161 vm_pageout_queue_internal.pgo_busy = FALSE;
2162 vm_pageout_queue_internal.pgo_throttled = FALSE;
2163
2164
2165 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &thread);
2166 if (result != KERN_SUCCESS)
2167 panic("vm_pageout_iothread_internal: create failed");
2168
2169 thread_deallocate(thread);
2170
2171
2172 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_PREEMPT - 1, &thread);
2173 if (result != KERN_SUCCESS)
2174 panic("vm_pageout_iothread_external: create failed");
2175
2176 thread_deallocate(thread);
2177
2178
2179 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_PREEMPT - 2, &thread);
2180 if (result != KERN_SUCCESS)
2181 panic("vm_pageout_garbage_collect: create failed");
2182
2183 thread_deallocate(thread);
2184
2185
2186 vm_pageout_continue();
2187 /*NOTREACHED*/
2188 }
2189
2190
2191 static upl_t
2192 upl_create(
2193 int flags,
2194 upl_size_t size)
2195 {
2196 upl_t upl;
2197 int page_field_size; /* bit field in word size buf */
2198
2199 page_field_size = 0;
2200 if (flags & UPL_CREATE_LITE) {
2201 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2202 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2203 }
2204 if(flags & UPL_CREATE_INTERNAL) {
2205 upl = (upl_t)kalloc(sizeof(struct upl)
2206 + (sizeof(struct upl_page_info)*(size/PAGE_SIZE))
2207 + page_field_size);
2208 } else {
2209 upl = (upl_t)kalloc(sizeof(struct upl) + page_field_size);
2210 }
2211 upl->flags = 0;
2212 upl->src_object = NULL;
2213 upl->kaddr = (vm_offset_t)0;
2214 upl->size = 0;
2215 upl->map_object = NULL;
2216 upl->ref_count = 1;
2217 upl_lock_init(upl);
2218 #ifdef UPL_DEBUG
2219 upl->ubc_alias1 = 0;
2220 upl->ubc_alias2 = 0;
2221 #endif /* UPL_DEBUG */
2222 return(upl);
2223 }
2224
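/*
 * The size arithmetic in upl_create() above packs three things into one
 * kalloc'd buffer for an internal lite UPL: the upl struct, the
 * upl_page_info array, and a bitmap ("lite list") with one bit per page,
 * rounded up to whole 32-bit words.  The standalone sketch below (not
 * part of the kernel build) reproduces that arithmetic with an assumed
 * 4K page size, and shows the word/bit indexing used later when the lite
 * list is populated (pg_num >> 5, 1 << (pg_num & 31)).
 */
#if 0 /* illustrative sketch only */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096			/* assumption for the example */

static size_t
sketch_lite_bitmap_bytes(size_t upl_bytes)
{
	size_t nbits = upl_bytes / SKETCH_PAGE_SIZE;	/* one bit per page */
	size_t nbytes = (nbits + 7) >> 3;		/* round up to bytes */
	return (nbytes + 3) & ~(size_t)3;		/* round up to 32-bit words */
}

static void
sketch_lite_bit_set(uint32_t *lite_list, unsigned int pg_num)
{
	lite_list[pg_num >> 5] |= 1u << (pg_num & 31);	/* same indexing as UPL_SET_LITE */
}

int
main(void)
{
	size_t size = 13 * SKETCH_PAGE_SIZE;		/* a 13-page UPL */
	printf("bitmap bytes = %zu\n", sketch_lite_bitmap_bytes(size));	/* prints 4 */
	return 0;
}
#endif
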
2225 static void
2226 upl_destroy(
2227 upl_t upl)
2228 {
2229 int page_field_size; /* bit field in word size buf */
2230
2231 #ifdef UPL_DEBUG
2232 {
2233 upl_t upl_ele;
2234 vm_object_t object;
2235 if (upl->map_object->pageout) {
2236 object = upl->map_object->shadow;
2237 } else {
2238 object = upl->map_object;
2239 }
2240 vm_object_lock(object);
2241 queue_iterate(&object->uplq, upl_ele, upl_t, uplq) {
2242 if(upl_ele == upl) {
2243 queue_remove(&object->uplq,
2244 upl_ele, upl_t, uplq);
2245 break;
2246 }
2247 }
2248 vm_object_unlock(object);
2249 }
2250 #endif /* UPL_DEBUG */
2251 /* drop a reference on the map_object whether or */
2252 /* not a pageout object is inserted */
2253 if(upl->map_object->pageout)
2254 vm_object_deallocate(upl->map_object);
2255
2256 page_field_size = 0;
2257 if (upl->flags & UPL_LITE) {
2258 page_field_size = ((upl->size/PAGE_SIZE) + 7) >> 3;
2259 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2260 }
2261 if(upl->flags & UPL_INTERNAL) {
2262 kfree(upl,
2263 sizeof(struct upl) +
2264 (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE))
2265 + page_field_size);
2266 } else {
2267 kfree(upl, sizeof(struct upl) + page_field_size);
2268 }
2269 }
2270
2271 void uc_upl_dealloc(upl_t upl);
2272 __private_extern__ void
2273 uc_upl_dealloc(
2274 upl_t upl)
2275 {
2276 upl->ref_count -= 1;
2277 if(upl->ref_count == 0) {
2278 upl_destroy(upl);
2279 }
2280 }
2281
2282 void
2283 upl_deallocate(
2284 upl_t upl)
2285 {
2286
2287 upl->ref_count -= 1;
2288 if(upl->ref_count == 0) {
2289 upl_destroy(upl);
2290 }
2291 }
2292
2293 /*
2294 * Statistics about UPL enforcement of copy-on-write obligations.
2295 */
2296 unsigned long upl_cow = 0;
2297 unsigned long upl_cow_again = 0;
2298 unsigned long upl_cow_contiguous = 0;
2299 unsigned long upl_cow_pages = 0;
2300 unsigned long upl_cow_again_pages = 0;
2301 unsigned long upl_cow_contiguous_pages = 0;
2302
2303 /*
2304 * Routine: vm_object_upl_request
2305 * Purpose:
2306 * Cause the population of a portion of a vm_object.
2307 * Depending on the nature of the request, the pages
2308 * returned may contain valid data or be uninitialized.
2309 * A page list structure, listing the physical pages
2310 * will be returned upon request.
2311 * This function is called by the file system or any other
2312 * supplier of backing store to a pager.
2313 * IMPORTANT NOTE: The caller must still respect the relationship
2314 * between the vm_object and its backing memory object. The
2315 * caller MUST NOT substitute changes in the backing file
2316 * without first doing a memory_object_lock_request on the
2317 * target range unless it is known that the pages are not
2318 * shared with another entity at the pager level.
2319 * Copy_in_to:
2320 * if a page list structure is present
2321 * return the mapped physical pages, where a
2322 * page is not present, return a non-initialized
2323 * one. If the no_sync bit is turned on, don't
2324 * call the pager unlock to synchronize with other
2325 * possible copies of the page. Leave pages busy
2326 * in the original object, if a page list structure
2327 * was specified. When a commit of the page list
2328 * pages is done, the dirty bit will be set for each one.
2329 * Copy_out_from:
2330 * If a page list structure is present, return
2331 * all mapped pages. Where a page does not exist
2332 * map a zero filled one. Leave pages busy in
2333 * the original object. If a page list structure
2334 * is not specified, this call is a no-op.
2335 *
2336 * Note: access of default pager objects has a rather interesting
2337 * twist. The caller of this routine, presumably the file system
2338 * page cache handling code, will never actually make a request
2339 * against a default pager backed object. Only the default
2340 * pager will make requests on backing store related vm_objects.
2341 * In this way the default pager can maintain the relationship
2342 * between backing store files (abstract memory objects) and
2343 * the vm_objects (cache objects) they support.
2344 *
2345 */
2346
2347 __private_extern__ kern_return_t
2348 vm_object_upl_request(
2349 vm_object_t object,
2350 vm_object_offset_t offset,
2351 upl_size_t size,
2352 upl_t *upl_ptr,
2353 upl_page_info_array_t user_page_list,
2354 unsigned int *page_list_count,
2355 int cntrl_flags)
2356 {
2357 vm_page_t dst_page = VM_PAGE_NULL;
2358 vm_object_offset_t dst_offset = offset;
2359 upl_size_t xfer_size = size;
2360 boolean_t do_m_lock = FALSE;
2361 boolean_t dirty;
2362 boolean_t hw_dirty;
2363 upl_t upl = NULL;
2364 unsigned int entry;
2365 #if MACH_CLUSTER_STATS
2366 boolean_t encountered_lrp = FALSE;
2367 #endif
2368 vm_page_t alias_page = NULL;
2369 int page_ticket;
2370 int refmod_state;
2371 wpl_array_t lite_list = NULL;
2372 vm_object_t last_copy_object;
2373
2374
2375 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2376 /*
2377 * For forward compatibility's sake,
2378 * reject any unknown flag.
2379 */
2380 return KERN_INVALID_VALUE;
2381 }
2382
2383 page_ticket = (cntrl_flags & UPL_PAGE_TICKET_MASK)
2384 >> UPL_PAGE_TICKET_SHIFT;
2385
2386 if(((size/PAGE_SIZE) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
2387 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2388 }
2389
2390 if(cntrl_flags & UPL_SET_INTERNAL)
2391 if(page_list_count != NULL)
2392 *page_list_count = MAX_UPL_TRANSFER;
2393
2394 if((!object->internal) && (object->paging_offset != 0))
2395 panic("vm_object_upl_request: vnode object with non-zero paging offset\n");
2396
2397 if((cntrl_flags & UPL_COPYOUT_FROM) && (upl_ptr == NULL)) {
2398 return KERN_SUCCESS;
2399 }
2400
2401 vm_object_lock(object);
2402 vm_object_paging_begin(object);
2403 vm_object_unlock(object);
2404
2405 if(upl_ptr) {
2406 if(cntrl_flags & UPL_SET_INTERNAL) {
2407 if(cntrl_flags & UPL_SET_LITE) {
2408 uintptr_t page_field_size;
2409 upl = upl_create(
2410 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
2411 size);
2412 user_page_list = (upl_page_info_t *)
2413 (((uintptr_t)upl) + sizeof(struct upl));
2414 lite_list = (wpl_array_t)
2415 (((uintptr_t)user_page_list) +
2416 ((size/PAGE_SIZE) *
2417 sizeof(upl_page_info_t)));
2418 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2419 page_field_size =
2420 (page_field_size + 3) & 0xFFFFFFFC;
2421 bzero((char *)lite_list, page_field_size);
2422 upl->flags =
2423 UPL_LITE | UPL_INTERNAL;
2424 } else {
2425 upl = upl_create(UPL_CREATE_INTERNAL, size);
2426 user_page_list = (upl_page_info_t *)
2427 (((uintptr_t)upl) + sizeof(struct upl));
2428 upl->flags = UPL_INTERNAL;
2429 }
2430 } else {
2431 if(cntrl_flags & UPL_SET_LITE) {
2432 uintptr_t page_field_size;
2433 upl = upl_create(UPL_CREATE_LITE, size);
2434 lite_list = (wpl_array_t)
2435 (((uintptr_t)upl) + sizeof(struct upl));
2436 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2437 page_field_size =
2438 (page_field_size + 3) & 0xFFFFFFFC;
2439 bzero((char *)lite_list, page_field_size);
2440 upl->flags = UPL_LITE;
2441 } else {
2442 upl = upl_create(UPL_CREATE_EXTERNAL, size);
2443 upl->flags = 0;
2444 }
2445 }
2446
2447 if (object->phys_contiguous) {
2448 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2449 object->copy != VM_OBJECT_NULL) {
2450 /* Honor copy-on-write obligations */
2451
2452 /*
2453 * XXX FBDP
2454 * We could still have a race...
2455 * A is here building the UPL for a write().
2456 * A pushes the pages to the current copy
2457 * object.
2458 * A returns the UPL to the caller.
2459 * B comes along and establishes another
2460 * private mapping on this object, inserting
2461 * a new copy object between the original
2462 * object and the old copy object.
2463 * B reads a page and gets the original contents
2464 * from the original object.
2465 * A modifies the page in the original object.
2466 * B reads the page again and sees A's changes,
2467 * which is wrong...
2468 *
2469 * The problem is that the pages are not
2470 * marked "busy" in the original object, so
2471 * nothing prevents B from reading the page
2472 * before A's changes are completed.
2473 *
2474 * The "paging_in_progress" might protect us
2475 * from the insertion of a new copy object
2476 * though... To be verified.
2477 */
2478 vm_object_lock_request(object,
2479 offset,
2480 size,
2481 FALSE,
2482 MEMORY_OBJECT_COPY_SYNC,
2483 VM_PROT_NO_CHANGE);
2484 upl_cow_contiguous++;
2485 upl_cow_contiguous_pages += size >> PAGE_SHIFT;
2486 }
2487
2488 upl->map_object = object;
2489 /* don't need any shadow mappings for this one */
2490 /* since it is already I/O memory */
2491 upl->flags |= UPL_DEVICE_MEMORY;
2492
2493
2494 /* paging_in_progress protects paging_offset */
2495 upl->offset = offset + object->paging_offset;
2496 upl->size = size;
2497 *upl_ptr = upl;
2498 if(user_page_list) {
2499 user_page_list[0].phys_addr =
2500 (offset + object->shadow_offset)>>PAGE_SHIFT;
2501 user_page_list[0].device = TRUE;
2502 }
2503
2504 if(page_list_count != NULL) {
2505 if (upl->flags & UPL_INTERNAL) {
2506 *page_list_count = 0;
2507 } else {
2508 *page_list_count = 1;
2509 }
2510 }
2511
2512 return KERN_SUCCESS;
2513 }
2514
2515 if(user_page_list)
2516 user_page_list[0].device = FALSE;
2517
2518 if(cntrl_flags & UPL_SET_LITE) {
2519 upl->map_object = object;
2520 } else {
2521 upl->map_object = vm_object_allocate(size);
2522 /*
2523 * No need to lock the new object: nobody else knows
2524 * about it yet, so it's all ours so far.
2525 */
2526 upl->map_object->shadow = object;
2527 upl->map_object->pageout = TRUE;
2528 upl->map_object->can_persist = FALSE;
2529 upl->map_object->copy_strategy =
2530 MEMORY_OBJECT_COPY_NONE;
2531 upl->map_object->shadow_offset = offset;
2532 upl->map_object->wimg_bits = object->wimg_bits;
2533 }
2534
2535 }
2536 if (!(cntrl_flags & UPL_SET_LITE)) {
2537 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2538 }
2539
2540 /*
2541 * ENCRYPTED SWAP:
2542 * Just mark the UPL as "encrypted" here.
2543 * We'll actually encrypt the pages later,
2544 * in upl_encrypt(), when the caller has
2545 * selected which pages need to go to swap.
2546 */
2547 if (cntrl_flags & UPL_ENCRYPT) {
2548 upl->flags |= UPL_ENCRYPTED;
2549 }
2550 if (cntrl_flags & UPL_FOR_PAGEOUT) {
2551 upl->flags |= UPL_PAGEOUT;
2552 }
2553 vm_object_lock(object);
2554
2555 /* we can lock in the paging_offset once paging_in_progress is set */
2556 if(upl_ptr) {
2557 upl->size = size;
2558 upl->offset = offset + object->paging_offset;
2559 *upl_ptr = upl;
2560 #ifdef UPL_DEBUG
2561 queue_enter(&object->uplq, upl, upl_t, uplq);
2562 #endif /* UPL_DEBUG */
2563 }
2564
2565 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2566 object->copy != VM_OBJECT_NULL) {
2567 /* Honor copy-on-write obligations */
2568
2569 /*
2570 * The caller is gathering these pages and
2571 * might modify their contents. We need to
2572 * make sure that the copy object has its own
2573 * private copies of these pages before we let
2574 * the caller modify them.
2575 */
2576 vm_object_update(object,
2577 offset,
2578 size,
2579 NULL,
2580 NULL,
2581 FALSE, /* should_return */
2582 MEMORY_OBJECT_COPY_SYNC,
2583 VM_PROT_NO_CHANGE);
2584 upl_cow++;
2585 upl_cow_pages += size >> PAGE_SHIFT;
2586
2587 }
2588 /* remember which copy object we synchronized with */
2589 last_copy_object = object->copy;
2590
2591 entry = 0;
2592 if(cntrl_flags & UPL_COPYOUT_FROM) {
2593 upl->flags |= UPL_PAGE_SYNC_DONE;
2594
2595 while (xfer_size) {
2596 if((alias_page == NULL) &&
2597 !(cntrl_flags & UPL_SET_LITE)) {
2598 vm_object_unlock(object);
2599 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2600 vm_object_lock(object);
2601 }
2602 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2603 dst_page->fictitious ||
2604 dst_page->absent ||
2605 dst_page->error ||
2606 (dst_page->wire_count && !dst_page->pageout) ||
2607
2608 ((!dst_page->inactive) && (cntrl_flags & UPL_FOR_PAGEOUT) &&
2609 (dst_page->page_ticket != page_ticket) &&
2610 ((dst_page->page_ticket+1) != page_ticket)) ) {
2611
2612 if (user_page_list)
2613 user_page_list[entry].phys_addr = 0;
2614 } else {
2615 /*
2616 * grab this up front...
2617 * a high percentage of the time we're going to
2618 * need the hardware modification state a bit later
2619 * anyway... so we can eliminate an extra call into
2620 * the pmap layer by grabbing it here and recording it
2621 */
2622 refmod_state = pmap_get_refmod(dst_page->phys_page);
2623
2624 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2625 /*
2626 * we're only asking for DIRTY pages to be returned
2627 */
2628
2629 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2630 /*
2631 * if this is the page stolen by vm_pageout_scan to be
2632 * cleaned (as opposed to a buddy being clustered in),
2633 * or this request is not being driven by a PAGEOUT cluster,
2634 * then we only need to check for the page being dirty or
2635 * precious to decide whether to return it
2636 */
2637 if (dst_page->dirty || dst_page->precious ||
2638 (refmod_state & VM_MEM_MODIFIED)) {
2639 goto check_busy;
2640 }
2641 }
2642 /*
2643 * this is a request for a PAGEOUT cluster and this page
2644 * is merely along for the ride as a 'buddy'... not only
2645 * does it have to be dirty to be returned, but it also
2646 * can't have been referenced recently... note that we've
2647 * already filtered above based on whether this page is
2648 * currently on the inactive queue or it meets the page
2649 * ticket (generation count) check
2650 */
2651 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2652 ((refmod_state & VM_MEM_MODIFIED) ||
2653 dst_page->dirty || dst_page->precious) ) {
2654 goto check_busy;
2655 }
2656 /*
2657 * if we reach here, we're not to return
2658 * the page... go on to the next one
2659 */
2660 if (user_page_list)
2661 user_page_list[entry].phys_addr = 0;
2662 entry++;
2663 dst_offset += PAGE_SIZE_64;
2664 xfer_size -= PAGE_SIZE;
2665 continue;
2666 }
2667 check_busy:
2668 if(dst_page->busy &&
2669 (!(dst_page->list_req_pending &&
2670 dst_page->pageout))) {
2671 if(cntrl_flags & UPL_NOBLOCK) {
2672 if(user_page_list) {
2673 user_page_list[entry].phys_addr = 0;
2674 }
2675 entry++;
2676 dst_offset += PAGE_SIZE_64;
2677 xfer_size -= PAGE_SIZE;
2678 continue;
2679 }
2680 /*
2681 * someone else is playing with the
2682 * page. We will have to wait.
2683 */
2684 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2685 continue;
2686 }
2687 /* Someone else already cleaning the page? */
2688 if((dst_page->cleaning || dst_page->absent ||
2689 dst_page->wire_count != 0) &&
2690 !dst_page->list_req_pending) {
2691 if(user_page_list) {
2692 user_page_list[entry].phys_addr = 0;
2693 }
2694 entry++;
2695 dst_offset += PAGE_SIZE_64;
2696 xfer_size -= PAGE_SIZE;
2697 continue;
2698 }
2699 /* eliminate all mappings from the */
2700 /* original object and its progeny */
2701
2702 vm_page_lock_queues();
2703
2704 if (dst_page->pageout_queue == TRUE)
2705 /*
2706 * we've buddied up a page for a clustered pageout
2707 * that has already been moved to the pageout
2708 * queue by pageout_scan... we need to remove
2709 * it from the queue and drop the laundry count
2710 * on that queue
2711 */
2712 vm_pageout_queue_steal(dst_page);
2713 #if MACH_CLUSTER_STATS
2714 /* pageout statistics gathering. count */
2715 /* all the pages we will page out that */
2716 /* were not counted in the initial */
2717 /* vm_pageout_scan work */
2718 if(dst_page->list_req_pending)
2719 encountered_lrp = TRUE;
2720 if((dst_page->dirty ||
2721 (dst_page->object->internal &&
2722 dst_page->precious)) &&
2723 (dst_page->list_req_pending
2724 == FALSE)) {
2725 if(encountered_lrp) {
2726 CLUSTER_STAT
2727 (pages_at_higher_offsets++;)
2728 } else {
2729 CLUSTER_STAT
2730 (pages_at_lower_offsets++;)
2731 }
2732 }
2733 #endif
2734 /* Turn off busy indication on pending */
2735 /* pageout. Note: we can only get here */
2736 /* in the request pending case. */
2737 dst_page->list_req_pending = FALSE;
2738 dst_page->busy = FALSE;
2739 dst_page->cleaning = FALSE;
2740
2741 hw_dirty = refmod_state & VM_MEM_MODIFIED;
2742 dirty = hw_dirty ? TRUE : dst_page->dirty;
2743
2744 if(cntrl_flags & UPL_SET_LITE) {
2745 int pg_num;
2746 pg_num = (dst_offset-offset)/PAGE_SIZE;
2747 lite_list[pg_num>>5] |=
2748 1 << (pg_num & 31);
2749 if (hw_dirty)
2750 pmap_clear_modify(dst_page->phys_page);
2751 /*
2752 * Record that this page has been
2753 * written out
2754 */
2755 #if MACH_PAGEMAP
2756 vm_external_state_set(
2757 object->existence_map,
2758 dst_page->offset);
2759 #endif /*MACH_PAGEMAP*/
2760
2761 /*
2762 * Mark original page as cleaning
2763 * in place.
2764 */
2765 dst_page->cleaning = TRUE;
2766 dst_page->dirty = TRUE;
2767 dst_page->precious = FALSE;
2768 } else {
2769 /* use pageclean setup, it is more */
2770 /* convenient even for the pageout */
2771 /* cases here */
2772
2773 vm_object_lock(upl->map_object);
2774 vm_pageclean_setup(dst_page,
2775 alias_page, upl->map_object,
2776 size - xfer_size);
2777 vm_object_unlock(upl->map_object);
2778
2779 alias_page->absent = FALSE;
2780 alias_page = NULL;
2781 }
2782
2783 if(!dirty) {
2784 dst_page->dirty = FALSE;
2785 dst_page->precious = TRUE;
2786 }
2787
2788 if(dst_page->pageout)
2789 dst_page->busy = TRUE;
2790
2791 if ( (cntrl_flags & UPL_ENCRYPT) ) {
2792 /*
2793 * ENCRYPTED SWAP:
2794 * We want to deny access to the target page
2795 * because its contents are about to be
2796 * encrypted and the user would be very
2797 * confused to see encrypted data instead
2798 * of their data.
2799 */
2800 dst_page->busy = TRUE;
2801 }
2802 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2803 /*
2804 * deny access to the target page
2805 * while it is being worked on
2806 */
2807 if ((!dst_page->pageout) &&
2808 (dst_page->wire_count == 0)) {
2809 dst_page->busy = TRUE;
2810 dst_page->pageout = TRUE;
2811 vm_page_wire(dst_page);
2812 }
2813 }
2814
2815 if(user_page_list) {
2816 user_page_list[entry].phys_addr
2817 = dst_page->phys_page;
2818 user_page_list[entry].dirty =
2819 dst_page->dirty;
2820 user_page_list[entry].pageout =
2821 dst_page->pageout;
2822 user_page_list[entry].absent =
2823 dst_page->absent;
2824 user_page_list[entry].precious =
2825 dst_page->precious;
2826 }
2827 vm_page_unlock_queues();
2828
2829 /*
2830 * ENCRYPTED SWAP:
2831 * The caller is gathering this page and might
2832 * access its contents later on. Decrypt the
2833 * page before adding it to the UPL, so that
2834 * the caller never sees encrypted data.
2835 */
2836 if (! (cntrl_flags & UPL_ENCRYPT) &&
2837 dst_page->encrypted) {
2838 assert(dst_page->busy);
2839
2840 vm_page_decrypt(dst_page, 0);
2841 vm_page_decrypt_for_upl_counter++;
2842
2843 /*
2844 * Retry this page, since anything
2845 * could have changed while we were
2846 * decrypting.
2847 */
2848 continue;
2849 }
2850 }
2851 entry++;
2852 dst_offset += PAGE_SIZE_64;
2853 xfer_size -= PAGE_SIZE;
2854 }
2855 } else {
2856 while (xfer_size) {
2857 if((alias_page == NULL) &&
2858 !(cntrl_flags & UPL_SET_LITE)) {
2859 vm_object_unlock(object);
2860 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2861 vm_object_lock(object);
2862 }
2863
2864 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2865 object->copy != last_copy_object) {
2866 /* Honor copy-on-write obligations */
2867
2868 /*
2869 * The copy object has changed since we
2870 * last synchronized for copy-on-write.
2871 * Another copy object might have been
2872 * inserted while we released the object's
2873 * lock. Since someone could have seen the
2874 * original contents of the remaining pages
2875 * through that new object, we have to
2876 * synchronize with it again for the remaining
2877 * pages only. The previous pages are "busy"
2878 * so they can not be seen through the new
2879 * mapping. The new mapping will see our
2880 * upcoming changes for those previous pages,
2881 * but that's OK since they couldn't see what
2882 * was there before. It's just a race anyway
2883 * and there's no guarantee of consistency or
2884 * atomicity. We just don't want new mappings
2885 * to see both the *before* and *after* pages.
2886 */
2887 if (object->copy != VM_OBJECT_NULL) {
2888 vm_object_update(
2889 object,
2890 dst_offset,/* current offset */
2891 xfer_size, /* remaining size */
2892 NULL,
2893 NULL,
2894 FALSE, /* should_return */
2895 MEMORY_OBJECT_COPY_SYNC,
2896 VM_PROT_NO_CHANGE);
2897 upl_cow_again++;
2898 upl_cow_again_pages +=
2899 xfer_size >> PAGE_SHIFT;
2900 }
2901 /* remember the copy object we synced with */
2902 last_copy_object = object->copy;
2903 }
2904
2905 dst_page = vm_page_lookup(object, dst_offset);
2906
2907 if(dst_page != VM_PAGE_NULL) {
2908 if((cntrl_flags & UPL_RET_ONLY_ABSENT) &&
2909 !((dst_page->list_req_pending)
2910 && (dst_page->absent))) {
2911 /* we are doing extended range */
2912 /* requests. we want to grab */
2913 /* pages around some which are */
2914 /* already present. */
2915 if(user_page_list) {
2916 user_page_list[entry].phys_addr = 0;
2917 }
2918 entry++;
2919 dst_offset += PAGE_SIZE_64;
2920 xfer_size -= PAGE_SIZE;
2921 continue;
2922 }
2923 if((dst_page->cleaning) &&
2924 !(dst_page->list_req_pending)) {
2925 /*someone else is writing to the */
2926 /* page. We will have to wait. */
2927 PAGE_SLEEP(object,dst_page,THREAD_UNINT);
2928 continue;
2929 }
2930 if ((dst_page->fictitious &&
2931 dst_page->list_req_pending)) {
2932 /* dump the fictitious page */
2933 dst_page->list_req_pending = FALSE;
2934 dst_page->clustered = FALSE;
2935
2936 vm_page_lock_queues();
2937 vm_page_free(dst_page);
2938 vm_page_unlock_queues();
2939
2940 dst_page = NULL;
2941 } else if ((dst_page->absent &&
2942 dst_page->list_req_pending)) {
2943 /* the default_pager case */
2944 dst_page->list_req_pending = FALSE;
2945 dst_page->busy = FALSE;
2946 }
2947 }
2948 if(dst_page == VM_PAGE_NULL) {
2949 if(object->private) {
2950 /*
2951 * This is a nasty wrinkle for users
2952 * of upl who encounter device or
2953 * private memory; however, it is
2954 * unavoidable. Only a fault can
2955 * resolve the actual backing
2956 * physical page by asking the
2957 * backing device.
2958 */
2959 if(user_page_list) {
2960 user_page_list[entry].phys_addr = 0;
2961 }
2962 entry++;
2963 dst_offset += PAGE_SIZE_64;
2964 xfer_size -= PAGE_SIZE;
2965 continue;
2966 }
2967 /* need to allocate a page */
2968 dst_page = vm_page_alloc(object, dst_offset);
2969 if (dst_page == VM_PAGE_NULL) {
2970 vm_object_unlock(object);
2971 VM_PAGE_WAIT();
2972 vm_object_lock(object);
2973 continue;
2974 }
2975 dst_page->busy = FALSE;
2976 #if 0
2977 if(cntrl_flags & UPL_NO_SYNC) {
2978 dst_page->page_lock = 0;
2979 dst_page->unlock_request = 0;
2980 }
2981 #endif
2982 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
2983 /*
2984 * if UPL_RET_ONLY_ABSENT was specified,
2985 * then we're definitely setting up a
2986 * UPL for a clustered read/pagein
2987 * operation... mark the pages as clustered
2988 * so vm_fault can correctly attribute them
2989 * to the 'pagein' bucket the first time
2990 * a fault happens on them
2991 */
2992 dst_page->clustered = TRUE;
2993 }
2994 dst_page->absent = TRUE;
2995 object->absent_count++;
2996 }
2997 #if 1
2998 if(cntrl_flags & UPL_NO_SYNC) {
2999 dst_page->page_lock = 0;
3000 dst_page->unlock_request = 0;
3001 }
3002 #endif /* 1 */
3003
3004 /*
3005 * ENCRYPTED SWAP:
3006 */
3007 if (cntrl_flags & UPL_ENCRYPT) {
3008 /*
3009 * The page is going to be encrypted when we
3010 * get it from the pager, so mark it so.
3011 */
3012 dst_page->encrypted = TRUE;
3013 } else {
3014 /*
3015 * Otherwise, the page will not contain
3016 * encrypted data.
3017 */
3018 dst_page->encrypted = FALSE;
3019 }
3020
3021 dst_page->overwriting = TRUE;
3022 if(dst_page->fictitious) {
3023 panic("need corner case for fictitious page");
3024 }
3025 if(dst_page->page_lock) {
3026 do_m_lock = TRUE;
3027 }
3028 if(upl_ptr) {
3029
3030 /* eliminate all mappings from the */
3031 /* original object and its progeny */
3032
3033 if(dst_page->busy) {
3034 /*someone else is playing with the */
3035 /* page. We will have to wait. */
3036 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3037 continue;
3038 }
3039 vm_page_lock_queues();
3040
3041 if( !(cntrl_flags & UPL_FILE_IO))
3042 hw_dirty = pmap_disconnect(dst_page->phys_page) & VM_MEM_MODIFIED;
3043 else
3044 hw_dirty = pmap_get_refmod(dst_page->phys_page) & VM_MEM_MODIFIED;
3045 dirty = hw_dirty ? TRUE : dst_page->dirty;
3046
3047 if(cntrl_flags & UPL_SET_LITE) {
3048 int pg_num;
3049 pg_num = (dst_offset-offset)/PAGE_SIZE;
3050 lite_list[pg_num>>5] |=
3051 1 << (pg_num & 31);
3052 if (hw_dirty)
3053 pmap_clear_modify(dst_page->phys_page);
3054 /*
3055 * Record that this page has been
3056 * written out
3057 */
3058 #if MACH_PAGEMAP
3059 vm_external_state_set(
3060 object->existence_map,
3061 dst_page->offset);
3062 #endif /*MACH_PAGEMAP*/
3063
3064 /*
3065 * Mark original page as cleaning
3066 * in place.
3067 */
3068 dst_page->cleaning = TRUE;
3069 dst_page->dirty = TRUE;
3070 dst_page->precious = FALSE;
3071 } else {
3072 /* use pageclean setup, it is more */
3073 /* convenient even for the pageout */
3074 /* cases here */
3075 vm_object_lock(upl->map_object);
3076 vm_pageclean_setup(dst_page,
3077 alias_page, upl->map_object,
3078 size - xfer_size);
3079 vm_object_unlock(upl->map_object);
3080
3081 alias_page->absent = FALSE;
3082 alias_page = NULL;
3083 }
3084
3085 if(cntrl_flags & UPL_CLEAN_IN_PLACE) {
3086 /* clean in place for read implies */
3087 /* that a write will be done on all */
3088 /* the pages that are dirty before */
3089 /* a upl commit is done. The caller */
3090 /* is obligated to preserve the */
3091 /* contents of all pages marked */
3092 /* dirty. */
3093 upl->flags |= UPL_CLEAR_DIRTY;
3094 }
3095
3096 if(!dirty) {
3097 dst_page->dirty = FALSE;
3098 dst_page->precious = TRUE;
3099 }
3100
3101 if (dst_page->wire_count == 0) {
3102 /* deny access to the target page while */
3103 /* it is being worked on */
3104 dst_page->busy = TRUE;
3105 } else {
3106 vm_page_wire(dst_page);
3107 }
3108 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
3109 /*
3110 * expect the page not to be used
3111 * since it's coming in as part
3112 * of a cluster and could be
3113 * speculative... pages that
3114 * are 'consumed' will get a
3115 * hardware reference
3116 */
3117 dst_page->reference = FALSE;
3118 } else {
3119 /*
3120 * expect the page to be used
3121 */
3122 dst_page->reference = TRUE;
3123 }
3124 dst_page->precious =
3125 (cntrl_flags & UPL_PRECIOUS)
3126 ? TRUE : FALSE;
3127 if(user_page_list) {
3128 user_page_list[entry].phys_addr
3129 = dst_page->phys_page;
3130 user_page_list[entry].dirty =
3131 dst_page->dirty;
3132 user_page_list[entry].pageout =
3133 dst_page->pageout;
3134 user_page_list[entry].absent =
3135 dst_page->absent;
3136 user_page_list[entry].precious =
3137 dst_page->precious;
3138 }
3139 vm_page_unlock_queues();
3140 }
3141 entry++;
3142 dst_offset += PAGE_SIZE_64;
3143 xfer_size -= PAGE_SIZE;
3144 }
3145 }
3146
3147 if (upl->flags & UPL_INTERNAL) {
3148 if(page_list_count != NULL)
3149 *page_list_count = 0;
3150 } else if (*page_list_count > entry) {
3151 if(page_list_count != NULL)
3152 *page_list_count = entry;
3153 }
3154
3155 if(alias_page != NULL) {
3156 vm_page_lock_queues();
3157 vm_page_free(alias_page);
3158 vm_page_unlock_queues();
3159 }
3160
3161 if(do_m_lock) {
3162 vm_prot_t access_required;
3163 /* call back all associated pages from other users of the pager */
3164 /* all future updates will be on data which is based on the */
3165 /* changes we are going to make here. Note: it is assumed that */
3166 /* we already hold copies of the data so we will not be seeing */
3167 /* an avalanche of incoming data from the pager */
3168 access_required = (cntrl_flags & UPL_COPYOUT_FROM)
3169 ? VM_PROT_READ : VM_PROT_WRITE;
3170 while (TRUE) {
3171 kern_return_t rc;
3172
3173 if(!object->pager_ready) {
3174 wait_result_t wait_result;
3175
3176 wait_result = vm_object_sleep(object,
3177 VM_OBJECT_EVENT_PAGER_READY,
3178 THREAD_UNINT);
3179 if (wait_result != THREAD_AWAKENED) {
3180 vm_object_unlock(object);
3181 return KERN_FAILURE;
3182 }
3183 continue;
3184 }
3185
3186 vm_object_unlock(object);
3187 rc = memory_object_data_unlock(
3188 object->pager,
3189 dst_offset + object->paging_offset,
3190 size,
3191 access_required);
3192 if (rc != KERN_SUCCESS && rc != MACH_SEND_INTERRUPTED)
3193 return KERN_FAILURE;
3194 vm_object_lock(object);
3195
3196 if (rc == KERN_SUCCESS)
3197 break;
3198 }
3199
3200 /* let's wait on the last page requested */
3201 /* NOTE: we will have to update lock completed routine to signal */
3202 if(dst_page != VM_PAGE_NULL &&
3203 (access_required & dst_page->page_lock) != access_required) {
3204 PAGE_ASSERT_WAIT(dst_page, THREAD_UNINT);
3205 vm_object_unlock(object);
3206 thread_block(THREAD_CONTINUE_NULL);
3207 return KERN_SUCCESS;
3208 }
3209 }
3210
3211 vm_object_unlock(object);
3212 return KERN_SUCCESS;
3213 }
3214
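/*
 * A hypothetical in-kernel caller of vm_object_upl_request() might gather
 * the dirty pages of a range like this.  This is only a sketch of the
 * calling convention visible above (internal lite UPL, COPYOUT_FROM
 * direction, dirty pages only); the name sketch_gather_dirty and the flag
 * combination are assumptions, the object and range are supplied by the
 * caller, and the usual VM headers are assumed to be in scope.
 */
#if 0 /* illustrative sketch only */
static kern_return_t
sketch_gather_dirty(vm_object_t object, vm_object_offset_t offset,
                    upl_size_t size)
{
	upl_t			upl;
	upl_page_info_t		*pl;
	unsigned int		page_list_count = 0;
	mach_msg_type_number_t	count;
	boolean_t		empty;
	kern_return_t		kr;

	kr = vm_object_upl_request(object, offset, size,
				   &upl, NULL, &page_list_count,
				   UPL_SET_INTERNAL | UPL_SET_LITE |
				   UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY);
	if (kr != KERN_SUCCESS)
		return kr;

	/* the page list lives inside an internal UPL */
	pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	count = size / PAGE_SIZE;

	/* ... hand the gathered pages to the pager or device here ... */

	kr = upl_commit_range(upl, 0, size, 0, pl, count, &empty);
	upl_deallocate(upl);
	return kr;
}
#endif
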
3215 /* JMM - Backward compatibility for now */
3216 kern_return_t
3217 vm_fault_list_request( /* forward */
3218 memory_object_control_t control,
3219 vm_object_offset_t offset,
3220 upl_size_t size,
3221 upl_t *upl_ptr,
3222 upl_page_info_t **user_page_list_ptr,
3223 int page_list_count,
3224 int cntrl_flags);
3225 kern_return_t
3226 vm_fault_list_request(
3227 memory_object_control_t control,
3228 vm_object_offset_t offset,
3229 upl_size_t size,
3230 upl_t *upl_ptr,
3231 upl_page_info_t **user_page_list_ptr,
3232 int page_list_count,
3233 int cntrl_flags)
3234 {
3235 int local_list_count;
3236 upl_page_info_t *user_page_list;
3237 kern_return_t kr;
3238
3239 if (user_page_list_ptr != NULL) {
3240 local_list_count = page_list_count;
3241 user_page_list = *user_page_list_ptr;
3242 } else {
3243 local_list_count = 0;
3244 user_page_list = NULL;
3245 }
3246 kr = memory_object_upl_request(control,
3247 offset,
3248 size,
3249 upl_ptr,
3250 user_page_list,
3251 &local_list_count,
3252 cntrl_flags);
3253
3254 if(kr != KERN_SUCCESS)
3255 return kr;
3256
3257 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3258 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3259 }
3260
3261 return KERN_SUCCESS;
3262 }
3263
3264
3265
3266 /*
3267 * Routine: vm_object_super_upl_request
3268 * Purpose:
3269 * Cause the population of a portion of a vm_object
3270 * in much the same way as memory_object_upl_request.
3271 * Depending on the nature of the request, the pages
3272 * returned may contain valid data or be uninitialized.
3273 * However, the region may be expanded up to the super
3274 * cluster size provided.
3275 */
3276
3277 __private_extern__ kern_return_t
3278 vm_object_super_upl_request(
3279 vm_object_t object,
3280 vm_object_offset_t offset,
3281 upl_size_t size,
3282 upl_size_t super_cluster,
3283 upl_t *upl,
3284 upl_page_info_t *user_page_list,
3285 unsigned int *page_list_count,
3286 int cntrl_flags)
3287 {
3288 vm_page_t target_page;
3289 int ticket;
3290
3291
3292 if(object->paging_offset > offset)
3293 return KERN_FAILURE;
3294
3295 assert(object->paging_in_progress);
3296 offset = offset - object->paging_offset;
3297
3298 if(cntrl_flags & UPL_FOR_PAGEOUT) {
3299
3300 vm_object_lock(object);
3301
3302 if((target_page = vm_page_lookup(object, offset))
3303 != VM_PAGE_NULL) {
3304 ticket = target_page->page_ticket;
3305 cntrl_flags = cntrl_flags & ~(int)UPL_PAGE_TICKET_MASK;
3306 cntrl_flags = cntrl_flags |
3307 ((ticket << UPL_PAGE_TICKET_SHIFT)
3308 & UPL_PAGE_TICKET_MASK);
3309 }
3310 vm_object_unlock(object);
3311 }
3312
3313 if (super_cluster > size) {
3314
3315 vm_object_offset_t base_offset;
3316 upl_size_t super_size;
3317
3318 base_offset = (offset &
3319 ~((vm_object_offset_t) super_cluster - 1));
3320 super_size = (offset+size) > (base_offset + super_cluster) ?
3321 super_cluster<<1 : super_cluster;
3322 super_size = ((base_offset + super_size) > object->size) ?
3323 (object->size - base_offset) : super_size;
3324 if(offset > (base_offset + super_size))
3325 panic("vm_object_super_upl_request: Missed target pageout"
3326 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3327 offset, base_offset, super_size, super_cluster,
3328 size, object->paging_offset);
3329 /*
3330 * apparently there is a case where the vm requests a
3331 * page to be written out whose offset is beyond the
3332 * object size
3333 */
3334 if((offset + size) > (base_offset + super_size))
3335 super_size = (offset + size) - base_offset;
3336
3337 offset = base_offset;
3338 size = super_size;
3339 }
3340 return vm_object_upl_request(object, offset, size,
3341 upl, user_page_list, page_list_count,
3342 cntrl_flags);
3343 }
3344
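/*
 * The super-cluster expansion above rounds the requested range out to an
 * aligned "super cluster": base_offset is the request rounded down to a
 * super_cluster boundary, the size doubles if the original request spills
 * past the first cluster, and the result is clipped to the object size
 * (but never shrunk below the original request).  A standalone sketch of
 * that arithmetic, with assumed example values in the comments:
 */
#if 0 /* illustrative sketch only */
#include <stdint.h>

static void
sketch_super_cluster(uint64_t offset, uint32_t size,
                     uint32_t super_cluster, uint64_t object_size,
                     uint64_t *out_offset, uint32_t *out_size)
{
	uint64_t base_offset = offset & ~((uint64_t)super_cluster - 1);
	uint64_t super_size = (offset + size) > (base_offset + super_cluster) ?
	    (uint64_t)super_cluster << 1 : super_cluster;

	if (base_offset + super_size > object_size)	/* clip to the object */
		super_size = object_size - base_offset;
	if (offset + size > base_offset + super_size)	/* never shrink the request */
		super_size = (offset + size) - base_offset;

	*out_offset = base_offset;	  /* e.g. offset 0x11000, size 0x1000,     */
	*out_size = (uint32_t)super_size; /* 64K cluster -> offset 0x10000, 0x10000 */
}
#endif
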
3345
3346 kern_return_t
3347 vm_map_create_upl(
3348 vm_map_t map,
3349 vm_map_address_t offset,
3350 upl_size_t *upl_size,
3351 upl_t *upl,
3352 upl_page_info_array_t page_list,
3353 unsigned int *count,
3354 int *flags)
3355 {
3356 vm_map_entry_t entry;
3357 int caller_flags;
3358 int force_data_sync;
3359 int sync_cow_data;
3360 vm_object_t local_object;
3361 vm_map_offset_t local_offset;
3362 vm_map_offset_t local_start;
3363 kern_return_t ret;
3364
3365 caller_flags = *flags;
3366
3367 if (caller_flags & ~UPL_VALID_FLAGS) {
3368 /*
3369 * For forward compatibility's sake,
3370 * reject any unknown flag.
3371 */
3372 return KERN_INVALID_VALUE;
3373 }
3374
3375 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3376 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3377
3378 if(upl == NULL)
3379 return KERN_INVALID_ARGUMENT;
3380
3381
3382 REDISCOVER_ENTRY:
3383 vm_map_lock(map);
3384 if (vm_map_lookup_entry(map, offset, &entry)) {
3385 if (entry->object.vm_object == VM_OBJECT_NULL ||
3386 !entry->object.vm_object->phys_contiguous) {
3387 if((*upl_size/page_size) > MAX_UPL_TRANSFER) {
3388 *upl_size = MAX_UPL_TRANSFER * page_size;
3389 }
3390 }
3391 if((entry->vme_end - offset) < *upl_size) {
3392 *upl_size = entry->vme_end - offset;
3393 }
3394 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3395 if (entry->object.vm_object == VM_OBJECT_NULL) {
3396 *flags = 0;
3397 } else if (entry->object.vm_object->private) {
3398 *flags = UPL_DEV_MEMORY;
3399 if (entry->object.vm_object->phys_contiguous) {
3400 *flags |= UPL_PHYS_CONTIG;
3401 }
3402 } else {
3403 *flags = 0;
3404 }
3405 vm_map_unlock(map);
3406 return KERN_SUCCESS;
3407 }
3408 /*
3409 * Create an object if necessary.
3410 */
3411 if (entry->object.vm_object == VM_OBJECT_NULL) {
3412 entry->object.vm_object = vm_object_allocate(
3413 (vm_size_t)(entry->vme_end - entry->vme_start));
3414 entry->offset = 0;
3415 }
3416 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3417 if (!(entry->protection & VM_PROT_WRITE)) {
3418 vm_map_unlock(map);
3419 return KERN_PROTECTION_FAILURE;
3420 }
3421 if (entry->needs_copy) {
3422 vm_map_t local_map;
3423 vm_object_t object;
3424 vm_map_offset_t offset_hi;
3425 vm_map_offset_t offset_lo;
3426 vm_object_offset_t new_offset;
3427 vm_prot_t prot;
3428 boolean_t wired;
3429 vm_behavior_t behavior;
3430 vm_map_version_t version;
3431 vm_map_t real_map;
3432
3433 local_map = map;
3434 vm_map_lock_write_to_read(map);
3435 if(vm_map_lookup_locked(&local_map,
3436 offset, VM_PROT_WRITE,
3437 &version, &object,
3438 &new_offset, &prot, &wired,
3439 &behavior, &offset_lo,
3440 &offset_hi, &real_map)) {
3441 vm_map_unlock(local_map);
3442 return KERN_FAILURE;
3443 }
3444 if (real_map != map) {
3445 vm_map_unlock(real_map);
3446 }
3447 vm_object_unlock(object);
3448 vm_map_unlock(local_map);
3449
3450 goto REDISCOVER_ENTRY;
3451 }
3452 }
3453 if (entry->is_sub_map) {
3454 vm_map_t submap;
3455
3456 submap = entry->object.sub_map;
3457 local_start = entry->vme_start;
3458 local_offset = entry->offset;
3459 vm_map_reference(submap);
3460 vm_map_unlock(map);
3461
3462 ret = (vm_map_create_upl(submap,
3463 local_offset + (offset - local_start),
3464 upl_size, upl, page_list, count,
3465 flags));
3466
3467 vm_map_deallocate(submap);
3468 return ret;
3469 }
3470
3471 if (sync_cow_data) {
3472 if (entry->object.vm_object->shadow
3473 || entry->object.vm_object->copy) {
3474
3475 local_object = entry->object.vm_object;
3476 local_start = entry->vme_start;
3477 local_offset = entry->offset;
3478 vm_object_reference(local_object);
3479 vm_map_unlock(map);
3480
3481 if (entry->object.vm_object->shadow &&
3482 entry->object.vm_object->copy) {
3483 vm_object_lock_request(
3484 local_object->shadow,
3485 (vm_object_offset_t)
3486 ((offset - local_start) +
3487 local_offset) +
3488 local_object->shadow_offset,
3489 *upl_size, FALSE,
3490 MEMORY_OBJECT_DATA_SYNC,
3491 VM_PROT_NO_CHANGE);
3492 }
3493 sync_cow_data = FALSE;
3494 vm_object_deallocate(local_object);
3495 goto REDISCOVER_ENTRY;
3496 }
3497 }
3498
3499 if (force_data_sync) {
3500
3501 local_object = entry->object.vm_object;
3502 local_start = entry->vme_start;
3503 local_offset = entry->offset;
3504 vm_object_reference(local_object);
3505 vm_map_unlock(map);
3506
3507 vm_object_lock_request(
3508 local_object,
3509 (vm_object_offset_t)
3510 ((offset - local_start) + local_offset),
3511 (vm_object_size_t)*upl_size, FALSE,
3512 MEMORY_OBJECT_DATA_SYNC,
3513 VM_PROT_NO_CHANGE);
3514 force_data_sync = FALSE;
3515 vm_object_deallocate(local_object);
3516 goto REDISCOVER_ENTRY;
3517 }
3518
3519 if(!(entry->object.vm_object->private)) {
3520 if(*upl_size > (MAX_UPL_TRANSFER*PAGE_SIZE))
3521 *upl_size = (MAX_UPL_TRANSFER*PAGE_SIZE);
3522 if(entry->object.vm_object->phys_contiguous) {
3523 *flags = UPL_PHYS_CONTIG;
3524 } else {
3525 *flags = 0;
3526 }
3527 } else {
3528 *flags = UPL_DEV_MEMORY | UPL_PHYS_CONTIG;
3529 }
3530 local_object = entry->object.vm_object;
3531 local_offset = entry->offset;
3532 local_start = entry->vme_start;
3533 vm_object_reference(local_object);
3534 vm_map_unlock(map);
3535 if(caller_flags & UPL_SET_IO_WIRE) {
3536 ret = (vm_object_iopl_request(local_object,
3537 (vm_object_offset_t)
3538 ((offset - local_start)
3539 + local_offset),
3540 *upl_size,
3541 upl,
3542 page_list,
3543 count,
3544 caller_flags));
3545 } else {
3546 ret = (vm_object_upl_request(local_object,
3547 (vm_object_offset_t)
3548 ((offset - local_start)
3549 + local_offset),
3550 *upl_size,
3551 upl,
3552 page_list,
3553 count,
3554 caller_flags));
3555 }
3556 vm_object_deallocate(local_object);
3557 return(ret);
3558 }
3559
3560 vm_map_unlock(map);
3561 return(KERN_FAILURE);
3562
3563 }
3564
3565 /*
3566 * Internal routine to enter a UPL into a VM map.
3567 *
3568 * JMM - This should just be doable through the standard
3569 * vm_map_enter() API.
3570 */
3571 kern_return_t
3572 vm_map_enter_upl(
3573 vm_map_t map,
3574 upl_t upl,
3575 vm_map_offset_t *dst_addr)
3576 {
3577 vm_map_size_t size;
3578 vm_object_offset_t offset;
3579 vm_map_offset_t addr;
3580 vm_page_t m;
3581 kern_return_t kr;
3582
3583 if (upl == UPL_NULL)
3584 return KERN_INVALID_ARGUMENT;
3585
3586 upl_lock(upl);
3587
3588 /* check to see if already mapped */
3589 if(UPL_PAGE_LIST_MAPPED & upl->flags) {
3590 upl_unlock(upl);
3591 return KERN_FAILURE;
3592 }
3593
3594 if((!(upl->map_object->pageout)) &&
3595 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3596 (upl->map_object->phys_contiguous))) {
3597 vm_object_t object;
3598 vm_page_t alias_page;
3599 vm_object_offset_t new_offset;
3600 int pg_num;
3601 wpl_array_t lite_list;
3602
3603 if(upl->flags & UPL_INTERNAL) {
3604 lite_list = (wpl_array_t)
3605 ((((uintptr_t)upl) + sizeof(struct upl))
3606 + ((upl->size/PAGE_SIZE)
3607 * sizeof(upl_page_info_t)));
3608 } else {
3609 lite_list = (wpl_array_t)
3610 (((uintptr_t)upl) + sizeof(struct upl));
3611 }
3612 object = upl->map_object;
3613 upl->map_object = vm_object_allocate(upl->size);
3614 vm_object_lock(upl->map_object);
3615 upl->map_object->shadow = object;
3616 upl->map_object->pageout = TRUE;
3617 upl->map_object->can_persist = FALSE;
3618 upl->map_object->copy_strategy =
3619 MEMORY_OBJECT_COPY_NONE;
3620 upl->map_object->shadow_offset =
3621 upl->offset - object->paging_offset;
3622 upl->map_object->wimg_bits = object->wimg_bits;
3623 offset = upl->map_object->shadow_offset;
3624 new_offset = 0;
3625 size = upl->size;
3626
3627 vm_object_lock(object);
3628
3629 while(size) {
3630 pg_num = (new_offset)/PAGE_SIZE;
3631 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3632 vm_object_unlock(object);
3633 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3634 vm_object_lock(object);
3635 m = vm_page_lookup(object, offset);
3636 if (m == VM_PAGE_NULL) {
3637 panic("vm_upl_map: page missing\n");
3638 }
3639
3640 vm_object_paging_begin(object);
3641
3642 /*
3643 * Convert the fictitious page to a private
3644 * shadow of the real page.
3645 */
3646 assert(alias_page->fictitious);
3647 alias_page->fictitious = FALSE;
3648 alias_page->private = TRUE;
3649 alias_page->pageout = TRUE;
3650 alias_page->phys_page = m->phys_page;
3651
3652 vm_page_lock_queues();
3653 vm_page_wire(alias_page);
3654 vm_page_unlock_queues();
3655
3656 /*
3657 * ENCRYPTED SWAP:
3658 * The virtual page ("m") has to be wired in some way
3659 * here or its physical page ("m->phys_page") could
3660 * be recycled at any time.
3661 * Assuming this is enforced by the caller, we can't
3662 * get an encrypted page here. Since the encryption
3663 * key depends on the VM page's "pager" object and
3664 * the "paging_offset", we couldn't handle 2 pageable
3665 * VM pages (with different pagers and paging_offsets)
3666 * sharing the same physical page: we could end up
3667 * encrypting with one key (via one VM page) and
3668 * decrypting with another key (via the alias VM page).
3669 */
3670 ASSERT_PAGE_DECRYPTED(m);
3671
3672 vm_page_insert(alias_page,
3673 upl->map_object, new_offset);
3674 assert(!alias_page->wanted);
3675 alias_page->busy = FALSE;
3676 alias_page->absent = FALSE;
3677 }
3678
3679 size -= PAGE_SIZE;
3680 offset += PAGE_SIZE_64;
3681 new_offset += PAGE_SIZE_64;
3682 }
3683 vm_object_unlock(object);
3684 vm_object_unlock(upl->map_object);
3685 }
3686 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3687 offset = upl->offset - upl->map_object->paging_offset;
3688 else
3689 offset = 0;
3690
3691 size = upl->size;
3692
3693 vm_object_lock(upl->map_object);
3694 upl->map_object->ref_count++;
3695 vm_object_res_reference(upl->map_object);
3696 vm_object_unlock(upl->map_object);
3697
3698 *dst_addr = 0;
3699
3700
3701 /* NEED A UPL_MAP ALIAS */
3702 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3703 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3704 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3705
3706 if (kr != KERN_SUCCESS) {
3707 upl_unlock(upl);
3708 return(kr);
3709 }
3710
3711 vm_object_lock(upl->map_object);
3712
3713 for(addr=*dst_addr; size > 0; size-=PAGE_SIZE,addr+=PAGE_SIZE) {
3714 m = vm_page_lookup(upl->map_object, offset);
3715 if(m) {
3716 unsigned int cache_attr;
3717 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3718
3719 PMAP_ENTER(map->pmap, addr,
3720 m, VM_PROT_ALL,
3721 cache_attr, TRUE);
3722 }
3723 offset+=PAGE_SIZE_64;
3724 }
3725 vm_object_unlock(upl->map_object);
3726
3727 upl->ref_count++; /* hold a reference for the mapping */
3728 upl->flags |= UPL_PAGE_LIST_MAPPED;
3729 upl->kaddr = *dst_addr;
3730 upl_unlock(upl);
3731 return KERN_SUCCESS;
3732 }
3733
3734 /*
3735 * Internal routine to remove a UPL mapping from a VM map.
3736 *
3737 * XXX - This should just be doable through a standard
3738 * vm_map_remove() operation. Otherwise, implicit clean-up
3739 * of the target map won't be able to correctly remove
3740 * these (and release the reference on the UPL). Having
3741 * to do this means we can't map these into user-space
3742 * maps yet.
3743 */
3744 kern_return_t
3745 vm_map_remove_upl(
3746 vm_map_t map,
3747 upl_t upl)
3748 {
3749 vm_address_t addr;
3750 upl_size_t size;
3751
3752 if (upl == UPL_NULL)
3753 return KERN_INVALID_ARGUMENT;
3754
3755 upl_lock(upl);
3756 if(upl->flags & UPL_PAGE_LIST_MAPPED) {
3757 addr = upl->kaddr;
3758 size = upl->size;
3759 assert(upl->ref_count > 1);
3760 upl->ref_count--; /* removing mapping ref */
3761 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3762 upl->kaddr = (vm_offset_t) 0;
3763 upl_unlock(upl);
3764
3765 vm_map_remove( map,
3766 vm_map_trunc_page(addr),
3767 vm_map_round_page(addr + size),
3768 VM_MAP_NO_FLAGS);
3769 return KERN_SUCCESS;
3770 }
3771 upl_unlock(upl);
3772 return KERN_FAILURE;
3773 }
3774
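/*
 * vm_map_enter_upl() and vm_map_remove_upl() are meant to be used as a
 * pair: the first takes an extra reference on the UPL and records the
 * mapping address in upl->kaddr, the second drops both again before the
 * range is removed from the map.  A hypothetical caller that needs a
 * temporary kernel window onto a UPL's pages might look like the sketch
 * below; the name sketch_with_upl_mapping and the work callback are
 * assumptions, and error handling is trimmed.
 */
#if 0 /* illustrative sketch only */
static kern_return_t
sketch_with_upl_mapping(vm_map_t map, upl_t upl,
                        void (*work)(vm_map_offset_t addr, upl_size_t size))
{
	vm_map_offset_t	addr;
	kern_return_t	kr;

	kr = vm_map_enter_upl(map, upl, &addr);		/* map the pages */
	if (kr != KERN_SUCCESS)
		return kr;

	work(addr, upl->size);				/* touch them through the window */

	return vm_map_remove_upl(map, upl);		/* unmap and drop the mapping ref */
}
#endif
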
3775 kern_return_t
3776 upl_commit_range(
3777 upl_t upl,
3778 upl_offset_t offset,
3779 upl_size_t size,
3780 int flags,
3781 upl_page_info_t *page_list,
3782 mach_msg_type_number_t count,
3783 boolean_t *empty)
3784 {
3785 upl_size_t xfer_size = size;
3786 vm_object_t shadow_object;
3787 vm_object_t object = upl->map_object;
3788 vm_object_offset_t target_offset;
3789 int entry;
3790 wpl_array_t lite_list;
3791 int occupied;
3792 int delayed_unlock = 0;
3793 int clear_refmod = 0;
3794 boolean_t shadow_internal;
3795
3796 *empty = FALSE;
3797
3798 if (upl == UPL_NULL)
3799 return KERN_INVALID_ARGUMENT;
3800
3801
3802 if (count == 0)
3803 page_list = NULL;
3804
3805 if (object->pageout) {
3806 shadow_object = object->shadow;
3807 } else {
3808 shadow_object = object;
3809 }
3810
3811 upl_lock(upl);
3812
3813 if (upl->flags & UPL_ACCESS_BLOCKED) {
3814 /*
3815 * We used this UPL to block access to the pages by marking
3816 * them "busy". Now we need to clear the "busy" bit to allow
3817 * access to these pages again.
3818 */
3819 flags |= UPL_COMMIT_ALLOW_ACCESS;
3820 }
3821
3822 if (upl->flags & UPL_CLEAR_DIRTY)
3823 flags |= UPL_COMMIT_CLEAR_DIRTY;
3824
3825 if (upl->flags & UPL_DEVICE_MEMORY) {
3826 xfer_size = 0;
3827 } else if ((offset + size) > upl->size) {
3828 upl_unlock(upl);
3829 return KERN_FAILURE;
3830 }
3831
3832 if (upl->flags & UPL_INTERNAL) {
3833 lite_list = (wpl_array_t)
3834 ((((uintptr_t)upl) + sizeof(struct upl))
3835 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3836 } else {
3837 lite_list = (wpl_array_t)
3838 (((uintptr_t)upl) + sizeof(struct upl));
3839 }
3840 if (object != shadow_object)
3841 vm_object_lock(object);
3842 vm_object_lock(shadow_object);
3843
3844 shadow_internal = shadow_object->internal;
3845
3846 entry = offset/PAGE_SIZE;
3847 target_offset = (vm_object_offset_t)offset;
3848
3849 while (xfer_size) {
3850 vm_page_t t,m;
3851 upl_page_info_t *p;
3852
3853 m = VM_PAGE_NULL;
3854
3855 if (upl->flags & UPL_LITE) {
3856 int pg_num;
3857
3858 pg_num = target_offset/PAGE_SIZE;
3859
3860 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3861 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3862 m = vm_page_lookup(shadow_object,
3863 target_offset + (upl->offset -
3864 shadow_object->paging_offset));
3865 }
3866 }
3867 if (object->pageout) {
3868 if ((t = vm_page_lookup(object, target_offset)) != NULL) {
3869 t->pageout = FALSE;
3870
3871 if (delayed_unlock) {
3872 delayed_unlock = 0;
3873 vm_page_unlock_queues();
3874 }
3875 VM_PAGE_FREE(t);
3876
3877 if (m == NULL) {
3878 m = vm_page_lookup(
3879 shadow_object,
3880 target_offset +
3881 object->shadow_offset);
3882 }
3883 if (m != VM_PAGE_NULL)
3884 vm_object_paging_end(m->object);
3885 }
3886 }
3887 if (m != VM_PAGE_NULL) {
3888
3889 clear_refmod = 0;
3890
3891 if (upl->flags & UPL_IO_WIRE) {
3892
3893 if (delayed_unlock == 0)
3894 vm_page_lock_queues();
3895
3896 vm_page_unwire(m);
3897
3898 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
3899 delayed_unlock = 0;
3900 vm_page_unlock_queues();
3901 }
3902 if (page_list) {
3903 page_list[entry].phys_addr = 0;
3904 }
3905 if (flags & UPL_COMMIT_SET_DIRTY) {
3906 m->dirty = TRUE;
3907 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3908 m->dirty = FALSE;
3909 clear_refmod |= VM_MEM_MODIFIED;
3910 }
3911 if (flags & UPL_COMMIT_INACTIVATE) {
3912 m->reference = FALSE;
3913 clear_refmod |= VM_MEM_REFERENCED;
3914 vm_page_deactivate(m);
3915 }
3916 if (clear_refmod)
3917 pmap_clear_refmod(m->phys_page, clear_refmod);
3918
3919 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3920 /*
3921 * We blocked access to the pages in this UPL.
3922 * Clear the "busy" bit and wake up any waiter
3923 * for this page.
3924 */
3925 PAGE_WAKEUP_DONE(m);
3926 }
3927
3928 target_offset += PAGE_SIZE_64;
3929 xfer_size -= PAGE_SIZE;
3930 entry++;
3931 continue;
3932 }
3933 if (delayed_unlock == 0)
3934 vm_page_lock_queues();
3935 /*
3936 * make sure to clear the hardware
3937 * modify or reference bits before
3938 * releasing the BUSY bit on this page
3939 * otherwise we risk losing a legitimate
3940 * change of state
3941 */
3942 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3943 m->dirty = FALSE;
3944 clear_refmod |= VM_MEM_MODIFIED;
3945 }
3946 if (flags & UPL_COMMIT_INACTIVATE)
3947 clear_refmod |= VM_MEM_REFERENCED;
3948
3949 if (clear_refmod)
3950 pmap_clear_refmod(m->phys_page, clear_refmod);
3951
3952 if (page_list) {
3953 p = &(page_list[entry]);
3954 if(p->phys_addr && p->pageout && !m->pageout) {
3955 m->busy = TRUE;
3956 m->pageout = TRUE;
3957 vm_page_wire(m);
3958 } else if (page_list[entry].phys_addr &&
3959 !p->pageout && m->pageout &&
3960 !m->dump_cleaning) {
3961 m->pageout = FALSE;
3962 m->absent = FALSE;
3963 m->overwriting = FALSE;
3964 vm_page_unwire(m);
3965 PAGE_WAKEUP_DONE(m);
3966 }
3967 page_list[entry].phys_addr = 0;
3968 }
3969 m->dump_cleaning = FALSE;
3970 if(m->laundry) {
3971 vm_pageout_throttle_up(m);
3972 }
3973 if(m->pageout) {
3974 m->cleaning = FALSE;
3975 m->pageout = FALSE;
3976 #if MACH_CLUSTER_STATS
3977 if (m->wanted) vm_pageout_target_collisions++;
3978 #endif
3979 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
3980 m->dirty = TRUE;
3981 else
3982 m->dirty = FALSE;
3983
3984 if(m->dirty) {
3985 vm_page_unwire(m);/* reactivates */
3986
3987 if (upl->flags & UPL_PAGEOUT) {
3988 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
3989 VM_STAT(reactivations++);
3990 }
3991 PAGE_WAKEUP_DONE(m);
3992 } else {
3993 vm_page_free(m);/* clears busy, etc. */
3994
3995 if (upl->flags & UPL_PAGEOUT) {
3996 CLUSTER_STAT(vm_pageout_target_page_freed++;)
3997
3998 if (page_list[entry].dirty)
3999 VM_STAT(pageouts++);
4000 }
4001 }
4002 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4003 delayed_unlock = 0;
4004 vm_page_unlock_queues();
4005 }
4006 target_offset += PAGE_SIZE_64;
4007 xfer_size -= PAGE_SIZE;
4008 entry++;
4009 continue;
4010 }
4011 #if MACH_CLUSTER_STATS
4012 m->dirty = pmap_is_modified(m->phys_page);
4013
4014 if (m->dirty) vm_pageout_cluster_dirtied++;
4015 else vm_pageout_cluster_cleaned++;
4016 if (m->wanted) vm_pageout_cluster_collisions++;
4017 #else
4018 m->dirty = FALSE;
4019 #endif
4020
4021 if((m->busy) && (m->cleaning)) {
4022 /* the request_page_list case */
4023 if(m->absent) {
4024 m->absent = FALSE;
4025 if(shadow_object->absent_count == 1)
4026 vm_object_absent_release(shadow_object);
4027 else
4028 shadow_object->absent_count--;
4029 }
4030 m->overwriting = FALSE;
4031 m->busy = FALSE;
4032 m->dirty = FALSE;
4033 } else if (m->overwriting) {
4034 /* alternate request page list, write to
4035 * page_list case. Occurs when the original
4036 * page was wired at the time of the list
4037 * request */
4038 assert(m->wire_count != 0);
4039 vm_page_unwire(m);/* reactivates */
4040 m->overwriting = FALSE;
4041 }
4042 m->cleaning = FALSE;
4043
4044 /* It is a part of the semantic of COPYOUT_FROM */
4045 /* UPLs that a commit implies cache sync */
4046 /* between the vm page and the backing store */
4047 /* this can be used to strip the precious bit */
4048 /* as well as clean */
4049 if (upl->flags & UPL_PAGE_SYNC_DONE)
4050 m->precious = FALSE;
4051
4052 if (flags & UPL_COMMIT_SET_DIRTY)
4053 m->dirty = TRUE;
4054
4055 if (flags & UPL_COMMIT_INACTIVATE) {
4056 m->reference = FALSE;
4057 vm_page_deactivate(m);
4058 } else if (!m->active && !m->inactive) {
4059 if (m->reference)
4060 vm_page_activate(m);
4061 else
4062 vm_page_deactivate(m);
4063 }
4064
4065 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4066 /*
4067 * We blocked access to the pages in this UPL.
4068 * Clear the "busy" bit on this page before we
4069 * wake up any waiter.
4070 */
4071 m->busy = FALSE;
4072 }
4073
4074 /*
4075 * Wake up any thread waiting for this page's "cleaning" state to clear.
4076 */
4077 PAGE_WAKEUP(m);
4078
4079 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4080 delayed_unlock = 0;
4081 vm_page_unlock_queues();
4082 }
4083 }
4084 target_offset += PAGE_SIZE_64;
4085 xfer_size -= PAGE_SIZE;
4086 entry++;
4087 }
4088 if (delayed_unlock)
4089 vm_page_unlock_queues();
4090
4091 occupied = 1;
4092
4093 if (upl->flags & UPL_DEVICE_MEMORY) {
4094 occupied = 0;
4095 } else if (upl->flags & UPL_LITE) {
4096 int pg_num;
4097 int i;
4098 pg_num = upl->size/PAGE_SIZE;
4099 pg_num = (pg_num + 31) >> 5;
4100 occupied = 0;
4101 for(i= 0; i<pg_num; i++) {
4102 if(lite_list[i] != 0) {
4103 occupied = 1;
4104 break;
4105 }
4106 }
4107 } else {
4108 if(queue_empty(&upl->map_object->memq)) {
4109 occupied = 0;
4110 }
4111 }
4112
4113 if(occupied == 0) {
4114 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4115 *empty = TRUE;
4116 }
4117 if(object == shadow_object)
4118 vm_object_paging_end(shadow_object);
4119 }
4120 vm_object_unlock(shadow_object);
4121 if (object != shadow_object)
4122 vm_object_unlock(object);
4123 upl_unlock(upl);
4124
4125 return KERN_SUCCESS;
4126 }
4127
4128 kern_return_t
4129 upl_abort_range(
4130 upl_t upl,
4131 upl_offset_t offset,
4132 upl_size_t size,
4133 int error,
4134 boolean_t *empty)
4135 {
4136 upl_size_t xfer_size = size;
4137 vm_object_t shadow_object;
4138 vm_object_t object = upl->map_object;
4139 vm_object_offset_t target_offset;
4140 int entry;
4141 wpl_array_t lite_list;
4142 int occupied;
4143 boolean_t shadow_internal;
4144
4145 *empty = FALSE;
4146
4147 if (upl == UPL_NULL)
4148 return KERN_INVALID_ARGUMENT;
4149
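/*
 * For an I/O-wired UPL an abort reduces to a commit: there is nothing
 * to push back to the pager, the pages just need to be unwired, so hand
 * the range to upl_commit_range() with no commit flags.
 */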
4150 if (upl->flags & UPL_IO_WIRE) {
4151 return upl_commit_range(upl,
4152 offset, size, 0,
4153 NULL, 0, empty);
4154 }
4155
4156 if(object->pageout) {
4157 shadow_object = object->shadow;
4158 } else {
4159 shadow_object = object;
4160 }
4161
4162 upl_lock(upl);
4163 if(upl->flags & UPL_DEVICE_MEMORY) {
4164 xfer_size = 0;
4165 } else if ((offset + size) > upl->size) {
4166 upl_unlock(upl);
4167 return KERN_FAILURE;
4168 }
4169 if (object != shadow_object)
4170 vm_object_lock(object);
4171 vm_object_lock(shadow_object);
4172
4173 shadow_internal = shadow_object->internal;
4174
4175 if(upl->flags & UPL_INTERNAL) {
4176 lite_list = (wpl_array_t)
4177 ((((uintptr_t)upl) + sizeof(struct upl))
4178 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4179 } else {
4180 lite_list = (wpl_array_t)
4181 (((uintptr_t)upl) + sizeof(struct upl));
4182 }
4183
4184 entry = offset/PAGE_SIZE;
4185 target_offset = (vm_object_offset_t)offset;
4186 while(xfer_size) {
4187 vm_page_t t,m;
4188
4189 m = VM_PAGE_NULL;
4190 if(upl->flags & UPL_LITE) {
4191 int pg_num;
4192 pg_num = target_offset/PAGE_SIZE;
4193 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4194 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4195 m = vm_page_lookup(shadow_object,
4196 target_offset + (upl->offset -
4197 shadow_object->paging_offset));
4198 }
4199 }
4200 if(object->pageout) {
4201 if ((t = vm_page_lookup(object, target_offset))
4202 != NULL) {
4203 t->pageout = FALSE;
4204 VM_PAGE_FREE(t);
4205 if(m == NULL) {
4206 m = vm_page_lookup(
4207 shadow_object,
4208 target_offset +
4209 object->shadow_offset);
4210 }
4211 if(m != VM_PAGE_NULL)
4212 vm_object_paging_end(m->object);
4213 }
4214 }
4215 if(m != VM_PAGE_NULL) {
4216 vm_page_lock_queues();
4217 if(m->absent) {
4218 boolean_t must_free = TRUE;
4219
4220 /* COPYOUT = FALSE case */
4221 /* check for error conditions which must */
4222 /* be passed back to the page's customer */
4223 if(error & UPL_ABORT_RESTART) {
4224 m->restart = TRUE;
4225 m->absent = FALSE;
4226 vm_object_absent_release(m->object);
4227 m->page_error = KERN_MEMORY_ERROR;
4228 m->error = TRUE;
4229 must_free = FALSE;
4230 } else if(error & UPL_ABORT_UNAVAILABLE) {
4231 m->restart = FALSE;
4232 m->unusual = TRUE;
4233 must_free = FALSE;
4234 } else if(error & UPL_ABORT_ERROR) {
4235 m->restart = FALSE;
4236 m->absent = FALSE;
4237 vm_object_absent_release(m->object);
4238 m->page_error = KERN_MEMORY_ERROR;
4239 m->error = TRUE;
4240 must_free = FALSE;
4241 }
4242
4243 /*
4244 * ENCRYPTED SWAP:
4245 * If the page was already encrypted,
4246 * we don't really need to decrypt it
4247 * now. It will get decrypted later,
4248 * on demand, as soon as someone needs
4249 * to access its contents.
4250 */
4251
4252 m->cleaning = FALSE;
4253 m->overwriting = FALSE;
4254 PAGE_WAKEUP_DONE(m);
4255
4256 if (must_free == TRUE) {
4257 vm_page_free(m);
4258 } else {
4259 vm_page_activate(m);
4260 }
4261 vm_page_unlock_queues();
4262
4263 target_offset += PAGE_SIZE_64;
4264 xfer_size -= PAGE_SIZE;
4265 entry++;
4266 continue;
4267 }
4268 /*
4269 * Handle the trusted pager throttle.
4270 */
4271 if (m->laundry) {
4272 vm_pageout_throttle_up(m);
4273 }
4274 if(m->pageout) {
4275 assert(m->busy);
4276 assert(m->wire_count == 1);
4277 m->pageout = FALSE;
4278 vm_page_unwire(m);
4279 }
4280 m->dump_cleaning = FALSE;
4281 m->cleaning = FALSE;
4282 m->overwriting = FALSE;
4283 #if MACH_PAGEMAP
4284 vm_external_state_clr(
4285 m->object->existence_map, m->offset);
4286 #endif /* MACH_PAGEMAP */
4287 if(error & UPL_ABORT_DUMP_PAGES) {
4288 vm_page_free(m);
4289 pmap_disconnect(m->phys_page);
4290 } else {
4291 PAGE_WAKEUP_DONE(m);
4292 }
4293 vm_page_unlock_queues();
4294 }
4295 target_offset += PAGE_SIZE_64;
4296 xfer_size -= PAGE_SIZE;
4297 entry++;
4298 }
4299 occupied = 1;
4300 if (upl->flags & UPL_DEVICE_MEMORY) {
4301 occupied = 0;
4302 } else if (upl->flags & UPL_LITE) {
4303 int pg_num;
4304 int i;
4305 pg_num = upl->size/PAGE_SIZE;
4306 pg_num = (pg_num + 31) >> 5;
4307 occupied = 0;
4308 for(i= 0; i<pg_num; i++) {
4309 if(lite_list[i] != 0) {
4310 occupied = 1;
4311 break;
4312 }
4313 }
4314 } else {
4315 if(queue_empty(&upl->map_object->memq)) {
4316 occupied = 0;
4317 }
4318 }
4319
4320 if(occupied == 0) {
4321 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4322 *empty = TRUE;
4323 }
4324 if(object == shadow_object)
4325 vm_object_paging_end(shadow_object);
4326 }
4327 vm_object_unlock(shadow_object);
4328 if (object != shadow_object)
4329 vm_object_unlock(object);
4330
4331 upl_unlock(upl);
4332
4333 return KERN_SUCCESS;
4334 }
4335
4336 kern_return_t
4337 upl_abort(
4338 upl_t upl,
4339 int error)
4340 {
4341 vm_object_t object = NULL;
4342 vm_object_t shadow_object = NULL;
4343 vm_object_offset_t offset;
4344 vm_object_offset_t shadow_offset;
4345 vm_object_offset_t target_offset;
4346 upl_size_t i;
4347 wpl_array_t lite_list;
4348 vm_page_t t,m;
4349 int occupied;
4350 boolean_t shadow_internal;
4351
4352 if (upl == UPL_NULL)
4353 return KERN_INVALID_ARGUMENT;
4354
4355 if (upl->flags & UPL_IO_WIRE) {
4356 boolean_t empty;
4357 return upl_commit_range(upl,
4358 0, upl->size, 0,
4359 NULL, 0, &empty);
4360 }
4361
4362 upl_lock(upl);
4363 if(upl->flags & UPL_DEVICE_MEMORY) {
4364 upl_unlock(upl);
4365 return KERN_SUCCESS;
4366 }
4367
4368 object = upl->map_object;
4369
4370 if (object == NULL) {
4371 panic("upl_abort: upl object is not backed by an object");
4372 upl_unlock(upl);
4373 return KERN_INVALID_ARGUMENT;
4374 }
4375
4376 if(object->pageout) {
4377 shadow_object = object->shadow;
4378 shadow_offset = object->shadow_offset;
4379 } else {
4380 shadow_object = object;
4381 shadow_offset = upl->offset - object->paging_offset;
4382 }
4383
4384 if(upl->flags & UPL_INTERNAL) {
4385 lite_list = (wpl_array_t)
4386 ((((uintptr_t)upl) + sizeof(struct upl))
4387 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4388 } else {
4389 lite_list = (wpl_array_t)
4390 (((uintptr_t)upl) + sizeof(struct upl));
4391 }
4392 offset = 0;
4393
4394 if (object != shadow_object)
4395 vm_object_lock(object);
4396 vm_object_lock(shadow_object);
4397
4398 shadow_internal = shadow_object->internal;
4399
4400 for(i = 0; i<(upl->size); i+=PAGE_SIZE, offset += PAGE_SIZE_64) {
4401 m = VM_PAGE_NULL;
4402 target_offset = offset + shadow_offset;
4403 if(upl->flags & UPL_LITE) {
4404 int pg_num;
4405 pg_num = offset/PAGE_SIZE;
4406 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4407 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4408 m = vm_page_lookup(
4409 shadow_object, target_offset);
4410 }
4411 }
4412 if(object->pageout) {
4413 if ((t = vm_page_lookup(object, offset)) != NULL) {
4414 t->pageout = FALSE;
4415 VM_PAGE_FREE(t);
4416 if(m == NULL) {
4417 m = vm_page_lookup(
4418 shadow_object, target_offset);
4419 }
4420 if(m != VM_PAGE_NULL)
4421 vm_object_paging_end(m->object);
4422 }
4423 }
4424 if(m != VM_PAGE_NULL) {
4425 vm_page_lock_queues();
4426 if(m->absent) {
4427 boolean_t must_free = TRUE;
4428
4429 /* COPYOUT = FALSE case */
4430 /* check for error conditions which must */
4431 /* be passed back to the page's customer */
4432 if(error & UPL_ABORT_RESTART) {
4433 m->restart = TRUE;
4434 m->absent = FALSE;
4435 vm_object_absent_release(m->object);
4436 m->page_error = KERN_MEMORY_ERROR;
4437 m->error = TRUE;
4438 must_free = FALSE;
4439 } else if(error & UPL_ABORT_UNAVAILABLE) {
4440 m->restart = FALSE;
4441 m->unusual = TRUE;
4442 must_free = FALSE;
4443 } else if(error & UPL_ABORT_ERROR) {
4444 m->restart = FALSE;
4445 m->absent = FALSE;
4446 vm_object_absent_release(m->object);
4447 m->page_error = KERN_MEMORY_ERROR;
4448 m->error = TRUE;
4449 must_free = FALSE;
4450 }
4451
4452 /*
4453 * ENCRYPTED SWAP:
4454 * If the page was already encrypted,
4455 * we don't really need to decrypt it
4456 * now. It will get decrypted later,
4457 * on demand, as soon as someone needs
4458 * to access its contents.
4459 */
4460
4461 m->cleaning = FALSE;
4462 m->overwriting = FALSE;
4463 PAGE_WAKEUP_DONE(m);
4464
4465 if (must_free == TRUE) {
4466 vm_page_free(m);
4467 } else {
4468 vm_page_activate(m);
4469 }
4470 vm_page_unlock_queues();
4471 continue;
4472 }
4473 /*
4474 * Handle the trusted pager throttle.
4475 */
4476 if (m->laundry) {
4477 vm_pageout_throttle_up(m);
4478 }
4479 if(m->pageout) {
4480 assert(m->busy);
4481 assert(m->wire_count == 1);
4482 m->pageout = FALSE;
4483 vm_page_unwire(m);
4484 }
4485 m->dump_cleaning = FALSE;
4486 m->cleaning = FALSE;
4487 m->overwriting = FALSE;
4488 #if MACH_PAGEMAP
4489 vm_external_state_clr(
4490 m->object->existence_map, m->offset);
4491 #endif /* MACH_PAGEMAP */
4492 if(error & UPL_ABORT_DUMP_PAGES) {
4493 vm_page_free(m);
4494 pmap_disconnect(m->phys_page);
4495 } else {
4496 PAGE_WAKEUP_DONE(m);
4497 }
4498 vm_page_unlock_queues();
4499 }
4500 }
4501 occupied = 1;
4502 if (upl->flags & UPL_DEVICE_MEMORY) {
4503 occupied = 0;
4504 } else if (upl->flags & UPL_LITE) {
4505 int pg_num;
4506 int j;
4507 pg_num = upl->size/PAGE_SIZE;
4508 pg_num = (pg_num + 31) >> 5;
4509 occupied = 0;
4510 for(j= 0; j<pg_num; j++) {
4511 if(lite_list[j] != 0) {
4512 occupied = 1;
4513 break;
4514 }
4515 }
4516 } else {
4517 if(queue_empty(&upl->map_object->memq)) {
4518 occupied = 0;
4519 }
4520 }
4521
4522 if(occupied == 0) {
4523 if(object == shadow_object)
4524 vm_object_paging_end(shadow_object);
4525 }
4526 vm_object_unlock(shadow_object);
4527 if (object != shadow_object)
4528 vm_object_unlock(object);
4529
4530 upl_unlock(upl);
4531 return KERN_SUCCESS;
4532 }
4533
4534 /* an option on commit should be wire */
4535 kern_return_t
4536 upl_commit(
4537 upl_t upl,
4538 upl_page_info_t *page_list,
4539 mach_msg_type_number_t count)
4540 {
4541 if (upl == UPL_NULL)
4542 return KERN_INVALID_ARGUMENT;
4543
4544 if(upl->flags & (UPL_LITE | UPL_IO_WIRE)) {
4545 boolean_t empty;
4546 return upl_commit_range(upl, 0, upl->size, 0,
4547 page_list, count, &empty);
4548 }
4549
4550 if (count == 0)
4551 page_list = NULL;
4552
4553 upl_lock(upl);
4554 if (upl->flags & UPL_DEVICE_MEMORY)
4555 page_list = NULL;
4556
4557 if (upl->flags & UPL_ENCRYPTED) {
4558 /*
4559 * ENCRYPTED SWAP:
4560 * This UPL was encrypted, but we don't need
4561 * to decrypt here. We'll decrypt each page
4562 * later, on demand, as soon as someone needs
4563 * to access the page's contents.
4564 */
4565 }
4566
4567 if ((upl->flags & UPL_CLEAR_DIRTY) ||
4568 (upl->flags & UPL_PAGE_SYNC_DONE) || page_list) {
4569 vm_object_t shadow_object = upl->map_object->shadow;
4570 vm_object_t object = upl->map_object;
4571 vm_object_offset_t target_offset;
4572 upl_size_t xfer_end;
4573 int entry;
4574
4575 vm_page_t t, m;
4576 upl_page_info_t *p;
4577
4578 if (object != shadow_object)
4579 vm_object_lock(object);
4580 vm_object_lock(shadow_object);
4581
4582 entry = 0;
4583 target_offset = object->shadow_offset;
4584 xfer_end = upl->size + object->shadow_offset;
4585
4586 while(target_offset < xfer_end) {
4587
4588 if ((t = vm_page_lookup(object,
4589 target_offset - object->shadow_offset))
4590 == NULL) {
4591 target_offset += PAGE_SIZE_64;
4592 entry++;
4593 continue;
4594 }
4595
4596 m = vm_page_lookup(shadow_object, target_offset);
4597 if(m != VM_PAGE_NULL) {
4598 /*
4599 * ENCRYPTED SWAP:
4600 * If this page was encrypted, we
4601 * don't need to decrypt it here.
4602 * We'll decrypt it later, on demand,
4603 * as soon as someone needs to access
4604 * its contents.
4605 */
4606
4607 if (upl->flags & UPL_CLEAR_DIRTY) {
4608 pmap_clear_modify(m->phys_page);
4609 m->dirty = FALSE;
4610 }
4611 /* It is a part of the semantic of */
4612 /* COPYOUT_FROM UPLs that a commit */
4613 /* implies cache sync between the */
4614 /* vm page and the backing store */
4615 /* this can be used to strip the */
4616 /* precious bit as well as clean */
4617 if (upl->flags & UPL_PAGE_SYNC_DONE)
4618 m->precious = FALSE;
4619
4620 if(page_list) {
4621 p = &(page_list[entry]);
4622 if(page_list[entry].phys_addr &&
4623 p->pageout && !m->pageout) {
4624 vm_page_lock_queues();
4625 m->busy = TRUE;
4626 m->pageout = TRUE;
4627 vm_page_wire(m);
4628 vm_page_unlock_queues();
4629 } else if (page_list[entry].phys_addr &&
4630 !p->pageout && m->pageout &&
4631 !m->dump_cleaning) {
4632 vm_page_lock_queues();
4633 m->pageout = FALSE;
4634 m->absent = FALSE;
4635 m->overwriting = FALSE;
4636 vm_page_unwire(m);
4637 PAGE_WAKEUP_DONE(m);
4638 vm_page_unlock_queues();
4639 }
4640 page_list[entry].phys_addr = 0;
4641 }
4642 }
4643 target_offset += PAGE_SIZE_64;
4644 entry++;
4645 }
4646 vm_object_unlock(shadow_object);
4647 if (object != shadow_object)
4648 vm_object_unlock(object);
4649
4650 }
4651 if (upl->flags & UPL_DEVICE_MEMORY) {
4652 vm_object_lock(upl->map_object->shadow);
4653 if(upl->map_object == upl->map_object->shadow)
4654 vm_object_paging_end(upl->map_object->shadow);
4655 vm_object_unlock(upl->map_object->shadow);
4656 }
4657 upl_unlock(upl);
4658 return KERN_SUCCESS;
4659 }
4660
4661
4662
4663 kern_return_t
4664 vm_object_iopl_request(
4665 vm_object_t object,
4666 vm_object_offset_t offset,
4667 upl_size_t size,
4668 upl_t *upl_ptr,
4669 upl_page_info_array_t user_page_list,
4670 unsigned int *page_list_count,
4671 int cntrl_flags)
4672 {
4673 vm_page_t dst_page;
4674 vm_object_offset_t dst_offset = offset;
4675 upl_size_t xfer_size = size;
4676 upl_t upl = NULL;
4677 unsigned int entry;
4678 wpl_array_t lite_list = NULL;
4679 int page_field_size;
4680 int delayed_unlock = 0;
4681 int no_zero_fill = FALSE;
4682 vm_page_t alias_page = NULL;
4683 kern_return_t ret;
4684 vm_prot_t prot;
4685
4686
4687 if (cntrl_flags & ~UPL_VALID_FLAGS) {
4688 /*
4689 * For forward compatibility's sake,
4690 * reject any unknown flag.
4691 */
4692 return KERN_INVALID_VALUE;
4693 }
4694
4695 if (cntrl_flags & UPL_ENCRYPT) {
4696 /*
4697 * ENCRYPTED SWAP:
4698 * The paging path doesn't use this interface,
4699 * so we don't support the UPL_ENCRYPT flag
4700 * here. We won't encrypt the pages.
4701 */
4702 assert(! (cntrl_flags & UPL_ENCRYPT));
4703 }
4704
4705 if (cntrl_flags & UPL_NOZEROFILL)
4706 no_zero_fill = TRUE;
4707
4708 if (cntrl_flags & UPL_COPYOUT_FROM)
4709 prot = VM_PROT_READ;
4710 else
4711 prot = VM_PROT_READ | VM_PROT_WRITE;
4712
4713 if(((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
4714 size = MAX_UPL_TRANSFER * page_size;
4715 }
4716
4717 if(cntrl_flags & UPL_SET_INTERNAL)
4718 if(page_list_count != NULL)
4719 *page_list_count = MAX_UPL_TRANSFER;
4720 if(((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4721 ((page_list_count != NULL) && (*page_list_count != 0)
4722 && *page_list_count < (size/page_size)))
4723 return KERN_INVALID_ARGUMENT;
4724
4725 if((!object->internal) && (object->paging_offset != 0))
4726 panic("vm_object_iopl_request: vnode object with non-zero paging offset\n");
4727
4728 if(object->phys_contiguous) {
4729 /* No paging operations are possible against this memory */
4730 /* and so no need for map object, ever */
4731 cntrl_flags |= UPL_SET_LITE;
4732 }
4733
4734 if(upl_ptr) {
4735 if(cntrl_flags & UPL_SET_INTERNAL) {
4736 if(cntrl_flags & UPL_SET_LITE) {
4737 upl = upl_create(
4738 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
4739 size);
4740 user_page_list = (upl_page_info_t *)
4741 (((uintptr_t)upl) + sizeof(struct upl));
4742 lite_list = (wpl_array_t)
4743 (((uintptr_t)user_page_list) +
4744 ((size/PAGE_SIZE) *
4745 sizeof(upl_page_info_t)));
4746 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4747 page_field_size =
4748 (page_field_size + 3) & 0xFFFFFFFC;
4749 bzero((char *)lite_list, page_field_size);
4750 upl->flags =
4751 UPL_LITE | UPL_INTERNAL | UPL_IO_WIRE;
4752 } else {
4753 upl = upl_create(UPL_CREATE_INTERNAL, size);
4754 user_page_list = (upl_page_info_t *)
4755 (((uintptr_t)upl)
4756 + sizeof(struct upl));
4757 upl->flags = UPL_INTERNAL | UPL_IO_WIRE;
4758 }
4759 } else {
4760 if(cntrl_flags & UPL_SET_LITE) {
4761 upl = upl_create(UPL_CREATE_LITE, size);
4762 lite_list = (wpl_array_t)
4763 (((uintptr_t)upl) + sizeof(struct upl));
4764 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4765 page_field_size =
4766 (page_field_size + 3) & 0xFFFFFFFC;
4767 bzero((char *)lite_list, page_field_size);
4768 upl->flags = UPL_LITE | UPL_IO_WIRE;
4769 } else {
4770 upl = upl_create(UPL_CREATE_EXTERNAL, size);
4771 upl->flags = UPL_IO_WIRE;
4772 }
4773 }
4774
4775 if(object->phys_contiguous) {
4776 upl->map_object = object;
4777 /* don't need any shadow mappings for this one */
4778 /* since it is already I/O memory */
4779 upl->flags |= UPL_DEVICE_MEMORY;
4780
4781 vm_object_lock(object);
4782 vm_object_paging_begin(object);
4783 vm_object_unlock(object);
4784
4785 /* paging in progress also protects the paging_offset */
4786 upl->offset = offset + object->paging_offset;
4787 upl->size = size;
4788 *upl_ptr = upl;
4789 if(user_page_list) {
4790 user_page_list[0].phys_addr =
4791 (offset + object->shadow_offset)>>PAGE_SHIFT;
4792 user_page_list[0].device = TRUE;
4793 }
4794
4795 if(page_list_count != NULL) {
4796 if (upl->flags & UPL_INTERNAL) {
4797 *page_list_count = 0;
4798 } else {
4799 *page_list_count = 1;
4800 }
4801 }
4802 return KERN_SUCCESS;
4803 }
4804 if(user_page_list)
4805 user_page_list[0].device = FALSE;
4806
4807 if(cntrl_flags & UPL_SET_LITE) {
4808 upl->map_object = object;
4809 } else {
4810 upl->map_object = vm_object_allocate(size);
4811 vm_object_lock(upl->map_object);
4812 upl->map_object->shadow = object;
4813 upl->map_object->pageout = TRUE;
4814 upl->map_object->can_persist = FALSE;
4815 upl->map_object->copy_strategy =
4816 MEMORY_OBJECT_COPY_NONE;
4817 upl->map_object->shadow_offset = offset;
4818 upl->map_object->wimg_bits = object->wimg_bits;
4819 vm_object_unlock(upl->map_object);
4820 }
4821 }
4822 vm_object_lock(object);
4823 vm_object_paging_begin(object);
4824
4825 if (!object->phys_contiguous) {
4826 /* Protect user space from future COW operations */
4827 object->true_share = TRUE;
4828 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4829 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4830 }
4831
4832 /* we can lock the upl offset now that paging_in_progress is set */
4833 if(upl_ptr) {
4834 upl->size = size;
4835 upl->offset = offset + object->paging_offset;
4836 *upl_ptr = upl;
4837 #ifdef UPL_DEBUG
4838 queue_enter(&object->uplq, upl, upl_t, uplq);
4839 #endif /* UPL_DEBUG */
4840 }
4841
4842 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4843 /*
4844 * The user requested that access to the pages in this UPL
4845 * be blocked until the UPL is committed or aborted.
4846 */
4847 upl->flags |= UPL_ACCESS_BLOCKED;
4848 }
4849
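/*
 * Wire each page of the requested range into the UPL.  Pages that are
 * missing, busy, encrypted or otherwise unusable are first brought in
 * (and decrypted) through vm_fault_page().  Each wired page is then
 * recorded either in the lite list bitmap or via a private alias page
 * inserted into the UPL's map object, and its state is reported in
 * user_page_list if the caller asked for one.
 */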
4850 entry = 0;
4851 while (xfer_size) {
4852 if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
4853 if (delayed_unlock) {
4854 delayed_unlock = 0;
4855 vm_page_unlock_queues();
4856 }
4857 vm_object_unlock(object);
4858 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4859 vm_object_lock(object);
4860 }
4861 dst_page = vm_page_lookup(object, dst_offset);
4862
4863 /*
4864 * ENCRYPTED SWAP:
4865 * If the page is encrypted, we need to decrypt it,
4866 * so force a soft page fault.
4867 */
4868 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4869 (dst_page->encrypted) ||
4870 (dst_page->unusual && (dst_page->error ||
4871 dst_page->restart ||
4872 dst_page->absent ||
4873 dst_page->fictitious ||
4874 (prot & dst_page->page_lock)))) {
4875 vm_fault_return_t result;
4876 do {
4877 vm_page_t top_page;
4878 kern_return_t error_code;
4879 int interruptible;
4880
4881 vm_object_offset_t lo_offset = offset;
4882 vm_object_offset_t hi_offset = offset + size;
4883
4884
4885 if (delayed_unlock) {
4886 delayed_unlock = 0;
4887 vm_page_unlock_queues();
4888 }
4889
4890 if(cntrl_flags & UPL_SET_INTERRUPTIBLE) {
4891 interruptible = THREAD_ABORTSAFE;
4892 } else {
4893 interruptible = THREAD_UNINT;
4894 }
4895
4896 result = vm_fault_page(object, dst_offset,
4897 prot | VM_PROT_WRITE, FALSE,
4898 interruptible,
4899 lo_offset, hi_offset,
4900 VM_BEHAVIOR_SEQUENTIAL,
4901 &prot, &dst_page, &top_page,
4902 (int *)0,
4903 &error_code, no_zero_fill, FALSE, NULL, 0);
4904
4905 switch(result) {
4906 case VM_FAULT_SUCCESS:
4907
4908 PAGE_WAKEUP_DONE(dst_page);
4909
4910 /*
4911 * Release paging references and
4912 * top-level placeholder page, if any.
4913 */
4914
4915 if(top_page != VM_PAGE_NULL) {
4916 vm_object_t local_object;
4917 local_object =
4918 top_page->object;
4919 if(top_page->object
4920 != dst_page->object) {
4921 vm_object_lock(
4922 local_object);
4923 VM_PAGE_FREE(top_page);
4924 vm_object_paging_end(
4925 local_object);
4926 vm_object_unlock(
4927 local_object);
4928 } else {
4929 VM_PAGE_FREE(top_page);
4930 vm_object_paging_end(
4931 local_object);
4932 }
4933 }
4934
4935 break;
4936
4937
4938 case VM_FAULT_RETRY:
4939 vm_object_lock(object);
4940 vm_object_paging_begin(object);
4941 break;
4942
4943 case VM_FAULT_FICTITIOUS_SHORTAGE:
4944 vm_page_more_fictitious();
4945 vm_object_lock(object);
4946 vm_object_paging_begin(object);
4947 break;
4948
4949 case VM_FAULT_MEMORY_SHORTAGE:
4950 if (vm_page_wait(interruptible)) {
4951 vm_object_lock(object);
4952 vm_object_paging_begin(object);
4953 break;
4954 }
4955 /* fall thru */
4956
4957 case VM_FAULT_INTERRUPTED:
4958 error_code = MACH_SEND_INTERRUPTED;
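/* no break: fall through to the common error cleanup below */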
4959 case VM_FAULT_MEMORY_ERROR:
4960 ret = (error_code ? error_code:
4961 KERN_MEMORY_ERROR);
4962 vm_object_lock(object);
4963 for(; offset < dst_offset;
4964 offset += PAGE_SIZE) {
4965 dst_page = vm_page_lookup(
4966 object, offset);
4967 if(dst_page == VM_PAGE_NULL)
4968 panic("vm_object_iopl_request: Wired pages missing. \n");
4969 vm_page_lock_queues();
4970 vm_page_unwire(dst_page);
4971 vm_page_unlock_queues();
4972 VM_STAT(reactivations++);
4973 }
4974 vm_object_unlock(object);
4975 upl_destroy(upl);
4976 return ret;
4977 }
4978 } while ((result != VM_FAULT_SUCCESS)
4979 || (result == VM_FAULT_INTERRUPTED));
4980 }
4981 if (delayed_unlock == 0)
4982 vm_page_lock_queues();
4983 vm_page_wire(dst_page);
4984
4985 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4986 /*
4987 * Mark the page "busy" to block any future page fault
4988 * on this page. We'll also remove the mapping
4989 * of all these pages before leaving this routine.
4990 */
4991 assert(!dst_page->fictitious);
4992 dst_page->busy = TRUE;
4993 }
4994
4995 if (upl_ptr) {
4996 if (cntrl_flags & UPL_SET_LITE) {
4997 int pg_num;
4998 pg_num = (dst_offset-offset)/PAGE_SIZE;
4999 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5000 } else {
5001 /*
5002 * Convert the fictitious page to a
5003 * private shadow of the real page.
5004 */
5005 assert(alias_page->fictitious);
5006 alias_page->fictitious = FALSE;
5007 alias_page->private = TRUE;
5008 alias_page->pageout = TRUE;
5009 alias_page->phys_page = dst_page->phys_page;
5010 vm_page_wire(alias_page);
5011
5012 vm_page_insert(alias_page,
5013 upl->map_object, size - xfer_size);
5014 assert(!alias_page->wanted);
5015 alias_page->busy = FALSE;
5016 alias_page->absent = FALSE;
5017 }
5018
5019 /* expect the page to be used */
5020 dst_page->reference = TRUE;
5021
5022 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5023 dst_page->dirty = TRUE;
5024 alias_page = NULL;
5025
5026 if (user_page_list) {
5027 user_page_list[entry].phys_addr
5028 = dst_page->phys_page;
5029 user_page_list[entry].dirty =
5030 dst_page->dirty;
5031 user_page_list[entry].pageout =
5032 dst_page->pageout;
5033 user_page_list[entry].absent =
5034 dst_page->absent;
5035 user_page_list[entry].precious =
5036 dst_page->precious;
5037 }
5038 }
5039 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
5040 delayed_unlock = 0;
5041 vm_page_unlock_queues();
5042 }
5043 entry++;
5044 dst_offset += PAGE_SIZE_64;
5045 xfer_size -= PAGE_SIZE;
5046 }
5047 if (delayed_unlock)
5048 vm_page_unlock_queues();
5049
5050 if (upl->flags & UPL_INTERNAL) {
5051 if(page_list_count != NULL)
5052 *page_list_count = 0;
5053 } else if (page_list_count != NULL &&
5054 *page_list_count > entry) {
5055 *page_list_count = entry;
5056 }
5057
5058 if (alias_page != NULL) {
5059 vm_page_lock_queues();
5060 vm_page_free(alias_page);
5061 vm_page_unlock_queues();
5062 }
5063
5064 vm_object_unlock(object);
5065
5066 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5067 /*
5068 * We've marked all the pages "busy" so that future
5069 * page faults will block.
5070 * Now remove the mapping for these pages, so that they
5071 * can't be accessed without causing a page fault.
5072 */
5073 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5074 PMAP_NULL, 0, VM_PROT_NONE);
5075 }
5076
5077 return KERN_SUCCESS;
5078 }
5079
5080 kern_return_t
5081 upl_transpose(
5082 upl_t upl1,
5083 upl_t upl2)
5084 {
5085 kern_return_t retval;
5086 boolean_t upls_locked;
5087 vm_object_t object1, object2;
5088
5089 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5090 return KERN_INVALID_ARGUMENT;
5091 }
5092
5093 upls_locked = FALSE;
5094
5095 /*
5096 * Since we need to lock both UPLs at the same time,
5097 * avoid deadlocks by always taking locks in the same order.
5098 */
5099 if (upl1 < upl2) {
5100 upl_lock(upl1);
5101 upl_lock(upl2);
5102 } else {
5103 upl_lock(upl2);
5104 upl_lock(upl1);
5105 }
5106 upls_locked = TRUE; /* the UPLs will need to be unlocked */
5107
5108 object1 = upl1->map_object;
5109 object2 = upl2->map_object;
5110
5111 if (upl1->offset != 0 || upl2->offset != 0 ||
5112 upl1->size != upl2->size) {
5113 /*
5114 * We deal only with full objects, not subsets.
5115 * That's because we exchange the entire backing store info
5116 * for the objects: pager, resident pages, etc... We can't do
5117 * only part of it.
5118 */
5119 retval = KERN_INVALID_VALUE;
5120 goto done;
5121 }
5122
5123 /*
5124 * Transpose the VM objects' backing store.
5125 */
5126 retval = vm_object_transpose(object1, object2,
5127 (vm_object_size_t) upl1->size);
5128
5129 if (retval == KERN_SUCCESS) {
5130 /*
5131 * Make each UPL point to the correct VM object, i.e. the
5132 * object holding the pages that the UPL refers to...
5133 */
5134 upl1->map_object = object2;
5135 upl2->map_object = object1;
5136 }
5137
5138 done:
5139 /*
5140 * Cleanup.
5141 */
5142 if (upls_locked) {
5143 upl_unlock(upl1);
5144 upl_unlock(upl2);
5145 upls_locked = FALSE;
5146 }
5147
5148 return retval;
5149 }
5150
5151 /*
5152 * ENCRYPTED SWAP:
5153 *
5154 * Rationale: the user might have some encrypted data on disk (via
5155 * FileVault or any other mechanism). That data is then decrypted in
5156 * memory, which is safe as long as the machine is secure. But that
5157 * decrypted data in memory could be paged out to disk by the default
5158 * pager. The data would then be stored on disk in clear (not encrypted)
5159 * and it could be accessed by anyone who gets physical access to the
5160 * disk (if the laptop or the disk gets stolen for example). This weakens
5161 * the security offered by FileVault.
5162 *
5163 * Solution: the default pager will optionally request that all the
5164 * pages it gathers for pageout be encrypted, via the UPL interfaces,
5165 * before it sends this UPL to disk via the vnode_pageout() path.
5166 *
5167 * Notes:
5168 *
5169 * To avoid disrupting the VM LRU algorithms, we want to keep the
5170 * clean-in-place mechanisms, which allow us to send some extra pages to
5171 * swap (clustering) without actually removing them from the user's
5172 * address space. We don't want the user to unknowingly access encrypted
5173 * data, so we have to actually remove the encrypted pages from the page
5174 * table. When the user accesses the data, the hardware will fail to
5175 * locate the virtual page in its page table and will trigger a page
5176 * fault. We can then decrypt the page and enter it in the page table
5177 * again. Whenever we allow the user to access the contents of a page,
5178 * we have to make sure it's not encrypted.
5179 *
5180 *
5181 */
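/*
 * In this file, that scheme maps onto: the default pager calls
 * upl_encrypt() on the UPLs it is about to push out, which maps each
 * page into the kernel via vm_paging_map_object() and encrypts it with
 * vm_page_encrypt(); the page-fault path later calls vm_page_decrypt()
 * when an "encrypted" page is touched again.
 */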
5182 /*
5183 * ENCRYPTED SWAP:
5184 * Reserve of virtual addresses in the kernel address space.
5185 * We need to map the physical pages in the kernel, so that we
5186 * can call the encryption/decryption routines with a kernel
5187 * virtual address. We keep this pool of pre-allocated kernel
5188 * virtual addresses so that we don't have to scan the kernel's
5189 * virtual address space each time we need to encrypt or decrypt
5190 * a physical page.
5191 * It would be nice to be able to encrypt and decrypt in physical
5192 * mode but that might not always be more efficient...
5193 */
5194 decl_simple_lock_data(,vm_paging_lock)
5195 #define VM_PAGING_NUM_PAGES 64
5196 vm_map_offset_t vm_paging_base_address = 0;
5197 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5198 int vm_paging_max_index = 0;
5199 unsigned long vm_paging_no_kernel_page = 0;
5200 unsigned long vm_paging_objects_mapped = 0;
5201 unsigned long vm_paging_pages_mapped = 0;
5202 unsigned long vm_paging_objects_mapped_slow = 0;
5203 unsigned long vm_paging_pages_mapped_slow = 0;
5204
5205 /*
5206 * ENCRYPTED SWAP:
5207 * vm_paging_map_object:
5208 * Maps part of a VM object's pages in the kernel
5209 * virtual address space, using the pre-allocated
5210 * kernel virtual addresses, if possible.
5211 * Context:
5212 * The VM object is locked. This lock will get
5213 * dropped and re-acquired though.
5214 */
5215 kern_return_t
5216 vm_paging_map_object(
5217 vm_map_offset_t *address,
5218 vm_page_t page,
5219 vm_object_t object,
5220 vm_object_offset_t offset,
5221 vm_map_size_t *size)
5222 {
5223 kern_return_t kr;
5224 vm_map_offset_t page_map_offset;
5225 vm_map_size_t map_size;
5226 vm_object_offset_t object_offset;
5227 #ifdef __ppc__
5228 int i;
5229 vm_map_entry_t map_entry;
5230 #endif /* __ppc__ */
5231
5232
5233 #ifdef __ppc__
5234 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5235 /*
5236 * Optimization for the PowerPC.
5237 * Use one of the pre-allocated kernel virtual addresses
5238 * and just enter the VM page in the kernel address space
5239 * at that virtual address.
5240 */
5241 vm_object_unlock(object);
5242 simple_lock(&vm_paging_lock);
5243
5244 if (vm_paging_base_address == 0) {
5245 /*
5246 * Initialize our pool of pre-allocated kernel
5247 * virtual addresses.
5248 */
5249 simple_unlock(&vm_paging_lock);
5250 page_map_offset = 0;
5251 kr = vm_map_find_space(kernel_map,
5252 &page_map_offset,
5253 VM_PAGING_NUM_PAGES * PAGE_SIZE,
5254 0,
5255 &map_entry);
5256 if (kr != KERN_SUCCESS) {
5257 panic("vm_paging_map_object: "
5258 "kernel_map full\n");
5259 }
5260 map_entry->object.vm_object = kernel_object;
5261 map_entry->offset =
5262 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5263 vm_object_reference(kernel_object);
5264 vm_map_unlock(kernel_map);
5265
5266 simple_lock(&vm_paging_lock);
5267 if (vm_paging_base_address != 0) {
5268 /* someone raced us and won: undo */
5269 simple_unlock(&vm_paging_lock);
5270 kr = vm_map_remove(kernel_map,
5271 page_map_offset,
5272 page_map_offset +
5273 (VM_PAGING_NUM_PAGES
5274 * PAGE_SIZE),
5275 VM_MAP_NO_FLAGS);
5276 assert(kr == KERN_SUCCESS);
5277 simple_lock(&vm_paging_lock);
5278 } else {
5279 vm_paging_base_address = page_map_offset;
5280 }
5281 }
5282
5283 /*
5284 * Try and find an available kernel virtual address
5285 * from our pre-allocated pool.
5286 */
5287 page_map_offset = 0;
5288 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5289 if (vm_paging_page_inuse[i] == FALSE) {
5290 page_map_offset = vm_paging_base_address +
5291 (i * PAGE_SIZE);
5292 break;
5293 }
5294 }
5295
5296 if (page_map_offset != 0) {
5297 /*
5298 * We found a kernel virtual address;
5299 * map the physical page to that virtual address.
5300 */
5301 if (i > vm_paging_max_index) {
5302 vm_paging_max_index = i;
5303 }
5304 vm_paging_page_inuse[i] = TRUE;
5305 simple_unlock(&vm_paging_lock);
5306 pmap_map_block(kernel_pmap,
5307 page_map_offset,
5308 page->phys_page,
5309 1, /* Size is number of 4k pages */
5310 VM_PROT_DEFAULT,
5311 ((int) page->object->wimg_bits &
5312 VM_WIMG_MASK),
5313 0);
5314 vm_paging_objects_mapped++;
5315 vm_paging_pages_mapped++;
5316 *address = page_map_offset;
5317 vm_object_lock(object);
5318
5319 /* all done and mapped, ready to use ! */
5320 return KERN_SUCCESS;
5321 }
5322
5323 /*
5324 * We ran out of pre-allocated kernel virtual
5325 * addresses. Just map the page in the kernel
5326 * the slow and regular way.
5327 */
5328 vm_paging_no_kernel_page++;
5329 simple_unlock(&vm_paging_lock);
5330 vm_object_lock(object);
5331 }
5332 #endif /* __ppc__ */
5333
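/*
 * Slow path (and the only path on non-PowerPC configurations): map the
 * requested range of the object into kernel_map with vm_map_enter(),
 * then explicitly enter each resident page into the kernel pmap below.
 */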
5334 object_offset = vm_object_trunc_page(offset);
5335 map_size = vm_map_round_page(*size);
5336
5337 /*
5338 * Try and map the required range of the object
5339 * in the kernel_map
5340 */
5341
5342 /* don't go beyond the object's end... */
5343 if (object_offset >= object->size) {
5344 map_size = 0;
5345 } else if (map_size > object->size - offset) {
5346 map_size = object->size - offset;
5347 }
5348
5349 vm_object_reference_locked(object); /* for the map entry */
5350 vm_object_unlock(object);
5351
5352 kr = vm_map_enter(kernel_map,
5353 address,
5354 map_size,
5355 0,
5356 VM_FLAGS_ANYWHERE,
5357 object,
5358 object_offset,
5359 FALSE,
5360 VM_PROT_DEFAULT,
5361 VM_PROT_ALL,
5362 VM_INHERIT_NONE);
5363 if (kr != KERN_SUCCESS) {
5364 *address = 0;
5365 *size = 0;
5366 vm_object_deallocate(object); /* for the map entry */
5367 return kr;
5368 }
5369
5370 *size = map_size;
5371
5372 /*
5373 * Enter the mapped pages in the page table now.
5374 */
5375 vm_object_lock(object);
5376 for (page_map_offset = 0;
5377 map_size != 0;
5378 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5379 unsigned int cache_attr;
5380
5381 page = vm_page_lookup(object, offset + page_map_offset);
5382 if (page == VM_PAGE_NULL) {
5383 panic("vm_paging_map_object: no page !?");
5384 }
5385 if (page->no_isync == TRUE) {
5386 pmap_sync_page_data_phys(page->phys_page);
5387 }
5388 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5389
5390 PMAP_ENTER(kernel_pmap,
5391 *address + page_map_offset,
5392 page,
5393 VM_PROT_DEFAULT,
5394 cache_attr,
5395 FALSE);
5396 }
5397
5398 vm_paging_objects_mapped_slow++;
5399 vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5400
5401 return KERN_SUCCESS;
5402 }
5403
5404 /*
5405 * ENCRYPTED SWAP:
5406 * vm_paging_unmap_object:
5407 * Unmaps part of a VM object's pages from the kernel
5408 * virtual address space.
5409 * Context:
5410 * The VM object is locked. This lock will get
5411 * dropped and re-acquired though.
5412 */
5413 void
5414 vm_paging_unmap_object(
5415 vm_object_t object,
5416 vm_map_offset_t start,
5417 vm_map_offset_t end)
5418 {
5419 kern_return_t kr;
5420 #ifdef __ppc__
5421 int i;
5422 #endif /* __ppc__ */
5423
5424 if ((vm_paging_base_address != 0) &&
5425 ((start < vm_paging_base_address) ||
5426 (end > (vm_paging_base_address
5427 + (VM_PAGING_NUM_PAGES * PAGE_SIZE))))) {
5428 /*
5429 * We didn't use our pre-allocated pool of
5430 * kernel virtual address. Deallocate the
5431 * virtual memory.
5432 */
5433 if (object != VM_OBJECT_NULL) {
5434 vm_object_unlock(object);
5435 }
5436 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5437 if (object != VM_OBJECT_NULL) {
5438 vm_object_lock(object);
5439 }
5440 assert(kr == KERN_SUCCESS);
5441 } else {
5442 /*
5443 * We used a kernel virtual address from our
5444 * pre-allocated pool. Put it back in the pool
5445 * for next time.
5446 */
5447 #ifdef __ppc__
5448 assert(end - start == PAGE_SIZE);
5449 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5450
5451 /* undo the pmap mapping */
5452 mapping_remove(kernel_pmap, start);
5453
5454 simple_lock(&vm_paging_lock);
5455 vm_paging_page_inuse[i] = FALSE;
5456 simple_unlock(&vm_paging_lock);
5457 #endif /* __ppc__ */
5458 }
5459 }
5460
5461 /*
5462 * Encryption data.
5463 * "iv" is the "initial vector". Ideally, we want to
5464 * have a different one for each page we encrypt, so that
5465 * crackers can't find encryption patterns too easily.
5466 */
5467 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
5468 boolean_t swap_crypt_ctx_initialized = FALSE;
5469 aes_32t swap_crypt_key[8]; /* big enough for a 256 key */
5470 aes_ctx swap_crypt_ctx;
5471 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5472
5473 #if DEBUG
5474 boolean_t swap_crypt_ctx_tested = FALSE;
5475 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5476 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5477 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5478 #endif /* DEBUG */
5479
5480 extern u_long random(void);
5481
5482 /*
5483 * Initialize the encryption context: key and key size.
5484 */
5485 void swap_crypt_ctx_initialize(void); /* forward */
5486 void
5487 swap_crypt_ctx_initialize(void)
5488 {
5489 unsigned int i;
5490
5491 /*
5492 * No need for locking to protect swap_crypt_ctx_initialized
5493 * because the first use of encryption will come from the
5494 * pageout thread (we won't pagein before there's been a pageout)
5495 * and there's only one pageout thread.
5496 */
5497 if (swap_crypt_ctx_initialized == FALSE) {
5498 for (i = 0;
5499 i < (sizeof (swap_crypt_key) /
5500 sizeof (swap_crypt_key[0]));
5501 i++) {
5502 swap_crypt_key[i] = random();
5503 }
5504 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5505 SWAP_CRYPT_AES_KEY_SIZE,
5506 &swap_crypt_ctx.encrypt);
5507 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5508 SWAP_CRYPT_AES_KEY_SIZE,
5509 &swap_crypt_ctx.decrypt);
5510 swap_crypt_ctx_initialized = TRUE;
5511 }
5512
5513 #if DEBUG
5514 /*
5515 * Validate the encryption algorithms.
5516 */
5517 if (swap_crypt_ctx_tested == FALSE) {
5518 /* initialize */
5519 for (i = 0; i < 4096; i++) {
5520 swap_crypt_test_page_ref[i] = (char) i;
5521 }
5522 /* encrypt */
5523 aes_encrypt_cbc(swap_crypt_test_page_ref,
5524 swap_crypt_null_iv,
5525 PAGE_SIZE / AES_BLOCK_SIZE,
5526 swap_crypt_test_page_encrypt,
5527 &swap_crypt_ctx.encrypt);
5528 /* decrypt */
5529 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5530 swap_crypt_null_iv,
5531 PAGE_SIZE / AES_BLOCK_SIZE,
5532 swap_crypt_test_page_decrypt,
5533 &swap_crypt_ctx.decrypt);
5534 /* compare result with original */
5535 for (i = 0; i < 4096; i ++) {
5536 if (swap_crypt_test_page_decrypt[i] !=
5537 swap_crypt_test_page_ref[i]) {
5538 panic("encryption test failed");
5539 }
5540 }
5541
5542 /* encrypt again */
5543 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5544 swap_crypt_null_iv,
5545 PAGE_SIZE / AES_BLOCK_SIZE,
5546 swap_crypt_test_page_decrypt,
5547 &swap_crypt_ctx.encrypt);
5548 /* decrypt in place */
5549 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5550 swap_crypt_null_iv,
5551 PAGE_SIZE / AES_BLOCK_SIZE,
5552 swap_crypt_test_page_decrypt,
5553 &swap_crypt_ctx.decrypt);
5554 for (i = 0; i < 4096; i ++) {
5555 if (swap_crypt_test_page_decrypt[i] !=
5556 swap_crypt_test_page_ref[i]) {
5557 panic("in place encryption test failed");
5558 }
5559 }
5560
5561 swap_crypt_ctx_tested = TRUE;
5562 }
5563 #endif /* DEBUG */
5564 }
5565
5566 /*
5567 * ENCRYPTED SWAP:
5568 * vm_page_encrypt:
5569 * Encrypt the given page, for secure paging.
5570 * The page might already be mapped at kernel virtual
5571 * address "kernel_mapping_offset". Otherwise, we need
5572 * to map it.
5573 *
5574 * Context:
5575 * The page's object is locked, but this lock will be released
5576 * and re-acquired.
5577 * The page is busy and not accessible by users (not entered in any pmap).
5578 */
5579 void
5580 vm_page_encrypt(
5581 vm_page_t page,
5582 vm_map_offset_t kernel_mapping_offset)
5583 {
5584 int clear_refmod = 0;
5585 kern_return_t kr;
5586 boolean_t page_was_referenced;
5587 boolean_t page_was_modified;
5588 vm_map_size_t kernel_mapping_size;
5589 vm_offset_t kernel_vaddr;
5590 union {
5591 unsigned char aes_iv[AES_BLOCK_SIZE];
5592 struct {
5593 memory_object_t pager_object;
5594 vm_object_offset_t paging_offset;
5595 } vm;
5596 } encrypt_iv;
5597
5598 if (! vm_pages_encrypted) {
5599 vm_pages_encrypted = TRUE;
5600 }
5601
5602 assert(page->busy);
5603 assert(page->dirty || page->precious);
5604
5605 if (page->encrypted) {
5606 /*
5607 * Already encrypted: no need to do it again.
5608 */
5609 vm_page_encrypt_already_encrypted_counter++;
5610 return;
5611 }
5612 ASSERT_PAGE_DECRYPTED(page);
5613
5614 /*
5615 * Gather the "reference" and "modified" status of the page.
5616 * We'll restore these values after the encryption, so that
5617 * the encryption is transparent to the rest of the system
5618 * and doesn't impact the VM's LRU logic.
5619 */
5620 page_was_referenced =
5621 (page->reference || pmap_is_referenced(page->phys_page));
5622 page_was_modified =
5623 (page->dirty || pmap_is_modified(page->phys_page));
5624
5625 if (kernel_mapping_offset == 0) {
5626 /*
5627 * The page hasn't already been mapped in kernel space
5628 * by the caller. Map it now, so that we can access
5629 * its contents and encrypt them.
5630 */
5631 kernel_mapping_size = PAGE_SIZE;
5632 kr = vm_paging_map_object(&kernel_mapping_offset,
5633 page,
5634 page->object,
5635 page->offset,
5636 &kernel_mapping_size);
5637 if (kr != KERN_SUCCESS) {
5638 panic("vm_page_encrypt: "
5639 "could not map page in kernel: 0x%x\n",
5640 kr);
5641 }
5642 } else {
5643 kernel_mapping_size = 0;
5644 }
5645 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5646
5647 if (swap_crypt_ctx_initialized == FALSE) {
5648 swap_crypt_ctx_initialize();
5649 }
5650 assert(swap_crypt_ctx_initialized);
5651
5652 /*
5653 * Prepare an "initial vector" for the encryption.
5654 * We use the "pager" and the "paging_offset" for that
5655 * page to obfuscate the encrypted data a bit more and
5656 * prevent crackers from finding patterns that they could
5657 * use to break the key.
5658 */
5659 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5660 encrypt_iv.vm.pager_object = page->object->pager;
5661 encrypt_iv.vm.paging_offset =
5662 page->object->paging_offset + page->offset;
5663
5664 vm_object_unlock(page->object);
5665
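/*
 * The (pager, paging_offset) pair is itself run through AES first,
 * presumably so that the IV actually used for the page is a full,
 * unpredictable block rather than two easily guessable fields.
 */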
5666 /* encrypt the "initial vector" */
5667 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5668 swap_crypt_null_iv,
5669 1,
5670 &encrypt_iv.aes_iv[0],
5671 &swap_crypt_ctx.encrypt);
5672
5673 /*
5674 * Encrypt the page.
5675 */
5676 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5677 &encrypt_iv.aes_iv[0],
5678 PAGE_SIZE / AES_BLOCK_SIZE,
5679 (unsigned char *) kernel_vaddr,
5680 &swap_crypt_ctx.encrypt);
5681
5682 vm_page_encrypt_counter++;
5683
5684 vm_object_lock(page->object);
5685
5686 /*
5687 * Unmap the page from the kernel's address space,
5688 * if we had to map it ourselves. Otherwise, let
5689 * the caller undo the mapping if needed.
5690 */
5691 if (kernel_mapping_size != 0) {
5692 vm_paging_unmap_object(page->object,
5693 kernel_mapping_offset,
5694 kernel_mapping_offset + kernel_mapping_size);
5695 }
5696
5697 /*
5698 * Restore the "reference" and "modified" bits.
5699 * This should clean up any impact the encryption had
5700 * on them.
5701 */
5702 if (! page_was_referenced) {
5703 clear_refmod |= VM_MEM_REFERENCED;
5704 page->reference = FALSE;
5705 }
5706 if (! page_was_modified) {
5707 clear_refmod |= VM_MEM_MODIFIED;
5708 page->dirty = FALSE;
5709 }
5710 if (clear_refmod)
5711 pmap_clear_refmod(page->phys_page, clear_refmod);
5712
5713 page->encrypted = TRUE;
5714 }
5715
5716 /*
5717 * ENCRYPTED SWAP:
5718 * vm_page_decrypt:
5719 * Decrypt the given page.
5720 * The page might already be mapped at kernel virtual
5721 * address "kernel_mapping_offset". Otherwise, we need
5722 * to map it.
5723 *
5724 * Context:
5725 * The page's VM object is locked but will be unlocked and relocked.
5726 * The page is busy and not accessible by users (not entered in any pmap).
5727 */
5728 void
5729 vm_page_decrypt(
5730 vm_page_t page,
5731 vm_map_offset_t kernel_mapping_offset)
5732 {
5733 int clear_refmod = 0;
5734 kern_return_t kr;
5735 vm_map_size_t kernel_mapping_size;
5736 vm_offset_t kernel_vaddr;
5737 boolean_t page_was_referenced;
5738 union {
5739 unsigned char aes_iv[AES_BLOCK_SIZE];
5740 struct {
5741 memory_object_t pager_object;
5742 vm_object_offset_t paging_offset;
5743 } vm;
5744 } decrypt_iv;
5745
5746 assert(page->busy);
5747 assert(page->encrypted);
5748
5749 /*
5750 * Gather the "reference" status of the page.
5751 * We'll restore its value after the decryption, so that
5752 * the decryption is transparent to the rest of the system
5753 * and doesn't impact the VM's LRU logic.
5754 */
5755 page_was_referenced =
5756 (page->reference || pmap_is_referenced(page->phys_page));
5757
5758 if (kernel_mapping_offset == 0) {
5759 /*
5760 * The page hasn't already been mapped in kernel space
5761 * by the caller. Map it now, so that we can access
5762 * its contents and decrypt them.
5763 */
5764 kernel_mapping_size = PAGE_SIZE;
5765 kr = vm_paging_map_object(&kernel_mapping_offset,
5766 page,
5767 page->object,
5768 page->offset,
5769 &kernel_mapping_size);
5770 if (kr != KERN_SUCCESS) {
5771 panic("vm_page_decrypt: "
5772 "could not map page in kernel: 0x%x\n", kr);
5773 }
5774 } else {
5775 kernel_mapping_size = 0;
5776 }
5777 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5778
5779 assert(swap_crypt_ctx_initialized);
5780
5781 /*
5782 * Prepare an "initial vector" for the decryption.
5783 * It has to be the same as the "initial vector" we
5784 * used to encrypt that page.
5785 */
5786 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5787 decrypt_iv.vm.pager_object = page->object->pager;
5788 decrypt_iv.vm.paging_offset =
5789 page->object->paging_offset + page->offset;
5790
5791 vm_object_unlock(page->object);
5792
5793 /* encrypt the "initial vector" */
5794 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5795 swap_crypt_null_iv,
5796 1,
5797 &decrypt_iv.aes_iv[0],
5798 &swap_crypt_ctx.encrypt);
5799
5800 /*
5801 * Decrypt the page.
5802 */
5803 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5804 &decrypt_iv.aes_iv[0],
5805 PAGE_SIZE / AES_BLOCK_SIZE,
5806 (unsigned char *) kernel_vaddr,
5807 &swap_crypt_ctx.decrypt);
5808 vm_page_decrypt_counter++;
5809
5810 vm_object_lock(page->object);
5811
5812 /*
5813 * Unmap the page from the kernel's address space,
5814 * if we had to map it ourselves. Otherwise, let
5815 * the caller undo the mapping if needed.
5816 */
5817 if (kernel_mapping_size != 0) {
5818 vm_paging_unmap_object(page->object,
5819 kernel_vaddr,
5820 kernel_vaddr + PAGE_SIZE);
5821 }
5822
5823 /*
5824 * After decryption, the page is actually clean.
5825 * It was encrypted as part of paging, which "cleans"
5826 * the "dirty" pages.
5827 * No one could access it after it was encrypted
5828 * and the decryption doesn't count.
5829 */
5830 page->dirty = FALSE;
5831 clear_refmod = VM_MEM_MODIFIED;
5832
5833 /* restore the "reference" bit */
5834 if (! page_was_referenced) {
5835 page->reference = FALSE;
5836 clear_refmod |= VM_MEM_REFERENCED;
5837 }
5838 pmap_clear_refmod(page->phys_page, clear_refmod);
5839
5840 page->encrypted = FALSE;
5841
5842 /*
5843 * We've just modified the page's contents via the data cache and part
5844 * of the new contents might still be in the cache and not yet in RAM.
5845 * Since the page is now available and might get gathered in a UPL to
5846 * be part of a DMA transfer from a driver that expects the memory to
5847 * be coherent at this point, we have to flush the data cache.
5848 */
5849 pmap_sync_page_data_phys(page->phys_page);
5850 /*
5851 * Since the page is not mapped yet, some code might assume that it
5852 * doesn't need to invalidate the instruction cache when writing to
5853 * that page. That code relies on "no_isync" being set, so that the
5854 * caches get synchronized when the page is first mapped. So we need
5855 * to set "no_isync" here too, despite the fact that we just
5856 * synchronized the caches above...
5857 */
5858 page->no_isync = TRUE;
5859 }
5860
5861 unsigned long upl_encrypt_upls = 0;
5862 unsigned long upl_encrypt_pages = 0;
5863
5864 /*
5865 * ENCRYPTED SWAP:
5866 *
5867 * upl_encrypt:
5868 * Encrypts all the pages in the UPL, within the specified range.
5869 *
5870 */
5871 void
5872 upl_encrypt(
5873 upl_t upl,
5874 upl_offset_t crypt_offset,
5875 upl_size_t crypt_size)
5876 {
5877 upl_size_t upl_size;
5878 upl_offset_t upl_offset;
5879 vm_object_t upl_object;
5880 vm_page_t page;
5881 vm_object_t shadow_object;
5882 vm_object_offset_t shadow_offset;
5883 vm_object_offset_t paging_offset;
5884 vm_object_offset_t base_offset;
5885
5886 upl_encrypt_upls++;
5887 upl_encrypt_pages += crypt_size / PAGE_SIZE;
5888
5889 upl_lock(upl);
5890
5891 upl_object = upl->map_object;
5892 upl_offset = upl->offset;
5893 upl_size = upl->size;
5894
5895 upl_unlock(upl);
5896
5897 vm_object_lock(upl_object);
5898
5899 /*
5900 * Find the VM object that contains the actual pages.
5901 */
5902 if (upl_object->pageout) {
5903 shadow_object = upl_object->shadow;
5904 /*
5905 * The offset in the shadow object is actually also
5906 * accounted for in upl->offset. It possibly shouldn't be
5907 * this way, but for now don't account for it twice.
5908 */
5909 shadow_offset = 0;
5910 assert(upl_object->paging_offset == 0); /* XXX ? */
5911 vm_object_lock(shadow_object);
5912 } else {
5913 shadow_object = upl_object;
5914 shadow_offset = 0;
5915 }
5916
5917 paging_offset = shadow_object->paging_offset;
5918 vm_object_paging_begin(shadow_object);
5919
5920 if (shadow_object != upl_object) {
5921 vm_object_unlock(shadow_object);
5922 }
5923 vm_object_unlock(upl_object);
5924
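/*
 * Compute where the pages to encrypt live in the shadow object:
 * upl->offset already includes the object's paging_offset, so add the
 * offset within the UPL and then subtract paging_offset to get back to
 * an offset in the shadow object's page space.
 */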
5925 base_offset = shadow_offset;
5926 base_offset += upl_offset;
5927 base_offset += crypt_offset;
5928 base_offset -= paging_offset;
5929 /*
5930 * Unmap the pages, so that nobody can continue accessing them while
5931 * they're encrypted. After that point, all accesses to these pages
5932 * will cause a page fault and block while the page is being encrypted
5933 * (busy). After the encryption completes, any access will cause a
5934 * page fault and the page gets decrypted at that time.
5935 */
5936 assert(crypt_offset + crypt_size <= upl_size);
5937 vm_object_pmap_protect(shadow_object,
5938 base_offset,
5939 (vm_object_size_t)crypt_size,
5940 PMAP_NULL,
5941 0,
5942 VM_PROT_NONE);
5943
5944 /* XXX FBDP could the object have changed significantly here ? */
5945 vm_object_lock(shadow_object);
5946
5947 for (upl_offset = 0;
5948 upl_offset < crypt_size;
5949 upl_offset += PAGE_SIZE) {
5950 page = vm_page_lookup(shadow_object,
5951 base_offset + upl_offset);
5952 if (page == VM_PAGE_NULL) {
5953 panic("upl_encrypt: "
5954 "no page for (obj=%p,off=%lld+%d)!\n",
5955 shadow_object,
5956 base_offset,
5957 upl_offset);
5958 }
5959 vm_page_encrypt(page, 0);
5960 }
5961
5962 vm_object_paging_end(shadow_object);
5963 vm_object_unlock(shadow_object);
5964 }
5965
5966 vm_size_t
5967 upl_get_internal_pagelist_offset(void)
5968 {
5969 return sizeof(struct upl);
5970 }
5971
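/*
 * Note: despite its name, upl_set_dirty() sets the UPL_CLEAR_DIRTY
 * flag, which causes upl_commit_range() to clear the dirty bit on the
 * pages when this UPL is committed (see UPL_COMMIT_CLEAR_DIRTY above).
 */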
5972 void
5973 upl_set_dirty(
5974 upl_t upl)
5975 {
5976 upl->flags |= UPL_CLEAR_DIRTY;
5977 }
5978
5979 void
5980 upl_clear_dirty(
5981 upl_t upl)
5982 {
5983 upl->flags &= ~UPL_CLEAR_DIRTY;
5984 }
5985
5986
5987 #ifdef MACH_BSD
5988
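/*
 * Thin accessors over the upl_page_info array: each simply wraps the
 * corresponding UPL_*_PAGE / UPL_PHYS_PAGE macro, presumably so the BSD
 * side of the kernel (hence the MACH_BSD guard) can query page state
 * without seeing the macro definitions.
 */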
5989 boolean_t upl_page_present(upl_page_info_t *upl, int index)
5990 {
5991 return(UPL_PAGE_PRESENT(upl, index));
5992 }
5993 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
5994 {
5995 return(UPL_DIRTY_PAGE(upl, index));
5996 }
5997 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
5998 {
5999 return(UPL_VALID_PAGE(upl, index));
6000 }
6001 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
6002 {
6003 return(UPL_PHYS_PAGE(upl, index));
6004 }
6005
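/*
 * vm_countdirtypages:
 * Debugging aid: walks the inactive + zero-fill queues and then the
 * active queue, counting pages that are dirty, queued for pageout, or
 * precious, and prints the totals for each pass.
 */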
6006 void
6007 vm_countdirtypages(void)
6008 {
6009 vm_page_t m;
6010 int dpages;
6011 int pgopages;
6012 int precpages;
6013
6014
6015 dpages=0;
6016 pgopages=0;
6017 precpages=0;
6018
6019 vm_page_lock_queues();
6020 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6021 do {
6022 if (m == (vm_page_t) 0) break;
6023
6024 if (m->dirty) dpages++;
6025 if (m->pageout) pgopages++;
6026 if (m->precious) precpages++;
6027
6028 assert(m->object != kernel_object);
6029 m = (vm_page_t) queue_next(&m->pageq);
6030 if (m == (vm_page_t) 0) break;
6031
6032 } while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m));
6033 vm_page_unlock_queues();
6034
6035 vm_page_lock_queues();
6036 m = (vm_page_t) queue_first(&vm_page_queue_zf);
6037 do {
6038 if (m == (vm_page_t) 0) break;
6039
6040 if (m->dirty) dpages++;
6041 if (m->pageout) pgopages++;
6042 if (m->precious) precpages++;
6043
6044 assert(m->object != kernel_object);
6045 m = (vm_page_t) queue_next(&m->pageq);
6046 if (m == (vm_page_t) 0) break;
6047
6048 } while (!queue_end(&vm_page_queue_zf, (queue_entry_t) m));
6049 vm_page_unlock_queues();
6050
6051 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6052
6053 dpages=0;
6054 pgopages=0;
6055 precpages=0;
6056
6057 vm_page_lock_queues();
6058 m = (vm_page_t) queue_first(&vm_page_queue_active);
6059
6060 do {
6061 if (m == (vm_page_t) 0) break;
6062 if (m->dirty) dpages++;
6063 if (m->pageout) pgopages++;
6064 if (m->precious) precpages++;
6065
6066 assert(m->object != kernel_object);
6067 m = (vm_page_t) queue_next(&m->pageq);
6068 if (m == (vm_page_t) 0) break;
6069
6070 } while (!queue_end(&vm_page_queue_active, (queue_entry_t) m));
6071 vm_page_unlock_queues();
6072
6073 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6074
6075 }
6076 #endif /* MACH_BSD */
6077
6078 #ifdef UPL_DEBUG
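/*
 * UPL_DEBUG only: stash and retrieve two caller-supplied alias words in
 * the UPL, apparently used to tag UPLs from the UBC layer while
 * debugging.
 */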
6079 kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6080 {
6081 upl->ubc_alias1 = alias1;
6082 upl->ubc_alias2 = alias2;
6083 return KERN_SUCCESS;
6084 }
6085 int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6086 {
6087 if (al)
6088 *al = upl->ubc_alias1;
6089 if (al2)
6090 *al2 = upl->ubc_alias2;
6091 return KERN_SUCCESS;
6092 }
6093 #endif /* UPL_DEBUG */
6094
6095
6096
6097 #if MACH_KDB
6098 #include <ddb/db_output.h>
6099 #include <ddb/db_print.h>
6100 #include <vm/vm_print.h>
6101
6102 #define printf kdbprintf
6103 void db_pageout(void);
6104
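/*
 * db_vm:
 * Kernel debugger (ddb) command: prints the global page counts and the
 * paging targets, then hands off to db_pageout() for the pageout
 * daemon's own statistics.
 */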
6105 void
6106 db_vm(void)
6107 {
6108
6109 iprintf("VM Statistics:\n");
6110 db_indent += 2;
6111 iprintf("pages:\n");
6112 db_indent += 2;
6113 iprintf("activ %5d inact %5d free %5d",
6114 vm_page_active_count, vm_page_inactive_count,
6115 vm_page_free_count);
6116 printf(" wire %5d gobbl %5d\n",
6117 vm_page_wire_count, vm_page_gobble_count);
6118 db_indent -= 2;
6119 iprintf("target:\n");
6120 db_indent += 2;
6121 iprintf("min %5d inact %5d free %5d",
6122 vm_page_free_min, vm_page_inactive_target,
6123 vm_page_free_target);
6124 printf(" resrv %5d\n", vm_page_free_reserved);
6125 db_indent -= 2;
6126 iprintf("pause:\n");
6127 db_pageout();
6128 db_indent -= 2;
6129 }
6130
6131 #if MACH_COUNTERS
6132 extern int c_laundry_pages_freed;
6133 #endif /* MACH_COUNTERS */
6134
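/*
 * db_pageout:
 * Prints the pageout daemon's activity counters, plus the laundry
 * counter when MACH_COUNTERS is configured and the cluster/target
 * statistics when MACH_CLUSTER_STATS is configured.
 */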
6135 void
6136 db_pageout(void)
6137 {
6138 iprintf("Pageout Statistics:\n");
6139 db_indent += 2;
6140 iprintf("active %5d inactv %5d\n",
6141 vm_pageout_active, vm_pageout_inactive);
6142 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
6143 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6144 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6145 iprintf("used %5d clean %5d dirty %5d\n",
6146 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6147 vm_pageout_inactive_dirty);
6148 #if MACH_COUNTERS
6149 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6150 #endif /* MACH_COUNTERS */
6151 #if MACH_CLUSTER_STATS
6152 iprintf("Cluster Statistics:\n");
6153 db_indent += 2;
6154 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
6155 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6156 vm_pageout_cluster_collisions);
6157 iprintf("clusters %5d conversions %5d\n",
6158 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6159 db_indent -= 2;
6160 iprintf("Target Statistics:\n");
6161 db_indent += 2;
6162 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
6163 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6164 vm_pageout_target_page_freed);
6165 db_indent -= 2;
6166 #endif /* MACH_CLUSTER_STATS */
6167 db_indent -= 2;
6168 }
6169
6170 #endif /* MACH_KDB */