/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	vm/vm_pageout.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *	Date:	1985
 *
 *	The proverbial page-out daemon.
 */

#include <stdint.h>

#include <debug.h>
#include <mach_pagemap.h>
#include <mach_cluster_stats.h>
#include <mach_kdb.h>
#include <advisory_pageout.h>

#include <mach/mach_types.h>
#include <mach/memory_object.h>
#include <mach/memory_object_default.h>
#include <mach/memory_object_control_server.h>
#include <mach/mach_host_server.h>
#include <mach/upl.h>
#include <mach/vm_map.h>
#include <mach/vm_param.h>
#include <mach/vm_statistics.h>
#include <mach/sdt.h>

#include <kern/kern_types.h>
#include <kern/counters.h>
#include <kern/host_statistics.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/sched.h>
#include <kern/thread.h>
#include <kern/xpr.h>
#include <kern/kalloc.h>

#include <machine/vm_tuning.h>
#include <machine/commpage.h>

#if CONFIG_EMBEDDED
#include <sys/kern_memorystatus.h>
#endif

#include <vm/pmap.h>
#include <vm/vm_fault.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_protos.h>	/* must be last */
#include <vm/memory_object.h>
#include <vm/vm_purgeable_internal.h>

/*
 * ENCRYPTED SWAP:
 */
#include <../bsd/crypto/aes/aes.h>
extern u_int32_t random(void);	/* from <libkern/libkern.h> */

#if UPL_DEBUG
#include <libkern/OSDebug.h>
#endif

#ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE	/* maximum iterations of the active queue to move pages to inactive */
#define VM_PAGEOUT_BURST_ACTIVE_THROTTLE	100
#endif

#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE	/* maximum iterations of the inactive queue w/o stealing/cleaning a page */
#ifdef CONFIG_EMBEDDED
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE	1024
#else
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE	4096
#endif
#endif

#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
#define VM_PAGEOUT_DEADLOCK_RELIEF	100	/* number of pages to move to break deadlock */
#endif

#ifndef VM_PAGEOUT_INACTIVE_RELIEF
#define VM_PAGEOUT_INACTIVE_RELIEF	50	/* minimum number of pages to move to the inactive q */
#endif

#ifndef VM_PAGE_LAUNDRY_MAX
#define VM_PAGE_LAUNDRY_MAX	16UL	/* maximum pageouts on a given pageout queue */
#endif	/* VM_PAGE_LAUNDRY_MAX */

#ifndef VM_PAGEOUT_BURST_WAIT
#define VM_PAGEOUT_BURST_WAIT	30	/* milliseconds per page */
#endif	/* VM_PAGEOUT_BURST_WAIT */

#ifndef VM_PAGEOUT_EMPTY_WAIT
#define VM_PAGEOUT_EMPTY_WAIT	200	/* milliseconds */
#endif	/* VM_PAGEOUT_EMPTY_WAIT */

#ifndef VM_PAGEOUT_DEADLOCK_WAIT
#define VM_PAGEOUT_DEADLOCK_WAIT	300	/* milliseconds */
#endif	/* VM_PAGEOUT_DEADLOCK_WAIT */

#ifndef VM_PAGEOUT_IDLE_WAIT
#define VM_PAGEOUT_IDLE_WAIT	10	/* milliseconds */
#endif	/* VM_PAGEOUT_IDLE_WAIT */

#ifndef VM_PAGE_SPECULATIVE_TARGET
#define VM_PAGE_SPECULATIVE_TARGET(total)	((total) * 1 / 20)
#endif	/* VM_PAGE_SPECULATIVE_TARGET */

#ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
#define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total)	((total) * 1 / 200)
#endif	/* VM_PAGE_INACTIVE_HEALTHY_LIMIT */


/*
 *	To obtain a reasonable LRU approximation, the inactive queue
 *	needs to be large enough to give pages on it a chance to be
 *	referenced a second time.  This macro defines the fraction
 *	of active+inactive pages that should be inactive.
 *	The pageout daemon uses it to update vm_page_inactive_target.
 *
 *	If vm_page_free_count falls below vm_page_free_target and
 *	vm_page_inactive_count is below vm_page_inactive_target,
 *	then the pageout daemon starts running.
 */

#ifndef VM_PAGE_INACTIVE_TARGET
#define VM_PAGE_INACTIVE_TARGET(avail)	((avail) * 1 / 3)
#endif	/* VM_PAGE_INACTIVE_TARGET */
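
/*
 * For example, with 90,000 active+inactive pages available, the default
 * 1/3 fraction above yields a vm_page_inactive_target of 30,000 pages.
 */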

/*
 *	Once the pageout daemon starts running, it keeps going
 *	until vm_page_free_count meets or exceeds vm_page_free_target.
 */

#ifndef VM_PAGE_FREE_TARGET
#ifdef CONFIG_EMBEDDED
#define VM_PAGE_FREE_TARGET(free)	(15 + (free) / 100)
#else
#define VM_PAGE_FREE_TARGET(free)	(15 + (free) / 80)
#endif
#endif	/* VM_PAGE_FREE_TARGET */

/*
 *	The pageout daemon always starts running once vm_page_free_count
 *	falls below vm_page_free_min.
 */

#ifndef VM_PAGE_FREE_MIN
#ifdef CONFIG_EMBEDDED
#define VM_PAGE_FREE_MIN(free)		(10 + (free) / 200)
#else
#define VM_PAGE_FREE_MIN(free)		(10 + (free) / 100)
#endif
#endif	/* VM_PAGE_FREE_MIN */

#define VM_PAGE_FREE_MIN_LIMIT		1500
#define VM_PAGE_FREE_TARGET_LIMIT	2000


/*
 *	When vm_page_free_count falls below vm_page_free_reserved,
 *	only vm-privileged threads can allocate pages.  vm-privilege
 *	allows the pageout daemon and default pager (and any other
 *	associated threads needed for default pageout) to continue
 *	operation by dipping into the reserved pool of pages.
 */

#ifndef VM_PAGE_FREE_RESERVED
#define VM_PAGE_FREE_RESERVED(n)	\
	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
#endif	/* VM_PAGE_FREE_RESERVED */
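
/*
 * With the default VM_PAGE_LAUNDRY_MAX of 16, this reserves
 * (6 * 16) + (n) = 96 + (n) free pages for vm-privileged threads.
 */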

/*
 *	When we dequeue pages from the inactive list, they are
 *	reactivated (ie, put back on the active queue) if referenced.
 *	However, it is possible to starve the free list if other
 *	processors are referencing pages faster than we can turn off
 *	the referenced bit.  So we limit the number of reactivations
 *	we will make per call of vm_pageout_scan().
 */
#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
#ifndef VM_PAGE_REACTIVATE_LIMIT
#ifdef CONFIG_EMBEDDED
#define VM_PAGE_REACTIVATE_LIMIT(avail)	(VM_PAGE_INACTIVE_TARGET(avail) / 2)
#else
#define VM_PAGE_REACTIVATE_LIMIT(avail)	(MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
#endif
#endif	/* VM_PAGE_REACTIVATE_LIMIT */
#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM	100


/*
 * Exported variable used to broadcast the activation of the pageout scan
 * Working Set uses this to throttle its use of pmap removes.  In this
 * way, code which runs within memory in an uncontested context does
 * not keep encountering soft faults.
 */

unsigned int	vm_pageout_scan_event_counter = 0;

/*
 * Forward declarations for internal routines.
 */

static void vm_pageout_garbage_collect(int);
static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
static void vm_pageout_iothread_external(void);
static void vm_pageout_iothread_internal(void);

extern void vm_pageout_continue(void);
extern void vm_pageout_scan(void);

static thread_t	vm_pageout_external_iothread = THREAD_NULL;
static thread_t	vm_pageout_internal_iothread = THREAD_NULL;

unsigned int vm_pageout_reserved_internal = 0;
unsigned int vm_pageout_reserved_really = 0;

unsigned int vm_pageout_idle_wait = 0;		/* milliseconds */
unsigned int vm_pageout_empty_wait = 0;		/* milliseconds */
unsigned int vm_pageout_burst_wait = 0;		/* milliseconds */
unsigned int vm_pageout_deadlock_wait = 0;	/* milliseconds */
unsigned int vm_pageout_deadlock_relief = 0;
unsigned int vm_pageout_inactive_relief = 0;
unsigned int vm_pageout_burst_active_throttle = 0;
unsigned int vm_pageout_burst_inactive_throttle = 0;

/*
 * Protection against zero fill flushing live working sets derived
 * from existing backing store and files
 */
unsigned int vm_accellerate_zf_pageout_trigger = 400;
unsigned int zf_queue_min_count = 100;
unsigned int vm_zf_queue_count = 0;

#if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
unsigned int vm_zf_count = 0;
#else
uint64_t vm_zf_count __attribute__((aligned(8))) = 0;
#endif

/*
 * These variables record the pageout daemon's actions:
 * how many pages it looks at and what happens to those pages.
 * No locking needed because only one thread modifies the variables.
 */

unsigned int vm_pageout_active = 0;		/* debugging */
unsigned int vm_pageout_inactive = 0;		/* debugging */
unsigned int vm_pageout_inactive_throttled = 0;	/* debugging */
unsigned int vm_pageout_inactive_forced = 0;	/* debugging */
unsigned int vm_pageout_inactive_nolock = 0;	/* debugging */
unsigned int vm_pageout_inactive_avoid = 0;	/* debugging */
unsigned int vm_pageout_inactive_busy = 0;	/* debugging */
unsigned int vm_pageout_inactive_absent = 0;	/* debugging */
unsigned int vm_pageout_inactive_used = 0;	/* debugging */
unsigned int vm_pageout_inactive_clean = 0;	/* debugging */
unsigned int vm_pageout_inactive_dirty = 0;	/* debugging */
unsigned int vm_pageout_inactive_deactivated = 0;	/* debugging */
unsigned int vm_pageout_inactive_zf = 0;	/* debugging */
unsigned int vm_pageout_dirty_no_pager = 0;	/* debugging */
unsigned int vm_pageout_purged_objects = 0;	/* debugging */
unsigned int vm_stat_discard = 0;		/* debugging */
unsigned int vm_stat_discard_sent = 0;		/* debugging */
unsigned int vm_stat_discard_failure = 0;	/* debugging */
unsigned int vm_stat_discard_throttle = 0;	/* debugging */
unsigned int vm_pageout_reactivation_limit_exceeded = 0;	/* debugging */
unsigned int vm_pageout_catch_ups = 0;				/* debugging */
unsigned int vm_pageout_inactive_force_reclaim = 0;	/* debugging */

unsigned int vm_pageout_scan_active_throttled = 0;
unsigned int vm_pageout_scan_inactive_throttled = 0;
unsigned int vm_pageout_scan_throttle = 0;		/* debugging */
unsigned int vm_pageout_scan_throttle_aborted = 0;	/* debugging */
unsigned int vm_pageout_scan_burst_throttle = 0;	/* debugging */
unsigned int vm_pageout_scan_empty_throttle = 0;	/* debugging */
unsigned int vm_pageout_scan_deadlock_detected = 0;	/* debugging */
unsigned int vm_pageout_scan_active_throttle_success = 0;	/* debugging */
unsigned int vm_pageout_scan_inactive_throttle_success = 0;	/* debugging */

unsigned int vm_page_speculative_count_drifts = 0;
unsigned int vm_page_speculative_count_drift_max = 0;

/*
 * Backing store throttle when BS is exhausted
 */
unsigned int	vm_backing_store_low = 0;

unsigned int vm_pageout_out_of_line = 0;
unsigned int vm_pageout_in_place = 0;

unsigned int vm_page_steal_pageout_page = 0;

/*
 * ENCRYPTED SWAP:
 * counters and statistics...
 */
unsigned long vm_page_decrypt_counter = 0;
unsigned long vm_page_decrypt_for_upl_counter = 0;
unsigned long vm_page_encrypt_counter = 0;
unsigned long vm_page_encrypt_abort_counter = 0;
unsigned long vm_page_encrypt_already_encrypted_counter = 0;
boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */

struct	vm_pageout_queue vm_pageout_queue_internal;
struct	vm_pageout_queue vm_pageout_queue_external;

unsigned int vm_page_speculative_target = 0;

vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;

boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL;

#if DEVELOPMENT || DEBUG
unsigned long vm_cs_validated_resets = 0;
#endif

/*
 *	Routine:	vm_backing_store_disable
 *	Purpose:
 *		Suspend non-privileged threads wishing to extend
 *		backing store when we are low on backing store
 *		(Synchronized by caller)
 */
void
vm_backing_store_disable(
	boolean_t	disable)
{
	if (disable) {
		vm_backing_store_low = 1;
	} else {
		if (vm_backing_store_low) {
			vm_backing_store_low = 0;
			thread_wakeup((event_t) &vm_backing_store_low);
		}
	}
}


#if MACH_CLUSTER_STATS
unsigned long vm_pageout_cluster_dirtied = 0;
unsigned long vm_pageout_cluster_cleaned = 0;
unsigned long vm_pageout_cluster_collisions = 0;
unsigned long vm_pageout_cluster_clusters = 0;
unsigned long vm_pageout_cluster_conversions = 0;
unsigned long vm_pageout_target_collisions = 0;
unsigned long vm_pageout_target_page_dirtied = 0;
unsigned long vm_pageout_target_page_freed = 0;
#define CLUSTER_STAT(clause)	clause
#else	/* MACH_CLUSTER_STATS */
#define CLUSTER_STAT(clause)
#endif	/* MACH_CLUSTER_STATS */

/*
 *	Routine:	vm_pageout_object_terminate
 *	Purpose:
 *		Destroy the pageout_object, and perform all of the
 *		required cleanup actions.
 *
 *	In/Out conditions:
 *		The object must be locked, and will be returned locked.
 */
void
vm_pageout_object_terminate(
	vm_object_t	object)
{
	vm_object_t	shadow_object;

	/*
	 * Deal with the deallocation (last reference) of a pageout object
	 * (used for cleaning-in-place) by dropping the paging references/
	 * freeing pages in the original object.
	 */

	assert(object->pageout);
	shadow_object = object->shadow;
	vm_object_lock(shadow_object);

	while (!queue_empty(&object->memq)) {
		vm_page_t		p, m;
		vm_object_offset_t	offset;

		p = (vm_page_t) queue_first(&object->memq);

		assert(p->private);
		assert(p->pageout);
		p->pageout = FALSE;
		assert(!p->cleaning);

		offset = p->offset;
		VM_PAGE_FREE(p);
		p = VM_PAGE_NULL;

		m = vm_page_lookup(shadow_object,
			offset + object->shadow_offset);

		if (m == VM_PAGE_NULL)
			continue;
		assert(m->cleaning);
		/* used as a trigger on upl_commit etc to recognize the */
		/* pageout daemon's subsequent desire to pageout a cleaning */
		/* page.  When the bit is on the upl commit code will */
		/* respect the pageout bit in the target page over the */
		/* caller's page list indication */
		m->dump_cleaning = FALSE;

		assert((m->dirty) || (m->precious) ||
		       (m->busy && m->cleaning));

		/*
		 * Handle the trusted pager throttle.
		 * Also decrement the burst throttle (if external).
		 */
		vm_page_lock_queues();
		if (m->laundry) {
			vm_pageout_throttle_up(m);
		}

		/*
		 * Handle the "target" page(s). These pages are to be freed if
		 * successfully cleaned. Target pages are always busy, and are
		 * wired exactly once. The initial target pages are not mapped,
		 * (so cannot be referenced or modified) but converted target
		 * pages may have been modified between the selection as an
		 * adjacent page and conversion to a target.
		 */
		if (m->pageout) {
			assert(m->busy);
			assert(m->wire_count == 1);
			m->cleaning = FALSE;
			m->encrypted_cleaning = FALSE;
			m->pageout = FALSE;
#if MACH_CLUSTER_STATS
			if (m->wanted) vm_pageout_target_collisions++;
#endif
			/*
			 * Revoke all access to the page. Since the object is
			 * locked, and the page is busy, this prevents the page
			 * from being dirtied after the pmap_disconnect() call
			 * returns.
			 *
			 * Since the page is left "dirty" but "not modified", we
			 * can detect whether the page was redirtied during
			 * pageout by checking the modify state.
			 */
			if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
				m->dirty = TRUE;
			else
				m->dirty = FALSE;

			if (m->dirty) {
				CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
				vm_page_unwire(m, TRUE);	/* reactivates */
				VM_STAT_INCR(reactivations);
				PAGE_WAKEUP_DONE(m);
			} else {
				CLUSTER_STAT(vm_pageout_target_page_freed++;)
				vm_page_free(m);	/* clears busy, etc. */
			}
			vm_page_unlock_queues();
			continue;
		}
		/*
		 * Handle the "adjacent" pages. These pages were cleaned in
		 * place, and should be left alone.
		 * If prep_pin_count is nonzero, then someone is using the
		 * page, so make it active.
		 */
		if (!m->active && !m->inactive && !m->throttled && !m->private) {
			if (m->reference)
				vm_page_activate(m);
			else
				vm_page_deactivate(m);
		}
		if ((m->busy) && (m->cleaning)) {

			/* the request_page_list case, (COPY_OUT_FROM FALSE) */
			m->busy = FALSE;

			/* We do not re-set m->dirty ! */
			/* The page was busy so no extraneous activity */
			/* could have occurred. COPY_INTO is a read into the */
			/* new pages. CLEAN_IN_PLACE does actually write */
			/* out the pages but handling outside of this code */
			/* will take care of resetting dirty. We clear the */
			/* modify however for the Programmed I/O case. */
			pmap_clear_modify(m->phys_page);

			m->absent = FALSE;
			m->overwriting = FALSE;
		} else if (m->overwriting) {
			/* alternate request page list, write to page_list */
			/* case.  Occurs when the original page was wired */
			/* at the time of the list request */
			assert(VM_PAGE_WIRED(m));
			vm_page_unwire(m, TRUE);	/* reactivates */
			m->overwriting = FALSE;
		} else {
			/*
			 * Set the dirty state according to whether or not the page was
			 * modified during the pageout. Note that we purposefully do
			 * NOT call pmap_clear_modify since the page is still mapped.
			 * If the page were to be dirtied between the 2 calls, this
			 * fact would be lost. This code is only necessary to
			 * maintain statistics, since the pmap module is always
			 * consulted if m->dirty is false.
			 */
#if MACH_CLUSTER_STATS
			m->dirty = pmap_is_modified(m->phys_page);

			if (m->dirty)   vm_pageout_cluster_dirtied++;
			else            vm_pageout_cluster_cleaned++;
			if (m->wanted)  vm_pageout_cluster_collisions++;
#else
			m->dirty = 0;
#endif
		}
		m->cleaning = FALSE;
		m->encrypted_cleaning = FALSE;

		/*
		 * Wakeup any thread waiting for the page to be un-cleaning.
		 */
		PAGE_WAKEUP(m);
		vm_page_unlock_queues();
	}
	/*
	 * Account for the paging reference taken in vm_paging_object_allocate.
	 */
	vm_object_activity_end(shadow_object);
	vm_object_unlock(shadow_object);

	assert(object->ref_count == 0);
	assert(object->paging_in_progress == 0);
	assert(object->activity_in_progress == 0);
	assert(object->resident_page_count == 0);
	return;
}

/*
 * Routine:	vm_pageclean_setup
 *
 * Purpose:	setup a page to be cleaned (made non-dirty), but not
 *		necessarily flushed from the VM page cache.
 *		This is accomplished by cleaning in place.
 *
 *		The page must not be busy, and new_object
 *		must be locked.
 *
 */
void
vm_pageclean_setup(
	vm_page_t		m,
	vm_page_t		new_m,
	vm_object_t		new_object,
	vm_object_offset_t	new_offset)
{
	assert(!m->busy);
#if 0
	assert(!m->cleaning);
#endif

	XPR(XPR_VM_PAGEOUT,
	    "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
	    m->object, m->offset, m,
	    new_m, new_offset);

	pmap_clear_modify(m->phys_page);

	/*
	 * Mark original page as cleaning in place.
	 */
	m->cleaning = TRUE;
	m->dirty = TRUE;
	m->precious = FALSE;

	/*
	 * Convert the fictitious page to a private shadow of
	 * the real page.
	 */
	assert(new_m->fictitious);
	assert(new_m->phys_page == vm_page_fictitious_addr);
	new_m->fictitious = FALSE;
	new_m->private = TRUE;
	new_m->pageout = TRUE;
	new_m->phys_page = m->phys_page;

	vm_page_lockspin_queues();
	vm_page_wire(new_m);
	vm_page_unlock_queues();

	vm_page_insert(new_m, new_object, new_offset);
	assert(!new_m->wanted);
	new_m->busy = FALSE;
}

/*
 *	Routine:	vm_pageout_initialize_page
 *	Purpose:
 *		Causes the specified page to be initialized in
 *		the appropriate memory object. This routine is used to push
 *		pages into a copy-object when they are modified in the
 *		permanent object.
 *
 *		The page is moved to a temporary object and paged out.
 *
 *	In/out conditions:
 *		The page in question must not be on any pageout queues.
 *		The object to which it belongs must be locked.
 *		The page must be busy, but not hold a paging reference.
 *
 *	Implementation:
 *		Move this page to a completely new object.
 */
void
vm_pageout_initialize_page(
	vm_page_t	m)
{
	vm_object_t		object;
	vm_object_offset_t	paging_offset;
	vm_page_t		holding_page;
	memory_object_t		pager;

	XPR(XPR_VM_PAGEOUT,
	    "vm_pageout_initialize_page, page 0x%X\n",
	    m, 0, 0, 0, 0);
	assert(m->busy);

	/*
	 *	Verify that we really want to clean this page
	 */
	assert(!m->absent);
	assert(!m->error);
	assert(m->dirty);

	/*
	 *	Create a paging reference to let us play with the object.
	 */
	object = m->object;
	paging_offset = m->offset + object->paging_offset;

	if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
		VM_PAGE_FREE(m);
		panic("reservation without pageout?"); /* alan */
		vm_object_unlock(object);

		return;
	}

	/*
	 * If there's no pager, then we can't clean the page.  This should
	 * never happen since this should be a copy object and therefore not
	 * an external object, so the pager should always be there.
	 */

	pager = object->pager;

	if (pager == MEMORY_OBJECT_NULL) {
		VM_PAGE_FREE(m);
		panic("missing pager for copy object");
		return;
	}

	/* set the page for future call to vm_fault_list_request */
	vm_object_paging_begin(object);
	holding_page = NULL;

	pmap_clear_modify(m->phys_page);
	m->dirty = TRUE;
	m->busy = TRUE;
	m->list_req_pending = TRUE;
	m->cleaning = TRUE;
	m->pageout = TRUE;

	vm_page_lockspin_queues();
	vm_page_wire(m);
	vm_page_unlock_queues();

	vm_object_unlock(object);

	/*
	 *	Write the data to its pager.
	 *	Note that the data is passed by naming the new object,
	 *	not a virtual address; the pager interface has been
	 *	manipulated to use the "internal memory" data type.
	 *	[The object reference from its allocation is donated
	 *	to the eventual recipient.]
	 */
	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);

	vm_object_lock(object);
	vm_object_paging_end(object);
}

#if	MACH_CLUSTER_STATS
#define MAXCLUSTERPAGES	16
struct {
	unsigned long pages_in_cluster;
	unsigned long pages_at_higher_offsets;
	unsigned long pages_at_lower_offsets;
} cluster_stats[MAXCLUSTERPAGES];
#endif	/* MACH_CLUSTER_STATS */


/*
 * vm_pageout_cluster:
 *
 * Given a page, queue it to the appropriate I/O thread,
 * which will page it out and attempt to clean adjacent pages
 * in the same operation.
 *
 * The page must be busy, and the object and queues locked. We will take a
 * paging reference to prevent deallocation or collapse when we
 * release the object lock back at the call site.  The I/O thread
 * is responsible for consuming this reference
 *
 * The page must not be on any pageout queue.
 */

void
vm_pageout_cluster(vm_page_t m)
{
	vm_object_t	object = m->object;
	struct		vm_pageout_queue *q;


	XPR(XPR_VM_PAGEOUT,
	    "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
	    object, m->offset, m, 0, 0);

	VM_PAGE_CHECK(m);

	/*
	 * Only a certain kind of page is appreciated here.
	 */
	assert(m->busy && (m->dirty || m->precious) && (!VM_PAGE_WIRED(m)));
	assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
	assert(!m->throttled);

	/*
	 * protect the object from collapse -
	 * locking in the object's paging_offset.
	 */
	vm_object_paging_begin(object);

	/*
	 * set the page for future call to vm_fault_list_request
	 * page should already be marked busy
	 */
	vm_page_wire(m);
	m->list_req_pending = TRUE;
	m->cleaning = TRUE;
	m->pageout = TRUE;

	if (object->internal == TRUE)
		q = &vm_pageout_queue_internal;
	else
		q = &vm_pageout_queue_external;

	/*
	 * pgo_laundry count is tied to the laundry bit
	 */
	m->laundry = TRUE;
	q->pgo_laundry++;

	m->pageout_queue = TRUE;
	queue_enter(&q->pgo_pending, m, vm_page_t, pageq);

	if (q->pgo_idle == TRUE) {
		q->pgo_idle = FALSE;
		thread_wakeup((event_t) &q->pgo_pending);
	}

	VM_PAGE_CHECK(m);
}


unsigned long vm_pageout_throttle_up_count = 0;

/*
 * A page is back from laundry or we are stealing it back from
 * the laundering state.  See if there are some pages waiting to
 * go to laundry and if we can let some of them go now.
 *
 * Object and page queues must be locked.
 */
void
vm_pageout_throttle_up(
	vm_page_t	m)
{
	struct vm_pageout_queue *q;

	assert(m->object != VM_OBJECT_NULL);
	assert(m->object != kernel_object);

	vm_pageout_throttle_up_count++;

	if (m->object->internal == TRUE)
		q = &vm_pageout_queue_internal;
	else
		q = &vm_pageout_queue_external;

	if (m->pageout_queue == TRUE) {

		queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
		m->pageout_queue = FALSE;

		m->pageq.next = NULL;
		m->pageq.prev = NULL;

		vm_object_paging_end(m->object);
	}
	if (m->laundry == TRUE) {
		m->laundry = FALSE;
		q->pgo_laundry--;

		if (q->pgo_throttled == TRUE) {
			q->pgo_throttled = FALSE;
			thread_wakeup((event_t) &q->pgo_laundry);
		}
		if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
			q->pgo_draining = FALSE;
			thread_wakeup((event_t) (&q->pgo_laundry+1));
		}
	}
}


/*
 *	vm_pageout_scan does the dirty work for the pageout daemon.
 *	It returns with vm_page_queue_free_lock held and
 *	vm_page_free_wanted == 0.
 */

#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT  (3 * MAX_UPL_TRANSFER)

#define	FCS_IDLE		0
#define FCS_DELAYED		1
#define FCS_DEADLOCK_DETECTED	2

struct flow_control {
	int		state;
	mach_timespec_t	ts;
};
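
/*
 * Flow control state machine used by vm_pageout_scan() when the internal
 * (default pager) pageout queue is throttled: FCS_IDLE arms the deadlock
 * timer and moves to FCS_DELAYED; FCS_DELAYED moves to FCS_DEADLOCK_DETECTED
 * if the timer expires while the queue is still throttled; once the relief
 * target has been consumed, FCS_DEADLOCK_DETECTED re-arms the timer and
 * returns to FCS_DELAYED.  The state drops back to FCS_IDLE whenever the
 * scan resumes normally.
 */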

/*
 * VM memory pressure monitoring.
 *
 * vm_pageout_scan() keeps track of the number of pages it considers and
 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
 *
 * compute_memory_pressure() is called every second from compute_averages()
 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
 * of reclaimed pages in a new vm_pageout_stat[] bucket.
 *
 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
 * The caller provides the number of seconds ("nsecs") worth of statistics
 * it wants, up to 30 seconds.
 * It computes the number of pages reclaimed in the past "nsecs" seconds and
 * also returns the number of pages the system still needs to reclaim at this
 * moment in time.
 */
#define VM_PAGEOUT_STAT_SIZE	31
struct vm_pageout_stat {
	unsigned int considered;
	unsigned int reclaimed;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0}, };
unsigned int vm_pageout_stat_now = 0;
unsigned int vm_memory_pressure = 0;

#define VM_PAGEOUT_STAT_BEFORE(i) \
	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
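
/*
 * vm_pageout_stats[] is used as a ring buffer: VM_PAGEOUT_STAT_BEFORE(0)
 * wraps to 30 and VM_PAGEOUT_STAT_AFTER(30) wraps to 0, so the 31 buckets
 * hold the last 30 one-second samples plus the bucket currently being filled.
 */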

/*
 * Called from compute_averages().
 */
void
compute_memory_pressure(
	__unused void *arg)
{
	unsigned int vm_pageout_next;

	vm_memory_pressure =
		vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed;

	commpage_set_memory_pressure( vm_memory_pressure );

	/* move "now" forward */
	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
	vm_pageout_stats[vm_pageout_next].considered = 0;
	vm_pageout_stats[vm_pageout_next].reclaimed = 0;
	vm_pageout_stat_now = vm_pageout_next;
}

unsigned int
mach_vm_ctl_page_free_wanted(void)
{
	unsigned int page_free_target, page_free_count, page_free_wanted;

	page_free_target = vm_page_free_target;
	page_free_count = vm_page_free_count;
	if (page_free_target > page_free_count) {
		page_free_wanted = page_free_target - page_free_count;
	} else {
		page_free_wanted = 0;
	}

	return page_free_wanted;
}

kern_return_t
mach_vm_pressure_monitor(
	boolean_t	wait_for_pressure,
	unsigned int	nsecs_monitored,
	unsigned int	*pages_reclaimed_p,
	unsigned int	*pages_wanted_p)
{
	wait_result_t	wr;
	unsigned int	vm_pageout_then, vm_pageout_now;
	unsigned int	pages_reclaimed;

	/*
	 * We don't take the vm_page_queue_lock here because we don't want
	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
	 * thread when it's trying to reclaim memory.  We don't need fully
	 * accurate monitoring anyway...
	 */

	if (wait_for_pressure) {
		/* wait until there's memory pressure */
		while (vm_page_free_count >= vm_page_free_target) {
			wr = assert_wait((event_t) &vm_page_free_wanted,
					 THREAD_INTERRUPTIBLE);
			if (wr == THREAD_WAITING) {
				wr = thread_block(THREAD_CONTINUE_NULL);
			}
			if (wr == THREAD_INTERRUPTED) {
				return KERN_ABORTED;
			}
			if (wr == THREAD_AWAKENED) {
				/*
				 * The memory pressure might have already
				 * been relieved but let's not block again
				 * and let's report that there was memory
				 * pressure at some point.
				 */
				break;
			}
		}
	}

	/* provide the number of pages the system wants to reclaim */
	if (pages_wanted_p != NULL) {
		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
	}

	if (pages_reclaimed_p == NULL) {
		return KERN_SUCCESS;
	}

	/* provide number of pages reclaimed in the last "nsecs_monitored" */
	do {
		vm_pageout_now = vm_pageout_stat_now;
		pages_reclaimed = 0;
		for (vm_pageout_then =
			     VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
		     vm_pageout_then != vm_pageout_now &&
			     nsecs_monitored-- != 0;
		     vm_pageout_then =
			     VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
			pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed;
		}
	} while (vm_pageout_now != vm_pageout_stat_now);
	*pages_reclaimed_p = pages_reclaimed;

	return KERN_SUCCESS;
}

/* Page States: Used below to maintain the page state
   before it's removed from its Q. This saved state
   helps us do the right accounting in certain cases
*/

#define PAGE_STATE_SPECULATIVE	1
#define PAGE_STATE_THROTTLED	2
#define PAGE_STATE_ZEROFILL	3
#define PAGE_STATE_INACTIVE	4

#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m)				\
	MACRO_BEGIN							\
	/*								\
	 * If a "reusable" page somehow made it back into		\
	 * the active queue, it's been re-used and is not		\
	 * quite re-usable.						\
	 * If the VM object was "all_reusable", consider it		\
	 * as "all re-used" instead of converting it to			\
	 * "partially re-used", which could be expensive.		\
	 */								\
	if ((m)->reusable ||						\
	    (m)->object->all_reusable) {				\
		vm_object_reuse_pages((m)->object,			\
				      (m)->offset,			\
				      (m)->offset + PAGE_SIZE_64,	\
				      FALSE);				\
	}								\
	MACRO_END

void
vm_pageout_scan(void)
{
	unsigned int loop_count = 0;
	unsigned int inactive_burst_count = 0;
	unsigned int active_burst_count = 0;
	unsigned int reactivated_this_call;
	unsigned int reactivate_limit;
	vm_page_t   local_freeq = NULL;
	int         local_freed = 0;
	int         delayed_unlock;
	int         refmod_state = 0;
	int	vm_pageout_deadlock_target = 0;
	struct	vm_pageout_queue *iq;
	struct	vm_pageout_queue *eq;
	struct	vm_speculative_age_q *sq;
	struct	flow_control	flow_control = { 0, { 0, 0 } };
	boolean_t inactive_throttled = FALSE;
	boolean_t try_failed;
	mach_timespec_t	ts;
	unsigned int msecs = 0;
	vm_object_t	object;
	vm_object_t	last_object_tried;
#if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
	unsigned int	zf_ratio;
	unsigned int	zf_run_count;
#else
	uint64_t	zf_ratio;
	uint64_t	zf_run_count;
#endif
	uint32_t	catch_up_count = 0;
	uint32_t	inactive_reclaim_run;
	boolean_t	forced_reclaim;
	int		page_prev_state = 0;

	flow_control.state = FCS_IDLE;
	iq = &vm_pageout_queue_internal;
	eq = &vm_pageout_queue_external;
	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];


	XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);


	vm_page_lock_queues();
	delayed_unlock = 1;	/* must be nonzero if Qs are locked, 0 if unlocked */

	/*
	 *	Calculate the max number of referenced pages on the inactive
	 *	queue that we will reactivate.
	 */
	reactivated_this_call = 0;
	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
						    vm_page_inactive_count);
	inactive_reclaim_run = 0;


/*???*/	/*
	 *	We want to gradually dribble pages from the active queue
	 *	to the inactive queue.  If we let the inactive queue get
	 *	very small, and then suddenly dump many pages into it,
	 *	those pages won't get a sufficient chance to be referenced
	 *	before we start taking them from the inactive queue.
	 *
	 *	We must limit the rate at which we send pages to the pagers.
	 *	data_write messages consume memory, for message buffers and
	 *	for map-copy objects.  If we get too far ahead of the pagers,
	 *	we can potentially run out of memory.
	 *
	 *	We can use the laundry count to limit directly the number
	 *	of pages outstanding to the default pager.  A similar
	 *	strategy for external pagers doesn't work, because
	 *	external pagers don't have to deallocate the pages sent them,
	 *	and because we might have to send pages to external pagers
	 *	even if they aren't processing writes.  So we also
	 *	use a burst count to limit writes to external pagers.
	 *
	 *	When memory is very tight, we can't rely on external pagers to
	 *	clean pages.  They probably aren't running, because they
	 *	aren't vm-privileged.  If we kept sending dirty pages to them,
	 *	we could exhaust the free list.
	 */


Restart:
	assert(delayed_unlock!=0);

	/*
	 *	A page is "zero-filled" if it was not paged in from somewhere,
	 *	and it belongs to an object at least VM_ZF_OBJECT_SIZE_THRESHOLD big.
	 *	Recalculate the zero-filled page ratio.  We use this to apportion
	 *	victimized pages between the normal and zero-filled inactive
	 *	queues according to their relative abundance in memory.  Thus if a task
	 *	is flooding memory with zf pages, we begin to hunt them down.
	 *	It would be better to throttle greedy tasks at a higher level,
	 *	but at the moment mach vm cannot do this.
	 */
	{
#if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
		uint32_t  total  = vm_page_active_count + vm_page_inactive_count;
		uint32_t  normal = total - vm_zf_count;
#else
		uint64_t  total  = vm_page_active_count + vm_page_inactive_count;
		uint64_t  normal = total - vm_zf_count;
#endif

		/* zf_ratio is the number of zf pages we victimize per normal page */

		if (vm_zf_count < vm_accellerate_zf_pageout_trigger)
			zf_ratio = 0;
		else if ((vm_zf_count <= normal) || (normal == 0))
			zf_ratio = 1;
		else
			zf_ratio = vm_zf_count / normal;

		zf_run_count = 0;
	}
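
	/*
	 * Example: with 300,000 zero-fill pages and 100,000 normal pages
	 * resident, zf_ratio is 3, so up to three zero-fill pages may be
	 * victimized for each normal inactive page taken below.
	 */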

	/*
	 *	Recalculate vm_page_inactive_target.
	 */
	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
							  vm_page_inactive_count +
							  vm_page_speculative_count);
	/*
	 * don't want to wake the pageout_scan thread up every time we fall below
	 * the targets... set a low water mark at 0.25% below the target
	 */
	vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);

	vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
								vm_page_inactive_count);
	object = NULL;
	last_object_tried = NULL;
	try_failed = FALSE;

	if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
		catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
	else
		catch_up_count = 0;

	for (;;) {
		vm_page_t m;

		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);

		if (delayed_unlock == 0) {
			vm_page_lock_queues();
			delayed_unlock = 1;
		}

		/*
		 *	Don't sweep through active queue more than the throttle
		 *	which should be kept relatively low
		 */
		active_burst_count = MIN(vm_pageout_burst_active_throttle,
					 vm_page_active_count);

		/*
		 *	Move pages from active to inactive.
		 */
		if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
			goto done_moving_active_pages;

		while (!queue_empty(&vm_page_queue_active) && active_burst_count) {

			if (active_burst_count)
				active_burst_count--;

			vm_pageout_active++;

			m = (vm_page_t) queue_first(&vm_page_queue_active);

			assert(m->active && !m->inactive);
			assert(!m->laundry);
			assert(m->object != kernel_object);
			assert(m->phys_page != vm_page_guard_addr);

			DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);

			/*
			 * Try to lock object; since we've already got the
			 * page queues lock, we can only 'try' for this one.
			 * if the 'try' fails, we need to do a mutex_pause
			 * to allow the owner of the object lock a chance to
			 * run... otherwise, we're likely to trip over this
			 * object in the same state as we work our way through
			 * the queue... clumps of pages associated with the same
			 * object are fairly typical on the inactive and active queues
			 */
			if (m->object != object) {
				if (object != NULL) {
					vm_object_unlock(object);
					object = NULL;
					vm_pageout_scan_wants_object = VM_OBJECT_NULL;
				}
				if (!vm_object_lock_try_scan(m->object)) {
					/*
					 * move page to end of active queue and continue
					 */
					queue_remove(&vm_page_queue_active, m,
						     vm_page_t, pageq);
					queue_enter(&vm_page_queue_active, m,
						    vm_page_t, pageq);

					try_failed = TRUE;

					m = (vm_page_t) queue_first(&vm_page_queue_active);
					/*
					 * this is the next object we're going to be interested in
					 * try to make sure it's available after the mutex_yield
					 * returns control
					 */
					vm_pageout_scan_wants_object = m->object;

					goto done_with_activepage;
				}
				object = m->object;

				try_failed = FALSE;
			}

			/*
			 * if the page is BUSY, then we pull it
			 * off the active queue and leave it alone.
			 * when BUSY is cleared, it will get stuck
			 * back on the appropriate queue
			 */
			if (m->busy) {
				queue_remove(&vm_page_queue_active, m,
					     vm_page_t, pageq);
				m->pageq.next = NULL;
				m->pageq.prev = NULL;

				if (!m->fictitious)
					vm_page_active_count--;
				m->active = FALSE;

				goto done_with_activepage;
			}

			/* deal with a rogue "reusable" page */
			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m);

			/*
			 *	Deactivate the page while holding the object
			 *	locked, so we know the page is still not busy.
			 *	This should prevent races between pmap_enter
			 *	and pmap_clear_reference.  The page might be
			 *	absent or fictitious, but vm_page_deactivate
			 *	can handle that.
			 */
			vm_page_deactivate(m);

done_with_activepage:
			if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {

				if (object != NULL) {
					vm_pageout_scan_wants_object = VM_OBJECT_NULL;
					vm_object_unlock(object);
					object = NULL;
				}
				if (local_freeq) {
					vm_page_unlock_queues();
					vm_page_free_list(local_freeq, TRUE);

					local_freeq = NULL;
					local_freed = 0;
					vm_page_lock_queues();
				} else
					lck_mtx_yield(&vm_page_queue_lock);

				delayed_unlock = 1;

				/*
				 * continue the while loop processing
				 * the active queue... need to hold
				 * the page queues lock
				 */
			}
		}



		/**********************************************************************
		 * above this point we're playing with the active queue
		 * below this point we're playing with the throttling mechanisms
		 * and the inactive queue
		 **********************************************************************/

done_moving_active_pages:

		/*
		 *	We are done if we have met our target *and*
		 *	nobody is still waiting for a page.
		 */
		if (vm_page_free_count + local_freed >= vm_page_free_target) {
			if (object != NULL) {
				vm_object_unlock(object);
				object = NULL;
			}
			vm_pageout_scan_wants_object = VM_OBJECT_NULL;

			if (local_freeq) {
				vm_page_unlock_queues();
				vm_page_free_list(local_freeq, TRUE);

				local_freeq = NULL;
				local_freed = 0;
				vm_page_lock_queues();
			}
			/*
			 * inactive target still not met... keep going
			 * until we get the queues balanced
			 */

			/*
			 *	Recalculate vm_page_inactive_target.
			 */
			vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
									  vm_page_inactive_count +
									  vm_page_speculative_count);

#ifndef	CONFIG_EMBEDDED
			/*
			 * XXX: if no active pages can be reclaimed, pageout scan can be stuck trying
			 *      to balance the queues
			 */
			if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
			    !queue_empty(&vm_page_queue_active))
				continue;
#endif

			lck_mtx_lock(&vm_page_queue_free_lock);

			if ((vm_page_free_count >= vm_page_free_target) &&
			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {

				vm_page_unlock_queues();

				thread_wakeup((event_t) &vm_pageout_garbage_collect);

				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);

				return;
			}
			lck_mtx_unlock(&vm_page_queue_free_lock);
		}
b0d623f7 1416
2d21ac55 1417 /*
b0d623f7
A
1418 * Before anything, we check if we have any ripe volatile
1419 * objects around. If so, try to purge the first object.
1420 * If the purge fails, fall through to reclaim a page instead.
1421 * If the purge succeeds, go back to the top and reevalute
1422 * the new memory situation.
2d21ac55
A
1423 */
1424 assert (available_for_purge>=0);
1425 if (available_for_purge)
1426 {
1427 if (object != NULL) {
1428 vm_object_unlock(object);
1429 object = NULL;
1430 }
b0d623f7
A
1431 if(TRUE == vm_purgeable_object_purge_one()) {
1432 continue;
1433 }
2d21ac55
A
1434 }
1435
1436 if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1437 /*
1438 * try to pull pages from the aging bins
1439 * see vm_page.h for an explanation of how
1440 * this mechanism works
1441 */
1442 struct vm_speculative_age_q *aq;
1443 mach_timespec_t ts_fully_aged;
1444 boolean_t can_steal = FALSE;
b0d623f7 1445 int num_scanned_queues;
2d21ac55
A
1446
1447 aq = &vm_page_queue_speculative[speculative_steal_index];
1448
b0d623f7
A
1449 num_scanned_queues = 0;
1450 while (queue_empty(&aq->age_q) &&
1451 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2d21ac55
A
1452
1453 speculative_steal_index++;
1454
1455 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1456 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1457
1458 aq = &vm_page_queue_speculative[speculative_steal_index];
1459 }
b0d623f7
A
1460
1461 if (num_scanned_queues ==
1462 VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
1463 /*
1464 * XXX We've scanned all the speculative
1465 * queues but still haven't found one
1466 * that is not empty, even though
1467 * vm_page_speculative_count is not 0.
1468 */
1469 /* report the anomaly... */
1470 printf("vm_pageout_scan: "
1471 "all speculative queues empty "
1472 "but count=%d. Re-adjusting.\n",
1473 vm_page_speculative_count);
1474 if (vm_page_speculative_count >
1475 vm_page_speculative_count_drift_max)
1476 vm_page_speculative_count_drift_max = vm_page_speculative_count;
1477 vm_page_speculative_count_drifts++;
1478#if 6553678
1479 Debugger("vm_pageout_scan: no speculative pages");
1480#endif
1481 /* readjust... */
1482 vm_page_speculative_count = 0;
1483 /* ... and continue */
1484 continue;
1485 }
1486
2d21ac55
A
1487 if (vm_page_speculative_count > vm_page_speculative_target)
1488 can_steal = TRUE;
1489 else {
1490 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) / 1000;
1491 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) % 1000)
1492 * 1000 * NSEC_PER_USEC;
1493
1494 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
55e303ae 1495
b0d623f7
A
1496 clock_sec_t sec;
1497 clock_nsec_t nsec;
1498 clock_get_system_nanotime(&sec, &nsec);
1499 ts.tv_sec = (unsigned int) sec;
1500 ts.tv_nsec = nsec;
2d21ac55
A
1501
1502 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1503 can_steal = TRUE;
1504 }
1505 if (can_steal == TRUE)
1506 vm_page_speculate_ageit(aq);
1507 }
91447636 1508
1c79356b
A
1509 /*
1510 * Sometimes we have to pause:
1511 * 1) No inactive pages - nothing to do.
91447636
A
1512 * 2) Flow control - default pageout queue is full
1513 * 3) Loop control - no acceptable pages found on the inactive queue
1514 * within the last vm_pageout_burst_inactive_throttle iterations
1c79356b 1515 */
2d21ac55
A
1516 if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q) &&
1517 (VM_PAGE_Q_THROTTLED(iq) || queue_empty(&vm_page_queue_throttled))) {
91447636
A
1518 vm_pageout_scan_empty_throttle++;
1519 msecs = vm_pageout_empty_wait;
1520 goto vm_pageout_scan_delay;
1521
b0d623f7 1522 } else if (inactive_burst_count >=
593a1d5f
A
1523 MIN(vm_pageout_burst_inactive_throttle,
1524 (vm_page_inactive_count +
1525 vm_page_speculative_count))) {
91447636
A
1526 vm_pageout_scan_burst_throttle++;
1527 msecs = vm_pageout_burst_wait;
1528 goto vm_pageout_scan_delay;
1529
2d21ac55 1530 } else if (VM_PAGE_Q_THROTTLED(iq) && IP_VALID(memory_manager_default)) {
b0d623f7
A
1531 clock_sec_t sec;
1532 clock_nsec_t nsec;
91447636
A
1533
1534 switch (flow_control.state) {
1535
1536 case FCS_IDLE:
1537reset_deadlock_timer:
1538 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1539 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
b0d623f7
A
1540 clock_get_system_nanotime(&sec, &nsec);
1541 flow_control.ts.tv_sec = (unsigned int) sec;
1542 flow_control.ts.tv_nsec = nsec;
91447636
A
1543 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1544
1545 flow_control.state = FCS_DELAYED;
1546 msecs = vm_pageout_deadlock_wait;
1c79356b 1547
91447636
A
1548 break;
1549
1550 case FCS_DELAYED:
b0d623f7
A
1551 clock_get_system_nanotime(&sec, &nsec);
1552 ts.tv_sec = (unsigned int) sec;
1553 ts.tv_nsec = nsec;
91447636
A
1554
1555 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1556 /*
1557 * the pageout thread for the default pager is potentially
1558 * deadlocked since the
1559 * default pager queue has been throttled for more than the
1560 * allowable time... we need to move some clean pages or dirty
1561 * pages belonging to the external pagers if they aren't throttled
1562 * vm_page_free_wanted represents the number of threads currently
1563 * blocked waiting for pages... we'll move one page for each of
1564 * these plus a fixed amount to break the logjam... once we're done
1565 * moving this number of pages, we'll re-enter the FSC_DELAYED state
1566 * with a new timeout target since we have no way of knowing
1567 * whether we've broken the deadlock except through observation
1568 * of the queue associated with the default pager... we need to
2d21ac55 1569 * stop moving pages and allow the system to run to see what
91447636
A
1570 * state it settles into.
1571 */
2d21ac55 1572 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
91447636
A
1573 vm_pageout_scan_deadlock_detected++;
1574 flow_control.state = FCS_DEADLOCK_DETECTED;
55e303ae 1575
91447636
A
1576 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1577 goto consider_inactive;
1578 }
1579 /*
1580 * just resniff instead of trying
1581 * to compute a new delay time... we're going to be
1582 * awakened immediately upon a laundry completion,
1583 * so we won't wait any longer than necessary
1584 */
1585 msecs = vm_pageout_idle_wait;
1586 break;
1c79356b 1587
91447636
A
1588 case FCS_DEADLOCK_DETECTED:
1589 if (vm_pageout_deadlock_target)
1590 goto consider_inactive;
1591 goto reset_deadlock_timer;
55e303ae 1592
91447636
A
1593 }
1594 vm_pageout_scan_throttle++;
1595 iq->pgo_throttled = TRUE;
1596vm_pageout_scan_delay:
1597 if (object != NULL) {
1598 vm_object_unlock(object);
1599 object = NULL;
1600 }
2d21ac55
A
1601 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1602
55e303ae 1603 if (local_freeq) {
b0d623f7
A
1604 vm_page_unlock_queues();
1605 vm_page_free_list(local_freeq, TRUE);
55e303ae 1606
2d21ac55 1607 local_freeq = NULL;
55e303ae 1608 local_freed = 0;
b0d623f7
A
1609 vm_page_lock_queues();
1610
1611 if (flow_control.state == FCS_DELAYED &&
1612 !VM_PAGE_Q_THROTTLED(iq)) {
1613 flow_control.state = FCS_IDLE;
1614 vm_pageout_scan_throttle_aborted++;
1615 goto consider_inactive;
1616 }
55e303ae 1617 }
2d21ac55
A
1618#if CONFIG_EMBEDDED
1619 {
1620 int percent_avail;
0b4e3aa0 1621
2d21ac55
A
1622 /*
1623 * Decide if we need to send a memory status notification.
1624 */
1625 percent_avail =
1626 (vm_page_active_count + vm_page_inactive_count +
1627 vm_page_speculative_count + vm_page_free_count +
cf7d32b8 1628 (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 /
2d21ac55
A
1629 atop_64(max_mem);
1630 if (percent_avail >= (kern_memorystatus_level + 5) ||
1631 percent_avail <= (kern_memorystatus_level - 5)) {
1632 kern_memorystatus_level = percent_avail;
1633 thread_wakeup((event_t)&kern_memorystatus_wakeup);
1634 }
1635 }
1636#endif
1637 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
2d21ac55 1638 counter(c_vm_pageout_scan_block++);
1c79356b 1639
91447636 1640 vm_page_unlock_queues();
2d21ac55
A
1641
1642 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
b0d623f7 1643
91447636
A
1644 thread_block(THREAD_CONTINUE_NULL);
1645
1646 vm_page_lock_queues();
1647 delayed_unlock = 1;
1648
1649 iq->pgo_throttled = FALSE;
0b4e3aa0 1650
2d21ac55 1651 if (loop_count >= vm_page_inactive_count)
55e303ae 1652 loop_count = 0;
91447636
A
1653 inactive_burst_count = 0;
1654
1c79356b
A
1655 goto Restart;
1656 /*NOTREACHED*/
1657 }
1658
91447636
A
1659
1660 flow_control.state = FCS_IDLE;
1661consider_inactive:
1662 loop_count++;
1663 inactive_burst_count++;
1c79356b 1664 vm_pageout_inactive++;
9bccf70c 1665
2d21ac55
A
1666 /* Choose a victim. */
1667
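/*
 * Victim selection order: aged speculative pages first, then a
 * zero-fill page when the zf_ratio interleave allows it (or when
 * the regular inactive queue is empty), and otherwise the head of
 * the regular inactive queue.
 */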
1668 while (1) {
1669 m = NULL;
91447636 1670
b0d623f7
A
1671 if (IP_VALID(memory_manager_default)) {
1672 assert(vm_page_throttled_count == 0);
1673 assert(queue_empty(&vm_page_queue_throttled));
91447636 1674 }
2d21ac55
A
1675
1676 /*
b0d623f7 1677 * The most eligible pages are ones we paged in speculatively,
2d21ac55
A
1678 * but which have not yet been touched.
1679 */
1680 if ( !queue_empty(&sq->age_q) ) {
1681 m = (vm_page_t) queue_first(&sq->age_q);
1682 break;
9bccf70c 1683 }
2d21ac55
A
1684 /*
1685 * Time for a zero-filled inactive page?
1686 */
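/*
 * zf_run_count counts how many zero-fill pages have been picked in
 * a row; a zero-fill victim is only chosen while that count is below
 * zf_ratio and the zero-fill queue holds at least zf_queue_min_count
 * pages (or when the regular inactive queue is empty).  Picking from
 * the regular inactive queue below resets the count, so the two
 * queues are interleaved rather than one being drained ahead of the
 * other.
 */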
1687 if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1688 queue_empty(&vm_page_queue_inactive)) {
1689 if ( !queue_empty(&vm_page_queue_zf) ) {
1690 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1691 zf_run_count++;
1692 break;
1693 }
1694 }
1695 /*
1696 * It's either a normal inactive page or nothing.
1697 */
1698 if ( !queue_empty(&vm_page_queue_inactive) ) {
1699 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1700 zf_run_count = 0;
1701 break;
1702 }
1703
1704 panic("vm_pageout: no victim");
9bccf70c 1705 }
2d21ac55
A
1706
1707 assert(!m->active && (m->inactive || m->speculative || m->throttled));
91447636
A
1708 assert(!m->laundry);
1709 assert(m->object != kernel_object);
2d21ac55
A
1710 assert(m->phys_page != vm_page_guard_addr);
1711
b0d623f7
A
1712 if (!m->speculative) {
1713 vm_pageout_stats[vm_pageout_stat_now].considered++;
1714 }
1715
2d21ac55 1716 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1c79356b 1717
91447636 1718 /*
2d21ac55
A
1719 * check to see if we currently are working
1720 * with the same object... if so, we've
1721 * already got the lock
91447636
A
1722 */
1723 if (m->object != object) {
2d21ac55
A
1724 /*
1725 * the object associated with candidate page is
1726 * different from the one we were just working
1727 * with... dump the lock if we still own it
1728 */
91447636
A
1729 if (object != NULL) {
1730 vm_object_unlock(object);
1731 object = NULL;
2d21ac55 1732 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
91447636 1733 }
2d21ac55
A
1734 /*
1735 * Try to lock object; since we've already got the
1736 * page queues lock, we can only 'try' for this one.
1737 * if the 'try' fails, we need to do a mutex_pause
1738 * to allow the owner of the object lock a chance to
1739 * run... otherwise, we're likely to trip over this
1740 * object in the same state as we work our way through
1741 * the queue... clumps of pages associated with the same
1742 * object are fairly typical on the inactive and active queues
1743 */
1744 if (!vm_object_lock_try_scan(m->object)) {
b0d623f7
A
1745 vm_pageout_inactive_nolock++;
1746
1747 requeue_page:
91447636
A
1748 /*
1749 * Move page to end and continue.
1750 * Don't re-issue ticket
1751 */
1752 if (m->zero_fill) {
b0d623f7
A
1753 if (m->speculative) {
1754 panic("vm_pageout_scan(): page %p speculative and zero-fill !?\n", m);
1755 }
1756 assert(!m->speculative);
91447636
A
1757 queue_remove(&vm_page_queue_zf, m,
1758 vm_page_t, pageq);
1759 queue_enter(&vm_page_queue_zf, m,
1760 vm_page_t, pageq);
2d21ac55
A
1761 } else if (m->speculative) {
1762 remque(&m->pageq);
1763 m->speculative = FALSE;
1764 vm_page_speculative_count--;
1765
1766 /*
b0d623f7 1767 * move to the head of the inactive queue
2d21ac55
A
1768 * to get it out of the way... the speculative
1769 * queue is generally too small to depend
1770 * on there being enough pages from other
1771 * objects to make cycling it back on the
1772 * same queue a winning proposition
1773 */
b0d623f7
A
1774 queue_enter_first(&vm_page_queue_inactive, m,
1775 vm_page_t, pageq);
2d21ac55
A
1776 m->inactive = TRUE;
1777 vm_page_inactive_count++;
1778 token_new_pagecount++;
1779 } else if (m->throttled) {
1780 queue_remove(&vm_page_queue_throttled, m,
1781 vm_page_t, pageq);
1782 m->throttled = FALSE;
1783 vm_page_throttled_count--;
cf7d32b8 1784
2d21ac55
A
1785 /*
1786 * not throttled any more, so can stick
1787 * it on the inactive queue.
1788 */
1789 queue_enter(&vm_page_queue_inactive, m,
1790 vm_page_t, pageq);
1791 m->inactive = TRUE;
1792 vm_page_inactive_count++;
1793 token_new_pagecount++;
91447636
A
1794 } else {
1795 queue_remove(&vm_page_queue_inactive, m,
1796 vm_page_t, pageq);
2d21ac55
A
1797#if MACH_ASSERT
1798 vm_page_inactive_count--; /* balance for purgeable queue asserts */
1799#endif
cf7d32b8 1800 vm_purgeable_q_advance_all();
2d21ac55 1801
91447636
A
1802 queue_enter(&vm_page_queue_inactive, m,
1803 vm_page_t, pageq);
2d21ac55
A
1804#if MACH_ASSERT
1805 vm_page_inactive_count++; /* balance for purgeable queue asserts */
1806#endif
1807 token_new_pagecount++;
55e303ae 1808 }
2d21ac55
A
1809 pmap_clear_reference(m->phys_page);
1810 m->reference = FALSE;
1811
2d21ac55
A
1812 if ( !queue_empty(&sq->age_q) )
1813 m = (vm_page_t) queue_first(&sq->age_q);
1814 else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1815 queue_empty(&vm_page_queue_inactive)) {
1816 if ( !queue_empty(&vm_page_queue_zf) )
1817 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1818 } else if ( !queue_empty(&vm_page_queue_inactive) ) {
1819 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1820 }
1821 /*
1822 * this is the next object we're going to be interested in
1823 * try to make sure it's available after the mutex_yield
1824 * returns control
1825 */
1826 vm_pageout_scan_wants_object = m->object;
1827
91447636
A
1828 /*
1829 * force us to dump any collected free pages
1830 * and to pause before moving on
1831 */
2d21ac55 1832 try_failed = TRUE;
55e303ae 1833
91447636 1834 goto done_with_inactivepage;
1c79356b 1835 }
91447636 1836 object = m->object;
2d21ac55 1837 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
0b4e3aa0 1838
2d21ac55 1839 try_failed = FALSE;
1c79356b
A
1840 }
1841
1c79356b 1842 /*
55e303ae
A
1843 * Paging out pages of external objects which
1844 * are currently being created must be avoided.
1845 * The pager may claim memory, thus leading to a
1846 * possible deadlock between it and the pageout thread,
1847 * if such pages are finally chosen. The remaining assumption
1848 * is that there will finally be enough available pages in the
1849 * inactive pool to page out in order to satisfy all memory
1850 * claimed by the thread which concurrently creates the pager.
1c79356b 1851 */
1c79356b
A
1852 if (!object->pager_initialized && object->pager_created) {
1853 /*
1854 * Move page to end and continue, hoping that
1855 * there will be enough other inactive pages to
1856 * page out so that the thread which currently
1857 * initializes the pager will succeed.
0b4e3aa0
A
1858 * Don't re-grant the ticket, the page should
1859 * be pulled from the queue and paged out whenever
1860 * one of its logically adjacent fellows is
1861 * targeted.
1c79356b 1862 */
1c79356b 1863 vm_pageout_inactive_avoid++;
b0d623f7 1864 goto requeue_page;
91447636 1865 }
1c79356b 1866 /*
2d21ac55 1867 * Remove the page from its list.
1c79356b 1868 */
2d21ac55
A
1869 if (m->speculative) {
1870 remque(&m->pageq);
b0d623f7 1871 page_prev_state = PAGE_STATE_SPECULATIVE;
2d21ac55
A
1872 m->speculative = FALSE;
1873 vm_page_speculative_count--;
1874 } else if (m->throttled) {
1875 queue_remove(&vm_page_queue_throttled, m, vm_page_t, pageq);
b0d623f7 1876 page_prev_state = PAGE_STATE_THROTTLED;
2d21ac55
A
1877 m->throttled = FALSE;
1878 vm_page_throttled_count--;
9bccf70c 1879 } else {
2d21ac55
A
1880 if (m->zero_fill) {
1881 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
b0d623f7 1882 page_prev_state = PAGE_STATE_ZEROFILL;
2d21ac55
A
1883 vm_zf_queue_count--;
1884 } else {
b0d623f7 1885 page_prev_state = PAGE_STATE_INACTIVE;
2d21ac55
A
1886 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1887 }
1888 m->inactive = FALSE;
1889 if (!m->fictitious)
1890 vm_page_inactive_count--;
b0d623f7 1891 vm_purgeable_q_advance_all();
2d21ac55
A
1892 }
1893
91447636
A
1894 m->pageq.next = NULL;
1895 m->pageq.prev = NULL;
1c79356b 1896
2d21ac55
A
1897 if ( !m->fictitious && catch_up_count)
1898 catch_up_count--;
1899
1900 /*
1901 * ENCRYPTED SWAP:
1902 * if this page has already been picked up as part of a
1903 * page-out cluster, it will be busy because it is being
1904 * encrypted (see vm_object_upl_request()). But we still
1905 * want to demote it from "clean-in-place" (aka "adjacent")
1906 * to "clean-and-free" (aka "target"), so let's ignore its
1907 * "busy" bit here and proceed to check for "cleaning" a
1908 * little bit below...
1909 */
1910 if ( !m->encrypted_cleaning && (m->busy || !object->alive)) {
1c79356b
A
1911 /*
1912 * Somebody is already playing with this page.
1913 * Leave it off the pageout queues.
2d21ac55 1914 *
1c79356b 1915 */
1c79356b 1916 vm_pageout_inactive_busy++;
91447636
A
1917
1918 goto done_with_inactivepage;
1c79356b
A
1919 }
1920
1921 /*
1922 * If it's absent or in error, we can reclaim the page.
1923 */
1924
1925 if (m->absent || m->error) {
1926 vm_pageout_inactive_absent++;
91447636
A
1927reclaim_page:
1928 if (vm_pageout_deadlock_target) {
1929 vm_pageout_scan_inactive_throttle_success++;
1930 vm_pageout_deadlock_target--;
1931 }
2d21ac55
A
1932
1933 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
1934
b0d623f7 1935 if (object->internal) {
2d21ac55
A
1936 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
1937 } else {
1938 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
1939 }
b0d623f7 1940 vm_page_free_prepare_queues(m);
2d21ac55 1941
b0d623f7
A
1942 /*
1943 * remove page from object here since we're already
1944 * behind the object lock... defer the rest of the work
1945 * we'd normally do in vm_page_free_prepare_object
1946 * until 'vm_page_free_list' is called
1947 */
1948 if (m->tabled)
1949 vm_page_remove(m, TRUE);
55e303ae 1950
91447636
A
1951 assert(m->pageq.next == NULL &&
1952 m->pageq.prev == NULL);
55e303ae
A
1953 m->pageq.next = (queue_entry_t)local_freeq;
1954 local_freeq = m;
91447636 1955 local_freed++;
55e303ae 1956
91447636
A
1957 inactive_burst_count = 0;
1958
b0d623f7
A
1959 if(page_prev_state != PAGE_STATE_SPECULATIVE) {
1960 vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
1961 page_prev_state = 0;
1962 }
1963
91447636 1964 goto done_with_inactivepage;
1c79356b
A
1965 }
1966
1967 assert(!m->private);
1968 assert(!m->fictitious);
1969
1970 /*
1971 * If already cleaning this page in place, convert from
1972 * "adjacent" to "target". We can leave the page mapped,
1973 * and vm_pageout_object_terminate will determine whether
1974 * to free or reactivate.
1975 */
1976
1977 if (m->cleaning) {
0b4e3aa0
A
1978 m->busy = TRUE;
1979 m->pageout = TRUE;
1980 m->dump_cleaning = TRUE;
1981 vm_page_wire(m);
55e303ae 1982
91447636
A
1983 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1984
1985 inactive_burst_count = 0;
1986
1987 goto done_with_inactivepage;
1c79356b
A
1988 }
1989
b0d623f7
A
1990 /*
1991 * If the object is empty, the page must be reclaimed even
1992 * if dirty or used.
1993 * If the page belongs to a volatile object, we stick it back
1994 * on.
1995 */
1996 if (object->copy == VM_OBJECT_NULL) {
1997 if (object->purgable == VM_PURGABLE_EMPTY) {
1998 m->busy = TRUE;
1999 if (m->pmapped == TRUE) {
2000 /* unmap the page */
2001 refmod_state = pmap_disconnect(m->phys_page);
2002 if (refmod_state & VM_MEM_MODIFIED) {
2003 m->dirty = TRUE;
2004 }
2005 }
2006 if (m->dirty || m->precious) {
2007 /* we saved the cost of cleaning this page ! */
2008 vm_page_purged_count++;
2009 }
2010 goto reclaim_page;
2011 }
2012 if (object->purgable == VM_PURGABLE_VOLATILE) {
2013 /* if it's wired, we can't put it on our queue */
2014 assert(!VM_PAGE_WIRED(m));
2015 /* just stick it back on! */
2016 goto reactivate_page;
2017 }
2018 }
2019
1c79356b
A
2020 /*
2021 * If it's being used, reactivate.
2022 * (Fictitious pages are either busy or absent.)
2d21ac55
A
2023 * First, update the reference and dirty bits
2024 * to make sure the page is unreferenced.
1c79356b 2025 */
2d21ac55
A
2026 refmod_state = -1;
2027
2028 if (m->reference == FALSE && m->pmapped == TRUE) {
91447636
A
2029 refmod_state = pmap_get_refmod(m->phys_page);
2030
2031 if (refmod_state & VM_MEM_REFERENCED)
2032 m->reference = TRUE;
2033 if (refmod_state & VM_MEM_MODIFIED)
2034 m->dirty = TRUE;
2035 }
b0d623f7
A
2036
2037 if (m->reference || m->dirty) {
2038 /* deal with a rogue "reusable" page */
2039 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m);
2040 }
2041
2d21ac55
A
2042 if (m->reference && !m->no_cache) {
2043 /*
2044 * The page we pulled off the inactive list has
2045 * been referenced. It is possible for other
2046 * processors to be touching pages faster than we
2047 * can clear the referenced bit and traverse the
2048 * inactive queue, so we limit the number of
2049 * reactivations.
2050 */
2051 if (++reactivated_this_call >= reactivate_limit) {
2052 vm_pageout_reactivation_limit_exceeded++;
2053 } else if (catch_up_count) {
2054 vm_pageout_catch_ups++;
2055 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
2056 vm_pageout_inactive_force_reclaim++;
2057 } else {
b0d623f7 2058 uint32_t isinuse;
2d21ac55 2059reactivate_page:
b0d623f7
A
2060 if ( !object->internal && object->pager != MEMORY_OBJECT_NULL &&
2061 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
2062 /*
2063 * no explicit mappings of this object exist
2064 * and it's not open via the filesystem
2065 */
2066 vm_page_deactivate(m);
2067 vm_pageout_inactive_deactivated++;
2068 } else {
2069 /*
2070 * The page was/is being used, so put back on active list.
2071 */
2072 vm_page_activate(m);
2073 VM_STAT_INCR(reactivations);
2074 }
2d21ac55
A
2075 vm_pageout_inactive_used++;
2076 inactive_burst_count = 0;
55e303ae 2077
2d21ac55
A
2078 goto done_with_inactivepage;
2079 }
2080 /*
2081 * Make sure we call pmap_get_refmod() if it
2082 * wasn't already called just above, to update
2083 * the dirty bit.
2084 */
2085 if ((refmod_state == -1) && !m->dirty && m->pmapped) {
2086 refmod_state = pmap_get_refmod(m->phys_page);
2087 if (refmod_state & VM_MEM_MODIFIED)
2088 m->dirty = TRUE;
2089 }
2090 forced_reclaim = TRUE;
2091 } else {
2092 forced_reclaim = FALSE;
1c79356b
A
2093 }
2094
91447636
A
2095 XPR(XPR_VM_PAGEOUT,
2096 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
b0d623f7 2097 object, m->offset, m, 0,0);
0b4e3aa0 2098
91447636
A
2099 /*
2100 * we've got a candidate page to steal...
2101 *
2102 * m->dirty is up to date courtesy of the
2103 * preceding check for m->reference... if
2104 * we get here, then m->reference had to be
2d21ac55
A
2105 * FALSE (or possibly "reactivate_limit" was
2106 * exceeded), but in either case we called
2107 * pmap_get_refmod() and updated both
2108 * m->reference and m->dirty
91447636
A
2109 *
2110 * if it's dirty or precious we need to
2111 * see if the target queue is throttled...
2112 * if it is, we need to skip over it by moving it back
2113 * to the end of the inactive queue
2114 */
b0d623f7 2115
91447636
A
2116 inactive_throttled = FALSE;
2117
2118 if (m->dirty || m->precious) {
2119 if (object->internal) {
2d21ac55 2120 if (VM_PAGE_Q_THROTTLED(iq))
91447636
A
2121 inactive_throttled = TRUE;
2122 } else if (VM_PAGE_Q_THROTTLED(eq)) {
2d21ac55 2123 inactive_throttled = TRUE;
1c79356b 2124 }
91447636
A
2125 }
2126 if (inactive_throttled == TRUE) {
2d21ac55
A
2127throttle_inactive:
2128 if (!IP_VALID(memory_manager_default) &&
d1ecb069
A
2129 object->internal && m->dirty &&
2130 (object->purgable == VM_PURGABLE_DENY ||
2131 object->purgable == VM_PURGABLE_NONVOLATILE ||
2132 object->purgable == VM_PURGABLE_VOLATILE)) {
2d21ac55 2133 queue_enter(&vm_page_queue_throttled, m,
91447636 2134 vm_page_t, pageq);
2d21ac55
A
2135 m->throttled = TRUE;
2136 vm_page_throttled_count++;
91447636 2137 } else {
2d21ac55
A
2138 if (m->zero_fill) {
2139 queue_enter(&vm_page_queue_zf, m,
2140 vm_page_t, pageq);
2141 vm_zf_queue_count++;
2142 } else
2143 queue_enter(&vm_page_queue_inactive, m,
2144 vm_page_t, pageq);
2145 m->inactive = TRUE;
2146 if (!m->fictitious) {
2147 vm_page_inactive_count++;
2148 token_new_pagecount++;
2149 }
1c79356b 2150 }
91447636 2151 vm_pageout_scan_inactive_throttled++;
91447636 2152 goto done_with_inactivepage;
1c79356b 2153 }
2d21ac55 2154
1c79356b 2155 /*
91447636
A
2156 * we've got a page that we can steal...
2157 * eliminate all mappings and make sure
2158 * we have the up-to-date modified state
2159 * first take the page BUSY, so that no new
2160 * mappings can be made
1c79356b 2161 */
1c79356b 2162 m->busy = TRUE;
55e303ae 2163
91447636
A
2164 /*
2165 * if we need to do a pmap_disconnect then we
2166 * need to re-evaluate m->dirty since the pmap_disconnect
2167 * provides the true state atomically... the
2168 * page was still mapped up to the pmap_disconnect
2169 * and may have been dirtied at the last microsecond
2170 *
2171 * we also check for the page being referenced 'late'
2172 * if it was, we first need to do a WAKEUP_DONE on it
2173 * since we already set m->busy = TRUE, before
2174 * going off to reactivate it
2175 *
2d21ac55
A
2176 * Note that if 'pmapped' is FALSE then the page is not
2177 * and has not been in any map, so there is no point calling
2178 * pmap_disconnect(). m->dirty and/or m->reference could
2179 * have been set in anticipation of likely usage of the page.
91447636 2180 */
2d21ac55 2181 if (m->pmapped == TRUE) {
91447636 2182 refmod_state = pmap_disconnect(m->phys_page);
0b4e3aa0 2183
91447636
A
2184 if (refmod_state & VM_MEM_MODIFIED)
2185 m->dirty = TRUE;
2186 if (refmod_state & VM_MEM_REFERENCED) {
2d21ac55
A
2187
2188 /* If m->reference is already set, this page must have
2189 * already failed the reactivate_limit test, so don't
2190 * bump the counts twice.
2191 */
2192 if ( ! m->reference ) {
2193 m->reference = TRUE;
2194 if (forced_reclaim ||
2195 ++reactivated_this_call >= reactivate_limit)
2196 vm_pageout_reactivation_limit_exceeded++;
2197 else {
2198 PAGE_WAKEUP_DONE(m);
2199 goto reactivate_page;
2200 }
2201 }
91447636
A
2202 }
2203 }
2d21ac55
A
2204 /*
2205 * reset our count of pages that have been reclaimed
2206 * since the last page was 'stolen'
2207 */
2208 inactive_reclaim_run = 0;
2209
1c79356b
A
2210 /*
2211 * If it's clean and not precious, we can free the page.
2212 */
1c79356b 2213 if (!m->dirty && !m->precious) {
b0d623f7
A
2214 if (m->zero_fill)
2215 vm_pageout_inactive_zf++;
1c79356b 2216 vm_pageout_inactive_clean++;
b0d623f7 2217
1c79356b
A
2218 goto reclaim_page;
2219 }
2d21ac55
A
2220
2221 /*
2222 * The page may have been dirtied since the last check
2223 * for a throttled target queue (which may have been skipped
2224 * if the page was clean then). With the dirty page
2225 * disconnected here, we can make one final check.
2226 */
2227 {
2228 boolean_t disconnect_throttled = FALSE;
2229 if (object->internal) {
2230 if (VM_PAGE_Q_THROTTLED(iq))
2231 disconnect_throttled = TRUE;
2232 } else if (VM_PAGE_Q_THROTTLED(eq)) {
2233 disconnect_throttled = TRUE;
2234 }
2235
2236 if (disconnect_throttled == TRUE) {
2237 PAGE_WAKEUP_DONE(m);
2238 goto throttle_inactive;
2239 }
2240 }
2241
b0d623f7
A
2242 vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
2243
91447636 2244 vm_pageout_cluster(m);
1c79356b 2245
b0d623f7
A
2246 if (m->zero_fill)
2247 vm_pageout_inactive_zf++;
91447636 2248 vm_pageout_inactive_dirty++;
1c79356b 2249
91447636 2250 inactive_burst_count = 0;
1c79356b 2251
91447636 2252done_with_inactivepage:
2d21ac55 2253 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1c79356b 2254
91447636 2255 if (object != NULL) {
b0d623f7 2256 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
91447636
A
2257 vm_object_unlock(object);
2258 object = NULL;
2259 }
2260 if (local_freeq) {
b0d623f7
A
2261 vm_page_unlock_queues();
2262 vm_page_free_list(local_freeq, TRUE);
91447636 2263
2d21ac55 2264 local_freeq = NULL;
91447636 2265 local_freed = 0;
b0d623f7
A
2266 vm_page_lock_queues();
2267 } else
2268 lck_mtx_yield(&vm_page_queue_lock);
2d21ac55
A
2269
2270 delayed_unlock = 1;
1c79356b 2271 }
91447636
A
2272 /*
2273 * back to top of pageout scan loop
2274 */
1c79356b 2275 }
1c79356b
A
2276}
2277
1c79356b 2278
1c79356b
A
2279int vm_page_free_count_init;
2280
2281void
2282vm_page_free_reserve(
2283 int pages)
2284{
2285 int free_after_reserve;
2286
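	/*
	 * Recompute the free-page watermarks from the (possibly grown)
	 * reserve: free_min and free_target are derived from the pages
	 * left over after the reserve, clamped to their compile-time
	 * limits, and the target is kept at least a few pages above the
	 * minimum.  The throttle limits below are then expressed as
	 * fractions of free_target.
	 */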
2287 vm_page_free_reserved += pages;
2288
2289 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
2290
2291 vm_page_free_min = vm_page_free_reserved +
2292 VM_PAGE_FREE_MIN(free_after_reserve);
2293
2d21ac55
A
2294 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
2295 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
2296
1c79356b
A
2297 vm_page_free_target = vm_page_free_reserved +
2298 VM_PAGE_FREE_TARGET(free_after_reserve);
2299
2d21ac55
A
2300 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
2301 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
2302
1c79356b
A
2303 if (vm_page_free_target < vm_page_free_min + 5)
2304 vm_page_free_target = vm_page_free_min + 5;
2d21ac55 2305
b0d623f7
A
2306 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 3);
2307 vm_page_creation_throttle = vm_page_free_target / 2;
1c79356b
A
2308}
2309
2310/*
2311 * vm_pageout is the high level pageout daemon.
2312 */
2313
55e303ae
A
2314void
2315vm_pageout_continue(void)
2316{
2d21ac55 2317 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
55e303ae
A
2318 vm_pageout_scan_event_counter++;
2319 vm_pageout_scan();
2320 /* we hold vm_page_queue_free_lock now */
2321 assert(vm_page_free_wanted == 0);
2d21ac55 2322 assert(vm_page_free_wanted_privileged == 0);
55e303ae 2323 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
b0d623f7 2324 lck_mtx_unlock(&vm_page_queue_free_lock);
55e303ae
A
2325
2326 counter(c_vm_pageout_block++);
91447636 2327 thread_block((thread_continue_t)vm_pageout_continue);
55e303ae
A
2328 /*NOTREACHED*/
2329}
1c79356b 2330
91447636 2331
91447636 2332#ifdef FAKE_DEADLOCK
1c79356b 2333
91447636
A
2334#define FAKE_COUNT 5000
2335
2336int internal_count = 0;
2337int fake_deadlock = 0;
2338
2339#endif
2340
2341static void
2342vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2343{
2344 vm_page_t m = NULL;
2345 vm_object_t object;
2d21ac55
A
2346 memory_object_t pager;
2347 thread_t self = current_thread();
91447636 2348
2d21ac55
A
2349 if ((vm_pageout_internal_iothread != THREAD_NULL)
2350 && (self == vm_pageout_external_iothread )
2351 && (self->options & TH_OPT_VMPRIV))
2352 self->options &= ~TH_OPT_VMPRIV;
2353
2354 vm_page_lockspin_queues();
91447636
A
2355
2356 while ( !queue_empty(&q->pgo_pending) ) {
2357
2358 q->pgo_busy = TRUE;
2359 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
b0d623f7 2360 VM_PAGE_CHECK(m);
91447636 2361 m->pageout_queue = FALSE;
91447636
A
2362 m->pageq.next = NULL;
2363 m->pageq.prev = NULL;
b0d623f7
A
2364 vm_page_unlock_queues();
2365
91447636
A
2366#ifdef FAKE_DEADLOCK
2367 if (q == &vm_pageout_queue_internal) {
2368 vm_offset_t addr;
2369 int pg_count;
2370
2371 internal_count++;
2372
2373 if ((internal_count == FAKE_COUNT)) {
2374
2375 pg_count = vm_page_free_count + vm_page_free_reserved;
2376
2377 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2378 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2379 }
2380 internal_count = 0;
2381 fake_deadlock++;
2382 }
2383 }
2384#endif
2385 object = m->object;
2386
2d21ac55
A
2387 vm_object_lock(object);
2388
91447636 2389 if (!object->pager_initialized) {
91447636
A
2390
2391 /*
2392 * If there is no memory object for the page, create
2393 * one and hand it to the default pager.
2394 */
2395
2396 if (!object->pager_initialized)
0c530ab8
A
2397 vm_object_collapse(object,
2398 (vm_object_offset_t) 0,
2399 TRUE);
91447636
A
2400 if (!object->pager_initialized)
2401 vm_object_pager_create(object);
2402 if (!object->pager_initialized) {
2403 /*
2404 * Still no pager for the object.
2405 * Reactivate the page.
2406 *
2407 * Should only happen if there is no
2408 * default pager.
2409 */
2d21ac55 2410 vm_page_lockspin_queues();
b0d623f7
A
2411
2412 vm_pageout_queue_steal(m, TRUE);
91447636
A
2413 vm_pageout_dirty_no_pager++;
2414 vm_page_activate(m);
b0d623f7 2415
91447636
A
2416 vm_page_unlock_queues();
2417
2418 /*
2419 * And we are done with it.
2420 */
2421 PAGE_WAKEUP_DONE(m);
2422
2423 vm_object_paging_end(object);
2424 vm_object_unlock(object);
2425
2d21ac55 2426 vm_page_lockspin_queues();
91447636 2427 continue;
2d21ac55
A
2428 }
2429 }
2430 pager = object->pager;
2431 if (pager == MEMORY_OBJECT_NULL) {
2432 /*
2433 * This pager has been destroyed by either
2434 * memory_object_destroy or vm_object_destroy, and
2435 * so there is nowhere for the page to go.
2d21ac55 2436 */
0b4c1975
A
2437 if (m->pageout) {
2438 /*
2439 * Just free the page... VM_PAGE_FREE takes
2440 * care of cleaning up all the state...
2441 * including doing the vm_pageout_throttle_up
2442 */
2443 VM_PAGE_FREE(m);
2444 } else {
2445 vm_page_lockspin_queues();
91447636 2446
0b4c1975
A
2447 vm_pageout_queue_steal(m, TRUE);
2448 vm_page_activate(m);
2449
2450 vm_page_unlock_queues();
91447636 2451
0b4c1975
A
2452 /*
2453 * And we are done with it.
2454 */
2455 PAGE_WAKEUP_DONE(m);
2456 }
2d21ac55 2457 vm_object_paging_end(object);
91447636 2458 vm_object_unlock(object);
2d21ac55
A
2459
2460 vm_page_lockspin_queues();
2461 continue;
91447636 2462 }
b0d623f7 2463 VM_PAGE_CHECK(m);
2d21ac55 2464 vm_object_unlock(object);
91447636
A
2465 /*
2466 * we expect the paging_in_progress reference to have
2467 * already been taken on the object before it was added
2468 * to the appropriate pageout I/O queue... this will
2469 * keep the object from being terminated and/or the
2470 * paging_offset from changing until the I/O has
2471 * completed... therefore no need to lock the object to
2472 * pull the paging_offset from it.
2473 *
2474 * Send the data to the pager.
2475 * any pageout clustering happens there
2476 */
2d21ac55 2477 memory_object_data_return(pager,
91447636
A
2478 m->offset + object->paging_offset,
2479 PAGE_SIZE,
2480 NULL,
2481 NULL,
2482 FALSE,
2483 FALSE,
2484 0);
2485
2486 vm_object_lock(object);
2487 vm_object_paging_end(object);
2488 vm_object_unlock(object);
2489
2d21ac55 2490 vm_page_lockspin_queues();
91447636
A
2491 }
2492 assert_wait((event_t) q, THREAD_UNINT);
2493
91447636
A
2494 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2495 q->pgo_throttled = FALSE;
0b4c1975
A
2496 thread_wakeup((event_t) &q->pgo_laundry);
2497 }
2498 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
2499 q->pgo_draining = FALSE;
2500 thread_wakeup((event_t) (&q->pgo_laundry+1));
2501 }
91447636
A
2502 q->pgo_busy = FALSE;
2503 q->pgo_idle = TRUE;
2504 vm_page_unlock_queues();
2505
91447636
A
2506 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2507 /*NOTREACHED*/
2508}
2509
2510
2511static void
2512vm_pageout_iothread_external(void)
2513{
2d21ac55
A
2514 thread_t self = current_thread();
2515
2516 self->options |= TH_OPT_VMPRIV;
91447636
A
2517
2518 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2519 /*NOTREACHED*/
2520}
2521
2522
2523static void
2524vm_pageout_iothread_internal(void)
2525{
2526 thread_t self = current_thread();
2527
2528 self->options |= TH_OPT_VMPRIV;
2529
2530 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2531 /*NOTREACHED*/
2532}
2533
b0d623f7 2534kern_return_t
0b4c1975 2535vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
b0d623f7
A
2536{
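	/*
	 * Install the buffer-cache cleanup callout at most once: the
	 * compare-and-swap only succeeds while the pointer is still NULL,
	 * so a second registration fails rather than silently replacing
	 * the callback that vm_pageout_garbage_collect() will invoke.
	 */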
2537 if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) {
2538 return KERN_SUCCESS;
2539 } else {
2540 return KERN_FAILURE; /* Already set */
2541 }
2542}
2543
91447636
A
2544static void
2545vm_pageout_garbage_collect(int collect)
2546{
2547 if (collect) {
b0d623f7 2548 boolean_t buf_large_zfree = FALSE;
91447636
A
2549 stack_collect();
2550
2551 /*
2552 * consider_zone_gc should be last, because the other operations
2553 * might return memory to zones.
2554 */
2555 consider_machine_collect();
b0d623f7 2556 if (consider_buffer_cache_collect != NULL) {
0b4c1975 2557 buf_large_zfree = (*consider_buffer_cache_collect)(0);
b0d623f7
A
2558 }
2559 consider_zone_gc(buf_large_zfree);
91447636
A
2560
2561 consider_machine_adjust();
2562 }
2563
2564 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2565
2566 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2567 /*NOTREACHED*/
2568}
2569
2570
2571
2572void
2573vm_pageout(void)
2574{
2575 thread_t self = current_thread();
2576 thread_t thread;
2577 kern_return_t result;
2578 spl_t s;
2579
2580 /*
2581 * Set thread privileges.
2582 */
2583 s = splsched();
2584 thread_lock(self);
2585 self->priority = BASEPRI_PREEMPT - 1;
2586 set_sched_pri(self, self->priority);
2587 thread_unlock(self);
2d21ac55
A
2588
2589 if (!self->reserved_stack)
2590 self->reserved_stack = self->kernel_stack;
2591
91447636
A
2592 splx(s);
2593
2594 /*
2595 * Initialize some paging parameters.
2596 */
2597
2598 if (vm_pageout_idle_wait == 0)
2599 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2600
2601 if (vm_pageout_burst_wait == 0)
2602 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2603
2604 if (vm_pageout_empty_wait == 0)
2605 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2606
2607 if (vm_pageout_deadlock_wait == 0)
2608 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2609
2610 if (vm_pageout_deadlock_relief == 0)
2611 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2612
2613 if (vm_pageout_inactive_relief == 0)
2614 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2615
2616 if (vm_pageout_burst_active_throttle == 0)
2617 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2618
2619 if (vm_pageout_burst_inactive_throttle == 0)
2620 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2621
2622 /*
2623 * Set kernel task to low backing store privileged
55e303ae
A
2624 * status
2625 */
2626 task_lock(kernel_task);
2627 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2628 task_unlock(kernel_task);
2629
1c79356b 2630 vm_page_free_count_init = vm_page_free_count;
2d21ac55 2631
1c79356b
A
2632 /*
2633 * even if we've already called vm_page_free_reserve
2634 * call it again here to insure that the targets are
2635 * accurately calculated (it uses vm_page_free_count_init)
2636 * calling it with an arg of 0 will not change the reserve
2637 * but will re-calculate free_min and free_target
2638 */
91447636
A
2639 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2640 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
55e303ae 2641 } else
1c79356b
A
2642 vm_page_free_reserve(0);
2643
55e303ae 2644
91447636
A
2645 queue_init(&vm_pageout_queue_external.pgo_pending);
2646 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2647 vm_pageout_queue_external.pgo_laundry = 0;
2648 vm_pageout_queue_external.pgo_idle = FALSE;
2649 vm_pageout_queue_external.pgo_busy = FALSE;
2650 vm_pageout_queue_external.pgo_throttled = FALSE;
0b4c1975 2651 vm_pageout_queue_external.pgo_draining = FALSE;
55e303ae 2652
91447636 2653 queue_init(&vm_pageout_queue_internal.pgo_pending);
2d21ac55 2654 vm_pageout_queue_internal.pgo_maxlaundry = 0;
91447636
A
2655 vm_pageout_queue_internal.pgo_laundry = 0;
2656 vm_pageout_queue_internal.pgo_idle = FALSE;
2657 vm_pageout_queue_internal.pgo_busy = FALSE;
2658 vm_pageout_queue_internal.pgo_throttled = FALSE;
0b4c1975 2659 vm_pageout_queue_internal.pgo_draining = FALSE;
9bccf70c 2660
55e303ae 2661
2d21ac55
A
2662 /* the internal pageout thread is started when the default pager registers for the first time */
2663 /* the external pageout and garbage collection threads are started here */
55e303ae 2664
2d21ac55
A
2665 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
2666 BASEPRI_PREEMPT - 1,
2667 &vm_pageout_external_iothread);
91447636
A
2668 if (result != KERN_SUCCESS)
2669 panic("vm_pageout_iothread_external: create failed");
55e303ae 2670
2d21ac55 2671 thread_deallocate(vm_pageout_external_iothread);
9bccf70c 2672
2d21ac55
A
2673 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
2674 MINPRI_KERNEL,
2675 &thread);
91447636
A
2676 if (result != KERN_SUCCESS)
2677 panic("vm_pageout_garbage_collect: create failed");
55e303ae 2678
91447636 2679 thread_deallocate(thread);
55e303ae 2680
8f6c56a5
A
2681 vm_object_reaper_init();
2682
2d21ac55 2683
91447636 2684 vm_pageout_continue();
2d21ac55
A
2685
2686 /*
2687 * Unreached code!
2688 *
2689 * The vm_pageout_continue() call above never returns, so the code below is never
2690 * executed. We take advantage of this to declare several DTrace VM related probe
2691 * points that our kernel doesn't have an analog for. These are probe points that
2692 * exist in Solaris and are in the DTrace documentation, so people may have written
2693 * scripts that use them. Declaring the probe points here means their scripts will
2694 * compile and execute which we want for portability of the scripts, but since this
2695 * section of code is never reached, the probe points will simply never fire. Yes,
2696 * this is basically a hack. The problem is the DTrace probe points were chosen with
2697 * Solaris specific VM events in mind, not portability to different VM implementations.
2698 */
2699
2700 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
2701 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
2702 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
2703 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
2704 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
2705 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
2706 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
91447636 2707 /*NOTREACHED*/
9bccf70c
A
2708}
2709
2d21ac55
A
2710kern_return_t
2711vm_pageout_internal_start(void)
2712{
2713 kern_return_t result;
2714
2715 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2716 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
2717 if (result == KERN_SUCCESS)
2718 thread_deallocate(vm_pageout_internal_iothread);
2719 return result;
2720}
2721
1c79356b 2722
b0d623f7
A
2723/*
2724 * when marshalling pages into a UPL and subsequently committing
2725 * or aborting them, it is necessary to hold
2726 * the vm_page_queue_lock (a hot global lock) for certain operations
2727 * on the page... however, the majority of the work can be done
2728 * while merely holding the object lock... in fact there are certain
2729 * collections of pages that don't require any work brokered by the
2730 * vm_page_queue_lock... to mitigate the time spent behind the global
2731 * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT
2732 * while doing all of the work that doesn't require the vm_page_queue_lock...
2733 * then call dw_do_work to acquire the vm_page_queue_lock and do the
2734 * necessary work for each page... we will grab the busy bit on the page
2735 * if it's not already held so that dw_do_work can drop the object lock
2736 * if it can't immediately take the vm_page_queue_lock in order to compete
2737 * for the locks in the same order that vm_pageout_scan takes them.
2738 * the operation names are modeled after the names of the routines that
2739 * need to be called in order to make the changes very obvious in the
2740 * original loop
2741 */
2742
2743#define DELAYED_WORK_LIMIT 32
2744
2745#define DW_vm_page_unwire 0x01
2746#define DW_vm_page_wire 0x02
2747#define DW_vm_page_free 0x04
2748#define DW_vm_page_activate 0x08
2749#define DW_vm_page_deactivate_internal 0x10
2750#define DW_vm_page_speculate 0x20
2751#define DW_vm_page_lru 0x40
2752#define DW_vm_pageout_throttle_up 0x80
2753#define DW_PAGE_WAKEUP 0x100
2754#define DW_clear_busy 0x200
2755#define DW_clear_reference 0x400
2756#define DW_set_reference 0x800
2757
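/*
 * Each struct dw entry pairs a page with a bitmask of the DW_* operations
 * above; the operations are recorded while only the object lock is held
 * and then applied in dw_do_work() once the vm_page_queue_lock has been
 * acquired, as described in the comment preceding DELAYED_WORK_LIMIT.
 */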
2758struct dw {
2759 vm_page_t dw_m;
2760 int dw_mask;
2761};
2762
2763
2764static void dw_do_work(vm_object_t object, struct dw *dwp, int dw_count);
2765
2766
2767
2768static upl_t
2769upl_create(int type, int flags, upl_size_t size)
0b4e3aa0
A
2770{
2771 upl_t upl;
2d21ac55
A
2772 int page_field_size = 0;
2773 int upl_flags = 0;
2774 int upl_size = sizeof(struct upl);
0b4e3aa0 2775
b0d623f7
A
2776 size = round_page_32(size);
2777
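	/*
	 * For a "lite" UPL, page_field_size is the size of the bitmap that
	 * tracks one bit per page of the (page-rounded) request, rounded up
	 * to whole bytes and then to a 4-byte boundary; it is allocated at
	 * the tail of the upl structure along with any internal page list.
	 */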
2d21ac55 2778 if (type & UPL_CREATE_LITE) {
b0d623f7 2779 page_field_size = (atop(size) + 7) >> 3;
55e303ae 2780 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2d21ac55
A
2781
2782 upl_flags |= UPL_LITE;
55e303ae 2783 }
2d21ac55 2784 if (type & UPL_CREATE_INTERNAL) {
b0d623f7 2785 upl_size += (int) sizeof(struct upl_page_info) * atop(size);
2d21ac55
A
2786
2787 upl_flags |= UPL_INTERNAL;
0b4e3aa0 2788 }
2d21ac55
A
2789 upl = (upl_t)kalloc(upl_size + page_field_size);
2790
2791 if (page_field_size)
2792 bzero((char *)upl + upl_size, page_field_size);
2793
2794 upl->flags = upl_flags | flags;
0b4e3aa0
A
2795 upl->src_object = NULL;
2796 upl->kaddr = (vm_offset_t)0;
2797 upl->size = 0;
2798 upl->map_object = NULL;
2799 upl->ref_count = 1;
0c530ab8 2800 upl->highest_page = 0;
0b4e3aa0 2801 upl_lock_init(upl);
b0d623f7
A
2802 upl->vector_upl = NULL;
2803#if UPL_DEBUG
0b4e3aa0
A
2804 upl->ubc_alias1 = 0;
2805 upl->ubc_alias2 = 0;
b0d623f7
A
2806
2807 upl->upl_creator = current_thread();
2808 upl->upl_state = 0;
2809 upl->upl_commit_index = 0;
2810 bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records));
2811
2812 (void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
91447636 2813#endif /* UPL_DEBUG */
b0d623f7 2814
0b4e3aa0
A
2815 return(upl);
2816}
2817
2818static void
2d21ac55 2819upl_destroy(upl_t upl)
0b4e3aa0 2820{
55e303ae 2821 int page_field_size; /* bit field in word size buf */
2d21ac55 2822 int size;
0b4e3aa0 2823
b0d623f7 2824#if UPL_DEBUG
0b4e3aa0 2825 {
55e303ae 2826 vm_object_t object;
2d21ac55
A
2827
2828 if (upl->flags & UPL_SHADOWED) {
55e303ae
A
2829 object = upl->map_object->shadow;
2830 } else {
2831 object = upl->map_object;
2832 }
2833 vm_object_lock(object);
2d21ac55 2834 queue_remove(&object->uplq, upl, upl_t, uplq);
55e303ae 2835 vm_object_unlock(object);
0b4e3aa0 2836 }
91447636 2837#endif /* UPL_DEBUG */
2d21ac55
A
2838 /*
2839 * drop a reference on the map_object whether or
2840 * not a pageout object is inserted
2841 */
2842 if (upl->flags & UPL_SHADOWED)
0b4e3aa0 2843 vm_object_deallocate(upl->map_object);
55e303ae 2844
2d21ac55
A
2845 if (upl->flags & UPL_DEVICE_MEMORY)
2846 size = PAGE_SIZE;
2847 else
2848 size = upl->size;
55e303ae 2849 page_field_size = 0;
2d21ac55 2850
55e303ae 2851 if (upl->flags & UPL_LITE) {
2d21ac55 2852 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
55e303ae
A
2853 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2854 }
b0d623f7
A
2855 upl_lock_destroy(upl);
2856 upl->vector_upl = (vector_upl_t) 0xfeedbeef;
2d21ac55 2857 if (upl->flags & UPL_INTERNAL) {
91447636
A
2858 kfree(upl,
2859 sizeof(struct upl) +
2d21ac55 2860 (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
91447636 2861 + page_field_size);
0b4e3aa0 2862 } else {
91447636 2863 kfree(upl, sizeof(struct upl) + page_field_size);
0b4e3aa0
A
2864 }
2865}
2866
91447636 2867void uc_upl_dealloc(upl_t upl);
0b4e3aa0 2868__private_extern__ void
2d21ac55 2869uc_upl_dealloc(upl_t upl)
1c79356b 2870{
2d21ac55 2871 if (--upl->ref_count == 0)
1c79356b 2872 upl_destroy(upl);
1c79356b
A
2873}
2874
0b4e3aa0 2875void
2d21ac55 2876upl_deallocate(upl_t upl)
0b4e3aa0 2877{
b0d623f7
A
2878 if (--upl->ref_count == 0) {
2879 if(vector_upl_is_valid(upl))
2880 vector_upl_deallocate(upl);
0b4e3aa0 2881 upl_destroy(upl);
b0d623f7 2882 }
0b4e3aa0 2883}
1c79356b 2884
b0d623f7
A
2885#if DEVELOPMENT || DEBUG
2886/*
91447636
A
2887 * Statistics about UPL enforcement of copy-on-write obligations.
2888 */
2889unsigned long upl_cow = 0;
2890unsigned long upl_cow_again = 0;
91447636
A
2891unsigned long upl_cow_pages = 0;
2892unsigned long upl_cow_again_pages = 0;
b0d623f7
A
2893
2894unsigned long iopl_cow = 0;
2895unsigned long iopl_cow_pages = 0;
2896#endif
91447636 2897
1c79356b 2898/*
0b4e3aa0 2899 * Routine: vm_object_upl_request
1c79356b
A
2900 * Purpose:
2901 * Cause the population of a portion of a vm_object.
2902 * Depending on the nature of the request, the pages
2903 * returned may contain valid data or be uninitialized.
2904 * A page list structure, listing the physical pages
2905 * will be returned upon request.
2906 * This function is called by the file system or any other
2907 * supplier of backing store to a pager.
2908 * IMPORTANT NOTE: The caller must still respect the relationship
2909 * between the vm_object and its backing memory object. The
2910 * caller MUST NOT substitute changes in the backing file
2911 * without first doing a memory_object_lock_request on the
2912 * target range unless it is known that the pages are not
2913 * shared with another entity at the pager level.
2914 * Copy_in_to:
2915 * if a page list structure is present
2916 * return the mapped physical pages, where a
2917 * page is not present, return a non-initialized
2918 * one. If the no_sync bit is turned on, don't
2919 * call the pager unlock to synchronize with other
2920 * possible copies of the page. Leave pages busy
2921 * in the original object, if a page list structure
2922 * was specified. When a commit of the page list
2923 * pages is done, the dirty bit will be set for each one.
2924 * Copy_out_from:
2925 * If a page list structure is present, return
2926 * all mapped pages. Where a page does not exist
2927 * map a zero filled one. Leave pages busy in
2928 * the original object. If a page list structure
2929 * is not specified, this call is a no-op.
2930 *
2931 * Note: access of default pager objects has a rather interesting
2932 * twist. The caller of this routine, presumably the file system
2933 * page cache handling code, will never actually make a request
2934 * against a default pager backed object. Only the default
2935 * pager will make requests on backing store related vm_objects
2936 * In this way the default pager can maintain the relationship
2937 * between backing store files (abstract memory objects) and
2938 * the vm_objects (cache objects) they support.
2939 *
2940 */
91447636 2941
0b4e3aa0
A
2942__private_extern__ kern_return_t
2943vm_object_upl_request(
1c79356b 2944 vm_object_t object,
91447636
A
2945 vm_object_offset_t offset,
2946 upl_size_t size,
1c79356b 2947 upl_t *upl_ptr,
0b4e3aa0
A
2948 upl_page_info_array_t user_page_list,
2949 unsigned int *page_list_count,
91447636 2950 int cntrl_flags)
1c79356b 2951{
91447636 2952 vm_page_t dst_page = VM_PAGE_NULL;
2d21ac55
A
2953 vm_object_offset_t dst_offset;
2954 upl_size_t xfer_size;
1c79356b 2955 boolean_t dirty;
55e303ae 2956 boolean_t hw_dirty;
1c79356b 2957 upl_t upl = NULL;
91447636
A
2958 unsigned int entry;
2959#if MACH_CLUSTER_STATS
1c79356b 2960 boolean_t encountered_lrp = FALSE;
91447636 2961#endif
1c79356b 2962 vm_page_t alias_page = NULL;
2d21ac55 2963 int refmod_state = 0;
91447636
A
2964 wpl_array_t lite_list = NULL;
2965 vm_object_t last_copy_object;
b0d623f7
A
2966 struct dw dw_array[DELAYED_WORK_LIMIT];
2967 struct dw *dwp;
2968 int dw_count;
91447636
A
2969
2970 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2971 /*
2972 * For forward compatibility's sake,
2973 * reject any unknown flag.
2974 */
2975 return KERN_INVALID_VALUE;
2976 }
2d21ac55
A
2977 if ( (!object->internal) && (object->paging_offset != 0) )
2978 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2979 if (object->phys_contiguous)
2980 panic("vm_object_upl_request: contiguous object specified\n");
0b4e3aa0 2981
0b4e3aa0 2982
cf7d32b8
A
2983 if ((size / PAGE_SIZE) > MAX_UPL_SIZE)
2984 size = MAX_UPL_SIZE * PAGE_SIZE;
1c79356b 2985
2d21ac55 2986 if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
cf7d32b8 2987 *page_list_count = MAX_UPL_SIZE;
1c79356b 2988
2d21ac55
A
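	/*
	 * Lay out the optional page list and lite-list bitmap immediately
	 * after the upl structure itself, as sized by upl_create(): for an
	 * internal UPL the upl_page_info array comes first and the lite
	 * bitmap (if any) follows it; an external lite UPL carries only
	 * the bitmap.
	 */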
2989 if (cntrl_flags & UPL_SET_INTERNAL) {
2990 if (cntrl_flags & UPL_SET_LITE) {
55e303ae 2991
2d21ac55 2992 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
91447636 2993
2d21ac55
A
2994 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2995 lite_list = (wpl_array_t)
91447636 2996 (((uintptr_t)user_page_list) +
2d21ac55 2997 ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
b0d623f7
A
2998 if (size == 0) {
2999 user_page_list = NULL;
3000 lite_list = NULL;
3001 }
1c79356b 3002 } else {
2d21ac55 3003 upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
55e303ae 3004
2d21ac55 3005 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
b0d623f7
A
3006 if (size == 0) {
3007 user_page_list = NULL;
3008 }
55e303ae 3009 }
2d21ac55
A
3010 } else {
3011 if (cntrl_flags & UPL_SET_LITE) {
91447636 3012
2d21ac55 3013 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
55e303ae 3014
2d21ac55 3015 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
b0d623f7
A
3016 if (size == 0) {
3017 lite_list = NULL;
3018 }
55e303ae 3019 } else {
2d21ac55 3020 upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
0b4e3aa0 3021 }
55e303ae 3022 }
2d21ac55
A
3023 *upl_ptr = upl;
3024
3025 if (user_page_list)
3026 user_page_list[0].device = FALSE;
91447636 3027
2d21ac55
A
3028 if (cntrl_flags & UPL_SET_LITE) {
3029 upl->map_object = object;
3030 } else {
3031 upl->map_object = vm_object_allocate(size);
3032 /*
3033 * No need to lock the new object: nobody else knows
3034 * about it yet, so it's all ours so far.
3035 */
3036 upl->map_object->shadow = object;
3037 upl->map_object->pageout = TRUE;
3038 upl->map_object->can_persist = FALSE;
3039 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3040 upl->map_object->shadow_offset = offset;
3041 upl->map_object->wimg_bits = object->wimg_bits;
3042
3043 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3044
3045 upl->flags |= UPL_SHADOWED;
3046 }
3047 /*
91447636
A
3048 * ENCRYPTED SWAP:
3049 * Just mark the UPL as "encrypted" here.
3050 * We'll actually encrypt the pages later,
3051 * in upl_encrypt(), when the caller has
3052 * selected which pages need to go to swap.
3053 */
2d21ac55 3054 if (cntrl_flags & UPL_ENCRYPT)
91447636 3055 upl->flags |= UPL_ENCRYPTED;
2d21ac55
A
3056
3057 if (cntrl_flags & UPL_FOR_PAGEOUT)
91447636 3058 upl->flags |= UPL_PAGEOUT;
2d21ac55 3059
55e303ae 3060 vm_object_lock(object);
b0d623f7 3061 vm_object_activity_begin(object);
2d21ac55
A
3062
3063 /*
3064 * we can lock in the paging_offset once paging_in_progress is set
3065 */
3066 upl->size = size;
3067 upl->offset = offset + object->paging_offset;
55e303ae 3068
b0d623f7 3069#if UPL_DEBUG
2d21ac55 3070 queue_enter(&object->uplq, upl, upl_t, uplq);
91447636 3071#endif /* UPL_DEBUG */
91447636 3072
2d21ac55 3073 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
91447636 3074 /*
2d21ac55
A
3075 * Honor copy-on-write obligations
3076 *
91447636
A
3077 * The caller is gathering these pages and
3078 * might modify their contents. We need to
3079 * make sure that the copy object has its own
3080 * private copies of these pages before we let
3081 * the caller modify them.
3082 */
3083 vm_object_update(object,
3084 offset,
3085 size,
3086 NULL,
3087 NULL,
3088 FALSE, /* should_return */
3089 MEMORY_OBJECT_COPY_SYNC,
3090 VM_PROT_NO_CHANGE);
b0d623f7 3091#if DEVELOPMENT || DEBUG
91447636
A
3092 upl_cow++;
3093 upl_cow_pages += size >> PAGE_SHIFT;
b0d623f7 3094#endif
55e303ae 3095 }
2d21ac55
A
3096 /*
3097 * remember which copy object we synchronized with
3098 */
91447636 3099 last_copy_object = object->copy;
1c79356b 3100 entry = 0;
55e303ae 3101
2d21ac55
A
3102 xfer_size = size;
3103 dst_offset = offset;
3104
b0d623f7
A
3105 dwp = &dw_array[0];
3106 dw_count = 0;
3107
2d21ac55
A
3108 while (xfer_size) {
3109
b0d623f7
A
3110 dwp->dw_mask = 0;
3111
2d21ac55 3112 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
2d21ac55
A
3113 vm_object_unlock(object);
3114 VM_PAGE_GRAB_FICTITIOUS(alias_page);
b0d623f7 3115 vm_object_lock(object);
4a3eedf9 3116 }
2d21ac55
A
3117 if (cntrl_flags & UPL_COPYOUT_FROM) {
3118 upl->flags |= UPL_PAGE_SYNC_DONE;
3119
91447636 3120 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
1c79356b
A
3121 dst_page->fictitious ||
3122 dst_page->absent ||
3123 dst_page->error ||
b0d623f7 3124 (VM_PAGE_WIRED(dst_page) && !dst_page->pageout && !dst_page->list_req_pending)) {
91447636
A
3125
3126 if (user_page_list)
1c79356b 3127 user_page_list[entry].phys_addr = 0;
2d21ac55 3128
b0d623f7 3129 goto try_next_page;
2d21ac55
A
3130 }
3131 /*
3132 * grab this up front...
3133 * a high percentage of the time we're going to
3134 * need the hardware modification state a bit later
3135 * anyway... so we can eliminate an extra call into
3136 * the pmap layer by grabbing it here and recording it
3137 */
3138 if (dst_page->pmapped)
3139 refmod_state = pmap_get_refmod(dst_page->phys_page);
3140 else
3141 refmod_state = 0;
3142
3143 if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
91447636 3144 /*
2d21ac55
A
3145 * page is on inactive list and referenced...
3146 * reactivate it now... this gets it out of the
3147 * way of vm_pageout_scan which would have to
3148 * reactivate it upon tripping over it
91447636 3149 */
b0d623f7 3150 dwp->dw_mask |= DW_vm_page_activate;
2d21ac55
A
3151 }
3152 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
3153 /*
3154 * we're only asking for DIRTY pages to be returned
3155 */
3156 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
91447636 3157 /*
2d21ac55
A
3158 * if we were the page stolen by vm_pageout_scan to be
3159 * cleaned (as opposed to a buddy being clustered in),
3160 * or if this request is not being driven by a PAGEOUT cluster,
3161 * then we only need to check for the page being dirty or
3162 * precious to decide whether to return it
91447636 3163 */
2d21ac55 3164 if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
91447636 3165 goto check_busy;
2d21ac55 3166 goto dont_return;
1c79356b 3167 }
2d21ac55
A
3168 /*
3169 * this is a request for a PAGEOUT cluster and this page
3170 * is merely along for the ride as a 'buddy'... not only
3171 * does it have to be dirty to be returned, but it also
3172 * can't have been referenced recently... note that we've
3173 * already filtered above based on whether this page is
3174 * currently on the inactive queue or it meets the page
3175 * ticket (generation count) check
3176 */
0b4c1975 3177 if ( (cntrl_flags & UPL_CLEAN_IN_PLACE || !(refmod_state & VM_MEM_REFERENCED)) &&
2d21ac55
A
3178 ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
3179 goto check_busy;
1c79356b 3180 }
2d21ac55
A
3181dont_return:
3182 /*
3183 * if we reach here, we're not to return
3184 * the page... go on to the next one
3185 */
3186 if (user_page_list)
3187 user_page_list[entry].phys_addr = 0;
55e303ae 3188
b0d623f7 3189 goto try_next_page;
2d21ac55
A
3190 }
3191check_busy:
0b4c1975 3192 if (dst_page->busy && (!(dst_page->list_req_pending && (dst_page->pageout || dst_page->cleaning)))) {
2d21ac55
A
3193 if (cntrl_flags & UPL_NOBLOCK) {
3194 if (user_page_list)
3195 user_page_list[entry].phys_addr = 0;
55e303ae 3196
b0d623f7 3197 goto try_next_page;
1c79356b 3198 }
2d21ac55
A
3199 /*
3200 * someone else is playing with the
3201 * page. We will have to wait.
3202 */
2d21ac55 3203 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
1c79356b 3204
2d21ac55
A
3205 continue;
3206 }
3207 /*
3208 * Someone else already cleaning the page?
3209 */
b0d623f7 3210 if ((dst_page->cleaning || dst_page->absent || VM_PAGE_WIRED(dst_page)) && !dst_page->list_req_pending) {
2d21ac55
A
3211 if (user_page_list)
3212 user_page_list[entry].phys_addr = 0;
91447636 3213
b0d623f7 3214 goto try_next_page;
2d21ac55
A
3215 }
3216 /*
3217 * ENCRYPTED SWAP:
3218 * The caller is gathering this page and might
3219 * access its contents later on. Decrypt the
3220 * page before adding it to the UPL, so that
3221 * the caller never sees encrypted data.
3222 */
3223 if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
3224 int was_busy;
91447636
A
3225
3226 /*
2d21ac55
A
3227 * save the current state of busy
3228 * mark page as busy while decrypt
3229 * is in progress since it will drop
3230 * the object lock...
91447636 3231 */
2d21ac55
A
3232 was_busy = dst_page->busy;
3233 dst_page->busy = TRUE;
91447636 3234
2d21ac55
A
3235 vm_page_decrypt(dst_page, 0);
3236 vm_page_decrypt_for_upl_counter++;
3237 /*
3238 * restore to original busy state
3239 */
3240 dst_page->busy = was_busy;
b0d623f7
A
3241 }
3242 if (dst_page->pageout_queue == TRUE) {
91447636 3243
b0d623f7
A
3244 vm_page_lockspin_queues();
3245
d1ecb069
A
3246#if CONFIG_EMBEDDED
3247 if (dst_page->laundry)
3248#else
3249 if (dst_page->pageout_queue == TRUE)
3250#endif
3251 {
b0d623f7
A
3252 /*
3253 * we've buddied up a page for a clustered pageout
3254 * that has already been moved to the pageout
3255 * queue by pageout_scan... we need to remove
3256 * it from the queue and drop the laundry count
3257 * on that queue
3258 */
3259 vm_pageout_throttle_up(dst_page);
3260 }
3261 vm_page_unlock_queues();
91447636 3262 }
2d21ac55
A
3263#if MACH_CLUSTER_STATS
3264 /*
3265 * pageout statistics gathering. count
3266 * all the pages we will page out that
3267 * were not counted in the initial
3268 * vm_pageout_scan work
3269 */
3270 if (dst_page->list_req_pending)
3271 encountered_lrp = TRUE;
3272 if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious)) && !dst_page->list_req_pending) {
3273 if (encountered_lrp)
3274 CLUSTER_STAT(pages_at_higher_offsets++;)
3275 else
3276 CLUSTER_STAT(pages_at_lower_offsets++;)
3277 }
3278#endif
3279 /*
3280 * Turn off busy indication on pending
3281 * pageout. Note: we can only get here
3282 * in the request pending case.
3283 */
3284 dst_page->list_req_pending = FALSE;
3285 dst_page->busy = FALSE;
3286
3287 hw_dirty = refmod_state & VM_MEM_MODIFIED;
3288 dirty = hw_dirty ? TRUE : dst_page->dirty;
3289
3290 if (dst_page->phys_page > upl->highest_page)
3291 upl->highest_page = dst_page->phys_page;
3292
3293 if (cntrl_flags & UPL_SET_LITE) {
b0d623f7 3294 unsigned int pg_num;
2d21ac55 3295
b0d623f7
A
3296 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
3297 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
2d21ac55
A
3298 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3299
3300 if (hw_dirty)
3301 pmap_clear_modify(dst_page->phys_page);
3302
3303 /*
3304 * Mark original page as cleaning
3305 * in place.
3306 */
3307 dst_page->cleaning = TRUE;
3308 dst_page->precious = FALSE;
3309 } else {
3310 /*
3311 * use pageclean setup, it is more
3312 * convenient even for the pageout
3313 * cases here
3314 */
3315 vm_object_lock(upl->map_object);
3316 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3317 vm_object_unlock(upl->map_object);
3318
3319 alias_page->absent = FALSE;
3320 alias_page = NULL;
1c79356b 3321 }
2d21ac55
A
3322#if MACH_PAGEMAP
3323 /*
3324 * Record that this page has been
3325 * written out
3326 */
3327 vm_external_state_set(object->existence_map, dst_page->offset);
3328#endif /*MACH_PAGEMAP*/
3329 dst_page->dirty = dirty;
55e303ae 3330
2d21ac55
A
3331 if (!dirty)
3332 dst_page->precious = TRUE;
91447636 3333
2d21ac55
A
3334 if (dst_page->pageout)
3335 dst_page->busy = TRUE;
3336
3337 if ( (cntrl_flags & UPL_ENCRYPT) ) {
3338 /*
3339 * ENCRYPTED SWAP:
3340 * We want to deny access to the target page
3341 * because its contents are about to be
3342 * encrypted and the user would be very
3343 * confused to see encrypted data instead
3344 * of their data.
3345 * We also set "encrypted_cleaning" to allow
3346 * vm_pageout_scan() to demote that page
3347 * from "adjacent/clean-in-place" to
3348 * "target/clean-and-free" if it bumps into
3349 * this page during its scanning while we're
3350 * still processing this cluster.
3351 */
3352 dst_page->busy = TRUE;
3353 dst_page->encrypted_cleaning = TRUE;
3354 }
3355 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
3356 /*
3357 * deny access to the target page
3358 * while it is being worked on
3359 */
b0d623f7 3360 if ((!dst_page->pageout) && ( !VM_PAGE_WIRED(dst_page))) {
2d21ac55
A
3361 dst_page->busy = TRUE;
3362 dst_page->pageout = TRUE;
b0d623f7
A
3363
3364 dwp->dw_mask |= DW_vm_page_wire;
2d21ac55
A
3365 }
3366 }
3367 } else {
3368 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
91447636 3369 /*
2d21ac55
A
3370 * Honor copy-on-write obligations
3371 *
91447636
A
3372 * The copy object has changed since we
3373 * last synchronized for copy-on-write.
3374 * Another copy object might have been
3375 * inserted while we released the object's
3376 * lock. Since someone could have seen the
3377 * original contents of the remaining pages
3378 * through that new object, we have to
3379 * synchronize with it again for the remaining
3380 * pages only. The previous pages are "busy"
3381 * so they can not be seen through the new
3382 * mapping. The new mapping will see our
3383 * upcoming changes for those previous pages,
3384 * but that's OK since they couldn't see what
3385 * was there before. It's just a race anyway
3386 * and there's no guarantee of consistency or
3387 * atomicity. We just don't want new mappings
3388 * to see both the *before* and *after* pages.
3389 */
3390 if (object->copy != VM_OBJECT_NULL) {
3391 vm_object_update(
3392 object,
3393 dst_offset,/* current offset */
3394 xfer_size, /* remaining size */
3395 NULL,
3396 NULL,
3397 FALSE, /* should_return */
3398 MEMORY_OBJECT_COPY_SYNC,
3399 VM_PROT_NO_CHANGE);
2d21ac55 3400
b0d623f7 3401#if DEVELOPMENT || DEBUG
91447636 3402 upl_cow_again++;
2d21ac55 3403 upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
b0d623f7 3404#endif
91447636 3405 }
2d21ac55
A
3406 /*
3407 * remember the copy object we synced with
3408 */
91447636
A
3409 last_copy_object = object->copy;
3410 }
91447636
A
3411 dst_page = vm_page_lookup(object, dst_offset);
3412
2d21ac55 3413 if (dst_page != VM_PAGE_NULL) {
b0d623f7
A
3414
3415 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3416
3417 if ( !(dst_page->absent && dst_page->list_req_pending) ) {
3418 /*
2d21ac55
A
3419 * skip over pages already present in the cache
3420 */
b0d623f7
A
3421 if (user_page_list)
3422 user_page_list[entry].phys_addr = 0;
2d21ac55 3423
b0d623f7 3424 goto try_next_page;
55e303ae 3425 }
b0d623f7
A
3426 }
3427 if ( !(dst_page->list_req_pending) ) {
3428
2d21ac55
A
3429 if (dst_page->cleaning) {
3430 /*
3431 * someone else is writing to the page... wait...
3432 */
2d21ac55
A
3433 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3434
3435 continue;
3436 }
3437 } else {
3438 if (dst_page->fictitious &&
3439 dst_page->phys_page == vm_page_fictitious_addr) {
3440 assert( !dst_page->speculative);
3441 /*
3442 * dump the fictitious page
3443 */
3444 dst_page->list_req_pending = FALSE;
55e303ae 3445
b0d623f7 3446 VM_PAGE_FREE(dst_page);
2d21ac55
A
3447
3448 dst_page = NULL;
b0d623f7 3449
2d21ac55
A
3450 } else if (dst_page->absent) {
3451 /*
3452 * the default_pager case
3453 */
3454 dst_page->list_req_pending = FALSE;
3455 dst_page->busy = FALSE;
b0d623f7 3456
0b4c1975 3457 } else if (dst_page->pageout || dst_page->cleaning) {
b0d623f7
A
3458 /*
3459 * page was earmarked by vm_pageout_scan
3460 * to be cleaned and stolen... we're going
3461 * to take it back since we are not attempting
3462 * to read that page and we don't want to stall
3463 * waiting for it to be cleaned for 2 reasons...
3464 * 1 - no use paging it out and back in
3465	 * 2 - if we stall, we may cause a deadlock in
3466	 *     the FS trying to acquire its locks
3467 * on the VNOP_PAGEOUT path presuming that
3468 * those locks are already held on the read
3469 * path before trying to create this UPL
3470 *
3471 * so undo all of the state that vm_pageout_scan
3472 * hung on this page
3473 */
3474 dst_page->busy = FALSE;
3475
3476 vm_pageout_queue_steal(dst_page, FALSE);
2d21ac55 3477 }
0b4e3aa0 3478 }
1c79356b 3479 }
2d21ac55
A
3480 if (dst_page == VM_PAGE_NULL) {
3481 if (object->private) {
0b4e3aa0
A
3482 /*
3483 * This is a nasty wrinkle for users
3484 * of upl who encounter device or
3485	 * private memory; however, it is
3486	 * unavoidable: only a fault can
2d21ac55 3487 * resolve the actual backing
0b4e3aa0
A
3488 * physical page by asking the
3489 * backing device.
3490 */
2d21ac55 3491 if (user_page_list)
55e303ae 3492 user_page_list[entry].phys_addr = 0;
2d21ac55 3493
b0d623f7 3494 goto try_next_page;
0b4e3aa0 3495 }
2d21ac55
A
3496 /*
3497 * need to allocate a page
2d21ac55 3498 */
4a3eedf9 3499 dst_page = vm_page_grab();
2d21ac55 3500
1c79356b 3501 if (dst_page == VM_PAGE_NULL) {
2d21ac55
A
3502 if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
3503 /*
3504 * we don't want to stall waiting for pages to come onto the free list
3505 * while we're already holding absent pages in this UPL
3506 * the caller will deal with the empty slots
3507 */
3508 if (user_page_list)
3509 user_page_list[entry].phys_addr = 0;
3510
3511 goto try_next_page;
3512 }
3513 /*
3514 * no pages available... wait
3515 * then try again for the same
3516 * offset...
3517 */
0b4e3aa0
A
3518 vm_object_unlock(object);
3519 VM_PAGE_WAIT();
b0d623f7 3520 vm_object_lock(object);
2d21ac55 3521
0b4e3aa0 3522 continue;
1c79356b 3523 }
b0d623f7 3524 vm_page_insert(dst_page, object, dst_offset);
4a3eedf9 3525
2d21ac55 3526 dst_page->absent = TRUE;
4a3eedf9 3527 dst_page->busy = FALSE;
2d21ac55
A
3528
3529 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
91447636
A
3530 /*
3531 * if UPL_RET_ONLY_ABSENT was specified,
3532	 * then we're definitely setting up an
3533 * upl for a clustered read/pagein
3534 * operation... mark the pages as clustered
2d21ac55
A
3535 * so upl_commit_range can put them on the
3536 * speculative list
91447636
A
3537 */
3538 dst_page->clustered = TRUE;
3539 }
1c79356b 3540 }
b0d623f7
A
3541 if (dst_page->fictitious) {
3542 panic("need corner case for fictitious page");
3543 }
3544 if (dst_page->busy) {
3545 /*
3546 * someone else is playing with the
3547 * page. We will have to wait.
3548 */
3549 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3550
3551 continue;
3552 }
91447636
A
3553 /*
3554 * ENCRYPTED SWAP:
3555 */
3556 if (cntrl_flags & UPL_ENCRYPT) {
3557 /*
3558 * The page is going to be encrypted when we
3559 * get it from the pager, so mark it so.
3560 */
3561 dst_page->encrypted = TRUE;
3562 } else {
3563 /*
3564 * Otherwise, the page will not contain
3565 * encrypted data.
3566 */
3567 dst_page->encrypted = FALSE;
3568 }
1c79356b 3569 dst_page->overwriting = TRUE;
2d21ac55 3570
2d21ac55
A
3571 if (dst_page->pmapped) {
3572 if ( !(cntrl_flags & UPL_FILE_IO))
3573 /*
3574 * eliminate all mappings from the
3575	 * original object and its progeny
55e303ae 3576 */
2d21ac55
A
3577 refmod_state = pmap_disconnect(dst_page->phys_page);
3578 else
3579 refmod_state = pmap_get_refmod(dst_page->phys_page);
3580 } else
3581 refmod_state = 0;
55e303ae 3582
2d21ac55
A
3583 hw_dirty = refmod_state & VM_MEM_MODIFIED;
3584 dirty = hw_dirty ? TRUE : dst_page->dirty;
1c79356b 3585
2d21ac55 3586 if (cntrl_flags & UPL_SET_LITE) {
b0d623f7 3587 unsigned int pg_num;
1c79356b 3588
b0d623f7
A
3589 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
3590 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
2d21ac55 3591 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
91447636 3592
2d21ac55
A
3593 if (hw_dirty)
3594 pmap_clear_modify(dst_page->phys_page);
0b4e3aa0 3595
2d21ac55
A
3596 /*
3597 * Mark original page as cleaning
3598 * in place.
3599 */
3600 dst_page->cleaning = TRUE;
3601 dst_page->precious = FALSE;
3602 } else {
3603 /*
3604 * use pageclean setup, it is more
3605 * convenient even for the pageout
3606 * cases here
3607 */
3608 vm_object_lock(upl->map_object);
3609 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3610 vm_object_unlock(upl->map_object);
0b4e3aa0 3611
2d21ac55
A
3612 alias_page->absent = FALSE;
3613 alias_page = NULL;
3614 }
1c79356b 3615
2d21ac55
A
3616 if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
3617 /*
3618 * clean in place for read implies
3619 * that a write will be done on all
3620 * the pages that are dirty before
3621 * a upl commit is done. The caller
3622 * is obligated to preserve the
3623 * contents of all pages marked dirty
3624 */
3625 upl->flags |= UPL_CLEAR_DIRTY;
3626 }
3627 dst_page->dirty = dirty;
91447636 3628
2d21ac55
A
3629 if (!dirty)
3630 dst_page->precious = TRUE;
3631
b0d623f7 3632 if ( !VM_PAGE_WIRED(dst_page)) {
2d21ac55
A
3633 /*
3634 * deny access to the target page while
3635 * it is being worked on
3636 */
3637 dst_page->busy = TRUE;
3638 } else
b0d623f7 3639 dwp->dw_mask |= DW_vm_page_wire;
2d21ac55 3640
b0d623f7
A
3641 /*
3642 * We might be about to satisfy a fault which has been
3643 * requested. So no need for the "restart" bit.
3644 */
3645 dst_page->restart = FALSE;
3646 if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
2d21ac55
A
3647 /*
3648 * expect the page to be used
3649 */
b0d623f7 3650 dwp->dw_mask |= DW_set_reference;
2d21ac55
A
3651 }
3652 dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? TRUE : FALSE;
3653 }
d41d1dae
A
3654 if (dst_page->busy)
3655 upl->flags |= UPL_HAS_BUSY;
3656
2d21ac55
A
3657 if (dst_page->phys_page > upl->highest_page)
3658 upl->highest_page = dst_page->phys_page;
3659 if (user_page_list) {
3660 user_page_list[entry].phys_addr = dst_page->phys_page;
2d21ac55
A
3661 user_page_list[entry].pageout = dst_page->pageout;
3662 user_page_list[entry].absent = dst_page->absent;
593a1d5f 3663 user_page_list[entry].dirty = dst_page->dirty;
2d21ac55 3664 user_page_list[entry].precious = dst_page->precious;
593a1d5f 3665 user_page_list[entry].device = FALSE;
2d21ac55
A
3666 if (dst_page->clustered == TRUE)
3667 user_page_list[entry].speculative = dst_page->speculative;
3668 else
3669 user_page_list[entry].speculative = FALSE;
593a1d5f
A
3670 user_page_list[entry].cs_validated = dst_page->cs_validated;
3671 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
2d21ac55
A
3672 }
3673 /*
3674 * if UPL_RET_ONLY_ABSENT is set, then
3675 * we are working with a fresh page and we've
3676 * just set the clustered flag on it to
3677	 * indicate that it was dragged in as part of a
3678 * speculative cluster... so leave it alone
3679 */
3680 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3681 /*
3682 * someone is explicitly grabbing this page...
3683 * update clustered and speculative state
3684 *
3685 */
3686 VM_PAGE_CONSUME_CLUSTERED(dst_page);
3687 }
b0d623f7
A
3688try_next_page:
3689 if (dwp->dw_mask) {
3690 if (dwp->dw_mask & DW_vm_page_activate)
3691 VM_STAT_INCR(reactivations);
4a3eedf9 3692
b0d623f7
A
3693 if (dst_page->busy == FALSE) {
3694 /*
3695 * dw_do_work may need to drop the object lock
3696 * if it does, we need the pages it's looking at to
3697 * be held stable via the busy bit.
3698 */
3699 dst_page->busy = TRUE;
3700 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
3701 }
3702 dwp->dw_m = dst_page;
3703 dwp++;
3704 dw_count++;
3705
3706 if (dw_count >= DELAYED_WORK_LIMIT) {
3707 dw_do_work(object, &dw_array[0], dw_count);
3708
3709 dwp = &dw_array[0];
3710 dw_count = 0;
4a3eedf9 3711 }
2d21ac55 3712 }
2d21ac55
A
3713 entry++;
3714 dst_offset += PAGE_SIZE_64;
3715 xfer_size -= PAGE_SIZE;
3716 }
b0d623f7
A
3717 if (dw_count)
3718 dw_do_work(object, &dw_array[0], dw_count);
3719
2d21ac55 3720 if (alias_page != NULL) {
b0d623f7 3721 VM_PAGE_FREE(alias_page);
1c79356b 3722 }
91447636 3723
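	/*
	 * for an INTERNAL UPL the page list is embedded in the UPL itself
	 * (see UPL_GET_INTERNAL_PAGE_LIST), so a count of 0 is reported back
	 * to the caller; otherwise the caller's count is clamped to the
	 * number of entries actually filled in above
	 */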
2d21ac55
A
3724 if (page_list_count != NULL) {
3725 if (upl->flags & UPL_INTERNAL)
3726 *page_list_count = 0;
3727 else if (*page_list_count > entry)
3728 *page_list_count = entry;
3729 }
b0d623f7
A
3730#if UPL_DEBUG
3731 upl->upl_state = 1;
3732#endif
1c79356b 3733 vm_object_unlock(object);
2d21ac55 3734
1c79356b
A
3735 return KERN_SUCCESS;
3736}
3737
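/*
 * Typical lifecycle of a UPL built by vm_object_upl_request(), shown as a
 * minimal sketch only -- the flag values and page-list handling below are
 * placeholders, not taken from a real call site:
 *
 *	upl_t			upl;
 *	upl_page_info_t		*pl = NULL;
 *	unsigned int		count = 0;
 *	boolean_t		empty;
 *
 *	kr = vm_object_upl_request(object, offset, size, &upl, pl, &count, cntrl_flags);
 *	... operate on the pages described by the UPL ...
 *	kr = upl_commit_range(upl, 0, size, commit_flags, pl, count, &empty);
 *	(or upl_abort_range(upl, 0, size, error, &empty) on failure)
 */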
0b4e3aa0 3738/* JMM - Backward compatibility for now */
1c79356b 3739kern_return_t
91447636
A
3740vm_fault_list_request( /* forward */
3741 memory_object_control_t control,
3742 vm_object_offset_t offset,
3743 upl_size_t size,
3744 upl_t *upl_ptr,
3745 upl_page_info_t **user_page_list_ptr,
2d21ac55 3746 unsigned int page_list_count,
91447636
A
3747 int cntrl_flags);
3748kern_return_t
0b4e3aa0
A
3749vm_fault_list_request(
3750 memory_object_control_t control,
1c79356b 3751 vm_object_offset_t offset,
91447636 3752 upl_size_t size,
0b4e3aa0 3753 upl_t *upl_ptr,
1c79356b 3754 upl_page_info_t **user_page_list_ptr,
2d21ac55 3755 unsigned int page_list_count,
1c79356b
A
3756 int cntrl_flags)
3757{
0c530ab8 3758 unsigned int local_list_count;
0b4e3aa0
A
3759 upl_page_info_t *user_page_list;
3760 kern_return_t kr;
3761
b0d623f7
A
3762 if((cntrl_flags & UPL_VECTOR)==UPL_VECTOR)
3763 return KERN_INVALID_ARGUMENT;
3764
0b4e3aa0
A
3765 if (user_page_list_ptr != NULL) {
3766 local_list_count = page_list_count;
3767 user_page_list = *user_page_list_ptr;
3768 } else {
3769 local_list_count = 0;
3770 user_page_list = NULL;
3771 }
3772 kr = memory_object_upl_request(control,
3773 offset,
3774 size,
3775 upl_ptr,
3776 user_page_list,
3777 &local_list_count,
3778 cntrl_flags);
3779
3780 if(kr != KERN_SUCCESS)
3781 return kr;
3782
3783 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3784 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3785 }
3786
3787 return KERN_SUCCESS;
3788}
3789
3790
3791
3792/*
3793 * Routine: vm_object_super_upl_request
3794 * Purpose:
3795 * Cause the population of a portion of a vm_object
3796 * in much the same way as memory_object_upl_request.
3797 * Depending on the nature of the request, the pages
3798 * returned may contain valid data or be uninitialized.
3799 * However, the region may be expanded up to the super
3800 * cluster size provided.
3801 */
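/*
 * Illustrative numbers only (not from a real request): with offset 0x11000,
 * size 0x2000 and super_cluster 0x10000, base_offset rounds down to 0x10000
 * and super_size stays at 0x10000 since 0x13000 does not spill past
 * base_offset + super_cluster; assuming the object is at least that large,
 * the request is widened to [0x10000, 0x20000) before being handed to
 * vm_object_upl_request().
 */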
3802
3803__private_extern__ kern_return_t
3804vm_object_super_upl_request(
3805 vm_object_t object,
3806 vm_object_offset_t offset,
91447636
A
3807 upl_size_t size,
3808 upl_size_t super_cluster,
0b4e3aa0
A
3809 upl_t *upl,
3810 upl_page_info_t *user_page_list,
3811 unsigned int *page_list_count,
3812 int cntrl_flags)
3813{
b0d623f7 3814 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR)==UPL_VECTOR))
1c79356b 3815 return KERN_FAILURE;
0b4e3aa0 3816
55e303ae 3817 assert(object->paging_in_progress);
1c79356b 3818 offset = offset - object->paging_offset;
91447636 3819
91447636 3820 if (super_cluster > size) {
1c79356b
A
3821
3822 vm_object_offset_t base_offset;
91447636 3823 upl_size_t super_size;
b0d623f7 3824 vm_object_size_t super_size_64;
1c79356b 3825
2d21ac55
A
3826 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
3827 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
b0d623f7
A
3828 super_size_64 = ((base_offset + super_size) > object->size) ? (object->size - base_offset) : super_size;
3829 super_size = (upl_size_t) super_size_64;
3830 assert(super_size == super_size_64);
2d21ac55
A
3831
3832 if (offset > (base_offset + super_size)) {
3833 panic("vm_object_super_upl_request: Missed target pageout"
3834 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3835 offset, base_offset, super_size, super_cluster,
3836 size, object->paging_offset);
3837 }
91447636
A
3838 /*
3839 * apparently there is a case where the vm requests a
3840	 * page to be written out whose offset is beyond the
3841 * object size
3842 */
b0d623f7
A
3843 if ((offset + size) > (base_offset + super_size)) {
3844 super_size_64 = (offset + size) - base_offset;
3845 super_size = (upl_size_t) super_size_64;
3846 assert(super_size == super_size_64);
3847 }
1c79356b
A
3848
3849 offset = base_offset;
3850 size = super_size;
3851 }
2d21ac55 3852 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
1c79356b
A
3853}
3854
b0d623f7 3855
91447636
A
3856kern_return_t
3857vm_map_create_upl(
3858 vm_map_t map,
3859 vm_map_address_t offset,
3860 upl_size_t *upl_size,
3861 upl_t *upl,
3862 upl_page_info_array_t page_list,
3863 unsigned int *count,
3864 int *flags)
3865{
3866 vm_map_entry_t entry;
3867 int caller_flags;
3868 int force_data_sync;
3869 int sync_cow_data;
3870 vm_object_t local_object;
3871 vm_map_offset_t local_offset;
3872 vm_map_offset_t local_start;
3873 kern_return_t ret;
3874
3875 caller_flags = *flags;
3876
3877 if (caller_flags & ~UPL_VALID_FLAGS) {
3878 /*
3879 * For forward compatibility's sake,
3880 * reject any unknown flag.
3881 */
3882 return KERN_INVALID_VALUE;
3883 }
91447636
A
3884 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3885 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3886
2d21ac55 3887 if (upl == NULL)
91447636
A
3888 return KERN_INVALID_ARGUMENT;
3889
91447636 3890REDISCOVER_ENTRY:
b0d623f7 3891 vm_map_lock_read(map);
2d21ac55 3892
91447636 3893 if (vm_map_lookup_entry(map, offset, &entry)) {
2d21ac55 3894
b0d623f7
A
3895 if ((entry->vme_end - offset) < *upl_size) {
3896 *upl_size = (upl_size_t) (entry->vme_end - offset);
3897 assert(*upl_size == entry->vme_end - offset);
3898 }
2d21ac55 3899
91447636 3900 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
2d21ac55
A
3901 *flags = 0;
3902
b0d623f7 3903 if ( !entry->is_sub_map && entry->object.vm_object != VM_OBJECT_NULL) {
2d21ac55
A
3904 if (entry->object.vm_object->private)
3905 *flags = UPL_DEV_MEMORY;
3906
3907 if (entry->object.vm_object->phys_contiguous)
91447636 3908 *flags |= UPL_PHYS_CONTIG;
91447636 3909 }
b0d623f7 3910 vm_map_unlock_read(map);
2d21ac55 3911
91447636
A
3912 return KERN_SUCCESS;
3913 }
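		/*
		 * for anything other than physically contiguous memory,
		 * the UPL is limited to MAX_UPL_SIZE pages; larger requests
		 * are trimmed and the caller sees the reduced size through
		 * *upl_size
		 */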
2d21ac55 3914 if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
b0d623f7
A
3915 if ((*upl_size/PAGE_SIZE) > MAX_UPL_SIZE)
3916 *upl_size = MAX_UPL_SIZE * PAGE_SIZE;
2d21ac55 3917 }
91447636
A
3918 /*
3919 * Create an object if necessary.
3920 */
3921 if (entry->object.vm_object == VM_OBJECT_NULL) {
b0d623f7
A
3922
3923 if (vm_map_lock_read_to_write(map))
3924 goto REDISCOVER_ENTRY;
3925
2d21ac55 3926 entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
91447636 3927 entry->offset = 0;
b0d623f7
A
3928
3929 vm_map_lock_write_to_read(map);
91447636
A
3930 }
3931 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3932 if (!(entry->protection & VM_PROT_WRITE)) {
b0d623f7 3933 vm_map_unlock_read(map);
91447636
A
3934 return KERN_PROTECTION_FAILURE;
3935 }
3936 if (entry->needs_copy) {
b0d623f7
A
3937 /*
3938 * Honor copy-on-write for COPY_SYMMETRIC
3939 * strategy.
3940 */
91447636
A
3941 vm_map_t local_map;
3942 vm_object_t object;
91447636
A
3943 vm_object_offset_t new_offset;
3944 vm_prot_t prot;
3945 boolean_t wired;
91447636
A
3946 vm_map_version_t version;
3947 vm_map_t real_map;
3948
3949 local_map = map;
2d21ac55
A
3950
3951 if (vm_map_lookup_locked(&local_map,
3952 offset, VM_PROT_WRITE,
3953 OBJECT_LOCK_EXCLUSIVE,
3954 &version, &object,
3955 &new_offset, &prot, &wired,
3956 NULL,
b0d623f7
A
3957 &real_map) != KERN_SUCCESS) {
3958 vm_map_unlock_read(local_map);
91447636
A
3959 return KERN_FAILURE;
3960 }
2d21ac55 3961 if (real_map != map)
91447636 3962 vm_map_unlock(real_map);
b0d623f7
A
3963 vm_map_unlock_read(local_map);
3964
91447636 3965 vm_object_unlock(object);
91447636
A
3966
3967 goto REDISCOVER_ENTRY;
3968 }
3969 }
3970 if (entry->is_sub_map) {
3971 vm_map_t submap;
3972
3973 submap = entry->object.sub_map;
3974 local_start = entry->vme_start;
3975 local_offset = entry->offset;
2d21ac55 3976
91447636 3977 vm_map_reference(submap);
b0d623f7 3978 vm_map_unlock_read(map);
91447636 3979
2d21ac55
A
3980 ret = vm_map_create_upl(submap,
3981 local_offset + (offset - local_start),
3982 upl_size, upl, page_list, count, flags);
91447636 3983 vm_map_deallocate(submap);
2d21ac55 3984
91447636
A
3985 return ret;
3986 }
91447636 3987 if (sync_cow_data) {
2d21ac55 3988 if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
91447636
A
3989 local_object = entry->object.vm_object;
3990 local_start = entry->vme_start;
3991 local_offset = entry->offset;
2d21ac55 3992
91447636 3993 vm_object_reference(local_object);
b0d623f7 3994 vm_map_unlock_read(map);
91447636 3995
b0d623f7 3996 if (local_object->shadow && local_object->copy) {
2d21ac55
A
3997 vm_object_lock_request(
3998 local_object->shadow,
3999 (vm_object_offset_t)
4000 ((offset - local_start) +
4001 local_offset) +
4002 local_object->shadow_offset,
4003 *upl_size, FALSE,
4004 MEMORY_OBJECT_DATA_SYNC,
4005 VM_PROT_NO_CHANGE);
91447636
A
4006 }
4007 sync_cow_data = FALSE;
4008 vm_object_deallocate(local_object);
2d21ac55 4009
91447636
A
4010 goto REDISCOVER_ENTRY;
4011 }
4012 }
91447636 4013 if (force_data_sync) {
91447636
A
4014 local_object = entry->object.vm_object;
4015 local_start = entry->vme_start;
4016 local_offset = entry->offset;
2d21ac55 4017
91447636 4018 vm_object_reference(local_object);
b0d623f7 4019 vm_map_unlock_read(map);
91447636
A
4020
4021 vm_object_lock_request(
2d21ac55
A
4022 local_object,
4023 (vm_object_offset_t)
4024 ((offset - local_start) + local_offset),
4025 (vm_object_size_t)*upl_size, FALSE,
4026 MEMORY_OBJECT_DATA_SYNC,
4027 VM_PROT_NO_CHANGE);
4028
91447636
A
4029 force_data_sync = FALSE;
4030 vm_object_deallocate(local_object);
2d21ac55 4031
91447636
A
4032 goto REDISCOVER_ENTRY;
4033 }
2d21ac55
A
4034 if (entry->object.vm_object->private)
4035 *flags = UPL_DEV_MEMORY;
4036 else
4037 *flags = 0;
4038
4039 if (entry->object.vm_object->phys_contiguous)
4040 *flags |= UPL_PHYS_CONTIG;
91447636 4041
91447636
A
4042 local_object = entry->object.vm_object;
4043 local_offset = entry->offset;
4044 local_start = entry->vme_start;
2d21ac55 4045
91447636 4046 vm_object_reference(local_object);
b0d623f7 4047 vm_map_unlock_read(map);
2d21ac55
A
4048
4049 ret = vm_object_iopl_request(local_object,
4050 (vm_object_offset_t) ((offset - local_start) + local_offset),
4051 *upl_size,
4052 upl,
4053 page_list,
4054 count,
4055 caller_flags);
91447636 4056 vm_object_deallocate(local_object);
2d21ac55 4057
91447636
A
4058 return(ret);
4059 }
b0d623f7 4060 vm_map_unlock_read(map);
1c79356b 4061
2d21ac55 4062 return(KERN_FAILURE);
91447636
A
4063}
4064
4065/*
4066 * Internal routine to enter a UPL into a VM map.
4067 *
4068 * JMM - This should just be doable through the standard
4069 * vm_map_enter() API.
4070 */
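/*
 * Illustrative pairing with vm_map_remove_upl() below -- a sketch, not a
 * call site from this file; error handling omitted:
 *
 *	vm_map_offset_t	kaddr;
 *
 *	kr = vm_map_enter_upl(kernel_map, upl, &kaddr);
 *	... touch the pages through kaddr ...
 *	kr = vm_map_remove_upl(kernel_map, upl);
 */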
1c79356b 4071kern_return_t
91447636
A
4072vm_map_enter_upl(
4073 vm_map_t map,
4074 upl_t upl,
b0d623f7 4075 vm_map_offset_t *dst_addr)
1c79356b 4076{
91447636 4077 vm_map_size_t size;
1c79356b 4078 vm_object_offset_t offset;
91447636 4079 vm_map_offset_t addr;
1c79356b
A
4080 vm_page_t m;
4081 kern_return_t kr;
b0d623f7
A
4082 int isVectorUPL = 0, curr_upl=0;
4083 upl_t vector_upl = NULL;
4084 vm_offset_t vector_upl_dst_addr = 0;
4085 vm_map_t vector_upl_submap = NULL;
4086 upl_offset_t subupl_offset = 0;
4087 upl_size_t subupl_size = 0;
1c79356b 4088
0b4e3aa0
A
4089 if (upl == UPL_NULL)
4090 return KERN_INVALID_ARGUMENT;
4091
b0d623f7
A
4092 if((isVectorUPL = vector_upl_is_valid(upl))) {
4093 int mapped=0,valid_upls=0;
4094 vector_upl = upl;
4095
4096 upl_lock(vector_upl);
4097 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
4098 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
4099 if(upl == NULL)
4100 continue;
4101 valid_upls++;
4102 if (UPL_PAGE_LIST_MAPPED & upl->flags)
4103 mapped++;
4104 }
4105
4106 if(mapped) {
4107 if(mapped != valid_upls)
4108 panic("Only %d of the %d sub-upls within the Vector UPL are already mapped\n", mapped, valid_upls);
4109 else {
4110 upl_unlock(vector_upl);
4111 return KERN_FAILURE;
4112 }
4113 }
4114
4115 kr = kmem_suballoc(map, &vector_upl_dst_addr, vector_upl->size, FALSE, VM_FLAGS_ANYWHERE, &vector_upl_submap);
4116 if( kr != KERN_SUCCESS )
4117 panic("Vector UPL submap allocation failed\n");
4118 map = vector_upl_submap;
4119 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
4120 curr_upl=0;
4121 }
4122 else
4123 upl_lock(upl);
4124
4125process_upl_to_enter:
4126 if(isVectorUPL){
4127 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
4128 *dst_addr = vector_upl_dst_addr;
4129 upl_unlock(vector_upl);
4130 return KERN_SUCCESS;
4131 }
4132 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
4133 if(upl == NULL)
4134 goto process_upl_to_enter;
4135 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
4136 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
d41d1dae
A
4137 } else {
4138 /*
4139 * check to see if already mapped
4140 */
4141 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
4142 upl_unlock(upl);
4143 return KERN_FAILURE;
4144 }
b0d623f7 4145 }
d41d1dae
A
4146 if ((!(upl->flags & UPL_SHADOWED)) &&
4147 ((upl->flags & UPL_HAS_BUSY) ||
4148 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
0b4e3aa0 4149
55e303ae
A
4150 vm_object_t object;
4151 vm_page_t alias_page;
4152 vm_object_offset_t new_offset;
b0d623f7 4153 unsigned int pg_num;
55e303ae
A
4154 wpl_array_t lite_list;
4155
2d21ac55 4156 if (upl->flags & UPL_INTERNAL) {
55e303ae 4157 lite_list = (wpl_array_t)
91447636 4158 ((((uintptr_t)upl) + sizeof(struct upl))
2d21ac55 4159 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
55e303ae 4160 } else {
2d21ac55 4161 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
55e303ae
A
4162 }
4163 object = upl->map_object;
4164 upl->map_object = vm_object_allocate(upl->size);
2d21ac55 4165
55e303ae 4166 vm_object_lock(upl->map_object);
2d21ac55 4167
55e303ae
A
4168 upl->map_object->shadow = object;
4169 upl->map_object->pageout = TRUE;
4170 upl->map_object->can_persist = FALSE;
2d21ac55
A
4171 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
4172 upl->map_object->shadow_offset = upl->offset - object->paging_offset;
55e303ae 4173 upl->map_object->wimg_bits = object->wimg_bits;
55e303ae
A
4174 offset = upl->map_object->shadow_offset;
4175 new_offset = 0;
4176 size = upl->size;
91447636 4177
2d21ac55 4178 upl->flags |= UPL_SHADOWED;
91447636 4179
2d21ac55 4180 while (size) {
b0d623f7
A
4181 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
4182 assert(pg_num == new_offset / PAGE_SIZE);
55e303ae 4183
2d21ac55 4184 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
55e303ae 4185
2d21ac55 4186 VM_PAGE_GRAB_FICTITIOUS(alias_page);
91447636 4187
2d21ac55 4188 vm_object_lock(object);
91447636 4189
2d21ac55
A
4190 m = vm_page_lookup(object, offset);
4191 if (m == VM_PAGE_NULL) {
4192 panic("vm_upl_map: page missing\n");
4193 }
55e303ae 4194
2d21ac55
A
4195 /*
4196 * Convert the fictitious page to a private
4197 * shadow of the real page.
4198 */
4199 assert(alias_page->fictitious);
4200 alias_page->fictitious = FALSE;
4201 alias_page->private = TRUE;
4202 alias_page->pageout = TRUE;
4203 /*
4204 * since m is a page in the upl it must
4205 * already be wired or BUSY, so it's
4206 * safe to assign the underlying physical
4207 * page to the alias
4208 */
4209 alias_page->phys_page = m->phys_page;
4210
4211 vm_object_unlock(object);
4212
4213 vm_page_lockspin_queues();
4214 vm_page_wire(alias_page);
4215 vm_page_unlock_queues();
4216
4217 /*
4218 * ENCRYPTED SWAP:
4219 * The virtual page ("m") has to be wired in some way
4220 * here or its physical page ("m->phys_page") could
4221 * be recycled at any time.
4222 * Assuming this is enforced by the caller, we can't
4223 * get an encrypted page here. Since the encryption
4224 * key depends on the VM page's "pager" object and
4225 * the "paging_offset", we couldn't handle 2 pageable
4226 * VM pages (with different pagers and paging_offsets)
4227 * sharing the same physical page: we could end up
4228 * encrypting with one key (via one VM page) and
4229 * decrypting with another key (via the alias VM page).
4230 */
4231 ASSERT_PAGE_DECRYPTED(m);
55e303ae 4232
2d21ac55
A
4233 vm_page_insert(alias_page, upl->map_object, new_offset);
4234
4235 assert(!alias_page->wanted);
4236 alias_page->busy = FALSE;
4237 alias_page->absent = FALSE;
4238 }
4239 size -= PAGE_SIZE;
4240 offset += PAGE_SIZE_64;
4241 new_offset += PAGE_SIZE_64;
55e303ae 4242 }
91447636 4243 vm_object_unlock(upl->map_object);
55e303ae 4244 }
d41d1dae 4245 if (upl->flags & UPL_SHADOWED)
55e303ae 4246 offset = 0;
d41d1dae
A
4247 else
4248 offset = upl->offset - upl->map_object->paging_offset;
1c79356b
A
4249 size = upl->size;
4250
2d21ac55 4251 vm_object_reference(upl->map_object);
1c79356b 4252
b0d623f7
A
4253 if(!isVectorUPL) {
4254 *dst_addr = 0;
4255 /*
4256 * NEED A UPL_MAP ALIAS
4257 */
4258 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
4259 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
4260 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
d41d1dae
A
4261
4262 if (kr != KERN_SUCCESS) {
4263 upl_unlock(upl);
4264 return(kr);
4265 }
b0d623f7
A
4266 }
4267 else {
4268 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
4269 VM_FLAGS_FIXED, upl->map_object, offset, FALSE,
4270 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
4271 if(kr)
4272 panic("vm_map_enter failed for a Vector UPL\n");
4273 }
91447636
A
4274 vm_object_lock(upl->map_object);
4275
2d21ac55 4276 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
1c79356b 4277 m = vm_page_lookup(upl->map_object, offset);
2d21ac55
A
4278
4279 if (m) {
4280 unsigned int cache_attr;
4281 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
4282
4283 m->pmapped = TRUE;
b0d623f7
A
4284
4285 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
4286 * but only in kernel space. If this was on a user map,
4287 * we'd have to set the wpmapped bit. */
4288 /* m->wpmapped = TRUE; */
4289 assert(map==kernel_map);
9bccf70c 4290
2d21ac55 4291 PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE);
1c79356b 4292 }
2d21ac55 4293 offset += PAGE_SIZE_64;
1c79356b 4294 }
91447636
A
4295 vm_object_unlock(upl->map_object);
4296
2d21ac55
A
4297 /*
4298 * hold a reference for the mapping
4299 */
4300 upl->ref_count++;
1c79356b 4301 upl->flags |= UPL_PAGE_LIST_MAPPED;
b0d623f7
A
4302 upl->kaddr = (vm_offset_t) *dst_addr;
4303 assert(upl->kaddr == *dst_addr);
4304
d41d1dae 4305 if(isVectorUPL)
b0d623f7 4306 goto process_upl_to_enter;
2d21ac55 4307
d41d1dae
A
4308 upl_unlock(upl);
4309
1c79356b
A
4310 return KERN_SUCCESS;
4311}
4312
91447636
A
4313/*
4314 * Internal routine to remove a UPL mapping from a VM map.
4315 *
4316 * XXX - This should just be doable through a standard
4317 * vm_map_remove() operation. Otherwise, implicit clean-up
4318 * of the target map won't be able to correctly remove
4319 * these (and release the reference on the UPL). Having
4320 * to do this means we can't map these into user-space
4321 * maps yet.
4322 */
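/*
 * Note that vm_map_enter_upl() takes an extra reference on the UPL for the
 * mapping (upl->ref_count++); the matching release happens here when the
 * mapping is torn down.
 */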
1c79356b 4323kern_return_t
91447636 4324vm_map_remove_upl(
1c79356b
A
4325 vm_map_t map,
4326 upl_t upl)
4327{
0b4e3aa0 4328 vm_address_t addr;
91447636 4329 upl_size_t size;
b0d623f7
A
4330 int isVectorUPL = 0, curr_upl = 0;
4331 upl_t vector_upl = NULL;
1c79356b 4332
0b4e3aa0
A
4333 if (upl == UPL_NULL)
4334 return KERN_INVALID_ARGUMENT;
4335
b0d623f7
A
4336 if((isVectorUPL = vector_upl_is_valid(upl))) {
4337 int unmapped=0, valid_upls=0;
4338 vector_upl = upl;
4339 upl_lock(vector_upl);
4340 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
4341 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
4342 if(upl == NULL)
4343 continue;
4344 valid_upls++;
4345 if (!(UPL_PAGE_LIST_MAPPED & upl->flags))
4346 unmapped++;
4347 }
4348
4349 if(unmapped) {
4350 if(unmapped != valid_upls)
4351 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped\n", unmapped, valid_upls);
4352 else {
4353 upl_unlock(vector_upl);
4354 return KERN_FAILURE;
4355 }
4356 }
4357 curr_upl=0;
4358 }
4359 else
4360 upl_lock(upl);
4361
4362process_upl_to_remove:
4363 if(isVectorUPL) {
4364 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
4365 vm_map_t v_upl_submap;
4366 vm_offset_t v_upl_submap_dst_addr;
4367 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
4368
4369 vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS);
4370 vm_map_deallocate(v_upl_submap);
4371 upl_unlock(vector_upl);
4372 return KERN_SUCCESS;
4373 }
4374
4375 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
4376 if(upl == NULL)
4377 goto process_upl_to_remove;
4378 }
2d21ac55
A
4379
4380 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
0b4e3aa0 4381 addr = upl->kaddr;
1c79356b 4382 size = upl->size;
2d21ac55 4383
0b4e3aa0
A
4384 assert(upl->ref_count > 1);
4385 upl->ref_count--; /* removing mapping ref */
2d21ac55 4386
1c79356b
A
4387 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
4388 upl->kaddr = (vm_offset_t) 0;
b0d623f7
A
4389
4390 if(!isVectorUPL) {
4391 upl_unlock(upl);
4392
4393 vm_map_remove(map,
4394 vm_map_trunc_page(addr),
4395 vm_map_round_page(addr + size),
4396 VM_MAP_NO_FLAGS);
4397
4398 return KERN_SUCCESS;
4399 }
4400 else {
4401 /*
4402 * If it's a Vectored UPL, we'll be removing the entire
4403 * submap anyways, so no need to remove individual UPL
4404 * element mappings from within the submap
4405 */
4406 goto process_upl_to_remove;
4407 }
1c79356b 4408 }
0b4e3aa0 4409 upl_unlock(upl);
2d21ac55 4410
0b4e3aa0 4411 return KERN_FAILURE;
1c79356b
A
4412}
4413
b0d623f7
A
4414static void
4415dw_do_work(
4416 vm_object_t object,
4417 struct dw *dwp,
4418 int dw_count)
4419{
4420 int j;
4421 boolean_t held_as_spin = TRUE;
4422
4423 /*
4424 * pageout_scan takes the vm_page_lock_queues first
4425 * then tries for the object lock... to avoid what
4426 * is effectively a lock inversion, we'll go to the
4427 * trouble of taking them in that same order... otherwise
4428 * if this object contains the majority of the pages resident
4429 * in the UBC (or a small set of large objects actively being
4430 * worked on contain the majority of the pages), we could
4431 * cause the pageout_scan thread to 'starve' in its attempt
4432 * to find pages to move to the free queue, since it has to
4433 * successfully acquire the object lock of any candidate page
4434 * before it can steal/clean it.
4435 */
4436 if (!vm_page_trylockspin_queues()) {
4437 vm_object_unlock(object);
4438
4439 vm_page_lockspin_queues();
4440
4441 for (j = 0; ; j++) {
4442 if (!vm_object_lock_avoid(object) &&
4443 _vm_object_lock_try(object))
4444 break;
4445 vm_page_unlock_queues();
4446 mutex_pause(j);
4447 vm_page_lockspin_queues();
4448 }
4449 }
4450 for (j = 0; j < dw_count; j++, dwp++) {
4451
4452 if (dwp->dw_mask & DW_vm_pageout_throttle_up)
4453 vm_pageout_throttle_up(dwp->dw_m);
4454
4455 if (dwp->dw_mask & DW_vm_page_wire)
4456 vm_page_wire(dwp->dw_m);
0b4c1975
A
4457 else if (dwp->dw_mask & DW_vm_page_unwire) {
4458 boolean_t queueit;
b0d623f7 4459
0b4c1975
A
4460 queueit = (dwp->dw_mask & DW_vm_page_free) ? FALSE : TRUE;
4461
4462 vm_page_unwire(dwp->dw_m, queueit);
4463 }
b0d623f7
A
4464 if (dwp->dw_mask & DW_vm_page_free) {
4465 if (held_as_spin == TRUE) {
4466 vm_page_lockconvert_queues();
4467 held_as_spin = FALSE;
4468 }
4469 vm_page_free(dwp->dw_m);
4470 } else {
4471 if (dwp->dw_mask & DW_vm_page_deactivate_internal)
4472 vm_page_deactivate_internal(dwp->dw_m, FALSE);
4473 else if (dwp->dw_mask & DW_vm_page_activate)
4474 vm_page_activate(dwp->dw_m);
4475 else if (dwp->dw_mask & DW_vm_page_speculate)
4476 vm_page_speculate(dwp->dw_m, TRUE);
4477 else if (dwp->dw_mask & DW_vm_page_lru)
4478 vm_page_lru(dwp->dw_m);
4479
4480 if (dwp->dw_mask & DW_set_reference)
4481 dwp->dw_m->reference = TRUE;
4482 else if (dwp->dw_mask & DW_clear_reference)
4483 dwp->dw_m->reference = FALSE;
4484
4485 if (dwp->dw_mask & DW_clear_busy)
4486 dwp->dw_m->busy = FALSE;
4487
4488 if (dwp->dw_mask & DW_PAGE_WAKEUP)
4489 PAGE_WAKEUP(dwp->dw_m);
4490 }
4491 }
4492 vm_page_unlock_queues();
4493}
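/*
 * Callers batch per-page work into a local dw_array[] and flush it through
 * dw_do_work() once DELAYED_WORK_LIMIT entries accumulate, following the
 * pattern used in upl_commit_range() below:
 *
 *	dwp->dw_m = m;
 *	dwp++;
 *	dw_count++;
 *
 *	if (dw_count >= DELAYED_WORK_LIMIT) {
 *		dw_do_work(shadow_object, &dw_array[0], dw_count);
 *		dwp = &dw_array[0];
 *		dw_count = 0;
 *	}
 */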
4494
4495
4496
1c79356b 4497kern_return_t
0b4e3aa0 4498upl_commit_range(
1c79356b 4499 upl_t upl,
91447636
A
4500 upl_offset_t offset,
4501 upl_size_t size,
1c79356b 4502 int flags,
0b4e3aa0
A
4503 upl_page_info_t *page_list,
4504 mach_msg_type_number_t count,
4505 boolean_t *empty)
1c79356b 4506{
b0d623f7 4507 upl_size_t xfer_size, subupl_size = size;
55e303ae 4508 vm_object_t shadow_object;
2d21ac55 4509 vm_object_t object;
1c79356b 4510 vm_object_offset_t target_offset;
b0d623f7 4511 upl_offset_t subupl_offset = offset;
1c79356b 4512 int entry;
55e303ae
A
4513 wpl_array_t lite_list;
4514 int occupied;
91447636 4515 int clear_refmod = 0;
2d21ac55 4516 int pgpgout_count = 0;
b0d623f7
A
4517 struct dw dw_array[DELAYED_WORK_LIMIT];
4518 struct dw *dwp;
4519 int dw_count, isVectorUPL = 0;
4520 upl_t vector_upl = NULL;
1c79356b 4521
0b4e3aa0
A
4522 *empty = FALSE;
4523
4524 if (upl == UPL_NULL)
4525 return KERN_INVALID_ARGUMENT;
4526
4527 if (count == 0)
4528 page_list = NULL;
4529
b0d623f7
A
4530 if((isVectorUPL = vector_upl_is_valid(upl))) {
4531 vector_upl = upl;
4532 upl_lock(vector_upl);
4533 }
4534 else
4535 upl_lock(upl);
4536
4537process_upl_to_commit:
4538
4539 if(isVectorUPL) {
4540 size = subupl_size;
4541 offset = subupl_offset;
4542 if(size == 0) {
4543 upl_unlock(vector_upl);
4544 return KERN_SUCCESS;
4545 }
4546 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
4547 if(upl == NULL) {
4548 upl_unlock(vector_upl);
4549 return KERN_FAILURE;
4550 }
4551 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
4552 subupl_size -= size;
4553 subupl_offset += size;
4554 }
4555
4556#if UPL_DEBUG
4557 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
4558 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
4559
4560 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
4561 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
4562
4563 upl->upl_commit_index++;
4564 }
4565#endif
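	/*
	 * with UPL_DEBUG, the first UPL_DEBUG_COMMIT_RECORDS commits against
	 * this UPL record their range and a short backtrace so a stale or
	 * duplicate commit can be traced after the fact
	 */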
2d21ac55
A
4566 if (upl->flags & UPL_DEVICE_MEMORY)
4567 xfer_size = 0;
4568 else if ((offset + size) <= upl->size)
4569 xfer_size = size;
b0d623f7
A
4570 else {
4571 if(!isVectorUPL)
4572 upl_unlock(upl);
4573 else {
4574 upl_unlock(vector_upl);
4575 }
2d21ac55 4576 return KERN_FAILURE;
91447636 4577 }
55e303ae
A
4578 if (upl->flags & UPL_CLEAR_DIRTY)
4579 flags |= UPL_COMMIT_CLEAR_DIRTY;
4580
2d21ac55
A
4581 if (upl->flags & UPL_INTERNAL)
4582 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
4583 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4584 else
4585 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
1c79356b 4586
2d21ac55
A
4587 object = upl->map_object;
4588
4589 if (upl->flags & UPL_SHADOWED) {
4590 vm_object_lock(object);
4591 shadow_object = object->shadow;
55e303ae 4592 } else {
2d21ac55 4593 shadow_object = object;
55e303ae 4594 }
1c79356b
A
4595 entry = offset/PAGE_SIZE;
4596 target_offset = (vm_object_offset_t)offset;
55e303ae 4597
b0d623f7
A
4598 if (upl->flags & UPL_KERNEL_OBJECT)
4599 vm_object_lock_shared(shadow_object);
4600 else
4601 vm_object_lock(shadow_object);
4a3eedf9 4602
b0d623f7
A
4603 if (upl->flags & UPL_ACCESS_BLOCKED) {
4604 assert(shadow_object->blocked_access);
4605 shadow_object->blocked_access = FALSE;
4606 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
4a3eedf9 4607 }
4a3eedf9 4608
593a1d5f
A
4609 if (shadow_object->code_signed) {
4610 /*
4611 * CODE SIGNING:
4612 * If the object is code-signed, do not let this UPL tell
4613 * us if the pages are valid or not. Let the pages be
4614 * validated by VM the normal way (when they get mapped or
4615 * copied).
4616 */
4617 flags &= ~UPL_COMMIT_CS_VALIDATED;
4618 }
4619 if (! page_list) {
4620 /*
4621 * No page list to get the code-signing info from !?
4622 */
4623 flags &= ~UPL_COMMIT_CS_VALIDATED;
4624 }
4625
b0d623f7
A
4626 dwp = &dw_array[0];
4627 dw_count = 0;
4628
91447636 4629 while (xfer_size) {
2d21ac55
A
4630 vm_page_t t, m;
4631
b0d623f7
A
4632 dwp->dw_mask = 0;
4633 clear_refmod = 0;
4634
55e303ae 4635 m = VM_PAGE_NULL;
d7e50217 4636
55e303ae 4637 if (upl->flags & UPL_LITE) {
b0d623f7 4638 unsigned int pg_num;
55e303ae 4639
b0d623f7
A
4640 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
4641 assert(pg_num == target_offset/PAGE_SIZE);
55e303ae
A
4642
4643 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4644 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
2d21ac55 4645
b0d623f7
A
4646 if (!(upl->flags & UPL_KERNEL_OBJECT))
4647 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
55e303ae
A
4648 }
4649 }
2d21ac55
A
4650 if (upl->flags & UPL_SHADOWED) {
4651 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4652
55e303ae
A
4653 t->pageout = FALSE;
4654
b0d623f7 4655 VM_PAGE_FREE(t);
55e303ae 4656
2d21ac55
A
4657 if (m == VM_PAGE_NULL)
4658 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
55e303ae
A
4659 }
4660 }
b0d623f7 4661 if ((upl->flags & UPL_KERNEL_OBJECT) || m == VM_PAGE_NULL)
593a1d5f 4662 goto commit_next_page;
55e303ae 4663
593a1d5f
A
4664 if (flags & UPL_COMMIT_CS_VALIDATED) {
4665 /*
4666 * CODE SIGNING:
4667 * Set the code signing bits according to
4668 * what the UPL says they should be.
4669 */
4670 m->cs_validated = page_list[entry].cs_validated;
4671 m->cs_tainted = page_list[entry].cs_tainted;
4672 }
4673 if (upl->flags & UPL_IO_WIRE) {
55e303ae 4674
593a1d5f
A
4675 if (page_list)
4676 page_list[entry].phys_addr = 0;
2d21ac55 4677
593a1d5f
A
4678 if (flags & UPL_COMMIT_SET_DIRTY)
4679 m->dirty = TRUE;
4680 else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4681 m->dirty = FALSE;
b0d623f7 4682
593a1d5f
A
4683 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4684 m->cs_validated && !m->cs_tainted) {
4a3eedf9
A
4685 /*
4686 * CODE SIGNING:
4687 * This page is no longer dirty
4688 * but could have been modified,
4689 * so it will need to be
4690 * re-validated.
4691 */
4692 m->cs_validated = FALSE;
b0d623f7 4693#if DEVELOPMENT || DEBUG
4a3eedf9 4694 vm_cs_validated_resets++;
b0d623f7
A
4695#endif
4696 pmap_disconnect(m->phys_page);
4a3eedf9 4697 }
91447636 4698 clear_refmod |= VM_MEM_MODIFIED;
55e303ae 4699 }
b0d623f7
A
4700 if (flags & UPL_COMMIT_INACTIVATE) {
4701 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4702 clear_refmod |= VM_MEM_REFERENCED;
4703 }
4704 if (upl->flags & UPL_ACCESS_BLOCKED) {
593a1d5f
A
4705 /*
4706 * We blocked access to the pages in this UPL.
4707 * Clear the "busy" bit and wake up any waiter
4708 * for this page.
4709 */
b0d623f7 4710 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
593a1d5f 4711 }
0b4c1975
A
4712 if (m->absent) {
4713 if (flags & UPL_COMMIT_FREE_ABSENT)
4714 dwp->dw_mask |= DW_vm_page_free;
d41d1dae 4715 else {
0b4c1975 4716 m->absent = FALSE;
d41d1dae
A
4717 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
4718 }
4719 } else
4720 dwp->dw_mask |= DW_vm_page_unwire;
4721
593a1d5f
A
4722 goto commit_next_page;
4723 }
4724 /*
4725 * make sure to clear the hardware
4726 * modify or reference bits before
4727 * releasing the BUSY bit on this page
4728 * otherwise we risk losing a legitimate
4729 * change of state
4730 */
4731 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4732 m->dirty = FALSE;
2d21ac55 4733
593a1d5f
A
4734 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4735 m->cs_validated && !m->cs_tainted) {
4736 /*
4737 * CODE SIGNING:
4738 * This page is no longer dirty
4739 * but could have been modified,
4740 * so it will need to be
4741 * re-validated.
4742 */
4743 m->cs_validated = FALSE;
4744#if DEVELOPMENT || DEBUG
4745 vm_cs_validated_resets++;
4746#endif
b0d623f7 4747 pmap_disconnect(m->phys_page);
55e303ae 4748 }
593a1d5f
A
4749 clear_refmod |= VM_MEM_MODIFIED;
4750 }
593a1d5f
A
4751 if (page_list) {
4752 upl_page_info_t *p;
2d21ac55 4753
593a1d5f 4754 p = &(page_list[entry]);
b0d623f7 4755
593a1d5f
A
4756 if (p->phys_addr && p->pageout && !m->pageout) {
4757 m->busy = TRUE;
4758 m->pageout = TRUE;
b0d623f7
A
4759
4760 dwp->dw_mask |= DW_vm_page_wire;
4761
593a1d5f
A
4762 } else if (p->phys_addr &&
4763 !p->pageout && m->pageout &&
4764 !m->dump_cleaning) {
2d21ac55 4765 m->pageout = FALSE;
593a1d5f
A
4766 m->absent = FALSE;
4767 m->overwriting = FALSE;
b0d623f7
A
4768
4769 dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy | DW_PAGE_WAKEUP);
593a1d5f
A
4770 }
4771 page_list[entry].phys_addr = 0;
4772 }
4773 m->dump_cleaning = FALSE;
2d21ac55 4774
593a1d5f 4775 if (m->laundry)
b0d623f7 4776 dwp->dw_mask |= DW_vm_pageout_throttle_up;
91447636 4777
593a1d5f
A
4778 if (m->pageout) {
4779 m->cleaning = FALSE;
4780 m->encrypted_cleaning = FALSE;
4781 m->pageout = FALSE;
1c79356b 4782#if MACH_CLUSTER_STATS
593a1d5f 4783 if (m->wanted) vm_pageout_target_collisions++;
1c79356b 4784#endif
2d21ac55 4785 m->dirty = FALSE;
b0d623f7 4786
593a1d5f
A
4787 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4788 m->cs_validated && !m->cs_tainted) {
4a3eedf9
A
4789 /*
4790 * CODE SIGNING:
4791 * This page is no longer dirty
4792 * but could have been modified,
4793 * so it will need to be
4794 * re-validated.
4795 */
4796 m->cs_validated = FALSE;
593a1d5f 4797#if DEVELOPMENT || DEBUG
4a3eedf9 4798 vm_cs_validated_resets++;
593a1d5f 4799#endif
b0d623f7 4800 pmap_disconnect(m->phys_page);
4a3eedf9 4801 }
b0d623f7
A
4802
4803 if ((flags & UPL_COMMIT_SET_DIRTY) ||
4804 (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)))
593a1d5f 4805 m->dirty = TRUE;
b0d623f7 4806
593a1d5f
A
4807 if (m->dirty) {
4808 /*
4809 * page was re-dirtied after we started
4810 * the pageout... reactivate it since
4811 * we don't know whether the on-disk
4812 * copy matches what is now in memory
2d21ac55 4813 */
b0d623f7
A
4814 dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy | DW_PAGE_WAKEUP);
4815
593a1d5f
A
4816 if (upl->flags & UPL_PAGEOUT) {
4817 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4818 VM_STAT_INCR(reactivations);
4819 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
4820 }
593a1d5f
A
4821 } else {
4822 /*
4823 * page has been successfully cleaned
4824 * go ahead and free it for other use
2d21ac55 4825 */
b0d623f7 4826
593a1d5f
A
4827 if (m->object->internal) {
4828 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
4829 } else {
4830 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
4831 }
b0d623f7
A
4832 dwp->dw_mask |= DW_vm_page_free;
4833
593a1d5f
A
4834 if (upl->flags & UPL_PAGEOUT) {
4835 CLUSTER_STAT(vm_pageout_target_page_freed++;)
b0d623f7 4836
593a1d5f
A
4837 if (page_list[entry].dirty) {
4838 VM_STAT_INCR(pageouts);
4839 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
4840 pgpgout_count++;
4841 }
4842 }
de355530 4843 }
593a1d5f
A
4844 goto commit_next_page;
4845 }
4846#if MACH_CLUSTER_STATS
4847 if (m->wpmapped)
4848 m->dirty = pmap_is_modified(m->phys_page);
4849
4850 if (m->dirty) vm_pageout_cluster_dirtied++;
4851 else vm_pageout_cluster_cleaned++;
4852 if (m->wanted) vm_pageout_cluster_collisions++;
4853#endif
4854 m->dirty = FALSE;
91447636 4855
593a1d5f
A
4856 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4857 m->cs_validated && !m->cs_tainted) {
2d21ac55 4858 /*
593a1d5f
A
4859 * CODE SIGNING:
4860 * This page is no longer dirty
4861 * but could have been modified,
4862 * so it will need to be
4863 * re-validated.
2d21ac55 4864 */
593a1d5f
A
4865 m->cs_validated = FALSE;
4866#if DEVELOPMENT || DEBUG
4867 vm_cs_validated_resets++;
4868#endif
b0d623f7 4869 pmap_disconnect(m->phys_page);
593a1d5f 4870 }
55e303ae 4871
593a1d5f
A
4872 if ((m->busy) && (m->cleaning)) {
4873 /*
4874 * the request_page_list case
4875 */
4876 m->absent = FALSE;
4877 m->overwriting = FALSE;
b0d623f7
A
4878
4879 dwp->dw_mask |= DW_clear_busy;
4880
593a1d5f
A
4881 } else if (m->overwriting) {
4882 /*
4883 * alternate request page list, write to
4884 * page_list case. Occurs when the original
4885 * page was wired at the time of the list
4886 * request
4887 */
b0d623f7 4888 assert(VM_PAGE_WIRED(m));
593a1d5f 4889 m->overwriting = FALSE;
b0d623f7
A
4890
4891 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
593a1d5f
A
4892 }
4893 m->cleaning = FALSE;
4894 m->encrypted_cleaning = FALSE;
b0d623f7 4895
593a1d5f
A
4896 /*
4897	 * It is part of the semantics of COPYOUT_FROM
4898	 * UPLs that a commit implies cache sync
4899	 * between the vm page and the backing store;
4900 * this can be used to strip the precious bit
4901 * as well as clean
4902 */
b0d623f7 4903 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS))
593a1d5f 4904 m->precious = FALSE;
b0d623f7 4905
593a1d5f
A
4906 if (flags & UPL_COMMIT_SET_DIRTY)
4907 m->dirty = TRUE;
b0d623f7 4908
593a1d5f 4909 if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
b0d623f7
A
4910 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4911 clear_refmod |= VM_MEM_REFERENCED;
4912
593a1d5f 4913 } else if (!m->active && !m->inactive && !m->speculative) {
b0d623f7
A
4914
4915 if (m->clustered || (flags & UPL_COMMIT_SPECULATE))
4916 dwp->dw_mask |= DW_vm_page_speculate;
593a1d5f 4917 else if (m->reference)
b0d623f7
A
4918 dwp->dw_mask |= DW_vm_page_activate;
4919 else {
4920 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4921 clear_refmod |= VM_MEM_REFERENCED;
4922 }
593a1d5f 4923 }
b0d623f7 4924 if (upl->flags & UPL_ACCESS_BLOCKED) {
2d21ac55 4925 /*
593a1d5f
A
4926	 * We blocked access to the pages in this UPL.
4927 * Clear the "busy" bit on this page before we
4928 * wake up any waiter.
2d21ac55 4929 */
b0d623f7 4930 dwp->dw_mask |= DW_clear_busy;
1c79356b 4931 }
593a1d5f
A
4932 /*
4933 * Wakeup any thread waiting for the page to be un-cleaning.
4934 */
b0d623f7 4935 dwp->dw_mask |= DW_PAGE_WAKEUP;
593a1d5f 4936
2d21ac55 4937commit_next_page:
b0d623f7
A
4938 if (clear_refmod)
4939 pmap_clear_refmod(m->phys_page, clear_refmod);
4940
1c79356b
A
4941 target_offset += PAGE_SIZE_64;
4942 xfer_size -= PAGE_SIZE;
4943 entry++;
2d21ac55 4944
b0d623f7
A
4945 if (dwp->dw_mask) {
4946 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
4947 if (m->busy == FALSE) {
4948 /*
4949 * dw_do_work may need to drop the object lock
4950 * if it does, we need the pages it's looking at to
4951 * be held stable via the busy bit.
4952 */
4953 m->busy = TRUE;
4954 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
4955 }
4956 dwp->dw_m = m;
4957 dwp++;
4958 dw_count++;
4a3eedf9 4959
b0d623f7
A
4960 if (dw_count >= DELAYED_WORK_LIMIT) {
4961 dw_do_work(shadow_object, &dw_array[0], dw_count);
4962
4963 dwp = &dw_array[0];
4964 dw_count = 0;
4965 }
4966 } else {
4967 if (dwp->dw_mask & DW_clear_busy)
4968 m->busy = FALSE;
4969
4970 if (dwp->dw_mask & DW_PAGE_WAKEUP)
4971 PAGE_WAKEUP(m);
4a3eedf9 4972 }
2d21ac55 4973 }
1c79356b 4974 }
b0d623f7
A
4975 if (dw_count)
4976 dw_do_work(shadow_object, &dw_array[0], dw_count);
55e303ae
A
4977
4978 occupied = 1;
4979
4980 if (upl->flags & UPL_DEVICE_MEMORY) {
4981 occupied = 0;
4982 } else if (upl->flags & UPL_LITE) {
4983 int pg_num;
4984 int i;
2d21ac55 4985
55e303ae
A
4986 pg_num = upl->size/PAGE_SIZE;
4987 pg_num = (pg_num + 31) >> 5;
4988 occupied = 0;
2d21ac55
A
4989
4990 for (i = 0; i < pg_num; i++) {
4991 if (lite_list[i] != 0) {
55e303ae
A
4992 occupied = 1;
4993 break;
4994 }
4995 }
4996 } else {
2d21ac55 4997 if (queue_empty(&upl->map_object->memq))
55e303ae 4998 occupied = 0;
55e303ae 4999 }
2d21ac55 5000 if (occupied == 0) {
b0d623f7
A
5001 /*
5002 * If this UPL element belongs to a Vector UPL and is
5003 * empty, then this is the right function to deallocate
5004	 * it. So go ahead and set the *empty variable. The flag
5005 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
5006 * should be considered relevant for the Vector UPL and not
5007 * the internal UPLs.
5008 */
5009 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
0b4e3aa0 5010 *empty = TRUE;
2d21ac55 5011
b0d623f7 5012 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
2d21ac55
A
5013 /*
5014 * this is not a paging object
5015 * so we need to drop the paging reference
5016 * that was taken when we created the UPL
5017 * against this object
5018 */
b0d623f7 5019 vm_object_activity_end(shadow_object);
2d21ac55
A
5020 } else {
5021 /*
5022	 * we donated the paging reference to
5023 * the map object... vm_pageout_object_terminate
5024 * will drop this reference
5025 */
5026 }
1c79356b 5027 }
55e303ae 5028 vm_object_unlock(shadow_object);
91447636
A
5029 if (object != shadow_object)
5030 vm_object_unlock(object);
b0d623f7
A
5031
5032 if(!isVectorUPL)
5033 upl_unlock(upl);
5034 else {
5035 /*
5036 * If we completed our operations on an UPL that is
5037 * part of a Vectored UPL and if empty is TRUE, then
5038 * we should go ahead and deallocate this UPL element.
5039 * Then we check if this was the last of the UPL elements
5040 * within that Vectored UPL. If so, set empty to TRUE
5041 * so that in ubc_upl_commit_range or ubc_upl_commit, we
5042 * can go ahead and deallocate the Vector UPL too.
5043 */
5044 if(*empty==TRUE) {
5045 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
5046 upl_deallocate(upl);
5047 }
5048 goto process_upl_to_commit;
5049 }
0b4e3aa0 5050
2d21ac55
A
5051 if (pgpgout_count) {
5052 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
5053 }
5054
1c79356b
A
5055 return KERN_SUCCESS;
5056}
5057
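/*
 * A rough sketch of how a caller such as ubc_upl_commit_range() consumes the
 * *empty result -- an assumption about the caller, not code from this file:
 *
 *	boolean_t empty = FALSE;
 *
 *	kr = upl_commit_range(upl, offset, size, flags, pl, count, &empty);
 *	if (kr == KERN_SUCCESS && empty)
 *		upl_deallocate(upl);
 */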
0b4e3aa0
A
5058kern_return_t
5059upl_abort_range(
1c79356b 5060 upl_t upl,
91447636
A
5061 upl_offset_t offset,
5062 upl_size_t size,
0b4e3aa0
A
5063 int error,
5064 boolean_t *empty)
1c79356b 5065{
b0d623f7 5066 upl_size_t xfer_size, subupl_size = size;
55e303ae 5067 vm_object_t shadow_object;
2d21ac55 5068 vm_object_t object;
1c79356b 5069 vm_object_offset_t target_offset;
b0d623f7 5070 upl_offset_t subupl_offset = offset;
1c79356b 5071 int entry;
55e303ae
A
5072 wpl_array_t lite_list;
5073 int occupied;
b0d623f7
A
5074 struct dw dw_array[DELAYED_WORK_LIMIT];
5075 struct dw *dwp;
5076 int dw_count, isVectorUPL = 0;
5077 upl_t vector_upl = NULL;
1c79356b 5078
0b4e3aa0
A
5079 *empty = FALSE;
5080
5081 if (upl == UPL_NULL)
5082 return KERN_INVALID_ARGUMENT;
5083
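	/*
	 * aborting an IO_WIRE UPL without dumping its pages is handled as a
	 * commit that frees any absent pages
	 */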
2d21ac55 5084 if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
0b4c1975 5085 return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
55e303ae 5086
b0d623f7
A
5087 if((isVectorUPL = vector_upl_is_valid(upl))) {
5088 vector_upl = upl;
5089 upl_lock(vector_upl);
5090 }
5091 else
5092 upl_lock(upl);
5093
5094process_upl_to_abort:
5095 if(isVectorUPL) {
5096 size = subupl_size;
5097 offset = subupl_offset;
5098 if(size == 0) {
5099 upl_unlock(vector_upl);
5100 return KERN_SUCCESS;
5101 }
5102 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
5103 if(upl == NULL) {
5104 upl_unlock(vector_upl);
5105 return KERN_FAILURE;
5106 }
5107 subupl_size -= size;
5108 subupl_offset += size;
5109 }
5110
5111 *empty = FALSE;
5112
5113#if UPL_DEBUG
5114 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
5115 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
5116
5117 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
5118 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
5119 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
5120
5121 upl->upl_commit_index++;
5122 }
5123#endif
2d21ac55 5124 if (upl->flags & UPL_DEVICE_MEMORY)
1c79356b 5125 xfer_size = 0;
2d21ac55
A
5126 else if ((offset + size) <= upl->size)
5127 xfer_size = size;
b0d623f7
A
5128 else {
5129 if(!isVectorUPL)
5130 upl_unlock(upl);
5131 else {
5132 upl_unlock(vector_upl);
5133 }
55e303ae 5134
b0d623f7
A
5135 return KERN_FAILURE;
5136 }
2d21ac55 5137 if (upl->flags & UPL_INTERNAL) {
55e303ae 5138 lite_list = (wpl_array_t)
91447636 5139 ((((uintptr_t)upl) + sizeof(struct upl))
55e303ae
A
5140 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
5141 } else {
5142 lite_list = (wpl_array_t)
91447636 5143 (((uintptr_t)upl) + sizeof(struct upl));
55e303ae 5144 }
2d21ac55
A
5145 object = upl->map_object;
5146
5147 if (upl->flags & UPL_SHADOWED) {
5148 vm_object_lock(object);
5149 shadow_object = object->shadow;
5150 } else
5151 shadow_object = object;
5152
1c79356b
A
5153 entry = offset/PAGE_SIZE;
5154 target_offset = (vm_object_offset_t)offset;
2d21ac55 5155
b0d623f7
A
5156 if (upl->flags & UPL_KERNEL_OBJECT)
5157 vm_object_lock_shared(shadow_object);
5158 else
5159 vm_object_lock(shadow_object);
4a3eedf9 5160
b0d623f7
A
5161 if (upl->flags & UPL_ACCESS_BLOCKED) {
5162 assert(shadow_object->blocked_access);
5163 shadow_object->blocked_access = FALSE;
5164 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
4a3eedf9 5165 }
b0d623f7
A
5166
5167 dwp = &dw_array[0];
5168 dw_count = 0;
5169
5170 if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT))
5171 panic("upl_abort_range: kernel_object being DUMPED");
4a3eedf9 5172
2d21ac55
A
5173 while (xfer_size) {
5174 vm_page_t t, m;
5175
b0d623f7
A
5176 dwp->dw_mask = 0;
5177
55e303ae 5178 m = VM_PAGE_NULL;
2d21ac55
A
5179
5180 if (upl->flags & UPL_LITE) {
b0d623f7
A
5181 unsigned int pg_num;
5182
5183 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
5184 assert(pg_num == target_offset/PAGE_SIZE);
5185
2d21ac55
A
5186
5187 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
55e303ae 5188 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
2d21ac55 5189
b0d623f7
A
5190 if ( !(upl->flags & UPL_KERNEL_OBJECT))
5191 m = vm_page_lookup(shadow_object, target_offset +
5192 (upl->offset - shadow_object->paging_offset));
55e303ae
A
5193 }
5194 }
2d21ac55
A
5195 if (upl->flags & UPL_SHADOWED) {
5196 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
5197 t->pageout = FALSE;
5198
b0d623f7 5199 VM_PAGE_FREE(t);
2d21ac55
A
5200
5201 if (m == VM_PAGE_NULL)
5202 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
55e303ae
A
5203 }
5204 }
b0d623f7
A
5205 if ((upl->flags & UPL_KERNEL_OBJECT))
5206 goto abort_next_page;
5207
2d21ac55
A
5208 if (m != VM_PAGE_NULL) {
5209
5210 if (m->absent) {
91447636
A
5211 boolean_t must_free = TRUE;
5212
2d21ac55
A
5213 m->clustered = FALSE;
5214 /*
5215 * COPYOUT = FALSE case
5216 * check for error conditions which must
5217 * be passed back to the page's customer
5218 */
5219 if (error & UPL_ABORT_RESTART) {
1c79356b
A
5220 m->restart = TRUE;
5221 m->absent = FALSE;
2d21ac55 5222 m->unusual = TRUE;
91447636 5223 must_free = FALSE;
2d21ac55 5224 } else if (error & UPL_ABORT_UNAVAILABLE) {
1c79356b
A
5225 m->restart = FALSE;
5226 m->unusual = TRUE;
91447636 5227 must_free = FALSE;
2d21ac55 5228 } else if (error & UPL_ABORT_ERROR) {
1c79356b
A
5229 m->restart = FALSE;
5230 m->absent = FALSE;
1c79356b 5231 m->error = TRUE;
2d21ac55 5232 m->unusual = TRUE;
91447636 5233 must_free = FALSE;
1c79356b 5234 }
91447636
A
5235
5236 /*
5237 * ENCRYPTED SWAP:
5238 * If the page was already encrypted,
5239 * we don't really need to decrypt it
5240 * now. It will get decrypted later,
5241 * on demand, as soon as someone needs
5242 * to access its contents.
5243 */
1c79356b
A
5244
5245 m->cleaning = FALSE;
2d21ac55 5246 m->encrypted_cleaning = FALSE;
1c79356b 5247 m->overwriting = FALSE;
b0d623f7
A
5248
5249 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
91447636 5250
2d21ac55 5251 if (must_free == TRUE)
b0d623f7 5252 dwp->dw_mask |= DW_vm_page_free;
2d21ac55 5253 else
b0d623f7 5254 dwp->dw_mask |= DW_vm_page_activate;
2d21ac55
A
5255 } else {
5256 /*
5257 * Handle the trusted pager throttle.
5258 */
5259 if (m->laundry)
b0d623f7 5260 dwp->dw_mask |= DW_vm_pageout_throttle_up;
2d21ac55
A
5261
5262 if (m->pageout) {
5263 assert(m->busy);
5264 assert(m->wire_count == 1);
5265 m->pageout = FALSE;
b0d623f7
A
5266
5267 dwp->dw_mask |= DW_vm_page_unwire;
1c79356b 5268 }
2d21ac55
A
5269 m->dump_cleaning = FALSE;
5270 m->cleaning = FALSE;
5271 m->encrypted_cleaning = FALSE;
5272 m->overwriting = FALSE;
1c79356b 5273#if MACH_PAGEMAP
2d21ac55 5274 vm_external_state_clr(m->object->existence_map, m->offset);
1c79356b 5275#endif /* MACH_PAGEMAP */
2d21ac55
A
5276 if (error & UPL_ABORT_DUMP_PAGES) {
5277 pmap_disconnect(m->phys_page);
b0d623f7
A
5278
5279 dwp->dw_mask |= DW_vm_page_free;
2d21ac55
A
5280 } else {
5281 if (error & UPL_ABORT_REFERENCE) {
5282 /*
5283 * we've been told to explicitly
5284 * reference this page... for
5285 * file I/O, this is done by
5286 * implementing an LRU on the inactive q
5287 */
b0d623f7 5288 dwp->dw_mask |= DW_vm_page_lru;
2d21ac55 5289 }
b0d623f7 5290 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
2d21ac55 5291 }
1c79356b 5292 }
2d21ac55 5293 }
b0d623f7 5294abort_next_page:
55e303ae
A
5295 target_offset += PAGE_SIZE_64;
5296 xfer_size -= PAGE_SIZE;
5297 entry++;
b0d623f7
A
5298
5299 if (dwp->dw_mask) {
5300 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
5301 if (m->busy == FALSE) {
5302 /*
5303 * dw_do_work may need to drop the object lock
5304 * if it does, we need the pages it's looking at to
5305 * be held stable via the busy bit.
5306 */
5307 m->busy = TRUE;
5308 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5309 }
5310 dwp->dw_m = m;
5311 dwp++;
5312 dw_count++;
5313
5314 if (dw_count >= DELAYED_WORK_LIMIT) {
5315 dw_do_work(shadow_object, &dw_array[0], dw_count);
5316
5317 dwp = &dw_array[0];
5318 dw_count = 0;
5319 }
5320 } else {
5321 if (dwp->dw_mask & DW_clear_busy)
5322 m->busy = FALSE;
5323
5324 if (dwp->dw_mask & DW_PAGE_WAKEUP)
5325 PAGE_WAKEUP(m);
5326 }
5327 }
d7e50217 5328 }
b0d623f7
A
5329 if (dw_count)
5330 dw_do_work(shadow_object, &dw_array[0], dw_count);
2d21ac55 5331
55e303ae 5332 occupied = 1;
2d21ac55 5333
55e303ae
A
5334 if (upl->flags & UPL_DEVICE_MEMORY) {
5335 occupied = 0;
5336 } else if (upl->flags & UPL_LITE) {
5337 int pg_num;
5338 int i;
2d21ac55 5339
55e303ae
A
5340 pg_num = upl->size/PAGE_SIZE;
5341 pg_num = (pg_num + 31) >> 5;
5342 occupied = 0;
2d21ac55
A
5343
5344 for (i = 0; i < pg_num; i++) {
5345 if (lite_list[i] != 0) {
55e303ae
A
5346 occupied = 1;
5347 break;
5348 }
5349 }
5350 } else {
2d21ac55 5351 if (queue_empty(&upl->map_object->memq))
55e303ae 5352 occupied = 0;
55e303ae 5353 }
2d21ac55 5354 if (occupied == 0) {
b0d623f7
A
5355 /*
5356 * If this UPL element belongs to a Vector UPL and is
5357 * empty, then this is the right function to deallocate
5358 * it. So go ahead and set the *empty variable. The flag
5359 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
5360 * should be considered relevant for the Vector UPL and
5361 * not the internal UPLs.
5362 */
5363 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
0b4e3aa0 5364 *empty = TRUE;
2d21ac55 5365
b0d623f7 5366 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
2d21ac55
A
5367 /*
5368 * this is not a paging object
5369 * so we need to drop the paging reference
5370 * that was taken when we created the UPL
5371 * against this object
5372 */
b0d623f7 5373 vm_object_activity_end(shadow_object);
2d21ac55
A
5374 } else {
5375 /*
5376 * we donated the paging reference to
5377 * the map object... vm_pageout_object_terminate
5378 * will drop this reference
5379 */
5380 }
1c79356b 5381 }
55e303ae 5382 vm_object_unlock(shadow_object);
91447636
A
5383 if (object != shadow_object)
5384 vm_object_unlock(object);
b0d623f7
A
5385
5386 if(!isVectorUPL)
5387 upl_unlock(upl);
5388 else {
5389 /*
5390 * If we completed our operations on an UPL that is
5391 * part of a Vectored UPL and if empty is TRUE, then
5392 * we should go ahead and deallocate this UPL element.
5393 * Then we check if this was the last of the UPL elements
5394 * within that Vectored UPL. If so, set empty to TRUE
5395 * so that in ubc_upl_abort_range or ubc_upl_abort, we
5396 * can go ahead and deallocate the Vector UPL too.
5397 */
5398 if(*empty == TRUE) {
5399 *empty = vector_upl_set_subupl(vector_upl, upl,0);
5400 upl_deallocate(upl);
5401 }
5402 goto process_upl_to_abort;
5403 }
55e303ae 5404
1c79356b
A
5405 return KERN_SUCCESS;
5406}
5407
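/*
 * A minimal, self-contained sketch of the lite-list bitmap bookkeeping
 * used by upl_abort_range() above (and by vm_object_iopl_request()
 * below): one bit per page, word index pg_num >> 5, bit pg_num & 31.
 * Everything here is illustrative userspace code under assumed names;
 * it is not part of this file.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_UPL_PAGES 64                            /* pages covered by the UPL */
#define SKETCH_WORDS ((SKETCH_UPL_PAGES + 31) >> 5)    /* 32 pages per bitmap word */

static uint32_t sketch_lite_list[SKETCH_WORDS];

static void sketch_mark_page(unsigned int pg_num)      /* done when a page is gathered */
{
	sketch_lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
}

static int sketch_test_and_clear(unsigned int pg_num)  /* done on commit/abort */
{
	if (sketch_lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
		sketch_lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned int pg_num = 37;       /* i.e. target_offset / PAGE_SIZE == 37 */

	sketch_mark_page(pg_num);
	assert(sketch_test_and_clear(pg_num) == 1);     /* first pass clears the bit  */
	assert(sketch_test_and_clear(pg_num) == 0);     /* second pass finds nothing  */
	printf("lite-list sketch ok\n");
	return 0;
}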
2d21ac55 5408
1c79356b 5409kern_return_t
0b4e3aa0 5410upl_abort(
1c79356b
A
5411 upl_t upl,
5412 int error)
2d21ac55
A
5413{
5414 boolean_t empty;
5415
5416 return upl_abort_range(upl, 0, upl->size, error, &empty);
1c79356b
A
5417}
5418
55e303ae 5419
2d21ac55
A
5420/* an option on commit should be wire */
5421kern_return_t
5422upl_commit(
5423 upl_t upl,
5424 upl_page_info_t *page_list,
5425 mach_msg_type_number_t count)
5426{
5427 boolean_t empty;
5428
5429 return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
5430}
5431
55e303ae 5432
b0d623f7
A
5433unsigned int vm_object_iopl_request_sleep_for_cleaning = 0;
5434
55e303ae
A
5435kern_return_t
5436vm_object_iopl_request(
5437 vm_object_t object,
5438 vm_object_offset_t offset,
91447636 5439 upl_size_t size,
55e303ae
A
5440 upl_t *upl_ptr,
5441 upl_page_info_array_t user_page_list,
5442 unsigned int *page_list_count,
5443 int cntrl_flags)
5444{
5445 vm_page_t dst_page;
2d21ac55
A
5446 vm_object_offset_t dst_offset;
5447 upl_size_t xfer_size;
55e303ae 5448 upl_t upl = NULL;
91447636
A
5449 unsigned int entry;
5450 wpl_array_t lite_list = NULL;
91447636 5451 int no_zero_fill = FALSE;
2d21ac55 5452 u_int32_t psize;
55e303ae
A
5453 kern_return_t ret;
5454 vm_prot_t prot;
2d21ac55 5455 struct vm_object_fault_info fault_info;
b0d623f7
A
5456 struct dw dw_array[DELAYED_WORK_LIMIT];
5457 struct dw *dwp;
5458 int dw_count;
5459 int dw_index;
55e303ae 5460
91447636
A
5461 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5462 /*
5463 * For forward compatibility's sake,
5464 * reject any unknown flag.
5465 */
5466 return KERN_INVALID_VALUE;
5467 }
0b4c1975 5468 if (vm_lopage_needed == FALSE)
0c530ab8
A
5469 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
5470
5471 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
5472 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
5473 return KERN_INVALID_VALUE;
5474
5475 if (object->phys_contiguous) {
5476 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
5477 return KERN_INVALID_ADDRESS;
2d21ac55
A
5478
5479 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
0c530ab8
A
5480 return KERN_INVALID_ADDRESS;
5481 }
5482 }
91447636
A
5483
5484 if (cntrl_flags & UPL_ENCRYPT) {
5485 /*
5486 * ENCRYPTED SWAP:
5487 * The paging path doesn't use this interface,
5488 * so we don't support the UPL_ENCRYPT flag
5489 * here. We won't encrypt the pages.
5490 */
5491 assert(! (cntrl_flags & UPL_ENCRYPT));
5492 }
91447636
A
5493 if (cntrl_flags & UPL_NOZEROFILL)
5494 no_zero_fill = TRUE;
5495
5496 if (cntrl_flags & UPL_COPYOUT_FROM)
55e303ae 5497 prot = VM_PROT_READ;
91447636 5498 else
55e303ae 5499 prot = VM_PROT_READ | VM_PROT_WRITE;
55e303ae 5500
b0d623f7
A
5501 if (((size/PAGE_SIZE) > MAX_UPL_SIZE) && !object->phys_contiguous)
5502 size = MAX_UPL_SIZE * PAGE_SIZE;
55e303ae 5503
2d21ac55
A
5504 if (cntrl_flags & UPL_SET_INTERNAL) {
5505 if (page_list_count != NULL)
cf7d32b8 5506 *page_list_count = MAX_UPL_SIZE;
2d21ac55
A
5507 }
5508 if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
5509 ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
5510 return KERN_INVALID_ARGUMENT;
55e303ae 5511
2d21ac55
A
5512 if ((!object->internal) && (object->paging_offset != 0))
5513 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
5514
5515
5516 if (object->phys_contiguous)
5517 psize = PAGE_SIZE;
5518 else
5519 psize = size;
5520
5521 if (cntrl_flags & UPL_SET_INTERNAL) {
5522 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
5523
5524 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5525 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
5526 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
b0d623f7
A
5527 if (size == 0) {
5528 user_page_list = NULL;
5529 lite_list = NULL;
5530 }
2d21ac55
A
5531 } else {
5532 upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
55e303ae 5533
2d21ac55 5534 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
b0d623f7
A
5535 if (size == 0) {
5536 lite_list = NULL;
5537 }
55e303ae 5538 }
2d21ac55
A
5539 if (user_page_list)
5540 user_page_list[0].device = FALSE;
5541 *upl_ptr = upl;
55e303ae 5542
2d21ac55
A
5543 upl->map_object = object;
5544 upl->size = size;
5545
b0d623f7
A
5546 if (object == kernel_object &&
5547 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
5548 upl->flags |= UPL_KERNEL_OBJECT;
5549#if UPL_DEBUG
5550 vm_object_lock(object);
5551#else
5552 vm_object_lock_shared(object);
5553#endif
5554 } else {
5555 vm_object_lock(object);
5556 vm_object_activity_begin(object);
5557 }
2d21ac55
A
5558 /*
5559 * paging in progress also protects the paging_offset
5560 */
5561 upl->offset = offset + object->paging_offset;
55e303ae 5562
b0d623f7
A
5563 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5564 /*
5565 * The user requested that access to the pages in this UPL
5566 * be blocked until the UPL is committed or aborted.
5567 */
5568 upl->flags |= UPL_ACCESS_BLOCKED;
5569 }
5570
2d21ac55 5571 if (object->phys_contiguous) {
b0d623f7 5572#if UPL_DEBUG
2d21ac55
A
5573 queue_enter(&object->uplq, upl, upl_t, uplq);
5574#endif /* UPL_DEBUG */
55e303ae 5575
b0d623f7
A
5576 if (upl->flags & UPL_ACCESS_BLOCKED) {
5577 assert(!object->blocked_access);
5578 object->blocked_access = TRUE;
5579 }
5580
2d21ac55 5581 vm_object_unlock(object);
55e303ae 5582
2d21ac55
A
5583 /*
5584 * don't need any shadow mappings for this one
5585 * since it is already I/O memory
5586 */
5587 upl->flags |= UPL_DEVICE_MEMORY;
55e303ae 5588
b0d623f7 5589 upl->highest_page = (ppnum_t) ((offset + object->shadow_offset + size - 1)>>PAGE_SHIFT);
2d21ac55
A
5590
5591 if (user_page_list) {
b0d623f7 5592 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->shadow_offset)>>PAGE_SHIFT);
2d21ac55 5593 user_page_list[0].device = TRUE;
55e303ae 5594 }
2d21ac55
A
5595 if (page_list_count != NULL) {
5596 if (upl->flags & UPL_INTERNAL)
5597 *page_list_count = 0;
5598 else
5599 *page_list_count = 1;
55e303ae 5600 }
2d21ac55 5601 return KERN_SUCCESS;
55e303ae 5602 }
b0d623f7
A
5603 if (object != kernel_object) {
5604 /*
5605 * Protect user space from future COW operations
5606 */
5607 object->true_share = TRUE;
55e303ae 5608
b0d623f7
A
5609 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
5610 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
5611 }
55e303ae 5612
b0d623f7 5613#if UPL_DEBUG
2d21ac55 5614 queue_enter(&object->uplq, upl, upl_t, uplq);
91447636 5615#endif /* UPL_DEBUG */
91447636 5616
b0d623f7
A
5617 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
5618 object->copy != VM_OBJECT_NULL) {
91447636 5619 /*
b0d623f7
A
5620 * Honor copy-on-write obligations
5621 *
5622 * The caller is gathering these pages and
5623 * might modify their contents. We need to
5624 * make sure that the copy object has its own
5625 * private copies of these pages before we let
5626 * the caller modify them.
5627 *
5628 * NOTE: someone else could map the original object
5629 * after we've done this copy-on-write here, and they
5630 * could then see an inconsistent picture of the memory
5631 * while it's being modified via the UPL. To prevent this,
5632 * we would have to block access to these pages until the
5633 * UPL is released. We could use the UPL_BLOCK_ACCESS
5634 * code path for that...
91447636 5635 */
b0d623f7
A
5636 vm_object_update(object,
5637 offset,
5638 size,
5639 NULL,
5640 NULL,
5641 FALSE, /* should_return */
5642 MEMORY_OBJECT_COPY_SYNC,
5643 VM_PROT_NO_CHANGE);
5644#if DEVELOPMENT || DEBUG
5645 iopl_cow++;
5646 iopl_cow_pages += size >> PAGE_SHIFT;
5647#endif
55e303ae 5648 }
b0d623f7
A
5649
5650
55e303ae 5651 entry = 0;
2d21ac55
A
5652
5653 xfer_size = size;
5654 dst_offset = offset;
5655
5656 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
5657 fault_info.user_tag = 0;
5658 fault_info.lo_offset = offset;
5659 fault_info.hi_offset = offset + xfer_size;
5660 fault_info.no_cache = FALSE;
b0d623f7 5661 fault_info.stealth = FALSE;
0b4c1975 5662 fault_info.mark_zf_absent = TRUE;
b0d623f7
A
5663
5664 dwp = &dw_array[0];
5665 dw_count = 0;
2d21ac55 5666
55e303ae 5667 while (xfer_size) {
2d21ac55 5668 vm_fault_return_t result;
b0d623f7
A
5669 unsigned int pg_num;
5670
5671 dwp->dw_mask = 0;
2d21ac55 5672
55e303ae
A
5673 dst_page = vm_page_lookup(object, dst_offset);
5674
91447636
A
5675 /*
5676 * ENCRYPTED SWAP:
5677 * If the page is encrypted, we need to decrypt it,
5678 * so force a soft page fault.
5679 */
b0d623f7
A
5680 if (dst_page == VM_PAGE_NULL ||
5681 dst_page->busy ||
5682 dst_page->encrypted ||
5683 dst_page->error ||
5684 dst_page->restart ||
5685 dst_page->absent ||
5686 dst_page->fictitious) {
5687
5688 if (object == kernel_object)
5689 panic("vm_object_iopl_request: missing/bad page in kernel object\n");
2d21ac55 5690
55e303ae
A
5691 do {
5692 vm_page_t top_page;
5693 kern_return_t error_code;
5694 int interruptible;
5695
2d21ac55 5696 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
55e303ae 5697 interruptible = THREAD_ABORTSAFE;
2d21ac55 5698 else
55e303ae 5699 interruptible = THREAD_UNINT;
2d21ac55
A
5700
5701 fault_info.interruptible = interruptible;
5702 fault_info.cluster_size = xfer_size;
55e303ae 5703
b0d623f7
A
5704 vm_object_paging_begin(object);
5705
55e303ae 5706 result = vm_fault_page(object, dst_offset,
2d21ac55
A
5707 prot | VM_PROT_WRITE, FALSE,
5708 &prot, &dst_page, &top_page,
5709 (int *)0,
5710 &error_code, no_zero_fill,
5711 FALSE, &fault_info);
5712
5713 switch (result) {
5714
55e303ae
A
5715 case VM_FAULT_SUCCESS:
5716
d41d1dae
A
5717 if ( !dst_page->absent) {
5718 PAGE_WAKEUP_DONE(dst_page);
5719 } else {
5720 /*
5721 * we only get back an absent page if we
5722 * requested that it not be zero-filled
5723 * because we are about to fill it via I/O
5724 *
5725 * absent pages should be left BUSY
5726 * to prevent them from being faulted
5727 * into an address space before we've
5728 * had a chance to complete the I/O on
5729 * them since they may contain info that
5730 * shouldn't be seen by the faulting task
5731 */
5732 }
55e303ae
A
5733 /*
5734 * Release paging references and
5735 * top-level placeholder page, if any.
5736 */
2d21ac55 5737 if (top_page != VM_PAGE_NULL) {
55e303ae 5738 vm_object_t local_object;
2d21ac55
A
5739
5740 local_object = top_page->object;
5741
5742 if (top_page->object != dst_page->object) {
5743 vm_object_lock(local_object);
55e303ae 5744 VM_PAGE_FREE(top_page);
2d21ac55
A
5745 vm_object_paging_end(local_object);
5746 vm_object_unlock(local_object);
55e303ae
A
5747 } else {
5748 VM_PAGE_FREE(top_page);
2d21ac55 5749 vm_object_paging_end(local_object);
55e303ae
A
5750 }
5751 }
b0d623f7 5752 vm_object_paging_end(object);
55e303ae
A
5753 break;
5754
55e303ae
A
5755 case VM_FAULT_RETRY:
5756 vm_object_lock(object);
55e303ae
A
5757 break;
5758
5759 case VM_FAULT_FICTITIOUS_SHORTAGE:
5760 vm_page_more_fictitious();
2d21ac55 5761
55e303ae 5762 vm_object_lock(object);
55e303ae
A
5763 break;
5764
5765 case VM_FAULT_MEMORY_SHORTAGE:
5766 if (vm_page_wait(interruptible)) {
5767 vm_object_lock(object);
55e303ae
A
5768 break;
5769 }
5770 /* fall thru */
5771
5772 case VM_FAULT_INTERRUPTED:
5773 error_code = MACH_SEND_INTERRUPTED;
5774 case VM_FAULT_MEMORY_ERROR:
b0d623f7 5775 memory_error:
2d21ac55 5776 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
0c530ab8 5777
2d21ac55 5778 vm_object_lock(object);
0c530ab8 5779 goto return_err;
b0d623f7
A
5780
5781 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5782 /* success but no page: fail */
5783 vm_object_paging_end(object);
5784 vm_object_unlock(object);
5785 goto memory_error;
5786
5787 default:
5788 panic("vm_object_iopl_request: unexpected error"
5789 " 0x%x from vm_fault_page()\n", result);
55e303ae 5790 }
2d21ac55 5791 } while (result != VM_FAULT_SUCCESS);
b0d623f7 5792
55e303ae 5793 }
0c530ab8 5794
b0d623f7
A
5795 if (upl->flags & UPL_KERNEL_OBJECT)
5796 goto record_phys_addr;
5797
5798 if (dst_page->cleaning) {
5799 /*
5800 * Someone else is cleaning this page in place.
5801 * In theory, we should be able to proceed and use this
5802 * page, but they'll probably end up clearing the "busy"
5803 * bit on it in upl_commit_range(); since they didn't set
5804 * it, they would clear our "busy" bit and open
5805 * us to race conditions.
5806 * We'd better wait for the cleaning to complete and
5807 * then try again.
5808 */
5809 vm_object_iopl_request_sleep_for_cleaning++;
5810 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5811 continue;
5812 }
0c530ab8
A
5813 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
5814 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
5815 vm_page_t low_page;
5816 int refmod;
5817
5818 /*
5819 * support devices that can't DMA above 32 bits
5820 * by substituting pages from a pool of low address
5821 * memory for any pages we find above the 4G mark.
5822 * We can't substitute if the page is already wired because
5823 * we don't know whether that physical address has been
5824 * handed out to some other 64 bit capable DMA device to use
5825 */
b0d623f7 5826 if (VM_PAGE_WIRED(dst_page)) {
0c530ab8
A
5827 ret = KERN_PROTECTION_FAILURE;
5828 goto return_err;
5829 }
0c530ab8
A
5830 low_page = vm_page_grablo();
5831
5832 if (low_page == VM_PAGE_NULL) {
5833 ret = KERN_RESOURCE_SHORTAGE;
5834 goto return_err;
5835 }
5836 /*
5837 * from here until the vm_page_replace completes
5838 * we mustn't drop the object lock... we don't
5839 * want anyone refaulting this page in and using
5840 * it after we disconnect it... we want the fault
5841 * to find the new page being substituted.
5842 */
2d21ac55
A
5843 if (dst_page->pmapped)
5844 refmod = pmap_disconnect(dst_page->phys_page);
5845 else
5846 refmod = 0;
d41d1dae
A
5847
5848 if ( !dst_page->absent)
5849 vm_page_copy(dst_page, low_page);
2d21ac55 5850
0c530ab8
A
5851 low_page->reference = dst_page->reference;
5852 low_page->dirty = dst_page->dirty;
d41d1dae 5853 low_page->absent = dst_page->absent;
0c530ab8
A
5854
5855 if (refmod & VM_MEM_REFERENCED)
5856 low_page->reference = TRUE;
5857 if (refmod & VM_MEM_MODIFIED)
5858 low_page->dirty = TRUE;
5859
0c530ab8 5860 vm_page_replace(low_page, object, dst_offset);
0c530ab8
A
5861
5862 dst_page = low_page;
5863 /*
5864 * vm_page_grablo returned the page marked
5865 * BUSY... we don't need a PAGE_WAKEUP_DONE
5866 * here, because we've never dropped the object lock
5867 */
d41d1dae
A
5868 if ( !dst_page->absent)
5869 dst_page->busy = FALSE;
0c530ab8 5870 }
d41d1dae
A
5871 if ( !dst_page->busy)
5872 dwp->dw_mask |= DW_vm_page_wire;
55e303ae 5873
91447636
A
5874 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5875 /*
5876 * Mark the page "busy" to block any future page fault
5877 * on this page. We'll also remove the mapping
5878 * of all these pages before leaving this routine.
5879 */
5880 assert(!dst_page->fictitious);
5881 dst_page->busy = TRUE;
5882 }
2d21ac55
A
5883 /*
5884 * expect the page to be used
5885 * page queues lock must be held to set 'reference'
5886 */
b0d623f7 5887 dwp->dw_mask |= DW_set_reference;
55e303ae 5888
2d21ac55
A
5889 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5890 dst_page->dirty = TRUE;
b0d623f7 5891record_phys_addr:
d41d1dae
A
5892 if (dst_page->busy)
5893 upl->flags |= UPL_HAS_BUSY;
5894
b0d623f7
A
5895 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
5896 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
5897 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
55e303ae 5898
2d21ac55
A
5899 if (dst_page->phys_page > upl->highest_page)
5900 upl->highest_page = dst_page->phys_page;
55e303ae 5901
2d21ac55
A
5902 if (user_page_list) {
5903 user_page_list[entry].phys_addr = dst_page->phys_page;
2d21ac55
A
5904 user_page_list[entry].pageout = dst_page->pageout;
5905 user_page_list[entry].absent = dst_page->absent;
593a1d5f 5906 user_page_list[entry].dirty = dst_page->dirty;
2d21ac55 5907 user_page_list[entry].precious = dst_page->precious;
593a1d5f 5908 user_page_list[entry].device = FALSE;
2d21ac55
A
5909 if (dst_page->clustered == TRUE)
5910 user_page_list[entry].speculative = dst_page->speculative;
5911 else
5912 user_page_list[entry].speculative = FALSE;
593a1d5f
A
5913 user_page_list[entry].cs_validated = dst_page->cs_validated;
5914 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
55e303ae 5915 }
b0d623f7
A
5916 if (object != kernel_object) {
5917 /*
5918 * someone is explicitly grabbing this page...
5919 * update clustered and speculative state
5920 *
5921 */
5922 VM_PAGE_CONSUME_CLUSTERED(dst_page);
55e303ae
A
5923 }
5924 entry++;
5925 dst_offset += PAGE_SIZE_64;
5926 xfer_size -= PAGE_SIZE;
b0d623f7
A
5927
5928 if (dwp->dw_mask) {
5929 if (dst_page->busy == FALSE) {
5930 /*
5931 * dw_do_work may need to drop the object lock
5932 * if it does, we need the pages it's looking at to
5933 * be held stable via the busy bit.
5934 */
5935 dst_page->busy = TRUE;
5936 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5937 }
5938 dwp->dw_m = dst_page;
5939 dwp++;
5940 dw_count++;
5941
5942 if (dw_count >= DELAYED_WORK_LIMIT) {
5943 dw_do_work(object, &dw_array[0], dw_count);
5944
5945 dwp = &dw_array[0];
5946 dw_count = 0;
5947 }
5948 }
55e303ae 5949 }
b0d623f7
A
5950 if (dw_count)
5951 dw_do_work(object, &dw_array[0], dw_count);
55e303ae 5952
2d21ac55
A
5953 if (page_list_count != NULL) {
5954 if (upl->flags & UPL_INTERNAL)
55e303ae 5955 *page_list_count = 0;
2d21ac55 5956 else if (*page_list_count > entry)
55e303ae
A
5957 *page_list_count = entry;
5958 }
55e303ae 5959 vm_object_unlock(object);
55e303ae 5960
91447636
A
5961 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5962 /*
5963 * We've marked all the pages "busy" so that future
5964 * page faults will block.
5965 * Now remove the mapping for these pages, so that they
5966 * can't be accessed without causing a page fault.
5967 */
5968 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5969 PMAP_NULL, 0, VM_PROT_NONE);
b0d623f7
A
5970 assert(!object->blocked_access);
5971 object->blocked_access = TRUE;
91447636 5972 }
91447636 5973 return KERN_SUCCESS;
0c530ab8 5974
0c530ab8 5975return_err:
b0d623f7 5976 dw_index = 0;
0c530ab8
A
5977
5978 for (; offset < dst_offset; offset += PAGE_SIZE) {
0b4c1975
A
5979 boolean_t need_unwire;
5980
0c530ab8
A
5981 dst_page = vm_page_lookup(object, offset);
5982
5983 if (dst_page == VM_PAGE_NULL)
d41d1dae 5984 panic("vm_object_iopl_request: Wired page missing. \n");
2d21ac55 5985
0b4c1975
A
5986 /*
5987 * if we've already processed this page in an earlier
5988 * dw_do_work, we need to undo the wiring... we will
5989 * leave the dirty and reference bits on if they
5990 * were set, since we don't have a good way of knowing
5991 * what the previous state was and we won't get here
5992 * under any normal circumstances... we will always
5993 * clear BUSY and wakeup any waiters via vm_page_free
5994 * or PAGE_WAKEUP_DONE
5995 */
5996 need_unwire = TRUE;
5997
b0d623f7
A
5998 if (dw_count) {
5999 if (dw_array[dw_index].dw_m == dst_page) {
0b4c1975
A
6000 /*
6001 * still in the deferred work list
6002 * which means we haven't yet called
6003 * vm_page_wire on this page
6004 */
6005 need_unwire = FALSE;
d41d1dae
A
6006
6007 dw_index++;
6008 dw_count--;
b0d623f7
A
6009 }
6010 }
0b4c1975
A
6011 vm_page_lock_queues();
6012
d41d1dae
A
6013 if (dst_page->absent) {
6014 vm_page_free(dst_page);
0b4c1975 6015
d41d1dae
A
6016 need_unwire = FALSE;
6017 } else {
6018 if (need_unwire == TRUE)
6019 vm_page_unwire(dst_page, TRUE);
0b4c1975 6020
0b4c1975 6021 PAGE_WAKEUP_DONE(dst_page);
d41d1dae 6022 }
0c530ab8 6023 vm_page_unlock_queues();
2d21ac55 6024
0b4c1975
A
6025 if (need_unwire == TRUE)
6026 VM_STAT_INCR(reactivations);
0c530ab8 6027 }
b0d623f7
A
6028#if UPL_DEBUG
6029 upl->upl_state = 2;
6030#endif
6031 if (! (upl->flags & UPL_KERNEL_OBJECT)) {
6032 vm_object_activity_end(object);
6033 }
0c530ab8
A
6034 vm_object_unlock(object);
6035 upl_destroy(upl);
6036
6037 return ret;
1c79356b
A
6038}
6039
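/*
 * A condensed sketch of the dw_array/dw_do_work batching idiom that
 * upl_abort_range() and vm_object_iopl_request() above both use:
 * per-page work is recorded in a small array and flushed in one call
 * once DELAYED_WORK_LIMIT entries accumulate, so the expensive lock
 * traffic in the flush is amortized over many pages.  This is a
 * userspace illustration with assumed names, not the kernel's
 * dw_do_work().
 */
#include <stdio.h>

#define SKETCH_DW_LIMIT 8

struct sketch_dw {
	int          item;      /* stands in for dwp->dw_m (the page)    */
	unsigned int mask;      /* stands in for dwp->dw_mask (the work) */
};

/* flush one batch; in the kernel this is where the page-queue locks
 * are taken once for the whole batch */
static void sketch_dw_do_work(struct sketch_dw *dw, int count)
{
	for (int i = 0; i < count; i++)
		printf("item %d: mask 0x%x\n", dw[i].item, dw[i].mask);
}

int main(void)
{
	struct sketch_dw dw_array[SKETCH_DW_LIMIT];
	struct sketch_dw *dwp = &dw_array[0];
	int dw_count = 0;

	for (int page = 0; page < 20; page++) {
		dwp->item = page;
		dwp->mask = 1U << (page % 3);           /* pretend work mask */
		dwp++;
		dw_count++;

		if (dw_count >= SKETCH_DW_LIMIT) {      /* batch full: flush it */
			sketch_dw_do_work(&dw_array[0], dw_count);
			dwp = &dw_array[0];
			dw_count = 0;
		}
	}
	if (dw_count)                                   /* flush the partial tail */
		sketch_dw_do_work(&dw_array[0], dw_count);
	return 0;
}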
91447636
A
6040kern_return_t
6041upl_transpose(
6042 upl_t upl1,
6043 upl_t upl2)
1c79356b 6044{
91447636
A
6045 kern_return_t retval;
6046 boolean_t upls_locked;
6047 vm_object_t object1, object2;
1c79356b 6048
b0d623f7 6049 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR)==UPL_VECTOR) || ((upl2->flags & UPL_VECTOR)==UPL_VECTOR)) {
91447636
A
6050 return KERN_INVALID_ARGUMENT;
6051 }
6052
6053 upls_locked = FALSE;
1c79356b 6054
91447636
A
6055 /*
6056 * Since we need to lock both UPLs at the same time,
6057 * avoid deadlocks by always taking locks in the same order.
6058 */
6059 if (upl1 < upl2) {
6060 upl_lock(upl1);
6061 upl_lock(upl2);
6062 } else {
6063 upl_lock(upl2);
6064 upl_lock(upl1);
6065 }
6066 upls_locked = TRUE; /* the UPLs will need to be unlocked */
6067
6068 object1 = upl1->map_object;
6069 object2 = upl2->map_object;
6070
6071 if (upl1->offset != 0 || upl2->offset != 0 ||
6072 upl1->size != upl2->size) {
6073 /*
6074 * We deal only with full objects, not subsets.
6075 * That's because we exchange the entire backing store info
6076 * for the objects: pager, resident pages, etc... We can't do
6077 * only part of it.
6078 */
6079 retval = KERN_INVALID_VALUE;
6080 goto done;
6081 }
6082
6083 /*
6084 * Transpose the VM objects' backing store.
6085 */
6086 retval = vm_object_transpose(object1, object2,
6087 (vm_object_size_t) upl1->size);
6088
6089 if (retval == KERN_SUCCESS) {
6090 /*
6091 * Make each UPL point to the correct VM object, i.e. the
6092 * object holding the pages that the UPL refers to...
6093 */
b0d623f7 6094#if UPL_DEBUG
2d21ac55
A
6095 queue_remove(&object1->uplq, upl1, upl_t, uplq);
6096 queue_remove(&object2->uplq, upl2, upl_t, uplq);
6097#endif
91447636
A
6098 upl1->map_object = object2;
6099 upl2->map_object = object1;
b0d623f7 6100#if UPL_DEBUG
2d21ac55
A
6101 queue_enter(&object1->uplq, upl2, upl_t, uplq);
6102 queue_enter(&object2->uplq, upl1, upl_t, uplq);
6103#endif
91447636
A
6104 }
6105
6106done:
6107 /*
6108 * Cleanup.
6109 */
6110 if (upls_locked) {
6111 upl_unlock(upl1);
6112 upl_unlock(upl2);
6113 upls_locked = FALSE;
6114 }
6115
6116 return retval;
6117}
6118
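/*
 * upl_transpose() above avoids a lock-order deadlock by always taking
 * the lower-addressed UPL's lock first.  A minimal pthread sketch of
 * that ordering rule, with assumed names (the address comparison
 * mirrors the kernel's "if (upl1 < upl2)" check):
 */
#include <pthread.h>

struct sketch_obj {
	pthread_mutex_t lock;
};

static void sketch_lock_pair(struct sketch_obj *a, struct sketch_obj *b)
{
	/* every caller takes the two locks in the same (address) order */
	if (a < b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void sketch_unlock_pair(struct sketch_obj *a, struct sketch_obj *b)
{
	/* unlock order does not matter for deadlock avoidance */
	pthread_mutex_unlock(&a->lock);
	pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct sketch_obj x = { PTHREAD_MUTEX_INITIALIZER };
	struct sketch_obj y = { PTHREAD_MUTEX_INITIALIZER };

	sketch_lock_pair(&x, &y);
	sketch_unlock_pair(&x, &y);
	return 0;
}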
6119/*
6120 * ENCRYPTED SWAP:
6121 *
6122 * Rationale: the user might have some encrypted data on disk (via
6123 * FileVault or any other mechanism). That data is then decrypted in
6124 * memory, which is safe as long as the machine is secure. But that
6125 * decrypted data in memory could be paged out to disk by the default
6126 * pager. The data would then be stored on disk in clear (not encrypted)
6127 * and it could be accessed by anyone who gets physical access to the
6128 * disk (if the laptop or the disk gets stolen for example). This weakens
6129 * the security offered by FileVault.
6130 *
6131 * Solution: the default pager will optionally request that all the
6132 * pages it gathers for pageout be encrypted, via the UPL interfaces,
6133 * before it sends this UPL to disk via the vnode_pageout() path.
6134 *
6135 * Notes:
6136 *
6137 * To avoid disrupting the VM LRU algorithms, we want to keep the
6138 * clean-in-place mechanisms, which allow us to send some extra pages to
6139 * swap (clustering) without actually removing them from the user's
6140 * address space. We don't want the user to unknowingly access encrypted
6141 * data, so we have to actually remove the encrypted pages from the page
6142 * table. When the user accesses the data, the hardware will fail to
6143 * locate the virtual page in its page table and will trigger a page
6144 * fault. We can then decrypt the page and enter it in the page table
6145 * again. Whenever we allow the user to access the contents of a page,
6146 * we have to make sure it's not encrypted.
6147 *
6148 *
6149 */
6150/*
6151 * ENCRYPTED SWAP:
6152 * Reserve of virtual addresses in the kernel address space.
6153 * We need to map the physical pages in the kernel, so that we
6154 * can call the encryption/decryption routines with a kernel
6155 * virtual address. We keep this pool of pre-allocated kernel
6156 * virtual addresses so that we don't have to scan the kernel's
6157 * virtual address space each time we need to encrypt or decrypt
6158 * a physical page.
6159 * It would be nice to be able to encrypt and decrypt in physical
6160 * mode but that might not always be more efficient...
6161 */
6162decl_simple_lock_data(,vm_paging_lock)
6163#define VM_PAGING_NUM_PAGES 64
6164vm_map_offset_t vm_paging_base_address = 0;
6165boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
6166int vm_paging_max_index = 0;
2d21ac55
A
6167int vm_paging_page_waiter = 0;
6168int vm_paging_page_waiter_total = 0;
91447636
A
6169unsigned long vm_paging_no_kernel_page = 0;
6170unsigned long vm_paging_objects_mapped = 0;
6171unsigned long vm_paging_pages_mapped = 0;
6172unsigned long vm_paging_objects_mapped_slow = 0;
6173unsigned long vm_paging_pages_mapped_slow = 0;
6174
2d21ac55
A
6175void
6176vm_paging_map_init(void)
6177{
6178 kern_return_t kr;
6179 vm_map_offset_t page_map_offset;
6180 vm_map_entry_t map_entry;
6181
6182 assert(vm_paging_base_address == 0);
6183
6184 /*
6185 * Initialize our pool of pre-allocated kernel
6186 * virtual addresses.
6187 */
6188 page_map_offset = 0;
6189 kr = vm_map_find_space(kernel_map,
6190 &page_map_offset,
6191 VM_PAGING_NUM_PAGES * PAGE_SIZE,
6192 0,
6193 0,
6194 &map_entry);
6195 if (kr != KERN_SUCCESS) {
6196 panic("vm_paging_map_init: kernel_map full\n");
6197 }
6198 map_entry->object.vm_object = kernel_object;
b0d623f7 6199 map_entry->offset = page_map_offset;
2d21ac55
A
6200 vm_object_reference(kernel_object);
6201 vm_map_unlock(kernel_map);
6202
6203 assert(vm_paging_base_address == 0);
6204 vm_paging_base_address = page_map_offset;
6205}
6206
91447636
A
6207/*
6208 * ENCRYPTED SWAP:
6209 * vm_paging_map_object:
6210 * Maps part of a VM object's pages in the kernel
6211 * virtual address space, using the pre-allocated
6212 * kernel virtual addresses, if possible.
6213 * Context:
6214 * The VM object is locked. This lock will get
2d21ac55
A
6215 * dropped and re-acquired though, so the caller
6216 * must make sure the VM object is kept alive
6217 * (by holding a VM map that has a reference
6218 * on it, for example, or taking an extra reference).
6219 * The page should also be kept busy to prevent
6220 * it from being reclaimed.
91447636
A
6221 */
6222kern_return_t
6223vm_paging_map_object(
6224 vm_map_offset_t *address,
6225 vm_page_t page,
6226 vm_object_t object,
6227 vm_object_offset_t offset,
2d21ac55 6228 vm_map_size_t *size,
593a1d5f 6229 vm_prot_t protection,
2d21ac55 6230 boolean_t can_unlock_object)
91447636
A
6231{
6232 kern_return_t kr;
6233 vm_map_offset_t page_map_offset;
6234 vm_map_size_t map_size;
6235 vm_object_offset_t object_offset;
91447636 6236 int i;
91447636 6237
593a1d5f 6238
91447636 6239 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
2d21ac55 6240 assert(page->busy);
91447636 6241 /*
91447636
A
6242 * Use one of the pre-allocated kernel virtual addresses
6243 * and just enter the VM page in the kernel address space
6244 * at that virtual address.
6245 */
91447636
A
6246 simple_lock(&vm_paging_lock);
6247
91447636
A
6248 /*
6249 * Try and find an available kernel virtual address
6250 * from our pre-allocated pool.
6251 */
6252 page_map_offset = 0;
2d21ac55
A
6253 for (;;) {
6254 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
6255 if (vm_paging_page_inuse[i] == FALSE) {
6256 page_map_offset =
6257 vm_paging_base_address +
6258 (i * PAGE_SIZE);
6259 break;
6260 }
6261 }
6262 if (page_map_offset != 0) {
6263 /* found a space to map our page ! */
6264 break;
6265 }
6266
6267 if (can_unlock_object) {
6268 /*
6269 * If we can afford to unlock the VM object,
6270 * let's take the slow path now...
6271 */
91447636
A
6272 break;
6273 }
2d21ac55
A
6274 /*
6275 * We can't afford to unlock the VM object, so
6276 * let's wait for a space to become available...
6277 */
6278 vm_paging_page_waiter_total++;
6279 vm_paging_page_waiter++;
6280 thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
6281 &vm_paging_lock,
6282 THREAD_UNINT);
6283 vm_paging_page_waiter--;
6284 /* ... and try again */
91447636
A
6285 }
6286
6287 if (page_map_offset != 0) {
6288 /*
6289 * We found a kernel virtual address;
6290 * map the physical page to that virtual address.
6291 */
6292 if (i > vm_paging_max_index) {
6293 vm_paging_max_index = i;
6294 }
6295 vm_paging_page_inuse[i] = TRUE;
6296 simple_unlock(&vm_paging_lock);
2d21ac55
A
6297
6298 if (page->pmapped == FALSE) {
0c530ab8
A
6299 pmap_sync_page_data_phys(page->phys_page);
6300 }
2d21ac55
A
6301 page->pmapped = TRUE;
6302
6303 /*
6304 * Keep the VM object locked over the PMAP_ENTER
6305 * and the actual use of the page by the kernel,
6306 * or this pmap mapping might get undone by a
6307 * vm_object_pmap_protect() call...
6308 */
0c530ab8
A
6309 PMAP_ENTER(kernel_pmap,
6310 page_map_offset,
6311 page,
593a1d5f 6312 protection,
0c530ab8
A
6313 ((int) page->object->wimg_bits &
6314 VM_WIMG_MASK),
6315 TRUE);
91447636
A
6316 vm_paging_objects_mapped++;
6317 vm_paging_pages_mapped++;
6318 *address = page_map_offset;
91447636
A
6319
6320 /* all done and mapped, ready to use ! */
6321 return KERN_SUCCESS;
6322 }
6323
6324 /*
6325 * We ran out of pre-allocated kernel virtual
6326 * addresses. Just map the page in the kernel
6327 * the slow and regular way.
6328 */
6329 vm_paging_no_kernel_page++;
6330 simple_unlock(&vm_paging_lock);
2d21ac55
A
6331 }
6332
6333 if (! can_unlock_object) {
6334 return KERN_NOT_SUPPORTED;
91447636 6335 }
91447636
A
6336
6337 object_offset = vm_object_trunc_page(offset);
6338 map_size = vm_map_round_page(*size);
6339
6340 /*
6341 * Try and map the required range of the object
6342 * in the kernel_map
6343 */
6344
91447636
A
6345 vm_object_reference_locked(object); /* for the map entry */
6346 vm_object_unlock(object);
6347
6348 kr = vm_map_enter(kernel_map,
6349 address,
6350 map_size,
6351 0,
6352 VM_FLAGS_ANYWHERE,
6353 object,
6354 object_offset,
6355 FALSE,
593a1d5f 6356 protection,
91447636
A
6357 VM_PROT_ALL,
6358 VM_INHERIT_NONE);
6359 if (kr != KERN_SUCCESS) {
6360 *address = 0;
6361 *size = 0;
6362 vm_object_deallocate(object); /* for the map entry */
2d21ac55 6363 vm_object_lock(object);
91447636
A
6364 return kr;
6365 }
6366
6367 *size = map_size;
6368
6369 /*
6370 * Enter the mapped pages in the page table now.
6371 */
6372 vm_object_lock(object);
2d21ac55
A
6373 /*
6374 * VM object must be kept locked from before PMAP_ENTER()
6375 * until after the kernel is done accessing the page(s).
6376 * Otherwise, the pmap mappings in the kernel could be
6377 * undone by a call to vm_object_pmap_protect().
6378 */
6379
91447636
A
6380 for (page_map_offset = 0;
6381 map_size != 0;
6382 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
6383 unsigned int cache_attr;
6384
6385 page = vm_page_lookup(object, offset + page_map_offset);
6386 if (page == VM_PAGE_NULL) {
2d21ac55
A
6387 printf("vm_paging_map_object: no page !?");
6388 vm_object_unlock(object);
6389 kr = vm_map_remove(kernel_map, *address, *size,
6390 VM_MAP_NO_FLAGS);
6391 assert(kr == KERN_SUCCESS);
6392 *address = 0;
6393 *size = 0;
6394 vm_object_lock(object);
6395 return KERN_MEMORY_ERROR;
91447636 6396 }
2d21ac55 6397 if (page->pmapped == FALSE) {
91447636
A
6398 pmap_sync_page_data_phys(page->phys_page);
6399 }
2d21ac55 6400 page->pmapped = TRUE;
91447636
A
6401 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
6402
2d21ac55 6403 //assert(pmap_verify_free(page->phys_page));
91447636
A
6404 PMAP_ENTER(kernel_pmap,
6405 *address + page_map_offset,
6406 page,
593a1d5f 6407 protection,
91447636 6408 cache_attr,
0c530ab8 6409 TRUE);
91447636
A
6410 }
6411
6412 vm_paging_objects_mapped_slow++;
b0d623f7 6413 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
91447636
A
6414
6415 return KERN_SUCCESS;
6416}
6417
6418/*
6419 * ENCRYPTED SWAP:
6420 * vm_paging_unmap_object:
6421 * Unmaps part of a VM object's pages from the kernel
6422 * virtual address space.
6423 * Context:
6424 * The VM object is locked. This lock will get
6425 * dropped and re-acquired though.
6426 */
6427void
6428vm_paging_unmap_object(
6429 vm_object_t object,
6430 vm_map_offset_t start,
6431 vm_map_offset_t end)
6432{
6433 kern_return_t kr;
91447636 6434 int i;
91447636 6435
0c530ab8 6436 if ((vm_paging_base_address == 0) ||
8f6c56a5
A
6437 (start < vm_paging_base_address) ||
6438 (end > (vm_paging_base_address
2d21ac55 6439 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
91447636
A
6440 /*
6441 * We didn't use our pre-allocated pool of
6442 * kernel virtual address. Deallocate the
6443 * virtual memory.
6444 */
6445 if (object != VM_OBJECT_NULL) {
6446 vm_object_unlock(object);
6447 }
6448 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
6449 if (object != VM_OBJECT_NULL) {
6450 vm_object_lock(object);
6451 }
6452 assert(kr == KERN_SUCCESS);
6453 } else {
6454 /*
6455 * We used a kernel virtual address from our
6456 * pre-allocated pool. Put it back in the pool
6457 * for next time.
6458 */
91447636 6459 assert(end - start == PAGE_SIZE);
b0d623f7
A
6460 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
6461 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
91447636
A
6462
6463 /* undo the pmap mapping */
0c530ab8 6464 pmap_remove(kernel_pmap, start, end);
91447636
A
6465
6466 simple_lock(&vm_paging_lock);
6467 vm_paging_page_inuse[i] = FALSE;
2d21ac55
A
6468 if (vm_paging_page_waiter) {
6469 thread_wakeup(&vm_paging_page_waiter);
6470 }
91447636 6471 simple_unlock(&vm_paging_lock);
91447636
A
6472 }
6473}
6474
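/*
 * vm_paging_map_object()/vm_paging_unmap_object() above manage a small
 * fixed pool of pre-allocated kernel virtual pages: scan an in-use
 * array under a lock, wait on a counter when the pool is exhausted
 * (when the caller cannot fall back to the slow path), and wake a
 * waiter when a slot comes back.  A condensed pthread sketch of that
 * allocate/release protocol, with assumed names:
 */
#include <pthread.h>
#include <stdbool.h>

#define SKETCH_NUM_SLOTS 64

static pthread_mutex_t sketch_pool_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  sketch_pool_cv   = PTHREAD_COND_INITIALIZER;
static bool sketch_slot_inuse[SKETCH_NUM_SLOTS];
static int  sketch_waiters;

/* returns a slot index, blocking until one is free */
static int sketch_slot_alloc(void)
{
	pthread_mutex_lock(&sketch_pool_lock);
	for (;;) {
		for (int i = 0; i < SKETCH_NUM_SLOTS; i++) {
			if (!sketch_slot_inuse[i]) {
				sketch_slot_inuse[i] = true;
				pthread_mutex_unlock(&sketch_pool_lock);
				return i;
			}
		}
		/* pool exhausted: wait for a release, then rescan */
		sketch_waiters++;
		pthread_cond_wait(&sketch_pool_cv, &sketch_pool_lock);
		sketch_waiters--;
	}
}

static void sketch_slot_free(int i)
{
	pthread_mutex_lock(&sketch_pool_lock);
	sketch_slot_inuse[i] = false;
	if (sketch_waiters)
		pthread_cond_signal(&sketch_pool_cv);
	pthread_mutex_unlock(&sketch_pool_lock);
}

int main(void)
{
	int slot = sketch_slot_alloc();
	sketch_slot_free(slot);
	return 0;
}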
2d21ac55 6475#if CRYPTO
91447636
A
6476/*
6477 * Encryption data.
6478 * "iv" is the "initial vector". Ideally, we want to
6479 * have a different one for each page we encrypt, so that
6480 * crackers can't find encryption patterns too easily.
6481 */
6482#define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
6483boolean_t swap_crypt_ctx_initialized = FALSE;
6484aes_32t swap_crypt_key[8]; /* big enough for a 256 key */
6485aes_ctx swap_crypt_ctx;
6486const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
6487
6488#if DEBUG
6489boolean_t swap_crypt_ctx_tested = FALSE;
6490unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
6491unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
6492unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
6493#endif /* DEBUG */
6494
91447636
A
6495/*
6496 * Initialize the encryption context: key and key size.
6497 */
6498void swap_crypt_ctx_initialize(void); /* forward */
6499void
6500swap_crypt_ctx_initialize(void)
6501{
6502 unsigned int i;
6503
6504 /*
6505 * No need for locking to protect swap_crypt_ctx_initialized
6506 * because the first use of encryption will come from the
6507 * pageout thread (we won't pagein before there's been a pageout)
6508 * and there's only one pageout thread.
6509 */
6510 if (swap_crypt_ctx_initialized == FALSE) {
6511 for (i = 0;
6512 i < (sizeof (swap_crypt_key) /
6513 sizeof (swap_crypt_key[0]));
6514 i++) {
6515 swap_crypt_key[i] = random();
6516 }
6517 aes_encrypt_key((const unsigned char *) swap_crypt_key,
6518 SWAP_CRYPT_AES_KEY_SIZE,
6519 &swap_crypt_ctx.encrypt);
6520 aes_decrypt_key((const unsigned char *) swap_crypt_key,
6521 SWAP_CRYPT_AES_KEY_SIZE,
6522 &swap_crypt_ctx.decrypt);
6523 swap_crypt_ctx_initialized = TRUE;
6524 }
6525
6526#if DEBUG
6527 /*
6528 * Validate the encryption algorithms.
6529 */
6530 if (swap_crypt_ctx_tested == FALSE) {
6531 /* initialize */
6532 for (i = 0; i < 4096; i++) {
6533 swap_crypt_test_page_ref[i] = (char) i;
6534 }
6535 /* encrypt */
6536 aes_encrypt_cbc(swap_crypt_test_page_ref,
6537 swap_crypt_null_iv,
6538 PAGE_SIZE / AES_BLOCK_SIZE,
6539 swap_crypt_test_page_encrypt,
6540 &swap_crypt_ctx.encrypt);
6541 /* decrypt */
6542 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
6543 swap_crypt_null_iv,
6544 PAGE_SIZE / AES_BLOCK_SIZE,
6545 swap_crypt_test_page_decrypt,
6546 &swap_crypt_ctx.decrypt);
6547 /* compare result with original */
6548 for (i = 0; i < 4096; i ++) {
6549 if (swap_crypt_test_page_decrypt[i] !=
6550 swap_crypt_test_page_ref[i]) {
6551 panic("encryption test failed");
6552 }
6553 }
6554
6555 /* encrypt again */
6556 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
6557 swap_crypt_null_iv,
6558 PAGE_SIZE / AES_BLOCK_SIZE,
6559 swap_crypt_test_page_decrypt,
6560 &swap_crypt_ctx.encrypt);
6561 /* decrypt in place */
6562 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
6563 swap_crypt_null_iv,
6564 PAGE_SIZE / AES_BLOCK_SIZE,
6565 swap_crypt_test_page_decrypt,
6566 &swap_crypt_ctx.decrypt);
6567 for (i = 0; i < 4096; i ++) {
6568 if (swap_crypt_test_page_decrypt[i] !=
6569 swap_crypt_test_page_ref[i]) {
6570 panic("in place encryption test failed");
6571 }
6572 }
6573
6574 swap_crypt_ctx_tested = TRUE;
6575 }
6576#endif /* DEBUG */
6577}
6578
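/*
 * The DEBUG block above validates the cipher by round-tripping a known
 * page: encrypt into a second buffer, decrypt into a third, compare
 * with the original, then repeat the exercise in place.  The same
 * self-test pattern, reduced to a runnable userspace sketch that uses
 * a trivial XOR transform as a stand-in for AES-CBC (the toy cipher
 * and all names here are assumptions for illustration only):
 */
#include <assert.h>
#include <stddef.h>
#include <string.h>

#define SKETCH_PAGE 4096

/* toy, self-inverse "cipher": XOR every byte with a fixed key */
static void sketch_xcrypt(unsigned char *buf, size_t len, unsigned char key)
{
	for (size_t i = 0; i < len; i++)
		buf[i] ^= key;
}

int main(void)
{
	static unsigned char ref[SKETCH_PAGE], work[SKETCH_PAGE];

	for (size_t i = 0; i < SKETCH_PAGE; i++)
		ref[i] = (unsigned char) i;

	/* encrypt a copy, decrypt it, and compare against the reference */
	memcpy(work, ref, SKETCH_PAGE);
	sketch_xcrypt(work, SKETCH_PAGE, 0x5a);         /* "encrypt" */
	sketch_xcrypt(work, SKETCH_PAGE, 0x5a);         /* "decrypt" */
	assert(memcmp(work, ref, SKETCH_PAGE) == 0);

	/* encrypt and decrypt in place, as the second half of the test does */
	sketch_xcrypt(work, SKETCH_PAGE, 0x5a);
	sketch_xcrypt(work, SKETCH_PAGE, 0x5a);
	assert(memcmp(work, ref, SKETCH_PAGE) == 0);

	return 0;
}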
6579/*
6580 * ENCRYPTED SWAP:
6581 * vm_page_encrypt:
6582 * Encrypt the given page, for secure paging.
6583 * The page might already be mapped at kernel virtual
6584 * address "kernel_mapping_offset". Otherwise, we need
6585 * to map it.
6586 *
6587 * Context:
6588 * The page's object is locked, but this lock will be released
6589 * and re-acquired.
6590 * The page is busy and not accessible by users (not entered in any pmap).
6591 */
6592void
6593vm_page_encrypt(
6594 vm_page_t page,
6595 vm_map_offset_t kernel_mapping_offset)
6596{
91447636 6597 kern_return_t kr;
91447636
A
6598 vm_map_size_t kernel_mapping_size;
6599 vm_offset_t kernel_vaddr;
6600 union {
6601 unsigned char aes_iv[AES_BLOCK_SIZE];
6602 struct {
6603 memory_object_t pager_object;
6604 vm_object_offset_t paging_offset;
6605 } vm;
6606 } encrypt_iv;
6607
6608 if (! vm_pages_encrypted) {
6609 vm_pages_encrypted = TRUE;
6610 }
6611
6612 assert(page->busy);
6613 assert(page->dirty || page->precious);
6614
6615 if (page->encrypted) {
6616 /*
6617 * Already encrypted: no need to do it again.
6618 */
6619 vm_page_encrypt_already_encrypted_counter++;
6620 return;
6621 }
6622 ASSERT_PAGE_DECRYPTED(page);
6623
6624 /*
2d21ac55
A
6625 * Take a paging-in-progress reference to keep the object
6626 * alive even if we have to unlock it (in vm_paging_map_object()
6627 * for example)...
91447636 6628 */
2d21ac55 6629 vm_object_paging_begin(page->object);
91447636
A
6630
6631 if (kernel_mapping_offset == 0) {
6632 /*
6633 * The page hasn't already been mapped in kernel space
6634 * by the caller. Map it now, so that we can access
6635 * its contents and encrypt them.
6636 */
6637 kernel_mapping_size = PAGE_SIZE;
6638 kr = vm_paging_map_object(&kernel_mapping_offset,
6639 page,
6640 page->object,
6641 page->offset,
2d21ac55 6642 &kernel_mapping_size,
593a1d5f 6643 VM_PROT_READ | VM_PROT_WRITE,
2d21ac55 6644 FALSE);
91447636
A
6645 if (kr != KERN_SUCCESS) {
6646 panic("vm_page_encrypt: "
6647 "could not map page in kernel: 0x%x\n",
6648 kr);
6649 }
6650 } else {
6651 kernel_mapping_size = 0;
6652 }
6653 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
6654
6655 if (swap_crypt_ctx_initialized == FALSE) {
6656 swap_crypt_ctx_initialize();
6657 }
6658 assert(swap_crypt_ctx_initialized);
6659
6660 /*
6661 * Prepare an "initial vector" for the encryption.
6662 * We use the "pager" and the "paging_offset" for that
6663 * page to obfuscate the encrypted data a bit more and
6664 * prevent crackers from finding patterns that they could
6665 * use to break the key.
6666 */
6667 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
6668 encrypt_iv.vm.pager_object = page->object->pager;
6669 encrypt_iv.vm.paging_offset =
6670 page->object->paging_offset + page->offset;
6671
91447636
A
6672 /* encrypt the "initial vector" */
6673 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
6674 swap_crypt_null_iv,
6675 1,
6676 &encrypt_iv.aes_iv[0],
6677 &swap_crypt_ctx.encrypt);
6678
6679 /*
6680 * Encrypt the page.
6681 */
6682 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
6683 &encrypt_iv.aes_iv[0],
6684 PAGE_SIZE / AES_BLOCK_SIZE,
6685 (unsigned char *) kernel_vaddr,
6686 &swap_crypt_ctx.encrypt);
6687
6688 vm_page_encrypt_counter++;
6689
91447636
A
6690 /*
6691 * Unmap the page from the kernel's address space,
6692 * if we had to map it ourselves. Otherwise, let
6693 * the caller undo the mapping if needed.
6694 */
6695 if (kernel_mapping_size != 0) {
6696 vm_paging_unmap_object(page->object,
6697 kernel_mapping_offset,
6698 kernel_mapping_offset + kernel_mapping_size);
6699 }
6700
6701 /*
2d21ac55 6702 * Clear the "reference" and "modified" bits.
91447636
A
6703 * This should clean up any impact the encryption had
6704 * on them.
2d21ac55
A
6705 * The page was kept busy and disconnected from all pmaps,
6706 * so it can't have been referenced or modified from user
6707 * space.
6708 * The software bits will be reset later after the I/O
6709 * has completed (in upl_commit_range()).
91447636 6710 */
2d21ac55 6711 pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
91447636
A
6712
6713 page->encrypted = TRUE;
2d21ac55
A
6714
6715 vm_object_paging_end(page->object);
91447636
A
6716}
6717
6718/*
6719 * ENCRYPTED SWAP:
6720 * vm_page_decrypt:
6721 * Decrypt the given page.
6722 * The page might already be mapped at kernel virtual
6723 * address "kernel_mapping_offset". Otherwise, we need
6724 * to map it.
6725 *
6726 * Context:
6727 * The page's VM object is locked but will be unlocked and relocked.
6728 * The page is busy and not accessible by users (not entered in any pmap).
6729 */
6730void
6731vm_page_decrypt(
6732 vm_page_t page,
6733 vm_map_offset_t kernel_mapping_offset)
6734{
91447636
A
6735 kern_return_t kr;
6736 vm_map_size_t kernel_mapping_size;
6737 vm_offset_t kernel_vaddr;
91447636
A
6738 union {
6739 unsigned char aes_iv[AES_BLOCK_SIZE];
6740 struct {
6741 memory_object_t pager_object;
6742 vm_object_offset_t paging_offset;
6743 } vm;
6744 } decrypt_iv;
6745
6746 assert(page->busy);
6747 assert(page->encrypted);
6748
6749 /*
2d21ac55
A
6750 * Take a paging-in-progress reference to keep the object
6751 * alive even if we have to unlock it (in vm_paging_map_object()
6752 * for example)...
91447636 6753 */
2d21ac55 6754 vm_object_paging_begin(page->object);
91447636
A
6755
6756 if (kernel_mapping_offset == 0) {
6757 /*
6758 * The page hasn't already been mapped in kernel space
6759 * by the caller. Map it now, so that we can access
6760 * its contents and decrypt them.
6761 */
6762 kernel_mapping_size = PAGE_SIZE;
6763 kr = vm_paging_map_object(&kernel_mapping_offset,
6764 page,
6765 page->object,
6766 page->offset,
2d21ac55 6767 &kernel_mapping_size,
593a1d5f 6768 VM_PROT_READ | VM_PROT_WRITE,
2d21ac55 6769 FALSE);
91447636
A
6770 if (kr != KERN_SUCCESS) {
6771 panic("vm_page_decrypt: "
2d21ac55
A
6772 "could not map page in kernel: 0x%x\n",
6773 kr);
91447636
A
6774 }
6775 } else {
6776 kernel_mapping_size = 0;
6777 }
6778 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
6779
6780 assert(swap_crypt_ctx_initialized);
6781
6782 /*
6783 * Prepare an "initial vector" for the decryption.
6784 * It has to be the same as the "initial vector" we
6785 * used to encrypt that page.
6786 */
6787 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
6788 decrypt_iv.vm.pager_object = page->object->pager;
6789 decrypt_iv.vm.paging_offset =
6790 page->object->paging_offset + page->offset;
6791
91447636
A
6792 /* encrypt the "initial vector" */
6793 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
6794 swap_crypt_null_iv,
6795 1,
6796 &decrypt_iv.aes_iv[0],
6797 &swap_crypt_ctx.encrypt);
6798
6799 /*
6800 * Decrypt the page.
6801 */
6802 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
6803 &decrypt_iv.aes_iv[0],
6804 PAGE_SIZE / AES_BLOCK_SIZE,
6805 (unsigned char *) kernel_vaddr,
6806 &swap_crypt_ctx.decrypt);
6807 vm_page_decrypt_counter++;
6808
91447636
A
6809 /*
6810 * Unmap the page from the kernel's address space,
6811 * if we had to map it ourselves. Otherwise, let
6812 * the caller undo the mapping if needed.
6813 */
6814 if (kernel_mapping_size != 0) {
6815 vm_paging_unmap_object(page->object,
6816 kernel_vaddr,
6817 kernel_vaddr + PAGE_SIZE);
6818 }
6819
6820 /*
6821 * After decryption, the page is actually clean.
6822 * It was encrypted as part of paging, which "cleans"
6823 * the "dirty" pages.
6824 * No one could access it after it was encrypted
6825 * and the decryption doesn't count.
6826 */
6827 page->dirty = FALSE;
b0d623f7 6828 assert (page->cs_validated == FALSE);
2d21ac55 6829 pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
91447636
A
6830 page->encrypted = FALSE;
6831
6832 /*
6833 * We've just modified the page's contents via the data cache and part
6834 * of the new contents might still be in the cache and not yet in RAM.
6835 * Since the page is now available and might get gathered in a UPL to
6836 * be part of a DMA transfer from a driver that expects the memory to
6837 * be coherent at this point, we have to flush the data cache.
6838 */
0c530ab8 6839 pmap_sync_page_attributes_phys(page->phys_page);
91447636
A
6840 /*
6841 * Since the page is not mapped yet, some code might assume that it
6842 * doesn't need to invalidate the instruction cache when writing to
2d21ac55
A
6843 * that page. That code relies on "pmapped" being FALSE, so that the
6844 * caches get synchronized when the page is first mapped.
91447636 6845 */
2d21ac55
A
6846 assert(pmap_verify_free(page->phys_page));
6847 page->pmapped = FALSE;
4a3eedf9 6848 page->wpmapped = FALSE;
2d21ac55
A
6849
6850 vm_object_paging_end(page->object);
91447636
A
6851}
6852
b0d623f7 6853#if DEVELOPMENT || DEBUG
91447636
A
6854unsigned long upl_encrypt_upls = 0;
6855unsigned long upl_encrypt_pages = 0;
b0d623f7 6856#endif
91447636
A
6857
6858/*
6859 * ENCRYPTED SWAP:
6860 *
6861 * upl_encrypt:
6862 * Encrypts all the pages in the UPL, within the specified range.
6863 *
6864 */
6865void
6866upl_encrypt(
6867 upl_t upl,
6868 upl_offset_t crypt_offset,
6869 upl_size_t crypt_size)
6870{
b0d623f7
A
6871 upl_size_t upl_size, subupl_size=crypt_size;
6872 upl_offset_t offset_in_upl, subupl_offset=crypt_offset;
91447636 6873 vm_object_t upl_object;
b0d623f7 6874 vm_object_offset_t upl_offset;
91447636
A
6875 vm_page_t page;
6876 vm_object_t shadow_object;
6877 vm_object_offset_t shadow_offset;
6878 vm_object_offset_t paging_offset;
6879 vm_object_offset_t base_offset;
b0d623f7
A
6880 int isVectorUPL = 0;
6881 upl_t vector_upl = NULL;
6882
6883 if((isVectorUPL = vector_upl_is_valid(upl)))
6884 vector_upl = upl;
6885
6886process_upl_to_encrypt:
6887 if(isVectorUPL) {
6888 crypt_size = subupl_size;
6889 crypt_offset = subupl_offset;
6890 upl = vector_upl_subupl_byoffset(vector_upl, &crypt_offset, &crypt_size);
6891 if(upl == NULL)
6892 panic("upl_encrypt: Accessing a sub-upl that doesn't exist\n");
6893 subupl_size -= crypt_size;
6894 subupl_offset += crypt_size;
6895 }
91447636 6896
b0d623f7 6897#if DEVELOPMENT || DEBUG
91447636
A
6898 upl_encrypt_upls++;
6899 upl_encrypt_pages += crypt_size / PAGE_SIZE;
b0d623f7 6900#endif
91447636
A
6901 upl_object = upl->map_object;
6902 upl_offset = upl->offset;
6903 upl_size = upl->size;
6904
91447636
A
6905 vm_object_lock(upl_object);
6906
6907 /*
6908 * Find the VM object that contains the actual pages.
6909 */
6910 if (upl_object->pageout) {
6911 shadow_object = upl_object->shadow;
6912 /*
6913 * The offset in the shadow object is actually also
6914 * accounted for in upl->offset. It possibly shouldn't be
6915 * this way, but for now don't account for it twice.
6916 */
6917 shadow_offset = 0;
6918 assert(upl_object->paging_offset == 0); /* XXX ? */
6919 vm_object_lock(shadow_object);
6920 } else {
6921 shadow_object = upl_object;
6922 shadow_offset = 0;
6923 }
6924
6925 paging_offset = shadow_object->paging_offset;
6926 vm_object_paging_begin(shadow_object);
6927
2d21ac55
A
6928 if (shadow_object != upl_object)
6929 vm_object_unlock(upl_object);
6930
91447636
A
6931
6932 base_offset = shadow_offset;
6933 base_offset += upl_offset;
6934 base_offset += crypt_offset;
6935 base_offset -= paging_offset;
91447636 6936
2d21ac55 6937 assert(crypt_offset + crypt_size <= upl_size);
91447636 6938
b0d623f7
A
6939 for (offset_in_upl = 0;
6940 offset_in_upl < crypt_size;
6941 offset_in_upl += PAGE_SIZE) {
91447636 6942 page = vm_page_lookup(shadow_object,
b0d623f7 6943 base_offset + offset_in_upl);
91447636
A
6944 if (page == VM_PAGE_NULL) {
6945 panic("upl_encrypt: "
6946 "no page for (obj=%p,off=%lld+%d)!\n",
6947 shadow_object,
6948 base_offset,
b0d623f7 6949 offset_in_upl);
91447636 6950 }
2d21ac55
A
6951 /*
6952 * Disconnect the page from all pmaps, so that nobody can
6953 * access it while it's encrypted. After that point, all
6954 * accesses to this page will cause a page fault and block
6955 * while the page is busy being encrypted. After the
6956 * encryption completes, any access will cause a
6957 * page fault and the page gets decrypted at that time.
6958 */
6959 pmap_disconnect(page->phys_page);
91447636 6960 vm_page_encrypt(page, 0);
2d21ac55 6961
b0d623f7 6962 if (vm_object_lock_avoid(shadow_object)) {
2d21ac55
A
6963 /*
6964 * Give vm_pageout_scan() a chance to convert more
6965 * pages from "clean-in-place" to "clean-and-free",
6966 * if it's interested in the same pages we selected
6967 * in this cluster.
6968 */
6969 vm_object_unlock(shadow_object);
b0d623f7 6970 mutex_pause(2);
2d21ac55
A
6971 vm_object_lock(shadow_object);
6972 }
91447636
A
6973 }
6974
6975 vm_object_paging_end(shadow_object);
6976 vm_object_unlock(shadow_object);
b0d623f7
A
6977
6978 if(isVectorUPL && subupl_size)
6979 goto process_upl_to_encrypt;
91447636
A
6980}
6981
2d21ac55
A
6982#else /* CRYPTO */
6983void
6984upl_encrypt(
6985 __unused upl_t upl,
6986 __unused upl_offset_t crypt_offset,
6987 __unused upl_size_t crypt_size)
6988{
6989}
6990
6991void
6992vm_page_encrypt(
6993 __unused vm_page_t page,
6994 __unused vm_map_offset_t kernel_mapping_offset)
6995{
6996}
6997
6998void
6999vm_page_decrypt(
7000 __unused vm_page_t page,
7001 __unused vm_map_offset_t kernel_mapping_offset)
7002{
7003}
7004
7005#endif /* CRYPTO */
7006
b0d623f7
A
7007void
7008vm_pageout_queue_steal(vm_page_t page, boolean_t queues_locked)
7009{
0b4c1975
A
7010 boolean_t pageout;
7011
7012 pageout = page->pageout;
7013
b0d623f7
A
7014 page->list_req_pending = FALSE;
7015 page->cleaning = FALSE;
7016 page->pageout = FALSE;
7017
7018 if (!queues_locked) {
7019 vm_page_lockspin_queues();
7020 }
7021
7022 /*
7023 * need to drop the laundry count...
7024 * we may also need to remove it
7025 * from the I/O paging queue...
7026 * vm_pageout_throttle_up handles both cases
7027 *
7028 * the laundry and pageout_queue flags are cleared...
7029 */
7030 vm_pageout_throttle_up(page);
b0d623f7 7031
0b4c1975
A
7032 if (pageout == TRUE) {
7033 /*
7034 * toss the wire count we picked up
 7035	 * when we initially set this page up
7036 * to be cleaned...
7037 */
7038 vm_page_unwire(page, TRUE);
7039 }
b0d623f7
A
7040 vm_page_steal_pageout_page++;
7041
7042 if (!queues_locked) {
7043 vm_page_unlock_queues();
7044 }
7045}
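/*
 * Illustrative sketch (hypothetical helper, never compiled): a caller that is
 * about to reclaim a page can pull it back out of the pageout machinery with
 * vm_pageout_queue_steal(); "queues_locked" simply records whether the page
 * queues lock is already held at the call site.
 */
#if 0
static void
example_reclaim_page(vm_page_t m)
{
	vm_page_lockspin_queues();
	vm_pageout_queue_steal(m, TRUE);	/* TRUE: queues already locked */
	vm_page_unlock_queues();
}
#endif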
7046
7047upl_t
7048vector_upl_create(vm_offset_t upl_offset)
7049{
7050 int vector_upl_size = sizeof(struct _vector_upl);
7051 int i=0;
7052 upl_t upl;
7053 vector_upl_t vector_upl = (vector_upl_t)kalloc(vector_upl_size);
7054
7055 upl = upl_create(0,UPL_VECTOR,0);
7056 upl->vector_upl = vector_upl;
7057 upl->offset = upl_offset;
7058 vector_upl->size = 0;
7059 vector_upl->offset = upl_offset;
7060 vector_upl->invalid_upls=0;
7061 vector_upl->num_upls=0;
7062 vector_upl->pagelist = NULL;
7063
7064 for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) {
7065 vector_upl->upl_iostates[i].size = 0;
7066 vector_upl->upl_iostates[i].offset = 0;
7067
7068 }
7069 return upl;
7070}
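/*
 * Illustrative sketch (hypothetical helper, never compiled): the life cycle
 * suggested by the vector-UPL routines in this file -- create the container,
 * attach each sub-UPL together with the I/O range it covers, then build the
 * merged page list.  The two-element layout is assumed for illustration.
 */
#if 0
static upl_t
example_build_vector_upl(upl_t sub1, upl_size_t size1, upl_t sub2, upl_size_t size2)
{
	upl_t	vupl;

	vupl = vector_upl_create(0);

	vector_upl_set_subupl(vupl, sub1, size1);
	vector_upl_set_iostate(vupl, sub1, 0, size1);

	vector_upl_set_subupl(vupl, sub2, size2);
	vector_upl_set_iostate(vupl, sub2, size1, size2);

	vector_upl_set_pagelist(vupl);

	return vupl;
}
#endif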
7071
7072void
7073vector_upl_deallocate(upl_t upl)
7074{
7075 if(upl) {
7076 vector_upl_t vector_upl = upl->vector_upl;
7077 if(vector_upl) {
7078 if(vector_upl->invalid_upls != vector_upl->num_upls)
7079 panic("Deallocating non-empty Vectored UPL\n");
7080 kfree(vector_upl->pagelist,(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)));
7081 vector_upl->invalid_upls=0;
7082 vector_upl->num_upls = 0;
7083 vector_upl->pagelist = NULL;
7084 vector_upl->size = 0;
7085 vector_upl->offset = 0;
7086 kfree(vector_upl, sizeof(struct _vector_upl));
7087 vector_upl = (vector_upl_t)0xdeadbeef;
7088 }
7089 else
7090 panic("vector_upl_deallocate was passed a non-vectored upl\n");
7091 }
7092 else
7093 panic("vector_upl_deallocate was passed a NULL upl\n");
7094}
7095
7096boolean_t
7097vector_upl_is_valid(upl_t upl)
7098{
7099 if(upl && ((upl->flags & UPL_VECTOR)==UPL_VECTOR)) {
7100 vector_upl_t vector_upl = upl->vector_upl;
7101 if(vector_upl == NULL || vector_upl == (vector_upl_t)0xdeadbeef || vector_upl == (vector_upl_t)0xfeedbeef)
7102 return FALSE;
7103 else
7104 return TRUE;
7105 }
7106 return FALSE;
7107}
7108
7109boolean_t
7110vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
7111{
7112 if(vector_upl_is_valid(upl)) {
7113 vector_upl_t vector_upl = upl->vector_upl;
7114
7115 if(vector_upl) {
7116 if(subupl) {
7117 if(io_size) {
7118 if(io_size < PAGE_SIZE)
7119 io_size = PAGE_SIZE;
7120 subupl->vector_upl = (void*)vector_upl;
7121 vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
7122 vector_upl->size += io_size;
7123 upl->size += io_size;
7124 }
7125 else {
7126 uint32_t i=0,invalid_upls=0;
7127 for(i = 0; i < vector_upl->num_upls; i++) {
7128 if(vector_upl->upl_elems[i] == subupl)
7129 break;
7130 }
7131 if(i == vector_upl->num_upls)
7132 panic("Trying to remove sub-upl when none exists");
7133
7134 vector_upl->upl_elems[i] = NULL;
7135 invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
7136 if(invalid_upls == vector_upl->num_upls)
7137 return TRUE;
7138 else
7139 return FALSE;
7140 }
7141 }
7142 else
7143 panic("vector_upl_set_subupl was passed a NULL upl element\n");
7144 }
7145 else
7146 panic("vector_upl_set_subupl was passed a non-vectored upl\n");
7147 }
7148 else
7149 panic("vector_upl_set_subupl was passed a NULL upl\n");
7150
7151 return FALSE;
7152}
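/*
 * Illustrative sketch (hypothetical helper, never compiled): an io_size of 0
 * is the "remove" form of vector_upl_set_subupl(); the routine returns TRUE
 * once every sub-UPL has been invalidated, which is the caller's cue that the
 * vector bookkeeping can be released.
 */
#if 0
static void
example_retire_subupl(upl_t vupl, upl_t subupl)
{
	if (vector_upl_set_subupl(vupl, subupl, 0) == TRUE) {
		/* that was the last live sub-UPL -- drop the vector state */
		vector_upl_deallocate(vupl);
	}
}
#endif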
7153
7154void
7155vector_upl_set_pagelist(upl_t upl)
7156{
7157 if(vector_upl_is_valid(upl)) {
7158 uint32_t i=0;
7159 vector_upl_t vector_upl = upl->vector_upl;
7160
7161 if(vector_upl) {
7162 vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0;
7163
7164 vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE));
7165
7166 for(i=0; i < vector_upl->num_upls; i++) {
7167 cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE;
7168 bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
7169 pagelist_size += cur_upl_pagelist_size;
7170 if(vector_upl->upl_elems[i]->highest_page > upl->highest_page)
7171 upl->highest_page = vector_upl->upl_elems[i]->highest_page;
7172 }
7173 assert( pagelist_size == (sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)) );
7174 }
7175 else
7176 panic("vector_upl_set_pagelist was passed a non-vectored upl\n");
7177 }
7178 else
7179 panic("vector_upl_set_pagelist was passed a NULL upl\n");
7180
7181}
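/*
 * Illustrative sketch (hypothetical helper, never compiled): once
 * vector_upl_set_pagelist() has concatenated the sub-UPL page lists, the
 * merged array can be walked with the ordinary upl_page_info_t accessors.
 * The example assumes the vector UPL covers a whole number of pages.
 */
#if 0
static int
example_count_dirty_pages(upl_t vupl)
{
	upl_page_info_t	*pl = upl_get_internal_vectorupl_pagelist(vupl);
	int		npages = upl_get_size(vupl) / PAGE_SIZE;
	int		i, ndirty = 0;

	for (i = 0; i < npages; i++) {
		if (upl_dirty_page(pl, i))
			ndirty++;
	}
	return ndirty;
}
#endif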
7182
7183upl_t
7184vector_upl_subupl_byindex(upl_t upl, uint32_t index)
7185{
7186 if(vector_upl_is_valid(upl)) {
7187 vector_upl_t vector_upl = upl->vector_upl;
7188 if(vector_upl) {
7189 if(index < vector_upl->num_upls)
7190 return vector_upl->upl_elems[index];
7191 }
7192 else
7193 panic("vector_upl_subupl_byindex was passed a non-vectored upl\n");
7194 }
7195 return NULL;
7196}
7197
7198upl_t
7199vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
7200{
7201 if(vector_upl_is_valid(upl)) {
7202 uint32_t i=0;
7203 vector_upl_t vector_upl = upl->vector_upl;
7204
7205 if(vector_upl) {
7206 upl_t subupl = NULL;
7207 vector_upl_iostates_t subupl_state;
7208
7209 for(i=0; i < vector_upl->num_upls; i++) {
7210 subupl = vector_upl->upl_elems[i];
7211 subupl_state = vector_upl->upl_iostates[i];
7212 if( *upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
7213 /* We could have been passed an offset/size pair that belongs
 7214	 * to a UPL element that has already been committed/aborted.
7215 * If so, return NULL.
7216 */
7217 if(subupl == NULL)
7218 return NULL;
7219 if((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
7220 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
7221 if(*upl_size > subupl_state.size)
7222 *upl_size = subupl_state.size;
7223 }
7224 if(*upl_offset >= subupl_state.offset)
7225 *upl_offset -= subupl_state.offset;
7226 else if(i)
7227 panic("Vector UPL offset miscalculation\n");
7228 return subupl;
7229 }
7230 }
7231 }
7232 else
7233 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL\n");
7234 }
7235 return NULL;
7236}
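/*
 * Worked example (numbers chosen purely for illustration): with two sub-UPLs
 * whose iostates are {offset 0, size 0x4000} and {offset 0x4000, size 0x8000},
 * a request of *upl_offset = 0x6000, *upl_size = 0x8000 falls in the second
 * element.  The size is trimmed to 0x6000 so the request ends at that
 * element's boundary, and the offset is rewritten to 0x2000, i.e. relative to
 * the sub-UPL -- exactly the translation the upl_encrypt() loop above relies
 * on when it walks a vector UPL piece by piece.
 */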
7237
7238void
7239vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
7240{
7241 *v_upl_submap = NULL;
7242
7243 if(vector_upl_is_valid(upl)) {
7244 vector_upl_t vector_upl = upl->vector_upl;
7245 if(vector_upl) {
7246 *v_upl_submap = vector_upl->submap;
7247 *submap_dst_addr = vector_upl->submap_dst_addr;
7248 }
7249 else
7250 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
7251 }
7252 else
7253 panic("vector_upl_get_submap was passed a null UPL\n");
7254}
7255
7256void
7257vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
7258{
7259 if(vector_upl_is_valid(upl)) {
7260 vector_upl_t vector_upl = upl->vector_upl;
7261 if(vector_upl) {
7262 vector_upl->submap = submap;
7263 vector_upl->submap_dst_addr = submap_dst_addr;
7264 }
7265 else
 7266			panic("vector_upl_set_submap was passed a non-vectored UPL\n");
7267 }
7268 else
 7269		panic("vector_upl_set_submap was passed a NULL UPL\n");
7270}
7271
7272void
7273vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
7274{
7275 if(vector_upl_is_valid(upl)) {
7276 uint32_t i = 0;
7277 vector_upl_t vector_upl = upl->vector_upl;
7278
7279 if(vector_upl) {
7280 for(i = 0; i < vector_upl->num_upls; i++) {
7281 if(vector_upl->upl_elems[i] == subupl)
7282 break;
7283 }
7284
7285 if(i == vector_upl->num_upls)
7286 panic("setting sub-upl iostate when none exists");
7287
7288 vector_upl->upl_iostates[i].offset = offset;
7289 if(size < PAGE_SIZE)
7290 size = PAGE_SIZE;
7291 vector_upl->upl_iostates[i].size = size;
7292 }
7293 else
7294 panic("vector_upl_set_iostate was passed a non-vectored UPL\n");
7295 }
7296 else
7297 panic("vector_upl_set_iostate was passed a NULL UPL\n");
7298}
7299
7300void
7301vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
7302{
7303 if(vector_upl_is_valid(upl)) {
7304 uint32_t i = 0;
7305 vector_upl_t vector_upl = upl->vector_upl;
7306
7307 if(vector_upl) {
7308 for(i = 0; i < vector_upl->num_upls; i++) {
7309 if(vector_upl->upl_elems[i] == subupl)
7310 break;
7311 }
7312
7313 if(i == vector_upl->num_upls)
7314 panic("getting sub-upl iostate when none exists");
7315
7316 *offset = vector_upl->upl_iostates[i].offset;
7317 *size = vector_upl->upl_iostates[i].size;
7318 }
7319 else
7320 panic("vector_upl_get_iostate was passed a non-vectored UPL\n");
7321 }
7322 else
7323 panic("vector_upl_get_iostate was passed a NULL UPL\n");
7324}
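/*
 * Illustrative sketch (hypothetical helper, never compiled): a completion
 * path can read back the recorded I/O state to learn which slice of the
 * vector UPL a given sub-UPL covered.
 */
#if 0
static void
example_log_subupl_extent(upl_t vupl, upl_t subupl)
{
	upl_offset_t	offset;
	upl_size_t	size;

	vector_upl_get_iostate(vupl, subupl, &offset, &size);
	printf("sub-UPL %p covers [0x%x, 0x%x) of its vector UPL\n",
	       (void *)subupl, offset, offset + size);
}
#endif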
7325
7326void
7327vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
7328{
7329 if(vector_upl_is_valid(upl)) {
7330 vector_upl_t vector_upl = upl->vector_upl;
7331 if(vector_upl) {
7332 if(index < vector_upl->num_upls) {
7333 *offset = vector_upl->upl_iostates[index].offset;
7334 *size = vector_upl->upl_iostates[index].size;
7335 }
7336 else
7337 *offset = *size = 0;
7338 }
7339 else
7340 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL\n");
7341 }
7342 else
7343 panic("vector_upl_get_iostate_byindex was passed a NULL UPL\n");
7344}
7345
7346upl_page_info_t *
7347upl_get_internal_vectorupl_pagelist(upl_t upl)
7348{
7349 return ((vector_upl_t)(upl->vector_upl))->pagelist;
7350}
7351
7352void *
7353upl_get_internal_vectorupl(upl_t upl)
7354{
7355 return upl->vector_upl;
7356}
7357
91447636
A
7358vm_size_t
7359upl_get_internal_pagelist_offset(void)
7360{
7361 return sizeof(struct upl);
7362}
7363
91447636
A
7364void
7365upl_clear_dirty(
0c530ab8
A
7366 upl_t upl,
7367 boolean_t value)
91447636 7368{
0c530ab8
A
7369 if (value) {
7370 upl->flags |= UPL_CLEAR_DIRTY;
7371 } else {
7372 upl->flags &= ~UPL_CLEAR_DIRTY;
7373 }
91447636
A
7374}
7375
7376
7377#ifdef MACH_BSD
1c79356b 7378
2d21ac55
A
7379boolean_t upl_device_page(upl_page_info_t *upl)
7380{
7381 return(UPL_DEVICE_PAGE(upl));
7382}
1c79356b
A
7383boolean_t upl_page_present(upl_page_info_t *upl, int index)
7384{
7385 return(UPL_PAGE_PRESENT(upl, index));
7386}
2d21ac55
A
7387boolean_t upl_speculative_page(upl_page_info_t *upl, int index)
7388{
7389 return(UPL_SPECULATIVE_PAGE(upl, index));
7390}
1c79356b
A
7391boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
7392{
7393 return(UPL_DIRTY_PAGE(upl, index));
7394}
7395boolean_t upl_valid_page(upl_page_info_t *upl, int index)
7396{
7397 return(UPL_VALID_PAGE(upl, index));
7398}
91447636 7399ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
1c79356b 7400{
91447636 7401 return(UPL_PHYS_PAGE(upl, index));
1c79356b
A
7402}
7403
2d21ac55 7404
0b4e3aa0
A
7405void
7406vm_countdirtypages(void)
1c79356b
A
7407{
7408 vm_page_t m;
7409 int dpages;
7410 int pgopages;
7411 int precpages;
7412
7413
7414 dpages=0;
7415 pgopages=0;
7416 precpages=0;
7417
7418 vm_page_lock_queues();
7419 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
7420 do {
7421 if (m ==(vm_page_t )0) break;
7422
7423 if(m->dirty) dpages++;
7424 if(m->pageout) pgopages++;
7425 if(m->precious) precpages++;
7426
91447636 7427 assert(m->object != kernel_object);
1c79356b
A
7428 m = (vm_page_t) queue_next(&m->pageq);
7429 if (m ==(vm_page_t )0) break;
7430
7431 } while (!queue_end(&vm_page_queue_inactive,(queue_entry_t) m));
7432 vm_page_unlock_queues();
9bccf70c 7433
2d21ac55
A
7434 vm_page_lock_queues();
7435 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
7436 do {
7437 if (m ==(vm_page_t )0) break;
7438
7439 dpages++;
7440 assert(m->dirty);
7441 assert(!m->pageout);
7442 assert(m->object != kernel_object);
7443 m = (vm_page_t) queue_next(&m->pageq);
7444 if (m ==(vm_page_t )0) break;
7445
7446 } while (!queue_end(&vm_page_queue_throttled,(queue_entry_t) m));
7447 vm_page_unlock_queues();
7448
9bccf70c
A
7449 vm_page_lock_queues();
7450 m = (vm_page_t) queue_first(&vm_page_queue_zf);
7451 do {
7452 if (m ==(vm_page_t )0) break;
7453
7454 if(m->dirty) dpages++;
7455 if(m->pageout) pgopages++;
7456 if(m->precious) precpages++;
7457
91447636 7458 assert(m->object != kernel_object);
9bccf70c
A
7459 m = (vm_page_t) queue_next(&m->pageq);
7460 if (m ==(vm_page_t )0) break;
7461
7462 } while (!queue_end(&vm_page_queue_zf,(queue_entry_t) m));
7463 vm_page_unlock_queues();
1c79356b
A
7464
7465 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
7466
7467 dpages=0;
7468 pgopages=0;
7469 precpages=0;
7470
7471 vm_page_lock_queues();
7472 m = (vm_page_t) queue_first(&vm_page_queue_active);
7473
7474 do {
7475 if(m == (vm_page_t )0) break;
7476 if(m->dirty) dpages++;
7477 if(m->pageout) pgopages++;
7478 if(m->precious) precpages++;
7479
91447636 7480 assert(m->object != kernel_object);
1c79356b
A
7481 m = (vm_page_t) queue_next(&m->pageq);
7482 if(m == (vm_page_t )0) break;
7483
7484 } while (!queue_end(&vm_page_queue_active,(queue_entry_t) m));
7485 vm_page_unlock_queues();
7486
7487 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
7488
7489}
7490#endif /* MACH_BSD */
7491
0c530ab8 7492ppnum_t upl_get_highest_page(
2d21ac55 7493 upl_t upl)
0c530ab8 7494{
2d21ac55 7495 return upl->highest_page;
0c530ab8
A
7496}
7497
b0d623f7
A
7498upl_size_t upl_get_size(
7499 upl_t upl)
7500{
7501 return upl->size;
7502}
7503
7504#if UPL_DEBUG
7505kern_return_t upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
1c79356b
A
7506{
7507 upl->ubc_alias1 = alias1;
7508 upl->ubc_alias2 = alias2;
7509 return KERN_SUCCESS;
7510}
b0d623f7 7511int upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
1c79356b
A
7512{
7513 if(al)
7514 *al = upl->ubc_alias1;
7515 if(al2)
7516 *al2 = upl->ubc_alias2;
7517 return KERN_SUCCESS;
7518}
91447636 7519#endif /* UPL_DEBUG */
1c79356b
A
7520
7521
7522
7523#if MACH_KDB
7524#include <ddb/db_output.h>
7525#include <ddb/db_print.h>
7526#include <vm/vm_print.h>
7527
7528#define printf kdbprintf
1c79356b
A
7529void db_pageout(void);
7530
7531void
7532db_vm(void)
7533{
1c79356b
A
7534
7535 iprintf("VM Statistics:\n");
7536 db_indent += 2;
7537 iprintf("pages:\n");
7538 db_indent += 2;
7539 iprintf("activ %5d inact %5d free %5d",
7540 vm_page_active_count, vm_page_inactive_count,
7541 vm_page_free_count);
7542 printf(" wire %5d gobbl %5d\n",
7543 vm_page_wire_count, vm_page_gobble_count);
1c79356b
A
7544 db_indent -= 2;
7545 iprintf("target:\n");
7546 db_indent += 2;
7547 iprintf("min %5d inact %5d free %5d",
7548 vm_page_free_min, vm_page_inactive_target,
7549 vm_page_free_target);
7550 printf(" resrv %5d\n", vm_page_free_reserved);
7551 db_indent -= 2;
1c79356b 7552 iprintf("pause:\n");
1c79356b
A
7553 db_pageout();
7554 db_indent -= 2;
7555}
7556
1c79356b 7557#if MACH_COUNTERS
91447636 7558extern int c_laundry_pages_freed;
1c79356b
A
7559#endif /* MACH_COUNTERS */
7560
91447636
A
7561void
7562db_pageout(void)
7563{
1c79356b
A
7564 iprintf("Pageout Statistics:\n");
7565 db_indent += 2;
7566 iprintf("active %5d inactv %5d\n",
7567 vm_pageout_active, vm_pageout_inactive);
7568 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
7569 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
7570 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
7571 iprintf("used %5d clean %5d dirty %5d\n",
7572 vm_pageout_inactive_used, vm_pageout_inactive_clean,
7573 vm_pageout_inactive_dirty);
1c79356b
A
7574#if MACH_COUNTERS
7575 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
7576#endif /* MACH_COUNTERS */
7577#if MACH_CLUSTER_STATS
7578 iprintf("Cluster Statistics:\n");
7579 db_indent += 2;
7580 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
7581 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
7582 vm_pageout_cluster_collisions);
7583 iprintf("clusters %5d conversions %5d\n",
7584 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
7585 db_indent -= 2;
7586 iprintf("Target Statistics:\n");
7587 db_indent += 2;
7588 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
7589 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
7590 vm_pageout_target_page_freed);
1c79356b
A
7591 db_indent -= 2;
7592#endif /* MACH_CLUSTER_STATS */
7593 db_indent -= 2;
7594}
7595
1c79356b 7596#endif /* MACH_KDB */