/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	Paging File Management.
 */
#include <mach/host_priv.h>
#include <mach/memory_object_control.h>
#include <mach/memory_object_server.h>

#include <default_pager/default_pager_internal.h>
#include <default_pager/default_pager_alerts.h>
#include <default_pager/default_pager_object_server.h>

#include <ipc/ipc_types.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_space.h>

#include <kern/kern_types.h>
#include <kern/host.h>
#include <kern/queue.h>
#include <kern/counters.h>
#include <kern/sched_prim.h>

#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_protos.h>
/* todo - need large internal object support */

/*
 * ALLOC_STRIDE... the maximum number of bytes allocated from
 * a swap file before moving on to the next swap file... if
 * all swap files reside on a single disk, this value should
 * be very large (this is the default assumption)... if the
 * swap files are spread across multiple disks, then this value
 * should be small (128 * 1024)...
 *
 * This should be determined dynamically in the future
 */

#define	ALLOC_STRIDE	(1024 * 1024 * 1024)
int physical_transfer_cluster_count = 0;

#define	VM_SUPER_CLUSTER	0x40000
#define	VM_SUPER_PAGES		(VM_SUPER_CLUSTER / PAGE_MIN_SIZE)

/*
 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
 */
#define	VSTRUCT_MIN_CLSHIFT	0

#define	VSTRUCT_DEF_CLSHIFT	2
int	default_pager_clsize = 0;

int	vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;

unsigned int	clustered_writes[VM_SUPER_PAGES+1];
unsigned int	clustered_reads[VM_SUPER_PAGES+1];
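
/*
 * Worked example of the cluster-size arithmetic above (a sketch, assuming a
 * 4 KB minimum page size; other page sizes scale accordingly):
 *
 *	VSTRUCT_DEF_CLSHIFT == 2    ->  1 << 2 == 4 pages per cluster == 16 KB
 *	VM_SUPER_CLUSTER == 0x40000 ==  256 KB
 *	VM_SUPER_PAGES   == 256 KB / 4 KB == 64 pages
 *
 * so the clustered_writes[] / clustered_reads[] histograms above have one
 * bucket for each possible I/O size from 0 to 64 pages.
 */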
/*
 * Globals used for asynchronous paging operations:
 *	vs_async_list:	head of list of to-be-completed I/O ops
 *	async_num_queued:	number of pages completed, but not yet
 *		processed by async thread.
 *	async_requests_out:	number of pages of requests not completed.
 */

struct vs_async	*vs_async_list;
int		async_num_queued;
int		async_requests_out;

#define	VS_ASYNC_REUSE	1
struct vs_async	*vs_async_free_list;

lck_mtx_t	default_pager_async_lock;	/* Protects globals above */

int	vs_alloc_async_failed = 0;		/* statistics */
int	vs_alloc_async_count = 0;		/* statistics */
struct vs_async *vs_alloc_async(void);		/* forward */
void	vs_free_async(struct vs_async *vsa);	/* forward */

#define	VS_ALLOC_ASYNC()	vs_alloc_async()
#define	VS_FREE_ASYNC(vsa)	vs_free_async(vsa)

#define	VS_ASYNC_LOCK()		lck_mtx_lock(&default_pager_async_lock)
#define	VS_ASYNC_UNLOCK()	lck_mtx_unlock(&default_pager_async_lock)
#define	VS_ASYNC_LOCK_INIT()	lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
#define	VS_ASYNC_LOCK_DESTROY()	lck_mtx_destroy(&default_pager_async_lock, &default_pager_lck_grp)
#define	VS_ASYNC_LOCK_ADDR()	(&default_pager_async_lock)
/*
 * Paging Space Hysteresis triggers and the target notification port
 */
unsigned int	dp_pages_free_drift_count = 0;
unsigned int	dp_pages_free_drifted_max = 0;
unsigned int	minimum_pages_remaining = 0;
unsigned int	maximum_pages_free = 0;
ipc_port_t	min_pages_trigger_port = NULL;
ipc_port_t	max_pages_trigger_port = NULL;

#if CONFIG_FREEZE
boolean_t	use_emergency_swap_file_first = TRUE;
#else
boolean_t	use_emergency_swap_file_first = FALSE;
#endif
boolean_t	bs_low = FALSE;
int		backing_store_release_trigger_disable = 0;
boolean_t	backing_store_stop_compaction = FALSE;
boolean_t	backing_store_abort_compaction = FALSE;

/* Have we decided if swap needs to be encrypted yet ? */
boolean_t	dp_encryption_inited = FALSE;
/* Should we encrypt swap ? */
boolean_t	dp_encryption = FALSE;

boolean_t	dp_isssd = FALSE;
/*
 * Object sizes are rounded up to the next power of 2,
 * unless they are bigger than a given maximum size.
 */
vm_size_t	max_doubled_size = 4 * 1024 * 1024;	/* 4 meg */

/*
 * List of all backing store and segments.
 */
MACH_PORT_FACE			emergency_segment_backing_store;
struct backing_store_list_head	backing_store_list;
paging_segment_t		paging_segments[MAX_NUM_PAGING_SEGMENTS];
lck_mtx_t			paging_segments_lock;
int				paging_segment_max = 0;
int				paging_segment_count = 0;
int				ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
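
/*
 * Roughly: ps_select_array[] holds, for each backing-store priority, the
 * index of the paging segment that ps_select_segment() should try next.
 * BS_NOPRI marks a priority with no segments registered yet, and BS_FULLPRI
 * marks a priority whose segments all appeared full on the last scan;
 * ps_enter() and the cluster-free paths reset an entry to 0 to force a
 * fresh scan of that priority.
 */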
/*
 * Total pages free in system
 * This differs from clusters committed/avail which is a measure of the
 * over commitment of paging segments to backing store.  An idea which is
 * likely to be deprecated.
 */
unsigned int	dp_pages_free = 0;
unsigned int	dp_pages_reserve = 0;
unsigned int	cluster_transfer_minimum = 100;

struct ps_vnode_trim_data {
	struct vnode	*vp;
	dp_offset_t	offset;
	dp_size_t	length;
};
/* forward declarations */
kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int);	/* forward */
kern_return_t ps_read_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int);	/* forward */
default_pager_thread_t *get_read_buffer( void );
kern_return_t ps_vstruct_transfer_from_segment(
	vstruct_t	 vs,
	paging_segment_t segment,
	upl_t		 upl);
kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int);	/* forward */
kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *);	/* forward */
kern_return_t vs_cluster_transfer(vstruct_t, dp_offset_t, dp_size_t, upl_t);
vs_map_t vs_get_map_entry(vstruct_t, dp_offset_t);

kern_return_t
default_pager_backing_store_delete_internal( MACH_PORT_FACE );

static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data);
static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data);
static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length);
default_pager_thread_t *
get_read_buffer( void )
{
	int	i;

	DPT_LOCK(dpt_lock);
	while(TRUE) {
		for (i=0; i<default_pager_internal_count; i++) {
			if(dpt_array[i]->checked_out == FALSE) {
				dpt_array[i]->checked_out = TRUE;
				DPT_UNLOCK(dpt_lock);
				return dpt_array[i];
			}
		}
		DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
	}
}
void
bs_initialize(void)
{
	int	i;

	/*
	 * List of all backing store.
	 */
	queue_init(&backing_store_list.bsl_queue);

	VS_ASYNC_LOCK_INIT();
#if	VS_ASYNC_REUSE
	vs_async_free_list = NULL;
#endif	/* VS_ASYNC_REUSE */

	for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
		clustered_writes[i] = 0;
		clustered_reads[i] = 0;
	}
}
/*
 * When things do not quite work out...
 */
void bs_no_paging_space(boolean_t);	/* forward */

void
bs_no_paging_space(
	boolean_t out_of_memory)
{

	if (out_of_memory)
		dprintf(("*** OUT OF MEMORY ***\n"));
	panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
}

void bs_more_space(int);	/* forward */
void bs_commit(int);		/* forward */

boolean_t	user_warned = FALSE;
unsigned int	clusters_committed = 0;
unsigned int	clusters_available = 0;
unsigned int	clusters_committed_peak = 0;
void
bs_more_space(
	int	nclusters)
{
	/*
	 * Account for new paging space.
	 */
	clusters_available += nclusters;

	if (clusters_available >= clusters_committed) {
		if (verbose && user_warned) {
			printf("%s%s - %d excess clusters now.\n",
			       my_name,
			       "paging space is OK now",
			       clusters_available - clusters_committed);
			user_warned = FALSE;
			clusters_committed_peak = 0;
		}
	} else {
		if (verbose && user_warned) {
			printf("%s%s - still short of %d clusters.\n",
			       my_name,
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
			clusters_committed_peak -= nclusters;
		}
	}

	return;
}
void
bs_commit(
	int	nclusters)
{
	clusters_committed += nclusters;
	if (clusters_committed > clusters_available) {
		if (verbose && !user_warned) {
			user_warned = TRUE;
			printf("%s%s - short of %d clusters.\n",
			       my_name,
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
		}
		if (clusters_committed > clusters_committed_peak) {
			clusters_committed_peak = clusters_committed;
		}
	} else {
		if (verbose && user_warned) {
			printf("%s%s - was short of up to %d clusters.\n",
			       my_name,
			       "paging space is OK now",
			       clusters_committed_peak - clusters_available);
			user_warned = FALSE;
			clusters_committed_peak = 0;
		}
	}

	return;
}
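
/*
 * A small worked example of the hysteresis above (numbers are illustrative,
 * not from the original source): with clusters_available == 100, committing
 * 120 clusters via bs_commit() leaves the pager 20 clusters over-committed
 * and records clusters_committed_peak == 120; a later bs_more_space(30)
 * brings clusters_available to 130 >= clusters_committed, so (assuming
 * verbose reporting was on and the user had been warned) the "paging space
 * is OK now" path runs and the peak is reset to 0.
 */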
int	default_pager_info_verbose = 1;

void
bs_global_info(
	uint64_t	*totalp,
	uint64_t	*freep)
{
	uint64_t		pages_total, pages_free;
	paging_segment_t	ps;
	int			i;

	pages_total = pages_free = 0;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;

		/*
		 * no need to lock: by the time this data
		 * gets back to any remote requestor it
		 * will be obsolete anyways
		 */
		pages_total += ps->ps_pgnum;
		pages_free += ps->ps_clcount << ps->ps_clshift;
		DP_DEBUG(DEBUG_BS_INTERNAL,
			 ("segment #%d: %d total, %d free\n",
			  i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
	}
	*totalp = pages_total;
	*freep = pages_free;

	if (verbose && user_warned && default_pager_info_verbose) {
		if (clusters_available < clusters_committed) {
			printf("%s %d clusters committed, %d available.\n",
			       my_name,
			       clusters_committed,
			       clusters_available);
		}
	}
}
backing_store_t backing_store_alloc(void);	/* forward */

backing_store_t
backing_store_alloc(void)
{
	backing_store_t	bs;

	bs = (backing_store_t) kalloc(sizeof (struct backing_store));
	if (bs == BACKING_STORE_NULL)
		panic("backing_store_alloc: no memory");

	bs->bs_port = MACH_PORT_NULL;
	bs->bs_pages_total = 0;
	bs->bs_pages_in = 0;
	bs->bs_pages_in_fail = 0;
	bs->bs_pages_out = 0;
	bs->bs_pages_out_fail = 0;

	return bs;
}
backing_store_t backing_store_lookup(MACH_PORT_FACE);	/* forward */

/* Even in both the component space and external versions of this pager, */
/* backing_store_lookup will be called from tasks in the application space */
backing_store_t
backing_store_lookup(
	MACH_PORT_FACE port)
{
	backing_store_t	bs;

/*
	port is currently backed with a vs structure in the alias field
	we could create an ISBS alias and a port_is_bs call but frankly
	I see no reason for the test, the bs->port == port check below
	will work properly on junk entries.

	if ((port == MACH_PORT_NULL) || port_is_vs(port))
*/
	if (port == MACH_PORT_NULL)
		return BACKING_STORE_NULL;

	queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
		      bs_links) {
		if (bs->bs_port == port) {
			/* Success, return it locked. */
			return bs;
		}
	}
	return BACKING_STORE_NULL;
}
void backing_store_add(backing_store_t);	/* forward */

void
backing_store_add(
	__unused backing_store_t bs)
{
//	MACH_PORT_FACE		port = bs->bs_port;
//	MACH_PORT_FACE		pset = default_pager_default_set;
	kern_return_t		kr = KERN_SUCCESS;

	if (kr != KERN_SUCCESS)
		panic("backing_store_add: add to set");
}
/*
 * Set up default page shift, but only if not already
 * set and argument is within range.
 */
boolean_t
bs_set_default_clsize(unsigned int npages)
{
	switch(npages) {
	case 1:
	case 2:
	case 4:
	case 8:
		if (default_pager_clsize == 0)	/* if not yet set */
			vstruct_def_clshift = local_log2(npages);
		return(TRUE);
	}
	return(FALSE);
}

int bs_get_global_clsize(int clsize);	/* forward */
int
bs_get_global_clsize(
	int	clsize)
{
	int			i;
	memory_object_default_t	dmm;
	kern_return_t		kr;

	/*
	 * Only allow setting of cluster size once. If called
	 * with no cluster size (default), we use the compiled-in default
	 * for the duration. The same cluster size is used for all
	 * paging segments.
	 */
	if (default_pager_clsize == 0) {
		/*
		 * Keep cluster size in bit shift because it's quicker
		 * arithmetic, and easier to keep at a power of 2.
		 */
		if (clsize != NO_CLSIZE) {
			for (i = 0; (1 << i) < clsize; i++);
			if (i > MAX_CLUSTER_SHIFT)
				i = MAX_CLUSTER_SHIFT;
			vstruct_def_clshift = i;
		}
		default_pager_clsize = (1 << vstruct_def_clshift);

		/*
		 * Let the user know the new (and definitive) cluster size.
		 */
		printf("%scluster size = %d page%s\n",
		       my_name, default_pager_clsize,
		       (default_pager_clsize == 1) ? "" : "s");

		/*
		 * Let the kernel know too, in case it hasn't used the
		 * default value provided in main() yet.
		 */
		dmm = default_pager_object;
		clsize = default_pager_clsize * vm_page_size;	/* in bytes */
		kr = host_default_memory_manager(host_priv_self(),
						 &dmm,
						 clsize);
		memory_object_default_deallocate(dmm);

		if (kr != KERN_SUCCESS) {
			panic("bs_get_global_cl_size:host_default_memory_manager");
		}
		if (dmm != default_pager_object) {
			panic("bs_get_global_cl_size:there is another default pager");
		}
	}
	ASSERT(default_pager_clsize > 0 &&
	       (default_pager_clsize & (default_pager_clsize - 1)) == 0);

	return default_pager_clsize;
}
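
/*
 * Example of the rounding performed above (illustrative; clsize is counted
 * in pages here): a request for clsize == 5 runs the loop until
 * (1 << i) >= 5, giving i == 3, so the definitive cluster size becomes
 * 1 << 3 == 8 pages; i is clamped to MAX_CLUSTER_SHIFT if the request is
 * larger than the supported maximum.
 */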
kern_return_t
default_pager_backing_store_create(
	memory_object_default_t	pager,
	int			priority,
	int			clsize,		/* in bytes */
	MACH_PORT_FACE		*backing_store)
{
	backing_store_t		bs;
	MACH_PORT_FACE		port;
	struct vstruct_alias	*alias_struct;

	if (pager != default_pager_object)
		return KERN_INVALID_ARGUMENT;

	bs = backing_store_alloc();
	port = ipc_port_alloc_kernel();
	ipc_port_make_send(port);
	assert (port != IP_NULL);

	DP_DEBUG(DEBUG_BS_EXTERNAL,
		 ("priority=%d clsize=%d bs_port=0x%x\n",
		  priority, clsize, (int) backing_store));

	alias_struct = (struct vstruct_alias *)
		kalloc(sizeof (struct vstruct_alias));
	if(alias_struct != NULL) {
		alias_struct->vs = (struct vstruct *)bs;
		alias_struct->name = &default_pager_ops;
		port->ip_alias = (uintptr_t) alias_struct;
	}
	else {
		ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));

		kfree(bs, sizeof (struct backing_store));

		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_port = port;
	if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
		priority = BS_MAXPRI;
	else if (priority == BS_NOPRI)
		priority = BS_MAXPRI;
	else
		priority = BS_MINPRI;
	bs->bs_priority = priority;

	bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));

	queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
		    bs_links);

	backing_store_add(bs);

	*backing_store = port;
	return KERN_SUCCESS;
}
kern_return_t
default_pager_backing_store_info(
	MACH_PORT_FACE		backing_store,
	backing_store_flavor_t	flavour,
	backing_store_info_t	info,
	mach_msg_type_number_t	*size)
{
	backing_store_t			bs;
	backing_store_basic_info_t	basic;
	int				i;
	paging_segment_t		ps;

	if (flavour != BACKING_STORE_BASIC_INFO ||
	    *size < BACKING_STORE_BASIC_INFO_COUNT)
		return KERN_INVALID_ARGUMENT;

	basic = (backing_store_basic_info_t)info;
	*size = BACKING_STORE_BASIC_INFO_COUNT;

	VSTATS_LOCK(&global_stats.gs_lock);
	basic->pageout_calls		= global_stats.gs_pageout_calls;
	basic->pagein_calls		= global_stats.gs_pagein_calls;
	basic->pages_in			= global_stats.gs_pages_in;
	basic->pages_out		= global_stats.gs_pages_out;
	basic->pages_unavail		= global_stats.gs_pages_unavail;
	basic->pages_init		= global_stats.gs_pages_init;
	basic->pages_init_writes	= global_stats.gs_pages_init_writes;
	VSTATS_UNLOCK(&global_stats.gs_lock);

	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	basic->bs_pages_total	= bs->bs_pages_total;
	bs->bs_pages_free = 0;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
			bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
		}
	}
	basic->bs_pages_free	= bs->bs_pages_free;
	basic->bs_pages_in	= bs->bs_pages_in;
	basic->bs_pages_in_fail	= bs->bs_pages_in_fail;
	basic->bs_pages_out	= bs->bs_pages_out;
	basic->bs_pages_out_fail = bs->bs_pages_out_fail;

	basic->bs_priority	= bs->bs_priority;
	basic->bs_clsize	= ptoa_32(bs->bs_clsize);	/* in bytes */

	return KERN_SUCCESS;
}
int ps_delete(paging_segment_t);	/* forward */
boolean_t current_thread_aborted(void);

int
ps_delete(
	paging_segment_t ps)
{
	vstruct_t	vs;
	kern_return_t	error = KERN_SUCCESS;
	int		vs_count;

	VSL_LOCK();	/* get the lock on the list of vs's */

	/* The lock relationship and sequence is fairly complicated */
	/* this code looks at a live list, locking and unlocking the list */
	/* as it traverses it.  It depends on the locking behavior of */
	/* default_pager_no_senders.  no_senders always locks the vstruct */
	/* targeted for removal before locking the vstruct list.  However */
	/* it will remove that member of the list without locking its */
	/* neighbors.  We can be sure when we hold a lock on a vstruct */
	/* it cannot be removed from the list but we must hold the list */
	/* lock to be sure that its pointers to its neighbors are valid. */
	/* Also, we can hold off destruction of a vstruct when the list */
	/* lock and the vs locks are not being held by bumping the */
	/* vs_async_pending count. */

	while(backing_store_release_trigger_disable != 0) {
		VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
	}

	/* we will choose instead to hold a send right */
	vs_count = vstruct_list.vsl_count;
	vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
	if(vs == (vstruct_t)&vstruct_list) {
		VSL_UNLOCK();
		return KERN_SUCCESS;
	}
	VS_LOCK(vs);
	vs_async_wait(vs);	/* wait for any pending async writes */
	if ((vs_count != 0) && (vs != NULL))
		vs->vs_async_pending += 1;	/* hold parties calling  */
						/* vs_async_wait below   */

	backing_store_abort_compaction = FALSE;

	VS_UNLOCK(vs);
	VSL_UNLOCK();
	while((vs_count != 0) && (vs != NULL)) {
		/* We take the count of AMO's before beginning the */
		/* transfer of the target segment. */
		/* We are guaranteed that the target segment cannot get */
		/* more users.  We also know that queue entries are */
		/* made at the back of the list.  If some of the entries */
		/* we would check disappear while we are traversing the */
		/* list then we will either check new entries which */
		/* do not have any backing store in the target segment */
		/* or re-check old entries.  This might not be optimal */
		/* but it will always be correct. The alternative is to */
		/* take a snapshot of the list. */
		vstruct_t	next_vs;

		if(dp_pages_free < cluster_transfer_minimum)
			error = KERN_FAILURE;
		else {
			vm_object_t	transfer_object;
			unsigned int	count;
			upl_t		upl;
			upl_control_flags_t upl_flags;

			transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
			count = 0;
			upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE |
				     UPL_SET_LITE | UPL_SET_INTERNAL);
			if (dp_encryption) {
				/* mark the pages as "encrypted" when they come in */
				upl_flags |= UPL_ENCRYPT;
			}
			error = vm_object_upl_request(transfer_object,
						      (vm_object_offset_t)0, VM_SUPER_CLUSTER,
						      &upl, NULL, &count, upl_flags);

			if(error == KERN_SUCCESS) {
				error = ps_vstruct_transfer_from_segment(
						vs, ps, upl);
				upl_commit(upl, NULL, 0);
			} else {
				error = KERN_FAILURE;
			}
			vm_object_deallocate(transfer_object);
		}
		if(error || current_thread_aborted()) {
			VS_LOCK(vs);
			vs->vs_async_pending -= 1;	/* release vs_async_wait */
			if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
				vs->vs_waiting_async = FALSE;
				VS_UNLOCK(vs);
				thread_wakeup(&vs->vs_async_pending);
			} else {
				VS_UNLOCK(vs);
			}
			return KERN_FAILURE;
		}

		VSL_LOCK();

		while(backing_store_release_trigger_disable != 0) {
			VSL_SLEEP(&backing_store_release_trigger_disable,
				  THREAD_UNINT);
		}

		next_vs = (vstruct_t) queue_next(&(vs->vs_links));
		if((next_vs != (vstruct_t)&vstruct_list) &&
		   (vs != next_vs) && (vs_count != 1)) {
			VS_LOCK(next_vs);
			vs_async_wait(next_vs);		/* wait for any */
							/* pending async writes */
			next_vs->vs_async_pending += 1;	/* hold parties */
							/* calling vs_async_wait */
			VS_UNLOCK(next_vs);
		}
		VSL_UNLOCK();
		VS_LOCK(vs);
		vs->vs_async_pending -= 1;
		if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
			vs->vs_waiting_async = FALSE;
			VS_UNLOCK(vs);
			thread_wakeup(&vs->vs_async_pending);
		} else {
			VS_UNLOCK(vs);
		}
		if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
			vs = NULL;
		else
			vs = next_vs;
		vs_count--;
	}
	return KERN_SUCCESS;
}
kern_return_t
default_pager_backing_store_delete_internal(
	MACH_PORT_FACE backing_store)
{
	backing_store_t		bs;
	int			i;
	paging_segment_t	ps;
	int			error;
	int			interim_pages_removed = 0;
	boolean_t		dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );

	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	error = KERN_SUCCESS;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL &&
		    ps->ps_bs == bs &&
		    ! IS_PS_GOING_AWAY(ps)) {

			if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
				/*
				 * Someone is already busy reclaiming this paging segment.
				 * If it's the emergency segment we are looking at then check
				 * that someone has not already recovered it and set the right
				 * state i.e. online but not activated.
				 */
				continue;
			}

			/* disable access to this segment */
			ps->ps_state &= ~PS_CAN_USE;
			ps->ps_state |= PS_GOING_AWAY;
			/*
			 * The "ps" segment is "off-line" now,
			 * we can try and delete it...
			 */
			if(dp_pages_free < (cluster_transfer_minimum
					    + ps->ps_pgcount)) {
				error = KERN_FAILURE;
			}
			else {
				/* remove all pages associated with the */
				/* segment from the list of free pages */
				/* when transfer is through, all target */
				/* segment pages will appear to be free */

				dp_pages_free -= ps->ps_pgcount;
				interim_pages_removed += ps->ps_pgcount;
				error = ps_delete(ps);
			}
			if (error != KERN_SUCCESS) {
				/*
				 * We couldn't delete the segment,
				 * probably because there's not enough
				 * virtual memory left.
				 * Re-enable all the segments.
				 */
				break;
			}
		}
	}

	if (error != KERN_SUCCESS) {
		for (i = 0; i <= paging_segment_max; i++) {
			ps = paging_segments[i];
			if (ps != PAGING_SEGMENT_NULL &&
			    ps->ps_bs == bs &&
			    IS_PS_GOING_AWAY(ps)) {

				if( !IS_PS_GOING_AWAY(ps)) {
					continue;
				}
				/* Handle the special clusters that came in while we let go the lock*/
				if( ps->ps_special_clusters) {
					dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
					ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
					ps->ps_clcount += ps->ps_special_clusters;
					if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
						ps_select_array[ps->ps_bs->bs_priority] = 0;
					}
					ps->ps_special_clusters = 0;
				}
				/* re-enable access to this segment */
				ps->ps_state &= ~PS_GOING_AWAY;
				ps->ps_state |= PS_CAN_USE;
			}
		}
		dp_pages_free += interim_pages_removed;
		return error;
	}

	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL &&
		    ps->ps_bs == bs) {
			if(IS_PS_GOING_AWAY(ps)) {
				if(IS_PS_EMERGENCY_SEGMENT(ps)) {
					ps->ps_state &= ~PS_GOING_AWAY;
					ps->ps_special_clusters = 0;
					ps->ps_pgcount = ps->ps_pgnum;
					ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
					dp_pages_reserve += ps->ps_pgcount;
				} else {
					paging_segments[i] = PAGING_SEGMENT_NULL;
					paging_segment_count--;
					kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
					kfree(ps, sizeof *ps);
				}
			}
		}
	}

	/* Scan the entire ps array separately to make certain we find the */
	/* proper paging_segment_max */
	for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
		if(paging_segments[i] != PAGING_SEGMENT_NULL)
			paging_segment_max = i;
	}

	if( dealing_with_emergency_segment ) {
		return KERN_SUCCESS;
	}

	/*
	 * All the segments have been deleted.
	 * We can remove the backing store.
	 */

	/*
	 * Disable lookups of this backing store.
	 */
	if((void *)bs->bs_port->ip_alias != NULL)
		kfree((void *) bs->bs_port->ip_alias,
		      sizeof (struct vstruct_alias));
	ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
	bs->bs_port = MACH_PORT_NULL;

	/*
	 * Remove backing store from backing_store list.
	 */
	queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
		     bs_links);

	/*
	 * Free the backing store structure.
	 */
	BS_LOCK_DESTROY(bs);
	kfree(bs, sizeof *bs);

	return KERN_SUCCESS;
}

kern_return_t
default_pager_backing_store_delete(
	MACH_PORT_FACE backing_store)
{
	if( backing_store != emergency_segment_backing_store ) {
		default_pager_backing_store_delete_internal(emergency_segment_backing_store);
	}
	return(default_pager_backing_store_delete_internal(backing_store));
}
int ps_enter(paging_segment_t);	/* forward */

int
ps_enter(
	paging_segment_t ps)
{
	int i;

	for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
		if (paging_segments[i] == PAGING_SEGMENT_NULL)
			break;
	}

	if (i < MAX_NUM_PAGING_SEGMENTS) {
		paging_segments[i] = ps;
		if (i > paging_segment_max)
			paging_segment_max = i;
		paging_segment_count++;
		if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
		    (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
			ps_select_array[ps->ps_bs->bs_priority] = 0;
		i = 0;
	} else {
		return KERN_RESOURCE_SHORTAGE;
	}

	return i;
}
#ifdef DEVICE_PAGING
kern_return_t
default_pager_add_segment(
	MACH_PORT_FACE	backing_store,
	MACH_PORT_FACE	device,
	recnum_t	offset,
	recnum_t	count,
	int		record_size)
{
	backing_store_t		bs;
	paging_segment_t	ps;
	int			i;
	int			error;

	if ((bs = backing_store_lookup(backing_store))
	    == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;

		/*
		 * Check for overlap on same device.
		 */
		if (!(ps->ps_device != device
		      || offset >= ps->ps_offset + ps->ps_recnum
		      || offset + count <= ps->ps_offset)) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	/*
	 * Set up the paging segment
	 */
	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
	if (ps == PAGING_SEGMENT_NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	ps->ps_segtype = PS_PARTITION;
	ps->ps_device = device;
	ps->ps_offset = offset;
	ps->ps_record_shift = local_log2(vm_page_size / record_size);
	ps->ps_recnum = count;
	ps->ps_pgnum = count >> ps->ps_record_shift;

	ps->ps_pgcount = ps->ps_pgnum;
	ps->ps_clshift = local_log2(bs->bs_clsize);
	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;

	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
	if (!ps->ps_bmap) {
		PS_LOCK_DESTROY(ps);
		kfree(ps, sizeof *ps);
		return KERN_RESOURCE_SHORTAGE;
	}
	for (i = 0; i < ps->ps_ncls; i++) {
		clrbit(ps->ps_bmap, i);
	}

	if(paging_segment_count == 0) {
		ps->ps_state = PS_EMERGENCY_SEGMENT;
		if(use_emergency_swap_file_first) {
			ps->ps_state |= PS_CAN_USE;
		}
	} else {
		ps->ps_state = PS_CAN_USE;
	}

	ps->ps_bs = bs;

	if ((error = ps_enter(ps)) != 0) {
		kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));

		PS_LOCK_DESTROY(ps);
		kfree(ps, sizeof *ps);
		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;

	if(IS_PS_OK_TO_USE(ps)) {
		dp_pages_free += ps->ps_pgcount;
	} else {
		dp_pages_reserve += ps->ps_pgcount;
	}

	bs_more_space(ps->ps_clcount);

	DP_DEBUG(DEBUG_BS_INTERNAL,
		 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
		  device, offset, count, record_size,
		  ps->ps_record_shift, ps->ps_pgnum));

	return KERN_SUCCESS;
}
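
/*
 * Sizing example for the segment set up above (illustrative numbers,
 * assuming 4 KB pages, 512-byte device records and a 4-page cluster size):
 *	ps_record_shift = local_log2(4096 / 512) = 3
 *	count = 2097152 records  ->  ps_pgnum = 2097152 >> 3 = 262144 pages (1 GB)
 *	ps_clshift = local_log2(4) = 2  ->  ps_clcount = 262144 >> 2 = 65536 clusters
 * and RMAPSIZE(ps_ncls) sizes the cluster allocation bitmap at one bit per
 * cluster.
 */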
boolean_t
bs_add_device(
	char		*dev_name,
	MACH_PORT_FACE	master)
{
	security_token_t	null_security_token = {
		{ 0, 0 }
	};
	MACH_PORT_FACE		device;
	int			info[DEV_GET_SIZE_COUNT];
	mach_msg_type_number_t	info_count;
	MACH_PORT_FACE		bs = MACH_PORT_NULL;
	unsigned int		rec_size;
	recnum_t		count;
	int			clsize;
	MACH_PORT_FACE		reply_port;

	if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
				null_security_token, dev_name, &device))
		return FALSE;

	info_count = DEV_GET_SIZE_COUNT;
	if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
		rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
		count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
		clsize = bs_get_global_clsize(0);
		if (!default_pager_backing_store_create(
			default_pager_object,
			DEFAULT_PAGER_BACKING_STORE_MAXPRI,
			(clsize * vm_page_size),
			&bs)) {
			if (!default_pager_add_segment(bs, device,
						       0, count, rec_size)) {
				return TRUE;
			}
			ipc_port_release_receive(bs);
		}
	}

	ipc_port_release_send(device);
	return FALSE;
}
#endif /* DEVICE_PAGING */
#if VS_ASYNC_REUSE

struct vs_async *
vs_alloc_async(void)
{
	struct vs_async	*vsa;
	MACH_PORT_FACE	reply_port;
//	kern_return_t	kr;

	VS_ASYNC_LOCK();
	if (vs_async_free_list == NULL) {
		VS_ASYNC_UNLOCK();
		vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
		if (vsa != NULL) {
			/*
			 * Try allocating a reply port named after the
			 * address of the vs_async structure.
			 */
			struct vstruct_alias	*alias_struct;

			reply_port = ipc_port_alloc_kernel();
			alias_struct = (struct vstruct_alias *)
				kalloc(sizeof (struct vstruct_alias));
			if(alias_struct != NULL) {
				__IGNORE_WCASTALIGN(alias_struct->vs = (struct vstruct *)vsa);
				alias_struct->name = &default_pager_ops;
				reply_port->ip_alias = (uintptr_t) alias_struct;
				vsa->reply_port = reply_port;
				vs_alloc_async_count++;
			}
			else {
				vs_alloc_async_failed++;
				ipc_port_dealloc_kernel((MACH_PORT_FACE)
							(reply_port));
				kfree(vsa, sizeof (struct vs_async));
				vsa = NULL;
			}
		}
	} else {
		vsa = vs_async_free_list;
		vs_async_free_list = vs_async_free_list->vsa_next;
		VS_ASYNC_UNLOCK();
	}

	return vsa;
}

void
vs_free_async(
	struct vs_async *vsa)
{
	VS_ASYNC_LOCK();
	vsa->vsa_next = vs_async_free_list;
	vs_async_free_list = vsa;
	VS_ASYNC_UNLOCK();
}
#else	/* VS_ASYNC_REUSE */

struct vs_async *
vs_alloc_async(void)
{
	struct vs_async	*vsa;
	MACH_PORT_FACE	reply_port;
	kern_return_t	kr;

	vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
	if (vsa != NULL) {
		/*
		 * Try allocating a reply port named after the
		 * address of the vs_async structure.
		 */
		reply_port = ipc_port_alloc_kernel();
		alias_struct = (vstruct_alias *)
			kalloc(sizeof (struct vstruct_alias));
		if(alias_struct != NULL) {
			alias_struct->vs = reply_port;
			alias_struct->name = &default_pager_ops;
			reply_port->defpager_importance.alias = (int) vsa;
			vsa->reply_port = reply_port;
			vs_alloc_async_count++;
		}
		else {
			vs_alloc_async_failed++;
			ipc_port_dealloc_kernel((MACH_PORT_FACE)
						(reply_port));
			kfree(vsa, sizeof (struct vs_async));
			vsa = NULL;
		}
	}

	return vsa;
}

void
vs_free_async(
	struct vs_async *vsa)
{
	MACH_PORT_FACE	reply_port;
	kern_return_t	kr;

	reply_port = vsa->reply_port;
	kfree(reply_port->ip_alias, sizeof (struct vstruct_alias));
	kfree(vsa, sizeof (struct vs_async));
	ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
#if 0
	VS_ASYNC_LOCK();
	vs_alloc_async_count--;
	VS_ASYNC_UNLOCK();
#endif
}

#endif	/* VS_ASYNC_REUSE */
zone_t	vstruct_zone;

vstruct_t
ps_vstruct_create(
	dp_size_t size)
{
	vstruct_t	vs;
	unsigned int	i;

	vs = (vstruct_t) zalloc(vstruct_zone);
	if (vs == VSTRUCT_NULL) {
		return VSTRUCT_NULL;
	}

	/*
	 * The following fields will be provided later.
	 */
	vs->vs_pager_ops = NULL;
	vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
	vs->vs_references = 1;

	vs->vs_waiting_seqno = FALSE;
	vs->vs_waiting_read = FALSE;
	vs->vs_waiting_write = FALSE;
	vs->vs_waiting_async = FALSE;

	vs->vs_clshift = local_log2(bs_get_global_clsize(0));
	vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
	vs->vs_async_pending = 0;

	/*
	 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
	 * depending on the size of the memory object.
	 */
	if (INDIRECT_CLMAP(vs->vs_size)) {
		vs->vs_imap = (struct vs_map **)
			kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
		vs->vs_indirect = TRUE;
	} else {
		vs->vs_dmap = (struct vs_map *)
			kalloc(CLMAP_SIZE(vs->vs_size));
		vs->vs_indirect = FALSE;
	}
	vs->vs_xfer_pending = FALSE;
	DP_DEBUG(DEBUG_VS_INTERNAL,
		 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));

	/*
	 * Check to see that we got the space.
	 */
	if (!vs->vs_dmap) {
		kfree(vs, sizeof *vs);
		return VSTRUCT_NULL;
	}

	/*
	 * Zero the indirect pointers, or clear the direct pointers.
	 */
	if (vs->vs_indirect)
		memset(vs->vs_imap, 0,
		       INDIRECT_CLMAP_SIZE(vs->vs_size));
	else
		for (i = 0; i < vs->vs_size; i++)
			VSM_CLR(vs->vs_dmap[i]);

	VS_MAP_LOCK_INIT(vs);

	bs_commit(vs->vs_size);

	return vs;
}
paging_segment_t ps_select_segment(unsigned int, int *);	/* forward */

paging_segment_t
ps_select_segment(
	unsigned int	shift,
	int		*psindex)
{
	paging_segment_t	ps;
	int			i;
	int			j;

	/*
	 * Optimize case where there's only one segment.
	 * paging_segment_max will index the one and only segment.
	 */

	if (paging_segment_count == 1) {
		paging_segment_t lps = PAGING_SEGMENT_NULL;	/* used to avoid extra PS_UNLOCK */
		ipc_port_t trigger = IP_NULL;

		ps = paging_segments[paging_segment_max];
		*psindex = paging_segment_max;

		if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
			panic("Emergency paging segment missing\n");
		}
		ASSERT(ps->ps_clshift >= shift);
		if(IS_PS_OK_TO_USE(ps)) {
			if (ps->ps_clcount) {
				ps->ps_clcount--;
				dp_pages_free -= 1 << ps->ps_clshift;
				ps->ps_pgcount -= 1 << ps->ps_clshift;
				if(min_pages_trigger_port &&
				   (dp_pages_free < minimum_pages_remaining)) {
					trigger = min_pages_trigger_port;
					min_pages_trigger_port = NULL;
					backing_store_abort_compaction = TRUE;
				}
				lps = ps;
			}
		}

		if( lps == PAGING_SEGMENT_NULL ) {
			if(dp_pages_free) {
				dp_pages_free_drift_count++;
				if(dp_pages_free > dp_pages_free_drifted_max) {
					dp_pages_free_drifted_max = dp_pages_free;
				}
				dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
			}
			dp_pages_free = 0;
		}

		if (trigger != IP_NULL) {
			dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));

			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}
		return lps;
	}

	if (paging_segment_count == 0) {
		if(dp_pages_free) {
			dp_pages_free_drift_count++;
			if(dp_pages_free > dp_pages_free_drifted_max) {
				dp_pages_free_drifted_max = dp_pages_free;
			}
			dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
		}
		dp_pages_free = 0;
		return PAGING_SEGMENT_NULL;
	}

	for (i = BS_MAXPRI;
	     i >= BS_MINPRI; i--) {
		int start_index;

		if ((ps_select_array[i] == BS_NOPRI) ||
		    (ps_select_array[i] == BS_FULLPRI))
			continue;
		start_index = ps_select_array[i];

		if(!(paging_segments[start_index])) {
			j = start_index+1;
			physical_transfer_cluster_count = 0;
		}
		else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
				(((paging_segments[start_index])->ps_clshift)
				 + vm_page_shift))) {
			physical_transfer_cluster_count = 0;
			j = start_index + 1;
		} else {
			physical_transfer_cluster_count+=1;
			j = start_index;
			if(start_index == 0)
				start_index = paging_segment_max;
			else
				start_index = start_index - 1;
		}

		while (1) {
			if (j > paging_segment_max)
				j = 0;
			if ((ps = paging_segments[j]) &&
			    (ps->ps_bs->bs_priority == i)) {
				/*
				 * Force the ps cluster size to be
				 * >= that of the vstruct.
				 */
				if (IS_PS_OK_TO_USE(ps)) {
					if ((ps->ps_clcount) &&
					    (ps->ps_clshift >= shift)) {
						ipc_port_t trigger = IP_NULL;

						ps->ps_clcount--;
						dp_pages_free -= 1 << ps->ps_clshift;
						ps->ps_pgcount -= 1 << ps->ps_clshift;
						if(min_pages_trigger_port &&
						   (dp_pages_free <
						    minimum_pages_remaining)) {
							trigger = min_pages_trigger_port;
							min_pages_trigger_port = NULL;
							backing_store_abort_compaction = TRUE;
						}
						/*
						 * found one, quit looking.
						 */
						ps_select_array[i] = j;

						if (trigger != IP_NULL) {
							dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));

							default_pager_space_alert(
								trigger,
								HI_WAT_ALERT);
							ipc_port_release_send(trigger);
						}
						*psindex = j;
						return ps;
					}
				}
			}
			if (j == start_index) {
				/*
				 * none at this priority -- mark it full
				 */
				ps_select_array[i] = BS_FULLPRI;
				break;
			}
			j++;
		}
	}

	if(dp_pages_free) {
		dp_pages_free_drift_count++;
		if(dp_pages_free > dp_pages_free_drifted_max) {
			dp_pages_free_drifted_max = dp_pages_free;
		}
		dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
	}
	dp_pages_free = 0;
	return PAGING_SEGMENT_NULL;
}
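
/*
 * Stride example for the scan above (illustrative, assuming 4 KB pages):
 * with ALLOC_STRIDE == 1 GB, vm_page_shift == 12 and a segment cluster
 * shift of 2, ALLOC_STRIDE >> (2 + 12) == 65536, so roughly 65536 clusters
 * (1 GB of 16 KB clusters) are handed out from the current segment before
 * physical_transfer_cluster_count wraps and the scan advances to the next
 * segment of the same priority.
 */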
dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t);	/*forward*/

dp_offset_t
ps_allocate_cluster(
	vstruct_t		vs,
	int			*psindex,
	paging_segment_t	use_ps)
{
	unsigned int	byte_num;
	int		bit_num = 0;
	paging_segment_t ps;
	dp_offset_t	cluster;
	ipc_port_t	trigger = IP_NULL;

	/*
	 * Find best paging segment.
	 * ps_select_segment will decrement cluster count on ps.
	 * Must pass cluster shift to find the most appropriate segment.
	 */
	/* NOTE:  The addition of paging segment delete capability threatened
	 * to seriously complicate the treatment of paging segments in this
	 * module and the ones that call it (notably ps_clmap), because of the
	 * difficulty in assuring that the paging segment would continue to
	 * exist between being unlocked and locked.  This was
	 * avoided because all calls to this module are based in either
	 * dp_memory_object calls which rely on the vs lock, or by
	 * the transfer function which is part of the segment delete path.
	 * The transfer function which is part of paging segment delete is
	 * protected from multiple callers by the backing store lock.
	 * The paging segment delete function treats mappings to a paging
	 * segment on a vstruct by vstruct basis, locking the vstruct targeted
	 * while data is transferred to the remaining segments.  This is in
	 * line with the view that incomplete or in-transition mappings between
	 * data, a vstruct, and backing store are protected by the vs lock.
	 * This and the ordering of the paging segment "going_away" bit setting
	 * protects us.
	 */
	if (use_ps != PAGING_SEGMENT_NULL) {
		ps = use_ps;

		ASSERT(ps->ps_clcount != 0);

		ps->ps_clcount--;
		dp_pages_free -= 1 << ps->ps_clshift;
		ps->ps_pgcount -= 1 << ps->ps_clshift;
		if(min_pages_trigger_port &&
		   (dp_pages_free < minimum_pages_remaining)) {
			trigger = min_pages_trigger_port;
			min_pages_trigger_port = NULL;
			backing_store_abort_compaction = TRUE;
		}
		if (trigger != IP_NULL) {
			dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));

			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}

	} else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
		   PAGING_SEGMENT_NULL) {
		static clock_sec_t	lastnotify = 0;
		clock_sec_t		now;
		clock_nsec_t		nanoseconds_dummy;

		/*
		 * Don't immediately jump to the emergency segment. Give the
		 * dynamic pager a chance to create its first normal swap file.
		 * Unless, of course the very first normal swap file can't be
		 * created due to some problem and we didn't expect that problem
		 * i.e. use_emergency_swap_file_first was never set to true initially.
		 * It then gets set in the swap file creation error handling.
		 */
		if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {
			ps = paging_segments[EMERGENCY_PSEG_INDEX];
			if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
				if(IS_PS_GOING_AWAY(ps)) {
					/* Someone de-activated the emergency paging segment*/
					;
				} else if(dp_pages_free) {
					/*
					 * Someone has already activated the emergency paging segment
					 * OR
					 * Between us having rec'd a NULL segment from ps_select_segment
					 * and reaching here a new normal segment could have been added.
					 * E.g. we get NULL segment and another thread just added the
					 * new swap file. Hence check to see if we have more dp_pages_free
					 * before activating the emergency segment.
					 */
					;
				} else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
					/*
					 * PS_CAN_USE is only reset from the emergency segment when it's
					 * been successfully recovered. So it's legal to have an emergency
					 * segment that has PS_CAN_USE but no clusters because its recovery
					 * failed.
					 */
					backing_store_t bs = ps->ps_bs;
					ps->ps_state |= PS_CAN_USE;
					if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
					   ps_select_array[bs->bs_priority] == BS_NOPRI) {
						ps_select_array[bs->bs_priority] = 0;
					}
					dp_pages_free += ps->ps_pgcount;
					dp_pages_reserve -= ps->ps_pgcount;

					dprintf(("Switching ON Emergency paging segment\n"));
				}
			}
		}

		/*
		 * Emit a notification of the low-paging resource condition
		 * but don't issue it more than once every five seconds.  This
		 * prevents us from overflowing logs with thousands of
		 * repetitions of the message.
		 */
		clock_get_system_nanotime(&now, &nanoseconds_dummy);
		if (paging_segment_count > 1 && (now > lastnotify + 5)) {
			/* With an activated emergency paging segment we still
			 * didn't get any clusters. This could mean that the
			 * emergency paging segment is exhausted.
			 */
			dprintf(("System is out of paging space.\n"));
			lastnotify = now;
		}

		if(min_pages_trigger_port) {
			trigger = min_pages_trigger_port;
			min_pages_trigger_port = NULL;
			backing_store_abort_compaction = TRUE;
		}
		if (trigger != IP_NULL) {
			dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));

			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}
		return (dp_offset_t) -1;
	}

	/*
	 * Look for an available cluster.  At the end of the loop,
	 * byte_num is the byte offset and bit_num is the bit offset of the
	 * first zero bit in the paging segment bitmap.
	 */
	byte_num = ps->ps_hint;
	for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
		if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
			for (bit_num = 0; bit_num < NBBY; bit_num++) {
				if (isclr((ps->ps_bmap + byte_num), bit_num))
					break;
			}
			ASSERT(bit_num != NBBY);
			break;
		}
	}
	ps->ps_hint = byte_num;
	cluster = (byte_num*NBBY) + bit_num;

	/* Space was reserved, so this must be true */
	ASSERT(cluster < ps->ps_ncls);

	setbit(ps->ps_bmap, cluster);

	return cluster;
}
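
/*
 * Bitmap-search example (illustrative): with NBBY == 8, if ps_hint == 3 and
 * ps_bmap[3] == 0xEF (binary 11101111), the inner loop finds bit_num == 4 as
 * the first clear bit, so cluster == 3*8 + 4 == 28; setbit() then marks
 * cluster 28 in use and ps_hint stays at byte 3 for the next search.
 */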
void ps_deallocate_cluster(paging_segment_t, dp_offset_t);	/* forward */

void
ps_deallocate_cluster(
	paging_segment_t	ps,
	dp_offset_t		cluster)
{
	if (cluster >= ps->ps_ncls)
		panic("ps_deallocate_cluster: Invalid cluster number");

	/*
	 * Lock the paging segment, clear the cluster's bitmap and increment the
	 * number of free clusters.
	 */
	clrbit(ps->ps_bmap, cluster);
	if( IS_PS_OK_TO_USE(ps)) {
		ps->ps_clcount++;
		ps->ps_pgcount += 1 << ps->ps_clshift;
		dp_pages_free += 1 << ps->ps_clshift;
	} else {
		ps->ps_special_clusters += 1;
	}

	/*
	 * Move the hint down to the freed cluster if it is
	 * less than the current hint.
	 */
	if ((cluster/NBBY) < ps->ps_hint) {
		ps->ps_hint = (cluster/NBBY);
	}

	/*
	 * If we're freeing space on a full priority, reset the array.
	 */
	if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
		ps_select_array[ps->ps_bs->bs_priority] = 0;

	return;
}
void ps_dealloc_vsmap(struct vs_map *, dp_size_t);	/* forward */

void
ps_dealloc_vsmap(
	struct vs_map	*vsmap,
	dp_size_t	size)
{
	unsigned int	i;
	struct ps_vnode_trim_data trim_data;

	ps_vnode_trim_init(&trim_data);

	for (i = 0; i < size; i++) {
		if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) {
			ps_vnode_trim_more(&trim_data,
					   &vsmap[i],
					   VSM_PS(vsmap[i])->ps_clshift,
					   vm_page_size << VSM_PS(vsmap[i])->ps_clshift);
			ps_deallocate_cluster(VSM_PS(vsmap[i]),
					      VSM_CLOFF(vsmap[i]));
		} else {
			ps_vnode_trim_now(&trim_data);
		}
	}
	ps_vnode_trim_now(&trim_data);
}
void
ps_vstruct_dealloc(
	vstruct_t vs)
{
	unsigned int	i;

	/*
	 * If this is an indirect structure, then we walk through the valid
	 * (non-zero) indirect pointers and deallocate the clusters
	 * associated with each used map entry (via ps_dealloc_vsmap).
	 * When all of the clusters in an indirect block have been
	 * freed, we deallocate the block.  When all of the indirect
	 * blocks have been deallocated we deallocate the memory
	 * holding the indirect pointers.
	 */
	if (vs->vs_indirect) {
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			if (vs->vs_imap[i] != NULL) {
				ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
				kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
			}
		}
		kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
	} else {
		/*
		 * Direct map.  Free used clusters, then memory.
		 */
		ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
		kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
	}

	bs_commit(- vs->vs_size);

	VS_MAP_LOCK_DESTROY(vs);

	zfree(vstruct_zone, vs);
}
kern_return_t
ps_vstruct_reclaim(
	vstruct_t	vs,
	boolean_t	return_to_vm,
	boolean_t	reclaim_backing_store)
{
	unsigned int	i, j;
	struct vs_map	*vsmap;
	boolean_t	vsmap_all_clear, vsimap_all_clear;
	struct vm_object_fault_info fault_info;
	int		clmap_off;
	unsigned int	vsmap_size;
	kern_return_t	kr = KERN_SUCCESS;

	VS_MAP_LOCK(vs);

	fault_info.cluster_size = VM_SUPER_CLUSTER;
	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.user_tag = 0;
	fault_info.pmap_options = 0;
	fault_info.lo_offset = 0;
	fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift);
	fault_info.io_sync = reclaim_backing_store;
	fault_info.batch_pmap_op = FALSE;

	/*
	 * If this is an indirect structure, then we walk through the valid
	 * (non-zero) indirect pointers and deallocate the clusters
	 * associated with each used map entry (via ps_dealloc_vsmap).
	 * When all of the clusters in an indirect block have been
	 * freed, we deallocate the block.  When all of the indirect
	 * blocks have been deallocated we deallocate the memory
	 * holding the indirect pointers.
	 */
	if (vs->vs_indirect) {
		vsimap_all_clear = TRUE;
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			if (vsmap == NULL)
				continue;
			/* loop on clusters in this indirect map */
			clmap_off = (vm_page_size * CLMAP_ENTRIES *
				     VSCLSIZE(vs) * i);
			if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
				vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
			else
				vsmap_size = CLMAP_ENTRIES;
			vsmap_all_clear = TRUE;
			if (return_to_vm) {
				for (j = 0; j < vsmap_size;) {
					if (VSM_ISCLR(vsmap[j]) ||
					    VSM_ISERR(vsmap[j])) {
						j++;
						clmap_off += vm_page_size * VSCLSIZE(vs);
						continue;
					}
					VS_MAP_UNLOCK(vs);
					kr = pvs_cluster_read(
						vs,
						clmap_off,
						(dp_size_t) -1, /* read whole cluster */
						&fault_info);

					VS_MAP_LOCK(vs); /* XXX what if it changed ? */
					if (kr != KERN_SUCCESS) {
						vsmap_all_clear = FALSE;
						vsimap_all_clear = FALSE;

						kr = KERN_MEMORY_ERROR;
						goto out;
					}
				}
			}
			if (vsmap_all_clear) {
				ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES);
				kfree(vsmap, CLMAP_THRESHOLD);
				vs->vs_imap[i] = NULL;
			}
		}
		if (vsimap_all_clear) {
//			kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
		}
	} else {
		/*
		 * Direct map.  Free used clusters, then memory.
		 */
		vsmap = vs->vs_dmap;
		if (vsmap == NULL) {
			goto out;
		}
		vsmap_all_clear = TRUE;
		/* loop on clusters in the direct map */
		if (return_to_vm) {
			for (j = 0; j < vs->vs_size;) {
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j])) {
					j++;
					continue;
				}
				clmap_off = vm_page_size * (j << vs->vs_clshift);
				VS_MAP_UNLOCK(vs);
				kr = pvs_cluster_read(
					vs,
					clmap_off,
					(dp_size_t) -1, /* read whole cluster */
					&fault_info);

				VS_MAP_LOCK(vs); /* XXX what if it changed ? */
				if (kr != KERN_SUCCESS) {
					vsmap_all_clear = FALSE;

					kr = KERN_MEMORY_ERROR;
					goto out;
				} else {
//					VSM_CLR(vsmap[j]);
				}
				j++;
			}
		}
		if (vsmap_all_clear) {
			ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
//			kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
		}
	}
out:
	VS_MAP_UNLOCK(vs);

	return kr;
}
int ps_map_extend(vstruct_t, unsigned int);	/* forward */

int ps_map_extend(
	vstruct_t	vs,
	unsigned int	new_size)
{
	struct vs_map	**new_imap;
	struct vs_map	*new_dmap = NULL;
	int		newdsize;
	int		i;
	void		*old_map = NULL;
	int		old_map_size = 0;

	if (vs->vs_size >= new_size) {
		/*
		 * Someone has already done the work.
		 */
		return 0;
	}

	/*
	 * If the new size extends into the indirect range, then we have one
	 * of two cases: we are going from indirect to indirect, or we are
	 * going from direct to indirect.  If we are going from indirect to
	 * indirect, then it is possible that the new size will fit in the old
	 * indirect map.  If this is the case, then just reset the size of the
	 * vstruct map and we are done.  If the new size will not
	 * fit into the old indirect map, then we have to allocate a new
	 * indirect map and copy the old map pointers into this new map.
	 *
	 * If we are going from direct to indirect, then we have to allocate a
	 * new indirect map and copy the old direct pages into the first
	 * indirect page of the new map.
	 * NOTE: allocating memory here is dangerous, as we're in the
	 * pageout path.
	 */
	if (INDIRECT_CLMAP(new_size)) {
		int new_map_size = INDIRECT_CLMAP_SIZE(new_size);

		/*
		 * Get a new indirect map and zero it.
		 */
		old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
		if (vs->vs_indirect &&
		    (new_map_size == old_map_size)) {
			bs_commit(new_size - vs->vs_size);
			vs->vs_size = new_size;
			return 0;
		}

		new_imap = (struct vs_map **)kalloc(new_map_size);
		if (new_imap == NULL) {
			return -1;
		}
		memset(new_imap, 0, new_map_size);

		if (vs->vs_indirect) {
			/* Copy old entries into new map */
			memcpy(new_imap, vs->vs_imap, old_map_size);
			/* Arrange to free the old map */
			old_map = (void *) vs->vs_imap;
			newdsize = 0;
		} else {	/* Old map was a direct map */
			/* Allocate an indirect page */
			if ((new_imap[0] = (struct vs_map *)
			     kalloc(CLMAP_THRESHOLD)) == NULL) {
				kfree(new_imap, new_map_size);
				return -1;
			}
			new_dmap = new_imap[0];
			newdsize = CLMAP_ENTRIES;
		}
	} else {
		new_imap = NULL;
		newdsize = new_size;
		/*
		 * If the new map is a direct map, then the old map must
		 * also have been a direct map.  All we have to do is
		 * to allocate a new direct map, copy the old entries
		 * into it and free the old map.
		 */
		if ((new_dmap = (struct vs_map *)
		     kalloc(CLMAP_SIZE(new_size))) == NULL) {
			return -1;
		}
	}
	if (newdsize) {

		/* Free the old map */
		old_map = (void *) vs->vs_dmap;
		old_map_size = CLMAP_SIZE(vs->vs_size);

		/* Copy info from the old map into the new map */
		memcpy(new_dmap, vs->vs_dmap, old_map_size);

		/* Initialize the rest of the new map */
		for (i = vs->vs_size; i < newdsize; i++)
			VSM_CLR(new_dmap[i]);
	}
	if (new_imap != NULL) {
		vs->vs_imap = new_imap;
		vs->vs_indirect = TRUE;
	} else
		vs->vs_dmap = new_dmap;
	bs_commit(new_size - vs->vs_size);
	vs->vs_size = new_size;
	if (old_map)
		kfree(old_map, old_map_size);
	return 0;
}
dp_offset_t
ps_clmap(
	vstruct_t	vs,
	dp_offset_t	offset,
	struct clmap	*clmap,
	int		flag,
	dp_size_t	size,
	int		error)
{
	dp_offset_t	cluster;	/* The cluster of offset.	*/
	dp_offset_t	newcl;		/* The new cluster allocated.	*/
	dp_offset_t	newoff;
	unsigned int	i;
	struct vs_map	*vsmap;
	int		psindex;

	ASSERT(vs->vs_dmap);
	cluster = atop_32(offset) >> vs->vs_clshift;

	/*
	 * Initialize cluster error value
	 */
	clmap->cl_error = 0;

	/*
	 * If the object has grown, extend the page map.
	 */
	if (cluster >= vs->vs_size) {
		if (flag == CL_FIND) {
			/* Do not allocate if just doing a lookup */
			return (dp_offset_t) -1;
		}
		if (ps_map_extend(vs, cluster + 1)) {
			return (dp_offset_t) -1;
		}
	}

	/*
	 * Look for the desired cluster.  If the map is indirect, then we
	 * have a two level lookup.  First find the indirect block, then
	 * find the actual cluster.  If the indirect block has not yet
	 * been allocated, then do so.  If the cluster has not yet been
	 * allocated, then do so.
	 *
	 * If any of the allocations fail, then return an error.
	 * Don't allocate if just doing a lookup.
	 */
	if (vs->vs_indirect) {
		long	ind_block = cluster/CLMAP_ENTRIES;

		/* Is the indirect block allocated? */
		vsmap = vs->vs_imap[ind_block];
		if (vsmap == NULL) {
			if (flag == CL_FIND) {
				return (dp_offset_t) -1;
			}

			/* Allocate the indirect block */
			vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
			if (vsmap == NULL) {
				return (dp_offset_t) -1;
			}
			/* Initialize the cluster offsets */
			for (i = 0; i < CLMAP_ENTRIES; i++)
				VSM_CLR(vsmap[i]);
			vs->vs_imap[ind_block] = vsmap;
		}
	} else
		vsmap = vs->vs_dmap;

	vsmap += cluster%CLMAP_ENTRIES;

	/*
	 * At this point, vsmap points to the struct vs_map desired.
	 *
	 * Look in the map for the cluster, if there was an error on a
	 * previous write, flag it and return.  If it is not yet
	 * allocated, then allocate it, if we're writing; if we're
	 * doing a lookup and the cluster's not allocated, return error.
	 */
	if (VSM_ISERR(*vsmap)) {
		clmap->cl_error = VSM_GETERR(*vsmap);
		return (dp_offset_t) -1;
	} else if (VSM_ISCLR(*vsmap)) {
		if (flag == CL_FIND) {
			/*
			 * If there's an error and the entry is clear, then
			 * we've run out of swap space.  Record the error
			 * here and return.
			 */
			if (error) {
				VSM_SETERR(*vsmap, error);
			}
			return (dp_offset_t) -1;
		} else {
			/*
			 * Attempt to allocate a cluster from the paging segment
			 */
			newcl = ps_allocate_cluster(vs, &psindex,
						    PAGING_SEGMENT_NULL);
			if (newcl == (dp_offset_t) -1) {
				return (dp_offset_t) -1;
			}
			VSM_SETCLOFF(*vsmap, newcl);
			VSM_SETPS(*vsmap, psindex);
		}
	} else
		newcl = VSM_CLOFF(*vsmap);

	/*
	 * Fill in pertinent fields of the clmap
	 */
	clmap->cl_ps = VSM_PS(*vsmap);
	clmap->cl_numpages = VSCLSIZE(vs);
	clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);

	/*
	 * Byte offset in paging segment is byte offset to cluster plus
	 * byte offset within cluster.  It looks ugly, but should be
	 * relatively quick.
	 */
	ASSERT(trunc_page(offset) == offset);
	newcl = ptoa_32(newcl) << vs->vs_clshift;
	newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
	if (flag == CL_ALLOC) {
		/*
		 * set bits in the allocation bitmap according to which
		 * pages were requested.  size is in bytes.
		 */
		i = atop_32(newoff);
		while ((size > 0) && (i < VSCLSIZE(vs))) {
			VSM_SETALLOC(*vsmap, i);
			i++;
			size -= vm_page_size;
		}
	}
	clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
	if (newoff) {
		/*
		 * Offset is not cluster aligned, so number of pages
		 * and bitmaps must be adjusted
		 */
		clmap->cl_numpages -= atop_32(newoff);
		CLMAP_SHIFT(clmap, vs);
		CLMAP_SHIFTALLOC(clmap, vs);
	}

	/*
	 * The setting of valid bits and handling of write errors
	 * must be done here, while we hold the lock on the map.
	 * It logically should be done in ps_vs_write_complete().
	 * The size and error information has been passed from
	 * ps_vs_write_complete().  If the size parameter is non-zero,
	 * then there is work to be done.  If error is also non-zero,
	 * then the error number is recorded in the cluster and the
	 * entire cluster is in error.
	 */
	if (size && flag == CL_FIND) {
		dp_offset_t off = (dp_offset_t) 0;

		if (!error) {
			for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
			     i++) {
				VSM_SETPG(*vsmap, i);
				size -= vm_page_size;
			}
			ASSERT(i <= VSCLSIZE(vs));
		} else {
			BS_STAT(clmap->cl_ps->ps_bs,
				clmap->cl_ps->ps_bs->bs_pages_out_fail +=
				atop_32(size));
			off = VSM_CLOFF(*vsmap);
			VSM_SETERR(*vsmap, error);
		}
		/*
		 * Deallocate cluster if error, and no valid pages
		 * already present.
		 */
		if (off != (dp_offset_t) 0)
			ps_deallocate_cluster(clmap->cl_ps, off);
		return (dp_offset_t) 0;
	}

	DP_DEBUG(DEBUG_VS_INTERNAL,
		 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
		  newcl+newoff, (int) vs, (int) vsmap, flag));
	DP_DEBUG(DEBUG_VS_INTERNAL,
		 ("  clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
		  (int) clmap->cl_ps, clmap->cl_numpages,
		  (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));

	return (newcl + newoff);
}
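
/*
 * Offset-decomposition example for the return value above (illustrative,
 * assuming 4 KB pages and vs_clshift == 2, i.e. 16 KB clusters): for
 * offset == 0x15000, cluster == atop_32(0x15000) >> 2 == 5, and
 * newoff == 0x15000 & 0x3FFF == 0x1000; if that cluster lives at
 * paging-segment cluster number 7, newcl == ptoa_32(7) << 2 == 0x1C000, so
 * the byte offset returned into the paging segment is 0x1C000 + 0x1000 ==
 * 0x1D000.
 */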
void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t);	/* forward */

void
ps_clunmap(
	vstruct_t	vs,
	dp_offset_t	offset,
	dp_size_t	length)
{
	dp_offset_t	cluster;	/* The cluster number of offset */
	struct vs_map	*vsmap;
	struct ps_vnode_trim_data trim_data;

	ps_vnode_trim_init(&trim_data);

	/*
	 * Loop through all clusters in this range, freeing paging segment
	 * clusters and map entries as encountered.
	 */
	while (length > 0) {
		dp_offset_t	newoff;
		unsigned int	i;

		cluster = atop_32(offset) >> vs->vs_clshift;
		if (vs->vs_indirect)	/* indirect map */
			vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
		else
			vsmap = vs->vs_dmap;
		if (vsmap == NULL) {
			ps_vnode_trim_now(&trim_data);
			return;
		}
		vsmap += cluster%CLMAP_ENTRIES;
		if (VSM_ISCLR(*vsmap)) {
			ps_vnode_trim_now(&trim_data);
			length -= vm_page_size;
			offset += vm_page_size;
			continue;
		}
		/*
		 * We've got a valid mapping.  Clear it and deallocate
		 * paging segment cluster pages.
		 * Optimize for entire cluster clearing.
		 */
		if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
			/*
			 * Not cluster aligned.
			 */
			ASSERT(trunc_page(newoff) == newoff);
			i = atop_32(newoff);
		} else
			i = 0;
		while ((i < VSCLSIZE(vs)) && (length > 0)) {
			VSM_CLRPG(*vsmap, i);
			VSM_CLRALLOC(*vsmap, i);
			length -= vm_page_size;
			offset += vm_page_size;
			i++;
		}

		/*
		 * If map entry is empty, clear and deallocate cluster.
		 */
		if (!VSM_BMAP(*vsmap)) {
			ps_vnode_trim_more(&trim_data,
					   vsmap,
					   vs->vs_clshift,
					   VSCLSIZE(vs) * vm_page_size);
			ps_deallocate_cluster(VSM_PS(*vsmap),
					      VSM_CLOFF(*vsmap));
			VSM_CLR(*vsmap);
		} else {
			ps_vnode_trim_now(&trim_data);
		}
	}
	ps_vnode_trim_now(&trim_data);
}
void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int);	/* forward */

void
ps_vs_write_complete(
	vstruct_t	vs,
	dp_offset_t	offset,
	dp_size_t	size,
	int		error)
{
	struct clmap	clmap;

	/*
	 * Get the struct vsmap for this cluster.
	 * Use READ, even though it was written, because the
	 * cluster MUST be present, unless there was an error
	 * in the original ps_clmap (e.g. no space), in which
	 * case, nothing happens.
	 *
	 * Must pass enough information to ps_clmap to allow it
	 * to set the vs_map structure bitmap under lock.
	 */
	(void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
}
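/*
 * The two ps_clmap() modes used in this file, sketched for reference only:
 * CL_ALLOC marks the requested pages in a cluster's allocation bitmap when a
 * write is being set up, while CL_FIND with a non-zero size (as above) sets
 * the valid bits or records an error at write-completion time, under the map
 * lock.  The loop below mirrors the CL_ALLOC bit-setting on a plain bitmask;
 * the helper name is hypothetical and not part of the pager.
 */
#if 0	/* illustrative sketch, not compiled */
static unsigned int
example_alloc_bits(unsigned int first_page, unsigned int size_bytes,
		   unsigned int pages_per_cluster, unsigned int page_size)
{
	unsigned int	bits = 0;
	unsigned int	i = first_page;

	/* with first_page = 1 and size_bytes = 0x3000 this sets bits 1..3 (0x0E) */
	while (size_bytes > 0 && i < pages_per_cluster) {
		bits |= 1 << i;
		i++;
		size_bytes -= page_size;
	}
	return bits;
}
#endif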
void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int);	/* forward */

void
vs_cl_write_complete(
	vstruct_t		vs,
	__unused paging_segment_t ps,
	dp_offset_t		offset,
	__unused vm_offset_t	addr,
	dp_size_t		size,
	boolean_t		async,
	int			error)
{
//	kern_return_t	kr;

	if (error) {
		/*
		 * For internal objects, the error is recorded on a
		 * per-cluster basis by ps_clmap() which is called
		 * by ps_vs_write_complete() below.
		 */
		dprintf(("write failed error = 0x%x\n", error));
		/* add upl_abort code here */
	} else
		GSTAT(global_stats.gs_pages_out += atop_32(size));
	/*
	 * Notify the vstruct mapping code, so it can do its accounting.
	 */
	ps_vs_write_complete(vs, offset, size, error);

	if (async) {
		ASSERT(vs->vs_async_pending > 0);
		vs->vs_async_pending -= size;
		if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
			vs->vs_waiting_async = FALSE;
			thread_wakeup(&vs->vs_async_pending);
		}
	}
}
#ifdef DEVICE_PAGING
kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);

kern_return_t
device_write_reply(
	MACH_PORT_FACE		reply_port,
	kern_return_t		device_code,
	io_buf_len_t		bytes_written)
{
	struct vs_async	*vsa;

	vsa = (struct vs_async *)
		((struct vstruct_alias *)(reply_port->ip_alias))->vs;

	if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
		device_code = KERN_FAILURE;
	}

	vsa->vsa_error = device_code;

	ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
	if(vsa->vsa_flags & VSA_TRANSFER) {
		/* revisit when async disk segments redone */
		if(vsa->vsa_error) {
			/* need to consider error condition.  re-write data or */
			/* throw it away here. */
			vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
		}
		ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
				     vsa->vsa_size, vsa->vsa_error);
	} else {
		vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
				     vsa->vsa_addr, vsa->vsa_size, TRUE,
				     vsa->vsa_error);
	}

	return KERN_SUCCESS;
}
kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);

kern_return_t
device_write_reply_inband(
	MACH_PORT_FACE		reply_port,
	kern_return_t		return_code,
	io_buf_len_t		bytes_written)
{
	panic("device_write_reply_inband: illegal");
	return KERN_SUCCESS;
}

kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);

kern_return_t
device_read_reply(
	MACH_PORT_FACE		reply_port,
	kern_return_t		return_code,
	io_buf_ptr_t		data,
	mach_msg_type_number_t	dataCnt)
{
	struct vs_async	*vsa;

	vsa = (struct vs_async *)
		((struct vstruct_alias *)(reply_port->defpager_importance.alias))->vs;
	vsa->vsa_addr = (vm_offset_t)data;
	vsa->vsa_size = (vm_size_t)dataCnt;
	vsa->vsa_error = return_code;
	thread_wakeup(&vsa);
	return KERN_SUCCESS;
}

kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);

kern_return_t
device_read_reply_inband(
	MACH_PORT_FACE		reply_port,
	kern_return_t		return_code,
	io_buf_ptr_inband_t	data,
	mach_msg_type_number_t	dataCnt)
{
	panic("device_read_reply_inband: illegal");
	return KERN_SUCCESS;
}

kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);

kern_return_t
device_read_reply_overwrite(
	MACH_PORT_FACE		reply_port,
	kern_return_t		return_code,
	io_buf_len_t		bytes_read)
{
	panic("device_read_reply_overwrite: illegal\n");
	return KERN_SUCCESS;
}

kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);

kern_return_t
device_open_reply(
	MACH_PORT_FACE		reply_port,
	kern_return_t		return_code,
	MACH_PORT_FACE		device_port)
{
	panic("device_open_reply: illegal\n");
	return KERN_SUCCESS;
}
kern_return_t
ps_read_device(
	paging_segment_t	ps,
	dp_offset_t		offset,
	vm_offset_t		*bufferp,
	unsigned int		size,
	unsigned int		*residualp,
	int			flags)
{
	kern_return_t		kr;
	recnum_t		dev_offset;
	unsigned int		bytes_wanted;
	unsigned int		bytes_read;
	unsigned int		total_read;
	vm_offset_t		dev_buffer;
	vm_offset_t		buf_ptr;
	unsigned int		records_read;
	struct vs_async		*vsa;

	device_t		device;
	vm_map_copy_t		device_data = NULL;
	default_pager_thread_t	*dpt = NULL;

	device = dev_port_lookup(ps->ps_device);
	clustered_reads[atop_32(size)]++;

	dev_offset = (ps->ps_offset +
		      (offset >> (vm_page_shift - ps->ps_record_shift)));
	bytes_wanted = size;
	total_read = 0;
	*bufferp = (vm_offset_t)NULL;

	do {
		vsa = VS_ALLOC_ASYNC();
		vsa->vsa_offset = 0;

		ip_lock(vsa->reply_port);
		vsa->reply_port->ip_sorights++;
		ip_reference(vsa->reply_port);
		ip_unlock(vsa->reply_port);
		kr = ds_device_read_common(device,
					   vsa->reply_port,
					   (mach_msg_type_name_t)
						MACH_MSG_TYPE_MOVE_SEND_ONCE,
					   (dev_mode_t) 0,
					   dev_offset,
					   bytes_wanted,
					   (IO_READ | IO_CALL),
					   (io_buf_ptr_t *) &dev_buffer,
					   (mach_msg_type_number_t *) &bytes_read);
		if(kr == MIG_NO_REPLY) {
			assert_wait(&vsa, THREAD_UNINT);
			thread_block(THREAD_CONTINUE_NULL);

			dev_buffer = vsa->vsa_addr;
			bytes_read = (unsigned int)vsa->vsa_size;
			kr = vsa->vsa_error;
		}
		if (kr != KERN_SUCCESS || bytes_read == 0) {
			break;
		}
		total_read += bytes_read;

		/*
		 * If we got the entire range, use the returned dev_buffer.
		 */
		if (bytes_read == size) {
			*bufferp = (vm_offset_t)dev_buffer;
			break;
		}

		dprintf(("read only %d bytes out of %d\n",
			 bytes_read, bytes_wanted));
		if (dpt == NULL) {
			dpt = get_read_buffer();
			buf_ptr = dpt->dpt_buffer;
			*bufferp = (vm_offset_t)buf_ptr;
		}
		/*
		 * Otherwise, copy the data into the provided buffer (*bufferp)
		 * and append the rest of the range as it comes in.
		 */
		memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
		buf_ptr += bytes_read;
		bytes_wanted -= bytes_read;
		records_read = (bytes_read >>
				(vm_page_shift - ps->ps_record_shift));
		dev_offset += records_read;
		DP_DEBUG(DEBUG_VS_INTERNAL,
			 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
			  dev_buffer, bytes_read));
		if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
		    != KERN_SUCCESS)
			Panic("dealloc buf");
	} while (bytes_wanted);

	*residualp = size - total_read;
	if((dev_buffer != *bufferp) && (total_read != 0)) {
		vm_offset_t temp_buffer;
		vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK));
		memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
		if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
					   VM_MAP_COPYIN_OPT_SRC_DESTROY |
					   VM_MAP_COPYIN_OPT_STEAL_PAGES |
					   VM_MAP_COPYIN_OPT_PMAP_ENTER,
					   (vm_map_copy_t *)&device_data, FALSE))
			panic("ps_read_device: cannot copyin locally provided buffer\n");
	}
	else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
		if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
					   VM_MAP_COPYIN_OPT_SRC_DESTROY |
					   VM_MAP_COPYIN_OPT_STEAL_PAGES |
					   VM_MAP_COPYIN_OPT_PMAP_ENTER,
					   (vm_map_copy_t *)&device_data, FALSE))
			panic("ps_read_device: cannot copyin backing store provided buffer\n");
	}
	else {
		device_data = NULL;
	}
	*bufferp = (vm_offset_t)device_data;

	if (dpt != NULL) {
		/* Free the receive buffer */
		dpt->checked_out = 0;
		thread_wakeup(&dpt_array);
	}
	return KERN_SUCCESS;
}
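/*
 * Worked example of the record arithmetic above (illustrative only): with
 * 4 KB pages (vm_page_shift == 12) and 512-byte device records,
 * ps_record_shift == local_log2(4096 / 512) == 3, so a byte offset is turned
 * into a record number by shifting right by vm_page_shift - ps_record_shift
 * == 9, i.e. dividing by the record size.  A byte offset of 0x6000 therefore
 * lands 0x30 (48) records past ps_offset.  The helper name below is
 * hypothetical.
 */
#if 0	/* illustrative sketch, not compiled */
static recnum_t
example_byte_offset_to_record(paging_segment_t ps, dp_offset_t offset)
{
	return ps->ps_offset +
	    (offset >> (vm_page_shift - ps->ps_record_shift));
}
#endif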
kern_return_t
ps_write_device(
	paging_segment_t	ps,
	dp_offset_t		offset,
	vm_offset_t		addr,
	unsigned int		size,
	struct vs_async		*vsa)
{
	recnum_t		dev_offset;
	io_buf_len_t		bytes_to_write, bytes_written;
	recnum_t		records_written;
	kern_return_t		kr;
	MACH_PORT_FACE		reply_port;
	device_t		device;

	clustered_writes[atop_32(size)]++;

	dev_offset = (ps->ps_offset +
		      (offset >> (vm_page_shift - ps->ps_record_shift)));
	bytes_to_write = size;

	if (vsa) {
		/*
		 * Asynchronous write.
		 */
		reply_port = vsa->reply_port;
		ip_lock(reply_port);
		reply_port->ip_sorights++;
		ip_reference(reply_port);
		ip_unlock(reply_port);

		device = dev_port_lookup(ps->ps_device);

		vsa->vsa_addr = addr;
		kr=ds_device_write_common(device,
					  reply_port,
					  (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
					  (dev_mode_t) 0,
					  dev_offset,
					  (io_buf_ptr_t) addr,
					  size,
					  (IO_WRITE | IO_CALL),
					  &bytes_written);
		if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
			dprintf(("%s0x%x, addr=0x%x,"
				 "size=0x%x,offset=0x%x\n",
				 "device_write_request returned ",
				 kr, addr, size, offset));
			BS_STAT(ps->ps_bs,
				ps->ps_bs->bs_pages_out_fail += atop_32(size));
			/* do the completion notification to free resources */
			device_write_reply(reply_port, kr, 0);
			return PAGER_ERROR;
		}
	} else do {
		/*
		 * Synchronous write.
		 */
		device = dev_port_lookup(ps->ps_device);
		kr=ds_device_write_common(device,
					  IP_NULL, 0,
					  (dev_mode_t) 0,
					  dev_offset,
					  (io_buf_ptr_t) addr,
					  size,
					  (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
					  &bytes_written);
		if (kr != KERN_SUCCESS) {
			dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
				 "device_write returned ",
				 kr, addr, size, offset));
			BS_STAT(ps->ps_bs,
				ps->ps_bs->bs_pages_out_fail += atop_32(size));
			return PAGER_ERROR;
		}
		if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
			Panic("fragmented write");
		records_written = (bytes_written >>
				   (vm_page_shift - ps->ps_record_shift));
		dev_offset += records_written;
		if (bytes_written != bytes_to_write) {
			dprintf(("wrote only %d bytes out of %d\n",
				 bytes_written, bytes_to_write));
		}

		bytes_to_write -= bytes_written;
		addr += bytes_written;
	} while (bytes_to_write > 0);

	return PAGER_SUCCESS;
}
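/*
 * The synchronous path above loops because a device may complete fewer
 * records than were asked for: each pass advances the source address and
 * device offset by the amount actually written and retries the remainder.
 * The stand-alone sketch below mirrors that pattern with a hypothetical
 * "write_some" callback; it is for reference only and not part of the pager.
 */
#if 0	/* illustrative sketch, not compiled */
static int
example_write_all(int (*write_some)(const char *buf, unsigned int len),
		  const char *buf, unsigned int len)
{
	while (len > 0) {
		int written = write_some(buf, len);

		if (written <= 0)
			return -1;	/* stop on error or lack of progress */
		buf += written;
		len -= written;
	}
	return 0;
}
#endif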
#else /* !DEVICE_PAGING */

kern_return_t
ps_read_device(
	__unused paging_segment_t	ps,
	__unused dp_offset_t		offset,
	__unused vm_offset_t		*bufferp,
	__unused unsigned int		size,
	__unused unsigned int		*residualp,
	__unused int			flags)
{
	panic("ps_read_device not supported");
	return KERN_FAILURE;
}

kern_return_t
ps_write_device(
	__unused paging_segment_t	ps,
	__unused dp_offset_t		offset,
	__unused vm_offset_t		addr,
	__unused unsigned int		size,
	__unused struct vs_async	*vsa)
{
	panic("ps_write_device not supported");
	return KERN_FAILURE;
}

#endif /* DEVICE_PAGING */
void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t);	/* forward */

void
pvs_object_data_provided(
	__unused vstruct_t	vs,
	upl_t			upl,
	__unused upl_offset_t	offset,
	upl_size_t		size)
{
#if RECLAIM_SWAP
	boolean_t	empty;
#endif

	DP_DEBUG(DEBUG_VS_INTERNAL,
		 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
		  upl, offset, size));

	GSTAT(global_stats.gs_pages_in += atop_32(size));

	/* check upl iosync flag instead of using RECLAIM_SWAP*/
#if RECLAIM_SWAP
	if (size != upl->size) {
		if (size) {
			ps_clunmap(vs, offset, size);
			upl_commit_range(upl, 0, size, 0, NULL, 0, &empty);
		} else {
			upl_abort(upl, UPL_ABORT_ERROR);
			upl_deallocate(upl);
		}
	} else {
		ps_clunmap(vs, offset, size);
		upl_commit(upl, NULL, 0);
		upl_deallocate(upl);
	}
#endif	/* RECLAIM_SWAP */
}
static memory_object_offset_t	last_start;
static vm_size_t		last_length;

/*
 * A "cnt" of 0 means that the caller just wants to check if the page at
 * offset "vs_offset" exists in the backing store.  That page hasn't been
 * prepared, so no need to release it.
 *
 * A "cnt" of -1 means that the caller wants to bring back from the backing
 * store all existing pages in the cluster containing "vs_offset".
 */
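/*
 * Usage notes for the cluster-read routine that follows (a sketch only): a
 * "cnt" of 0 is a pure existence probe for the page at "vs_offset", a "cnt"
 * of (dp_size_t)-1 asks for every swapped page in the surrounding cluster to
 * be brought back, and any other value is an ordinary read of that many
 * bytes.  The wrapper below only illustrates the probe mode; the names
 * "example_probe_page" and "cluster_read" are hypothetical.
 */
#if 0	/* illustrative sketch, not compiled */
static kern_return_t
example_probe_page(vstruct_t vs, dp_offset_t vs_offset, void *fault_info,
		   kern_return_t (*cluster_read)(vstruct_t, dp_offset_t,
						 dp_size_t, void *))
{
	/* existence check only: no page is prepared or released */
	return cluster_read(vs, vs_offset, (dp_size_t) 0, fault_info);
}
#endif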
2912 dp_offset_t vs_offset
,
2916 kern_return_t error
= KERN_SUCCESS
;
2918 unsigned int residual
;
2919 unsigned int request_flags
;
2926 unsigned int xfer_size
;
2927 dp_offset_t orig_vs_offset
;
2928 dp_offset_t ps_offset
[(VM_SUPER_CLUSTER
/ PAGE_SIZE
) >> VSTRUCT_MIN_CLSHIFT
];
2929 paging_segment_t psp
[(VM_SUPER_CLUSTER
/ PAGE_SIZE
) >> VSTRUCT_MIN_CLSHIFT
];
2932 unsigned int page_list_count
;
2933 memory_object_offset_t cluster_start
;
2934 vm_size_t cluster_length
;
2935 uint32_t io_streaming
;
2937 boolean_t io_sync
= FALSE
;
2938 boolean_t reclaim_all
= FALSE
;
2940 pages_in_cl
= 1 << vs
->vs_clshift
;
2941 cl_size
= pages_in_cl
* vm_page_size
;
2942 cl_mask
= cl_size
- 1;
2944 request_flags
= UPL_NO_SYNC
| UPL_RET_ONLY_ABSENT
| UPL_SET_LITE
;
2946 if (cnt
== (dp_size_t
) -1)
2949 if (reclaim_all
== TRUE
) {
2951 * We've been called from ps_vstruct_reclaim() to move all
2952 * the object's swapped pages back to VM pages.
2953 * This can put memory pressure on the system, so we do want
2954 * to wait for free pages, to avoid getting in the way of the
2955 * vm_pageout_scan() thread.
2956 * Let's not use UPL_NOBLOCK in this case.
2958 vs_offset
&= ~cl_mask
;
2964 * if the I/O cluster size == PAGE_SIZE, we don't want to set
2965 * the UPL_NOBLOCK since we may be trying to recover from a
2966 * previous partial pagein I/O that occurred because we were low
2967 * on memory and bailed early in order to honor the UPL_NOBLOCK...
2968 * since we're only asking for a single page, we can block w/o fear
2969 * of tying up pages while waiting for more to become available
2971 if (fault_info
== NULL
|| ((vm_object_fault_info_t
)fault_info
)->cluster_size
> PAGE_SIZE
)
2972 request_flags
|= UPL_NOBLOCK
;
2976 cl_index
= (vs_offset
& cl_mask
) / vm_page_size
;
2978 if ((ps_clmap(vs
, vs_offset
& ~cl_mask
, &clmap
, CL_FIND
, 0, 0) == (dp_offset_t
)-1) ||
2979 !CLMAP_ISSET(clmap
, cl_index
)) {
2981 * the needed page doesn't exist in the backing store...
2982 * we don't want to try to do any I/O, just abort the
2983 * page and let the fault handler provide a zero-fill
2987 * The caller was just poking at us to see if
2988 * the page has been paged out. No need to
2989 * mess with the page at all.
2990 * Just let the caller know we don't have that page.
2992 return KERN_FAILURE
;
2994 if (reclaim_all
== TRUE
) {
2997 /* no more pages in this cluster */
2998 return KERN_FAILURE
;
3000 /* try the next page in this cluster */
3001 vs_offset
+= vm_page_size
;
3005 page_list_count
= 0;
3007 memory_object_super_upl_request(vs
->vs_control
, (memory_object_offset_t
)vs_offset
,
3008 PAGE_SIZE
, PAGE_SIZE
,
3009 &upl
, NULL
, &page_list_count
,
3010 request_flags
| UPL_SET_INTERNAL
);
3011 upl_range_needed(upl
, 0, 1);
3014 upl_abort(upl
, UPL_ABORT_ERROR
);
3016 upl_abort(upl
, UPL_ABORT_UNAVAILABLE
);
3017 upl_deallocate(upl
);
3019 return KERN_SUCCESS
;
3024 * The caller was just poking at us to see if
3025 * the page has been paged out. No need to
3026 * mess with the page at all.
3027 * Just let the caller know we do have that page.
3029 return KERN_SUCCESS
;
3032 if(((vm_object_fault_info_t
)fault_info
)->io_sync
== TRUE
) {
3037 #endif /* RECLAIM_SWAP */
3040 if( io_sync
== TRUE
) {
3042 io_flags
|= UPL_IOSYNC
| UPL_NOCOMMIT
;
3044 request_flags
|= UPL_PRECIOUS
| UPL_CLEAN_IN_PLACE
;
3045 #else /* USE_PRECIOUS */
3046 request_flags
|= UPL_REQUEST_SET_DIRTY
;
3047 #endif /* USE_PRECIOUS */
3050 assert(dp_encryption_inited
);
3051 if (dp_encryption
) {
3054 * request that the UPL be prepared for
3057 request_flags
|= UPL_ENCRYPT
;
3058 io_flags
|= UPL_PAGING_ENCRYPTED
;
3060 orig_vs_offset
= vs_offset
;
3063 cnt
= VM_SUPER_CLUSTER
;
3064 cluster_start
= (memory_object_offset_t
) vs_offset
;
3065 cluster_length
= (vm_size_t
) cnt
;
3069 * determine how big a speculative I/O we should try for...
3071 if (memory_object_cluster_size(vs
->vs_control
, &cluster_start
, &cluster_length
, &io_streaming
, (memory_object_fault_info_t
)fault_info
) == KERN_SUCCESS
) {
3072 assert(vs_offset
>= (dp_offset_t
) cluster_start
&&
3073 vs_offset
< (dp_offset_t
) (cluster_start
+ cluster_length
));
3074 vs_offset
= (dp_offset_t
) cluster_start
;
3075 cnt
= (dp_size_t
) cluster_length
;
3077 cluster_length
= PAGE_SIZE
;
3082 io_flags
|= UPL_IOSTREAMING
;
3084 last_start
= cluster_start
;
3085 last_length
= cluster_length
;
	/*
	 * This loop will be executed multiple times until the entire
	 * range has been looked at or we issue an I/O... if the request spans cluster
	 * boundaries, the clusters will be checked for logical continuity,
	 * if contiguous the I/O request will span multiple clusters...
	 * at most only 1 I/O will be issued... it will encompass the original offset
	 */
3094 while (cnt
&& error
== KERN_SUCCESS
) {
3097 if ((vs_offset
& cl_mask
) && (cnt
> (VM_SUPER_CLUSTER
- (vs_offset
& cl_mask
)))) {
3098 size
= VM_SUPER_CLUSTER
;
3099 size
-= vs_offset
& cl_mask
;
3100 } else if (cnt
> VM_SUPER_CLUSTER
)
3101 size
= VM_SUPER_CLUSTER
;
3110 while (size
> 0 && error
== KERN_SUCCESS
) {
3111 unsigned int abort_size
;
3116 dp_offset_t cur_offset
;
3118 if ( !ps_info_valid
) {
3119 ps_offset
[seg_index
] = ps_clmap(vs
, vs_offset
& ~cl_mask
, &clmap
, CL_FIND
, 0, 0);
3120 psp
[seg_index
] = CLMAP_PS(clmap
);
3124 * skip over unallocated physical segments
3126 if (ps_offset
[seg_index
] == (dp_offset_t
) -1) {
3127 abort_size
= cl_size
- (vs_offset
& cl_mask
);
3128 abort_size
= MIN(abort_size
, size
);
3131 vs_offset
+= abort_size
;
3138 cl_index
= (vs_offset
& cl_mask
) / vm_page_size
;
3140 for (abort_size
= 0; cl_index
< pages_in_cl
&& abort_size
< size
; cl_index
++) {
3142 * skip over unallocated pages
3144 if (CLMAP_ISSET(clmap
, cl_index
))
3146 abort_size
+= vm_page_size
;
3150 vs_offset
+= abort_size
;
3152 if (cl_index
== pages_in_cl
) {
3154 * if we're at the end of this physical cluster
3155 * then bump to the next one and continue looking
3166 * remember the starting point of the first allocated page
3167 * for the I/O we're about to issue
3169 beg_pseg
= seg_index
;
3170 beg_indx
= cl_index
;
3171 cur_offset
= vs_offset
;
3174 * calculate the size of the I/O that we can do...
3175 * this may span multiple physical segments if
3176 * they are contiguous
3178 for (xfer_size
= 0; xfer_size
< size
; ) {
3180 while (cl_index
< pages_in_cl
&& xfer_size
< size
) {
3182 * accumulate allocated pages within
3183 * a physical segment
3185 if (CLMAP_ISSET(clmap
, cl_index
)) {
3186 xfer_size
+= vm_page_size
;
3187 cur_offset
+= vm_page_size
;
3190 BS_STAT(psp
[seg_index
]->ps_bs
,
3191 psp
[seg_index
]->ps_bs
->bs_pages_in
++);
3195 if (cl_index
< pages_in_cl
|| xfer_size
>= size
) {
3197 * we've hit an unallocated page or
3198 * the end of this request... see if
3199 * it's time to fire the I/O
3204 * we've hit the end of the current physical
3205 * segment and there's more to do, so try
3206 * moving to the next one
3210 ps_offset
[seg_index
] = ps_clmap(vs
, cur_offset
& ~cl_mask
, &clmap
, CL_FIND
, 0, 0);
3211 psp
[seg_index
] = CLMAP_PS(clmap
);
3214 if ((ps_offset
[seg_index
- 1] != (ps_offset
[seg_index
] - cl_size
)) || (psp
[seg_index
- 1] != psp
[seg_index
])) {
3216 * if the physical segment we're about
3217 * to step into is not contiguous to
3218 * the one we're currently in, or it's
3219 * in a different paging file, or
3220 * it hasn't been allocated....
3221 * we stop this run and go check
3222 * to see if it's time to fire the I/O
3227 * start with first page of the next physical
3232 if (xfer_size
== 0) {
3234 * no I/O to generate for this segment
3238 if (cur_offset
<= orig_vs_offset
) {
3240 * we've hit a hole in our speculative cluster
3241 * before the offset that we're really after...
3242 * don't issue the I/O since it doesn't encompass
3243 * the original offset and we're looking to only
3244 * pull in the speculative pages if they can be
3245 * made part of a single I/O
3248 vs_offset
+= xfer_size
;
3253 * we have a contiguous range of allocated pages
3254 * to read from that encompasses the original offset
3256 page_list_count
= 0;
3257 memory_object_super_upl_request(vs
->vs_control
, (memory_object_offset_t
)vs_offset
,
3258 xfer_size
, xfer_size
,
3259 &upl
, NULL
, &page_list_count
,
3260 request_flags
| UPL_SET_INTERNAL
);
3262 error
= ps_read_file(psp
[beg_pseg
],
3263 upl
, (upl_offset_t
) 0,
3264 ps_offset
[beg_pseg
] + (beg_indx
* vm_page_size
),
3265 xfer_size
, &residual
, io_flags
);
3269 * Adjust counts and send response to VM. Optimize
3270 * for the common case, i.e. no error and/or partial
3271 * data. If there was an error, then we need to error
3272 * the entire range, even if some data was successfully
3273 * read. If there was a partial read we may supply some
3274 * data and may error some as well. In all cases the
3275 * VM must receive some notification for every page
3278 if ((error
== KERN_SUCCESS
) && (residual
== 0)) {
3280 * Got everything we asked for, supply the data
3281 * to the VM. Note that as a side effect of
3282 * supplying the data, the buffer holding the
3283 * supplied data is deallocated from the pager's
3290 failed_size
= xfer_size
;
3292 if (error
== KERN_SUCCESS
) {
3293 if (residual
== xfer_size
) {
3295 * If a read operation returns no error
3296 * and no data moved, we turn it into
3297 * an error, assuming we're reading at
3299 * Fall through and error the entire range.
3301 error
= KERN_FAILURE
;
3304 * Otherwise, we have partial read. If
3305 * the part read is a integral number
3306 * of pages supply it. Otherwise round
3307 * it up to a page boundary, zero fill
3308 * the unread part, and supply it.
3309 * Fall through and error the remainder
3310 * of the range, if any.
3314 fill
= residual
& (vm_page_size
- 1);
3315 lsize
= (xfer_size
- residual
) + fill
;
3317 if (lsize
< xfer_size
)
3318 failed_size
= xfer_size
- lsize
;
3320 if (reclaim_all
== FALSE
)
3321 error
= KERN_FAILURE
;
3325 pvs_object_data_provided(vs
, upl
, vs_offset
, lsize
);
3329 * There was an error in some part of the range, tell
3330 * the VM. Note that error is explicitly checked again
3331 * since it can be modified above.
3333 BS_STAT(psp
[beg_pseg
]->ps_bs
,
3334 psp
[beg_pseg
]->ps_bs
->bs_pages_in_fail
+= atop_32(failed_size
));
3337 * we've issued a single I/O that encompassed the original offset
3338 * at this point we either met our speculative request length or
3339 * we ran into a 'hole' (i.e. page not present in the cluster, cluster
3340 * not present or not physically contiguous to the previous one), so
3341 * we're done issuing I/O at this point
3349 int vs_do_async_write
= 1;
3355 upl_offset_t offset
,
3357 boolean_t dp_internal
,
3360 upl_size_t transfer_size
;
3364 dp_offset_t actual_offset
; /* Offset within paging segment */
3365 paging_segment_t ps
;
3366 dp_offset_t mobj_base_addr
;
3367 dp_offset_t mobj_target_addr
;
3370 upl_page_info_t
*pl
;
3372 unsigned int page_max_index
;
3375 unsigned int cl_size
;
3377 unsigned int seg_size
;
3378 unsigned int upl_offset_in_object
;
3379 boolean_t minimal_clustering
= FALSE
;
3380 boolean_t found_dirty
;
3382 if (!dp_encryption_inited
) {
3385 * Once we've started using swap, we
3386 * can't change our mind on whether
3387 * it needs to be encrypted or
3390 dp_encryption_inited
= TRUE
;
3392 if (dp_encryption
) {
3395 * the UPL will need to be encrypted...
3397 flags
|= UPL_PAGING_ENCRYPTED
;
3400 pages_in_cl
= 1 << vs
->vs_clshift
;
3401 cl_size
= pages_in_cl
* vm_page_size
;
3404 minimal_clustering
= TRUE
;
3406 if (dp_isssd
== TRUE
)
3407 minimal_clustering
= TRUE
;
3410 unsigned int page_list_count
;
3412 unsigned int super_size
;
3417 upl_offset_t upl_offset
;
3418 upl_offset_t upl_offset_aligned
;
3419 dp_offset_t seg_offset
;
3420 dp_offset_t ps_offset
[((VM_SUPER_CLUSTER
/ PAGE_SIZE
) >> VSTRUCT_MIN_CLSHIFT
) + 1];
3421 paging_segment_t psp
[((VM_SUPER_CLUSTER
/ PAGE_SIZE
) >> VSTRUCT_MIN_CLSHIFT
) + 1];
3425 super_size
= cl_size
;
3427 super_size
= VM_SUPER_CLUSTER
;
3429 request_flags
= UPL_NOBLOCK
| UPL_CLEAN_IN_PLACE
|
3430 UPL_RET_ONLY_DIRTY
| UPL_COPYOUT_FROM
|
3431 UPL_NO_SYNC
| UPL_SET_INTERNAL
| UPL_SET_LITE
;
3433 if (dp_encryption
) {
3436 * request that the UPL be prepared for
3439 request_flags
|= UPL_ENCRYPT
;
3440 flags
|= UPL_PAGING_ENCRYPTED
;
3443 page_list_count
= 0;
3444 memory_object_super_upl_request(vs
->vs_control
,
3445 (memory_object_offset_t
)offset
,
3447 &upl
, NULL
, &page_list_count
,
3448 request_flags
| UPL_FOR_PAGEOUT
);
3451 * The default pager does not handle objects larger than
3452 * 4GB, so it does not deal with offset that don't fit in
3453 * 32-bit. Cast down upl->offset now and make sure we
3454 * did not lose any valuable bits.
3456 upl_offset_in_object
= (unsigned int) upl
->offset
;
3457 assert(upl
->offset
== upl_offset_in_object
);
3459 pl
= UPL_GET_INTERNAL_PAGE_LIST(upl
);
3461 seg_size
= cl_size
- (upl_offset_in_object
% cl_size
);
3462 upl_offset_aligned
= upl_offset_in_object
& ~(cl_size
- 1);
3464 page_max_index
= upl
->size
/ PAGE_SIZE
;
3467 for (seg_index
= 0, transfer_size
= upl
->size
; transfer_size
> 0; ) {
3469 unsigned int seg_pgcnt
;
3471 seg_pgcnt
= seg_size
/ PAGE_SIZE
;
3473 if (minimal_clustering
== TRUE
) {
3474 unsigned int non_dirty
;
3477 found_dirty
= FALSE
;
3479 for (; non_dirty
< seg_pgcnt
; non_dirty
++) {
3480 if ((page_index
+ non_dirty
) >= page_max_index
)
3483 if (UPL_DIRTY_PAGE(pl
, page_index
+ non_dirty
) ||
3484 UPL_PRECIOUS_PAGE(pl
, page_index
+ non_dirty
)) {
3490 if (found_dirty
== TRUE
) {
3491 ps_offset
[seg_index
] =
3497 if (ps_offset
[seg_index
] == (dp_offset_t
) -1) {
3499 upl_deallocate(upl
);
3501 return KERN_FAILURE
;
3503 psp
[seg_index
] = CLMAP_PS(clmap
);
3505 if (transfer_size
> seg_size
) {
3506 page_index
+= seg_pgcnt
;
3507 transfer_size
-= seg_size
;
3508 upl_offset_aligned
+= cl_size
;
3515 * Ignore any non-present pages at the end of the
3518 for (page_index
= upl
->size
/ vm_page_size
; page_index
> 0;) {
3519 if (UPL_PAGE_PRESENT(pl
, --page_index
)) {
3524 if (page_index
== 0) {
3526 * no pages in the UPL
3530 upl_deallocate(upl
);
3532 return KERN_SUCCESS
;
3534 num_of_pages
= page_index
;
3536 base_index
= (upl_offset_in_object
% cl_size
) / PAGE_SIZE
;
3538 for (page_index
= 0; page_index
< num_of_pages
; ) {
3540 * skip over non-dirty pages
3542 for ( ; page_index
< num_of_pages
; page_index
++) {
3543 if (UPL_DIRTY_PAGE(pl
, page_index
)
3544 || UPL_PRECIOUS_PAGE(pl
, page_index
))
3546 * this is a page we need to write
3547 * go see if we can buddy it up with
3548 * others that are contiguous to it
3552 * if the page is not-dirty, but present we
3553 * need to commit it... This is an unusual
3554 * case since we only asked for dirty pages
3556 if (UPL_PAGE_PRESENT(pl
, page_index
)) {
3557 boolean_t empty
= FALSE
;
3558 upl_commit_range(upl
,
3559 page_index
* vm_page_size
,
3561 UPL_COMMIT_NOTIFY_EMPTY
,
3566 assert(page_index
==
3568 upl_deallocate(upl
);
3572 if (page_index
== num_of_pages
)
3574 * no more pages to look at, we're out of here
3579 * gather up contiguous dirty pages... we have at
3580 * least 1 * otherwise we would have bailed above
3581 * make sure that each physical segment that we step
3582 * into is contiguous to the one we're currently in
3583 * if it's not, we have to stop and write what we have
3585 for (first_dirty
= page_index
;
3586 page_index
< num_of_pages
; ) {
3587 if ( !UPL_DIRTY_PAGE(pl
, page_index
)
3588 && !UPL_PRECIOUS_PAGE(pl
, page_index
))
3592 * if we just looked at the last page in the UPL
3593 * we don't need to check for physical segment
3596 if (page_index
< num_of_pages
) {
3600 cur_seg
= (base_index
+ (page_index
- 1))/pages_in_cl
;
3601 nxt_seg
= (base_index
+ page_index
)/pages_in_cl
;
3603 if (cur_seg
!= nxt_seg
) {
3604 if ((ps_offset
[cur_seg
] != (ps_offset
[nxt_seg
] - cl_size
)) || (psp
[cur_seg
] != psp
[nxt_seg
]))
3606 * if the segment we're about
3607 * to step into is not
3608 * contiguous to the one we're
3609 * currently in, or it's in a
3610 * different paging file....
3611 * we stop here and generate
3618 num_dirty
= page_index
- first_dirty
;
3621 upl_offset
= first_dirty
* vm_page_size
;
3622 transfer_size
= num_dirty
* vm_page_size
;
3624 while (transfer_size
) {
3626 if ((seg_size
= cl_size
-
3627 ((upl_offset_in_object
+
3628 upl_offset
) % cl_size
))
3630 seg_size
= transfer_size
;
3632 ps_vs_write_complete(
3634 (upl_offset_in_object
+
3638 transfer_size
-= seg_size
;
3639 upl_offset
+= seg_size
;
3641 upl_offset
= first_dirty
* vm_page_size
;
3642 transfer_size
= num_dirty
* vm_page_size
;
3644 seg_index
= (base_index
+ first_dirty
) / pages_in_cl
;
3645 seg_offset
= (upl_offset_in_object
+ upl_offset
) % cl_size
;
3647 error
= ps_write_file(psp
[seg_index
],
3649 ps_offset
[seg_index
]
3651 transfer_size
, flags
);
3656 assert(cnt
<= (unsigned) (vm_page_size
<< vs
->vs_clshift
));
3660 /* The caller provides a mapped_data which is derived */
3661 /* from a temporary object. The targeted pages are */
3662 /* guaranteed to be set at offset 0 in the mapped_data */
3663 /* The actual offset however must still be derived */
3664 /* from the offset in the vs in question */
3665 mobj_base_addr
= offset
;
3666 mobj_target_addr
= mobj_base_addr
;
3668 for (transfer_size
= list_size
; transfer_size
!= 0;) {
3669 actual_offset
= ps_clmap(vs
, mobj_target_addr
,
3671 transfer_size
< cl_size
?
3672 transfer_size
: cl_size
, 0);
3673 if(actual_offset
== (dp_offset_t
) -1) {
3677 cnt
= MIN(transfer_size
,
3678 (unsigned) CLMAP_NPGS(clmap
) * vm_page_size
);
3679 ps
= CLMAP_PS(clmap
);
3680 /* Assume that the caller has given us contiguous */
3683 ps_vs_write_complete(vs
, mobj_target_addr
,
3685 error
= ps_write_file(ps
, internal_upl
,
3693 actual_offset
+= cnt
;
3694 mobj_target_addr
+= cnt
;
3695 transfer_size
-= cnt
;
3703 return KERN_FAILURE
;
3705 return KERN_SUCCESS
;
vm_size_t
ps_vstruct_allocated_size(
	vstruct_t	vs)
{
	unsigned int	num_pages;
	struct vs_map	*vsmap;
	unsigned int	i, j, k;

	num_pages = 0;
	if (vs->vs_indirect) {
		/* loop on indirect maps */
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			if (vsmap == NULL)
				continue;
			/* loop on clusters in this indirect map */
			for (j = 0; j < CLMAP_ENTRIES; j++) {
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j]))
					continue;
				/* loop on pages in this cluster */
				for (k = 0; k < VSCLSIZE(vs); k++) {
					if ((VSM_BMAP(vsmap[j])) & (1 << k))
						num_pages++;
				}
			}
		}
	} else {
		vsmap = vs->vs_dmap;
		if (vsmap == NULL)
			return 0;
		/* loop on clusters in the direct map */
		for (j = 0; j < CLMAP_ENTRIES; j++) {
			if (VSM_ISCLR(vsmap[j]) ||
			    VSM_ISERR(vsmap[j]))
				continue;
			/* loop on pages in this cluster */
			for (k = 0; k < VSCLSIZE(vs); k++) {
				if ((VSM_BMAP(vsmap[j])) & (1 << k))
					num_pages++;
			}
		}
	}

	return ptoa_32(num_pages);
}
unsigned int
ps_vstruct_allocated_pages(
	vstruct_t		vs,
	default_pager_page_t	*pages,
	unsigned int		pages_size)
{
	unsigned int		num_pages;
	struct vs_map		*vsmap;
	dp_offset_t		offset;
	unsigned int		i, j, k;

	num_pages = 0;
	offset = 0;
	if (vs->vs_indirect) {
		/* loop on indirect maps */
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			if (vsmap == NULL) {
				offset += (vm_page_size * CLMAP_ENTRIES *
					   VSCLSIZE(vs));
				continue;
			}
			/* loop on clusters in this indirect map */
			for (j = 0; j < CLMAP_ENTRIES; j++) {
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j])) {
					offset += vm_page_size * VSCLSIZE(vs);
					continue;
				}
				/* loop on pages in this cluster */
				for (k = 0; k < VSCLSIZE(vs); k++) {
					if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
						num_pages++;
						if (num_pages < pages_size)
							pages++->dpp_offset =
								offset;
					}
					offset += vm_page_size;
				}
			}
		}
	} else {
		vsmap = vs->vs_dmap;
		if (vsmap == NULL)
			return 0;
		/* loop on clusters in the direct map */
		for (j = 0; j < CLMAP_ENTRIES; j++) {
			if (VSM_ISCLR(vsmap[j]) ||
			    VSM_ISERR(vsmap[j])) {
				offset += vm_page_size * VSCLSIZE(vs);
				continue;
			}
			/* loop on pages in this cluster */
			for (k = 0; k < VSCLSIZE(vs); k++) {
				if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
					num_pages++;
					if (num_pages < pages_size)
						pages++->dpp_offset = offset;
				}
				offset += vm_page_size;
			}
		}
	}

	return num_pages;
}
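/*
 * Both walkers above visit every cluster map entry and count (or report)
 * the pages whose bit is set in the cluster's bitmap.  A minimal stand-alone
 * version of the inner test, counting set bits in one cluster bitmap, is
 * sketched below; the helper name is hypothetical and not part of the pager.
 */
#if 0	/* illustrative sketch, not compiled */
static unsigned int
example_pages_in_cluster(unsigned int bmap, unsigned int pages_per_cluster)
{
	unsigned int	k, count = 0;

	for (k = 0; k < pages_per_cluster; k++) {
		if (bmap & (1 << k))
			count++;
	}
	return count;
}
#endif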
3824 ps_vstruct_transfer_from_segment(
3826 paging_segment_t segment
,
3829 struct vs_map
*vsmap
;
3830 // struct vs_map old_vsmap;
3831 // struct vs_map new_vsmap;
3834 VS_LOCK(vs
); /* block all work on this vstruct */
3835 /* can't allow the normal multiple write */
3836 /* semantic because writes may conflict */
3837 vs
->vs_xfer_pending
= TRUE
;
3838 vs_wait_for_sync_writers(vs
);
3840 vs_wait_for_readers(vs
);
	/* we will unlock the vs to allow other writes while transferring  */
	/* and will be guaranteed of the persistence of the vs struct      */
	/* because the caller of ps_vstruct_transfer_from_segment bumped   */
	/* vs_async_pending                                                */
	/* OK we now have guaranteed no other parties are accessing this   */
	/* vs.  Now that we are also supporting simple lock versions of    */
	/* vs_lock we cannot hold onto VS_LOCK as we may block below.      */
	/* our purpose in holding it before was the multiple write case    */
	/* we now use the boolean xfer_pending to do that.  We can use     */
	/* a boolean instead of a count because we have guaranteed single  */
	/* file access to this code in its caller                          */
3854 if (vs
->vs_indirect
) {
3855 unsigned int vsmap_size
;
3857 /* loop on indirect maps */
3858 for (i
= 0; i
< INDIRECT_CLMAP_ENTRIES(vs
->vs_size
); i
++) {
3859 vsmap
= vs
->vs_imap
[i
];
3862 /* loop on clusters in this indirect map */
3863 clmap_off
= (vm_page_size
* CLMAP_ENTRIES
*
3865 if(i
+1 == INDIRECT_CLMAP_ENTRIES(vs
->vs_size
))
3866 vsmap_size
= vs
->vs_size
- (CLMAP_ENTRIES
* i
);
3868 vsmap_size
= CLMAP_ENTRIES
;
3869 for (j
= 0; j
< vsmap_size
; j
++) {
3870 if (VSM_ISCLR(vsmap
[j
]) ||
3871 VSM_ISERR(vsmap
[j
]) ||
3872 (VSM_PS(vsmap
[j
]) != segment
))
3874 if(vs_cluster_transfer(vs
,
3875 (vm_page_size
* (j
<< vs
->vs_clshift
))
3877 vm_page_size
<< vs
->vs_clshift
,
3881 vs
->vs_xfer_pending
= FALSE
;
3883 vs_finish_write(vs
);
3884 return KERN_FAILURE
;
3886 /* allow other readers/writers during transfer*/
3888 vs
->vs_xfer_pending
= FALSE
;
3890 vs_finish_write(vs
);
3892 if (backing_store_abort_compaction
|| backing_store_stop_compaction
) {
3893 backing_store_abort_compaction
= FALSE
;
3894 dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n"));
3895 return KERN_FAILURE
;
3897 vnode_pager_throttle();
3900 vs
->vs_xfer_pending
= TRUE
;
3901 vs_wait_for_sync_writers(vs
);
3903 vs_wait_for_readers(vs
);
3905 if (!(vs
->vs_indirect
)) {
3911 vsmap
= vs
->vs_dmap
;
3912 if (vsmap
== NULL
) {
3914 vs
->vs_xfer_pending
= FALSE
;
3916 vs_finish_write(vs
);
3917 return KERN_SUCCESS
;
3919 /* loop on clusters in the direct map */
3920 for (j
= 0; j
< vs
->vs_size
; j
++) {
3921 if (VSM_ISCLR(vsmap
[j
]) ||
3922 VSM_ISERR(vsmap
[j
]) ||
3923 (VSM_PS(vsmap
[j
]) != segment
))
3925 if(vs_cluster_transfer(vs
,
3926 vm_page_size
* (j
<< vs
->vs_clshift
),
3927 vm_page_size
<< vs
->vs_clshift
,
3928 upl
) != KERN_SUCCESS
) {
3930 vs
->vs_xfer_pending
= FALSE
;
3932 vs_finish_write(vs
);
3933 return KERN_FAILURE
;
3935 /* allow other readers/writers during transfer*/
3937 vs
->vs_xfer_pending
= FALSE
;
3939 vs_finish_write(vs
);
3941 vs
->vs_xfer_pending
= TRUE
;
3942 vs_wait_for_sync_writers(vs
);
3944 vs_wait_for_readers(vs
);
3946 if (vs
->vs_indirect
) {
3953 vs
->vs_xfer_pending
= FALSE
;
3955 vs_finish_write(vs
);
3956 return KERN_SUCCESS
;
vs_map_t
vs_get_map_entry(
	vstruct_t	vs,
	dp_offset_t	offset)
{
	struct vs_map	*vsmap;
	dp_offset_t	cluster;

	cluster = atop_32(offset) >> vs->vs_clshift;
	if (vs->vs_indirect) {
		long	ind_block = cluster/CLMAP_ENTRIES;

		/* Is the indirect block allocated? */
		vsmap = vs->vs_imap[ind_block];
		if(vsmap == (vs_map_t) NULL)
			return vsmap;
	} else
		vsmap = vs->vs_dmap;
	vsmap += cluster%CLMAP_ENTRIES;
	return vsmap;
}
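/*
 * Worked example of the lookup above (illustrative only): with
 * vs_clshift == 2 (4 pages per cluster) and 4 KB pages, offset 0x23000 is
 * page atop_32(0x23000) == 0x23, so cluster 0x23 >> 2 == 8.  For an indirect
 * map the entry lives in block 8 / CLMAP_ENTRIES at index 8 % CLMAP_ENTRIES;
 * a direct map is indexed the same way without the block step.  The helper
 * name below is hypothetical.
 */
#if 0	/* illustrative sketch, not compiled */
static dp_offset_t
example_cluster_of(dp_offset_t offset, int clshift)
{
	return atop_32(offset) >> clshift;	/* 8 for the example above */
}
#endif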
3984 vs_cluster_transfer(
3990 dp_offset_t actual_offset
;
3991 paging_segment_t ps
;
3993 kern_return_t error
= KERN_SUCCESS
;
3994 unsigned int size
, size_wanted
;
3996 unsigned int residual
= 0;
3997 unsigned int unavail_size
;
3998 // default_pager_thread_t *dpt;
3999 // boolean_t dealloc;
4000 struct vs_map
*vsmap_ptr
= NULL
;
4001 struct vs_map read_vsmap
;
4002 struct vs_map original_read_vsmap
;
4003 struct vs_map write_vsmap
;
4005 // vm_offset_t ioaddr;
4007 /* vs_cluster_transfer reads in the pages of a cluster and
4008 * then writes these pages back to new backing store. The
4009 * segment the pages are being read from is assumed to have
4010 * been taken off-line and is no longer considered for new
4015 * This loop will be executed once per cluster referenced.
4016 * Typically this means once, since it's unlikely that the
4017 * VM system will ask for anything spanning cluster boundaries.
4019 * If there are holes in a cluster (in a paging segment), we stop
4020 * reading at the hole, then loop again, hoping to
4021 * find valid pages later in the cluster. This continues until
4022 * the entire range has been examined, and read, if present. The
4023 * pages are written as they are read. If a failure occurs after
4024 * some pages are written the unmap call at the bottom of the loop
4025 * recovers the backing store and the old backing store remains
4029 VSM_CLR(write_vsmap
);
4030 VSM_CLR(original_read_vsmap
);
4031 /* grab the actual object's pages to sync with I/O */
4032 while (cnt
&& (error
== KERN_SUCCESS
)) {
4033 vsmap_ptr
= vs_get_map_entry(vs
, offset
);
4034 actual_offset
= ps_clmap(vs
, offset
, &clmap
, CL_FIND
, 0, 0);
4036 if (actual_offset
== (dp_offset_t
) -1) {
4039 * Nothing left to write in this cluster at least
4040 * set write cluster information for any previous
4041 * write, clear for next cluster, if there is one
4043 unsigned int local_size
, clmask
, clsize
;
4045 clsize
= vm_page_size
<< vs
->vs_clshift
;
4046 clmask
= clsize
- 1;
4047 local_size
= clsize
- (offset
& clmask
);
4049 local_size
= MIN(local_size
, cnt
);
4051 /* This cluster has no data in it beyond what may */
4052 /* have been found on a previous iteration through */
4053 /* the loop "write_vsmap" */
4054 *vsmap_ptr
= write_vsmap
;
4055 VSM_CLR(write_vsmap
);
4056 VSM_CLR(original_read_vsmap
);
4059 offset
+= local_size
;
4064 * Count up contiguous available or unavailable
4067 ps
= CLMAP_PS(clmap
);
4072 (size
< cnt
) && (unavail_size
< cnt
) &&
4073 (i
< CLMAP_NPGS(clmap
)); i
++) {
4074 if (CLMAP_ISSET(clmap
, i
)) {
4075 if (unavail_size
!= 0)
4077 size
+= vm_page_size
;
4079 ps
->ps_bs
->bs_pages_in
++);
4083 unavail_size
+= vm_page_size
;
4088 ASSERT(unavail_size
);
4089 ps_clunmap(vs
, offset
, unavail_size
);
4090 cnt
-= unavail_size
;
4091 offset
+= unavail_size
;
4092 if((offset
& ((vm_page_size
<< vs
->vs_clshift
) - 1))
4094 /* There is no more to transfer in this
4097 *vsmap_ptr
= write_vsmap
;
4098 VSM_CLR(write_vsmap
);
4099 VSM_CLR(original_read_vsmap
);
4104 if(VSM_ISCLR(original_read_vsmap
))
4105 original_read_vsmap
= *vsmap_ptr
;
4107 if(ps
->ps_segtype
== PS_PARTITION
) {
4108 panic("swap partition not supported\n");
4110 error
= KERN_FAILURE
;
4113 NEED TO ISSUE WITH SYNC & NO COMMIT
4114 error = ps_read_device(ps, actual_offset, &buffer,
4115 size, &residual, flags);
4118 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
4119 error
= ps_read_file(ps
, upl
, (upl_offset_t
) 0, actual_offset
,
4121 (UPL_IOSYNC
| UPL_NOCOMMIT
| (dp_encryption
? UPL_PAGING_ENCRYPTED
: 0)));
4124 read_vsmap
= *vsmap_ptr
;
4128 * Adjust counts and put data in new BS. Optimize for the
4129 * common case, i.e. no error and/or partial data.
4130 * If there was an error, then we need to error the entire
4131 * range, even if some data was successfully read.
4134 if ((error
== KERN_SUCCESS
) && (residual
== 0)) {
4137 * Got everything we asked for, supply the data to
4138 * the new BS. Note that as a side effect of supplying
4139 * the data, the buffer holding the supplied data is
4140 * deallocated from the pager's address space unless
4141 * the write is unsuccessful.
4144 /* note buffer will be cleaned up in all cases by */
4145 /* internal_cluster_write or if an error on write */
4146 /* the vm_map_copy_page_discard call */
4147 *vsmap_ptr
= write_vsmap
;
4149 if(vs_cluster_write(vs
, upl
, offset
,
4150 size
, TRUE
, UPL_IOSYNC
| UPL_NOCOMMIT
) != KERN_SUCCESS
) {
4151 error
= KERN_FAILURE
;
4152 if(!(VSM_ISCLR(*vsmap_ptr
))) {
4153 /* unmap the new backing store object */
4154 ps_clunmap(vs
, offset
, size
);
4156 /* original vsmap */
4157 *vsmap_ptr
= original_read_vsmap
;
4158 VSM_CLR(write_vsmap
);
4160 if((offset
+ size
) &
4161 ((vm_page_size
<< vs
->vs_clshift
)
4163 /* There is more to transfer in this
4166 write_vsmap
= *vsmap_ptr
;
4167 *vsmap_ptr
= read_vsmap
;
4168 ps_clunmap(vs
, offset
, size
);
4170 /* discard the old backing object */
4171 write_vsmap
= *vsmap_ptr
;
4172 *vsmap_ptr
= read_vsmap
;
4173 ps_clunmap(vs
, offset
, size
);
4174 *vsmap_ptr
= write_vsmap
;
4175 VSM_CLR(write_vsmap
);
4176 VSM_CLR(original_read_vsmap
);
4181 if (error
== KERN_SUCCESS
) {
4182 if (residual
== size
) {
4184 * If a read operation returns no error
4185 * and no data moved, we turn it into
4186 * an error, assuming we're reading at
4188 * Fall through and error the entire
4191 error
= KERN_FAILURE
;
4192 *vsmap_ptr
= write_vsmap
;
4193 if(!(VSM_ISCLR(*vsmap_ptr
))) {
4194 /* unmap the new backing store object */
4195 ps_clunmap(vs
, offset
, size
);
4197 *vsmap_ptr
= original_read_vsmap
;
4198 VSM_CLR(write_vsmap
);
4202 * Otherwise, we have partial read.
4203 * This is also considered an error
4204 * for the purposes of cluster transfer
4206 error
= KERN_FAILURE
;
4207 *vsmap_ptr
= write_vsmap
;
4208 if(!(VSM_ISCLR(*vsmap_ptr
))) {
4209 /* unmap the new backing store object */
4210 ps_clunmap(vs
, offset
, size
);
4212 *vsmap_ptr
= original_read_vsmap
;
4213 VSM_CLR(write_vsmap
);
4222 } /* END while (cnt && (error == 0)) */
4223 if(!VSM_ISCLR(write_vsmap
))
4224 *vsmap_ptr
= write_vsmap
;
kern_return_t
default_pager_add_file(
	MACH_PORT_FACE	backing_store,
	vnode_ptr_t	vp,
	int		record_size,
	vm_size_t	size)
{
	backing_store_t		bs;
	paging_segment_t	ps;
	int			i;
	unsigned int		j;
	int			error;

	if ((bs = backing_store_lookup(backing_store))
	    == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;
		if (ps->ps_segtype != PS_FILE)
			continue;

		/*
		 * Check for overlap on same device.
		 */
		if (ps->ps_vnode == (struct vnode *)vp) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	/*
	 * Set up the paging segment
	 */
	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
	if (ps == PAGING_SEGMENT_NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	ps->ps_segtype = PS_FILE;
	ps->ps_vnode = (struct vnode *)vp;
	ps->ps_record_shift = local_log2(vm_page_size / record_size);
	assert((dp_size_t) size == size);
	ps->ps_recnum = (dp_size_t) size;
	ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;

	ps->ps_pgcount = ps->ps_pgnum;
	ps->ps_clshift = local_log2(bs->bs_clsize);
	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
	ps->ps_special_clusters = 0;

	PS_LOCK_INIT(ps);
	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
	if (!ps->ps_bmap) {
		PS_LOCK_DESTROY(ps);
		kfree(ps, sizeof *ps);
		return KERN_RESOURCE_SHORTAGE;
	}
	for (j = 0; j < ps->ps_ncls; j++) {
		clrbit(ps->ps_bmap, j);
	}

	if(paging_segment_count == 0) {
		ps->ps_state = PS_EMERGENCY_SEGMENT;
		if(use_emergency_swap_file_first) {
			ps->ps_state |= PS_CAN_USE;
		}
		emergency_segment_backing_store = backing_store;
	} else {
		ps->ps_state = PS_CAN_USE;
	}

	ps->ps_bs = bs;

	if ((error = ps_enter(ps)) != 0) {
		kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
		PS_LOCK_DESTROY(ps);
		kfree(ps, sizeof *ps);
		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;

	if(IS_PS_OK_TO_USE(ps)) {
		dp_pages_free += ps->ps_pgcount;
	} else {
		dp_pages_reserve += ps->ps_pgcount;
	}

	bs_more_space(ps->ps_clcount);

	/*
	 * If the paging segment being activated is not the emergency
	 * segment and we notice that the emergency segment is being
	 * used then we help recover it. If all goes well, the
	 * emergency segment will be back to its original state of
	 * online but not activated (till it's needed the next time).
	 */
	if (!memorystatus_freeze_enabled) {
		ps = paging_segments[EMERGENCY_PSEG_INDEX];
		if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
			if(default_pager_backing_store_delete(emergency_segment_backing_store)) {
				dprintf(("Failed to recover emergency paging segment\n"));
			} else {
				dprintf(("Recovered emergency paging segment\n"));
			}
		}
	}

	DP_DEBUG(DEBUG_BS_INTERNAL,
		 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
		  device, offset, (dp_size_t) size, record_size,
		  ps->ps_record_shift, ps->ps_pgnum));

	return KERN_SUCCESS;
}
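/*
 * Worked example of the sizing arithmetic above (illustrative only): for a
 * swap file of 262144 records of 512 bytes, 4 KB pages, and a backing store
 * cluster size of 4 pages (bs_clsize == 4):
 *   ps_record_shift = local_log2(4096 / 512) = 3
 *   ps_pgnum        = 262144 >> 3            = 32768 pages (128 MB)
 *   ps_clshift      = local_log2(4)          = 2
 *   ps_clcount      = 32768 >> 2             = 8192 clusters
 * so the backing store gains ps_clcount << ps_clshift = 32768 free pages.
 */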
kern_return_t
ps_read_file(
	paging_segment_t	ps,
	upl_t			upl,
	upl_offset_t		upl_offset,
	dp_offset_t		offset,
	unsigned int		size,
	unsigned int		*residualp,
	int			flags)
{
	vm_object_offset_t	f_offset;
	int			error;
	kern_return_t		result;

	assert(dp_encryption_inited);

	clustered_reads[atop_32(size)]++;

	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

	/*
	 * for transfer case we need to pass uploffset and flags
	 */
	assert((upl_size_t) size == size);
	error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);

	/* The vnode_pagein semantic is somewhat at odds with the existing   */
	/* device_read semantic.  Partial reads are not experienced at this  */
	/* level.  It is up to the bit map code and cluster read code to     */
	/* check that requested data locations are actually backed, and the  */
	/* pagein code to either read all of the requested data or return an */
	/* error.                                                             */

	if (error)
		result = KERN_FAILURE;
	else {
		*residualp = 0;
		result = KERN_SUCCESS;
	}
	return result;
}
kern_return_t
ps_write_file(
	paging_segment_t	ps,
	upl_t			upl,
	upl_offset_t		upl_offset,
	dp_offset_t		offset,
	unsigned int		size,
	int			flags)
{
	vm_object_offset_t	f_offset;
	kern_return_t		result;

	assert(dp_encryption_inited);

	clustered_writes[atop_32(size)]++;
	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

	if (flags & UPL_PAGING_ENCRYPTED) {
		/*
		 * ENCRYPTED SWAP:
		 * encrypt all the pages that we're going
		 * to pageout.
		 */
		upl_encrypt(upl, upl_offset, size);
	}
	assert((upl_size_t) size == size);
	if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
		result = KERN_FAILURE;
	else
		result = KERN_SUCCESS;

	return result;
}
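/*
 * Both file-backed paths above translate a paging-segment-relative offset to
 * a file offset the same way, f_offset = ps_offset + offset, before handing
 * the UPL to vnode_pagein()/vnode_pageout().  The sketch below shows only
 * that translation; "example_file_offset" is a hypothetical name.
 */
#if 0	/* illustrative sketch, not compiled */
static vm_object_offset_t
example_file_offset(paging_segment_t ps, dp_offset_t offset)
{
	return (vm_object_offset_t)(ps->ps_offset + offset);
}
#endif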
static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data)
{
#pragma unused(data)
}

static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data)
{
#pragma unused(data)
}

static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length)
{
#pragma unused(data, map, shift, length)
}
kern_return_t
default_pager_triggers( __unused MACH_PORT_FACE default_pager,
	int		hi_wat,
	int		lo_wat,
	int		flags,
	MACH_PORT_FACE	trigger_port)
{
	MACH_PORT_FACE		release = IPC_PORT_NULL;
	kern_return_t		kr;
	clock_sec_t		now;
	clock_nsec_t		nanoseconds_dummy;
	static clock_sec_t	error_notify = 0;

	if (flags == SWAP_ENCRYPT_ON) {
		/* ENCRYPTED SWAP: turn encryption on */
		release = trigger_port;
		if (!dp_encryption_inited) {
			dp_encryption_inited = TRUE;
			dp_encryption = TRUE;
			kr = KERN_SUCCESS;
		} else {
			kr = KERN_FAILURE;
		}
	} else if (flags == SWAP_ENCRYPT_OFF) {
		/* ENCRYPTED SWAP: turn encryption off */
		release = trigger_port;
		if (!dp_encryption_inited) {
			dp_encryption_inited = TRUE;
			dp_encryption = FALSE;
			kr = KERN_SUCCESS;
		} else {
			kr = KERN_FAILURE;
		}
	} else if (flags == HI_WAT_ALERT) {
		release = min_pages_trigger_port;
		/* High and low water signals aren't applicable when freeze is */
		/* enabled, so release the trigger ports here and return.      */
		if (memorystatus_freeze_enabled) {
			if (IP_VALID( trigger_port )){
				ipc_port_release_send( trigger_port );
			}
			min_pages_trigger_port = IPC_PORT_NULL;
			kr = KERN_FAILURE;
		} else {
			min_pages_trigger_port = trigger_port;
			minimum_pages_remaining = hi_wat/vm_page_size;
			kr = KERN_SUCCESS;
		}
	} else if (flags == LO_WAT_ALERT) {
		release = max_pages_trigger_port;
		if (memorystatus_freeze_enabled) {
			if (IP_VALID( trigger_port )){
				ipc_port_release_send( trigger_port );
			}
			max_pages_trigger_port = IPC_PORT_NULL;
			kr = KERN_FAILURE;
		} else {
			max_pages_trigger_port = trigger_port;
			maximum_pages_free = lo_wat/vm_page_size;
			kr = KERN_SUCCESS;
		}
	} else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
		use_emergency_swap_file_first = TRUE;
		release = trigger_port;
		kr = KERN_SUCCESS;
	} else if (flags == SWAP_FILE_CREATION_ERROR) {
		release = trigger_port;
		kr = KERN_SUCCESS;
		if( paging_segment_count == 1) {
			use_emergency_swap_file_first = TRUE;
		}
		no_paging_space_action();
		clock_get_system_nanotime(&now, &nanoseconds_dummy);
		if (now > error_notify + 5) {
			dprintf(("Swap File Error.\n"));
			error_notify = now;
		}
	} else {
		release = trigger_port;
		kr = KERN_INVALID_ARGUMENT;
	}

	if (IP_VALID(release))
		ipc_port_release_send(release);

	return kr;
}
/*
 * Monitor the amount of available backing store vs. the amount of
 * required backing store, notify a listener (if present) when
 * backing store may safely be removed.
 *
 * We attempt to avoid the situation where backing store is
 * discarded en masse, as this can lead to thrashing as the
 * backing store is compacted.
 */

#define PF_INTERVAL	3	/* time between free level checks */
#define PF_LATENCY	10	/* number of intervals before release */

static int dp_pages_free_low_count = 0;
thread_call_t default_pager_backing_store_monitor_callout;

void
default_pager_backing_store_monitor(__unused thread_call_param_t p1,
				    __unused thread_call_param_t p2)
{
//	unsigned long long	average;
	ipc_port_t		trigger;
	uint64_t		deadline;

	/*
	 * We determine whether it will be safe to release some
	 * backing store by watching the free page level.  If
	 * it remains above the maximum_pages_free threshold for
	 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
	 * then we deem it safe.
	 *
	 * Note that this establishes a maximum rate at which backing
	 * store will be released, as each notification (currently)
	 * only results in a single backing store object being released.
	 */
	if (dp_pages_free > maximum_pages_free) {
		dp_pages_free_low_count++;
	} else {
		dp_pages_free_low_count = 0;
	}

	/* decide whether to send notification */
	trigger = IP_NULL;
	if (max_pages_trigger_port &&
	    (backing_store_release_trigger_disable == 0) &&
	    (dp_pages_free_low_count > PF_LATENCY)) {
		trigger = max_pages_trigger_port;
		max_pages_trigger_port = NULL;
	}

	/* send notification */
	if (trigger != IP_NULL) {
		if (backing_store_release_trigger_disable != 0) {
			assert_wait((event_t)
				    &backing_store_release_trigger_disable,
				    THREAD_UNINT);
			thread_block(THREAD_CONTINUE_NULL);
		}
		dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n"));

		default_pager_space_alert(trigger, LO_WAT_ALERT);
		ipc_port_release_send(trigger);
		dp_pages_free_low_count = 0;
	}

	clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
	thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
}
unsigned int default_pager_swap_pages_free() {
	return dp_pages_free;
}
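/*
 * Worked example (illustrative only): with PF_INTERVAL == 3 seconds and
 * PF_LATENCY == 10 checks, a LO_WAT_ALERT goes out only after dp_pages_free
 * has stayed above maximum_pages_free for roughly 3 * 10 = 30 seconds, and
 * the counter resets the moment the level dips back, which is the hysteresis
 * that keeps backing store from being torn down in bursts.  The stand-alone
 * counter below mirrors that logic; the names are hypothetical.
 */
#if 0	/* illustrative sketch, not compiled */
static int
example_should_notify(unsigned int pages_free, unsigned int threshold,
		      int *high_count, int latency)
{
	if (pages_free > threshold)
		(*high_count)++;
	else
		*high_count = 0;

	return *high_count > latency;
}
#endif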