/*
 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 * Paging File Management.
 */
#include <mach/host_priv.h>
#include <mach/memory_object_control.h>
#include <mach/memory_object_server.h>

#include <default_pager/default_pager_internal.h>
#include <default_pager/default_pager_alerts.h>
#include <default_pager/default_pager_object_server.h>

#include <ipc/ipc_types.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_space.h>

#include <kern/kern_types.h>
#include <kern/host.h>
#include <kern/queue.h>
#include <kern/counters.h>
#include <kern/sched_prim.h>

#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_protos.h>
/* LP64todo - need large internal object support */

/*
 * ALLOC_STRIDE... the maximum number of bytes allocated from
 * a swap file before moving on to the next swap file... if
 * all swap files reside on a single disk, this value should
 * be very large (this is the default assumption)... if the
 * swap files are spread across multiple disks, then this value
 * should be small (128 * 1024)...
 *
 * This should be determined dynamically in the future
 */
#define ALLOC_STRIDE	(1024 * 1024 * 1024)
int physical_transfer_cluster_count = 0;

#define VM_SUPER_CLUSTER	0x40000
#define VM_SUPER_PAGES		64

/*
 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
 */
#define VSTRUCT_DEF_CLSHIFT	2
int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
int default_pager_clsize = 0;

unsigned int clustered_writes[VM_SUPER_PAGES+1];
unsigned int clustered_reads[VM_SUPER_PAGES+1];
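/*
 * Illustrative sketch (not part of the original code): clustered_reads
 * and clustered_writes are histograms of transfer sizes, indexed by the
 * number of pages moved in one I/O (they are bumped elsewhere in this
 * file as clustered_reads[atop_32(size)]++).  The helper below is
 * hypothetical and only shows how cluster geometry follows from the
 * cluster shift.
 */
#if 0	/* example only */
static void
example_cluster_geometry(void)
{
	int pages_per_cluster = 1 << vstruct_def_clshift;	/* shift 2 -> 4 pages */
	vm_size_t cluster_bytes = pages_per_cluster * vm_page_size;

	/* a full-cluster read of 4 pages would be counted in slot 4 */
	clustered_reads[pages_per_cluster]++;
	(void) cluster_bytes;
}
#endif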
/*
 * Globals used for asynchronous paging operations:
 *	vs_async_list: head of list of to-be-completed I/O ops
 *	async_num_queued: number of pages completed, but not yet
 *		processed by async thread.
 *	async_requests_out: number of pages of requests not completed.
 */
struct vs_async *vs_async_list;
int	async_num_queued;
int	async_requests_out;

#define	VS_ASYNC_REUSE	1
struct vs_async *vs_async_free_list;

mutex_t	default_pager_async_lock;	/* Protects globals above */

int	vs_alloc_async_failed = 0;	/* statistics */
int	vs_alloc_async_count = 0;	/* statistics */
struct vs_async *vs_alloc_async(void);	/* forward */
void	vs_free_async(struct vs_async *vsa);	/* forward */

#define	VS_ALLOC_ASYNC()	vs_alloc_async()
#define	VS_FREE_ASYNC(vsa)	vs_free_async(vsa)

#define	VS_ASYNC_LOCK()		mutex_lock(&default_pager_async_lock)
#define	VS_ASYNC_UNLOCK()	mutex_unlock(&default_pager_async_lock)
#define	VS_ASYNC_LOCK_INIT()	mutex_init(&default_pager_async_lock, 0)
#define	VS_ASYNC_LOCK_ADDR()	(&default_pager_async_lock)
/*
 * Paging Space Hysteresis triggers and the target notification port
 */
unsigned int	minimum_pages_remaining = 0;
unsigned int	maximum_pages_free = 0;
ipc_port_t	min_pages_trigger_port = NULL;
ipc_port_t	max_pages_trigger_port = NULL;

boolean_t	bs_low = FALSE;
int		backing_store_release_trigger_disable = 0;
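/*
 * Illustrative sketch (not part of the original code): the pattern used
 * throughout this file to fire the low-space notification.  When
 * dp_pages_free drops below minimum_pages_remaining, the send right in
 * min_pages_trigger_port is taken (clearing the global so the alert fires
 * only once), the HI_WAT_ALERT is sent, and the right is released.  The
 * helper name is hypothetical and the locking that normally surrounds the
 * check is elided.
 */
#if 0	/* example only */
static void
example_check_low_space(void)
{
	ipc_port_t trigger = IP_NULL;

	if (min_pages_trigger_port &&
	    (dp_pages_free < minimum_pages_remaining)) {
		trigger = min_pages_trigger_port;
		min_pages_trigger_port = NULL;
	}
	if (trigger != IP_NULL) {
		default_pager_space_alert(trigger, HI_WAT_ALERT);
		ipc_port_release_send(trigger);
	}
}
#endif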
/* Have we decided if swap needs to be encrypted yet ? */
boolean_t	dp_encryption_inited = FALSE;
/* Should we encrypt swap ? */
boolean_t	dp_encryption = FALSE;

/*
 * Object sizes are rounded up to the next power of 2,
 * unless they are bigger than a given maximum size.
 */
vm_size_t	max_doubled_size = 4 * 1024 * 1024;	/* 4 meg */
/*
 * List of all backing store and segments.
 */
struct backing_store_list_head backing_store_list;
paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
mutex_t	paging_segments_lock;
int	paging_segment_max = 0;
int	paging_segment_count = 0;
int	ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };

/*
 * Total pages free in system
 * This differs from clusters committed/avail which is a measure of the
 * over commitment of paging segments to backing store.  An idea which is
 * likely to be deprecated.
 */
unsigned int	dp_pages_free = 0;
unsigned int	cluster_transfer_minimum = 100;
/* forward declarations */
kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, int);	/* forward */
kern_return_t ps_read_file(paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, unsigned int *, int);	/* forward */
default_pager_thread_t *get_read_buffer( void );
kern_return_t ps_vstruct_transfer_from_segment(
	paging_segment_t segment,
kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int);	/* forward */
kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *);	/* forward */
kern_return_t vs_cluster_transfer(
vs_map_t vs_get_map_entry(

default_pager_thread_t *
get_read_buffer( void )
	for (i=0; i<default_pager_internal_count; i++) {
		if(dpt_array[i]->checked_out == FALSE) {
			dpt_array[i]->checked_out = TRUE;
			DPT_UNLOCK(dpt_lock);

		DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
	/*
	 * List of all backing store.
	 */
	queue_init(&backing_store_list.bsl_queue);

	VS_ASYNC_LOCK_INIT();
	vs_async_free_list = NULL;
#endif	/* VS_ASYNC_REUSE */

	for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
		clustered_writes[i] = 0;
		clustered_reads[i] = 0;
/*
 * When things do not quite work out...
 */
void bs_no_paging_space(boolean_t);	/* forward */

	boolean_t out_of_memory)

	dprintf(("*** OUT OF MEMORY ***\n"));
	panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");

void bs_more_space(int);	/* forward */
void bs_commit(int);	/* forward */

boolean_t	user_warned = FALSE;
unsigned int	clusters_committed = 0;
unsigned int	clusters_available = 0;
unsigned int	clusters_committed_peak = 0;
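/*
 * Illustrative note (not in the original source): bs_commit() below adds
 * to clusters_committed as vstructs are created or grown (elsewhere in
 * this file it is called with vs_size), while bs_more_space() adds to
 * clusters_available as paging segments contribute capacity.  Warnings
 * are printed when committed exceeds available -- e.g. 150 committed
 * against 100 available reports being short of 50 clusters -- and
 * clusters_committed_peak tracks the worst such shortfall seen.
 */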
	/*
	 * Account for new paging space.
	 */
	clusters_available += nclusters;

	if (clusters_available >= clusters_committed) {
		if (verbose && user_warned) {
			printf("%s%s - %d excess clusters now.\n",
			       "paging space is OK now",
			       clusters_available - clusters_committed);
			clusters_committed_peak = 0;

		if (verbose && user_warned) {
			printf("%s%s - still short of %d clusters.\n",
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
			clusters_committed_peak -= nclusters;

	clusters_committed += nclusters;
	if (clusters_committed > clusters_available) {
		if (verbose && !user_warned) {
			printf("%s%s - short of %d clusters.\n",
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
		if (clusters_committed > clusters_committed_peak) {
			clusters_committed_peak = clusters_committed;

		if (verbose && user_warned) {
			printf("%s%s - was short of up to %d clusters.\n",
			       "paging space is OK now",
			       clusters_committed_peak - clusters_available);
			clusters_committed_peak = 0;
int	default_pager_info_verbose = 1;

	vm_size_t	pages_total, pages_free;

	pages_total = pages_free = 0;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)

		/*
		 * no need to lock: by the time this data
		 * gets back to any remote requestor it
		 * will be obsolete anyway
		 */
		pages_total += ps->ps_pgnum;
		pages_free += ps->ps_clcount << ps->ps_clshift;
		DP_DEBUG(DEBUG_BS_INTERNAL,
			 ("segment #%d: %d total, %d free\n",
			  i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));

	*totalp = pages_total;

	if (verbose && user_warned && default_pager_info_verbose) {
		if (clusters_available < clusters_committed) {
			printf("%s %d clusters committed, %d available.\n",
backing_store_t backing_store_alloc(void);	/* forward */

backing_store_alloc(void)

	bs = (backing_store_t) kalloc(sizeof (struct backing_store));
	if (bs == BACKING_STORE_NULL)
		panic("backing_store_alloc: no memory");

	bs->bs_port = MACH_PORT_NULL;
	bs->bs_pages_total = 0;
	bs->bs_pages_in_fail = 0;
	bs->bs_pages_out = 0;
	bs->bs_pages_out_fail = 0;
backing_store_t backing_store_lookup(MACH_PORT_FACE);	/* forward */

/* Even in both the component space and external versions of this pager, */
/* backing_store_lookup will be called from tasks in the application space */
backing_store_lookup(
/*
	port is currently backed with a vs structure in the alias field
	we could create an ISBS alias and a port_is_bs call but frankly
	I see no reason for the test, the bs->port == port check below
	will work properly on junk entries.

	if ((port == MACH_PORT_NULL) || port_is_vs(port))
*/
	if ((port == MACH_PORT_NULL))
		return BACKING_STORE_NULL;
436 queue_iterate(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
439 if (bs
->bs_port
== port
) {
441 /* Success, return it locked. */
447 return BACKING_STORE_NULL
;
450 void backing_store_add(backing_store_t
); /* forward */
454 __unused backing_store_t bs
)
456 // MACH_PORT_FACE port = bs->bs_port;
457 // MACH_PORT_FACE pset = default_pager_default_set;
458 kern_return_t kr
= KERN_SUCCESS
;
460 if (kr
!= KERN_SUCCESS
)
461 panic("backing_store_add: add to set");
466 * Set up default page shift, but only if not already
467 * set and argument is within range.
470 bs_set_default_clsize(unsigned int npages
)
477 if (default_pager_clsize
== 0) /* if not yet set */
478 vstruct_def_clshift
= local_log2(npages
);
int bs_get_global_clsize(int clsize);	/* forward */

bs_get_global_clsize(
	memory_object_default_t	dmm;

	/*
	 * Only allow setting of cluster size once. If called
	 * with no cluster size (default), we use the compiled-in default
	 * for the duration. The same cluster size is used for all
	 */
	if (default_pager_clsize == 0) {
		/*
		 * Keep cluster size in bit shift because it's quicker
		 * arithmetic, and easier to keep at a power of 2.
		 */
		if (clsize != NO_CLSIZE) {
			for (i = 0; (1 << i) < clsize; i++);
			if (i > MAX_CLUSTER_SHIFT)
				i = MAX_CLUSTER_SHIFT;
			vstruct_def_clshift = i;

		default_pager_clsize = (1 << vstruct_def_clshift);
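		/*
		 * Worked example (illustrative, not from the original source):
		 * a requested clsize of 6 pages scans i = 0,1,2,3 until
		 * (1 << 3) == 8 >= 6, so vstruct_def_clshift becomes 3 and
		 * default_pager_clsize becomes 8 pages -- the request is
		 * rounded up to the next power of two, capped at
		 * MAX_CLUSTER_SHIFT.
		 */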
		/*
		 * Let the user know the new (and definitive) cluster size.
		 */
			printf("%scluster size = %d page%s\n",
			       my_name, default_pager_clsize,
			       (default_pager_clsize == 1) ? "" : "s");
		/*
		 * Let the kernel know too, in case it hasn't used the
		 * default value provided in main() yet.
		 */
		dmm = default_pager_object;
		clsize = default_pager_clsize * vm_page_size;	/* in bytes */
		kr = host_default_memory_manager(host_priv_self(),

		memory_object_default_deallocate(dmm);

		if (kr != KERN_SUCCESS) {
			panic("bs_get_global_cl_size:host_default_memory_manager");
		if (dmm != default_pager_object) {
			panic("bs_get_global_cl_size:there is another default pager");

	ASSERT(default_pager_clsize > 0 &&
	       (default_pager_clsize & (default_pager_clsize - 1)) == 0);

	return default_pager_clsize;
546 default_pager_backing_store_create(
547 memory_object_default_t pager
,
549 int clsize
, /* in bytes */
550 MACH_PORT_FACE
*backing_store
)
555 struct vstruct_alias
*alias_struct
;
557 if (pager
!= default_pager_object
)
558 return KERN_INVALID_ARGUMENT
;
560 bs
= backing_store_alloc();
561 port
= ipc_port_alloc_kernel();
562 ipc_port_make_send(port
);
563 assert (port
!= IP_NULL
);
565 DP_DEBUG(DEBUG_BS_EXTERNAL
,
566 ("priority=%d clsize=%d bs_port=0x%x\n",
567 priority
, clsize
, (int) backing_store
));
569 alias_struct
= (struct vstruct_alias
*)
570 kalloc(sizeof (struct vstruct_alias
));
571 if(alias_struct
!= NULL
) {
572 alias_struct
->vs
= (struct vstruct
*)bs
;
573 alias_struct
->name
= &default_pager_ops
;
574 port
->alias
= (int) alias_struct
;
577 ipc_port_dealloc_kernel((MACH_PORT_FACE
)(port
));
578 kfree(bs
, sizeof (struct backing_store
));
579 return KERN_RESOURCE_SHORTAGE
;
583 if (priority
== DEFAULT_PAGER_BACKING_STORE_MAXPRI
)
584 priority
= BS_MAXPRI
;
585 else if (priority
== BS_NOPRI
)
586 priority
= BS_MAXPRI
;
588 priority
= BS_MINPRI
;
589 bs
->bs_priority
= priority
;
591 bs
->bs_clsize
= bs_get_global_clsize(atop_32(clsize
));
594 queue_enter(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
598 backing_store_add(bs
);
600 *backing_store
= port
;
605 default_pager_backing_store_info(
606 MACH_PORT_FACE backing_store
,
607 backing_store_flavor_t flavour
,
608 backing_store_info_t info
,
609 mach_msg_type_number_t
*size
)
612 backing_store_basic_info_t basic
;
616 if (flavour
!= BACKING_STORE_BASIC_INFO
||
617 *size
< BACKING_STORE_BASIC_INFO_COUNT
)
618 return KERN_INVALID_ARGUMENT
;
620 basic
= (backing_store_basic_info_t
)info
;
621 *size
= BACKING_STORE_BASIC_INFO_COUNT
;
623 VSTATS_LOCK(&global_stats
.gs_lock
);
624 basic
->pageout_calls
= global_stats
.gs_pageout_calls
;
625 basic
->pagein_calls
= global_stats
.gs_pagein_calls
;
626 basic
->pages_in
= global_stats
.gs_pages_in
;
627 basic
->pages_out
= global_stats
.gs_pages_out
;
628 basic
->pages_unavail
= global_stats
.gs_pages_unavail
;
629 basic
->pages_init
= global_stats
.gs_pages_init
;
630 basic
->pages_init_writes
= global_stats
.gs_pages_init_writes
;
631 VSTATS_UNLOCK(&global_stats
.gs_lock
);
633 if ((bs
= backing_store_lookup(backing_store
)) == BACKING_STORE_NULL
)
634 return KERN_INVALID_ARGUMENT
;
636 basic
->bs_pages_total
= bs
->bs_pages_total
;
638 bs
->bs_pages_free
= 0;
639 for (i
= 0; i
<= paging_segment_max
; i
++) {
640 ps
= paging_segments
[i
];
641 if (ps
!= PAGING_SEGMENT_NULL
&& ps
->ps_bs
== bs
) {
643 bs
->bs_pages_free
+= ps
->ps_clcount
<< ps
->ps_clshift
;
648 basic
->bs_pages_free
= bs
->bs_pages_free
;
649 basic
->bs_pages_in
= bs
->bs_pages_in
;
650 basic
->bs_pages_in_fail
= bs
->bs_pages_in_fail
;
651 basic
->bs_pages_out
= bs
->bs_pages_out
;
652 basic
->bs_pages_out_fail
= bs
->bs_pages_out_fail
;
654 basic
->bs_priority
= bs
->bs_priority
;
655 basic
->bs_clsize
= ptoa_32(bs
->bs_clsize
); /* in bytes */
662 int ps_delete(paging_segment_t
); /* forward */
669 kern_return_t error
= KERN_SUCCESS
;
672 VSL_LOCK(); /* get the lock on the list of vs's */
	/* The lock relationship and sequence are fairly complicated */
	/* this code looks at a live list, locking and unlocking the list */
	/* as it traverses it.  It depends on the locking behavior of */
	/* default_pager_no_senders.  no_senders always locks the vstruct */
	/* targeted for removal before locking the vstruct list.  However */
	/* it will remove that member of the list without locking its */
	/* neighbors.  We can be sure when we hold a lock on a vstruct */
	/* it cannot be removed from the list but we must hold the list */
	/* lock to be sure that its pointers to its neighbors are valid. */
	/* Also, we can hold off destruction of a vstruct when the list */
	/* lock and the vs locks are not being held by bumping the */
	/* vs_async_pending count. */
688 while(backing_store_release_trigger_disable
!= 0) {
689 VSL_SLEEP(&backing_store_release_trigger_disable
, THREAD_UNINT
);
692 /* we will choose instead to hold a send right */
693 vs_count
= vstruct_list
.vsl_count
;
694 vs
= (vstruct_t
) queue_first((queue_entry_t
)&(vstruct_list
.vsl_queue
));
695 if(vs
== (vstruct_t
)&vstruct_list
) {
700 vs_async_wait(vs
); /* wait for any pending async writes */
701 if ((vs_count
!= 0) && (vs
!= NULL
))
702 vs
->vs_async_pending
+= 1; /* hold parties calling */
706 while((vs_count
!= 0) && (vs
!= NULL
)) {
		/* We take the count of AMO's before beginning the */
		/* transfer of the target segment. */
		/* We are guaranteed that the target segment cannot get */
		/* more users.  We also know that queue entries are */
		/* made at the back of the list.  If some of the entries */
		/* we would check disappear while we are traversing the */
		/* list then we will either check new entries which */
		/* do not have any backing store in the target segment */
		/* or re-check old entries.  This might not be optimal */
		/* but it will always be correct.  The alternative is to */
		/* take a snapshot of the list. */
720 if(dp_pages_free
< cluster_transfer_minimum
)
721 error
= KERN_FAILURE
;
723 vm_object_t transfer_object
;
727 transfer_object
= vm_object_allocate((vm_object_size_t
)VM_SUPER_CLUSTER
);
729 error
= vm_object_upl_request(transfer_object
,
730 (vm_object_offset_t
)0, VM_SUPER_CLUSTER
,
732 UPL_NO_SYNC
| UPL_CLEAN_IN_PLACE
| UPL_SET_LITE
| UPL_SET_INTERNAL
);
734 if(error
== KERN_SUCCESS
) {
735 error
= ps_vstruct_transfer_from_segment(
737 upl_commit(upl
, NULL
, 0);
740 error
= KERN_FAILURE
;
742 vm_object_deallocate(transfer_object
);
746 vs
->vs_async_pending
-= 1; /* release vs_async_wait */
747 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
748 vs
->vs_waiting_async
= FALSE
;
750 thread_wakeup(&vs
->vs_async_pending
);
759 while(backing_store_release_trigger_disable
!= 0) {
760 VSL_SLEEP(&backing_store_release_trigger_disable
,
764 next_vs
= (vstruct_t
) queue_next(&(vs
->vs_links
));
765 if((next_vs
!= (vstruct_t
)&vstruct_list
) &&
766 (vs
!= next_vs
) && (vs_count
!= 1)) {
768 vs_async_wait(next_vs
); /* wait for any */
769 /* pending async writes */
770 next_vs
->vs_async_pending
+= 1; /* hold parties */
771 /* calling vs_async_wait */
776 vs
->vs_async_pending
-= 1;
777 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
778 vs
->vs_waiting_async
= FALSE
;
780 thread_wakeup(&vs
->vs_async_pending
);
784 if((vs
== next_vs
) || (next_vs
== (vstruct_t
)&vstruct_list
))
795 default_pager_backing_store_delete(
796 MACH_PORT_FACE backing_store
)
802 int interim_pages_removed
= 0;
805 if ((bs
= backing_store_lookup(backing_store
)) == BACKING_STORE_NULL
)
806 return KERN_INVALID_ARGUMENT
;
809 /* not implemented */
816 error
= KERN_SUCCESS
;
817 for (i
= 0; i
<= paging_segment_max
; i
++) {
818 ps
= paging_segments
[i
];
819 if (ps
!= PAGING_SEGMENT_NULL
&&
821 ! ps
->ps_going_away
) {
823 /* disable access to this segment */
824 ps
->ps_going_away
= TRUE
;
827 * The "ps" segment is "off-line" now,
828 * we can try and delete it...
830 if(dp_pages_free
< (cluster_transfer_minimum
832 error
= KERN_FAILURE
;
836 /* remove all pages associated with the */
837 /* segment from the list of free pages */
838 /* when transfer is through, all target */
839 /* segment pages will appear to be free */
841 dp_pages_free
-= ps
->ps_pgcount
;
842 interim_pages_removed
+= ps
->ps_pgcount
;
844 error
= ps_delete(ps
);
846 if (error
!= KERN_SUCCESS
) {
848 * We couldn't delete the segment,
849 * probably because there's not enough
850 * virtual memory left.
851 * Re-enable all the segments.
860 if (error
!= KERN_SUCCESS
) {
861 for (i
= 0; i
<= paging_segment_max
; i
++) {
862 ps
= paging_segments
[i
];
863 if (ps
!= PAGING_SEGMENT_NULL
&&
867 /* re-enable access to this segment */
868 ps
->ps_going_away
= FALSE
;
872 dp_pages_free
+= interim_pages_removed
;
878 for (i
= 0; i
<= paging_segment_max
; i
++) {
879 ps
= paging_segments
[i
];
880 if (ps
!= PAGING_SEGMENT_NULL
&&
882 if(ps
->ps_going_away
) {
883 paging_segments
[i
] = PAGING_SEGMENT_NULL
;
884 paging_segment_count
--;
886 kfree(ps
->ps_bmap
, RMAPSIZE(ps
->ps_ncls
));
887 kfree(ps
, sizeof *ps
);
892 /* Scan the entire ps array separately to make certain we find the */
893 /* proper paging_segment_max */
894 for (i
= 0; i
< MAX_NUM_PAGING_SEGMENTS
; i
++) {
895 if(paging_segments
[i
] != PAGING_SEGMENT_NULL
)
896 paging_segment_max
= i
;
902 * All the segments have been deleted.
903 * We can remove the backing store.
907 * Disable lookups of this backing store.
909 if((void *)bs
->bs_port
->alias
!= NULL
)
910 kfree((void *) bs
->bs_port
->alias
,
911 sizeof (struct vstruct_alias
));
912 ipc_port_dealloc_kernel((ipc_port_t
) (bs
->bs_port
));
913 bs
->bs_port
= MACH_PORT_NULL
;
917 * Remove backing store from backing_store list.
920 queue_remove(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
925 * Free the backing store structure.
927 kfree(bs
, sizeof *bs
);
932 int ps_enter(paging_segment_t
); /* forward */
942 for (i
= 0; i
< MAX_NUM_PAGING_SEGMENTS
; i
++) {
943 if (paging_segments
[i
] == PAGING_SEGMENT_NULL
)
947 if (i
< MAX_NUM_PAGING_SEGMENTS
) {
948 paging_segments
[i
] = ps
;
949 if (i
> paging_segment_max
)
950 paging_segment_max
= i
;
951 paging_segment_count
++;
952 if ((ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_NOPRI
) ||
953 (ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_FULLPRI
))
954 ps_select_array
[ps
->ps_bs
->bs_priority
] = 0;
958 return KERN_RESOURCE_SHORTAGE
;
967 default_pager_add_segment(
968 MACH_PORT_FACE backing_store
,
969 MACH_PORT_FACE device
,
979 if ((bs
= backing_store_lookup(backing_store
))
980 == BACKING_STORE_NULL
)
981 return KERN_INVALID_ARGUMENT
;
984 for (i
= 0; i
<= paging_segment_max
; i
++) {
985 ps
= paging_segments
[i
];
986 if (ps
== PAGING_SEGMENT_NULL
)
990 * Check for overlap on same device.
992 if (!(ps
->ps_device
!= device
993 || offset
>= ps
->ps_offset
+ ps
->ps_recnum
994 || offset
+ count
<= ps
->ps_offset
)) {
997 return KERN_INVALID_ARGUMENT
;
1003 * Set up the paging segment
1005 ps
= (paging_segment_t
) kalloc(sizeof (struct paging_segment
));
1006 if (ps
== PAGING_SEGMENT_NULL
) {
1008 return KERN_RESOURCE_SHORTAGE
;
1011 ps
->ps_segtype
= PS_PARTITION
;
1012 ps
->ps_device
= device
;
1013 ps
->ps_offset
= offset
;
1014 ps
->ps_record_shift
= local_log2(vm_page_size
/ record_size
);
1015 ps
->ps_recnum
= count
;
1016 ps
->ps_pgnum
= count
>> ps
->ps_record_shift
;
1018 ps
->ps_pgcount
= ps
->ps_pgnum
;
1019 ps
->ps_clshift
= local_log2(bs
->bs_clsize
);
1020 ps
->ps_clcount
= ps
->ps_ncls
= ps
->ps_pgcount
>> ps
->ps_clshift
;
1024 ps
->ps_bmap
= (unsigned char *) kalloc(RMAPSIZE(ps
->ps_ncls
));
1026 kfree(ps
, sizeof *ps
);
1028 return KERN_RESOURCE_SHORTAGE
;
1030 for (i
= 0; i
< ps
->ps_ncls
; i
++) {
1031 clrbit(ps
->ps_bmap
, i
);
1034 ps
->ps_going_away
= FALSE
;
1037 if ((error
= ps_enter(ps
)) != 0) {
1038 kfree(ps
->ps_bmap
, RMAPSIZE(ps
->ps_ncls
));
1039 kfree(ps
, sizeof *ps
);
1041 return KERN_RESOURCE_SHORTAGE
;
1044 bs
->bs_pages_free
+= ps
->ps_clcount
<< ps
->ps_clshift
;
1045 bs
->bs_pages_total
+= ps
->ps_clcount
<< ps
->ps_clshift
;
1049 dp_pages_free
+= ps
->ps_pgcount
;
1052 bs_more_space(ps
->ps_clcount
);
1054 DP_DEBUG(DEBUG_BS_INTERNAL
,
1055 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1056 device
, offset
, count
, record_size
,
1057 ps
->ps_record_shift
, ps
->ps_pgnum
));
1059 return KERN_SUCCESS
;
1065 MACH_PORT_FACE master
)
1067 security_token_t null_security_token
= {
1070 MACH_PORT_FACE device
;
1071 int info
[DEV_GET_SIZE_COUNT
];
1072 mach_msg_type_number_t info_count
;
1073 MACH_PORT_FACE bs
= MACH_PORT_NULL
;
1074 unsigned int rec_size
;
1077 MACH_PORT_FACE reply_port
;
1079 if (ds_device_open_sync(master
, MACH_PORT_NULL
, D_READ
| D_WRITE
,
1080 null_security_token
, dev_name
, &device
))
1083 info_count
= DEV_GET_SIZE_COUNT
;
1084 if (!ds_device_get_status(device
, DEV_GET_SIZE
, info
, &info_count
)) {
1085 rec_size
= info
[DEV_GET_SIZE_RECORD_SIZE
];
1086 count
= info
[DEV_GET_SIZE_DEVICE_SIZE
] / rec_size
;
1087 clsize
= bs_get_global_clsize(0);
1088 if (!default_pager_backing_store_create(
1089 default_pager_object
,
1090 DEFAULT_PAGER_BACKING_STORE_MAXPRI
,
1091 (clsize
* vm_page_size
),
1093 if (!default_pager_add_segment(bs
, device
,
1094 0, count
, rec_size
)) {
1097 ipc_port_release_receive(bs
);
1101 ipc_port_release_send(device
);
1104 #endif /* DEVICE_PAGING */
1109 vs_alloc_async(void)
1111 struct vs_async
*vsa
;
1112 MACH_PORT_FACE reply_port
;
1113 // kern_return_t kr;
1116 if (vs_async_free_list
== NULL
) {
1118 vsa
= (struct vs_async
*) kalloc(sizeof (struct vs_async
));
1121 * Try allocating a reply port named after the
1122 * address of the vs_async structure.
1124 struct vstruct_alias
*alias_struct
;
1126 reply_port
= ipc_port_alloc_kernel();
1127 alias_struct
= (struct vstruct_alias
*)
1128 kalloc(sizeof (struct vstruct_alias
));
1129 if(alias_struct
!= NULL
) {
1130 alias_struct
->vs
= (struct vstruct
*)vsa
;
1131 alias_struct
->name
= &default_pager_ops
;
1132 reply_port
->alias
= (int) alias_struct
;
1133 vsa
->reply_port
= reply_port
;
1134 vs_alloc_async_count
++;
1137 vs_alloc_async_failed
++;
1138 ipc_port_dealloc_kernel((MACH_PORT_FACE
)
1140 kfree(vsa
, sizeof (struct vs_async
));
1145 vsa
= vs_async_free_list
;
1146 vs_async_free_list
= vs_async_free_list
->vsa_next
;
1155 struct vs_async
*vsa
)
1158 vsa
->vsa_next
= vs_async_free_list
;
1159 vs_async_free_list
= vsa
;
1163 #else /* VS_ASYNC_REUSE */
1166 vs_alloc_async(void)
1168 struct vs_async
*vsa
;
1169 MACH_PORT_FACE reply_port
;
1172 vsa
= (struct vs_async
*) kalloc(sizeof (struct vs_async
));
1175 * Try allocating a reply port named after the
1176 * address of the vs_async structure.
1178 reply_port
= ipc_port_alloc_kernel();
1179 alias_struct
= (vstruct_alias
*)
1180 kalloc(sizeof (struct vstruct_alias
));
1181 if(alias_struct
!= NULL
) {
1182 alias_struct
->vs
= reply_port
;
1183 alias_struct
->name
= &default_pager_ops
;
1184 reply_port
->alias
= (int) vsa
;
1185 vsa
->reply_port
= reply_port
;
1186 vs_alloc_async_count
++;
1189 vs_alloc_async_failed
++;
1190 ipc_port_dealloc_kernel((MACH_PORT_FACE
)
1192 kfree(vsa
, sizeof (struct vs_async
));
1202 struct vs_async
*vsa
)
1204 MACH_PORT_FACE reply_port
;
1207 reply_port
= vsa
->reply_port
;
1208 kfree(reply_port
->alias
, sizeof (struct vstuct_alias
));
1209 kfree(vsa
, sizeof (struct vs_async
));
1210 ipc_port_dealloc_kernel((MACH_PORT_FACE
) (reply_port
));
1213 vs_alloc_async_count
--;
1218 #endif /* VS_ASYNC_REUSE */
1220 zone_t vstruct_zone
;
1229 vs
= (vstruct_t
) zalloc(vstruct_zone
);
1230 if (vs
== VSTRUCT_NULL
) {
1231 return VSTRUCT_NULL
;
1237 * The following fields will be provided later.
1239 vs
->vs_pager_ops
= NULL
;
1240 vs
->vs_control
= MEMORY_OBJECT_CONTROL_NULL
;
1241 vs
->vs_references
= 1;
1245 vs
->vs_waiting_seqno
= FALSE
;
1246 vs
->vs_waiting_read
= FALSE
;
1247 vs
->vs_waiting_write
= FALSE
;
1248 vs
->vs_waiting_async
= FALSE
;
1250 mutex_init(&vs
->vs_waiting_seqno
, 0);
1251 mutex_init(&vs
->vs_waiting_read
, 0);
1252 mutex_init(&vs
->vs_waiting_write
, 0);
1253 mutex_init(&vs
->vs_waiting_refs
, 0);
1254 mutex_init(&vs
->vs_waiting_async
, 0);
1262 vs
->vs_clshift
= local_log2(bs_get_global_clsize(0));
1263 vs
->vs_size
= ((atop_32(round_page_32(size
)) - 1) >> vs
->vs_clshift
) + 1;
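	/*
	 * Worked example (illustrative, assuming 4 KB pages): a 1 MB object
	 * is atop_32(round_page_32(size)) == 256 pages; with vs_clshift == 2
	 * (4 pages per cluster) vs_size becomes ((256 - 1) >> 2) + 1 == 64
	 * clusters.
	 */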
1264 vs
->vs_async_pending
= 0;
1267 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1268 * depending on the size of the memory object.
1270 if (INDIRECT_CLMAP(vs
->vs_size
)) {
1271 vs
->vs_imap
= (struct vs_map
**)
1272 kalloc(INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1273 vs
->vs_indirect
= TRUE
;
1275 vs
->vs_dmap
= (struct vs_map
*)
1276 kalloc(CLMAP_SIZE(vs
->vs_size
));
1277 vs
->vs_indirect
= FALSE
;
1279 vs
->vs_xfer_pending
= FALSE
;
1280 DP_DEBUG(DEBUG_VS_INTERNAL
,
1281 ("map=0x%x, indirect=%d\n", (int) vs
->vs_dmap
, vs
->vs_indirect
));
1284 * Check to see that we got the space.
1287 kfree(vs
, sizeof *vs
);
1288 return VSTRUCT_NULL
;
1292 * Zero the indirect pointers, or clear the direct pointers.
1294 if (vs
->vs_indirect
)
1295 memset(vs
->vs_imap
, 0,
1296 INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1298 for (i
= 0; i
< vs
->vs_size
; i
++)
1299 VSM_CLR(vs
->vs_dmap
[i
]);
1301 VS_MAP_LOCK_INIT(vs
);
1303 bs_commit(vs
->vs_size
);
1308 paging_segment_t
ps_select_segment(unsigned int, int *); /* forward */
1315 paging_segment_t ps
;
1320 * Optimize case where there's only one segment.
1321 * paging_segment_max will index the one and only segment.
1325 if (paging_segment_count
== 1) {
1326 paging_segment_t lps
; /* used to avoid extra PS_UNLOCK */
1327 ipc_port_t trigger
= IP_NULL
;
1329 ps
= paging_segments
[paging_segment_max
];
1330 *psindex
= paging_segment_max
;
1332 if (ps
->ps_going_away
) {
1333 /* this segment is being turned off */
1334 lps
= PAGING_SEGMENT_NULL
;
1336 ASSERT(ps
->ps_clshift
>= shift
);
1337 if (ps
->ps_clcount
) {
1339 dp_pages_free
-= 1 << ps
->ps_clshift
;
1340 if(min_pages_trigger_port
&&
1341 (dp_pages_free
< minimum_pages_remaining
)) {
1342 trigger
= min_pages_trigger_port
;
1343 min_pages_trigger_port
= NULL
;
1348 lps
= PAGING_SEGMENT_NULL
;
1353 if (trigger
!= IP_NULL
) {
1354 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1355 ipc_port_release_send(trigger
);
1360 if (paging_segment_count
== 0) {
1362 return PAGING_SEGMENT_NULL
;
1366 i
>= BS_MINPRI
; i
--) {
1369 if ((ps_select_array
[i
] == BS_NOPRI
) ||
1370 (ps_select_array
[i
] == BS_FULLPRI
))
1372 start_index
= ps_select_array
[i
];
1374 if(!(paging_segments
[start_index
])) {
1376 physical_transfer_cluster_count
= 0;
1378 else if ((physical_transfer_cluster_count
+1) == (ALLOC_STRIDE
>>
1379 (((paging_segments
[start_index
])->ps_clshift
)
1380 + vm_page_shift
))) {
1381 physical_transfer_cluster_count
= 0;
1382 j
= start_index
+ 1;
1384 physical_transfer_cluster_count
+=1;
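			/*
			 * Worked example (illustrative, assuming 4 KB pages,
			 * i.e. vm_page_shift == 12): with ps_clshift == 2 the
			 * stride check above fires after ALLOC_STRIDE >> (2 + 12)
			 * == (1 GB >> 14) == 65536 clusters have been handed out
			 * from one segment, at which point allocation moves on
			 * to the next segment at the same priority.
			 */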
1386 if(start_index
== 0)
1387 start_index
= paging_segment_max
;
1389 start_index
= start_index
- 1;
1393 if (j
> paging_segment_max
)
1395 if ((ps
= paging_segments
[j
]) &&
1396 (ps
->ps_bs
->bs_priority
== i
)) {
1398 * Force the ps cluster size to be
1399 * >= that of the vstruct.
1402 if (ps
->ps_going_away
) {
1403 /* this segment is being turned off */
1404 } else if ((ps
->ps_clcount
) &&
1405 (ps
->ps_clshift
>= shift
)) {
1406 ipc_port_t trigger
= IP_NULL
;
1409 dp_pages_free
-= 1 << ps
->ps_clshift
;
1410 if(min_pages_trigger_port
&&
1412 minimum_pages_remaining
)) {
1413 trigger
= min_pages_trigger_port
;
1414 min_pages_trigger_port
= NULL
;
1418 * found one, quit looking.
1420 ps_select_array
[i
] = j
;
1423 if (trigger
!= IP_NULL
) {
1424 default_pager_space_alert(
1427 ipc_port_release_send(trigger
);
1434 if (j
== start_index
) {
1436 * none at this priority -- mark it full
1438 ps_select_array
[i
] = BS_FULLPRI
;
1445 return PAGING_SEGMENT_NULL
;
1448 vm_offset_t
ps_allocate_cluster(vstruct_t
, int *, paging_segment_t
); /*forward*/
1451 ps_allocate_cluster(
1454 paging_segment_t use_ps
)
1456 unsigned int byte_num
;
1458 paging_segment_t ps
;
1459 vm_offset_t cluster
;
1460 ipc_port_t trigger
= IP_NULL
;
1463 * Find best paging segment.
1464 * ps_select_segment will decrement cluster count on ps.
1465 * Must pass cluster shift to find the most appropriate segment.
1467 /* NOTE: The addition of paging segment delete capability threatened
1468 * to seriously complicate the treatment of paging segments in this
1469 * module and the ones that call it (notably ps_clmap), because of the
1470 * difficulty in assuring that the paging segment would continue to
1471 * exist between being unlocked and locked. This was
1472 * avoided because all calls to this module are based in either
1473 * dp_memory_object calls which rely on the vs lock, or by
1474 * the transfer function which is part of the segment delete path.
1475 * The transfer function which is part of paging segment delete is
1476 * protected from multiple callers by the backing store lock.
1477 * The paging segment delete function treats mappings to a paging
1478 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1479 * while data is transferred to the remaining segments. This is in
1480 * line with the view that incomplete or in-transition mappings between
1481 * data, a vstruct, and backing store are protected by the vs lock.
1482 * This and the ordering of the paging segment "going_away" bit setting
1485 if (use_ps
!= PAGING_SEGMENT_NULL
) {
1490 ASSERT(ps
->ps_clcount
!= 0);
1493 dp_pages_free
-= 1 << ps
->ps_clshift
;
1494 if(min_pages_trigger_port
&&
1495 (dp_pages_free
< minimum_pages_remaining
)) {
1496 trigger
= min_pages_trigger_port
;
1497 min_pages_trigger_port
= NULL
;
1501 if (trigger
!= IP_NULL
) {
1502 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1503 ipc_port_release_send(trigger
);
1506 } else if ((ps
= ps_select_segment(vs
->vs_clshift
, psindex
)) ==
1507 PAGING_SEGMENT_NULL
) {
1508 static uint32_t lastnotify
= 0;
1509 uint32_t now
, nanoseconds_dummy
;
1512 * Emit a notification of the low-paging resource condition
1513 * but don't issue it more than once every five seconds. This
1514 * prevents us from overflowing logs with thousands of
1515 * repetitions of the message.
1517 clock_get_system_nanotime(&now
, &nanoseconds_dummy
);
1518 if (now
> lastnotify
+ 5) {
1519 dprintf(("no space in available paging segments\n"));
1523 /* the count got off maybe, reset to zero */
1526 if(min_pages_trigger_port
) {
1527 trigger
= min_pages_trigger_port
;
1528 min_pages_trigger_port
= NULL
;
1532 if (trigger
!= IP_NULL
) {
1533 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1534 ipc_port_release_send(trigger
);
1536 return (vm_offset_t
) -1;
1540 * Look for an available cluster. At the end of the loop,
1541 * byte_num is the byte offset and bit_num is the bit offset of the
1542 * first zero bit in the paging segment bitmap.
1545 byte_num
= ps
->ps_hint
;
1546 for (; byte_num
< howmany(ps
->ps_ncls
, NBBY
); byte_num
++) {
1547 if (*(ps
->ps_bmap
+ byte_num
) != BYTEMASK
) {
1548 for (bit_num
= 0; bit_num
< NBBY
; bit_num
++) {
1549 if (isclr((ps
->ps_bmap
+ byte_num
), bit_num
))
1552 ASSERT(bit_num
!= NBBY
);
1556 ps
->ps_hint
= byte_num
;
1557 cluster
= (byte_num
*NBBY
) + bit_num
;
1559 /* Space was reserved, so this must be true */
1560 ASSERT(cluster
< ps
->ps_ncls
);
1562 setbit(ps
->ps_bmap
, cluster
);
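	/*
	 * Worked example (illustrative, not from the original source): with
	 * NBBY == 8, if ps_bmap[3] is the first byte that is not BYTEMASK and
	 * its lowest clear bit is bit 5, the scan above ends with
	 * byte_num == 3 and bit_num == 5, so cluster == 3*8 + 5 == 29, and
	 * setbit() marks cluster 29 as allocated.
	 */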
1568 void ps_deallocate_cluster(paging_segment_t
, vm_offset_t
); /* forward */
1571 ps_deallocate_cluster(
1572 paging_segment_t ps
,
1573 vm_offset_t cluster
)
1576 if (cluster
>= (vm_offset_t
) ps
->ps_ncls
)
1577 panic("ps_deallocate_cluster: Invalid cluster number");
1580 * Lock the paging segment, clear the cluster's bitmap and increment the
1581 * number of free cluster.
1585 clrbit(ps
->ps_bmap
, cluster
);
1587 dp_pages_free
+= 1 << ps
->ps_clshift
;
1591 * Move the hint down to the freed cluster if it is
1592 * less than the current hint.
1594 if ((cluster
/NBBY
) < ps
->ps_hint
) {
1595 ps
->ps_hint
= (cluster
/NBBY
);
1601 * If we're freeing space on a full priority, reset the array.
1604 if (ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_FULLPRI
)
1605 ps_select_array
[ps
->ps_bs
->bs_priority
] = 0;
1611 void ps_dealloc_vsmap(struct vs_map
*, vm_size_t
); /* forward */
1615 struct vs_map
*vsmap
,
1619 for (i
= 0; i
< size
; i
++)
1620 if (!VSM_ISCLR(vsmap
[i
]) && !VSM_ISERR(vsmap
[i
]))
1621 ps_deallocate_cluster(VSM_PS(vsmap
[i
]),
1622 VSM_CLOFF(vsmap
[i
]));
1635 * If this is an indirect structure, then we walk through the valid
1636 * (non-zero) indirect pointers and deallocate the clusters
1637 * associated with each used map entry (via ps_dealloc_vsmap).
1638 * When all of the clusters in an indirect block have been
1639 * freed, we deallocate the block. When all of the indirect
1640 * blocks have been deallocated we deallocate the memory
1641 * holding the indirect pointers.
1643 if (vs
->vs_indirect
) {
1644 for (i
= 0; i
< INDIRECT_CLMAP_ENTRIES(vs
->vs_size
); i
++) {
1645 if (vs
->vs_imap
[i
] != NULL
) {
1646 ps_dealloc_vsmap(vs
->vs_imap
[i
], CLMAP_ENTRIES
);
1647 kfree(vs
->vs_imap
[i
], CLMAP_THRESHOLD
);
1650 kfree(vs
->vs_imap
, INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1653 * Direct map. Free used clusters, then memory.
1655 ps_dealloc_vsmap(vs
->vs_dmap
, vs
->vs_size
);
1656 kfree(vs
->vs_dmap
, CLMAP_SIZE(vs
->vs_size
));
1660 bs_commit(- vs
->vs_size
);
1662 zfree(vstruct_zone
, vs
);
1665 int ps_map_extend(vstruct_t
, unsigned int); /* forward */
1669 unsigned int new_size
)
1671 struct vs_map
**new_imap
;
1672 struct vs_map
*new_dmap
= NULL
;
1675 void *old_map
= NULL
;
1676 int old_map_size
= 0;
1678 if (vs
->vs_size
>= new_size
) {
1680 * Someone has already done the work.
1686 * If the new size extends into the indirect range, then we have one
1687 * of two cases: we are going from indirect to indirect, or we are
1688 * going from direct to indirect. If we are going from indirect to
1689 * indirect, then it is possible that the new size will fit in the old
1690 * indirect map. If this is the case, then just reset the size of the
1691 * vstruct map and we are done. If the new size will not
1692 * fit into the old indirect map, then we have to allocate a new
1693 * indirect map and copy the old map pointers into this new map.
1695 * If we are going from direct to indirect, then we have to allocate a
1696 * new indirect map and copy the old direct pages into the first
1697 * indirect page of the new map.
1698 * NOTE: allocating memory here is dangerous, as we're in the
1701 if (INDIRECT_CLMAP(new_size
)) {
1702 int new_map_size
= INDIRECT_CLMAP_SIZE(new_size
);
1705 * Get a new indirect map and zero it.
1707 old_map_size
= INDIRECT_CLMAP_SIZE(vs
->vs_size
);
1708 if (vs
->vs_indirect
&&
1709 (new_map_size
== old_map_size
)) {
1710 bs_commit(new_size
- vs
->vs_size
);
1711 vs
->vs_size
= new_size
;
1715 new_imap
= (struct vs_map
**)kalloc(new_map_size
);
1716 if (new_imap
== NULL
) {
1719 memset(new_imap
, 0, new_map_size
);
1721 if (vs
->vs_indirect
) {
1722 /* Copy old entries into new map */
1723 memcpy(new_imap
, vs
->vs_imap
, old_map_size
);
1724 /* Arrange to free the old map */
1725 old_map
= (void *) vs
->vs_imap
;
1727 } else { /* Old map was a direct map */
1728 /* Allocate an indirect page */
1729 if ((new_imap
[0] = (struct vs_map
*)
1730 kalloc(CLMAP_THRESHOLD
)) == NULL
) {
1731 kfree(new_imap
, new_map_size
);
1734 new_dmap
= new_imap
[0];
1735 newdsize
= CLMAP_ENTRIES
;
1739 newdsize
= new_size
;
1741 * If the new map is a direct map, then the old map must
1742 * also have been a direct map. All we have to do is
1743 * to allocate a new direct map, copy the old entries
1744 * into it and free the old map.
1746 if ((new_dmap
= (struct vs_map
*)
1747 kalloc(CLMAP_SIZE(new_size
))) == NULL
) {
1753 /* Free the old map */
1754 old_map
= (void *) vs
->vs_dmap
;
1755 old_map_size
= CLMAP_SIZE(vs
->vs_size
);
1757 /* Copy info from the old map into the new map */
1758 memcpy(new_dmap
, vs
->vs_dmap
, old_map_size
);
1760 /* Initialize the rest of the new map */
1761 for (i
= vs
->vs_size
; i
< newdsize
; i
++)
1762 VSM_CLR(new_dmap
[i
]);
1765 vs
->vs_imap
= new_imap
;
1766 vs
->vs_indirect
= TRUE
;
1768 vs
->vs_dmap
= new_dmap
;
1769 bs_commit(new_size
- vs
->vs_size
);
1770 vs
->vs_size
= new_size
;
1772 kfree(old_map
, old_map_size
);
1780 struct clmap
*clmap
,
1785 vm_offset_t cluster
; /* The cluster of offset. */
1786 vm_offset_t newcl
; /* The new cluster allocated. */
1789 struct vs_map
*vsmap
;
1793 ASSERT(vs
->vs_dmap
);
1794 cluster
= atop_32(offset
) >> vs
->vs_clshift
;
1797 * Initialize cluster error value
1799 clmap
->cl_error
= 0;
1802 * If the object has grown, extend the page map.
1804 if (cluster
>= vs
->vs_size
) {
1805 if (flag
== CL_FIND
) {
1806 /* Do not allocate if just doing a lookup */
1808 return (vm_offset_t
) -1;
1810 if (ps_map_extend(vs
, cluster
+ 1)) {
1812 return (vm_offset_t
) -1;
1817 * Look for the desired cluster. If the map is indirect, then we
1818 * have a two level lookup. First find the indirect block, then
1819 * find the actual cluster. If the indirect block has not yet
1820 * been allocated, then do so. If the cluster has not yet been
1821 * allocated, then do so.
1823 * If any of the allocations fail, then return an error.
1824 * Don't allocate if just doing a lookup.
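	/*
	 * Illustrative note (not in the original source): for an indirect
	 * vstruct the vs_map entry for cluster c lives at
	 * vs->vs_imap[c / CLMAP_ENTRIES][c % CLMAP_ENTRIES]; for a direct
	 * vstruct it is simply vs->vs_dmap[c].  The code below performs that
	 * two-level (or one-level) lookup, allocating a missing indirect
	 * block along the way only when flag is not CL_FIND.
	 */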
1826 if (vs
->vs_indirect
) {
1827 long ind_block
= cluster
/CLMAP_ENTRIES
;
1829 /* Is the indirect block allocated? */
1830 vsmap
= vs
->vs_imap
[ind_block
];
1831 if (vsmap
== NULL
) {
1832 if (flag
== CL_FIND
) {
1834 return (vm_offset_t
) -1;
1837 /* Allocate the indirect block */
1838 vsmap
= (struct vs_map
*) kalloc(CLMAP_THRESHOLD
);
1839 if (vsmap
== NULL
) {
1841 return (vm_offset_t
) -1;
1843 /* Initialize the cluster offsets */
1844 for (i
= 0; i
< CLMAP_ENTRIES
; i
++)
1846 vs
->vs_imap
[ind_block
] = vsmap
;
1849 vsmap
= vs
->vs_dmap
;
1852 vsmap
+= cluster%CLMAP_ENTRIES
;
1855 * At this point, vsmap points to the struct vs_map desired.
1857 * Look in the map for the cluster, if there was an error on a
1858 * previous write, flag it and return. If it is not yet
1859 * allocated, then allocate it, if we're writing; if we're
1860 * doing a lookup and the cluster's not allocated, return error.
1862 if (VSM_ISERR(*vsmap
)) {
1863 clmap
->cl_error
= VSM_GETERR(*vsmap
);
1865 return (vm_offset_t
) -1;
1866 } else if (VSM_ISCLR(*vsmap
)) {
1869 if (flag
== CL_FIND
) {
1871 * If there's an error and the entry is clear, then
1872 * we've run out of swap space. Record the error
1876 VSM_SETERR(*vsmap
, error
);
1879 return (vm_offset_t
) -1;
1882 * Attempt to allocate a cluster from the paging segment
1884 newcl
= ps_allocate_cluster(vs
, &psindex
,
1885 PAGING_SEGMENT_NULL
);
1886 if (newcl
== (vm_offset_t
) -1) {
1888 return (vm_offset_t
) -1;
1891 VSM_SETCLOFF(*vsmap
, newcl
);
1892 VSM_SETPS(*vsmap
, psindex
);
1895 newcl
= VSM_CLOFF(*vsmap
);
1898 * Fill in pertinent fields of the clmap
1900 clmap
->cl_ps
= VSM_PS(*vsmap
);
1901 clmap
->cl_numpages
= VSCLSIZE(vs
);
1902 clmap
->cl_bmap
.clb_map
= (unsigned int) VSM_BMAP(*vsmap
);
1905 * Byte offset in paging segment is byte offset to cluster plus
1906 * byte offset within cluster. It looks ugly, but should be
1909 ASSERT(trunc_page(offset
) == offset
);
1910 newcl
= ptoa_32(newcl
) << vs
->vs_clshift
;
1911 newoff
= offset
& ((1<<(vm_page_shift
+ vs
->vs_clshift
)) - 1);
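	/*
	 * Worked example (illustrative, assuming 4 KB pages, so
	 * vm_page_shift == 12): with vs_clshift == 2 a cluster covers
	 * 1 << (12 + 2) == 16 KB.  For offset == 0x5000 within the object,
	 * newoff == 0x5000 & 0x3FFF == 0x1000, i.e. page 1 of its cluster;
	 * newcl (already converted to a byte offset above) plus newoff is
	 * the byte address returned to the caller.
	 */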
1912 if (flag
== CL_ALLOC
) {
1914 * set bits in the allocation bitmap according to which
1915 * pages were requested. size is in bytes.
1917 i
= atop_32(newoff
);
1918 while ((size
> 0) && (i
< VSCLSIZE(vs
))) {
1919 VSM_SETALLOC(*vsmap
, i
);
1921 size
-= vm_page_size
;
1924 clmap
->cl_alloc
.clb_map
= (unsigned int) VSM_ALLOC(*vsmap
);
1927 * Offset is not cluster aligned, so number of pages
1928 * and bitmaps must be adjusted
1930 clmap
->cl_numpages
-= atop_32(newoff
);
1931 CLMAP_SHIFT(clmap
, vs
);
1932 CLMAP_SHIFTALLOC(clmap
, vs
);
1937 * The setting of valid bits and handling of write errors
1938 * must be done here, while we hold the lock on the map.
1939 * It logically should be done in ps_vs_write_complete().
1940 * The size and error information has been passed from
1941 * ps_vs_write_complete(). If the size parameter is non-zero,
1942 * then there is work to be done. If error is also non-zero,
1943 * then the error number is recorded in the cluster and the
1944 * entire cluster is in error.
1946 if (size
&& flag
== CL_FIND
) {
1947 vm_offset_t off
= (vm_offset_t
) 0;
1950 for (i
= VSCLSIZE(vs
) - clmap
->cl_numpages
; size
> 0;
1952 VSM_SETPG(*vsmap
, i
);
1953 size
-= vm_page_size
;
1955 ASSERT(i
<= VSCLSIZE(vs
));
1957 BS_STAT(clmap
->cl_ps
->ps_bs
,
1958 clmap
->cl_ps
->ps_bs
->bs_pages_out_fail
+=
1960 off
= VSM_CLOFF(*vsmap
);
1961 VSM_SETERR(*vsmap
, error
);
1964 * Deallocate cluster if error, and no valid pages
1967 if (off
!= (vm_offset_t
) 0)
1968 ps_deallocate_cluster(clmap
->cl_ps
, off
);
1970 return (vm_offset_t
) 0;
1974 DP_DEBUG(DEBUG_VS_INTERNAL
,
1975 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1976 newcl
+newoff
, (int) vs
, (int) vsmap
, flag
));
1977 DP_DEBUG(DEBUG_VS_INTERNAL
,
1978 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1979 (int) clmap
->cl_ps
, clmap
->cl_numpages
,
1980 (int) clmap
->cl_bmap
.clb_map
, (int) clmap
->cl_alloc
.clb_map
));
1982 return (newcl
+ newoff
);
1985 void ps_clunmap(vstruct_t
, vm_offset_t
, vm_size_t
); /* forward */
1993 vm_offset_t cluster
; /* The cluster number of offset */
1994 struct vs_map
*vsmap
;
1999 * Loop through all clusters in this range, freeing paging segment
2000 * clusters and map entries as encountered.
2002 while (length
> 0) {
2006 cluster
= atop_32(offset
) >> vs
->vs_clshift
;
2007 if (vs
->vs_indirect
) /* indirect map */
2008 vsmap
= vs
->vs_imap
[cluster
/CLMAP_ENTRIES
];
2010 vsmap
= vs
->vs_dmap
;
2011 if (vsmap
== NULL
) {
2015 vsmap
+= cluster%CLMAP_ENTRIES
;
2016 if (VSM_ISCLR(*vsmap
)) {
2017 length
-= vm_page_size
;
2018 offset
+= vm_page_size
;
		 * We've got a valid mapping.  Clear it and deallocate
		 * paging segment cluster pages.
		 * Optimize for entire cluster clearing.
2026 if ( (newoff
= (offset
&((1<<(vm_page_shift
+vs
->vs_clshift
))-1))) ) {
2028 * Not cluster aligned.
2030 ASSERT(trunc_page(newoff
) == newoff
);
2031 i
= atop_32(newoff
);
2034 while ((i
< VSCLSIZE(vs
)) && (length
> 0)) {
2035 VSM_CLRPG(*vsmap
, i
);
2036 VSM_CLRALLOC(*vsmap
, i
);
2037 length
-= vm_page_size
;
2038 offset
+= vm_page_size
;
2043 * If map entry is empty, clear and deallocate cluster.
2045 if (!VSM_ALLOC(*vsmap
)) {
2046 ps_deallocate_cluster(VSM_PS(*vsmap
),
2055 void ps_vs_write_complete(vstruct_t
, vm_offset_t
, vm_size_t
, int); /* forward */
2058 ps_vs_write_complete(
2067 * Get the struct vsmap for this cluster.
2068 * Use READ, even though it was written, because the
2069 * cluster MUST be present, unless there was an error
2070 * in the original ps_clmap (e.g. no space), in which
2071 * case, nothing happens.
2073 * Must pass enough information to ps_clmap to allow it
2074 * to set the vs_map structure bitmap under lock.
2076 (void) ps_clmap(vs
, offset
, &clmap
, CL_FIND
, size
, error
);
2079 void vs_cl_write_complete(vstruct_t
, paging_segment_t
, vm_offset_t
, vm_offset_t
, vm_size_t
, boolean_t
, int); /* forward */
2082 vs_cl_write_complete(
2084 __unused paging_segment_t ps
,
2086 __unused vm_offset_t addr
,
2091 // kern_return_t kr;
2095 * For internal objects, the error is recorded on a
2096 * per-cluster basis by ps_clmap() which is called
2097 * by ps_vs_write_complete() below.
2099 dprintf(("write failed error = 0x%x\n", error
));
2100 /* add upl_abort code here */
2102 GSTAT(global_stats
.gs_pages_out
+= atop_32(size
));
2104 * Notify the vstruct mapping code, so it can do its accounting.
2106 ps_vs_write_complete(vs
, offset
, size
, error
);
2110 ASSERT(vs
->vs_async_pending
> 0);
2111 vs
->vs_async_pending
-= size
;
2112 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
2113 vs
->vs_waiting_async
= FALSE
;
2115 /* mutex_unlock(&vs->vs_waiting_async); */
2116 thread_wakeup(&vs
->vs_async_pending
);
2123 #ifdef DEVICE_PAGING
2124 kern_return_t
device_write_reply(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2128 MACH_PORT_FACE reply_port
,
2129 kern_return_t device_code
,
2130 io_buf_len_t bytes_written
)
2132 struct vs_async
*vsa
;
2134 vsa
= (struct vs_async
*)
2135 ((struct vstruct_alias
*)(reply_port
->alias
))->vs
;
2137 if (device_code
== KERN_SUCCESS
&& bytes_written
!= vsa
->vsa_size
) {
2138 device_code
= KERN_FAILURE
;
2141 vsa
->vsa_error
= device_code
;
2144 ASSERT(vsa
->vsa_vs
!= VSTRUCT_NULL
);
2145 if(vsa
->vsa_flags
& VSA_TRANSFER
) {
2146 /* revisit when async disk segments redone */
2147 if(vsa
->vsa_error
) {
2148 /* need to consider error condition. re-write data or */
2149 /* throw it away here. */
2150 vm_map_copy_discard((vm_map_copy_t
)vsa
->vsa_addr
);
2152 ps_vs_write_complete(vsa
->vsa_vs
, vsa
->vsa_offset
,
2153 vsa
->vsa_size
, vsa
->vsa_error
);
2155 vs_cl_write_complete(vsa
->vsa_vs
, vsa
->vsa_ps
, vsa
->vsa_offset
,
2156 vsa
->vsa_addr
, vsa
->vsa_size
, TRUE
,
2161 return KERN_SUCCESS
;
2164 kern_return_t
device_write_reply_inband(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2166 device_write_reply_inband(
2167 MACH_PORT_FACE reply_port
,
2168 kern_return_t return_code
,
2169 io_buf_len_t bytes_written
)
2171 panic("device_write_reply_inband: illegal");
2172 return KERN_SUCCESS
;
2175 kern_return_t
device_read_reply(MACH_PORT_FACE
, kern_return_t
, io_buf_ptr_t
, mach_msg_type_number_t
);
2178 MACH_PORT_FACE reply_port
,
2179 kern_return_t return_code
,
2181 mach_msg_type_number_t dataCnt
)
2183 struct vs_async
*vsa
;
2184 vsa
= (struct vs_async
*)
2185 ((struct vstruct_alias
*)(reply_port
->alias
))->vs
;
2186 vsa
->vsa_addr
= (vm_offset_t
)data
;
2187 vsa
->vsa_size
= (vm_size_t
)dataCnt
;
2188 vsa
->vsa_error
= return_code
;
2189 thread_wakeup(&vsa
->vsa_lock
);
2190 return KERN_SUCCESS
;
2193 kern_return_t
device_read_reply_inband(MACH_PORT_FACE
, kern_return_t
, io_buf_ptr_inband_t
, mach_msg_type_number_t
);
2195 device_read_reply_inband(
2196 MACH_PORT_FACE reply_port
,
2197 kern_return_t return_code
,
2198 io_buf_ptr_inband_t data
,
2199 mach_msg_type_number_t dataCnt
)
2201 panic("device_read_reply_inband: illegal");
2202 return KERN_SUCCESS
;
2205 kern_return_t
device_read_reply_overwrite(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2207 device_read_reply_overwrite(
2208 MACH_PORT_FACE reply_port
,
2209 kern_return_t return_code
,
2210 io_buf_len_t bytes_read
)
2212 panic("device_read_reply_overwrite: illegal\n");
2213 return KERN_SUCCESS
;
2216 kern_return_t
device_open_reply(MACH_PORT_FACE
, kern_return_t
, MACH_PORT_FACE
);
2219 MACH_PORT_FACE reply_port
,
2220 kern_return_t return_code
,
2221 MACH_PORT_FACE device_port
)
2223 panic("device_open_reply: illegal\n");
2224 return KERN_SUCCESS
;
2229 paging_segment_t ps
,
2231 vm_offset_t
*bufferp
,
2233 unsigned int *residualp
,
2237 recnum_t dev_offset
;
2238 unsigned int bytes_wanted
;
2239 unsigned int bytes_read
;
2240 unsigned int total_read
;
2241 vm_offset_t dev_buffer
;
2242 vm_offset_t buf_ptr
;
2243 unsigned int records_read
;
2244 struct vs_async
*vsa
;
2245 mutex_t vs_waiting_read_reply
;
2248 vm_map_copy_t device_data
= NULL
;
2249 default_pager_thread_t
*dpt
= NULL
;
2251 device
= dev_port_lookup(ps
->ps_device
);
2252 clustered_reads
[atop_32(size
)]++;
2254 dev_offset
= (ps
->ps_offset
+
2255 (offset
>> (vm_page_shift
- ps
->ps_record_shift
)));
2256 bytes_wanted
= size
;
2258 *bufferp
= (vm_offset_t
)NULL
;
2261 vsa
= VS_ALLOC_ASYNC();
2265 vsa
->vsa_offset
= 0;
2269 mutex_init(&vsa
->vsa_lock
, 0);
2270 ip_lock(vsa
->reply_port
);
2271 vsa
->reply_port
->ip_sorights
++;
2272 ip_reference(vsa
->reply_port
);
2273 ip_unlock(vsa
->reply_port
);
2274 kr
= ds_device_read_common(device
,
2276 (mach_msg_type_name_t
)
2277 MACH_MSG_TYPE_MOVE_SEND_ONCE
,
2281 (IO_READ
| IO_CALL
),
2282 (io_buf_ptr_t
*) &dev_buffer
,
2283 (mach_msg_type_number_t
*) &bytes_read
);
2284 if(kr
== MIG_NO_REPLY
) {
2285 assert_wait(&vsa
->vsa_lock
, THREAD_UNINT
);
2286 thread_block(THREAD_CONTINUE_NULL
);
2288 dev_buffer
= vsa
->vsa_addr
;
2289 bytes_read
= (unsigned int)vsa
->vsa_size
;
2290 kr
= vsa
->vsa_error
;
2293 if (kr
!= KERN_SUCCESS
|| bytes_read
== 0) {
2296 total_read
+= bytes_read
;
2299 * If we got the entire range, use the returned dev_buffer.
2301 if (bytes_read
== size
) {
2302 *bufferp
= (vm_offset_t
)dev_buffer
;
2307 dprintf(("read only %d bytes out of %d\n",
2308 bytes_read
, bytes_wanted
));
2311 dpt
= get_read_buffer();
2312 buf_ptr
= dpt
->dpt_buffer
;
2313 *bufferp
= (vm_offset_t
)buf_ptr
;
2316 * Otherwise, copy the data into the provided buffer (*bufferp)
2317 * and append the rest of the range as it comes in.
2319 memcpy((void *) buf_ptr
, (void *) dev_buffer
, bytes_read
);
2320 buf_ptr
+= bytes_read
;
2321 bytes_wanted
-= bytes_read
;
2322 records_read
= (bytes_read
>>
2323 (vm_page_shift
- ps
->ps_record_shift
));
2324 dev_offset
+= records_read
;
2325 DP_DEBUG(DEBUG_VS_INTERNAL
,
2326 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2327 dev_buffer
, bytes_read
));
2328 if (vm_deallocate(kernel_map
, dev_buffer
, bytes_read
)
2330 Panic("dealloc buf");
2331 } while (bytes_wanted
);
2333 *residualp
= size
- total_read
;
2334 if((dev_buffer
!= *bufferp
) && (total_read
!= 0)) {
2335 vm_offset_t temp_buffer
;
2336 vm_allocate(kernel_map
, &temp_buffer
, total_read
, VM_FLAGS_ANYWHERE
);
2337 memcpy((void *) temp_buffer
, (void *) *bufferp
, total_read
);
2338 if(vm_map_copyin_page_list(kernel_map
, temp_buffer
, total_read
,
2339 VM_MAP_COPYIN_OPT_SRC_DESTROY
|
2340 VM_MAP_COPYIN_OPT_STEAL_PAGES
|
2341 VM_MAP_COPYIN_OPT_PMAP_ENTER
,
2342 (vm_map_copy_t
*)&device_data
, FALSE
))
2343 panic("ps_read_device: cannot copyin locally provided buffer\n");
2345 else if((kr
== KERN_SUCCESS
) && (total_read
!= 0) && (dev_buffer
!= 0)){
2346 if(vm_map_copyin_page_list(kernel_map
, dev_buffer
, bytes_read
,
2347 VM_MAP_COPYIN_OPT_SRC_DESTROY
|
2348 VM_MAP_COPYIN_OPT_STEAL_PAGES
|
2349 VM_MAP_COPYIN_OPT_PMAP_ENTER
,
2350 (vm_map_copy_t
*)&device_data
, FALSE
))
2351 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2356 *bufferp
= (vm_offset_t
)device_data
;
2359 /* Free the receive buffer */
2360 dpt
->checked_out
= 0;
2361 thread_wakeup(&dpt_array
);
2363 return KERN_SUCCESS
;
kern_return_t
ps_write_device(
	paging_segment_t	ps,
	recnum_t		offset,
	vm_offset_t		addr,
	unsigned int		size,
	struct vs_async		*vsa)
{
	recnum_t	dev_offset;
	io_buf_len_t	bytes_to_write, bytes_written;
	recnum_t	records_written;
	kern_return_t	kr;
	MACH_PORT_FACE	reply_port;
	device_t	device;

	clustered_writes[atop_32(size)]++;

	dev_offset = (ps->ps_offset +
		      (offset >> (vm_page_shift - ps->ps_record_shift)));
	bytes_to_write = size;

	if (vsa) {
		/*
		 * Asynchronous write.
		 */
		reply_port = vsa->reply_port;
		ip_lock(reply_port);
		reply_port->ip_sorights++;
		ip_reference(reply_port);
		ip_unlock(reply_port);

		device = dev_port_lookup(ps->ps_device);

		vsa->vsa_addr = addr;
		kr = ds_device_write_common(device,
				reply_port,
				(mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
				(dev_mode_t) 0,
				dev_offset,
				(io_buf_ptr_t) addr,
				size,
				(IO_WRITE | IO_CALL),
				&bytes_written);

		if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
			dprintf(("%s0x%x, addr=0x%x,"
				 "size=0x%x,offset=0x%x\n",
				 "device_write_request returned ",
				 kr, addr, size, offset));
			BS_STAT(ps->ps_bs,
				ps->ps_bs->bs_pages_out_fail += atop_32(size));
			/* do the completion notification to free resources */
			device_write_reply(reply_port, kr, 0);
			return PAGER_ERROR;
		}
	} else do {
		/*
		 * Synchronous write.
		 */
		device = dev_port_lookup(ps->ps_device);

		kr = ds_device_write_common(device,
				IP_NULL, 0,
				(dev_mode_t) 0,
				dev_offset,
				(io_buf_ptr_t) addr,
				size,
				(IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
				&bytes_written);

		if (kr != KERN_SUCCESS) {
			dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
				 "device_write returned ",
				 kr, addr, size, offset));
			BS_STAT(ps->ps_bs,
				ps->ps_bs->bs_pages_out_fail += atop_32(size));
			return PAGER_ERROR;
		}
		if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
			Panic("fragmented write");
		records_written = (bytes_written >>
				   (vm_page_shift - ps->ps_record_shift));
		dev_offset += records_written;

		if (bytes_written != bytes_to_write) {
			dprintf(("wrote only %d bytes out of %d\n",
				 bytes_written, bytes_to_write));
		}

		bytes_to_write -= bytes_written;
		addr += bytes_written;
	} while (bytes_to_write > 0);

	return PAGER_SUCCESS;
}
#else /* !DEVICE_PAGING */

kern_return_t
ps_read_device(
	__unused paging_segment_t	ps,
	__unused vm_offset_t		offset,
	__unused vm_offset_t		*bufferp,
	__unused unsigned int		size,
	__unused unsigned int		*residualp,
	__unused int			flags)
{
	panic("ps_read_device not supported");
	return KERN_FAILURE;
}

kern_return_t
ps_write_device(
	__unused paging_segment_t	ps,
	__unused vm_offset_t		offset,
	__unused vm_offset_t		addr,
	__unused unsigned int		size,
	__unused struct vs_async	*vsa)
{
	panic("ps_write_device not supported");
	return KERN_FAILURE;
}

#endif /* DEVICE_PAGING */
void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t);	/* forward */

void
pvs_object_data_provided(
	__unused vstruct_t	vs,
	upl_t			upl,
	__unused upl_offset_t	offset,
	upl_size_t		size)
{
	DP_DEBUG(DEBUG_VS_INTERNAL,
		 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
		  upl, offset, size));

	GSTAT(global_stats.gs_pages_in += atop_32(size));

#if USE_PRECIOUS
	ps_clunmap(vs, offset, size);
#endif /* USE_PRECIOUS */
}
static memory_object_offset_t	last_start;
static vm_size_t		last_length;
kern_return_t
pvs_cluster_read(
	vstruct_t	vs,
	vm_offset_t	vs_offset,
	vm_size_t	cnt,
	void		*fault_info)
{
	kern_return_t		error = KERN_SUCCESS;
	unsigned int		size;
	unsigned int		residual;
	unsigned int		request_flags;
	int			seg_index;
	int			pages_in_cl;
	int			cl_index;
	int			cl_size;
	int			cl_mask;
	unsigned int		xfer_size;
	vm_offset_t		orig_vs_offset;
	vm_offset_t		ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
	paging_segment_t	psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
	struct clmap		clmap;
	upl_t			upl;
	unsigned int		page_list_count;
	memory_object_offset_t	start;

	pages_in_cl = 1 << vs->vs_clshift;
	cl_size = pages_in_cl * vm_page_size;
	cl_mask = cl_size - 1;

#if USE_PRECIOUS
	request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
#else
	request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
#endif
	cl_index = (vs_offset & cl_mask) / vm_page_size;

	if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (vm_offset_t)-1) ||
	    !CLMAP_ISSET(clmap, cl_index)) {
		/*
		 * the needed page doesn't exist in the backing store...
		 * we don't want to try to do any I/O, just abort the
		 * page and let the fault handler provide a zero-fill
		 */
		if (cnt == 0) {
			/*
			 * The caller was just poking at us to see if
			 * the page has been paged out.  No need to
			 * mess with the page at all.
			 * Just let the caller know we don't have that page.
			 */
			return KERN_FAILURE;
		}

		page_list_count = 0;

		memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
						PAGE_SIZE, PAGE_SIZE,
						&upl, NULL, &page_list_count,
						request_flags);

		if (clmap.cl_error)
			upl_abort(upl, UPL_ABORT_ERROR);
		else
			upl_abort(upl, UPL_ABORT_UNAVAILABLE);
		upl_deallocate(upl);

		return KERN_SUCCESS;
	}

	if (cnt == 0) {
		/*
		 * The caller was just poking at us to see if
		 * the page has been paged out.  No need to
		 * mess with the page at all.
		 * Just let the caller know we do have that page.
		 */
		return KERN_SUCCESS;
	}

	assert(dp_encryption_inited);
	if (dp_encryption) {
		/*
		 * ENCRYPTED SWAP:
		 * request that the UPL be prepared for
		 * decryption.
		 */
		request_flags |= UPL_ENCRYPT;
	}
	orig_vs_offset = vs_offset;

	start = (memory_object_offset_t)vs_offset;
	cnt = VM_SUPER_CLUSTER;

	/*
	 * determine how big a speculative I/O we should try for...
	 */
	if (memory_object_cluster_size(vs->vs_control, &start, &cnt, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
		assert(vs_offset >= (vm_offset_t) start &&
		       vs_offset < (vm_offset_t) (start + cnt));
		vs_offset = (vm_offset_t)start;
	} else
		cnt = PAGE_SIZE;

	/*
	 * This loop will be executed multiple times until the entire
	 * range has been looked at or we issue an I/O... if the request spans cluster
	 * boundaries, the clusters will be checked for logical continunity,
	 * if contiguous the I/O request will span multiple clusters...
	 * at most only 1 I/O will be issued... it will encompass the original offset
	 */
	while (cnt && error == KERN_SUCCESS) {
		int	ps_info_valid;

		if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
			size = VM_SUPER_CLUSTER;
			size -= vs_offset & cl_mask;
		} else if (cnt > VM_SUPER_CLUSTER)
			size = VM_SUPER_CLUSTER;
		else
			size = cnt;

		cnt -= size;

		ps_info_valid = 0;
		seg_index     = 0;

		while (size > 0 && error == KERN_SUCCESS) {
			unsigned int	abort_size;
			unsigned int	lsize;
			unsigned int	fill;
			int		failed_size;
			int		beg_pseg;
			int		beg_indx;
			vm_offset_t	cur_offset;

			if ( !ps_info_valid) {
				ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
				psp[seg_index]       = CLMAP_PS(clmap);
				ps_info_valid = 1;
			}
			/*
			 * skip over unallocated physical segments
			 */
			if (ps_offset[seg_index] == (vm_offset_t) -1) {
				abort_size = cl_size - (vs_offset & cl_mask);
				abort_size = MIN(abort_size, size);

				size      -= abort_size;
				vs_offset += abort_size;

				seg_index++;
				ps_info_valid = 0;

				continue;
			}
			cl_index = (vs_offset & cl_mask) / vm_page_size;

			for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
				/*
				 * skip over unallocated pages
				 */
				if (CLMAP_ISSET(clmap, cl_index))
					break;
				abort_size += vm_page_size;
			}
			if (abort_size) {
				size      -= abort_size;
				vs_offset += abort_size;

				if (cl_index == pages_in_cl) {
					/*
					 * if we're at the end of this physical cluster
					 * then bump to the next one and continue looking
					 */
					seg_index++;
					ps_info_valid = 0;

					continue;
				}
				if (size == 0)
					break;
			}
			/*
			 * remember the starting point of the first allocated page
			 * for the I/O we're about to issue
			 */
			beg_pseg   = seg_index;
			beg_indx   = cl_index;
			cur_offset = vs_offset;

			/*
			 * calculate the size of the I/O that we can do...
			 * this may span multiple physical segments if
			 * they are contiguous
			 */
			for (xfer_size = 0; xfer_size < size; ) {

				while (cl_index < pages_in_cl && xfer_size < size) {
					/*
					 * accumulate allocated pages within
					 * a physical segment
					 */
					if (CLMAP_ISSET(clmap, cl_index)) {
						xfer_size  += vm_page_size;
						cur_offset += vm_page_size;
						cl_index++;

						BS_STAT(psp[seg_index]->ps_bs,
							psp[seg_index]->ps_bs->bs_pages_in++);
					} else
						break;
				}
				if (cl_index < pages_in_cl || xfer_size >= size) {
					/*
					 * we've hit an unallocated page or
					 * the end of this request... see if
					 * it's time to fire the I/O
					 */
					break;
				}
				/*
				 * we've hit the end of the current physical
				 * segment and there's more to do, so try
				 * moving to the next one
				 */
				seg_index++;

				ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
				psp[seg_index]       = CLMAP_PS(clmap);
				ps_info_valid = 1;

				if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
					/*
					 * if the physical segment we're about
					 * to step into is not contiguous to
					 * the one we're currently in, or it's
					 * in a different paging file, or
					 * it hasn't been allocated....
					 * we stop this run and go check
					 * to see if it's time to fire the I/O
					 */
					break;
				}
				/*
				 * start with first page of the next physical
				 * segment
				 */
				cl_index = 0;
			}
			if (xfer_size == 0) {
				/*
				 * no I/O to generate for this segment
				 */
				break;
			}
			if (cur_offset <= orig_vs_offset) {
				/*
				 * we've hit a hole in our speculative cluster
				 * before the offset that we're really after...
				 * don't issue the I/O since it doesn't encompass
				 * the original offset and we're looking to only
				 * pull in the speculative pages if they can be
				 * made part of a single I/O
				 */
				size      -= xfer_size;
				vs_offset += xfer_size;

				continue;
			}
			/*
			 * we have a contiguous range of allocated pages
			 * to read from that encompasses the original offset
			 */
			page_list_count = 0;
			memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
							xfer_size, xfer_size,
							&upl, NULL, &page_list_count,
							request_flags | UPL_SET_INTERNAL | UPL_NOBLOCK);

			error = ps_read_file(psp[beg_pseg],
					     upl, (upl_offset_t) 0,
					     ps_offset[beg_pseg] + (beg_indx * vm_page_size),
					     xfer_size, &residual, 0);

			failed_size = 0;

			/*
			 * Adjust counts and send response to VM. Optimize
			 * for the common case, i.e. no error and/or partial
			 * data. If there was an error, then we need to error
			 * the entire range, even if some data was successfully
			 * read. If there was a partial read we may supply some
			 * data and may error some as well. In all cases the
			 * VM must receive some notification for every page
			 * in the range.
			 */
			if ((error == KERN_SUCCESS) && (residual == 0)) {
				/*
				 * Got everything we asked for, supply the data
				 * to the VM. Note that as a side effect of
				 * supplying the data, the buffer holding the
				 * supplied data is deallocated from the pager's
				 * address space.
				 */
				pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
			} else {
				failed_size = xfer_size;

				if (error == KERN_SUCCESS) {
					if (residual == xfer_size) {
						/*
						 * If a read operation returns no error
						 * and no data moved, we turn it into
						 * an error, assuming we're reading at
						 * or beyond the end of the device.
						 *
						 * Fall through and error the entire range.
						 */
						error = KERN_FAILURE;
					} else {
						/*
						 * Otherwise, we have partial read. If
						 * the part read is a integral number
						 * of pages supply it. Otherwise round
						 * it up to a page boundary, zero fill
						 * the unread part, and supply it.
						 * Fall through and error the remainder
						 * of the range, if any.
						 */
						fill = residual & ~vm_page_size;
						lsize = (xfer_size - residual) + fill;

						pvs_object_data_provided(vs, upl, vs_offset, lsize);

						if (lsize < xfer_size) {
							failed_size = xfer_size - lsize;
							error = KERN_FAILURE;
						}
					}
				}
			}
			if (error != KERN_SUCCESS) {
				/*
				 * There was an error in some part of the range, tell
				 * the VM. Note that error is explicitly checked again
				 * since it can be modified above.
				 */
				BS_STAT(psp[beg_pseg]->ps_bs,
					psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
			}
			/*
			 * we've issued a single I/O that encompassed the original offset
			 * at this point we either met our speculative request length or
			 * we ran into a 'hole' (i.e. page not present in the cluster, cluster
			 * not present or not physically contiguous to the previous one), so
			 * we're done issuing I/O at this point
			 */
			return error;
		}
	}
	return error;
}
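/*
 * Illustrative sketch, not part of the original pager: pvs_cluster_read()
 * above derives its cluster geometry from the vstruct's cluster shift.
 * The helper below restates that arithmetic for a single offset; the
 * name is hypothetical and nothing in the pager calls it.
 */
static inline unsigned int
example_page_index_in_cluster(vstruct_t vs, vm_offset_t offset)
{
	unsigned int	pages_in_cl = 1 << vs->vs_clshift;	  /* pages per cluster */
	unsigned int	cl_size     = pages_in_cl * vm_page_size; /* bytes per cluster */
	unsigned int	cl_mask     = cl_size - 1;

	/* which page within its cluster this offset falls on */
	return (offset & cl_mask) / vm_page_size;
}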
int	vs_do_async_write = 1;

kern_return_t
vs_cluster_write(
	vstruct_t	vs,
	upl_t		internal_upl,
	upl_offset_t	offset,
	upl_size_t	cnt,
	boolean_t	dp_internal,
	int		flags)
{
	upl_size_t	transfer_size;
	int		error = 0;
	struct clmap	clmap;

	vm_offset_t	actual_offset;	/* Offset within paging segment */
	paging_segment_t ps;
	vm_offset_t	mobj_base_addr;
	vm_offset_t	mobj_target_addr;

	upl_t		upl;
	upl_page_info_t	*pl;
	int		page_index;
	int		list_size;
	int		pages_in_cl;
	unsigned int	cl_size;
	int		base_index;
	unsigned int	seg_size;

	pages_in_cl = 1 << vs->vs_clshift;
	cl_size = pages_in_cl * vm_page_size;

	if (!dp_internal) {
		unsigned int	page_list_count;
		int		request_flags;
		unsigned int	super_size;
		int		first_dirty;
		int		num_dirty;
		int		num_of_pages;
		int		seg_index;
		upl_offset_t	upl_offset;
		vm_offset_t	seg_offset;
		vm_offset_t	ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
		paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];

		if (bs_low) {
			super_size = cl_size;

			request_flags = UPL_NOBLOCK |
				UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
				UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
		} else {
			super_size = VM_SUPER_CLUSTER;

			request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
				UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
				UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
		}

		if (!dp_encryption_inited) {
			/*
			 * ENCRYPTED SWAP:
			 * Once we've started using swap, we
			 * can't change our mind on whether
			 * it needs to be encrypted or
			 * not.
			 */
			dp_encryption_inited = TRUE;
		}
		if (dp_encryption) {
			/*
			 * ENCRYPTED SWAP:
			 * request that the UPL be prepared for
			 * encryption.
			 */
			request_flags |= UPL_ENCRYPT;
			flags |= UPL_PAGING_ENCRYPTED;
		}

		page_list_count = 0;
		memory_object_super_upl_request(vs->vs_control,
				(memory_object_offset_t)offset,
				cnt, super_size,
				&upl, NULL, &page_list_count,
				request_flags | UPL_FOR_PAGEOUT);

		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

		seg_size = cl_size - (upl->offset % cl_size);
		upl_offset = upl->offset & ~(cl_size - 1);

		for (seg_index = 0, transfer_size = upl->size;
					transfer_size > 0; ) {
			ps_offset[seg_index] =
				ps_clmap(vs,
					 upl_offset,
					 &clmap, CL_ALLOC,
					 cl_size, 0);

			if (ps_offset[seg_index] == (vm_offset_t) -1) {
				upl_abort(upl, 0);
				upl_deallocate(upl);

				return KERN_FAILURE;
			}
			psp[seg_index] = CLMAP_PS(clmap);

			if (transfer_size > seg_size) {
				transfer_size -= seg_size;
				upl_offset += cl_size;
				seg_size    = cl_size;
				seg_index++;
			} else
				transfer_size = 0;
		}
		/*
		 * Ignore any non-present pages at the end of the
		 * UPL.
		 */
		for (page_index = upl->size / vm_page_size; page_index > 0;)
			if (UPL_PAGE_PRESENT(pl, --page_index))
				break;
		num_of_pages = page_index + 1;

		base_index = (upl->offset % cl_size) / PAGE_SIZE;

		for (page_index = 0; page_index < num_of_pages; ) {
			/*
			 * skip over non-dirty pages
			 */
			for ( ; page_index < num_of_pages; page_index++) {
				if (UPL_DIRTY_PAGE(pl, page_index)
				    || UPL_PRECIOUS_PAGE(pl, page_index))
					/*
					 * this is a page we need to write
					 * go see if we can buddy it up with
					 * others that are contiguous to it
					 */
					break;
				/*
				 * if the page is not-dirty, but present we
				 * need to commit it... This is an unusual
				 * case since we only asked for dirty pages
				 */
				if (UPL_PAGE_PRESENT(pl, page_index)) {
					boolean_t empty = FALSE;
					upl_commit_range(upl,
							 page_index * vm_page_size,
							 vm_page_size,
							 UPL_COMMIT_NOTIFY_EMPTY,
							 pl,
							 page_list_count,
							 &empty);
					if (empty) {
						assert(page_index ==
						       num_of_pages - 1);
						upl_deallocate(upl);
					}
				}
			}
			if (page_index == num_of_pages)
				/*
				 * no more pages to look at, we're out of here
				 */
				break;

			/*
			 * gather up contiguous dirty pages... we have at
			 * least 1 * otherwise we would have bailed above
			 * make sure that each physical segment that we step
			 * into is contiguous to the one we're currently in
			 * if it's not, we have to stop and write what we have
			 */
			for (first_dirty = page_index;
			     page_index < num_of_pages; ) {
				if ( !UPL_DIRTY_PAGE(pl, page_index)
				    && !UPL_PRECIOUS_PAGE(pl, page_index))
					break;
				page_index++;
				/*
				 * if we just looked at the last page in the UPL
				 * we don't need to check for physical segment
				 * continuity
				 */
				if (page_index < num_of_pages) {
					int	cur_seg;
					int	nxt_seg;

					cur_seg = (base_index + (page_index - 1))/pages_in_cl;
					nxt_seg = (base_index + page_index)/pages_in_cl;

					if (cur_seg != nxt_seg) {
						if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
							/*
							 * if the segment we're about
							 * to step into is not
							 * contiguous to the one we're
							 * currently in, or it's in a
							 * different paging file....
							 * we stop here and generate
							 * the I/O
							 */
							break;
					}
				}
			}
			num_dirty = page_index - first_dirty;

			if (num_dirty) {
				upl_offset = first_dirty * vm_page_size;
				transfer_size = num_dirty * vm_page_size;

				while (transfer_size) {

					if ((seg_size = cl_size -
					     ((upl->offset + upl_offset) % cl_size))
					    > transfer_size)
						seg_size = transfer_size;

					ps_vs_write_complete(vs,
							     upl->offset + upl_offset,
							     seg_size, error);

					transfer_size -= seg_size;
					upl_offset += seg_size;
				}
				upl_offset = first_dirty * vm_page_size;
				transfer_size = num_dirty * vm_page_size;

				seg_index  = (base_index + first_dirty) / pages_in_cl;
				seg_offset = (upl->offset + upl_offset) % cl_size;

				error = ps_write_file(psp[seg_index],
						      upl, upl_offset,
						      ps_offset[seg_index]
								+ seg_offset,
						      transfer_size, flags);
			} else {
				boolean_t empty = FALSE;
				upl_abort_range(upl,
						first_dirty * vm_page_size,
						num_dirty * vm_page_size,
						UPL_ABORT_NOTIFY_EMPTY,
						&empty);
				if (empty) {
					assert(page_index == num_of_pages);
					upl_deallocate(upl);
				}
			}
		}
	} else {
		assert(cnt <= (vm_page_size << vs->vs_clshift));
		list_size = cnt;

		/* The caller provides a mapped_data which is derived  */
		/* from a temporary object. The targeted pages are     */
		/* guaranteed to be set at offset 0 in the mapped_data */
		/* The actual offset however must still be derived     */
		/* from the offset in the vs in question               */
		mobj_base_addr = offset;
		mobj_target_addr = mobj_base_addr;

		for (transfer_size = list_size; transfer_size != 0;) {
			actual_offset = ps_clmap(vs, mobj_target_addr,
						 &clmap, CL_ALLOC,
						 transfer_size < cl_size ?
						 transfer_size : cl_size, 0);
			if(actual_offset == (vm_offset_t) -1) {
				error = 1;
				break;
			}
			cnt = MIN(transfer_size,
				  CLMAP_NPGS(clmap) * vm_page_size);
			ps = CLMAP_PS(clmap);
			/* Assume that the caller has given us contiguous */
			/* pages */
			if(cnt) {
				ps_vs_write_complete(vs, mobj_target_addr,
						     cnt, error);
				error = ps_write_file(ps, internal_upl,
						      0, actual_offset,
						      cnt, flags);
				if (error)
					break;
			}
			actual_offset += cnt;
			mobj_target_addr += cnt;
			transfer_size -= cnt;
		}
	}
	if (error)
		return KERN_FAILURE;
	else
		return KERN_SUCCESS;
}
vm_size_t
ps_vstruct_allocated_size(
	vstruct_t	vs)
{
	unsigned int	num_pages;
	struct vs_map	*vsmap;
	unsigned int	i, j, k;

	num_pages = 0;
	if (vs->vs_indirect) {
		/* loop on indirect maps */
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			if (vsmap == NULL)
				continue;
			/* loop on clusters in this indirect map */
			for (j = 0; j < CLMAP_ENTRIES; j++) {
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j]))
					continue;
				/* loop on pages in this cluster */
				for (k = 0; k < VSCLSIZE(vs); k++) {
					if ((VSM_BMAP(vsmap[j])) & (1 << k))
						num_pages++;
				}
			}
		}
	} else {
		vsmap = vs->vs_dmap;
		if (vsmap == NULL)
			return 0;
		/* loop on clusters in the direct map */
		for (j = 0; j < CLMAP_ENTRIES; j++) {
			if (VSM_ISCLR(vsmap[j]) ||
			    VSM_ISERR(vsmap[j]))
				continue;
			/* loop on pages in this cluster */
			for (k = 0; k < VSCLSIZE(vs); k++) {
				if ((VSM_BMAP(vsmap[j])) & (1 << k))
					num_pages++;
			}
		}
	}

	return ptoa_32(num_pages);
}
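/*
 * Illustrative sketch, not part of the original pager:
 * ps_vstruct_allocated_size() above counts pages by testing each bit of a
 * cluster's allocation bitmap.  The helper below counts the pages of a
 * single vs_map entry the same way; the name is hypothetical and nothing
 * in the pager calls it.
 */
static inline unsigned int
example_pages_in_cluster_entry(vstruct_t vs, struct vs_map *vsm)
{
	unsigned int	k, count = 0;

	for (k = 0; k < VSCLSIZE(vs); k++) {
		if (VSM_BMAP(*vsm) & (1 << k))	/* page k of this cluster is backed */
			count++;
	}
	return count;
}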
unsigned int
ps_vstruct_allocated_pages(
	vstruct_t		vs,
	default_pager_page_t	*pages,
	unsigned int		pages_size)
{
	unsigned int	num_pages;
	struct vs_map	*vsmap;
	vm_offset_t	offset;
	unsigned int	i, j, k;

	num_pages = 0;
	offset = 0;
	if (vs->vs_indirect) {
		/* loop on indirect maps */
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			if (vsmap == NULL) {
				offset += (vm_page_size * CLMAP_ENTRIES *
					   VSCLSIZE(vs));
				continue;
			}
			/* loop on clusters in this indirect map */
			for (j = 0; j < CLMAP_ENTRIES; j++) {
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j])) {
					offset += vm_page_size * VSCLSIZE(vs);
					continue;
				}
				/* loop on pages in this cluster */
				for (k = 0; k < VSCLSIZE(vs); k++) {
					if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
						num_pages++;
						if (num_pages < pages_size)
							pages++->dpp_offset =
								offset;
					}
					offset += vm_page_size;
				}
			}
		}
	} else {
		vsmap = vs->vs_dmap;
		if (vsmap == NULL)
			return 0;
		/* loop on clusters in the direct map */
		for (j = 0; j < CLMAP_ENTRIES; j++) {
			if (VSM_ISCLR(vsmap[j]) ||
			    VSM_ISERR(vsmap[j])) {
				offset += vm_page_size * VSCLSIZE(vs);
				continue;
			}
			/* loop on pages in this cluster */
			for (k = 0; k < VSCLSIZE(vs); k++) {
				if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
					num_pages++;
					if (num_pages < pages_size)
						pages++->dpp_offset = offset;
				}
				offset += vm_page_size;
			}
		}
	}

	return num_pages;
}
kern_return_t
ps_vstruct_transfer_from_segment(
	vstruct_t	 vs,
	paging_segment_t segment,
	upl_t		 upl)
{
	struct vs_map	*vsmap;
//	struct vs_map	old_vsmap;
//	struct vs_map	new_vsmap;
	unsigned int	i, j;

	VS_LOCK(vs);	/* block all work on this vstruct */
			/* can't allow the normal multiple write */
			/* semantic because writes may conflict */
	vs->vs_xfer_pending = TRUE;
	vs_wait_for_sync_writers(vs);
	vs_start_write(vs);
	vs_wait_for_readers(vs);
	/* we will unlock the vs to allow other writes while transferring */
	/* and will be guaranteed of the persistance of the vs struct */
	/* because the caller of ps_vstruct_transfer_from_segment bumped */
	/* vs_async_pending */
	/* OK we now have guaranteed no other parties are accessing this */
	/* vs.  Now that we are also supporting simple lock versions of */
	/* vs_lock we cannot hold onto VS_LOCK as we may block below.   */
	/* our purpose in holding it before was the multiple write case */
	/* we now use the boolean xfer_pending to do that.  We can use */
	/* a boolean instead of a count because we have guaranteed single */
	/* file access to this code in its caller */
	VS_UNLOCK(vs);
vs_changed:
	if (vs->vs_indirect) {
		unsigned int	vsmap_size;
		int		clmap_off;

		/* loop on indirect maps */
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			if (vsmap == NULL)
				continue;
			/* loop on clusters in this indirect map */
			clmap_off = (vm_page_size * CLMAP_ENTRIES *
				     VSCLSIZE(vs) * i);
			if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
				vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
			else
				vsmap_size = CLMAP_ENTRIES;
			for (j = 0; j < vsmap_size; j++) {
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j]) ||
				    (VSM_PS(vsmap[j]) != segment))
					continue;
				if(vs_cluster_transfer(vs,
					(vm_page_size * (j << vs->vs_clshift))
						+ clmap_off,
					vm_page_size << vs->vs_clshift,
					upl) != KERN_SUCCESS) {
					VS_LOCK(vs);
					vs->vs_xfer_pending = FALSE;
					VS_UNLOCK(vs);
					vs_finish_write(vs);
					return KERN_FAILURE;
				}
				/* allow other readers/writers during transfer*/
				VS_LOCK(vs);
				vs->vs_xfer_pending = FALSE;
				VS_UNLOCK(vs);
				vs_finish_write(vs);
				VS_LOCK(vs);
				vs->vs_xfer_pending = TRUE;
				vs_wait_for_sync_writers(vs);
				vs_start_write(vs);
				vs_wait_for_readers(vs);
				VS_UNLOCK(vs);
				if (!(vs->vs_indirect)) {
					goto vs_changed;
				}
			}
		}
	} else {
		vsmap = vs->vs_dmap;
		if (vsmap == NULL) {
			VS_LOCK(vs);
			vs->vs_xfer_pending = FALSE;
			VS_UNLOCK(vs);
			vs_finish_write(vs);
			return KERN_SUCCESS;
		}
		/* loop on clusters in the direct map */
		for (j = 0; j < vs->vs_size; j++) {
			if (VSM_ISCLR(vsmap[j]) ||
			    VSM_ISERR(vsmap[j]) ||
			    (VSM_PS(vsmap[j]) != segment))
				continue;
			if(vs_cluster_transfer(vs,
				vm_page_size * (j << vs->vs_clshift),
				vm_page_size << vs->vs_clshift,
				upl) != KERN_SUCCESS) {
				VS_LOCK(vs);
				vs->vs_xfer_pending = FALSE;
				VS_UNLOCK(vs);
				vs_finish_write(vs);
				return KERN_FAILURE;
			}
			/* allow other readers/writers during transfer*/
			VS_LOCK(vs);
			vs->vs_xfer_pending = FALSE;
			VS_UNLOCK(vs);
			vs_finish_write(vs);
			VS_LOCK(vs);
			vs->vs_xfer_pending = TRUE;
			vs_wait_for_sync_writers(vs);
			vs_start_write(vs);
			vs_wait_for_readers(vs);
			VS_UNLOCK(vs);
			if (vs->vs_indirect) {
				goto vs_changed;
			}
		}
	}

	VS_LOCK(vs);
	vs->vs_xfer_pending = FALSE;
	VS_UNLOCK(vs);
	vs_finish_write(vs);
	return KERN_SUCCESS;
}
vs_map_t
vs_get_map_entry(
	vstruct_t	vs,
	vm_offset_t	offset)
{
	struct vs_map	*vsmap;
	vm_offset_t	cluster;

	cluster = atop_32(offset) >> vs->vs_clshift;
	if (vs->vs_indirect) {
		long	ind_block = cluster/CLMAP_ENTRIES;

		/* Is the indirect block allocated? */
		vsmap = vs->vs_imap[ind_block];
		if(vsmap == (vs_map_t) NULL)
			return vsmap;
	} else
		vsmap = vs->vs_dmap;
	vsmap += cluster%CLMAP_ENTRIES;
	return vsmap;
}
kern_return_t
vs_cluster_transfer(
	vstruct_t	vs,
	upl_offset_t	offset,
	upl_size_t	cnt,
	upl_t		upl)
{
	vm_offset_t		actual_offset;
	paging_segment_t	ps;
	struct clmap		clmap;
	kern_return_t		error = KERN_SUCCESS;
	unsigned int		size, size_wanted;
	int			i;
	unsigned int		residual = 0;
	unsigned int		unavail_size;
//	default_pager_thread_t	*dpt;
//	boolean_t		dealloc;
	struct vs_map		*vsmap_ptr = NULL;
	struct vs_map		read_vsmap;
	struct vs_map		original_read_vsmap;
	struct vs_map		write_vsmap;
//	vm_offset_t		ioaddr;

	/* vs_cluster_transfer reads in the pages of a cluster and
	 * then writes these pages back to new backing store.  The
	 * segment the pages are being read from is assumed to have
	 * been taken off-line and is no longer considered for new
	 * space requests.
	 */

	/*
	 * This loop will be executed once per cluster referenced.
	 * Typically this means once, since it's unlikely that the
	 * VM system will ask for anything spanning cluster boundaries.
	 *
	 * If there are holes in a cluster (in a paging segment), we stop
	 * reading at the hole, then loop again, hoping to
	 * find valid pages later in the cluster.  This continues until
	 * the entire range has been examined, and read, if present.  The
	 * pages are written as they are read.  If a failure occurs after
	 * some pages are written the unmap call at the bottom of the loop
	 * recovers the backing store and the old backing store remains
	 * in effect.
	 */

	VSM_CLR(write_vsmap);
	VSM_CLR(original_read_vsmap);
	/* grab the actual object's pages to sync with I/O */
	while (cnt && (error == KERN_SUCCESS)) {
		vsmap_ptr = vs_get_map_entry(vs, offset);
		actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);

		if (actual_offset == (vm_offset_t) -1) {
			/*
			 * Nothing left to write in this cluster at least
			 * set write cluster information for any previous
			 * write, clear for next cluster, if there is one
			 */
			unsigned int	local_size, clmask, clsize;

			clsize = vm_page_size << vs->vs_clshift;
			clmask = clsize - 1;
			local_size = clsize - (offset & clmask);
			ASSERT(local_size);
			local_size = MIN(local_size, cnt);

			/* This cluster has no data in it beyond what may */
			/* have been found on a previous iteration through */
			/* the loop "write_vsmap" */
			*vsmap_ptr = write_vsmap;
			VSM_CLR(write_vsmap);
			VSM_CLR(original_read_vsmap);

			cnt -= local_size;
			offset += local_size;
			continue;
		}

		/*
		 * Count up contiguous available or unavailable
		 * pages.
		 */
		ps = CLMAP_PS(clmap);
		ASSERT(ps);
		size = 0;
		unavail_size = 0;
		for (i = 0;
		     (size < cnt) && (unavail_size < cnt) &&
		     (i < CLMAP_NPGS(clmap)); i++) {
			if (CLMAP_ISSET(clmap, i)) {
				if (unavail_size != 0)
					break;
				size += vm_page_size;
				BS_STAT(ps->ps_bs,
					ps->ps_bs->bs_pages_in++);
			} else {
				if (size != 0)
					break;
				unavail_size += vm_page_size;
			}
		}

		if (size == 0) {
			ASSERT(unavail_size);
			cnt -= unavail_size;
			offset += unavail_size;
			if((offset & ((vm_page_size << vs->vs_clshift) - 1))
			   == 0) {
				/* There is no more to transfer in this
					cluster
				*/
				*vsmap_ptr = write_vsmap;
				VSM_CLR(write_vsmap);
				VSM_CLR(original_read_vsmap);
			}
			continue;
		}

		if(VSM_ISCLR(original_read_vsmap))
			original_read_vsmap = *vsmap_ptr;

		if(ps->ps_segtype == PS_PARTITION) {
			panic("swap partition not supported\n");
			/*NOTREACHED*/
			error = KERN_FAILURE;
/*
			NEED TO ISSUE WITH SYNC & NO COMMIT
			error = ps_read_device(ps, actual_offset, &buffer,
					       size, &residual, flags);
*/
		} else {
			/* NEED TO ISSUE WITH SYNC & NO COMMIT */
			error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
					     size, &residual,
					     (UPL_IOSYNC | UPL_NOCOMMIT));
		}

		read_vsmap = *vsmap_ptr;

		/*
		 * Adjust counts and put data in new BS.  Optimize for the
		 * common case, i.e. no error and/or partial data.
		 * If there was an error, then we need to error the entire
		 * range, even if some data was successfully read.
		 */
		if ((error == KERN_SUCCESS) && (residual == 0)) {
			/*
			 * Got everything we asked for, supply the data to
			 * the new BS.  Note that as a side effect of supplying
			 * the data, the buffer holding the supplied data is
			 * deallocated from the pager's address space unless
			 * the write is unsuccessful.
			 */

			/* note buffer will be cleaned up in all cases by */
			/* internal_cluster_write or if an error on write */
			/* the vm_map_copy_page_discard call */
			*vsmap_ptr = write_vsmap;

			if(vs_cluster_write(vs, upl, offset,
					size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT) != KERN_SUCCESS) {
				error = KERN_FAILURE;
				if(!(VSM_ISCLR(*vsmap_ptr))) {
					/* unmap the new backing store object */
					ps_clunmap(vs, offset, size);
				}
				/* original vsmap */
				*vsmap_ptr = original_read_vsmap;
				VSM_CLR(write_vsmap);
			} else {
				if((offset + size) &
				   ((vm_page_size << vs->vs_clshift)
					- 1)) {
					/* There is more to transfer in this
						cluster
					*/
					write_vsmap = *vsmap_ptr;
					*vsmap_ptr = read_vsmap;
				} else {
					/* discard the old backing object */
					write_vsmap = *vsmap_ptr;
					*vsmap_ptr = read_vsmap;
					ps_clunmap(vs, offset, size);
					*vsmap_ptr = write_vsmap;
					VSM_CLR(write_vsmap);
					VSM_CLR(original_read_vsmap);
				}
			}
		} else {
			size_wanted = size;
			if (error == KERN_SUCCESS) {
				if (residual == size) {
					/*
					 * If a read operation returns no error
					 * and no data moved, we turn it into
					 * an error, assuming we're reading at
					 * or beyond the end of the device.
					 *
					 * Fall through and error the entire
					 * range.
					 */
					error = KERN_FAILURE;
					*vsmap_ptr = write_vsmap;
					if(!(VSM_ISCLR(*vsmap_ptr))) {
						/* unmap the new backing store object */
						ps_clunmap(vs, offset, size);
					}
					*vsmap_ptr = original_read_vsmap;
					VSM_CLR(write_vsmap);
				} else {
					/*
					 * Otherwise, we have partial read.
					 * This is also considered an error
					 * for the purposes of cluster transfer
					 */
					error = KERN_FAILURE;
					*vsmap_ptr = write_vsmap;
					if(!(VSM_ISCLR(*vsmap_ptr))) {
						/* unmap the new backing store object */
						ps_clunmap(vs, offset, size);
					}
					*vsmap_ptr = original_read_vsmap;
					VSM_CLR(write_vsmap);
				}
			}
		}
		cnt -= size;
		offset += size;

	} /* END while (cnt && (error == 0)) */
	if(!VSM_ISCLR(write_vsmap))
		*vsmap_ptr = write_vsmap;

	return error;
}
kern_return_t
default_pager_add_file(
	MACH_PORT_FACE	backing_store,
	vnode_ptr_t	vp,
	int		record_size,
	vm_size_t	size)
{
	backing_store_t		bs;
	paging_segment_t	ps;
	int			i;
	unsigned int		j;
	int			error;

	if ((bs = backing_store_lookup(backing_store))
	    == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;
		if (ps->ps_segtype != PS_FILE)
			continue;

		/*
		 * Check for overlap on same device.
		 */
		if (ps->ps_vnode == (struct vnode *)vp) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	/*
	 * Set up the paging segment
	 */
	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
	if (ps == PAGING_SEGMENT_NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	ps->ps_segtype = PS_FILE;
	ps->ps_vnode = (struct vnode *)vp;
	ps->ps_record_shift = local_log2(vm_page_size / record_size);
	ps->ps_recnum = size;
	ps->ps_pgnum = size >> ps->ps_record_shift;

	ps->ps_pgcount = ps->ps_pgnum;
	ps->ps_clshift = local_log2(bs->bs_clsize);
	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;

	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
	if (!ps->ps_bmap) {
		kfree(ps, sizeof *ps);
		return KERN_RESOURCE_SHORTAGE;
	}
	for (j = 0; j < ps->ps_ncls; j++) {
		clrbit(ps->ps_bmap, j);
	}

	ps->ps_going_away = FALSE;
	ps->ps_bs = bs;

	if ((error = ps_enter(ps)) != 0) {
		kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
		kfree(ps, sizeof *ps);
		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;

	dp_pages_free += ps->ps_pgcount;

	bs_more_space(ps->ps_clcount);

	DP_DEBUG(DEBUG_BS_INTERNAL,
		 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
		  device, offset, size, record_size,
		  ps->ps_record_shift, ps->ps_pgnum));

	return KERN_SUCCESS;
}
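/*
 * Illustrative sketch, not part of the original pager:
 * default_pager_add_file() above sizes a new paging segment from the
 * record size and record count it is handed.  The helper below restates
 * that arithmetic, assuming record_size evenly divides vm_page_size as
 * the function above does; the helper name is hypothetical and nothing
 * in the pager calls it.
 */
static inline unsigned int
example_segment_page_count(unsigned int record_size, unsigned int records)
{
	/* records per page expressed as a shift, exactly how ps_record_shift is set up */
	int	record_shift = local_log2(vm_page_size / record_size);

	return records >> record_shift;	/* whole pages the segment can hold (ps_pgnum) */
}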
kern_return_t
ps_read_file(
	paging_segment_t	ps,
	upl_t			upl,
	upl_offset_t		upl_offset,
	vm_offset_t		offset,
	upl_size_t		size,
	unsigned int		*residualp,
	int			flags)
{
	vm_object_offset_t	f_offset;
	int			error = 0;
	int			result;

	assert(dp_encryption_inited);

	clustered_reads[atop_32(size)]++;

	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

	/*
	 * for transfer case we need to pass uploffset and flags
	 */
	error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL);

	/* The vnode_pagein semantic is somewhat at odds with the existing   */
	/* device_read semantic.  Partial reads are not experienced at this  */
	/* level.  It is up to the bit map code and cluster read code to     */
	/* check that requested data locations are actually backed, and the  */
	/* pagein code to either read all of the requested data or return an */
	/* error. */

	if (error)
		result = KERN_FAILURE;
	else {
		*residualp = 0;
		result = KERN_SUCCESS;
	}
	return result;
}
kern_return_t
ps_write_file(
	paging_segment_t	ps,
	upl_t			upl,
	upl_offset_t		upl_offset,
	vm_offset_t		offset,
	unsigned int		size,
	int			flags)
{
	vm_object_offset_t	f_offset;
	kern_return_t		result;

	assert(dp_encryption_inited);

	clustered_writes[atop_32(size)]++;

	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

	if (flags & UPL_PAGING_ENCRYPTED) {
		/*
		 * ENCRYPTED SWAP:
		 * encrypt all the pages that we're going
		 * to pageout.
		 */
		upl_encrypt(upl, upl_offset, size);
	}
	if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
		result = KERN_FAILURE;
	else
		result = KERN_SUCCESS;

	return result;
}
kern_return_t
default_pager_triggers( __unused MACH_PORT_FACE default_pager,
	int		hi_wat,
	int		lo_wat,
	int		flags,
	MACH_PORT_FACE	trigger_port)
{
	MACH_PORT_FACE	release;
	kern_return_t	kr;

	if (flags == SWAP_ENCRYPT_ON) {
		/* ENCRYPTED SWAP: turn encryption on */
		release = trigger_port;
		if (!dp_encryption_inited) {
			dp_encryption_inited = TRUE;
			dp_encryption = TRUE;
			kr = KERN_SUCCESS;
		} else {
			kr = KERN_FAILURE;
		}
	} else if (flags == SWAP_ENCRYPT_OFF) {
		/* ENCRYPTED SWAP: turn encryption off */
		release = trigger_port;
		if (!dp_encryption_inited) {
			dp_encryption_inited = TRUE;
			dp_encryption = FALSE;
			kr = KERN_SUCCESS;
		} else {
			kr = KERN_FAILURE;
		}
	} else if (flags == HI_WAT_ALERT) {
		release = min_pages_trigger_port;
		min_pages_trigger_port = trigger_port;
		minimum_pages_remaining = hi_wat/vm_page_size;
		kr = KERN_SUCCESS;
	} else if (flags == LO_WAT_ALERT) {
		release = max_pages_trigger_port;
		max_pages_trigger_port = trigger_port;
		maximum_pages_free = lo_wat/vm_page_size;
		kr = KERN_SUCCESS;
	} else {
		release = trigger_port;
		kr = KERN_INVALID_ARGUMENT;
	}

	if (IP_VALID(release))
		ipc_port_release_send(release);

	return kr;
}
/*
 * Monitor the amount of available backing store vs. the amount of
 * required backing store, notify a listener (if present) when
 * backing store may safely be removed.
 *
 * We attempt to avoid the situation where backing store is
 * discarded en masse, as this can lead to thrashing as the
 * backing store is compacted.
 */

#define PF_INTERVAL	3	/* time between free level checks */
#define PF_LATENCY	10	/* number of intervals before release */

static int dp_pages_free_low_count = 0;
thread_call_t default_pager_backing_store_monitor_callout;

void
default_pager_backing_store_monitor(__unused thread_call_param_t p1,
				    __unused thread_call_param_t p2)
{
//	unsigned long long	average;
	ipc_port_t		trigger;
	uint64_t		deadline;

	/*
	 * We determine whether it will be safe to release some
	 * backing store by watching the free page level.  If
	 * it remains below the maximum_pages_free threshold for
	 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
	 * then we deem it safe.
	 *
	 * Note that this establishes a maximum rate at which backing
	 * store will be released, as each notification (currently)
	 * only results in a single backing store object being
	 * released.
	 */
	if (dp_pages_free > maximum_pages_free) {
		dp_pages_free_low_count++;
	} else {
		dp_pages_free_low_count = 0;
	}

	/* decide whether to send notification */
	trigger = IP_NULL;
	if (max_pages_trigger_port &&
	    (backing_store_release_trigger_disable == 0) &&
	    (dp_pages_free_low_count > PF_LATENCY)) {
		trigger = max_pages_trigger_port;
		max_pages_trigger_port = NULL;
	}

	/* send notification */
	if (trigger != IP_NULL) {
		if(backing_store_release_trigger_disable != 0) {
			assert_wait((event_t)
				    &backing_store_release_trigger_disable,
				    THREAD_UNINT);
			thread_block(THREAD_CONTINUE_NULL);
		}
		default_pager_space_alert(trigger, LO_WAT_ALERT);
		ipc_port_release_send(trigger);
		dp_pages_free_low_count = 0;
	}

	clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
	thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
}