/*
 * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

/*
 * Paging File Management.
 */
#include <mach/memory_object_control.h>
#include <mach/memory_object_server.h>
#include "default_pager_internal.h"
#include <default_pager/default_pager_alerts.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_space.h>
#include <kern/queue.h>
#include <kern/counters.h>
#include <kern/sched_prim.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_map.h>
/*
 * ALLOC_STRIDE... the maximum number of bytes allocated from
 * a swap file before moving on to the next swap file... if
 * all swap files reside on a single disk, this value should
 * be very large (this is the default assumption)... if the
 * swap files are spread across multiple disks, then this value
 * should be small (128 * 1024)...
 *
 * This should be determined dynamically in the future.
 */
#define ALLOC_STRIDE	(1024 * 1024 * 1024)
int physical_transfer_cluster_count = 0;

#define VM_SUPER_CLUSTER	0x40000
#define VM_SUPER_PAGES		64

/*
 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
 */
#define VSTRUCT_DEF_CLSHIFT	2
int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
int default_pager_clsize = 0;

unsigned int clustered_writes[VM_SUPER_PAGES+1];
unsigned int clustered_reads[VM_SUPER_PAGES+1];
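
/*
 * Illustrative sketch (compiled out, not part of the original code): how
 * the cluster-shift constants above translate into per-cluster sizes.
 * Assumes the 4K vm_page_size this code was written for; the helper name
 * is made up for the example only.
 */
#if 0
static void
cluster_size_example(void)
{
	int pages_per_cluster = 1 << vstruct_def_clshift;	   /* 1 << 2 == 4 pages */
	int bytes_per_cluster = pages_per_cluster * vm_page_size; /* 4 * 4096 == 16KB */
	int super_pages = VM_SUPER_CLUSTER / vm_page_size;	   /* 0x40000 / 4096 == VM_SUPER_PAGES */

	printf("cluster: %d pages (%d bytes), super-cluster: %d pages\n",
	       pages_per_cluster, bytes_per_cluster, super_pages);
}
#endif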
/*
 * Globals used for asynchronous paging operations:
 *	vs_async_list:	head of list of to-be-completed I/O ops
 *	async_num_queued: number of pages completed, but not yet
 *		processed by async thread.
 *	async_requests_out: number of pages of requests not completed.
 */
struct vs_async *vs_async_list;
int	async_num_queued;
int	async_requests_out;

#define VS_ASYNC_REUSE 1
struct vs_async *vs_async_free_list;

mutex_t default_pager_async_lock;	/* Protects globals above */

int vs_alloc_async_failed = 0;			/* statistics */
int vs_alloc_async_count = 0;			/* statistics */
struct vs_async *vs_alloc_async(void);		/* forward */
void vs_free_async(struct vs_async *vsa);	/* forward */

#define VS_ALLOC_ASYNC()	vs_alloc_async()
#define VS_FREE_ASYNC(vsa)	vs_free_async(vsa)

#define VS_ASYNC_LOCK()		mutex_lock(&default_pager_async_lock)
#define VS_ASYNC_UNLOCK()	mutex_unlock(&default_pager_async_lock)
#define VS_ASYNC_LOCK_INIT()	mutex_init(&default_pager_async_lock, \

#define VS_ASYNC_LOCK_ADDR()	(&default_pager_async_lock)
/*
 * Paging Space Hysteresis triggers and the target notification port
 */
unsigned int	minimum_pages_remaining = 0;
unsigned int	maximum_pages_free = 0;
ipc_port_t	min_pages_trigger_port = NULL;
ipc_port_t	max_pages_trigger_port = NULL;

boolean_t	bs_low = FALSE;
int		backing_store_release_trigger_disable = 0;
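
/*
 * Sketch (compiled out) of how the trigger globals above are consumed
 * later in this file (see ps_select_segment / ps_allocate_cluster): the
 * port is claimed and cleared while the paging-segment list lock is
 * held, and the alert is sent only after the lock is dropped.  The
 * PSL_LOCK/PSL_UNLOCK macros are assumed from default_pager_internal.h.
 */
#if 0
	ipc_port_t trigger = IP_NULL;

	PSL_LOCK();
	if (min_pages_trigger_port &&
	    (dp_pages_free < minimum_pages_remaining)) {
		trigger = min_pages_trigger_port;	/* one-shot: consume the send right */
		min_pages_trigger_port = NULL;
	}
	PSL_UNLOCK();

	if (trigger != IP_NULL) {
		default_pager_space_alert(trigger, HI_WAT_ALERT);
		ipc_port_release_send(trigger);
	}
#endif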
/*
 * Object sizes are rounded up to the next power of 2,
 * unless they are bigger than a given maximum size.
 */
vm_size_t	max_doubled_size = 4 * 1024 * 1024;	/* 4 meg */

/*
 * List of all backing store and segments.
 */
struct backing_store_list_head backing_store_list;
paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
mutex_t paging_segments_lock;
int paging_segment_max = 0;
int paging_segment_count = 0;
int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
/*
 * Total pages free in system.
 * This differs from clusters committed/avail which is a measure of the
 * over-commitment of paging segments to backing store.  An idea which is
 * likely to be deprecated.
 */
unsigned int	dp_pages_free = 0;
unsigned int	cluster_transfer_minimum = 100;

kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int);	/* forward */
kern_return_t ps_read_file (paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int *, int);	/* forward */
default_pager_thread_t *
get_read_buffer(void)
{
	int	i;

	DPT_LOCK(dpt_lock);
	while (TRUE) {
		for (i = 0; i < default_pager_internal_count; i++) {
			if (dpt_array[i]->checked_out == FALSE) {
				dpt_array[i]->checked_out = TRUE;
				DPT_UNLOCK(dpt_lock);
				return dpt_array[i];
			}
		}
		DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
	}
}
	/*
	 * List of all backing store.
	 */
	queue_init(&backing_store_list.bsl_queue);

	VS_ASYNC_LOCK_INIT();
#if	VS_ASYNC_REUSE
	vs_async_free_list = NULL;
#endif	/* VS_ASYNC_REUSE */

	for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
		clustered_writes[i] = 0;
		clustered_reads[i] = 0;
	}
/*
 * When things do not quite work out...
 */
void bs_no_paging_space(boolean_t);	/* forward */

void
bs_no_paging_space(
	boolean_t out_of_memory)
{
	if (out_of_memory)
		dprintf(("*** OUT OF MEMORY ***\n"));
	panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
}
void bs_more_space(int);	/* forward */
void bs_commit(int);		/* forward */

boolean_t	user_warned = FALSE;
unsigned int	clusters_committed = 0;
unsigned int	clusters_available = 0;
unsigned int	clusters_committed_peak = 0;
253 * Account for new paging space.
255 clusters_available
+= nclusters
;
257 if (clusters_available
>= clusters_committed
) {
258 if (verbose
&& user_warned
) {
259 printf("%s%s - %d excess clusters now.\n",
261 "paging space is OK now",
262 clusters_available
- clusters_committed
);
264 clusters_committed_peak
= 0;
267 if (verbose
&& user_warned
) {
268 printf("%s%s - still short of %d clusters.\n",
270 "WARNING: paging space over-committed",
271 clusters_committed
- clusters_available
);
272 clusters_committed_peak
-= nclusters
;
285 clusters_committed
+= nclusters
;
286 if (clusters_committed
> clusters_available
) {
287 if (verbose
&& !user_warned
) {
289 printf("%s%s - short of %d clusters.\n",
291 "WARNING: paging space over-committed",
292 clusters_committed
- clusters_available
);
294 if (clusters_committed
> clusters_committed_peak
) {
295 clusters_committed_peak
= clusters_committed
;
298 if (verbose
&& user_warned
) {
299 printf("%s%s - was short of up to %d clusters.\n",
301 "paging space is OK now",
302 clusters_committed_peak
- clusters_available
);
304 clusters_committed_peak
= 0;
312 int default_pager_info_verbose
= 1;
319 vm_size_t pages_total
, pages_free
;
324 pages_total
= pages_free
= 0;
325 for (i
= 0; i
<= paging_segment_max
; i
++) {
326 ps
= paging_segments
[i
];
327 if (ps
== PAGING_SEGMENT_NULL
)
331 * no need to lock: by the time this data
332 * gets back to any remote requestor it
333 * will be obsolete anyways
335 pages_total
+= ps
->ps_pgnum
;
336 pages_free
+= ps
->ps_clcount
<< ps
->ps_clshift
;
337 DEBUG(DEBUG_BS_INTERNAL
,
338 ("segment #%d: %d total, %d free\n",
339 i
, ps
->ps_pgnum
, ps
->ps_clcount
<< ps
->ps_clshift
));
341 *totalp
= pages_total
;
343 if (verbose
&& user_warned
&& default_pager_info_verbose
) {
344 if (clusters_available
< clusters_committed
) {
345 printf("%s %d clusters committed, %d available.\n",
354 backing_store_t
backing_store_alloc(void); /* forward */
357 backing_store_alloc(void)
361 bs
= (backing_store_t
) kalloc(sizeof (struct backing_store
));
362 if (bs
== BACKING_STORE_NULL
)
363 panic("backing_store_alloc: no memory");
366 bs
->bs_port
= MACH_PORT_NULL
;
369 bs
->bs_pages_total
= 0;
371 bs
->bs_pages_in_fail
= 0;
372 bs
->bs_pages_out
= 0;
373 bs
->bs_pages_out_fail
= 0;
378 backing_store_t
backing_store_lookup(MACH_PORT_FACE
); /* forward */
380 /* Even in both the component space and external versions of this pager, */
381 /* backing_store_lookup will be called from tasks in the application space */
383 backing_store_lookup(
389 port is currently backed with a vs structure in the alias field
390 we could create an ISBS alias and a port_is_bs call but frankly
391 I see no reason for the test, the bs->port == port check below
392 will work properly on junk entries.
394 if ((port == MACH_PORT_NULL) || port_is_vs(port))
396 if ((port
== MACH_PORT_NULL
))
397 return BACKING_STORE_NULL
;
400 queue_iterate(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
403 if (bs
->bs_port
== port
) {
405 /* Success, return it locked. */
411 return BACKING_STORE_NULL
;
414 void backing_store_add(backing_store_t
); /* forward */
420 MACH_PORT_FACE port
= bs
->bs_port
;
421 MACH_PORT_FACE pset
= default_pager_default_set
;
422 kern_return_t kr
= KERN_SUCCESS
;
424 if (kr
!= KERN_SUCCESS
)
425 panic("backing_store_add: add to set");
430 * Set up default page shift, but only if not already
431 * set and argument is within range.
434 bs_set_default_clsize(unsigned int npages
)
441 if (default_pager_clsize
== 0) /* if not yet set */
442 vstruct_def_clshift
= local_log2(npages
);
448 int bs_get_global_clsize(int clsize
); /* forward */
451 bs_get_global_clsize(
455 memory_object_default_t dmm
;
459 * Only allow setting of cluster size once. If called
460 * with no cluster size (default), we use the compiled-in default
461 * for the duration. The same cluster size is used for all
464 if (default_pager_clsize
== 0) {
466 * Keep cluster size in bit shift because it's quicker
467 * arithmetic, and easier to keep at a power of 2.
469 if (clsize
!= NO_CLSIZE
) {
470 for (i
= 0; (1 << i
) < clsize
; i
++);
471 if (i
> MAX_CLUSTER_SHIFT
)
472 i
= MAX_CLUSTER_SHIFT
;
473 vstruct_def_clshift
= i
;
475 default_pager_clsize
= (1 << vstruct_def_clshift
);
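
		/*
		 * Worked example (compiled out) of the rounding loop above:
		 * a requested clsize of 3 pages gives i == 2, since
		 * 1 << 2 == 4 is the first power of two >= 3; i is then
		 * clamped to MAX_CLUSTER_SHIFT.
		 */
#if 0
		{
			int want = 3, s;

			for (s = 0; (1 << s) < want; s++);
			assert(s == 2);
		}
#endif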
478 * Let the user know the new (and definitive) cluster size.
481 printf("%scluster size = %d page%s\n",
482 my_name
, default_pager_clsize
,
483 (default_pager_clsize
== 1) ? "" : "s");
486 * Let the kernel know too, in case it hasn't used the
487 * default value provided in main() yet.
489 dmm
= default_pager_object
;
490 clsize
= default_pager_clsize
* vm_page_size
; /* in bytes */
491 kr
= host_default_memory_manager(host_priv_self(),
494 memory_object_default_deallocate(dmm
);
496 if (kr
!= KERN_SUCCESS
) {
497 panic("bs_get_global_cl_size:host_default_memory_manager");
499 if (dmm
!= default_pager_object
) {
500 panic("bs_get_global_cl_size:there is another default pager");
503 ASSERT(default_pager_clsize
> 0 &&
504 (default_pager_clsize
& (default_pager_clsize
- 1)) == 0);
506 return default_pager_clsize
;
510 default_pager_backing_store_create(
511 memory_object_default_t pager
,
513 int clsize
, /* in bytes */
514 MACH_PORT_FACE
*backing_store
)
519 struct vstruct_alias
*alias_struct
;
521 if (pager
!= default_pager_object
)
522 return KERN_INVALID_ARGUMENT
;
524 bs
= backing_store_alloc();
525 port
= ipc_port_alloc_kernel();
526 ipc_port_make_send(port
);
527 assert (port
!= IP_NULL
);
529 DEBUG(DEBUG_BS_EXTERNAL
,
530 ("priority=%d clsize=%d bs_port=0x%x\n",
531 priority
, clsize
, (int) backing_store
));
533 alias_struct
= (struct vstruct_alias
*)
534 kalloc(sizeof (struct vstruct_alias
));
535 if(alias_struct
!= NULL
) {
536 alias_struct
->vs
= (struct vstruct
*)bs
;
537 alias_struct
->name
= ISVS
;
538 port
->alias
= (int) alias_struct
;
541 ipc_port_dealloc_kernel((MACH_PORT_FACE
)(port
));
542 kfree((vm_offset_t
)bs
, sizeof (struct backing_store
));
543 return KERN_RESOURCE_SHORTAGE
;
547 if (priority
== DEFAULT_PAGER_BACKING_STORE_MAXPRI
)
548 priority
= BS_MAXPRI
;
549 else if (priority
== BS_NOPRI
)
550 priority
= BS_MAXPRI
;
552 priority
= BS_MINPRI
;
553 bs
->bs_priority
= priority
;
555 bs
->bs_clsize
= bs_get_global_clsize(atop_32(clsize
));
558 queue_enter(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
562 backing_store_add(bs
);
564 *backing_store
= port
;
569 default_pager_backing_store_info(
570 MACH_PORT_FACE backing_store
,
571 backing_store_flavor_t flavour
,
572 backing_store_info_t info
,
573 mach_msg_type_number_t
*size
)
576 backing_store_basic_info_t basic
;
580 if (flavour
!= BACKING_STORE_BASIC_INFO
||
581 *size
< BACKING_STORE_BASIC_INFO_COUNT
)
582 return KERN_INVALID_ARGUMENT
;
584 basic
= (backing_store_basic_info_t
)info
;
585 *size
= BACKING_STORE_BASIC_INFO_COUNT
;
587 VSTATS_LOCK(&global_stats
.gs_lock
);
588 basic
->pageout_calls
= global_stats
.gs_pageout_calls
;
589 basic
->pagein_calls
= global_stats
.gs_pagein_calls
;
590 basic
->pages_in
= global_stats
.gs_pages_in
;
591 basic
->pages_out
= global_stats
.gs_pages_out
;
592 basic
->pages_unavail
= global_stats
.gs_pages_unavail
;
593 basic
->pages_init
= global_stats
.gs_pages_init
;
594 basic
->pages_init_writes
= global_stats
.gs_pages_init_writes
;
595 VSTATS_UNLOCK(&global_stats
.gs_lock
);
597 if ((bs
= backing_store_lookup(backing_store
)) == BACKING_STORE_NULL
)
598 return KERN_INVALID_ARGUMENT
;
600 basic
->bs_pages_total
= bs
->bs_pages_total
;
602 bs
->bs_pages_free
= 0;
603 for (i
= 0; i
<= paging_segment_max
; i
++) {
604 ps
= paging_segments
[i
];
605 if (ps
!= PAGING_SEGMENT_NULL
&& ps
->ps_bs
== bs
) {
607 bs
->bs_pages_free
+= ps
->ps_clcount
<< ps
->ps_clshift
;
612 basic
->bs_pages_free
= bs
->bs_pages_free
;
613 basic
->bs_pages_in
= bs
->bs_pages_in
;
614 basic
->bs_pages_in_fail
= bs
->bs_pages_in_fail
;
615 basic
->bs_pages_out
= bs
->bs_pages_out
;
616 basic
->bs_pages_out_fail
= bs
->bs_pages_out_fail
;
618 basic
->bs_priority
= bs
->bs_priority
;
619 basic
->bs_clsize
= ptoa_32(bs
->bs_clsize
); /* in bytes */
626 int ps_delete(paging_segment_t
); /* forward */
633 kern_return_t error
= KERN_SUCCESS
;
636 VSL_LOCK(); /* get the lock on the list of vs's */
/* The lock relationship and sequence are fairly complicated. */
639 /* this code looks at a live list, locking and unlocking the list */
640 /* as it traverses it. It depends on the locking behavior of */
641 /* default_pager_no_senders. no_senders always locks the vstruct */
642 /* targeted for removal before locking the vstruct list. However */
643 /* it will remove that member of the list without locking its */
644 /* neighbors. We can be sure when we hold a lock on a vstruct */
645 /* it cannot be removed from the list but we must hold the list */
646 /* lock to be sure that its pointers to its neighbors are valid. */
647 /* Also, we can hold off destruction of a vstruct when the list */
648 /* lock and the vs locks are not being held by bumping the */
649 /* vs_async_pending count. */
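
	/*
	 * Sketch (compiled out) of the pinning convention described above,
	 * as used later in this routine: bumping vs_async_pending holds a
	 * vstruct against destruction while no locks are held, and the
	 * release side must wake anyone blocked in vs_async_wait().  The
	 * VS_LOCK/VS_UNLOCK macros are assumed from default_pager_internal.h.
	 */
#if 0
	vs_async_wait(vs);		/* drain pending async writes */
	vs->vs_async_pending += 1;	/* pin: vs cannot be destroyed */

	/* ... work on vs with the list lock and vs lock dropped ... */

	VS_LOCK(vs);
	vs->vs_async_pending -= 1;	/* unpin */
	if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
		vs->vs_waiting_async = FALSE;
		VS_UNLOCK(vs);
		thread_wakeup(&vs->vs_async_pending);
	} else {
		VS_UNLOCK(vs);
	}
#endif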
652 while(backing_store_release_trigger_disable
!= 0) {
653 VSL_SLEEP(&backing_store_release_trigger_disable
, THREAD_UNINT
);
656 /* we will choose instead to hold a send right */
657 vs_count
= vstruct_list
.vsl_count
;
658 vs
= (vstruct_t
) queue_first((queue_entry_t
)&(vstruct_list
.vsl_queue
));
659 if(vs
== (vstruct_t
)&vstruct_list
) {
664 vs_async_wait(vs
); /* wait for any pending async writes */
665 if ((vs_count
!= 0) && (vs
!= NULL
))
666 vs
->vs_async_pending
+= 1; /* hold parties calling */
670 while((vs_count
!= 0) && (vs
!= NULL
)) {
671 /* We take the count of AMO's before beginning the */
/* transfer of the target segment. */
673 /* We are guaranteed that the target segment cannot get */
674 /* more users. We also know that queue entries are */
675 /* made at the back of the list. If some of the entries */
676 /* we would check disappear while we are traversing the */
677 /* list then we will either check new entries which */
678 /* do not have any backing store in the target segment */
679 /* or re-check old entries. This might not be optimal */
680 /* but it will always be correct. The alternative is to */
681 /* take a snapshot of the list. */
684 if(dp_pages_free
< cluster_transfer_minimum
)
685 error
= KERN_FAILURE
;
687 vm_object_t transfer_object
;
691 transfer_object
= vm_object_allocate(VM_SUPER_CLUSTER
);
693 error
= vm_object_upl_request(transfer_object
,
694 (vm_object_offset_t
)0, VM_SUPER_CLUSTER
,
696 UPL_NO_SYNC
| UPL_CLEAN_IN_PLACE
698 if(error
== KERN_SUCCESS
) {
699 error
= ps_vstruct_transfer_from_segment(
701 upl_commit(upl
, NULL
);
704 error
= KERN_FAILURE
;
706 vm_object_deallocate(transfer_object
);
710 vs
->vs_async_pending
-= 1; /* release vs_async_wait */
711 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
712 vs
->vs_waiting_async
= FALSE
;
714 thread_wakeup(&vs
->vs_async_pending
);
723 while(backing_store_release_trigger_disable
!= 0) {
724 VSL_SLEEP(&backing_store_release_trigger_disable
,
728 next_vs
= (vstruct_t
) queue_next(&(vs
->vs_links
));
729 if((next_vs
!= (vstruct_t
)&vstruct_list
) &&
730 (vs
!= next_vs
) && (vs_count
!= 1)) {
732 vs_async_wait(next_vs
); /* wait for any */
733 /* pending async writes */
734 next_vs
->vs_async_pending
+= 1; /* hold parties */
735 /* calling vs_async_wait */
740 vs
->vs_async_pending
-= 1;
741 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
742 vs
->vs_waiting_async
= FALSE
;
744 thread_wakeup(&vs
->vs_async_pending
);
748 if((vs
== next_vs
) || (next_vs
== (vstruct_t
)&vstruct_list
))
759 default_pager_backing_store_delete(
760 MACH_PORT_FACE backing_store
)
766 int interim_pages_removed
= 0;
769 if ((bs
= backing_store_lookup(backing_store
)) == BACKING_STORE_NULL
)
770 return KERN_INVALID_ARGUMENT
;
773 /* not implemented */
780 error
= KERN_SUCCESS
;
781 for (i
= 0; i
<= paging_segment_max
; i
++) {
782 ps
= paging_segments
[i
];
783 if (ps
!= PAGING_SEGMENT_NULL
&&
785 ! ps
->ps_going_away
) {
787 /* disable access to this segment */
788 ps
->ps_going_away
= TRUE
;
791 * The "ps" segment is "off-line" now,
792 * we can try and delete it...
794 if(dp_pages_free
< (cluster_transfer_minimum
796 error
= KERN_FAILURE
;
800 /* remove all pages associated with the */
801 /* segment from the list of free pages */
802 /* when transfer is through, all target */
803 /* segment pages will appear to be free */
805 dp_pages_free
-= ps
->ps_pgcount
;
806 interim_pages_removed
+= ps
->ps_pgcount
;
808 error
= ps_delete(ps
);
810 if (error
!= KERN_SUCCESS
) {
812 * We couldn't delete the segment,
813 * probably because there's not enough
814 * virtual memory left.
815 * Re-enable all the segments.
824 if (error
!= KERN_SUCCESS
) {
825 for (i
= 0; i
<= paging_segment_max
; i
++) {
826 ps
= paging_segments
[i
];
827 if (ps
!= PAGING_SEGMENT_NULL
&&
831 /* re-enable access to this segment */
832 ps
->ps_going_away
= FALSE
;
836 dp_pages_free
+= interim_pages_removed
;
842 for (i
= 0; i
<= paging_segment_max
; i
++) {
843 ps
= paging_segments
[i
];
844 if (ps
!= PAGING_SEGMENT_NULL
&&
846 if(ps
->ps_going_away
) {
847 paging_segments
[i
] = PAGING_SEGMENT_NULL
;
848 paging_segment_count
--;
850 kfree((vm_offset_t
)ps
->ps_bmap
,
851 RMAPSIZE(ps
->ps_ncls
));
852 kfree((vm_offset_t
)ps
, sizeof *ps
);
857 /* Scan the entire ps array separately to make certain we find the */
858 /* proper paging_segment_max */
859 for (i
= 0; i
< MAX_NUM_PAGING_SEGMENTS
; i
++) {
860 if(paging_segments
[i
] != PAGING_SEGMENT_NULL
)
861 paging_segment_max
= i
;
867 * All the segments have been deleted.
868 * We can remove the backing store.
872 * Disable lookups of this backing store.
874 if((void *)bs
->bs_port
->alias
!= NULL
)
875 kfree((vm_offset_t
) bs
->bs_port
->alias
,
876 sizeof (struct vstruct_alias
));
877 ipc_port_dealloc_kernel((ipc_port_t
) (bs
->bs_port
));
878 bs
->bs_port
= MACH_PORT_NULL
;
882 * Remove backing store from backing_store list.
885 queue_remove(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
890 * Free the backing store structure.
892 kfree((vm_offset_t
)bs
, sizeof *bs
);
897 int ps_enter(paging_segment_t
); /* forward */
907 for (i
= 0; i
< MAX_NUM_PAGING_SEGMENTS
; i
++) {
908 if (paging_segments
[i
] == PAGING_SEGMENT_NULL
)
912 if (i
< MAX_NUM_PAGING_SEGMENTS
) {
913 paging_segments
[i
] = ps
;
914 if (i
> paging_segment_max
)
915 paging_segment_max
= i
;
916 paging_segment_count
++;
917 if ((ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_NOPRI
) ||
918 (ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_FULLPRI
))
919 ps_select_array
[ps
->ps_bs
->bs_priority
] = 0;
923 return KERN_RESOURCE_SHORTAGE
;
932 default_pager_add_segment(
933 MACH_PORT_FACE backing_store
,
934 MACH_PORT_FACE device
,
944 if ((bs
= backing_store_lookup(backing_store
))
945 == BACKING_STORE_NULL
)
946 return KERN_INVALID_ARGUMENT
;
949 for (i
= 0; i
<= paging_segment_max
; i
++) {
950 ps
= paging_segments
[i
];
951 if (ps
== PAGING_SEGMENT_NULL
)
955 * Check for overlap on same device.
957 if (!(ps
->ps_device
!= device
958 || offset
>= ps
->ps_offset
+ ps
->ps_recnum
959 || offset
+ count
<= ps
->ps_offset
)) {
962 return KERN_INVALID_ARGUMENT
;
968 * Set up the paging segment
970 ps
= (paging_segment_t
) kalloc(sizeof (struct paging_segment
));
971 if (ps
== PAGING_SEGMENT_NULL
) {
973 return KERN_RESOURCE_SHORTAGE
;
976 ps
->ps_segtype
= PS_PARTITION
;
977 ps
->ps_device
= device
;
978 ps
->ps_offset
= offset
;
979 ps
->ps_record_shift
= local_log2(vm_page_size
/ record_size
);
980 ps
->ps_recnum
= count
;
981 ps
->ps_pgnum
= count
>> ps
->ps_record_shift
;
983 ps
->ps_pgcount
= ps
->ps_pgnum
;
984 ps
->ps_clshift
= local_log2(bs
->bs_clsize
);
985 ps
->ps_clcount
= ps
->ps_ncls
= ps
->ps_pgcount
>> ps
->ps_clshift
;
989 ps
->ps_bmap
= (unsigned char *) kalloc(RMAPSIZE(ps
->ps_ncls
));
991 kfree((vm_offset_t
)ps
, sizeof *ps
);
993 return KERN_RESOURCE_SHORTAGE
;
995 for (i
= 0; i
< ps
->ps_ncls
; i
++) {
996 clrbit(ps
->ps_bmap
, i
);
999 ps
->ps_going_away
= FALSE
;
1002 if ((error
= ps_enter(ps
)) != 0) {
1003 kfree((vm_offset_t
)ps
->ps_bmap
, RMAPSIZE(ps
->ps_ncls
));
1004 kfree((vm_offset_t
)ps
, sizeof *ps
);
1006 return KERN_RESOURCE_SHORTAGE
;
1009 bs
->bs_pages_free
+= ps
->ps_clcount
<< ps
->ps_clshift
;
1010 bs
->bs_pages_total
+= ps
->ps_clcount
<< ps
->ps_clshift
;
1014 dp_pages_free
+= ps
->ps_pgcount
;
1017 bs_more_space(ps
->ps_clcount
);
1019 DEBUG(DEBUG_BS_INTERNAL
,
1020 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1021 device
, offset
, count
, record_size
,
1022 ps
->ps_record_shift
, ps
->ps_pgnum
));
1024 return KERN_SUCCESS
;
1030 MACH_PORT_FACE master
)
1032 security_token_t null_security_token
= {
1035 MACH_PORT_FACE device
;
1036 int info
[DEV_GET_SIZE_COUNT
];
1037 mach_msg_type_number_t info_count
;
1038 MACH_PORT_FACE bs
= MACH_PORT_NULL
;
1039 unsigned int rec_size
;
1042 MACH_PORT_FACE reply_port
;
1044 if (ds_device_open_sync(master
, MACH_PORT_NULL
, D_READ
| D_WRITE
,
1045 null_security_token
, dev_name
, &device
))
1048 info_count
= DEV_GET_SIZE_COUNT
;
1049 if (!ds_device_get_status(device
, DEV_GET_SIZE
, info
, &info_count
)) {
1050 rec_size
= info
[DEV_GET_SIZE_RECORD_SIZE
];
1051 count
= info
[DEV_GET_SIZE_DEVICE_SIZE
] / rec_size
;
1052 clsize
= bs_get_global_clsize(0);
1053 if (!default_pager_backing_store_create(
1054 default_pager_object
,
1055 DEFAULT_PAGER_BACKING_STORE_MAXPRI
,
1056 (clsize
* vm_page_size
),
1058 if (!default_pager_add_segment(bs
, device
,
1059 0, count
, rec_size
)) {
1062 ipc_port_release_receive(bs
);
1066 ipc_port_release_send(device
);
1069 #endif /* DEVICE_PAGING */
1074 vs_alloc_async(void)
1076 struct vs_async
*vsa
;
1077 MACH_PORT_FACE reply_port
;
1081 if (vs_async_free_list
== NULL
) {
1083 vsa
= (struct vs_async
*) kalloc(sizeof (struct vs_async
));
1086 * Try allocating a reply port named after the
1087 * address of the vs_async structure.
1089 struct vstruct_alias
*alias_struct
;
1091 reply_port
= ipc_port_alloc_kernel();
1092 alias_struct
= (struct vstruct_alias
*)
1093 kalloc(sizeof (struct vstruct_alias
));
1094 if(alias_struct
!= NULL
) {
1095 alias_struct
->vs
= (struct vstruct
*)vsa
;
1096 alias_struct
->name
= ISVS
;
1097 reply_port
->alias
= (int) alias_struct
;
1098 vsa
->reply_port
= reply_port
;
1099 vs_alloc_async_count
++;
1102 vs_alloc_async_failed
++;
1103 ipc_port_dealloc_kernel((MACH_PORT_FACE
)
1105 kfree((vm_offset_t
)vsa
,
1106 sizeof (struct vs_async
));
1111 vsa
= vs_async_free_list
;
1112 vs_async_free_list
= vs_async_free_list
->vsa_next
;
1121 struct vs_async
*vsa
)
1124 vsa
->vsa_next
= vs_async_free_list
;
1125 vs_async_free_list
= vsa
;
1129 #else /* VS_ASYNC_REUSE */
1132 vs_alloc_async(void)
1134 struct vs_async
*vsa
;
1135 MACH_PORT_FACE reply_port
;
1138 vsa
= (struct vs_async
*) kalloc(sizeof (struct vs_async
));
1141 * Try allocating a reply port named after the
1142 * address of the vs_async structure.
1144 reply_port
= ipc_port_alloc_kernel();
1145 alias_struct
= (vstruct_alias
*)
1146 kalloc(sizeof (struct vstruct_alias
));
1147 if(alias_struct
!= NULL
) {
1148 alias_struct
->vs
= reply_port
;
1149 alias_struct
->name
= ISVS
;
1150 reply_port
->alias
= (int) vsa
;
1151 vsa
->reply_port
= reply_port
;
1152 vs_alloc_async_count
++;
1155 vs_alloc_async_failed
++;
1156 ipc_port_dealloc_kernel((MACH_PORT_FACE
)
1158 kfree((vm_offset_t
) vsa
,
1159 sizeof (struct vs_async
));
1169 struct vs_async
*vsa
)
1171 MACH_PORT_FACE reply_port
;
1174 reply_port
= vsa
->reply_port
;
1175 kfree((vm_offset_t
) reply_port
->alias
, sizeof (struct vstuct_alias
));
1176 kfree((vm_offset_t
) vsa
, sizeof (struct vs_async
));
1177 ipc_port_dealloc_kernel((MACH_PORT_FACE
) (reply_port
));
1180 vs_alloc_async_count
--;
1185 #endif /* VS_ASYNC_REUSE */
1187 zone_t vstruct_zone
;
1196 vs
= (vstruct_t
) zalloc(vstruct_zone
);
1197 if (vs
== VSTRUCT_NULL
) {
1198 return VSTRUCT_NULL
;
1204 * The following fields will be provided later.
1206 vs
->vs_mem_obj
= NULL
;
1207 vs
->vs_control
= MEMORY_OBJECT_CONTROL_NULL
;
1208 vs
->vs_references
= 1;
1212 vs
->vs_waiting_seqno
= FALSE
;
1213 vs
->vs_waiting_read
= FALSE
;
1214 vs
->vs_waiting_write
= FALSE
;
1215 vs
->vs_waiting_async
= FALSE
;
1217 mutex_init(&vs
->vs_waiting_seqno
, ETAP_DPAGE_VSSEQNO
);
1218 mutex_init(&vs
->vs_waiting_read
, ETAP_DPAGE_VSREAD
);
1219 mutex_init(&vs
->vs_waiting_write
, ETAP_DPAGE_VSWRITE
);
1220 mutex_init(&vs
->vs_waiting_refs
, ETAP_DPAGE_VSREFS
);
1221 mutex_init(&vs
->vs_waiting_async
, ETAP_DPAGE_VSASYNC
);
1229 vs
->vs_clshift
= local_log2(bs_get_global_clsize(0));
1230 vs
->vs_size
= ((atop_32(round_page_32(size
)) - 1) >> vs
->vs_clshift
) + 1;
1231 vs
->vs_async_pending
= 0;
1234 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1235 * depending on the size of the memory object.
1237 if (INDIRECT_CLMAP(vs
->vs_size
)) {
1238 vs
->vs_imap
= (struct vs_map
**)
1239 kalloc(INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1240 vs
->vs_indirect
= TRUE
;
1242 vs
->vs_dmap
= (struct vs_map
*)
1243 kalloc(CLMAP_SIZE(vs
->vs_size
));
1244 vs
->vs_indirect
= FALSE
;
1246 vs
->vs_xfer_pending
= FALSE
;
1247 DEBUG(DEBUG_VS_INTERNAL
,
1248 ("map=0x%x, indirect=%d\n", (int) vs
->vs_dmap
, vs
->vs_indirect
));
1251 * Check to see that we got the space.
1254 kfree((vm_offset_t
)vs
, sizeof *vs
);
1255 return VSTRUCT_NULL
;
1259 * Zero the indirect pointers, or clear the direct pointers.
1261 if (vs
->vs_indirect
)
1262 memset(vs
->vs_imap
, 0,
1263 INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1265 for (i
= 0; i
< vs
->vs_size
; i
++)
1266 VSM_CLR(vs
->vs_dmap
[i
]);
1268 VS_MAP_LOCK_INIT(vs
);
1270 bs_commit(vs
->vs_size
);
1275 paging_segment_t
ps_select_segment(int, int *); /* forward */
1282 paging_segment_t ps
;
1287 * Optimize case where there's only one segment.
1288 * paging_segment_max will index the one and only segment.
1292 if (paging_segment_count
== 1) {
1293 paging_segment_t lps
; /* used to avoid extra PS_UNLOCK */
1294 ipc_port_t trigger
= IP_NULL
;
1296 ps
= paging_segments
[paging_segment_max
];
1297 *psindex
= paging_segment_max
;
1299 if (ps
->ps_going_away
) {
1300 /* this segment is being turned off */
1301 lps
= PAGING_SEGMENT_NULL
;
1303 ASSERT(ps
->ps_clshift
>= shift
);
1304 if (ps
->ps_clcount
) {
1306 dp_pages_free
-= 1 << ps
->ps_clshift
;
1307 if(min_pages_trigger_port
&&
1308 (dp_pages_free
< minimum_pages_remaining
)) {
1309 trigger
= min_pages_trigger_port
;
1310 min_pages_trigger_port
= NULL
;
1315 lps
= PAGING_SEGMENT_NULL
;
1320 if (trigger
!= IP_NULL
) {
1321 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1322 ipc_port_release_send(trigger
);
1327 if (paging_segment_count
== 0) {
1329 return PAGING_SEGMENT_NULL
;
1333 i
>= BS_MINPRI
; i
--) {
1336 if ((ps_select_array
[i
] == BS_NOPRI
) ||
1337 (ps_select_array
[i
] == BS_FULLPRI
))
1339 start_index
= ps_select_array
[i
];
1341 if(!(paging_segments
[start_index
])) {
1343 physical_transfer_cluster_count
= 0;
1345 else if ((physical_transfer_cluster_count
+1) == (ALLOC_STRIDE
>>
1346 (((paging_segments
[start_index
])->ps_clshift
)
1347 + vm_page_shift
))) {
1348 physical_transfer_cluster_count
= 0;
1349 j
= start_index
+ 1;
1351 physical_transfer_cluster_count
+=1;
1353 if(start_index
== 0)
1354 start_index
= paging_segment_max
;
1356 start_index
= start_index
- 1;
1360 if (j
> paging_segment_max
)
1362 if ((ps
= paging_segments
[j
]) &&
1363 (ps
->ps_bs
->bs_priority
== i
)) {
1365 * Force the ps cluster size to be
1366 * >= that of the vstruct.
1369 if (ps
->ps_going_away
) {
1370 /* this segment is being turned off */
1371 } else if ((ps
->ps_clcount
) &&
1372 (ps
->ps_clshift
>= shift
)) {
1373 ipc_port_t trigger
= IP_NULL
;
1376 dp_pages_free
-= 1 << ps
->ps_clshift
;
1377 if(min_pages_trigger_port
&&
1379 minimum_pages_remaining
)) {
1380 trigger
= min_pages_trigger_port
;
1381 min_pages_trigger_port
= NULL
;
1385 * found one, quit looking.
1387 ps_select_array
[i
] = j
;
1390 if (trigger
!= IP_NULL
) {
1391 default_pager_space_alert(
1394 ipc_port_release_send(trigger
);
1401 if (j
== start_index
) {
1403 * none at this priority -- mark it full
1405 ps_select_array
[i
] = BS_FULLPRI
;
1412 return PAGING_SEGMENT_NULL
;
1415 vm_offset_t
ps_allocate_cluster(vstruct_t
, int *, paging_segment_t
); /*forward*/
1418 ps_allocate_cluster(
1421 paging_segment_t use_ps
)
1425 paging_segment_t ps
;
1426 vm_offset_t cluster
;
1427 ipc_port_t trigger
= IP_NULL
;
1430 * Find best paging segment.
1431 * ps_select_segment will decrement cluster count on ps.
1432 * Must pass cluster shift to find the most appropriate segment.
1434 /* NOTE: The addition of paging segment delete capability threatened
1435 * to seriously complicate the treatment of paging segments in this
1436 * module and the ones that call it (notably ps_clmap), because of the
1437 * difficulty in assuring that the paging segment would continue to
1438 * exist between being unlocked and locked. This was
1439 * avoided because all calls to this module are based in either
1440 * dp_memory_object calls which rely on the vs lock, or by
1441 * the transfer function which is part of the segment delete path.
1442 * The transfer function which is part of paging segment delete is
1443 * protected from multiple callers by the backing store lock.
1444 * The paging segment delete function treats mappings to a paging
1445 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1446 * while data is transferred to the remaining segments. This is in
1447 * line with the view that incomplete or in-transition mappings between
1448 * data, a vstruct, and backing store are protected by the vs lock.
1449 * This and the ordering of the paging segment "going_away" bit setting
1452 if (use_ps
!= PAGING_SEGMENT_NULL
) {
1457 ASSERT(ps
->ps_clcount
!= 0);
1460 dp_pages_free
-= 1 << ps
->ps_clshift
;
1461 if(min_pages_trigger_port
&&
1462 (dp_pages_free
< minimum_pages_remaining
)) {
1463 trigger
= min_pages_trigger_port
;
1464 min_pages_trigger_port
= NULL
;
1468 if (trigger
!= IP_NULL
) {
1469 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1470 ipc_port_release_send(trigger
);
1473 } else if ((ps
= ps_select_segment(vs
->vs_clshift
, psindex
)) ==
1474 PAGING_SEGMENT_NULL
) {
1476 bs_no_paging_space(TRUE
);
1481 dprintf(("no space in available paging segments; "
1482 "swapon suggested\n"));
1483 /* the count got off maybe, reset to zero */
1486 if(min_pages_trigger_port
) {
1487 trigger
= min_pages_trigger_port
;
1488 min_pages_trigger_port
= NULL
;
1492 if (trigger
!= IP_NULL
) {
1493 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1494 ipc_port_release_send(trigger
);
1496 return (vm_offset_t
) -1;
1500 * Look for an available cluster. At the end of the loop,
1501 * byte_num is the byte offset and bit_num is the bit offset of the
1502 * first zero bit in the paging segment bitmap.
1505 byte_num
= ps
->ps_hint
;
1506 for (; byte_num
< howmany(ps
->ps_ncls
, NBBY
); byte_num
++) {
1507 if (*(ps
->ps_bmap
+ byte_num
) != BYTEMASK
) {
1508 for (bit_num
= 0; bit_num
< NBBY
; bit_num
++) {
1509 if (isclr((ps
->ps_bmap
+ byte_num
), bit_num
))
1512 ASSERT(bit_num
!= NBBY
);
1516 ps
->ps_hint
= byte_num
;
1517 cluster
= (byte_num
*NBBY
) + bit_num
;
1519 /* Space was reserved, so this must be true */
1520 ASSERT(cluster
< ps
->ps_ncls
);
1522 setbit(ps
->ps_bmap
, cluster
);
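
	/*
	 * Worked example (compiled out) of the bitmap scan above: with
	 * NBBY == 8, a bitmap byte of 0xfb (11111011) is != BYTEMASK, and
	 * isclr() finds bit 2 as the first clear bit, so the free cluster
	 * is (byte_num * NBBY) + 2.
	 */
#if 0
	{
		unsigned char b = 0xfb;
		int bit;

		for (bit = 0; bit < NBBY; bit++)
			if (isclr(&b, bit))
				break;
		assert(bit == 2);
	}
#endif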
1528 void ps_deallocate_cluster(paging_segment_t
, vm_offset_t
); /* forward */
1531 ps_deallocate_cluster(
1532 paging_segment_t ps
,
1533 vm_offset_t cluster
)
1536 if (cluster
>= (vm_offset_t
) ps
->ps_ncls
)
1537 panic("ps_deallocate_cluster: Invalid cluster number");
1540 * Lock the paging segment, clear the cluster's bitmap and increment the
1541 * number of free cluster.
1545 clrbit(ps
->ps_bmap
, cluster
);
1547 dp_pages_free
+= 1 << ps
->ps_clshift
;
1551 * Move the hint down to the freed cluster if it is
1552 * less than the current hint.
1554 if ((cluster
/NBBY
) < ps
->ps_hint
) {
1555 ps
->ps_hint
= (cluster
/NBBY
);
1561 * If we're freeing space on a full priority, reset the array.
1564 if (ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_FULLPRI
)
1565 ps_select_array
[ps
->ps_bs
->bs_priority
] = 0;
1571 void ps_dealloc_vsmap(struct vs_map
*, vm_size_t
); /* forward */
1575 struct vs_map
*vsmap
,
1579 for (i
= 0; i
< size
; i
++)
1580 if (!VSM_ISCLR(vsmap
[i
]) && !VSM_ISERR(vsmap
[i
]))
1581 ps_deallocate_cluster(VSM_PS(vsmap
[i
]),
1582 VSM_CLOFF(vsmap
[i
]));
1595 * If this is an indirect structure, then we walk through the valid
1596 * (non-zero) indirect pointers and deallocate the clusters
1597 * associated with each used map entry (via ps_dealloc_vsmap).
1598 * When all of the clusters in an indirect block have been
1599 * freed, we deallocate the block. When all of the indirect
1600 * blocks have been deallocated we deallocate the memory
1601 * holding the indirect pointers.
1603 if (vs
->vs_indirect
) {
1604 for (i
= 0; i
< INDIRECT_CLMAP_ENTRIES(vs
->vs_size
); i
++) {
1605 if (vs
->vs_imap
[i
] != NULL
) {
1606 ps_dealloc_vsmap(vs
->vs_imap
[i
], CLMAP_ENTRIES
);
1607 kfree((vm_offset_t
)vs
->vs_imap
[i
],
1611 kfree((vm_offset_t
)vs
->vs_imap
,
1612 INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1615 * Direct map. Free used clusters, then memory.
1617 ps_dealloc_vsmap(vs
->vs_dmap
, vs
->vs_size
);
1618 kfree((vm_offset_t
)vs
->vs_dmap
, CLMAP_SIZE(vs
->vs_size
));
1622 bs_commit(- vs
->vs_size
);
1624 zfree(vstruct_zone
, (vm_offset_t
)vs
);
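
/*
 * Sketch (compiled out) of the two-level map shape the walk above relies
 * on: for an indirect vstruct, cluster "cl" lives at
 * vs->vs_imap[cl / CLMAP_ENTRIES][cl % CLMAP_ENTRIES]; a direct vstruct
 * keeps it at vs->vs_dmap[cl].  ps_clmap() performs the same split.  The
 * helper name is made up for the example only.
 */
#if 0
static struct vs_map *
vs_map_entry_sketch(vstruct_t vs, vm_offset_t cl)
{
	if (vs->vs_indirect) {
		struct vs_map *block = vs->vs_imap[cl / CLMAP_ENTRIES];

		return (block == NULL) ? NULL : &block[cl % CLMAP_ENTRIES];
	}
	return &vs->vs_dmap[cl];
}
#endif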
1627 int ps_map_extend(vstruct_t
, int); /* forward */
1633 struct vs_map
**new_imap
;
1634 struct vs_map
*new_dmap
= NULL
;
1637 void *old_map
= NULL
;
1638 int old_map_size
= 0;
1640 if (vs
->vs_size
>= new_size
) {
1642 * Someone has already done the work.
1648 * If the new size extends into the indirect range, then we have one
1649 * of two cases: we are going from indirect to indirect, or we are
1650 * going from direct to indirect. If we are going from indirect to
1651 * indirect, then it is possible that the new size will fit in the old
1652 * indirect map. If this is the case, then just reset the size of the
1653 * vstruct map and we are done. If the new size will not
1654 * fit into the old indirect map, then we have to allocate a new
1655 * indirect map and copy the old map pointers into this new map.
1657 * If we are going from direct to indirect, then we have to allocate a
1658 * new indirect map and copy the old direct pages into the first
1659 * indirect page of the new map.
1660 * NOTE: allocating memory here is dangerous, as we're in the
1663 if (INDIRECT_CLMAP(new_size
)) {
1664 int new_map_size
= INDIRECT_CLMAP_SIZE(new_size
);
1667 * Get a new indirect map and zero it.
1669 old_map_size
= INDIRECT_CLMAP_SIZE(vs
->vs_size
);
1670 if (vs
->vs_indirect
&&
1671 (new_map_size
== old_map_size
)) {
1672 bs_commit(new_size
- vs
->vs_size
);
1673 vs
->vs_size
= new_size
;
1677 new_imap
= (struct vs_map
**)kalloc(new_map_size
);
1678 if (new_imap
== NULL
) {
1681 memset(new_imap
, 0, new_map_size
);
1683 if (vs
->vs_indirect
) {
1684 /* Copy old entries into new map */
1685 memcpy(new_imap
, vs
->vs_imap
, old_map_size
);
1686 /* Arrange to free the old map */
1687 old_map
= (void *) vs
->vs_imap
;
1689 } else { /* Old map was a direct map */
1690 /* Allocate an indirect page */
1691 if ((new_imap
[0] = (struct vs_map
*)
1692 kalloc(CLMAP_THRESHOLD
)) == NULL
) {
1693 kfree((vm_offset_t
)new_imap
, new_map_size
);
1696 new_dmap
= new_imap
[0];
1697 newdsize
= CLMAP_ENTRIES
;
1701 newdsize
= new_size
;
1703 * If the new map is a direct map, then the old map must
1704 * also have been a direct map. All we have to do is
1705 * to allocate a new direct map, copy the old entries
1706 * into it and free the old map.
1708 if ((new_dmap
= (struct vs_map
*)
1709 kalloc(CLMAP_SIZE(new_size
))) == NULL
) {
1715 /* Free the old map */
1716 old_map
= (void *) vs
->vs_dmap
;
1717 old_map_size
= CLMAP_SIZE(vs
->vs_size
);
1719 /* Copy info from the old map into the new map */
1720 memcpy(new_dmap
, vs
->vs_dmap
, old_map_size
);
1722 /* Initialize the rest of the new map */
1723 for (i
= vs
->vs_size
; i
< newdsize
; i
++)
1724 VSM_CLR(new_dmap
[i
]);
1727 vs
->vs_imap
= new_imap
;
1728 vs
->vs_indirect
= TRUE
;
1730 vs
->vs_dmap
= new_dmap
;
1731 bs_commit(new_size
- vs
->vs_size
);
1732 vs
->vs_size
= new_size
;
1734 kfree((vm_offset_t
)old_map
, old_map_size
);
1742 struct clmap
*clmap
,
1747 vm_offset_t cluster
; /* The cluster of offset. */
1748 vm_offset_t newcl
; /* The new cluster allocated. */
1751 struct vs_map
*vsmap
;
1755 ASSERT(vs
->vs_dmap
);
1756 cluster
= atop_32(offset
) >> vs
->vs_clshift
;
1759 * Initialize cluster error value
1761 clmap
->cl_error
= 0;
1764 * If the object has grown, extend the page map.
1766 if (cluster
>= vs
->vs_size
) {
1767 if (flag
== CL_FIND
) {
1768 /* Do not allocate if just doing a lookup */
1770 return (vm_offset_t
) -1;
1772 if (ps_map_extend(vs
, cluster
+ 1)) {
1774 return (vm_offset_t
) -1;
1779 * Look for the desired cluster. If the map is indirect, then we
1780 * have a two level lookup. First find the indirect block, then
1781 * find the actual cluster. If the indirect block has not yet
1782 * been allocated, then do so. If the cluster has not yet been
1783 * allocated, then do so.
1785 * If any of the allocations fail, then return an error.
1786 * Don't allocate if just doing a lookup.
1788 if (vs
->vs_indirect
) {
1789 long ind_block
= cluster
/CLMAP_ENTRIES
;
1791 /* Is the indirect block allocated? */
1792 vsmap
= vs
->vs_imap
[ind_block
];
1793 if (vsmap
== NULL
) {
1794 if (flag
== CL_FIND
) {
1796 return (vm_offset_t
) -1;
1799 /* Allocate the indirect block */
1800 vsmap
= (struct vs_map
*) kalloc(CLMAP_THRESHOLD
);
1801 if (vsmap
== NULL
) {
1803 return (vm_offset_t
) -1;
1805 /* Initialize the cluster offsets */
1806 for (i
= 0; i
< CLMAP_ENTRIES
; i
++)
1808 vs
->vs_imap
[ind_block
] = vsmap
;
1811 vsmap
= vs
->vs_dmap
;
1814 vsmap
+= cluster%CLMAP_ENTRIES
;
1817 * At this point, vsmap points to the struct vs_map desired.
1819 * Look in the map for the cluster, if there was an error on a
1820 * previous write, flag it and return. If it is not yet
1821 * allocated, then allocate it, if we're writing; if we're
1822 * doing a lookup and the cluster's not allocated, return error.
1824 if (VSM_ISERR(*vsmap
)) {
1825 clmap
->cl_error
= VSM_GETERR(*vsmap
);
1827 return (vm_offset_t
) -1;
1828 } else if (VSM_ISCLR(*vsmap
)) {
1831 if (flag
== CL_FIND
) {
1833 * If there's an error and the entry is clear, then
1834 * we've run out of swap space. Record the error
1838 VSM_SETERR(*vsmap
, error
);
1841 return (vm_offset_t
) -1;
1844 * Attempt to allocate a cluster from the paging segment
1846 newcl
= ps_allocate_cluster(vs
, &psindex
,
1847 PAGING_SEGMENT_NULL
);
1850 return (vm_offset_t
) -1;
1853 VSM_SETCLOFF(*vsmap
, newcl
);
1854 VSM_SETPS(*vsmap
, psindex
);
1857 newcl
= VSM_CLOFF(*vsmap
);
1860 * Fill in pertinent fields of the clmap
1862 clmap
->cl_ps
= VSM_PS(*vsmap
);
1863 clmap
->cl_numpages
= VSCLSIZE(vs
);
1864 clmap
->cl_bmap
.clb_map
= (unsigned int) VSM_BMAP(*vsmap
);
1867 * Byte offset in paging segment is byte offset to cluster plus
1868 * byte offset within cluster. It looks ugly, but should be
1871 ASSERT(trunc_page(offset
) == offset
);
1872 newcl
= ptoa_32(newcl
) << vs
->vs_clshift
;
1873 newoff
= offset
& ((1<<(vm_page_shift
+ vs
->vs_clshift
)) - 1);
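
	/*
	 * Worked example (compiled out) of the arithmetic above, assuming
	 * 4K pages (vm_page_shift == 12) and vs_clshift == 2: cluster 5
	 * starts at byte 5 << 14 == 0x14000 in the segment, and an object
	 * offset of 0x15000 keeps its low 14 bits, 0x1000, as the offset
	 * within the cluster, so the returned byte offset is 0x15000.
	 */
#if 0
	assert((ptoa_32(5) << 2) == 0x14000);
	assert((0x15000 & ((1 << (12 + 2)) - 1)) == 0x1000);
#endif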
1874 if (flag
== CL_ALLOC
) {
1876 * set bits in the allocation bitmap according to which
1877 * pages were requested. size is in bytes.
1879 i
= atop_32(newoff
);
1880 while ((size
> 0) && (i
< VSCLSIZE(vs
))) {
1881 VSM_SETALLOC(*vsmap
, i
);
1883 size
-= vm_page_size
;
1886 clmap
->cl_alloc
.clb_map
= (unsigned int) VSM_ALLOC(*vsmap
);
1889 * Offset is not cluster aligned, so number of pages
1890 * and bitmaps must be adjusted
1892 clmap
->cl_numpages
-= atop_32(newoff
);
1893 CLMAP_SHIFT(clmap
, vs
);
1894 CLMAP_SHIFTALLOC(clmap
, vs
);
1899 * The setting of valid bits and handling of write errors
1900 * must be done here, while we hold the lock on the map.
1901 * It logically should be done in ps_vs_write_complete().
1902 * The size and error information has been passed from
1903 * ps_vs_write_complete(). If the size parameter is non-zero,
1904 * then there is work to be done. If error is also non-zero,
1905 * then the error number is recorded in the cluster and the
1906 * entire cluster is in error.
1908 if (size
&& flag
== CL_FIND
) {
1909 vm_offset_t off
= (vm_offset_t
) 0;
1912 for (i
= VSCLSIZE(vs
) - clmap
->cl_numpages
; size
> 0;
1914 VSM_SETPG(*vsmap
, i
);
1915 size
-= vm_page_size
;
1917 ASSERT(i
<= VSCLSIZE(vs
));
1919 BS_STAT(clmap
->cl_ps
->ps_bs
,
1920 clmap
->cl_ps
->ps_bs
->bs_pages_out_fail
+=
1922 off
= VSM_CLOFF(*vsmap
);
1923 VSM_SETERR(*vsmap
, error
);
1926 * Deallocate cluster if error, and no valid pages
1929 if (off
!= (vm_offset_t
) 0)
1930 ps_deallocate_cluster(clmap
->cl_ps
, off
);
1932 return (vm_offset_t
) 0;
1936 DEBUG(DEBUG_VS_INTERNAL
,
1937 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1938 newcl
+newoff
, (int) vs
, (int) vsmap
, flag
));
1939 DEBUG(DEBUG_VS_INTERNAL
,
1940 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1941 (int) clmap
->cl_ps
, clmap
->cl_numpages
,
1942 (int) clmap
->cl_bmap
.clb_map
, (int) clmap
->cl_alloc
.clb_map
));
1944 return (newcl
+ newoff
);
1947 void ps_clunmap(vstruct_t
, vm_offset_t
, vm_size_t
); /* forward */
1955 vm_offset_t cluster
; /* The cluster number of offset */
1956 struct vs_map
*vsmap
;
1961 * Loop through all clusters in this range, freeing paging segment
1962 * clusters and map entries as encountered.
1964 while (length
> 0) {
1968 cluster
= atop_32(offset
) >> vs
->vs_clshift
;
1969 if (vs
->vs_indirect
) /* indirect map */
1970 vsmap
= vs
->vs_imap
[cluster
/CLMAP_ENTRIES
];
1972 vsmap
= vs
->vs_dmap
;
1973 if (vsmap
== NULL
) {
1977 vsmap
+= cluster%CLMAP_ENTRIES
;
1978 if (VSM_ISCLR(*vsmap
)) {
1979 length
-= vm_page_size
;
1980 offset
+= vm_page_size
;
1984 * We've got a valid mapping. Clear it and deallocate
1985 * paging segment cluster pages.
 * Optimize for entire cluster clearing.
1988 if (newoff
= (offset
&((1<<(vm_page_shift
+vs
->vs_clshift
))-1))) {
1990 * Not cluster aligned.
1992 ASSERT(trunc_page(newoff
) == newoff
);
1993 i
= atop_32(newoff
);
1996 while ((i
< VSCLSIZE(vs
)) && (length
> 0)) {
1997 VSM_CLRPG(*vsmap
, i
);
1998 VSM_CLRALLOC(*vsmap
, i
);
1999 length
-= vm_page_size
;
2000 offset
+= vm_page_size
;
2005 * If map entry is empty, clear and deallocate cluster.
2007 if (!VSM_ALLOC(*vsmap
)) {
2008 ps_deallocate_cluster(VSM_PS(*vsmap
),
2017 void ps_vs_write_complete(vstruct_t
, vm_offset_t
, vm_size_t
, int); /* forward */
2020 ps_vs_write_complete(
2029 * Get the struct vsmap for this cluster.
2030 * Use READ, even though it was written, because the
2031 * cluster MUST be present, unless there was an error
2032 * in the original ps_clmap (e.g. no space), in which
2033 * case, nothing happens.
2035 * Must pass enough information to ps_clmap to allow it
2036 * to set the vs_map structure bitmap under lock.
2038 (void) ps_clmap(vs
, offset
, &clmap
, CL_FIND
, size
, error
);
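
	/*
	 * Sketch (compiled out) of the ps_clmap() flag/size combinations
	 * used in this file: CL_ALLOC reserves backing clusters for a
	 * write, CL_FIND with size == 0 is a pure lookup, and CL_FIND with
	 * a non-zero size (as above) marks the written pages valid or
	 * records a write error in the cluster map.
	 */
#if 0
	(void) ps_clmap(vs, offset, &clmap, CL_ALLOC, size, 0);	/* reserve for a write */
	(void) ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);	/* lookup only */
#endif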
2041 void vs_cl_write_complete(vstruct_t
, paging_segment_t
, vm_offset_t
, vm_offset_t
, vm_size_t
, boolean_t
, int); /* forward */
2044 vs_cl_write_complete(
2046 paging_segment_t ps
,
2057 * For internal objects, the error is recorded on a
2058 * per-cluster basis by ps_clmap() which is called
2059 * by ps_vs_write_complete() below.
2061 dprintf(("write failed error = 0x%x\n", error
));
2062 /* add upl_abort code here */
2064 GSTAT(global_stats
.gs_pages_out
+= atop_32(size
));
2066 * Notify the vstruct mapping code, so it can do its accounting.
2068 ps_vs_write_complete(vs
, offset
, size
, error
);
2072 ASSERT(vs
->vs_async_pending
> 0);
2073 vs
->vs_async_pending
-= size
;
2074 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
2075 vs
->vs_waiting_async
= FALSE
;
2077 /* mutex_unlock(&vs->vs_waiting_async); */
2078 thread_wakeup(&vs
->vs_async_pending
);
2085 #ifdef DEVICE_PAGING
2086 kern_return_t
device_write_reply(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2090 MACH_PORT_FACE reply_port
,
2091 kern_return_t device_code
,
2092 io_buf_len_t bytes_written
)
2094 struct vs_async
*vsa
;
2096 vsa
= (struct vs_async
*)
2097 ((struct vstruct_alias
*)(reply_port
->alias
))->vs
;
2099 if (device_code
== KERN_SUCCESS
&& bytes_written
!= vsa
->vsa_size
) {
2100 device_code
= KERN_FAILURE
;
2103 vsa
->vsa_error
= device_code
;
2106 ASSERT(vsa
->vsa_vs
!= VSTRUCT_NULL
);
2107 if(vsa
->vsa_flags
& VSA_TRANSFER
) {
2108 /* revisit when async disk segments redone */
2109 if(vsa
->vsa_error
) {
2110 /* need to consider error condition. re-write data or */
2111 /* throw it away here. */
2113 if(vm_map_copyout(kernel_map
, &ioaddr
,
2114 (vm_map_copy_t
)vsa
->vsa_addr
) != KERN_SUCCESS
)
2115 panic("vs_cluster_write: unable to copy source list\n");
2116 vm_deallocate(kernel_map
, ioaddr
, vsa
->vsa_size
);
2118 ps_vs_write_complete(vsa
->vsa_vs
, vsa
->vsa_offset
,
2119 vsa
->vsa_size
, vsa
->vsa_error
);
2121 vs_cl_write_complete(vsa
->vsa_vs
, vsa
->vsa_ps
, vsa
->vsa_offset
,
2122 vsa
->vsa_addr
, vsa
->vsa_size
, TRUE
,
2127 return KERN_SUCCESS
;
2130 kern_return_t
device_write_reply_inband(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2132 device_write_reply_inband(
2133 MACH_PORT_FACE reply_port
,
2134 kern_return_t return_code
,
2135 io_buf_len_t bytes_written
)
2137 panic("device_write_reply_inband: illegal");
2138 return KERN_SUCCESS
;
2141 kern_return_t
device_read_reply(MACH_PORT_FACE
, kern_return_t
, io_buf_ptr_t
, mach_msg_type_number_t
);
2144 MACH_PORT_FACE reply_port
,
2145 kern_return_t return_code
,
2147 mach_msg_type_number_t dataCnt
)
2149 struct vs_async
*vsa
;
2150 vsa
= (struct vs_async
*)
2151 ((struct vstruct_alias
*)(reply_port
->alias
))->vs
;
2152 vsa
->vsa_addr
= (vm_offset_t
)data
;
2153 vsa
->vsa_size
= (vm_size_t
)dataCnt
;
2154 vsa
->vsa_error
= return_code
;
2155 thread_wakeup(&vsa
->vsa_lock
);
2156 return KERN_SUCCESS
;
2159 kern_return_t
device_read_reply_inband(MACH_PORT_FACE
, kern_return_t
, io_buf_ptr_inband_t
, mach_msg_type_number_t
);
2161 device_read_reply_inband(
2162 MACH_PORT_FACE reply_port
,
2163 kern_return_t return_code
,
2164 io_buf_ptr_inband_t data
,
2165 mach_msg_type_number_t dataCnt
)
2167 panic("device_read_reply_inband: illegal");
2168 return KERN_SUCCESS
;
2171 kern_return_t
device_read_reply_overwrite(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2173 device_read_reply_overwrite(
2174 MACH_PORT_FACE reply_port
,
2175 kern_return_t return_code
,
2176 io_buf_len_t bytes_read
)
2178 panic("device_read_reply_overwrite: illegal\n");
2179 return KERN_SUCCESS
;
2182 kern_return_t
device_open_reply(MACH_PORT_FACE
, kern_return_t
, MACH_PORT_FACE
);
2185 MACH_PORT_FACE reply_port
,
2186 kern_return_t return_code
,
2187 MACH_PORT_FACE device_port
)
2189 panic("device_open_reply: illegal\n");
2190 return KERN_SUCCESS
;
2193 kern_return_t
ps_read_device(paging_segment_t
, vm_offset_t
, vm_offset_t
*, unsigned int, unsigned int *, int); /* forward */
2197 paging_segment_t ps
,
2199 vm_offset_t
*bufferp
,
2201 unsigned int *residualp
,
2205 recnum_t dev_offset
;
2206 unsigned int bytes_wanted
;
2207 unsigned int bytes_read
;
2208 unsigned int total_read
;
2209 vm_offset_t dev_buffer
;
2210 vm_offset_t buf_ptr
;
2211 unsigned int records_read
;
2212 struct vs_async
*vsa
;
2213 mutex_t vs_waiting_read_reply
;
2216 vm_map_copy_t device_data
= NULL
;
2217 default_pager_thread_t
*dpt
= NULL
;
2219 device
= dev_port_lookup(ps
->ps_device
);
2220 clustered_reads
[atop_32(size
)]++;
2222 dev_offset
= (ps
->ps_offset
+
2223 (offset
>> (vm_page_shift
- ps
->ps_record_shift
)));
2224 bytes_wanted
= size
;
2226 *bufferp
= (vm_offset_t
)NULL
;
2229 vsa
= VS_ALLOC_ASYNC();
2233 vsa
->vsa_offset
= 0;
2237 mutex_init(&vsa
->vsa_lock
, ETAP_DPAGE_VSSEQNO
);
2238 ip_lock(vsa
->reply_port
);
2239 vsa
->reply_port
->ip_sorights
++;
2240 ip_reference(vsa
->reply_port
);
2241 ip_unlock(vsa
->reply_port
);
2242 kr
= ds_device_read_common(device
,
2244 (mach_msg_type_name_t
)
2245 MACH_MSG_TYPE_MOVE_SEND_ONCE
,
2249 (IO_READ
| IO_CALL
),
2250 (io_buf_ptr_t
*) &dev_buffer
,
2251 (mach_msg_type_number_t
*) &bytes_read
);
2252 if(kr
== MIG_NO_REPLY
) {
2253 assert_wait(&vsa
->vsa_lock
, THREAD_UNINT
);
2254 thread_block(THREAD_CONTINUE_NULL
);
2256 dev_buffer
= vsa
->vsa_addr
;
2257 bytes_read
= (unsigned int)vsa
->vsa_size
;
2258 kr
= vsa
->vsa_error
;
2261 if (kr
!= KERN_SUCCESS
|| bytes_read
== 0) {
2264 total_read
+= bytes_read
;
2267 * If we got the entire range, use the returned dev_buffer.
2269 if (bytes_read
== size
) {
2270 *bufferp
= (vm_offset_t
)dev_buffer
;
2275 dprintf(("read only %d bytes out of %d\n",
2276 bytes_read
, bytes_wanted
));
2279 dpt
= get_read_buffer();
2280 buf_ptr
= dpt
->dpt_buffer
;
2281 *bufferp
= (vm_offset_t
)buf_ptr
;
2284 * Otherwise, copy the data into the provided buffer (*bufferp)
2285 * and append the rest of the range as it comes in.
2287 memcpy((void *) buf_ptr
, (void *) dev_buffer
, bytes_read
);
2288 buf_ptr
+= bytes_read
;
2289 bytes_wanted
-= bytes_read
;
2290 records_read
= (bytes_read
>>
2291 (vm_page_shift
- ps
->ps_record_shift
));
2292 dev_offset
+= records_read
;
2293 DEBUG(DEBUG_VS_INTERNAL
,
2294 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2295 dev_buffer
, bytes_read
));
2296 if (vm_deallocate(kernel_map
, dev_buffer
, bytes_read
)
2298 Panic("dealloc buf");
2299 } while (bytes_wanted
);
2301 *residualp
= size
- total_read
;
2302 if((dev_buffer
!= *bufferp
) && (total_read
!= 0)) {
2303 vm_offset_t temp_buffer
;
2304 vm_allocate(kernel_map
, &temp_buffer
, total_read
, TRUE
);
2305 memcpy((void *) temp_buffer
, (void *) *bufferp
, total_read
);
2306 if(vm_map_copyin_page_list(kernel_map
, temp_buffer
, total_read
,
2307 VM_MAP_COPYIN_OPT_SRC_DESTROY
|
2308 VM_MAP_COPYIN_OPT_STEAL_PAGES
|
2309 VM_MAP_COPYIN_OPT_PMAP_ENTER
,
2310 (vm_map_copy_t
*)&device_data
, FALSE
))
2311 panic("ps_read_device: cannot copyin locally provided buffer\n");
2313 else if((kr
== KERN_SUCCESS
) && (total_read
!= 0) && (dev_buffer
!= 0)){
2314 if(vm_map_copyin_page_list(kernel_map
, dev_buffer
, bytes_read
,
2315 VM_MAP_COPYIN_OPT_SRC_DESTROY
|
2316 VM_MAP_COPYIN_OPT_STEAL_PAGES
|
2317 VM_MAP_COPYIN_OPT_PMAP_ENTER
,
2318 (vm_map_copy_t
*)&device_data
, FALSE
))
2319 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2324 *bufferp
= (vm_offset_t
)device_data
;
2327 /* Free the receive buffer */
2328 dpt
->checked_out
= 0;
2329 thread_wakeup(&dpt_array
);
2331 return KERN_SUCCESS
;
2334 kern_return_t
ps_write_device(paging_segment_t
, vm_offset_t
, vm_offset_t
, unsigned int, struct vs_async
*); /* forward */
2338 paging_segment_t ps
,
2342 struct vs_async
*vsa
)
2344 recnum_t dev_offset
;
2345 io_buf_len_t bytes_to_write
, bytes_written
;
2346 recnum_t records_written
;
2348 MACH_PORT_FACE reply_port
;
2352 clustered_writes
[atop_32(size
)]++;
2354 dev_offset
= (ps
->ps_offset
+
2355 (offset
>> (vm_page_shift
- ps
->ps_record_shift
)));
2356 bytes_to_write
= size
;
2360 * Asynchronous write.
2362 reply_port
= vsa
->reply_port
;
2363 ip_lock(reply_port
);
2364 reply_port
->ip_sorights
++;
2365 ip_reference(reply_port
);
2366 ip_unlock(reply_port
);
2369 device
= dev_port_lookup(ps
->ps_device
);
2371 vsa
->vsa_addr
= addr
;
2372 kr
=ds_device_write_common(device
,
2374 (mach_msg_type_name_t
) MACH_MSG_TYPE_MOVE_SEND_ONCE
,
2377 (io_buf_ptr_t
) addr
,
2379 (IO_WRITE
| IO_CALL
),
2382 if ((kr
!= KERN_SUCCESS
) && (kr
!= MIG_NO_REPLY
)) {
2384 dprintf(("%s0x%x, addr=0x%x,"
2385 "size=0x%x,offset=0x%x\n",
2386 "device_write_request returned ",
2387 kr
, addr
, size
, offset
));
2389 ps
->ps_bs
->bs_pages_out_fail
+= atop_32(size
));
2390 /* do the completion notification to free resources */
2391 device_write_reply(reply_port
, kr
, 0);
2396 * Synchronous write.
2400 device
= dev_port_lookup(ps
->ps_device
);
2401 kr
=ds_device_write_common(device
,
2405 (io_buf_ptr_t
) addr
,
2407 (IO_WRITE
| IO_SYNC
| IO_KERNEL_BUF
),
2410 if (kr
!= KERN_SUCCESS
) {
2411 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2412 "device_write returned ",
2413 kr
, addr
, size
, offset
));
2415 ps
->ps_bs
->bs_pages_out_fail
+= atop_32(size
));
2418 if (bytes_written
& ((vm_page_size
>> ps
->ps_record_shift
) - 1))
2419 Panic("fragmented write");
2420 records_written
= (bytes_written
>>
2421 (vm_page_shift
- ps
->ps_record_shift
));
2422 dev_offset
+= records_written
;
2424 if (bytes_written
!= bytes_to_write
) {
2425 dprintf(("wrote only %d bytes out of %d\n",
2426 bytes_written
, bytes_to_write
));
2429 bytes_to_write
-= bytes_written
;
2430 addr
+= bytes_written
;
2431 } while (bytes_to_write
> 0);
2433 return PAGER_SUCCESS
;
2437 #else /* !DEVICE_PAGING */
2441 paging_segment_t ps
,
2443 vm_offset_t
*bufferp
,
2445 unsigned int *residualp
,
2448 panic("ps_read_device not supported");
2452 paging_segment_t ps
,
2456 struct vs_async
*vsa
)
2458 panic("ps_write_device not supported");
2461 #endif /* DEVICE_PAGING */
2462 void pvs_object_data_provided(vstruct_t
, upl_t
, vm_offset_t
, vm_size_t
); /* forward */
2465 pvs_object_data_provided(
2472 DEBUG(DEBUG_VS_INTERNAL
,
2473 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2474 upl
, offset
, size
));
2477 GSTAT(global_stats
.gs_pages_in
+= atop_32(size
));
2481 ps_clunmap(vs
, offset
, size
);
2482 #endif /* USE_PRECIOUS */
kern_return_t
pvs_cluster_read(
	vstruct_t	vs,
	vm_offset_t	vs_offset,
	vm_size_t	cnt)
{
	kern_return_t		error = KERN_SUCCESS;
	unsigned int		residual;
	unsigned int		request_flags;
	/* ... */
	vm_offset_t		ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
	paging_segment_t	psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
	/* ... */

	pages_in_cl = 1 << vs->vs_clshift;
	cl_size = pages_in_cl * vm_page_size;
	cl_mask = cl_size - 1;

	/*
	 * This loop will be executed multiple times until the entire
	 * request has been satisfied... if the request spans cluster
	 * boundaries, the clusters will be checked for logical continuity;
	 * if contiguous, the I/O request will span multiple clusters, otherwise
	 * it will be broken up into the minimal set of I/Os.
	 *
	 * If there are holes in a request (either unallocated pages in a paging
	 * segment or an unallocated paging segment), we stop
	 * reading at the hole, inform the VM of any data read, inform
	 * the VM of an unavailable range, then loop again, hoping to
	 * find valid pages later in the requested range. This continues until
	 * the entire range has been examined, and read, if present.
	 */

#if	USE_PRECIOUS
	request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
#else
	request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
#endif
	while (cnt && (error == KERN_SUCCESS)) {
		int	page_list_count;

		if ((vs_offset & cl_mask) &&
		    (cnt > (VM_SUPER_CLUSTER -
			    (vs_offset & cl_mask)))) {
			size = VM_SUPER_CLUSTER;
			size -= vs_offset & cl_mask;
		} else if (cnt > VM_SUPER_CLUSTER) {
			size = VM_SUPER_CLUSTER;
		}
		/* ... */

		while (size > 0 && error == KERN_SUCCESS) {
			vm_offset_t	cur_offset;
			/* ... */

			if ( !ps_info_valid) {
				ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
				psp[seg_index] = CLMAP_PS(clmap);
			}
			/*
			 * skip over unallocated physical segments
			 */
			if (ps_offset[seg_index] == (vm_offset_t) -1) {
				abort_size = cl_size - (vs_offset & cl_mask);
				abort_size = MIN(abort_size, size);

				page_list_count = 0;
				memory_object_super_upl_request(
					/* ... */
					(memory_object_offset_t)vs_offset,
					abort_size, abort_size,
					&upl, NULL, &page_list_count,
					/* ... */);

				if (clmap.cl_error) {
					upl_abort(upl, UPL_ABORT_ERROR);
				} else {
					upl_abort(upl, UPL_ABORT_UNAVAILABLE);
				}
				upl_deallocate(upl);
				/* ... */
				vs_offset += abort_size;
				/* ... */
			}
			cl_index = (vs_offset & cl_mask) / vm_page_size;

			for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
				/*
				 * skip over unallocated pages
				 */
				if (CLMAP_ISSET(clmap, cl_index))
					break;
				abort_size += vm_page_size;
			}
			/* ... */
			/*
			 * Let VM system know about holes in clusters.
			 */
			GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));

			page_list_count = 0;
			memory_object_super_upl_request(
				/* ... */
				(memory_object_offset_t)vs_offset,
				abort_size, abort_size,
				&upl, NULL, &page_list_count,
				/* ... */);

			upl_abort(upl, UPL_ABORT_UNAVAILABLE);
			upl_deallocate(upl);
			/* ... */
			vs_offset += abort_size;

			if (cl_index == pages_in_cl) {
				/*
				 * if we're at the end of this physical cluster
				 * then bump to the next one and continue looking
				 */
				/* ... */
			}
			/* ... */

			/*
			 * remember the starting point of the first allocated page
			 * for the I/O we're about to issue
			 */
			beg_pseg = seg_index;
			beg_indx = cl_index;
			cur_offset = vs_offset;

			/*
			 * calculate the size of the I/O that we can do...
			 * this may span multiple physical segments if
			 * they are contiguous
			 */
			for (xfer_size = 0; xfer_size < size; ) {
				while (cl_index < pages_in_cl
				       && xfer_size < size) {
					/*
					 * accumulate allocated pages within
					 * a physical segment
					 */
					if (CLMAP_ISSET(clmap, cl_index)) {
						xfer_size  += vm_page_size;
						cur_offset += vm_page_size;
						/* ... */
						BS_STAT(psp[seg_index]->ps_bs,
							psp[seg_index]->ps_bs->bs_pages_in++);
					}
					/* ... */
				}
				if (cl_index < pages_in_cl
				    || xfer_size >= size) {
					/*
					 * we've hit an unallocated page or
					 * the end of this request... go fire
					 * the I/O
					 */
					break;
				}
				/*
				 * we've hit the end of the current physical
				 * segment and there's more to do, so try
				 * moving to the next one
				 */
				/* ... */
				ps_offset[seg_index] =
					ps_clmap(vs,
						 cur_offset & ~cl_mask,
						 &clmap, CL_FIND, 0, 0);
				psp[seg_index] = CLMAP_PS(clmap);
				/* ... */

				if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
					/*
					 * if the physical segment we're about
					 * to step into is not contiguous to
					 * the one we're currently in, or it's
					 * in a different paging file, or
					 * it hasn't been allocated....
					 * we stop here and generate the I/O
					 */
					break;
				}
				/*
				 * start with first page of the next physical
				 * segment
				 */
				/* ... */
			}
			/*
			 * we have a contiguous range of allocated pages
			 * to read from
			 */
			page_list_count = 0;
			memory_object_super_upl_request(vs->vs_control,
				(memory_object_offset_t)vs_offset,
				xfer_size, xfer_size,
				&upl, NULL, &page_list_count,
				request_flags | UPL_SET_INTERNAL);

			error = ps_read_file(psp[beg_pseg],
				upl, (vm_offset_t) 0,
				ps_offset[beg_pseg] +
					(beg_indx * vm_page_size),
				xfer_size, &residual, 0);
			/* ... */

			/*
			 * Adjust counts and send response to VM. Optimize
			 * for the common case, i.e. no error and/or partial
			 * data. If there was an error, then we need to error
			 * the entire range, even if some data was successfully
			 * read. If there was a partial read we may supply some
			 * data and may error some as well. In all cases the
			 * VM must receive some notification for every page
			 * in the range.
			 */
			if ((error == KERN_SUCCESS) && (residual == 0)) {
				/*
				 * Got everything we asked for, supply the data
				 * to the VM. Note that as a side effect of
				 * supplying the data, the buffer holding the
				 * supplied data is deallocated from the pager's
				 * address space.
				 */
				pvs_object_data_provided(
					vs, upl, vs_offset, xfer_size);
			} else {
				failed_size = xfer_size;

				if (error == KERN_SUCCESS) {
					if (residual == xfer_size) {
						/*
						 * If a read operation returns no error
						 * and no data moved, we turn it into
						 * an error, assuming we're reading at
						 * or beyond EOF.
						 * Fall through and error the entire
						 * range.
						 */
						error = KERN_FAILURE;
					} else {
						/*
						 * Otherwise, we have a partial read. If
						 * the part read is an integral number
						 * of pages, supply it. Otherwise round
						 * it up to a page boundary, zero fill
						 * the unread part, and supply it.
						 * Fall through and error the remainder
						 * of the range, if any.
						 */
						/* ... */
						lsize = (xfer_size - residual)
							/* ... */;
						pvs_object_data_provided(
							/* ... */);

						if (lsize < xfer_size) {
							/* ... */
							error = KERN_FAILURE;
						}
					}
				}
			}
			/*
			 * If there was an error in any part of the range, tell
			 * the VM. Note that error is explicitly checked again
			 * since it can be modified above.
			 */
			if (error != KERN_SUCCESS) {
				BS_STAT(psp[beg_pseg]->ps_bs,
					psp[beg_pseg]->ps_bs->bs_pages_in_fail
						+= atop_32(failed_size));
			}
			/* ... */
			vs_offset += xfer_size;
		}
		/* ... */
	} /* END while (cnt && (error == 0)) */

	return error;
}
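
/*
 * Illustrative sketch (not built): the cluster arithmetic used by
 * pvs_cluster_read() above.  A cluster is (1 << clshift) pages, and
 * cl_mask isolates the offset within a cluster.  The helper mirrors how
 * the read loop trims a request that does not start on a cluster
 * boundary so the first pass ends at the next super-cluster boundary.
 * The "example_" name is hypothetical.
 */
#if 0
static vm_size_t
example_first_chunk(vm_offset_t vs_offset, vm_size_t cnt, int clshift)
{
	vm_size_t cl_size = (1 << clshift) * vm_page_size;
	vm_size_t cl_mask = cl_size - 1;
	vm_size_t size;

	if ((vs_offset & cl_mask) &&
	    (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
		/* unaligned start: stop the first pass at the boundary */
		size = VM_SUPER_CLUSTER;
		size -= vs_offset & cl_mask;
	} else if (cnt > VM_SUPER_CLUSTER) {
		/* aligned start, but larger than one super cluster */
		size = VM_SUPER_CLUSTER;
	} else {
		size = cnt;
	}
	return size;
}
#endif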
int vs_do_async_write = 1;

kern_return_t
vs_cluster_write(
	vstruct_t	vs,
	upl_t		internal_upl,
	vm_offset_t	offset,
	vm_size_t	cnt,
	boolean_t	dp_internal,
	int		flags)
{
	vm_offset_t	transfer_size;
	/* ... */
	vm_offset_t	actual_offset;	/* Offset within paging segment */
	paging_segment_t ps;
	vm_offset_t	subx_size;
	vm_offset_t	mobj_base_addr;
	vm_offset_t	mobj_target_addr;
	/* ... */
	struct vs_async	*vsa;
	/* ... */
	upl_page_info_t	*pl;
	/* ... */

	pages_in_cl = 1 << vs->vs_clshift;
	cl_size = pages_in_cl * vm_page_size;

	if (!dp_internal) {
		int		page_list_count;
		/* ... */
		vm_offset_t	upl_offset;
		vm_offset_t	seg_offset;
		vm_offset_t	ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
		paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];

		if (/* ... */) {
			super_size = cl_size;

			request_flags = UPL_NOBLOCK |
				UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
				UPL_NO_SYNC | UPL_SET_INTERNAL;
		} else {
			super_size = VM_SUPER_CLUSTER;

			request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
				UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
				UPL_NO_SYNC | UPL_SET_INTERNAL;
		}

		page_list_count = 0;
		memory_object_super_upl_request(vs->vs_control,
			(memory_object_offset_t)offset,
			/* ... */
			&upl, NULL, &page_list_count,
			request_flags | UPL_FOR_PAGEOUT);

		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

		seg_size = cl_size - (upl->offset % cl_size);
		upl_offset = upl->offset & ~(cl_size - 1);

		for (seg_index = 0, transfer_size = upl->size;
		     transfer_size > 0; ) {
			ps_offset[seg_index] =
				/* ... */;

			if (ps_offset[seg_index] == (vm_offset_t) -1) {
				/* ... */
				upl_deallocate(upl);

				return KERN_FAILURE;
			}
			psp[seg_index] = CLMAP_PS(clmap);

			if (transfer_size > seg_size) {
				transfer_size -= seg_size;
				upl_offset += cl_size;
				/* ... */
			}
			/* ... */
		}
		/*
		 * Ignore any non-present pages at the end of the
		 * UPL.
		 */
		for (page_index = upl->size / vm_page_size; page_index > 0;)
			if (UPL_PAGE_PRESENT(pl, --page_index))
				break;
		num_of_pages = page_index + 1;

		base_index = (upl->offset % cl_size) / PAGE_SIZE;

		for (page_index = 0; page_index < num_of_pages; ) {
			/*
			 * skip over non-dirty pages
			 */
			for ( ; page_index < num_of_pages; page_index++) {
				if (UPL_DIRTY_PAGE(pl, page_index)
				    || UPL_PRECIOUS_PAGE(pl, page_index))
					/*
					 * this is a page we need to write
					 * go see if we can buddy it up with
					 * others that are contiguous to it
					 */
					break;
				/*
				 * if the page is not-dirty, but present we
				 * need to commit it... This is an unusual
				 * case since we only asked for dirty pages
				 */
				if (UPL_PAGE_PRESENT(pl, page_index)) {
					boolean_t empty = FALSE;
					upl_commit_range(upl,
						page_index * vm_page_size,
						/* ... */
						UPL_COMMIT_NOTIFY_EMPTY,
						/* ... */);
					if (empty) {
						assert(page_index ==
						       /* ... */);
						upl_deallocate(upl);
					}
				}
			}
			if (page_index == num_of_pages)
				/*
				 * no more pages to look at, we're out of here
				 */
				break;

			/*
			 * gather up contiguous dirty pages... we have at
			 * least 1, otherwise we would have bailed above;
			 * make sure that each physical segment that we step
			 * into is contiguous to the one we're currently in
			 * if it's not, we have to stop and write what we have
			 */
			for (first_dirty = page_index;
			     page_index < num_of_pages; ) {
				if ( !UPL_DIRTY_PAGE(pl, page_index)
				     && !UPL_PRECIOUS_PAGE(pl, page_index))
					break;
				page_index++;
				/*
				 * if we just looked at the last page in the UPL
				 * we don't need to check for physical segment
				 * continuity
				 */
				if (page_index < num_of_pages) {
					int	cur_seg;
					int	nxt_seg;

					cur_seg = (base_index + (page_index - 1))/pages_in_cl;
					nxt_seg = (base_index + page_index)/pages_in_cl;

					if (cur_seg != nxt_seg) {
						if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
							/*
							 * if the segment we're about
							 * to step into is not
							 * contiguous to the one we're
							 * currently in, or it's in a
							 * different paging file....
							 * we stop here and generate
							 * the I/O
							 */
							break;
					}
				}
			}
			num_dirty = page_index - first_dirty;

			if (num_dirty) {
				upl_offset = first_dirty * vm_page_size;
				transfer_size = num_dirty * vm_page_size;

				while (transfer_size) {

					if ((seg_size = cl_size -
						((upl->offset + upl_offset) % cl_size))
						> transfer_size)
						seg_size = transfer_size;

					ps_vs_write_complete(vs,
						upl->offset + upl_offset,
						/* ... */);

					transfer_size -= seg_size;
					upl_offset += seg_size;
				}
				upl_offset = first_dirty * vm_page_size;
				transfer_size = num_dirty * vm_page_size;

				seg_index = (base_index + first_dirty) / pages_in_cl;
				seg_offset = (upl->offset + upl_offset) % cl_size;

				error = ps_write_file(psp[seg_index],
					upl, upl_offset,
					ps_offset[seg_index]
						+ seg_offset,
					transfer_size, flags);
				if (error != KERN_SUCCESS) {
					boolean_t empty = FALSE;
					upl_abort_range(upl,
						first_dirty * vm_page_size,
						num_dirty * vm_page_size,
						UPL_ABORT_NOTIFY_EMPTY,
						&empty);
					if (empty) {
						assert(page_index == num_of_pages);
						upl_deallocate(upl);
					}
				}
			}
		}
	} else {
		assert(cnt <= (vm_page_size << vs->vs_clshift));
		/* ... */

		/* The caller provides a mapped_data which is derived  */
		/* from a temporary object.  The targeted pages are    */
		/* guaranteed to be set at offset 0 in the mapped_data */
		/* The actual offset however must still be derived     */
		/* from the offset in the vs in question               */
		mobj_base_addr = offset;
		mobj_target_addr = mobj_base_addr;

		for (transfer_size = list_size; transfer_size != 0;) {
			actual_offset = ps_clmap(vs, mobj_target_addr,
					/* ... */
					transfer_size < cl_size ?
					transfer_size : cl_size, 0);
			if (actual_offset == (vm_offset_t) -1) {
				/* ... */
			}
			cnt = MIN(transfer_size,
				  CLMAP_NPGS(clmap) * vm_page_size);
			ps = CLMAP_PS(clmap);
			/* Assume that the caller has given us contiguous */
			/* pages */
			/* ... */
			ps_vs_write_complete(vs, mobj_target_addr,
					/* ... */);
			error = ps_write_file(ps, internal_upl,
					/* ... */);
			/* ... */
			actual_offset += cnt;
			mobj_target_addr += cnt;
			transfer_size -= cnt;
			/* ... */
		}
	}
	if (error != KERN_SUCCESS)
		return KERN_FAILURE;
	/* ... */
	return KERN_SUCCESS;
}
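
/*
 * Illustrative sketch (not built): the "buddy up" scan used by
 * vs_cluster_write() above.  Starting at the first dirty/precious
 * page, it extends the run until it hits a clean page, so one
 * ps_write_file() call can cover the whole contiguous dirty range.
 * The page-query callback is a hypothetical stand-in for the
 * UPL_DIRTY_PAGE()/UPL_PRECIOUS_PAGE() tests.
 */
#if 0
static int
example_gather_dirty_run(boolean_t (*page_needs_write)(int),
			 int first_dirty, int num_of_pages, int *num_dirty)
{
	int page_index;

	for (page_index = first_dirty; page_index < num_of_pages; page_index++) {
		if (!page_needs_write(page_index))
			break;			/* run ends at the first clean page */
	}
	*num_dirty = page_index - first_dirty;	/* pages to write in one I/O */
	return page_index;			/* where the outer scan resumes */
}
#endif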
vm_size_t
ps_vstruct_allocated_size(
	vstruct_t	vs)
{
	int		num_pages;
	struct vs_map	*vsmap;
	int		i, j, k;

	num_pages = 0;
	if (vs->vs_indirect) {
		/* loop on indirect maps */
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			if (vsmap == NULL)
				continue;
			/* loop on clusters in this indirect map */
			for (j = 0; j < CLMAP_ENTRIES; j++) {
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j]))
					continue;
				/* loop on pages in this cluster */
				for (k = 0; k < VSCLSIZE(vs); k++) {
					if ((VSM_BMAP(vsmap[j])) & (1 << k))
						num_pages++;
				}
			}
		}
	} else {
		vsmap = vs->vs_dmap;
		/* ... */
		/* loop on clusters in the direct map */
		for (j = 0; j < CLMAP_ENTRIES; j++) {
			if (VSM_ISCLR(vsmap[j]) ||
			    VSM_ISERR(vsmap[j]))
				continue;
			/* loop on pages in this cluster */
			for (k = 0; k < VSCLSIZE(vs); k++) {
				if ((VSM_BMAP(vsmap[j])) & (1 << k))
					num_pages++;
			}
		}
	}

	return ptoa_32(num_pages);
}
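
/*
 * Illustrative sketch (not built): ps_vstruct_allocated_size() above is
 * essentially a population count over the per-cluster allocation
 * bitmaps.  The helper shows the same bit test for a single cluster,
 * assuming the bitmap is a plain word with one bit per page, as
 * VSM_BMAP() yields.  The "example_" name is hypothetical.
 */
#if 0
static int
example_pages_in_cluster(unsigned int bmap, int pages_per_cluster)
{
	int k, count = 0;

	for (k = 0; k < pages_per_cluster; k++) {
		if (bmap & (1 << k))	/* page k of this cluster is backed */
			count++;
	}
	return count;
}
#endif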
size_t
ps_vstruct_allocated_pages(
	vstruct_t		vs,
	default_pager_page_t	*pages,
	size_t			pages_size)
{
	unsigned int	num_pages;
	struct vs_map	*vsmap;
	vm_offset_t	offset;
	unsigned int	i, j, k;

	num_pages = 0;
	offset = 0;
	if (vs->vs_indirect) {
		/* loop on indirect maps */
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			if (vsmap == NULL) {
				offset += (vm_page_size * CLMAP_ENTRIES *
					   VSCLSIZE(vs));
				continue;
			}
			/* loop on clusters in this indirect map */
			for (j = 0; j < CLMAP_ENTRIES; j++) {
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j])) {
					offset += vm_page_size * VSCLSIZE(vs);
					continue;
				}
				/* loop on pages in this cluster */
				for (k = 0; k < VSCLSIZE(vs); k++) {
					if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
						num_pages++;
						if (num_pages < pages_size)
							pages++->dpp_offset =
								offset;
					}
					offset += vm_page_size;
				}
			}
		}
	} else {
		vsmap = vs->vs_dmap;
		/* ... */
		/* loop on clusters in the direct map */
		for (j = 0; j < CLMAP_ENTRIES; j++) {
			if (VSM_ISCLR(vsmap[j]) ||
			    VSM_ISERR(vsmap[j])) {
				offset += vm_page_size * VSCLSIZE(vs);
				continue;
			}
			/* loop on pages in this cluster */
			for (k = 0; k < VSCLSIZE(vs); k++) {
				if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
					num_pages++;
					if (num_pages < pages_size)
						pages++->dpp_offset = offset;
				}
				offset += vm_page_size;
			}
		}
	}

	return num_pages;
}
kern_return_t
ps_vstruct_transfer_from_segment(
	vstruct_t	 vs,
	paging_segment_t segment,
	upl_t		 upl)
{
	struct vs_map	*vsmap;
	struct vs_map	old_vsmap;
	struct vs_map	new_vsmap;
	/* ... */

	VS_LOCK(vs);	/* block all work on this vstruct */
			/* can't allow the normal multiple write */
			/* semantic because writes may conflict */
	vs->vs_xfer_pending = TRUE;
	vs_wait_for_sync_writers(vs);
	/* ... */
	vs_wait_for_readers(vs);
	/* we will unlock the vs to allow other writes while transferring */
	/* and will be guaranteed of the persistence of the vs struct */
	/* because the caller of ps_vstruct_transfer_from_segment bumped */
	/* vs_async_pending */

	/* OK we now have guaranteed no other parties are accessing this */
	/* vs.  Now that we are also supporting simple lock versions of  */
	/* vs_lock we cannot hold onto VS_LOCK as we may block below.    */
	/* our purpose in holding it before was the multiple write case  */
	/* we now use the boolean xfer_pending to do that.  We can use   */
	/* a boolean instead of a count because we have guaranteed single */
	/* file access to this code in its caller                        */
	/* ... */

	if (vs->vs_indirect) {
		/* ... */
		/* loop on indirect maps */
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			/* ... */
			/* loop on clusters in this indirect map */
			clmap_off = (vm_page_size * CLMAP_ENTRIES *
				     /* ... */);
			if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
				vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
			else
				vsmap_size = CLMAP_ENTRIES;
			for (j = 0; j < vsmap_size; j++) {
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j]) ||
				    (VSM_PS(vsmap[j]) != segment))
					continue;
				if (vs_cluster_transfer(vs,
					(vm_page_size * (j << vs->vs_clshift))
						/* ... */,
					vm_page_size << vs->vs_clshift,
					/* ... */) != KERN_SUCCESS) {
					/* ... */
					vs->vs_xfer_pending = FALSE;
					/* ... */
					vs_finish_write(vs);
					return KERN_FAILURE;
				}
				/* allow other readers/writers during transfer*/
				/* ... */
				vs->vs_xfer_pending = FALSE;
				/* ... */
				vs_finish_write(vs);
				/* ... */
				vs->vs_xfer_pending = TRUE;
				vs_wait_for_sync_writers(vs);
				/* ... */
				vs_wait_for_readers(vs);

				if (!(vs->vs_indirect)) {
					/* ... */
				}
			}
		}
	} else {
		/* ... */
		vsmap = vs->vs_dmap;
		if (vsmap == NULL) {
			/* ... */
			vs->vs_xfer_pending = FALSE;
			/* ... */
			vs_finish_write(vs);
			return KERN_SUCCESS;
		}
		/* loop on clusters in the direct map */
		for (j = 0; j < vs->vs_size; j++) {
			if (VSM_ISCLR(vsmap[j]) ||
			    VSM_ISERR(vsmap[j]) ||
			    (VSM_PS(vsmap[j]) != segment))
				continue;
			if (vs_cluster_transfer(vs,
				vm_page_size * (j << vs->vs_clshift),
				vm_page_size << vs->vs_clshift,
				upl) != KERN_SUCCESS) {
				/* ... */
				vs->vs_xfer_pending = FALSE;
				/* ... */
				vs_finish_write(vs);
				return KERN_FAILURE;
			}
			/* allow other readers/writers during transfer*/
			/* ... */
			vs->vs_xfer_pending = FALSE;
			/* ... */
			vs_finish_write(vs);
			/* ... */
			vs->vs_xfer_pending = TRUE;
			vs_wait_for_sync_writers(vs);
			/* ... */
			vs_wait_for_readers(vs);

			if (vs->vs_indirect) {
				/* ... */
			}
		}
	}
	/* ... */
	vs->vs_xfer_pending = FALSE;
	/* ... */
	vs_finish_write(vs);
	return KERN_SUCCESS;
}
vs_map_t
vs_get_map_entry(
	vstruct_t	vs,
	vm_offset_t	offset)
{
	struct vs_map	*vsmap;
	vm_offset_t	cluster;

	cluster = atop_32(offset) >> vs->vs_clshift;
	if (vs->vs_indirect) {
		long	ind_block = cluster/CLMAP_ENTRIES;

		/* Is the indirect block allocated? */
		vsmap = vs->vs_imap[ind_block];
		if (vsmap == (vs_map_t) NULL)
			return vsmap;
	} else
		vsmap = vs->vs_dmap;
	vsmap += cluster%CLMAP_ENTRIES;
	return vsmap;
}
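
/*
 * Illustrative sketch (not built): the index arithmetic behind
 * vs_get_map_entry() above.  A byte offset is first converted to a page
 * number (atop_32), then to a cluster number by the cluster shift; for
 * an indirect vstruct the cluster number is split into an indirect-block
 * index and an entry within that block.  The "example_" name is
 * hypothetical.
 */
#if 0
static void
example_map_indices(vm_offset_t offset, int clshift,
		    long *ind_block, long *entry)
{
	vm_offset_t cluster = atop_32(offset) >> clshift;

	*ind_block = cluster / CLMAP_ENTRIES;	/* which indirect map block */
	*entry = cluster % CLMAP_ENTRIES;	/* which vs_map slot within it */
}
#endif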
kern_return_t
vs_cluster_transfer(
	vstruct_t	vs,
	vm_offset_t	offset,
	vm_size_t	cnt,
	upl_t		upl)
{
	vm_offset_t		actual_offset;
	paging_segment_t	ps;
	/* ... */
	kern_return_t		error = KERN_SUCCESS;
	int			size, size_wanted, i;
	unsigned int		residual;
	/* ... */
	default_pager_thread_t	*dpt;
	/* ... */
	struct vs_map		*vsmap_ptr;
	struct vs_map		read_vsmap;
	struct vs_map		original_read_vsmap;
	struct vs_map		write_vsmap;
	/* ... */

	/* vs_cluster_transfer reads in the pages of a cluster and
	 * then writes these pages back to new backing store.  The
	 * segment the pages are being read from is assumed to have
	 * been taken off-line and is no longer considered for new
	 * space requests.
	 *
	 * This loop will be executed once per cluster referenced.
	 * Typically this means once, since it's unlikely that the
	 * VM system will ask for anything spanning cluster boundaries.
	 *
	 * If there are holes in a cluster (in a paging segment), we stop
	 * reading at the hole, then loop again, hoping to
	 * find valid pages later in the cluster.  This continues until
	 * the entire range has been examined, and read, if present.  The
	 * pages are written as they are read.  If a failure occurs after
	 * some pages are written the unmap call at the bottom of the loop
	 * recovers the backing store and the old backing store remains
	 * in effect.
	 */

	VSM_CLR(write_vsmap);
	VSM_CLR(original_read_vsmap);
	/* grab the actual object's pages to sync with I/O */
	while (cnt && (error == KERN_SUCCESS)) {
		vsmap_ptr = vs_get_map_entry(vs, offset);
		actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);

		if (actual_offset == (vm_offset_t) -1) {
			/*
			 * Nothing left to write in this cluster at least
			 * set write cluster information for any previous
			 * write, clear for next cluster, if there is one
			 */
			unsigned int	local_size, clmask, clsize;

			clsize = vm_page_size << vs->vs_clshift;
			clmask = clsize - 1;
			local_size = clsize - (offset & clmask);
			/* ... */
			local_size = MIN(local_size, cnt);

			/* This cluster has no data in it beyond what may */
			/* have been found on a previous iteration through */
			/* the loop "write_vsmap" */
			*vsmap_ptr = write_vsmap;
			VSM_CLR(write_vsmap);
			VSM_CLR(original_read_vsmap);
			/* ... */
			offset += local_size;
			continue;
		}

		/*
		 * Count up contiguous available or unavailable
		 * pages.
		 */
		ps = CLMAP_PS(clmap);
		/* ... */
		for (/* ... */
		     (size < cnt) && (unavail_size < cnt) &&
		     (i < CLMAP_NPGS(clmap)); i++) {
			if (CLMAP_ISSET(clmap, i)) {
				if (unavail_size != 0)
					break;
				size += vm_page_size;
				BS_STAT(ps->ps_bs,
					ps->ps_bs->bs_pages_in++);
			} else {
				/* ... */
				unavail_size += vm_page_size;
			}
		}

		if (size == 0) {
			ASSERT(unavail_size);
			cnt -= unavail_size;
			offset += unavail_size;
			if ((offset & ((vm_page_size << vs->vs_clshift) - 1))
			    == 0) {
				/* There is no more to transfer in this
				   cluster
				 */
				*vsmap_ptr = write_vsmap;
				VSM_CLR(write_vsmap);
				VSM_CLR(original_read_vsmap);
			}
			continue;
		}

		if (VSM_ISCLR(original_read_vsmap))
			original_read_vsmap = *vsmap_ptr;

		if (ps->ps_segtype == PS_PARTITION) {
			/*
			   NEED TO ISSUE WITH SYNC & NO COMMIT
			error = ps_read_device(ps, actual_offset, &buffer,
					       size, &residual, flags);
			*/
		} else {
			/* NEED TO ISSUE WITH SYNC & NO COMMIT */
			error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
					     size, &residual,
					     (UPL_IOSYNC | UPL_NOCOMMIT));
		}

		read_vsmap = *vsmap_ptr;
		/* ... */

		/*
		 * Adjust counts and put data in new BS.  Optimize for the
		 * common case, i.e. no error and/or partial data.
		 * If there was an error, then we need to error the entire
		 * range, even if some data was successfully read.
		 */
		if ((error == KERN_SUCCESS) && (residual == 0)) {
			int	page_list_count = 0;

			/*
			 * Got everything we asked for, supply the data to
			 * the new BS.  Note that as a side effect of supplying
			 * the data, the buffer holding the supplied data is
			 * deallocated from the pager's address space unless
			 * the write is unsuccessful.
			 */

			/* note buffer will be cleaned up in all cases by */
			/* internal_cluster_write or if an error on write */
			/* the vm_map_copy_page_discard call */
			*vsmap_ptr = write_vsmap;

			if (vs_cluster_write(vs, upl, offset,
					     size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT) != KERN_SUCCESS) {
				error = KERN_FAILURE;
				if (!(VSM_ISCLR(*vsmap_ptr))) {
					/* unmap the new backing store object */
					ps_clunmap(vs, offset, size);
				}
				/* original vsmap */
				*vsmap_ptr = original_read_vsmap;
				VSM_CLR(write_vsmap);
			} else {
				if ((offset + size) &
				    ((vm_page_size << vs->vs_clshift)
				     - 1)) {
					/* There is more to transfer in this
					   cluster
					 */
					write_vsmap = *vsmap_ptr;
					*vsmap_ptr = read_vsmap;
				} else {
					/* discard the old backing object */
					write_vsmap = *vsmap_ptr;
					*vsmap_ptr = read_vsmap;
					ps_clunmap(vs, offset, size);
					*vsmap_ptr = write_vsmap;
					VSM_CLR(write_vsmap);
					VSM_CLR(original_read_vsmap);
				}
			}
		} else {
			/* ... */
			if (error == KERN_SUCCESS) {
				if (residual == size) {
					/*
					 * If a read operation returns no error
					 * and no data moved, we turn it into
					 * an error, assuming we're reading at
					 * or beyond EOF.
					 * Fall through and error the entire
					 * range.
					 */
					error = KERN_FAILURE;
					*vsmap_ptr = write_vsmap;
					if (!(VSM_ISCLR(*vsmap_ptr))) {
						/* unmap the new backing store object */
						ps_clunmap(vs, offset, size);
					}
					*vsmap_ptr = original_read_vsmap;
					VSM_CLR(write_vsmap);
					/* ... */
				} else {
					/*
					 * Otherwise, we have a partial read.
					 * This is also considered an error
					 * for the purposes of cluster transfer
					 */
					error = KERN_FAILURE;
					*vsmap_ptr = write_vsmap;
					if (!(VSM_ISCLR(*vsmap_ptr))) {
						/* unmap the new backing store object */
						ps_clunmap(vs, offset, size);
					}
					*vsmap_ptr = original_read_vsmap;
					VSM_CLR(write_vsmap);
					/* ... */
				}
			}
		}
		/* ... */
	} /* END while (cnt && (error == 0)) */

	if (!VSM_ISCLR(write_vsmap))
		*vsmap_ptr = write_vsmap;

	return error;
}
kern_return_t
default_pager_add_file(MACH_PORT_FACE backing_store,
	/* ... */)
{
	backing_store_t		bs;
	paging_segment_t	ps;
	int			i;
	int			error;

	if ((bs = backing_store_lookup(backing_store))
	    == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	/* ... */
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;
		if (ps->ps_segtype != PS_FILE)
			continue;

		/*
		 * Check for overlap on same device.
		 */
		if (ps->ps_vnode == (struct vnode *)vp) {
			/* ... */
			return KERN_INVALID_ARGUMENT;
		}
	}
	/* ... */

	/*
	 * Set up the paging segment
	 */
	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
	if (ps == PAGING_SEGMENT_NULL) {
		/* ... */
		return KERN_RESOURCE_SHORTAGE;
	}

	ps->ps_segtype = PS_FILE;
	ps->ps_vnode = (struct vnode *)vp;
	/* ... */
	ps->ps_record_shift = local_log2(vm_page_size / record_size);
	ps->ps_recnum = size;
	ps->ps_pgnum = size >> ps->ps_record_shift;

	ps->ps_pgcount = ps->ps_pgnum;
	ps->ps_clshift = local_log2(bs->bs_clsize);
	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
	/* ... */

	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
	if (!ps->ps_bmap) {
		kfree((vm_offset_t)ps, sizeof *ps);
		/* ... */
		return KERN_RESOURCE_SHORTAGE;
	}
	for (i = 0; i < ps->ps_ncls; i++) {
		clrbit(ps->ps_bmap, i);
	}

	ps->ps_going_away = FALSE;
	/* ... */

	if ((error = ps_enter(ps)) != 0) {
		kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
		kfree((vm_offset_t)ps, sizeof *ps);
		/* ... */
		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
	/* ... */
	dp_pages_free += ps->ps_pgcount;
	/* ... */

	bs_more_space(ps->ps_clcount);

	DEBUG(DEBUG_BS_INTERNAL,
	      ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
	       device, offset, size, record_size,
	       ps->ps_record_shift, ps->ps_pgnum));

	return KERN_SUCCESS;
}
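
/*
 * Illustrative sketch (not built): the sizing arithmetic performed by
 * default_pager_add_file() above for a paging file of `size` records.
 * The 4K page and 512-byte record sizes are assumptions for the
 * example only; with them, eight records make a page, and the page
 * count is then divided into clusters by the backing store's cluster
 * shift.  Names here are hypothetical.
 */
#if 0
static void
example_segment_sizing(unsigned int size /* records */, int bs_clshift)
{
	int record_shift = local_log2(4096 / 512);		/* assumed sizes: 3 */
	unsigned int pgnum = size >> record_shift;		/* whole pages available */
	unsigned int clcount = pgnum >> bs_clshift;		/* whole clusters available */
	unsigned int pages_usable = clcount << bs_clshift;	/* what bs_pages_* count */

	(void)pages_usable;
}
#endif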
kern_return_t
ps_read_file(
	paging_segment_t	ps,
	upl_t			upl,
	vm_offset_t		upl_offset,
	vm_offset_t		offset,
	unsigned int		size,
	unsigned int		*residualp,
	int			flags)
{
	vm_object_offset_t	f_offset;
	int			error = 0;
	kern_return_t		result;

	clustered_reads[atop_32(size)]++;

	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

	/* for transfer case we need to pass uploffset and flags */
	error = vnode_pagein(ps->ps_vnode,
			     upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);

	/* The vnode_pagein semantic is somewhat at odds with the existing   */
	/* device_read semantic.  Partial reads are not experienced at this  */
	/* level.  It is up to the bit map code and cluster read code to     */
	/* check that requested data locations are actually backed, and the  */
	/* pagein code to either read all of the requested data or return an */
	/* error.                                                             */
	if (error)
		result = KERN_FAILURE;
	else {
		*residualp = 0;
		result = KERN_SUCCESS;
	}
	return result;
}
kern_return_t
ps_write_file(
	paging_segment_t	ps,
	upl_t			upl,
	vm_offset_t		upl_offset,
	vm_offset_t		offset,
	unsigned int		size,
	int			flags)
{
	vm_object_offset_t	f_offset;
	kern_return_t		result;

	clustered_writes[atop_32(size)]++;

	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

	if (vnode_pageout(ps->ps_vnode,
			  upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
		result = KERN_FAILURE;
	else
		result = KERN_SUCCESS;

	return result;
}
kern_return_t
default_pager_triggers(MACH_PORT_FACE default_pager,
	int		hi_wat,
	int		lo_wat,
	int		flags,
	MACH_PORT_FACE	trigger_port)
{
	MACH_PORT_FACE	release;
	kern_return_t	kr;

	/* ... */
	if (flags == HI_WAT_ALERT) {
		release = min_pages_trigger_port;
		min_pages_trigger_port = trigger_port;
		minimum_pages_remaining = hi_wat/vm_page_size;
		/* ... */
		kr = KERN_SUCCESS;
	} else if (flags == LO_WAT_ALERT) {
		release = max_pages_trigger_port;
		max_pages_trigger_port = trigger_port;
		maximum_pages_free = lo_wat/vm_page_size;
		/* ... */
		kr = KERN_SUCCESS;
	} else {
		release = trigger_port;
		kr = KERN_INVALID_ARGUMENT;
	}
	/* ... */

	if (IP_VALID(release))
		ipc_port_release_send(release);

	return kr;
}
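
/*
 * Illustrative sketch (not built): default_pager_triggers() above keeps
 * its water marks in pages, so a caller-supplied byte threshold is
 * divided by vm_page_size.  For example, under an assumed 4K page size
 * a 16 MB high-water mark becomes 4096 pages in
 * minimum_pages_remaining.  The helper name is hypothetical.
 */
#if 0
static unsigned int
example_watermark_pages(unsigned int wat_bytes)
{
	/* with vm_page_size == 4096: 16*1024*1024 / 4096 == 4096 pages */
	return wat_bytes / vm_page_size;
}
#endif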
/*
 * Monitor the amount of available backing store vs. the amount of
 * required backing store, notify a listener (if present) when
 * backing store may safely be removed.
 *
 * We attempt to avoid the situation where backing store is
 * discarded en masse, as this can lead to thrashing as the
 * backing store is compacted.
 */

#define PF_INTERVAL	3	/* time between free level checks */
#define PF_LATENCY	10	/* number of intervals before release */

static int dp_pages_free_low_count = 0;

void
default_pager_backing_store_monitor(thread_call_param_t p1, thread_call_param_t p2)
{
	unsigned long long	average;
	ipc_port_t		trigger = IP_NULL;
	uint64_t		deadline;

	/*
	 * We determine whether it will be safe to release some
	 * backing store by watching the free page level.  If
	 * it remains below the maximum_pages_free threshold for
	 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
	 * then we deem it safe.
	 *
	 * Note that this establishes a maximum rate at which backing
	 * store will be released, as each notification (currently)
	 * only results in a single backing store object being
	 * released.
	 */
	if (dp_pages_free > maximum_pages_free) {
		dp_pages_free_low_count++;
	} else {
		dp_pages_free_low_count = 0;
	}

	/* decide whether to send notification */
	if (max_pages_trigger_port &&
	    (backing_store_release_trigger_disable == 0) &&
	    (dp_pages_free_low_count > PF_LATENCY)) {
		trigger = max_pages_trigger_port;
		max_pages_trigger_port = NULL;
	}
	/* ... */

	/* send notification */
	if (trigger != IP_NULL) {
		/* ... */
		if (backing_store_release_trigger_disable != 0) {
			assert_wait((event_t)
				    &backing_store_release_trigger_disable,
				    /* ... */);
			/* ... */
			thread_block(THREAD_CONTINUE_NULL);
		}
		/* ... */
		default_pager_space_alert(trigger, LO_WAT_ALERT);
		ipc_port_release_send(trigger);
		dp_pages_free_low_count = 0;
	}
	/* ... */
	clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
	thread_call_func_delayed(default_pager_backing_store_monitor, NULL, deadline);
}
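
/*
 * Illustrative sketch (not built): the release hysteresis used by
 * default_pager_backing_store_monitor() above.  The free level must
 * stay on the favorable side of the threshold for more than PF_LATENCY
 * consecutive PF_INTERVAL-second checks before a LO_WAT_ALERT is sent;
 * a single dip resets the count.  The helper models only that counter;
 * its name and parameters are hypothetical.
 */
#if 0
static boolean_t
example_should_notify(boolean_t level_ok, int *low_count)
{
	if (level_ok)
		(*low_count)++;			/* another favorable interval */
	else
		*low_count = 0;			/* reset on any dip */

	return (*low_count > PF_LATENCY);	/* stable long enough? */
}
#endif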