/*
 * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

/*
 * Paging File Management.
 */
#include <mach/memory_object_control.h>
#include <mach/memory_object_server.h>
#include "default_pager_internal.h"
#include <default_pager/default_pager_alerts.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_space.h>
#include <kern/queue.h>
#include <kern/counters.h>
#include <kern/sched_prim.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_map.h>
/*
 * ALLOC_STRIDE... the maximum number of bytes allocated from
 * a swap file before moving on to the next swap file... if
 * all swap files reside on a single disk, this value should
 * be very large (this is the default assumption)... if the
 * swap files are spread across multiple disks, then this value
 * should be small (128 * 1024)...
 *
 * This should be determined dynamically in the future.
 */

#define ALLOC_STRIDE	(1024 * 1024 * 1024)
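
/*
 * Illustrative sketch only (not part of the original source): the stride
 * check applied later in ps_select_segment amounts to converting
 * ALLOC_STRIDE from bytes into clusters and rotating to the next paging
 * segment once that many clusters have been handed out consecutively
 * from the current one.
 */
#if 0	/* example only */
static boolean_t
alloc_stride_exhausted(paging_segment_t ps, int consecutive_clusters)
{
	/* clusters per stride = stride bytes / (pages per cluster * bytes per page) */
	int clusters_per_stride = ALLOC_STRIDE >> (ps->ps_clshift + vm_page_shift);

	return ((consecutive_clusters + 1) >= clusters_per_stride);
}
#endif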
int physical_transfer_cluster_count = 0;

#define VM_SUPER_CLUSTER	0x40000
#define VM_SUPER_PAGES		64
/*
 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
 */
#define VSTRUCT_DEF_CLSHIFT	2
int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
int default_pager_clsize = 0;
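
/*
 * Worked example (illustration, not original code): with the default
 * VSTRUCT_DEF_CLSHIFT of 2, a cluster spans 1 << 2 == 4 pages, so with
 * 4 KB pages each cluster covers 16 KB of backing store.
 */
#if 0	/* example only */
#define PAGES_PER_CLUSTER(clshift)	(1 << (clshift))
#define BYTES_PER_CLUSTER(clshift)	(PAGES_PER_CLUSTER(clshift) * vm_page_size)
#endif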
unsigned int clustered_writes[VM_SUPER_PAGES+1];
unsigned int clustered_reads[VM_SUPER_PAGES+1];
/*
 * Globals used for asynchronous paging operations:
 *	vs_async_list:		head of list of to-be-completed I/O ops
 *	async_num_queued:	number of pages completed, but not yet
 *				processed by async thread.
 *	async_requests_out:	number of pages of requests not completed.
 */
struct vs_async	*vs_async_list;
int		async_num_queued;
int		async_requests_out;

#define VS_ASYNC_REUSE	1
struct vs_async	*vs_async_free_list;

mutex_t	default_pager_async_lock;	/* Protects globals above */

int	vs_alloc_async_failed = 0;	/* statistics */
int	vs_alloc_async_count = 0;	/* statistics */
struct vs_async *vs_alloc_async(void);		/* forward */
void	vs_free_async(struct vs_async *vsa);	/* forward */

#define VS_ALLOC_ASYNC()	vs_alloc_async()
#define VS_FREE_ASYNC(vsa)	vs_free_async(vsa)

#define VS_ASYNC_LOCK()		mutex_lock(&default_pager_async_lock)
#define VS_ASYNC_UNLOCK()	mutex_unlock(&default_pager_async_lock)
#define VS_ASYNC_LOCK_INIT()	mutex_init(&default_pager_async_lock, \
#define VS_ASYNC_LOCK_ADDR()	(&default_pager_async_lock)
/*
 * Paging Space Hysteresis triggers and the target notification port
 */

unsigned int	minimum_pages_remaining = 0;
unsigned int	maximum_pages_free = 0;
ipc_port_t	min_pages_trigger_port = NULL;
ipc_port_t	max_pages_trigger_port = NULL;

boolean_t	bs_low = FALSE;
int		backing_store_release_trigger_disable = 0;
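
/*
 * Illustrative sketch (mirrors the pattern used in the allocation paths
 * below, not a routine defined by this file): when free paging space
 * drops under the low-water mark, the trigger port is consumed and
 * notified exactly once.
 */
#if 0	/* example only */
static void
check_low_space_trigger(void)
{
	ipc_port_t trigger = IP_NULL;

	if (min_pages_trigger_port &&
	    (dp_pages_free < minimum_pages_remaining)) {
		trigger = min_pages_trigger_port;
		min_pages_trigger_port = NULL;
	}
	if (trigger != IP_NULL) {
		default_pager_space_alert(trigger, HI_WAT_ALERT);
		ipc_port_release_send(trigger);
	}
}
#endif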
/*
 * Object sizes are rounded up to the next power of 2,
 * unless they are bigger than a given maximum size.
 */
vm_size_t	max_doubled_size = 4 * 1024 * 1024;	/* 4 meg */

/*
 * List of all backing store and segments.
 */
struct backing_store_list_head backing_store_list;
paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
mutex_t		 paging_segments_lock;
int		 paging_segment_max = 0;
int		 paging_segment_count = 0;
int		 ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
/*
 * Total pages free in system.
 * This differs from clusters committed/avail which is a measure of the
 * over-commitment of paging segments to backing store.  An idea which is
 * likely to be deprecated.
 */
unsigned int	dp_pages_free = 0;
unsigned int	cluster_transfer_minimum = 100;
kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int);	/* forward */
kern_return_t ps_read_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int *, int);	/* forward */
default_pager_thread_t *
get_read_buffer(void)
{
	int	i;

	DPT_LOCK(dpt_lock);
	while (TRUE) {
		for (i = 0; i < default_pager_internal_count; i++) {
			if (dpt_array[i]->checked_out == FALSE) {
				dpt_array[i]->checked_out = TRUE;
				DPT_UNLOCK(dpt_lock);
				return dpt_array[i];
			}
		}
		DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
	}
}
	/*
	 * List of all backing store.
	 */
	queue_init(&backing_store_list.bsl_queue);

	VS_ASYNC_LOCK_INIT();
#if	VS_ASYNC_REUSE
	vs_async_free_list = NULL;
#endif	/* VS_ASYNC_REUSE */

	for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
		clustered_writes[i] = 0;
		clustered_reads[i] = 0;
	}
/*
 * When things do not quite work out...
 */
void bs_no_paging_space(boolean_t);	/* forward */

void
bs_no_paging_space(
	boolean_t	out_of_memory)
{
	if (out_of_memory)
		dprintf(("*** OUT OF MEMORY ***\n"));
	panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
}

void bs_more_space(int);	/* forward */
void bs_commit(int);		/* forward */
boolean_t	user_warned = FALSE;
unsigned int	clusters_committed = 0;
unsigned int	clusters_available = 0;
unsigned int	clusters_committed_peak = 0;
void
bs_more_space(
	int	nclusters)
{
	/*
	 * Account for new paging space.
	 */
	clusters_available += nclusters;

	if (clusters_available >= clusters_committed) {
		if (verbose && user_warned) {
			printf("%s%s - %d excess clusters now.\n",
			       my_name,
			       "paging space is OK now",
			       clusters_available - clusters_committed);
			user_warned = FALSE;
			clusters_committed_peak = 0;
		}
	} else {
		if (verbose && user_warned) {
			printf("%s%s - still short of %d clusters.\n",
			       my_name,
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
			clusters_committed_peak -= nclusters;
		}
	}
}
void
bs_commit(
	int	nclusters)
{
	clusters_committed += nclusters;
	if (clusters_committed > clusters_available) {
		if (verbose && !user_warned) {
			user_warned = TRUE;
			printf("%s%s - short of %d clusters.\n",
			       my_name,
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
		}
		if (clusters_committed > clusters_committed_peak) {
			clusters_committed_peak = clusters_committed;
		}
	} else {
		if (verbose && user_warned) {
			user_warned = FALSE;
			printf("%s%s - was short of up to %d clusters.\n",
			       my_name,
			       "paging space is OK now",
			       clusters_committed_peak - clusters_available);
		}
		clusters_committed_peak = 0;
	}
}
int	default_pager_info_verbose = 1;
316 vm_size_t pages_total
, pages_free
;
321 pages_total
= pages_free
= 0;
322 for (i
= 0; i
<= paging_segment_max
; i
++) {
323 ps
= paging_segments
[i
];
324 if (ps
== PAGING_SEGMENT_NULL
)
328 * no need to lock: by the time this data
329 * gets back to any remote requestor it
330 * will be obsolete anyways
332 pages_total
+= ps
->ps_pgnum
;
333 pages_free
+= ps
->ps_clcount
<< ps
->ps_clshift
;
334 DEBUG(DEBUG_BS_INTERNAL
,
335 ("segment #%d: %d total, %d free\n",
336 i
, ps
->ps_pgnum
, ps
->ps_clcount
<< ps
->ps_clshift
));
338 *totalp
= pages_total
;
340 if (verbose
&& user_warned
&& default_pager_info_verbose
) {
341 if (clusters_available
< clusters_committed
) {
342 printf("%s %d clusters committed, %d available.\n",
backing_store_t backing_store_alloc(void);	/* forward */

backing_store_t
backing_store_alloc(void)
{
	backing_store_t	bs;

	bs = (backing_store_t) kalloc(sizeof (struct backing_store));
	if (bs == BACKING_STORE_NULL)
		panic("backing_store_alloc: no memory");

	bs->bs_port = MACH_PORT_NULL;
	bs->bs_pages_total = 0;
	bs->bs_pages_in_fail = 0;
	bs->bs_pages_out = 0;
	bs->bs_pages_out_fail = 0;

	return bs;
}
backing_store_t backing_store_lookup(MACH_PORT_FACE);	/* forward */

/* Even in both the component space and external versions of this pager, */
/* backing_store_lookup will be called from tasks in the application space */
backing_store_t
backing_store_lookup(
	MACH_PORT_FACE	port)
{
	backing_store_t	bs;

/*
	port is currently backed with a vs structure in the alias field
	we could create an ISBS alias and a port_is_bs call but frankly
	I see no reason for the test, the bs->port == port check below
	will work properly on junk entries.

	if ((port == MACH_PORT_NULL) || port_is_vs(port))
*/
	if ((port == MACH_PORT_NULL))
		return BACKING_STORE_NULL;
397 queue_iterate(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
400 if (bs
->bs_port
== port
) {
402 /* Success, return it locked. */
408 return BACKING_STORE_NULL
;
411 void backing_store_add(backing_store_t
); /* forward */
417 MACH_PORT_FACE port
= bs
->bs_port
;
418 MACH_PORT_FACE pset
= default_pager_default_set
;
419 kern_return_t kr
= KERN_SUCCESS
;
421 if (kr
!= KERN_SUCCESS
)
422 panic("backing_store_add: add to set");
427 * Set up default page shift, but only if not already
428 * set and argument is within range.
431 bs_set_default_clsize(unsigned int npages
)
438 if (default_pager_clsize
== 0) /* if not yet set */
439 vstruct_def_clshift
= local_log2(npages
);
445 int bs_get_global_clsize(int clsize
); /* forward */
448 bs_get_global_clsize(
452 memory_object_default_t dmm
;
456 * Only allow setting of cluster size once. If called
457 * with no cluster size (default), we use the compiled-in default
458 * for the duration. The same cluster size is used for all
461 if (default_pager_clsize
== 0) {
463 * Keep cluster size in bit shift because it's quicker
464 * arithmetic, and easier to keep at a power of 2.
466 if (clsize
!= NO_CLSIZE
) {
467 for (i
= 0; (1 << i
) < clsize
; i
++);
468 if (i
> MAX_CLUSTER_SHIFT
)
469 i
= MAX_CLUSTER_SHIFT
;
470 vstruct_def_clshift
= i
;
472 default_pager_clsize
= (1 << vstruct_def_clshift
);
475 * Let the user know the new (and definitive) cluster size.
478 printf("%scluster size = %d page%s\n",
479 my_name
, default_pager_clsize
,
480 (default_pager_clsize
== 1) ? "" : "s");
483 * Let the kernel know too, in case it hasn't used the
484 * default value provided in main() yet.
486 dmm
= default_pager_object
;
487 clsize
= default_pager_clsize
* vm_page_size
; /* in bytes */
488 kr
= host_default_memory_manager(host_priv_self(),
491 memory_object_default_deallocate(dmm
);
493 if (kr
!= KERN_SUCCESS
) {
494 panic("bs_get_global_cl_size:host_default_memory_manager");
496 if (dmm
!= default_pager_object
) {
497 panic("bs_get_global_cl_size:there is another default pager");
500 ASSERT(default_pager_clsize
> 0 &&
501 (default_pager_clsize
& (default_pager_clsize
- 1)) == 0);
503 return default_pager_clsize
;
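
/*
 * Worked example (illustration only): the cluster-size loop in
 * bs_get_global_clsize above rounds a requested size up to a power of
 * two by finding the smallest shift i with (1 << i) >= clsize, capped
 * at MAX_CLUSTER_SHIFT.  E.g. clsize == 3 pages yields i == 2, i.e.
 * 4 pages per cluster.
 */
#if 0	/* example only */
static int
round_clsize_to_shift(int clsize_in_pages)
{
	int i;

	for (i = 0; (1 << i) < clsize_in_pages; i++)
		continue;
	if (i > MAX_CLUSTER_SHIFT)
		i = MAX_CLUSTER_SHIFT;
	return i;
}
#endif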
507 default_pager_backing_store_create(
508 memory_object_default_t pager
,
510 int clsize
, /* in bytes */
511 MACH_PORT_FACE
*backing_store
)
516 struct vstruct_alias
*alias_struct
;
518 if (pager
!= default_pager_object
)
519 return KERN_INVALID_ARGUMENT
;
521 bs
= backing_store_alloc();
522 port
= ipc_port_alloc_kernel();
523 ipc_port_make_send(port
);
524 assert (port
!= IP_NULL
);
526 DEBUG(DEBUG_BS_EXTERNAL
,
527 ("priority=%d clsize=%d bs_port=0x%x\n",
528 priority
, clsize
, (int) backing_store
));
530 alias_struct
= (struct vstruct_alias
*)
531 kalloc(sizeof (struct vstruct_alias
));
532 if(alias_struct
!= NULL
) {
533 alias_struct
->vs
= (struct vstruct
*)bs
;
534 alias_struct
->name
= ISVS
;
535 port
->alias
= (int) alias_struct
;
538 ipc_port_dealloc_kernel((MACH_PORT_FACE
)(port
));
539 kfree((vm_offset_t
)bs
, sizeof (struct backing_store
));
540 return KERN_RESOURCE_SHORTAGE
;
544 if (priority
== DEFAULT_PAGER_BACKING_STORE_MAXPRI
)
545 priority
= BS_MAXPRI
;
546 else if (priority
== BS_NOPRI
)
547 priority
= BS_MAXPRI
;
549 priority
= BS_MINPRI
;
550 bs
->bs_priority
= priority
;
552 bs
->bs_clsize
= bs_get_global_clsize(atop_32(clsize
));
555 queue_enter(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
559 backing_store_add(bs
);
561 *backing_store
= port
;
566 default_pager_backing_store_info(
567 MACH_PORT_FACE backing_store
,
568 backing_store_flavor_t flavour
,
569 backing_store_info_t info
,
570 mach_msg_type_number_t
*size
)
573 backing_store_basic_info_t basic
;
577 if (flavour
!= BACKING_STORE_BASIC_INFO
||
578 *size
< BACKING_STORE_BASIC_INFO_COUNT
)
579 return KERN_INVALID_ARGUMENT
;
581 basic
= (backing_store_basic_info_t
)info
;
582 *size
= BACKING_STORE_BASIC_INFO_COUNT
;
584 VSTATS_LOCK(&global_stats
.gs_lock
);
585 basic
->pageout_calls
= global_stats
.gs_pageout_calls
;
586 basic
->pagein_calls
= global_stats
.gs_pagein_calls
;
587 basic
->pages_in
= global_stats
.gs_pages_in
;
588 basic
->pages_out
= global_stats
.gs_pages_out
;
589 basic
->pages_unavail
= global_stats
.gs_pages_unavail
;
590 basic
->pages_init
= global_stats
.gs_pages_init
;
591 basic
->pages_init_writes
= global_stats
.gs_pages_init_writes
;
592 VSTATS_UNLOCK(&global_stats
.gs_lock
);
594 if ((bs
= backing_store_lookup(backing_store
)) == BACKING_STORE_NULL
)
595 return KERN_INVALID_ARGUMENT
;
597 basic
->bs_pages_total
= bs
->bs_pages_total
;
599 bs
->bs_pages_free
= 0;
600 for (i
= 0; i
<= paging_segment_max
; i
++) {
601 ps
= paging_segments
[i
];
602 if (ps
!= PAGING_SEGMENT_NULL
&& ps
->ps_bs
== bs
) {
604 bs
->bs_pages_free
+= ps
->ps_clcount
<< ps
->ps_clshift
;
609 basic
->bs_pages_free
= bs
->bs_pages_free
;
610 basic
->bs_pages_in
= bs
->bs_pages_in
;
611 basic
->bs_pages_in_fail
= bs
->bs_pages_in_fail
;
612 basic
->bs_pages_out
= bs
->bs_pages_out
;
613 basic
->bs_pages_out_fail
= bs
->bs_pages_out_fail
;
615 basic
->bs_priority
= bs
->bs_priority
;
616 basic
->bs_clsize
= ptoa_32(bs
->bs_clsize
); /* in bytes */
623 int ps_delete(paging_segment_t
); /* forward */
630 kern_return_t error
= KERN_SUCCESS
;
633 VSL_LOCK(); /* get the lock on the list of vs's */
	/* The lock relationship and sequence is fairly complicated */
636 /* this code looks at a live list, locking and unlocking the list */
637 /* as it traverses it. It depends on the locking behavior of */
638 /* default_pager_no_senders. no_senders always locks the vstruct */
639 /* targeted for removal before locking the vstruct list. However */
640 /* it will remove that member of the list without locking its */
641 /* neighbors. We can be sure when we hold a lock on a vstruct */
642 /* it cannot be removed from the list but we must hold the list */
643 /* lock to be sure that its pointers to its neighbors are valid. */
644 /* Also, we can hold off destruction of a vstruct when the list */
645 /* lock and the vs locks are not being held by bumping the */
646 /* vs_async_pending count. */
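
	/*
	 * Sketch of the pinning idiom described in the comments above
	 * (illustration only, not original code): bumping vs_async_pending
	 * keeps a vstruct from being destroyed while neither the vstruct
	 * lock nor the list lock is held; the matching release wakes any
	 * waiter once the count drains to zero.  The real code performs
	 * these steps inline, under the appropriate vs lock.
	 */
#if 0	/* example only */
static void
vs_pin(vstruct_t vs)
{
	vs->vs_async_pending += 1;	/* hold parties calling vs_async_wait */
}

static void
vs_unpin(vstruct_t vs)
{
	vs->vs_async_pending -= 1;	/* release vs_async_wait */
	if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
		vs->vs_waiting_async = FALSE;
		thread_wakeup(&vs->vs_async_pending);
	}
}
#endif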
649 while(backing_store_release_trigger_disable
!= 0) {
650 VSL_SLEEP(&backing_store_release_trigger_disable
, THREAD_UNINT
);
653 /* we will choose instead to hold a send right */
654 vs_count
= vstruct_list
.vsl_count
;
655 vs
= (vstruct_t
) queue_first((queue_entry_t
)&(vstruct_list
.vsl_queue
));
656 if(vs
== (vstruct_t
)&vstruct_list
) {
661 vs_async_wait(vs
); /* wait for any pending async writes */
662 if ((vs_count
!= 0) && (vs
!= NULL
))
663 vs
->vs_async_pending
+= 1; /* hold parties calling */
667 while((vs_count
!= 0) && (vs
!= NULL
)) {
668 /* We take the count of AMO's before beginning the */
	/* transfer of the target segment. */
670 /* We are guaranteed that the target segment cannot get */
671 /* more users. We also know that queue entries are */
672 /* made at the back of the list. If some of the entries */
673 /* we would check disappear while we are traversing the */
674 /* list then we will either check new entries which */
675 /* do not have any backing store in the target segment */
676 /* or re-check old entries. This might not be optimal */
677 /* but it will always be correct. The alternative is to */
678 /* take a snapshot of the list. */
681 if(dp_pages_free
< cluster_transfer_minimum
)
682 error
= KERN_FAILURE
;
684 vm_object_t transfer_object
;
688 transfer_object
= vm_object_allocate(VM_SUPER_CLUSTER
);
690 error
= vm_object_upl_request(transfer_object
,
691 (vm_object_offset_t
)0, VM_SUPER_CLUSTER
,
693 UPL_NO_SYNC
| UPL_CLEAN_IN_PLACE
695 if(error
== KERN_SUCCESS
) {
696 error
= ps_vstruct_transfer_from_segment(
698 upl_commit(upl
, NULL
);
701 error
= KERN_FAILURE
;
703 vm_object_deallocate(transfer_object
);
707 vs
->vs_async_pending
-= 1; /* release vs_async_wait */
708 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
709 vs
->vs_waiting_async
= FALSE
;
711 thread_wakeup(&vs
->vs_async_pending
);
720 while(backing_store_release_trigger_disable
!= 0) {
721 VSL_SLEEP(&backing_store_release_trigger_disable
,
725 next_vs
= (vstruct_t
) queue_next(&(vs
->vs_links
));
726 if((next_vs
!= (vstruct_t
)&vstruct_list
) &&
727 (vs
!= next_vs
) && (vs_count
!= 1)) {
729 vs_async_wait(next_vs
); /* wait for any */
730 /* pending async writes */
731 next_vs
->vs_async_pending
+= 1; /* hold parties */
732 /* calling vs_async_wait */
737 vs
->vs_async_pending
-= 1;
738 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
739 vs
->vs_waiting_async
= FALSE
;
741 thread_wakeup(&vs
->vs_async_pending
);
745 if((vs
== next_vs
) || (next_vs
== (vstruct_t
)&vstruct_list
))
756 default_pager_backing_store_delete(
757 MACH_PORT_FACE backing_store
)
763 int interim_pages_removed
= 0;
766 if ((bs
= backing_store_lookup(backing_store
)) == BACKING_STORE_NULL
)
767 return KERN_INVALID_ARGUMENT
;
770 /* not implemented */
777 error
= KERN_SUCCESS
;
778 for (i
= 0; i
<= paging_segment_max
; i
++) {
779 ps
= paging_segments
[i
];
780 if (ps
!= PAGING_SEGMENT_NULL
&&
782 ! ps
->ps_going_away
) {
784 /* disable access to this segment */
785 ps
->ps_going_away
= TRUE
;
788 * The "ps" segment is "off-line" now,
789 * we can try and delete it...
791 if(dp_pages_free
< (cluster_transfer_minimum
793 error
= KERN_FAILURE
;
797 /* remove all pages associated with the */
798 /* segment from the list of free pages */
799 /* when transfer is through, all target */
800 /* segment pages will appear to be free */
802 dp_pages_free
-= ps
->ps_pgcount
;
803 interim_pages_removed
+= ps
->ps_pgcount
;
805 error
= ps_delete(ps
);
807 if (error
!= KERN_SUCCESS
) {
809 * We couldn't delete the segment,
810 * probably because there's not enough
811 * virtual memory left.
812 * Re-enable all the segments.
821 if (error
!= KERN_SUCCESS
) {
822 for (i
= 0; i
<= paging_segment_max
; i
++) {
823 ps
= paging_segments
[i
];
824 if (ps
!= PAGING_SEGMENT_NULL
&&
828 /* re-enable access to this segment */
829 ps
->ps_going_away
= FALSE
;
833 dp_pages_free
+= interim_pages_removed
;
839 for (i
= 0; i
<= paging_segment_max
; i
++) {
840 ps
= paging_segments
[i
];
841 if (ps
!= PAGING_SEGMENT_NULL
&&
843 if(ps
->ps_going_away
) {
844 paging_segments
[i
] = PAGING_SEGMENT_NULL
;
845 paging_segment_count
--;
847 kfree((vm_offset_t
)ps
->ps_bmap
,
848 RMAPSIZE(ps
->ps_ncls
));
849 kfree((vm_offset_t
)ps
, sizeof *ps
);
854 /* Scan the entire ps array separately to make certain we find the */
855 /* proper paging_segment_max */
856 for (i
= 0; i
< MAX_NUM_PAGING_SEGMENTS
; i
++) {
857 if(paging_segments
[i
] != PAGING_SEGMENT_NULL
)
858 paging_segment_max
= i
;
864 * All the segments have been deleted.
865 * We can remove the backing store.
869 * Disable lookups of this backing store.
871 if((void *)bs
->bs_port
->alias
!= NULL
)
872 kfree((vm_offset_t
) bs
->bs_port
->alias
,
873 sizeof (struct vstruct_alias
));
874 ipc_port_dealloc_kernel((ipc_port_t
) (bs
->bs_port
));
875 bs
->bs_port
= MACH_PORT_NULL
;
879 * Remove backing store from backing_store list.
882 queue_remove(&backing_store_list
.bsl_queue
, bs
, backing_store_t
,
887 * Free the backing store structure.
889 kfree((vm_offset_t
)bs
, sizeof *bs
);
894 int ps_enter(paging_segment_t
); /* forward */
904 for (i
= 0; i
< MAX_NUM_PAGING_SEGMENTS
; i
++) {
905 if (paging_segments
[i
] == PAGING_SEGMENT_NULL
)
909 if (i
< MAX_NUM_PAGING_SEGMENTS
) {
910 paging_segments
[i
] = ps
;
911 if (i
> paging_segment_max
)
912 paging_segment_max
= i
;
913 paging_segment_count
++;
914 if ((ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_NOPRI
) ||
915 (ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_FULLPRI
))
916 ps_select_array
[ps
->ps_bs
->bs_priority
] = 0;
920 return KERN_RESOURCE_SHORTAGE
;
929 default_pager_add_segment(
930 MACH_PORT_FACE backing_store
,
931 MACH_PORT_FACE device
,
941 if ((bs
= backing_store_lookup(backing_store
))
942 == BACKING_STORE_NULL
)
943 return KERN_INVALID_ARGUMENT
;
946 for (i
= 0; i
<= paging_segment_max
; i
++) {
947 ps
= paging_segments
[i
];
948 if (ps
== PAGING_SEGMENT_NULL
)
952 * Check for overlap on same device.
954 if (!(ps
->ps_device
!= device
955 || offset
>= ps
->ps_offset
+ ps
->ps_recnum
956 || offset
+ count
<= ps
->ps_offset
)) {
959 return KERN_INVALID_ARGUMENT
;
965 * Set up the paging segment
967 ps
= (paging_segment_t
) kalloc(sizeof (struct paging_segment
));
968 if (ps
== PAGING_SEGMENT_NULL
) {
970 return KERN_RESOURCE_SHORTAGE
;
973 ps
->ps_segtype
= PS_PARTITION
;
974 ps
->ps_device
= device
;
975 ps
->ps_offset
= offset
;
976 ps
->ps_record_shift
= local_log2(vm_page_size
/ record_size
);
977 ps
->ps_recnum
= count
;
978 ps
->ps_pgnum
= count
>> ps
->ps_record_shift
;
980 ps
->ps_pgcount
= ps
->ps_pgnum
;
981 ps
->ps_clshift
= local_log2(bs
->bs_clsize
);
982 ps
->ps_clcount
= ps
->ps_ncls
= ps
->ps_pgcount
>> ps
->ps_clshift
;
986 ps
->ps_bmap
= (unsigned char *) kalloc(RMAPSIZE(ps
->ps_ncls
));
988 kfree((vm_offset_t
)ps
, sizeof *ps
);
990 return KERN_RESOURCE_SHORTAGE
;
992 for (i
= 0; i
< ps
->ps_ncls
; i
++) {
993 clrbit(ps
->ps_bmap
, i
);
996 ps
->ps_going_away
= FALSE
;
999 if ((error
= ps_enter(ps
)) != 0) {
1000 kfree((vm_offset_t
)ps
->ps_bmap
, RMAPSIZE(ps
->ps_ncls
));
1001 kfree((vm_offset_t
)ps
, sizeof *ps
);
1003 return KERN_RESOURCE_SHORTAGE
;
1006 bs
->bs_pages_free
+= ps
->ps_clcount
<< ps
->ps_clshift
;
1007 bs
->bs_pages_total
+= ps
->ps_clcount
<< ps
->ps_clshift
;
1011 dp_pages_free
+= ps
->ps_pgcount
;
1014 bs_more_space(ps
->ps_clcount
);
1016 DEBUG(DEBUG_BS_INTERNAL
,
1017 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1018 device
, offset
, count
, record_size
,
1019 ps
->ps_record_shift
, ps
->ps_pgnum
));
1021 return KERN_SUCCESS
;
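
/*
 * Worked example (numbers are assumptions for illustration): for a
 * device with 512-byte records, 4 KB pages and a cluster shift of 2,
 * the segment sizing arithmetic used above comes out as follows.
 */
#if 0	/* example only */
static void
segment_sizing_example(void)
{
	unsigned int record_size  = 512;	/* bytes per device record */
	unsigned int record_count = 262144;	/* records on the device (128 MB) */
	int clshift = 2;			/* 4 pages per cluster */

	int record_shift      = local_log2(vm_page_size / record_size);  /* log2(4096/512) == 3 */
	unsigned int pages    = record_count >> record_shift;		  /* 32768 pages */
	unsigned int clusters = pages >> clshift;			  /* 8192 clusters */

	(void) clusters;
}
#endif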
1027 MACH_PORT_FACE master
)
1029 security_token_t null_security_token
= {
1032 MACH_PORT_FACE device
;
1033 int info
[DEV_GET_SIZE_COUNT
];
1034 mach_msg_type_number_t info_count
;
1035 MACH_PORT_FACE bs
= MACH_PORT_NULL
;
1036 unsigned int rec_size
;
1039 MACH_PORT_FACE reply_port
;
1041 if (ds_device_open_sync(master
, MACH_PORT_NULL
, D_READ
| D_WRITE
,
1042 null_security_token
, dev_name
, &device
))
1045 info_count
= DEV_GET_SIZE_COUNT
;
1046 if (!ds_device_get_status(device
, DEV_GET_SIZE
, info
, &info_count
)) {
1047 rec_size
= info
[DEV_GET_SIZE_RECORD_SIZE
];
1048 count
= info
[DEV_GET_SIZE_DEVICE_SIZE
] / rec_size
;
1049 clsize
= bs_get_global_clsize(0);
1050 if (!default_pager_backing_store_create(
1051 default_pager_object
,
1052 DEFAULT_PAGER_BACKING_STORE_MAXPRI
,
1053 (clsize
* vm_page_size
),
1055 if (!default_pager_add_segment(bs
, device
,
1056 0, count
, rec_size
)) {
1059 ipc_port_release_receive(bs
);
1063 ipc_port_release_send(device
);
1066 #endif /* DEVICE_PAGING */
1071 vs_alloc_async(void)
1073 struct vs_async
*vsa
;
1074 MACH_PORT_FACE reply_port
;
1078 if (vs_async_free_list
== NULL
) {
1080 vsa
= (struct vs_async
*) kalloc(sizeof (struct vs_async
));
1083 * Try allocating a reply port named after the
1084 * address of the vs_async structure.
1086 struct vstruct_alias
*alias_struct
;
1088 reply_port
= ipc_port_alloc_kernel();
1089 alias_struct
= (struct vstruct_alias
*)
1090 kalloc(sizeof (struct vstruct_alias
));
1091 if(alias_struct
!= NULL
) {
1092 alias_struct
->vs
= (struct vstruct
*)vsa
;
1093 alias_struct
->name
= ISVS
;
1094 reply_port
->alias
= (int) alias_struct
;
1095 vsa
->reply_port
= reply_port
;
1096 vs_alloc_async_count
++;
1099 vs_alloc_async_failed
++;
1100 ipc_port_dealloc_kernel((MACH_PORT_FACE
)
1102 kfree((vm_offset_t
)vsa
,
1103 sizeof (struct vs_async
));
1108 vsa
= vs_async_free_list
;
1109 vs_async_free_list
= vs_async_free_list
->vsa_next
;
1118 struct vs_async
*vsa
)
1121 vsa
->vsa_next
= vs_async_free_list
;
1122 vs_async_free_list
= vsa
;
1126 #else /* VS_ASYNC_REUSE */
1129 vs_alloc_async(void)
1131 struct vs_async
*vsa
;
1132 MACH_PORT_FACE reply_port
;
1135 vsa
= (struct vs_async
*) kalloc(sizeof (struct vs_async
));
1138 * Try allocating a reply port named after the
1139 * address of the vs_async structure.
1141 reply_port
= ipc_port_alloc_kernel();
1142 alias_struct
= (vstruct_alias
*)
1143 kalloc(sizeof (struct vstruct_alias
));
1144 if(alias_struct
!= NULL
) {
1145 alias_struct
->vs
= reply_port
;
1146 alias_struct
->name
= ISVS
;
1147 reply_port
->alias
= (int) vsa
;
1148 vsa
->reply_port
= reply_port
;
1149 vs_alloc_async_count
++;
1152 vs_alloc_async_failed
++;
1153 ipc_port_dealloc_kernel((MACH_PORT_FACE
)
1155 kfree((vm_offset_t
) vsa
,
1156 sizeof (struct vs_async
));
1166 struct vs_async
*vsa
)
1168 MACH_PORT_FACE reply_port
;
1171 reply_port
= vsa
->reply_port
;
1172 kfree((vm_offset_t
) reply_port
->alias
, sizeof (struct vstuct_alias
));
1173 kfree((vm_offset_t
) vsa
, sizeof (struct vs_async
));
1174 ipc_port_dealloc_kernel((MACH_PORT_FACE
) (reply_port
));
1177 vs_alloc_async_count
--;
1182 #endif /* VS_ASYNC_REUSE */
1184 zone_t vstruct_zone
;
1193 vs
= (vstruct_t
) zalloc(vstruct_zone
);
1194 if (vs
== VSTRUCT_NULL
) {
1195 return VSTRUCT_NULL
;
1201 * The following fields will be provided later.
1203 vs
->vs_mem_obj
= NULL
;
1204 vs
->vs_control
= MEMORY_OBJECT_CONTROL_NULL
;
1205 vs
->vs_references
= 1;
1209 vs
->vs_waiting_seqno
= FALSE
;
1210 vs
->vs_waiting_read
= FALSE
;
1211 vs
->vs_waiting_write
= FALSE
;
1212 vs
->vs_waiting_async
= FALSE
;
1214 mutex_init(&vs
->vs_waiting_seqno
, ETAP_DPAGE_VSSEQNO
);
1215 mutex_init(&vs
->vs_waiting_read
, ETAP_DPAGE_VSREAD
);
1216 mutex_init(&vs
->vs_waiting_write
, ETAP_DPAGE_VSWRITE
);
1217 mutex_init(&vs
->vs_waiting_refs
, ETAP_DPAGE_VSREFS
);
1218 mutex_init(&vs
->vs_waiting_async
, ETAP_DPAGE_VSASYNC
);
1226 vs
->vs_clshift
= local_log2(bs_get_global_clsize(0));
1227 vs
->vs_size
= ((atop_32(round_page_32(size
)) - 1) >> vs
->vs_clshift
) + 1;
1228 vs
->vs_async_pending
= 0;
1231 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1232 * depending on the size of the memory object.
1234 if (INDIRECT_CLMAP(vs
->vs_size
)) {
1235 vs
->vs_imap
= (struct vs_map
**)
1236 kalloc(INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1237 vs
->vs_indirect
= TRUE
;
1239 vs
->vs_dmap
= (struct vs_map
*)
1240 kalloc(CLMAP_SIZE(vs
->vs_size
));
1241 vs
->vs_indirect
= FALSE
;
1243 vs
->vs_xfer_pending
= FALSE
;
1244 DEBUG(DEBUG_VS_INTERNAL
,
1245 ("map=0x%x, indirect=%d\n", (int) vs
->vs_dmap
, vs
->vs_indirect
));
1248 * Check to see that we got the space.
1251 kfree((vm_offset_t
)vs
, sizeof *vs
);
1252 return VSTRUCT_NULL
;
1256 * Zero the indirect pointers, or clear the direct pointers.
1258 if (vs
->vs_indirect
)
1259 memset(vs
->vs_imap
, 0,
1260 INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1262 for (i
= 0; i
< vs
->vs_size
; i
++)
1263 VSM_CLR(vs
->vs_dmap
[i
]);
1265 VS_MAP_LOCK_INIT(vs
);
1267 bs_commit(vs
->vs_size
);
1272 paging_segment_t
ps_select_segment(int, int *); /* forward */
1279 paging_segment_t ps
;
1284 * Optimize case where there's only one segment.
1285 * paging_segment_max will index the one and only segment.
1289 if (paging_segment_count
== 1) {
1290 paging_segment_t lps
; /* used to avoid extra PS_UNLOCK */
1291 ipc_port_t trigger
= IP_NULL
;
1293 ps
= paging_segments
[paging_segment_max
];
1294 *psindex
= paging_segment_max
;
1296 if (ps
->ps_going_away
) {
1297 /* this segment is being turned off */
1298 lps
= PAGING_SEGMENT_NULL
;
1300 ASSERT(ps
->ps_clshift
>= shift
);
1301 if (ps
->ps_clcount
) {
1303 dp_pages_free
-= 1 << ps
->ps_clshift
;
1304 if(min_pages_trigger_port
&&
1305 (dp_pages_free
< minimum_pages_remaining
)) {
1306 trigger
= min_pages_trigger_port
;
1307 min_pages_trigger_port
= NULL
;
1312 lps
= PAGING_SEGMENT_NULL
;
1317 if (trigger
!= IP_NULL
) {
1318 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1319 ipc_port_release_send(trigger
);
1324 if (paging_segment_count
== 0) {
1326 return PAGING_SEGMENT_NULL
;
1330 i
>= BS_MINPRI
; i
--) {
1333 if ((ps_select_array
[i
] == BS_NOPRI
) ||
1334 (ps_select_array
[i
] == BS_FULLPRI
))
1336 start_index
= ps_select_array
[i
];
1338 if(!(paging_segments
[start_index
])) {
1340 physical_transfer_cluster_count
= 0;
1342 else if ((physical_transfer_cluster_count
+1) == (ALLOC_STRIDE
>>
1343 (((paging_segments
[start_index
])->ps_clshift
)
1344 + vm_page_shift
))) {
1345 physical_transfer_cluster_count
= 0;
1346 j
= start_index
+ 1;
1348 physical_transfer_cluster_count
+=1;
1350 if(start_index
== 0)
1351 start_index
= paging_segment_max
;
1353 start_index
= start_index
- 1;
1357 if (j
> paging_segment_max
)
1359 if ((ps
= paging_segments
[j
]) &&
1360 (ps
->ps_bs
->bs_priority
== i
)) {
1362 * Force the ps cluster size to be
1363 * >= that of the vstruct.
1366 if (ps
->ps_going_away
) {
1367 /* this segment is being turned off */
1368 } else if ((ps
->ps_clcount
) &&
1369 (ps
->ps_clshift
>= shift
)) {
1370 ipc_port_t trigger
= IP_NULL
;
1373 dp_pages_free
-= 1 << ps
->ps_clshift
;
1374 if(min_pages_trigger_port
&&
1376 minimum_pages_remaining
)) {
1377 trigger
= min_pages_trigger_port
;
1378 min_pages_trigger_port
= NULL
;
1382 * found one, quit looking.
1384 ps_select_array
[i
] = j
;
1387 if (trigger
!= IP_NULL
) {
1388 default_pager_space_alert(
1391 ipc_port_release_send(trigger
);
1398 if (j
== start_index
) {
1400 * none at this priority -- mark it full
1402 ps_select_array
[i
] = BS_FULLPRI
;
1409 return PAGING_SEGMENT_NULL
;
1412 vm_offset_t
ps_allocate_cluster(vstruct_t
, int *, paging_segment_t
); /*forward*/
1415 ps_allocate_cluster(
1418 paging_segment_t use_ps
)
1422 paging_segment_t ps
;
1423 vm_offset_t cluster
;
1424 ipc_port_t trigger
= IP_NULL
;
1427 * Find best paging segment.
1428 * ps_select_segment will decrement cluster count on ps.
1429 * Must pass cluster shift to find the most appropriate segment.
1431 /* NOTE: The addition of paging segment delete capability threatened
1432 * to seriously complicate the treatment of paging segments in this
1433 * module and the ones that call it (notably ps_clmap), because of the
1434 * difficulty in assuring that the paging segment would continue to
1435 * exist between being unlocked and locked. This was
1436 * avoided because all calls to this module are based in either
1437 * dp_memory_object calls which rely on the vs lock, or by
1438 * the transfer function which is part of the segment delete path.
1439 * The transfer function which is part of paging segment delete is
1440 * protected from multiple callers by the backing store lock.
1441 * The paging segment delete function treats mappings to a paging
1442 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1443 * while data is transferred to the remaining segments. This is in
1444 * line with the view that incomplete or in-transition mappings between
1445 * data, a vstruct, and backing store are protected by the vs lock.
1446 * This and the ordering of the paging segment "going_away" bit setting
1449 if (use_ps
!= PAGING_SEGMENT_NULL
) {
1454 ASSERT(ps
->ps_clcount
!= 0);
1457 dp_pages_free
-= 1 << ps
->ps_clshift
;
1458 if(min_pages_trigger_port
&&
1459 (dp_pages_free
< minimum_pages_remaining
)) {
1460 trigger
= min_pages_trigger_port
;
1461 min_pages_trigger_port
= NULL
;
1465 if (trigger
!= IP_NULL
) {
1466 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1467 ipc_port_release_send(trigger
);
1470 } else if ((ps
= ps_select_segment(vs
->vs_clshift
, psindex
)) ==
1471 PAGING_SEGMENT_NULL
) {
1473 bs_no_paging_space(TRUE
);
1478 dprintf(("no space in available paging segments; "
1479 "swapon suggested\n"));
1480 /* the count got off maybe, reset to zero */
1483 if(min_pages_trigger_port
) {
1484 trigger
= min_pages_trigger_port
;
1485 min_pages_trigger_port
= NULL
;
1489 if (trigger
!= IP_NULL
) {
1490 default_pager_space_alert(trigger
, HI_WAT_ALERT
);
1491 ipc_port_release_send(trigger
);
1493 return (vm_offset_t
) -1;
1497 * Look for an available cluster. At the end of the loop,
1498 * byte_num is the byte offset and bit_num is the bit offset of the
1499 * first zero bit in the paging segment bitmap.
1502 byte_num
= ps
->ps_hint
;
1503 for (; byte_num
< howmany(ps
->ps_ncls
, NBBY
); byte_num
++) {
1504 if (*(ps
->ps_bmap
+ byte_num
) != BYTEMASK
) {
1505 for (bit_num
= 0; bit_num
< NBBY
; bit_num
++) {
1506 if (isclr((ps
->ps_bmap
+ byte_num
), bit_num
))
1509 ASSERT(bit_num
!= NBBY
);
1513 ps
->ps_hint
= byte_num
;
1514 cluster
= (byte_num
*NBBY
) + bit_num
;
1516 /* Space was reserved, so this must be true */
1517 ASSERT(cluster
< ps
->ps_ncls
);
1519 setbit(ps
->ps_bmap
, cluster
);
1525 void ps_deallocate_cluster(paging_segment_t
, vm_offset_t
); /* forward */
1528 ps_deallocate_cluster(
1529 paging_segment_t ps
,
1530 vm_offset_t cluster
)
1533 if (cluster
>= (vm_offset_t
) ps
->ps_ncls
)
1534 panic("ps_deallocate_cluster: Invalid cluster number");
1537 * Lock the paging segment, clear the cluster's bitmap and increment the
1538 * number of free cluster.
1542 clrbit(ps
->ps_bmap
, cluster
);
1544 dp_pages_free
+= 1 << ps
->ps_clshift
;
1548 * Move the hint down to the freed cluster if it is
1549 * less than the current hint.
1551 if ((cluster
/NBBY
) < ps
->ps_hint
) {
1552 ps
->ps_hint
= (cluster
/NBBY
);
1558 * If we're freeing space on a full priority, reset the array.
1561 if (ps_select_array
[ps
->ps_bs
->bs_priority
] == BS_FULLPRI
)
1562 ps_select_array
[ps
->ps_bs
->bs_priority
] = 0;
1568 void ps_dealloc_vsmap(struct vs_map
*, vm_size_t
); /* forward */
1572 struct vs_map
*vsmap
,
1576 for (i
= 0; i
< size
; i
++)
1577 if (!VSM_ISCLR(vsmap
[i
]) && !VSM_ISERR(vsmap
[i
]))
1578 ps_deallocate_cluster(VSM_PS(vsmap
[i
]),
1579 VSM_CLOFF(vsmap
[i
]));
1592 * If this is an indirect structure, then we walk through the valid
1593 * (non-zero) indirect pointers and deallocate the clusters
1594 * associated with each used map entry (via ps_dealloc_vsmap).
1595 * When all of the clusters in an indirect block have been
1596 * freed, we deallocate the block. When all of the indirect
1597 * blocks have been deallocated we deallocate the memory
1598 * holding the indirect pointers.
1600 if (vs
->vs_indirect
) {
1601 for (i
= 0; i
< INDIRECT_CLMAP_ENTRIES(vs
->vs_size
); i
++) {
1602 if (vs
->vs_imap
[i
] != NULL
) {
1603 ps_dealloc_vsmap(vs
->vs_imap
[i
], CLMAP_ENTRIES
);
1604 kfree((vm_offset_t
)vs
->vs_imap
[i
],
1608 kfree((vm_offset_t
)vs
->vs_imap
,
1609 INDIRECT_CLMAP_SIZE(vs
->vs_size
));
1612 * Direct map. Free used clusters, then memory.
1614 ps_dealloc_vsmap(vs
->vs_dmap
, vs
->vs_size
);
1615 kfree((vm_offset_t
)vs
->vs_dmap
, CLMAP_SIZE(vs
->vs_size
));
1619 bs_commit(- vs
->vs_size
);
1621 zfree(vstruct_zone
, (vm_offset_t
)vs
);
1624 int ps_map_extend(vstruct_t
, int); /* forward */
1630 struct vs_map
**new_imap
;
1631 struct vs_map
*new_dmap
= NULL
;
1634 void *old_map
= NULL
;
1635 int old_map_size
= 0;
1637 if (vs
->vs_size
>= new_size
) {
1639 * Someone has already done the work.
	/*
	 * If the new size extends into the indirect range, then we have one
	 * of two cases: we are going from indirect to indirect, or we are
	 * going from direct to indirect.  If we are going from indirect to
	 * indirect, then it is possible that the new size will fit in the old
	 * indirect map.  If this is the case, then just reset the size of the
	 * vstruct map and we are done.  If the new size will not
	 * fit into the old indirect map, then we have to allocate a new
	 * indirect map and copy the old map pointers into this new map.
	 *
	 * If we are going from direct to indirect, then we have to allocate a
	 * new indirect map and copy the old direct pages into the first
	 * indirect page of the new map.
	 * NOTE: allocating memory here is dangerous, as we're in the
	 * pageout path.
	 */
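
	/*
	 * Sketch of the two-level lookup that the indirect map enables
	 * (illustration only, not original code): a cluster index selects
	 * an indirect block, then a slot within that block; a direct map
	 * is just a single block.
	 */
#if 0	/* example only */
static struct vs_map *
clmap_slot(vstruct_t vs, vm_offset_t cluster)
{
	struct vs_map	*vsmap;

	if (vs->vs_indirect)
		vsmap = vs->vs_imap[cluster / CLMAP_ENTRIES];	/* indirect block */
	else
		vsmap = vs->vs_dmap;				/* single direct block */
	return (vsmap == NULL) ? NULL : vsmap + (cluster % CLMAP_ENTRIES);
}
#endif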
1660 if (INDIRECT_CLMAP(new_size
)) {
1661 int new_map_size
= INDIRECT_CLMAP_SIZE(new_size
);
1664 * Get a new indirect map and zero it.
1666 old_map_size
= INDIRECT_CLMAP_SIZE(vs
->vs_size
);
1667 if (vs
->vs_indirect
&&
1668 (new_map_size
== old_map_size
)) {
1669 bs_commit(new_size
- vs
->vs_size
);
1670 vs
->vs_size
= new_size
;
1674 new_imap
= (struct vs_map
**)kalloc(new_map_size
);
1675 if (new_imap
== NULL
) {
1678 memset(new_imap
, 0, new_map_size
);
1680 if (vs
->vs_indirect
) {
1681 /* Copy old entries into new map */
1682 memcpy(new_imap
, vs
->vs_imap
, old_map_size
);
1683 /* Arrange to free the old map */
1684 old_map
= (void *) vs
->vs_imap
;
1686 } else { /* Old map was a direct map */
1687 /* Allocate an indirect page */
1688 if ((new_imap
[0] = (struct vs_map
*)
1689 kalloc(CLMAP_THRESHOLD
)) == NULL
) {
1690 kfree((vm_offset_t
)new_imap
, new_map_size
);
1693 new_dmap
= new_imap
[0];
1694 newdsize
= CLMAP_ENTRIES
;
1698 newdsize
= new_size
;
1700 * If the new map is a direct map, then the old map must
1701 * also have been a direct map. All we have to do is
1702 * to allocate a new direct map, copy the old entries
1703 * into it and free the old map.
1705 if ((new_dmap
= (struct vs_map
*)
1706 kalloc(CLMAP_SIZE(new_size
))) == NULL
) {
1712 /* Free the old map */
1713 old_map
= (void *) vs
->vs_dmap
;
1714 old_map_size
= CLMAP_SIZE(vs
->vs_size
);
1716 /* Copy info from the old map into the new map */
1717 memcpy(new_dmap
, vs
->vs_dmap
, old_map_size
);
1719 /* Initialize the rest of the new map */
1720 for (i
= vs
->vs_size
; i
< newdsize
; i
++)
1721 VSM_CLR(new_dmap
[i
]);
1724 vs
->vs_imap
= new_imap
;
1725 vs
->vs_indirect
= TRUE
;
1727 vs
->vs_dmap
= new_dmap
;
1728 bs_commit(new_size
- vs
->vs_size
);
1729 vs
->vs_size
= new_size
;
1731 kfree((vm_offset_t
)old_map
, old_map_size
);
1739 struct clmap
*clmap
,
1744 vm_offset_t cluster
; /* The cluster of offset. */
1745 vm_offset_t newcl
; /* The new cluster allocated. */
1748 struct vs_map
*vsmap
;
1752 ASSERT(vs
->vs_dmap
);
1753 cluster
= atop_32(offset
) >> vs
->vs_clshift
;
1756 * Initialize cluster error value
1758 clmap
->cl_error
= 0;
1761 * If the object has grown, extend the page map.
1763 if (cluster
>= vs
->vs_size
) {
1764 if (flag
== CL_FIND
) {
1765 /* Do not allocate if just doing a lookup */
1767 return (vm_offset_t
) -1;
1769 if (ps_map_extend(vs
, cluster
+ 1)) {
1771 return (vm_offset_t
) -1;
1776 * Look for the desired cluster. If the map is indirect, then we
1777 * have a two level lookup. First find the indirect block, then
1778 * find the actual cluster. If the indirect block has not yet
1779 * been allocated, then do so. If the cluster has not yet been
1780 * allocated, then do so.
1782 * If any of the allocations fail, then return an error.
1783 * Don't allocate if just doing a lookup.
1785 if (vs
->vs_indirect
) {
1786 long ind_block
= cluster
/CLMAP_ENTRIES
;
1788 /* Is the indirect block allocated? */
1789 vsmap
= vs
->vs_imap
[ind_block
];
1790 if (vsmap
== NULL
) {
1791 if (flag
== CL_FIND
) {
1793 return (vm_offset_t
) -1;
1796 /* Allocate the indirect block */
1797 vsmap
= (struct vs_map
*) kalloc(CLMAP_THRESHOLD
);
1798 if (vsmap
== NULL
) {
1800 return (vm_offset_t
) -1;
1802 /* Initialize the cluster offsets */
1803 for (i
= 0; i
< CLMAP_ENTRIES
; i
++)
1805 vs
->vs_imap
[ind_block
] = vsmap
;
1808 vsmap
= vs
->vs_dmap
;
1811 vsmap
+= cluster%CLMAP_ENTRIES
;
1814 * At this point, vsmap points to the struct vs_map desired.
1816 * Look in the map for the cluster, if there was an error on a
1817 * previous write, flag it and return. If it is not yet
1818 * allocated, then allocate it, if we're writing; if we're
1819 * doing a lookup and the cluster's not allocated, return error.
1821 if (VSM_ISERR(*vsmap
)) {
1822 clmap
->cl_error
= VSM_GETERR(*vsmap
);
1824 return (vm_offset_t
) -1;
1825 } else if (VSM_ISCLR(*vsmap
)) {
1828 if (flag
== CL_FIND
) {
1830 * If there's an error and the entry is clear, then
1831 * we've run out of swap space. Record the error
1835 VSM_SETERR(*vsmap
, error
);
1838 return (vm_offset_t
) -1;
1841 * Attempt to allocate a cluster from the paging segment
1843 newcl
= ps_allocate_cluster(vs
, &psindex
,
1844 PAGING_SEGMENT_NULL
);
1847 return (vm_offset_t
) -1;
1850 VSM_SETCLOFF(*vsmap
, newcl
);
1851 VSM_SETPS(*vsmap
, psindex
);
1854 newcl
= VSM_CLOFF(*vsmap
);
1857 * Fill in pertinent fields of the clmap
1859 clmap
->cl_ps
= VSM_PS(*vsmap
);
1860 clmap
->cl_numpages
= VSCLSIZE(vs
);
1861 clmap
->cl_bmap
.clb_map
= (unsigned int) VSM_BMAP(*vsmap
);
1864 * Byte offset in paging segment is byte offset to cluster plus
1865 * byte offset within cluster. It looks ugly, but should be
1868 ASSERT(trunc_page(offset
) == offset
);
1869 newcl
= ptoa_32(newcl
) << vs
->vs_clshift
;
1870 newoff
= offset
& ((1<<(vm_page_shift
+ vs
->vs_clshift
)) - 1);
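
	/*
	 * Worked example (values are assumptions for illustration): with
	 * 4 KB pages (vm_page_shift == 12) and vs_clshift == 2, a cluster
	 * covers 16 KB.  If the cluster was allocated at cluster offset 5
	 * within its paging segment and the requested offset falls 0x1000
	 * bytes into the cluster, then:
	 *
	 *	newcl  = ptoa_32(5) << 2                  = 0x14000
	 *	newoff = offset & ((1 << (12 + 2)) - 1)   = 0x01000
	 *
	 * and the value returned below, newcl + newoff, is 0x15000, the
	 * byte offset of the page within the paging segment.
	 */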
1871 if (flag
== CL_ALLOC
) {
1873 * set bits in the allocation bitmap according to which
1874 * pages were requested. size is in bytes.
1876 i
= atop_32(newoff
);
1877 while ((size
> 0) && (i
< VSCLSIZE(vs
))) {
1878 VSM_SETALLOC(*vsmap
, i
);
1880 size
-= vm_page_size
;
1883 clmap
->cl_alloc
.clb_map
= (unsigned int) VSM_ALLOC(*vsmap
);
1886 * Offset is not cluster aligned, so number of pages
1887 * and bitmaps must be adjusted
1889 clmap
->cl_numpages
-= atop_32(newoff
);
1890 CLMAP_SHIFT(clmap
, vs
);
1891 CLMAP_SHIFTALLOC(clmap
, vs
);
1896 * The setting of valid bits and handling of write errors
1897 * must be done here, while we hold the lock on the map.
1898 * It logically should be done in ps_vs_write_complete().
1899 * The size and error information has been passed from
1900 * ps_vs_write_complete(). If the size parameter is non-zero,
1901 * then there is work to be done. If error is also non-zero,
1902 * then the error number is recorded in the cluster and the
1903 * entire cluster is in error.
1905 if (size
&& flag
== CL_FIND
) {
1906 vm_offset_t off
= (vm_offset_t
) 0;
1909 for (i
= VSCLSIZE(vs
) - clmap
->cl_numpages
; size
> 0;
1911 VSM_SETPG(*vsmap
, i
);
1912 size
-= vm_page_size
;
1914 ASSERT(i
<= VSCLSIZE(vs
));
1916 BS_STAT(clmap
->cl_ps
->ps_bs
,
1917 clmap
->cl_ps
->ps_bs
->bs_pages_out_fail
+=
1919 off
= VSM_CLOFF(*vsmap
);
1920 VSM_SETERR(*vsmap
, error
);
1923 * Deallocate cluster if error, and no valid pages
1926 if (off
!= (vm_offset_t
) 0)
1927 ps_deallocate_cluster(clmap
->cl_ps
, off
);
1929 return (vm_offset_t
) 0;
1933 DEBUG(DEBUG_VS_INTERNAL
,
1934 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1935 newcl
+newoff
, (int) vs
, (int) vsmap
, flag
));
1936 DEBUG(DEBUG_VS_INTERNAL
,
1937 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1938 (int) clmap
->cl_ps
, clmap
->cl_numpages
,
1939 (int) clmap
->cl_bmap
.clb_map
, (int) clmap
->cl_alloc
.clb_map
));
1941 return (newcl
+ newoff
);
1944 void ps_clunmap(vstruct_t
, vm_offset_t
, vm_size_t
); /* forward */
1952 vm_offset_t cluster
; /* The cluster number of offset */
1953 struct vs_map
*vsmap
;
1958 * Loop through all clusters in this range, freeing paging segment
1959 * clusters and map entries as encountered.
1961 while (length
> 0) {
1965 cluster
= atop_32(offset
) >> vs
->vs_clshift
;
1966 if (vs
->vs_indirect
) /* indirect map */
1967 vsmap
= vs
->vs_imap
[cluster
/CLMAP_ENTRIES
];
1969 vsmap
= vs
->vs_dmap
;
1970 if (vsmap
== NULL
) {
1974 vsmap
+= cluster%CLMAP_ENTRIES
;
1975 if (VSM_ISCLR(*vsmap
)) {
1976 length
-= vm_page_size
;
1977 offset
+= vm_page_size
;
1981 * We've got a valid mapping. Clear it and deallocate
1982 * paging segment cluster pages.
	 * Optimize for entire cluster clearing.
1985 if (newoff
= (offset
&((1<<(vm_page_shift
+vs
->vs_clshift
))-1))) {
1987 * Not cluster aligned.
1989 ASSERT(trunc_page(newoff
) == newoff
);
1990 i
= atop_32(newoff
);
1993 while ((i
< VSCLSIZE(vs
)) && (length
> 0)) {
1994 VSM_CLRPG(*vsmap
, i
);
1995 VSM_CLRALLOC(*vsmap
, i
);
1996 length
-= vm_page_size
;
1997 offset
+= vm_page_size
;
2002 * If map entry is empty, clear and deallocate cluster.
2004 if (!VSM_ALLOC(*vsmap
)) {
2005 ps_deallocate_cluster(VSM_PS(*vsmap
),
2014 void ps_vs_write_complete(vstruct_t
, vm_offset_t
, vm_size_t
, int); /* forward */
2017 ps_vs_write_complete(
2026 * Get the struct vsmap for this cluster.
2027 * Use READ, even though it was written, because the
2028 * cluster MUST be present, unless there was an error
2029 * in the original ps_clmap (e.g. no space), in which
2030 * case, nothing happens.
2032 * Must pass enough information to ps_clmap to allow it
2033 * to set the vs_map structure bitmap under lock.
2035 (void) ps_clmap(vs
, offset
, &clmap
, CL_FIND
, size
, error
);
2038 void vs_cl_write_complete(vstruct_t
, paging_segment_t
, vm_offset_t
, vm_offset_t
, vm_size_t
, boolean_t
, int); /* forward */
2041 vs_cl_write_complete(
2043 paging_segment_t ps
,
2054 * For internal objects, the error is recorded on a
2055 * per-cluster basis by ps_clmap() which is called
2056 * by ps_vs_write_complete() below.
2058 dprintf(("write failed error = 0x%x\n", error
));
2059 /* add upl_abort code here */
2061 GSTAT(global_stats
.gs_pages_out
+= atop_32(size
));
2063 * Notify the vstruct mapping code, so it can do its accounting.
2065 ps_vs_write_complete(vs
, offset
, size
, error
);
2069 ASSERT(vs
->vs_async_pending
> 0);
2070 vs
->vs_async_pending
-= size
;
2071 if (vs
->vs_async_pending
== 0 && vs
->vs_waiting_async
) {
2072 vs
->vs_waiting_async
= FALSE
;
2074 /* mutex_unlock(&vs->vs_waiting_async); */
2075 thread_wakeup(&vs
->vs_async_pending
);
2082 #ifdef DEVICE_PAGING
2083 kern_return_t
device_write_reply(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2087 MACH_PORT_FACE reply_port
,
2088 kern_return_t device_code
,
2089 io_buf_len_t bytes_written
)
2091 struct vs_async
*vsa
;
2093 vsa
= (struct vs_async
*)
2094 ((struct vstruct_alias
*)(reply_port
->alias
))->vs
;
2096 if (device_code
== KERN_SUCCESS
&& bytes_written
!= vsa
->vsa_size
) {
2097 device_code
= KERN_FAILURE
;
2100 vsa
->vsa_error
= device_code
;
2103 ASSERT(vsa
->vsa_vs
!= VSTRUCT_NULL
);
2104 if(vsa
->vsa_flags
& VSA_TRANSFER
) {
2105 /* revisit when async disk segments redone */
2106 if(vsa
->vsa_error
) {
2107 /* need to consider error condition. re-write data or */
2108 /* throw it away here. */
2110 if(vm_map_copyout(kernel_map
, &ioaddr
,
2111 (vm_map_copy_t
)vsa
->vsa_addr
) != KERN_SUCCESS
)
2112 panic("vs_cluster_write: unable to copy source list\n");
2113 vm_deallocate(kernel_map
, ioaddr
, vsa
->vsa_size
);
2115 ps_vs_write_complete(vsa
->vsa_vs
, vsa
->vsa_offset
,
2116 vsa
->vsa_size
, vsa
->vsa_error
);
2118 vs_cl_write_complete(vsa
->vsa_vs
, vsa
->vsa_ps
, vsa
->vsa_offset
,
2119 vsa
->vsa_addr
, vsa
->vsa_size
, TRUE
,
2124 return KERN_SUCCESS
;
2127 kern_return_t
device_write_reply_inband(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2129 device_write_reply_inband(
2130 MACH_PORT_FACE reply_port
,
2131 kern_return_t return_code
,
2132 io_buf_len_t bytes_written
)
2134 panic("device_write_reply_inband: illegal");
2135 return KERN_SUCCESS
;
2138 kern_return_t
device_read_reply(MACH_PORT_FACE
, kern_return_t
, io_buf_ptr_t
, mach_msg_type_number_t
);
2141 MACH_PORT_FACE reply_port
,
2142 kern_return_t return_code
,
2144 mach_msg_type_number_t dataCnt
)
2146 struct vs_async
*vsa
;
2147 vsa
= (struct vs_async
*)
2148 ((struct vstruct_alias
*)(reply_port
->alias
))->vs
;
2149 vsa
->vsa_addr
= (vm_offset_t
)data
;
2150 vsa
->vsa_size
= (vm_size_t
)dataCnt
;
2151 vsa
->vsa_error
= return_code
;
2152 thread_wakeup(&vsa
->vsa_lock
);
2153 return KERN_SUCCESS
;
2156 kern_return_t
device_read_reply_inband(MACH_PORT_FACE
, kern_return_t
, io_buf_ptr_inband_t
, mach_msg_type_number_t
);
2158 device_read_reply_inband(
2159 MACH_PORT_FACE reply_port
,
2160 kern_return_t return_code
,
2161 io_buf_ptr_inband_t data
,
2162 mach_msg_type_number_t dataCnt
)
2164 panic("device_read_reply_inband: illegal");
2165 return KERN_SUCCESS
;
2168 kern_return_t
device_read_reply_overwrite(MACH_PORT_FACE
, kern_return_t
, io_buf_len_t
);
2170 device_read_reply_overwrite(
2171 MACH_PORT_FACE reply_port
,
2172 kern_return_t return_code
,
2173 io_buf_len_t bytes_read
)
2175 panic("device_read_reply_overwrite: illegal\n");
2176 return KERN_SUCCESS
;
2179 kern_return_t
device_open_reply(MACH_PORT_FACE
, kern_return_t
, MACH_PORT_FACE
);
2182 MACH_PORT_FACE reply_port
,
2183 kern_return_t return_code
,
2184 MACH_PORT_FACE device_port
)
2186 panic("device_open_reply: illegal\n");
2187 return KERN_SUCCESS
;
2190 kern_return_t
ps_read_device(paging_segment_t
, vm_offset_t
, vm_offset_t
*, unsigned int, unsigned int *, int); /* forward */
2194 paging_segment_t ps
,
2196 vm_offset_t
*bufferp
,
2198 unsigned int *residualp
,
2202 recnum_t dev_offset
;
2203 unsigned int bytes_wanted
;
2204 unsigned int bytes_read
;
2205 unsigned int total_read
;
2206 vm_offset_t dev_buffer
;
2207 vm_offset_t buf_ptr
;
2208 unsigned int records_read
;
2209 struct vs_async
*vsa
;
2210 mutex_t vs_waiting_read_reply
;
2213 vm_map_copy_t device_data
= NULL
;
2214 default_pager_thread_t
*dpt
= NULL
;
2216 device
= dev_port_lookup(ps
->ps_device
);
2217 clustered_reads
[atop_32(size
)]++;
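
	/*
	 * Worked example (values are assumptions for illustration): with
	 * 4 KB pages and 512-byte device records, ps_record_shift == 3, so
	 * the dev_offset computation just below converts a byte offset into
	 * device records by shifting right by
	 * (vm_page_shift - ps_record_shift) == 12 - 3 == 9, i.e. dividing
	 * by 512.  An offset of 0x8000 (32 KB) into the segment becomes
	 * record 64, added to the segment's starting record, ps_offset.
	 */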
2219 dev_offset
= (ps
->ps_offset
+
2220 (offset
>> (vm_page_shift
- ps
->ps_record_shift
)));
2221 bytes_wanted
= size
;
2223 *bufferp
= (vm_offset_t
)NULL
;
2226 vsa
= VS_ALLOC_ASYNC();
2230 vsa
->vsa_offset
= 0;
2234 mutex_init(&vsa
->vsa_lock
, ETAP_DPAGE_VSSEQNO
);
2235 ip_lock(vsa
->reply_port
);
2236 vsa
->reply_port
->ip_sorights
++;
2237 ip_reference(vsa
->reply_port
);
2238 ip_unlock(vsa
->reply_port
);
2239 kr
= ds_device_read_common(device
,
2241 (mach_msg_type_name_t
)
2242 MACH_MSG_TYPE_MOVE_SEND_ONCE
,
2246 (IO_READ
| IO_CALL
),
2247 (io_buf_ptr_t
*) &dev_buffer
,
2248 (mach_msg_type_number_t
*) &bytes_read
);
2249 if(kr
== MIG_NO_REPLY
) {
2250 assert_wait(&vsa
->vsa_lock
, THREAD_UNINT
);
2251 thread_block(THREAD_CONTINUE_NULL
);
2253 dev_buffer
= vsa
->vsa_addr
;
2254 bytes_read
= (unsigned int)vsa
->vsa_size
;
2255 kr
= vsa
->vsa_error
;
2258 if (kr
!= KERN_SUCCESS
|| bytes_read
== 0) {
2261 total_read
+= bytes_read
;
2264 * If we got the entire range, use the returned dev_buffer.
2266 if (bytes_read
== size
) {
2267 *bufferp
= (vm_offset_t
)dev_buffer
;
2272 dprintf(("read only %d bytes out of %d\n",
2273 bytes_read
, bytes_wanted
));
2276 dpt
= get_read_buffer();
2277 buf_ptr
= dpt
->dpt_buffer
;
2278 *bufferp
= (vm_offset_t
)buf_ptr
;
2281 * Otherwise, copy the data into the provided buffer (*bufferp)
2282 * and append the rest of the range as it comes in.
2284 memcpy((void *) buf_ptr
, (void *) dev_buffer
, bytes_read
);
2285 buf_ptr
+= bytes_read
;
2286 bytes_wanted
-= bytes_read
;
2287 records_read
= (bytes_read
>>
2288 (vm_page_shift
- ps
->ps_record_shift
));
2289 dev_offset
+= records_read
;
2290 DEBUG(DEBUG_VS_INTERNAL
,
2291 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2292 dev_buffer
, bytes_read
));
2293 if (vm_deallocate(kernel_map
, dev_buffer
, bytes_read
)
2295 Panic("dealloc buf");
2296 } while (bytes_wanted
);
2298 *residualp
= size
- total_read
;
2299 if((dev_buffer
!= *bufferp
) && (total_read
!= 0)) {
2300 vm_offset_t temp_buffer
;
2301 vm_allocate(kernel_map
, &temp_buffer
, total_read
, TRUE
);
2302 memcpy((void *) temp_buffer
, (void *) *bufferp
, total_read
);
2303 if(vm_map_copyin_page_list(kernel_map
, temp_buffer
, total_read
,
2304 VM_MAP_COPYIN_OPT_SRC_DESTROY
|
2305 VM_MAP_COPYIN_OPT_STEAL_PAGES
|
2306 VM_MAP_COPYIN_OPT_PMAP_ENTER
,
2307 (vm_map_copy_t
*)&device_data
, FALSE
))
2308 panic("ps_read_device: cannot copyin locally provided buffer\n");
2310 else if((kr
== KERN_SUCCESS
) && (total_read
!= 0) && (dev_buffer
!= 0)){
2311 if(vm_map_copyin_page_list(kernel_map
, dev_buffer
, bytes_read
,
2312 VM_MAP_COPYIN_OPT_SRC_DESTROY
|
2313 VM_MAP_COPYIN_OPT_STEAL_PAGES
|
2314 VM_MAP_COPYIN_OPT_PMAP_ENTER
,
2315 (vm_map_copy_t
*)&device_data
, FALSE
))
2316 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2321 *bufferp
= (vm_offset_t
)device_data
;
2324 /* Free the receive buffer */
2325 dpt
->checked_out
= 0;
2326 thread_wakeup(&dpt_array
);
2328 return KERN_SUCCESS
;
2331 kern_return_t
ps_write_device(paging_segment_t
, vm_offset_t
, vm_offset_t
, unsigned int, struct vs_async
*); /* forward */
2335 paging_segment_t ps
,
2339 struct vs_async
*vsa
)
2341 recnum_t dev_offset
;
2342 io_buf_len_t bytes_to_write
, bytes_written
;
2343 recnum_t records_written
;
2345 MACH_PORT_FACE reply_port
;
2349 clustered_writes
[atop_32(size
)]++;
2351 dev_offset
= (ps
->ps_offset
+
2352 (offset
>> (vm_page_shift
- ps
->ps_record_shift
)));
2353 bytes_to_write
= size
;
2357 * Asynchronous write.
2359 reply_port
= vsa
->reply_port
;
2360 ip_lock(reply_port
);
2361 reply_port
->ip_sorights
++;
2362 ip_reference(reply_port
);
2363 ip_unlock(reply_port
);
2366 device
= dev_port_lookup(ps
->ps_device
);
2368 vsa
->vsa_addr
= addr
;
2369 kr
=ds_device_write_common(device
,
2371 (mach_msg_type_name_t
) MACH_MSG_TYPE_MOVE_SEND_ONCE
,
2374 (io_buf_ptr_t
) addr
,
2376 (IO_WRITE
| IO_CALL
),
2379 if ((kr
!= KERN_SUCCESS
) && (kr
!= MIG_NO_REPLY
)) {
2381 dprintf(("%s0x%x, addr=0x%x,"
2382 "size=0x%x,offset=0x%x\n",
2383 "device_write_request returned ",
2384 kr
, addr
, size
, offset
));
2386 ps
->ps_bs
->bs_pages_out_fail
+= atop_32(size
));
2387 /* do the completion notification to free resources */
2388 device_write_reply(reply_port
, kr
, 0);
2393 * Synchronous write.
2397 device
= dev_port_lookup(ps
->ps_device
);
2398 kr
=ds_device_write_common(device
,
2402 (io_buf_ptr_t
) addr
,
2404 (IO_WRITE
| IO_SYNC
| IO_KERNEL_BUF
),
2407 if (kr
!= KERN_SUCCESS
) {
2408 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2409 "device_write returned ",
2410 kr
, addr
, size
, offset
));
2412 ps
->ps_bs
->bs_pages_out_fail
+= atop_32(size
));
2415 if (bytes_written
& ((vm_page_size
>> ps
->ps_record_shift
) - 1))
2416 Panic("fragmented write");
2417 records_written
= (bytes_written
>>
2418 (vm_page_shift
- ps
->ps_record_shift
));
2419 dev_offset
+= records_written
;
2421 if (bytes_written
!= bytes_to_write
) {
2422 dprintf(("wrote only %d bytes out of %d\n",
2423 bytes_written
, bytes_to_write
));
2426 bytes_to_write
-= bytes_written
;
2427 addr
+= bytes_written
;
2428 } while (bytes_to_write
> 0);
2430 return PAGER_SUCCESS
;
2434 #else /* !DEVICE_PAGING */
2438 paging_segment_t ps
,
2440 vm_offset_t
*bufferp
,
2442 unsigned int *residualp
,
2445 panic("ps_read_device not supported");
2449 paging_segment_t ps
,
2453 struct vs_async
*vsa
)
2455 panic("ps_write_device not supported");
2458 #endif /* DEVICE_PAGING */
2459 void pvs_object_data_provided(vstruct_t
, upl_t
, vm_offset_t
, vm_size_t
); /* forward */
2462 pvs_object_data_provided(
2469 DEBUG(DEBUG_VS_INTERNAL
,
2470 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2471 upl
, offset
, size
));
2474 GSTAT(global_stats
.gs_pages_in
+= atop_32(size
));
2478 ps_clunmap(vs
, offset
, size
);
2479 #endif /* USE_PRECIOUS */
2486 vm_offset_t vs_offset
,
2490 kern_return_t error
= KERN_SUCCESS
;
2492 unsigned int residual
;
2493 unsigned int request_flags
;
2500 vm_offset_t ps_offset
[(VM_SUPER_CLUSTER
/ PAGE_SIZE
) >> VSTRUCT_DEF_CLSHIFT
];
2501 paging_segment_t psp
[(VM_SUPER_CLUSTER
/ PAGE_SIZE
) >> VSTRUCT_DEF_CLSHIFT
];
2504 pages_in_cl
= 1 << vs
->vs_clshift
;
2505 cl_size
= pages_in_cl
* vm_page_size
;
2506 cl_mask
= cl_size
- 1;
    /*
     * This loop will be executed multiple times until the entire
     * request has been satisfied... if the request spans cluster
     * boundaries, the clusters will be checked for logical continuity,
     * if contiguous the I/O request will span multiple clusters, otherwise
     * it will be broken up into the minimal set of I/O's
     *
     * If there are holes in a request (either unallocated pages in a paging
     * segment or an unallocated paging segment), we stop
     * reading at the hole, inform the VM of any data read, inform
     * the VM of an unavailable range, then loop again, hoping to
     * find valid pages later in the requested range. This continues until
     * the entire range has been examined, and read, if present.
     */
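
    /*
     * Sizing sketch (illustrative; assumes 4K pages): with the default
     * cluster shift of 2 a cluster is 4 pages (16K), and a
     * VM_SUPER_CLUSTER of 0x40000 (256K) spans 16 such clusters, which
     * is what bounds the ps_offset[]/psp[] arrays declared above.
     */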
#if USE_PRECIOUS
    request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
#else
    request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
#endif

    while (cnt && (error == KERN_SUCCESS)) {
        int     page_list_count;

        if ((vs_offset & cl_mask) &&
            (cnt > (VM_SUPER_CLUSTER -
                    (vs_offset & cl_mask)))) {
            size = VM_SUPER_CLUSTER;
            size -= vs_offset & cl_mask;
        } else if (cnt > VM_SUPER_CLUSTER) {
            size = VM_SUPER_CLUSTER;
        } else {
            size = cnt;
        }
        cnt -= size;

        while (size > 0 && error == KERN_SUCCESS) {
            vm_offset_t     cur_offset;

            if ( !ps_info_valid) {
                ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
                psp[seg_index] = CLMAP_PS(clmap);
            }
            /*
             * skip over unallocated physical segments
             */
            if (ps_offset[seg_index] == (vm_offset_t) -1) {
                abort_size = cl_size - (vs_offset & cl_mask);
                abort_size = MIN(abort_size, size);

                page_list_count = 0;
                memory_object_super_upl_request(
                    vs->vs_control,
                    (memory_object_offset_t)vs_offset,
                    abort_size, abort_size,
                    &upl, NULL, &page_list_count,
                    request_flags);

                if (clmap.cl_error) {
                    upl_abort(upl, UPL_ABORT_ERROR);
                } else {
                    upl_abort(upl, UPL_ABORT_UNAVAILABLE);
                }
                upl_deallocate(upl);

                vs_offset += abort_size;
                continue;
            }
            cl_index = (vs_offset & cl_mask) / vm_page_size;

            for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
                /*
                 * skip over unallocated pages
                 */
                if (CLMAP_ISSET(clmap, cl_index))
                    break;
                abort_size += vm_page_size;
            }
            if (abort_size) {
                /*
                 * Let VM system know about holes in clusters.
                 */
                GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));

                page_list_count = 0;
                memory_object_super_upl_request(
                    vs->vs_control,
                    (memory_object_offset_t)vs_offset,
                    abort_size, abort_size,
                    &upl, NULL, &page_list_count,
                    request_flags);

                upl_abort(upl, UPL_ABORT_UNAVAILABLE);
                upl_deallocate(upl);

                vs_offset += abort_size;

                if (cl_index == pages_in_cl) {
                    /*
                     * if we're at the end of this physical cluster
                     * then bump to the next one and continue looking
                     */
                    continue;
                }
            }
            /*
             * remember the starting point of the first allocated page
             * for the I/O we're about to issue
             */
            beg_pseg = seg_index;
            beg_indx = cl_index;
            cur_offset = vs_offset;

            /*
             * calculate the size of the I/O that we can do...
             * this may span multiple physical segments if
             * they are contiguous
             */
            for (xfer_size = 0; xfer_size < size; ) {

                while (cl_index < pages_in_cl
                       && xfer_size < size) {
                    /*
                     * accumulate allocated pages within
                     * a physical segment
                     */
                    if (CLMAP_ISSET(clmap, cl_index)) {
                        xfer_size += vm_page_size;
                        cur_offset += vm_page_size;
                        cl_index++;

                        BS_STAT(psp[seg_index]->ps_bs,
                            psp[seg_index]->ps_bs->bs_pages_in++);
                    } else
                        break;
                }
                if (cl_index < pages_in_cl
                    || xfer_size >= size) {
                    /*
                     * we've hit an unallocated page or
                     * the end of this request... go fire
                     * the I/O
                     */
                    break;
                }
                /*
                 * we've hit the end of the current physical
                 * segment and there's more to do, so try
                 * moving to the next one
                 */
                seg_index++;

                ps_offset[seg_index] =
                    ps_clmap(vs,
                        cur_offset & ~cl_mask,
                        &clmap, CL_FIND, 0, 0);
                psp[seg_index] = CLMAP_PS(clmap);

                if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
                    /*
                     * if the physical segment we're about
                     * to step into is not contiguous to
                     * the one we're currently in, or it's
                     * in a different paging file, or
                     * it hasn't been allocated....
                     * we stop here and generate the I/O
                     */
                    break;
                }
                /*
                 * start with first page of the next physical
                 * segment
                 */
                cl_index = 0;
            }
            /*
             * we have a contiguous range of allocated pages
             * to read from
             */
            page_list_count = 0;
            memory_object_super_upl_request(vs->vs_control,
                (memory_object_offset_t)vs_offset,
                xfer_size, xfer_size,
                &upl, NULL, &page_list_count,
                request_flags | UPL_SET_INTERNAL);

            error = ps_read_file(psp[beg_pseg],
                upl, (vm_offset_t) 0,
                ps_offset[beg_pseg] +
                    (beg_indx * vm_page_size),
                xfer_size, &residual, 0);

            /*
             * Adjust counts and send response to VM.  Optimize
             * for the common case, i.e. no error and/or partial
             * data.  If there was an error, then we need to error
             * the entire range, even if some data was successfully
             * read.  If there was a partial read we may supply some
             * data and may error some as well.  In all cases the
             * VM must receive some notification for every page
             * in the range.
             */
            if ((error == KERN_SUCCESS) && (residual == 0)) {
                /*
                 * Got everything we asked for, supply the data
                 * to the VM.  Note that as a side effect of
                 * supplying the data, the buffer holding the
                 * supplied data is deallocated from the pager's
                 * address space.
                 */
                pvs_object_data_provided(
                    vs, upl, vs_offset, xfer_size);
            } else {
                failed_size = xfer_size;

                if (error == KERN_SUCCESS) {
                    if (residual == xfer_size) {
                        /*
                         * If a read operation returns no error
                         * and no data moved, we turn it into
                         * an error, assuming we're reading at
                         * or beyond EOF.
                         * Fall through and error the entire
                         * range.
                         */
                        error = KERN_FAILURE;
                    } else {
                        /*
                         * Otherwise, we have partial read. If
                         * the part read is a integral number
                         * of pages supply it. Otherwise round
                         * it up to a page boundary, zero fill
                         * the unread part, and supply it.
                         * Fall through and error the remainder
                         * of the range, if any.
                         */
                        lsize = (xfer_size - residual);

                        pvs_object_data_provided(
                            vs, upl, vs_offset, lsize);

                        if (lsize < xfer_size) {
                            failed_size = xfer_size - lsize;
                            error = KERN_FAILURE;
                        }
                    }
                }
            }
            /*
             * If there was an error in any part of the range, tell
             * the VM. Note that error is explicitly checked again
             * since it can be modified above.
             */
            if (error != KERN_SUCCESS) {
                BS_STAT(psp[beg_pseg]->ps_bs,
                    psp[beg_pseg]->ps_bs->bs_pages_in_fail
                        += atop_32(failed_size));
            }
            size -= xfer_size;
            vs_offset += xfer_size;
        }

    } /* END while (cnt && (error == 0)) */

    return error;
}
int vs_do_async_write = 1;
kern_return_t
vs_cluster_write(
    vstruct_t       vs,
    upl_t           internal_upl,
    vm_offset_t     offset,
    vm_size_t       cnt,
    boolean_t       dp_internal,
    int             flags)
{
    vm_offset_t     transfer_size;

    vm_offset_t     actual_offset;  /* Offset within paging segment */
    paging_segment_t ps;
    vm_offset_t     subx_size;
    vm_offset_t     mobj_base_addr;
    vm_offset_t     mobj_target_addr;

    struct vs_async *vsa;

    upl_page_info_t *pl;

    pages_in_cl = 1 << vs->vs_clshift;
    cl_size = pages_in_cl * vm_page_size;

    if (!dp_internal) {
        int             page_list_count;
        vm_offset_t     upl_offset;
        vm_offset_t     seg_offset;
        vm_offset_t     ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
        paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];

        if (bs_low) {
            super_size = cl_size;

            request_flags = UPL_NOBLOCK |
                UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
                UPL_NO_SYNC | UPL_SET_INTERNAL;
        } else {
            super_size = VM_SUPER_CLUSTER;

            request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
                UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
                UPL_NO_SYNC | UPL_SET_INTERNAL;
        }
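
        /*
         * Sketch of the intent (hedged; the bs_low test above is an
         * assumption about the low-space condition): under space
         * pressure only a single cluster's worth of dirty pages is
         * gathered per UPL, otherwise up to a VM_SUPER_CLUSTER is
         * requested so the paging file sees larger contiguous writes.
         */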
        page_list_count = 0;
        memory_object_super_upl_request(vs->vs_control,
            (memory_object_offset_t)offset,
            cnt, super_size,
            &upl, NULL, &page_list_count,
            request_flags | UPL_FOR_PAGEOUT);

        pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

        seg_size = cl_size - (upl->offset % cl_size);
        upl_offset = upl->offset & ~(cl_size - 1);

        for (seg_index = 0, transfer_size = upl->size;
                    transfer_size > 0; ) {
            ps_offset[seg_index] =
                ps_clmap(vs, upl_offset,
                    &clmap, CL_ALLOC,
                    transfer_size < cl_size ?
                    transfer_size : cl_size, 0);

            if (ps_offset[seg_index] == (vm_offset_t) -1) {
                upl_deallocate(upl);

                return KERN_FAILURE;
            }
            psp[seg_index] = CLMAP_PS(clmap);

            if (transfer_size > seg_size) {
                transfer_size -= seg_size;
                upl_offset += cl_size;
                seg_size = cl_size;
                seg_index++;
            } else
                transfer_size = 0;
        }
        /*
         * Ignore any non-present pages at the end of the
         * UPL.
         */
        for (page_index = upl->size / vm_page_size; page_index > 0;)
            if (UPL_PAGE_PRESENT(pl, --page_index))
                break;
        num_of_pages = page_index + 1;

        base_index = (upl->offset % cl_size) / PAGE_SIZE;

        for (page_index = 0; page_index < num_of_pages; ) {
            /*
             * skip over non-dirty pages
             */
            for ( ; page_index < num_of_pages; page_index++) {
                if (UPL_DIRTY_PAGE(pl, page_index)
                    || UPL_PRECIOUS_PAGE(pl, page_index))
                    /*
                     * this is a page we need to write
                     * go see if we can buddy it up with
                     * others that are contiguous to it
                     */
                    break;
                /*
                 * if the page is not-dirty, but present we
                 * need to commit it... This is an unusual
                 * case since we only asked for dirty pages
                 */
                if (UPL_PAGE_PRESENT(pl, page_index)) {
                    boolean_t empty = FALSE;
                    upl_commit_range(upl,
                            page_index * vm_page_size,
                            vm_page_size,
                            UPL_COMMIT_NOTIFY_EMPTY,
                            pl,
                            page_list_count,
                            &empty);
                    if (empty) {
                        assert(page_index ==
                               num_of_pages - 1);
                        upl_deallocate(upl);
                    }
                }
            }
            if (page_index == num_of_pages)
                /*
                 * no more pages to look at, we're out of here
                 */
                break;

            /*
             * gather up contiguous dirty pages... we have at
             * least 1 * otherwise we would have bailed above
             * make sure that each physical segment that we step
             * into is contiguous to the one we're currently in
             * if it's not, we have to stop and write what we have
             */
            for (first_dirty = page_index;
                    page_index < num_of_pages; ) {
                if ( !UPL_DIRTY_PAGE(pl, page_index)
                    && !UPL_PRECIOUS_PAGE(pl, page_index))
                    break;
                page_index++;
                /*
                 * if we just looked at the last page in the UPL
                 * we don't need to check for physical segment
                 * continuity
                 */
                if (page_index < num_of_pages) {
                    cur_seg = (base_index + (page_index - 1))/pages_in_cl;
                    nxt_seg = (base_index + page_index)/pages_in_cl;

                    if (cur_seg != nxt_seg) {
                        if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
                            /*
                             * if the segment we're about
                             * to step into is not
                             * contiguous to the one we're
                             * currently in, or it's in a
                             * different paging file....
                             * we stop here and generate
                             * the I/O
                             */
                            break;
                    }
                }
            }
            num_dirty = page_index - first_dirty;

            if (num_dirty) {
                upl_offset = first_dirty * vm_page_size;
                transfer_size = num_dirty * vm_page_size;

                while (transfer_size) {

                    if ((seg_size = cl_size -
                        ((upl->offset + upl_offset) % cl_size))
                            > transfer_size)
                        seg_size = transfer_size;

                    ps_vs_write_complete(vs,
                            upl->offset + upl_offset,
                            seg_size, error);

                    transfer_size -= seg_size;
                    upl_offset += seg_size;
                }
                upl_offset = first_dirty * vm_page_size;
                transfer_size = num_dirty * vm_page_size;

                seg_index = (base_index + first_dirty) / pages_in_cl;
                seg_offset = (upl->offset + upl_offset) % cl_size;

                error = ps_write_file(psp[seg_index],
                        upl, upl_offset,
                        ps_offset[seg_index]
                                + seg_offset,
                        transfer_size, flags);
            } else {
                boolean_t empty = FALSE;
                upl_abort_range(upl,
                        first_dirty * vm_page_size,
                        num_dirty * vm_page_size,
                        UPL_ABORT_NOTIFY_EMPTY,
                        &empty);
                if (empty) {
                    assert(page_index == num_of_pages);
                    upl_deallocate(upl);
                }
            }
        }
    } else {
        assert(cnt <= (vm_page_size << vs->vs_clshift));
        list_size = cnt;

        /* The caller provides a mapped_data which is derived  */
        /* from a temporary object.  The targeted pages are    */
        /* guaranteed to be set at offset 0 in the mapped_data */
        /* The actual offset however must still be derived     */
        /* from the offset in the vs in question               */
        mobj_base_addr = offset;
        mobj_target_addr = mobj_base_addr;

        for (transfer_size = list_size; transfer_size != 0;) {
            actual_offset = ps_clmap(vs, mobj_target_addr,
                        &clmap, CL_ALLOC,
                        transfer_size < cl_size ?
                        transfer_size : cl_size, 0);
            if (actual_offset == (vm_offset_t) -1) {
                error = 1;
                break;
            }
            cnt = MIN(transfer_size,
                      CLMAP_NPGS(clmap) * vm_page_size);
            ps = CLMAP_PS(clmap);
            /* Assume that the caller has given us contiguous */
            /* pages */
            if (cnt) {
                ps_vs_write_complete(vs, mobj_target_addr,
                                cnt, error);
                error = ps_write_file(ps, internal_upl,
                            0, actual_offset,
                            cnt, flags);
                if (error)
                    break;
            }
            actual_offset += cnt;
            mobj_target_addr += cnt;
            transfer_size -= cnt;
        }
    }
    if (error)
        return KERN_FAILURE;
    else
        return KERN_SUCCESS;
}
vm_size_t
ps_vstruct_allocated_size(
    vstruct_t       vs)
{
    int             num_pages;
    struct vs_map   *vsmap;
    int             i, j, k;

    num_pages = 0;
    if (vs->vs_indirect) {
        /* loop on indirect maps */
        for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
            vsmap = vs->vs_imap[i];
            /* loop on clusters in this indirect map */
            for (j = 0; j < CLMAP_ENTRIES; j++) {
                if (VSM_ISCLR(vsmap[j]) ||
                    VSM_ISERR(vsmap[j]))
                    continue;
                /* loop on pages in this cluster */
                for (k = 0; k < VSCLSIZE(vs); k++) {
                    if ((VSM_BMAP(vsmap[j])) & (1 << k))
                        num_pages++;
                }
            }
        }
    } else {
        vsmap = vs->vs_dmap;
        /* loop on clusters in the direct map */
        for (j = 0; j < CLMAP_ENTRIES; j++) {
            if (VSM_ISCLR(vsmap[j]) ||
                VSM_ISERR(vsmap[j]))
                continue;
            /* loop on pages in this cluster */
            for (k = 0; k < VSCLSIZE(vs); k++) {
                if ((VSM_BMAP(vsmap[j])) & (1 << k))
                    num_pages++;
            }
        }
    }

    return ptoa_32(num_pages);
}
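
/*
 * Accounting sketch: each vs_map entry carries a bitmap with one bit per
 * page of its cluster (VSCLSIZE(vs) bits), so the walk above just counts
 * set bits and converts the page count to bytes with ptoa_32().  With the
 * default cluster shift of 2 that is 4 bits examined per cluster entry.
 */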
size_t
ps_vstruct_allocated_pages(
    vstruct_t               vs,
    default_pager_page_t    *pages,
    size_t                  pages_size)
{
    int             num_pages;
    struct vs_map   *vsmap;
    vm_offset_t     offset;
    int             i, j, k;

    num_pages = 0;
    offset = 0;
    if (vs->vs_indirect) {
        /* loop on indirect maps */
        for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
            vsmap = vs->vs_imap[i];
            if (vsmap == NULL) {
                offset += (vm_page_size * CLMAP_ENTRIES *
                            VSCLSIZE(vs));
                continue;
            }
            /* loop on clusters in this indirect map */
            for (j = 0; j < CLMAP_ENTRIES; j++) {
                if (VSM_ISCLR(vsmap[j]) ||
                    VSM_ISERR(vsmap[j])) {
                    offset += vm_page_size * VSCLSIZE(vs);
                    continue;
                }
                /* loop on pages in this cluster */
                for (k = 0; k < VSCLSIZE(vs); k++) {
                    if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
                        num_pages++;
                        if (num_pages < pages_size)
                            pages++->dpp_offset =
                                offset;
                    }
                    offset += vm_page_size;
                }
            }
        }
    } else {
        vsmap = vs->vs_dmap;
        /* loop on clusters in the direct map */
        for (j = 0; j < CLMAP_ENTRIES; j++) {
            if (VSM_ISCLR(vsmap[j]) ||
                VSM_ISERR(vsmap[j])) {
                offset += vm_page_size * VSCLSIZE(vs);
                continue;
            }
            /* loop on pages in this cluster */
            for (k = 0; k < VSCLSIZE(vs); k++) {
                if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
                    num_pages++;
                    if (num_pages < pages_size)
                        pages++->dpp_offset = offset;
                }
                offset += vm_page_size;
            }
        }
    }

    return num_pages;
}
kern_return_t
ps_vstruct_transfer_from_segment(
    vstruct_t           vs,
    paging_segment_t    segment,
    upl_t               upl)
{
    struct vs_map   *vsmap;
    struct vs_map   old_vsmap;
    struct vs_map   new_vsmap;
    int             i, j;

    VS_LOCK(vs);    /* block all work on this vstruct */
                    /* can't allow the normal multiple write */
                    /* semantic because writes may conflict */
    vs->vs_xfer_pending = TRUE;
    vs_wait_for_sync_writers(vs);
    vs_wait_for_readers(vs);
    /* we will unlock the vs to allow other writes while transferring */
    /* and will be guaranteed of the persistance of the vs struct */
    /* because the caller of ps_vstruct_transfer_from_segment bumped */
    /* vs_async_pending */
    /* OK we now have guaranteed no other parties are accessing this */
    /* vs.  Now that we are also supporting simple lock versions of */
    /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
    /* our purpose in holding it before was the multiple write case */
    /* we now use the boolean xfer_pending to do that.  We can use */
    /* a boolean instead of a count because we have guaranteed single */
    /* file access to this code in its caller */
    VS_UNLOCK(vs);
vs_changed:
    if (vs->vs_indirect) {
        int     vsmap_size;
        int     clmap_off;
        /* loop on indirect maps */
        for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
            vsmap = vs->vs_imap[i];
            if (vsmap == NULL)
                continue;
            /* loop on clusters in this indirect map */
            clmap_off = (vm_page_size * CLMAP_ENTRIES *
                            VSCLSIZE(vs) * i);
            if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
                vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
            else
                vsmap_size = CLMAP_ENTRIES;
            for (j = 0; j < vsmap_size; j++) {
                if (VSM_ISCLR(vsmap[j]) ||
                    VSM_ISERR(vsmap[j]) ||
                    (VSM_PS(vsmap[j]) != segment))
                    continue;
                if (vs_cluster_transfer(vs,
                        (vm_page_size * (j << vs->vs_clshift))
                            + clmap_off,
                        vm_page_size << vs->vs_clshift,
                        upl) != KERN_SUCCESS) {
                    vs->vs_xfer_pending = FALSE;
                    vs_finish_write(vs);
                    return KERN_FAILURE;
                }
                /* allow other readers/writers during transfer*/
                vs->vs_xfer_pending = FALSE;
                vs_finish_write(vs);
                vs->vs_xfer_pending = TRUE;
                vs_wait_for_sync_writers(vs);
                vs_wait_for_readers(vs);

                if (!(vs->vs_indirect)) {
                    goto vs_changed;
                }
            }
        }
    } else {
        vsmap = vs->vs_dmap;
        if (vsmap == NULL) {
            vs->vs_xfer_pending = FALSE;
            vs_finish_write(vs);
            return KERN_SUCCESS;
        }
        /* loop on clusters in the direct map */
        for (j = 0; j < vs->vs_size; j++) {
            if (VSM_ISCLR(vsmap[j]) ||
                VSM_ISERR(vsmap[j]) ||
                (VSM_PS(vsmap[j]) != segment))
                continue;
            if (vs_cluster_transfer(vs,
                    vm_page_size * (j << vs->vs_clshift),
                    vm_page_size << vs->vs_clshift,
                    upl) != KERN_SUCCESS) {
                vs->vs_xfer_pending = FALSE;
                vs_finish_write(vs);
                return KERN_FAILURE;
            }
            /* allow other readers/writers during transfer*/
            vs->vs_xfer_pending = FALSE;
            vs_finish_write(vs);
            vs->vs_xfer_pending = TRUE;
            vs_wait_for_sync_writers(vs);
            vs_wait_for_readers(vs);

            if (vs->vs_indirect) {
                goto vs_changed;
            }
        }
    }

    vs->vs_xfer_pending = FALSE;
    vs_finish_write(vs);
    return KERN_SUCCESS;
}
vs_map_t
vs_get_map_entry(
    vstruct_t       vs,
    vm_offset_t     offset)
{
    struct vs_map   *vsmap;
    vm_offset_t     cluster;

    cluster = atop_32(offset) >> vs->vs_clshift;
    if (vs->vs_indirect) {
        long    ind_block = cluster/CLMAP_ENTRIES;

        /* Is the indirect block allocated? */
        vsmap = vs->vs_imap[ind_block];
        if (vsmap == (vs_map_t) NULL)
            return vsmap;
    } else
        vsmap = vs->vs_dmap;
    vsmap += cluster%CLMAP_ENTRIES;
    return vsmap;
}
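
/*
 * Index arithmetic sketch: a backing-store cluster number is the page
 * number shifted down by vs_clshift; with an indirect map it is then
 * split into an indirect-block index (cluster / CLMAP_ENTRIES) and an
 * entry within that block (cluster % CLMAP_ENTRIES), the same layout the
 * map-walking routines above iterate over.
 */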
kern_return_t
vs_cluster_transfer(
    vstruct_t       vs,
    vm_offset_t     offset,
    vm_size_t       cnt,
    upl_t           upl)
{
    vm_offset_t         actual_offset;
    paging_segment_t    ps;
    kern_return_t       error = KERN_SUCCESS;
    int                 size, size_wanted, i;
    unsigned int        residual;
    default_pager_thread_t  *dpt;
    struct vs_map       *vsmap_ptr;
    struct vs_map       read_vsmap;
    struct vs_map       original_read_vsmap;
    struct vs_map       write_vsmap;

    /* vs_cluster_transfer reads in the pages of a cluster and
     * then writes these pages back to new backing store.  The
     * segment the pages are being read from is assumed to have
     * been taken off-line and is no longer considered for new
     * space requests.
     *
     * This loop will be executed once per cluster referenced.
     * Typically this means once, since it's unlikely that the
     * VM system will ask for anything spanning cluster boundaries.
     *
     * If there are holes in a cluster (in a paging segment), we stop
     * reading at the hole, then loop again, hoping to
     * find valid pages later in the cluster.  This continues until
     * the entire range has been examined, and read, if present.  The
     * pages are written as they are read.  If a failure occurs after
     * some pages are written the unmap call at the bottom of the loop
     * recovers the backing store and the old backing store remains
     * in effect.
     */

    VSM_CLR(write_vsmap);
    VSM_CLR(original_read_vsmap);
    /* grab the actual object's pages to sync with I/O */
    while (cnt && (error == KERN_SUCCESS)) {
        vsmap_ptr = vs_get_map_entry(vs, offset);
        actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);

        if (actual_offset == (vm_offset_t) -1) {
            /*
             * Nothing left to write in this cluster at least
             * set write cluster information for any previous
             * write, clear for next cluster, if there is one
             */
            unsigned int local_size, clmask, clsize;

            clsize = vm_page_size << vs->vs_clshift;
            clmask = clsize - 1;
            local_size = clsize - (offset & clmask);
            local_size = MIN(local_size, cnt);

            /* This cluster has no data in it beyond what may */
            /* have been found on a previous iteration through */
            /* the loop "write_vsmap" */
            *vsmap_ptr = write_vsmap;
            VSM_CLR(write_vsmap);
            VSM_CLR(original_read_vsmap);

            cnt -= local_size;
            offset += local_size;
            continue;
        }

        /*
         * Count up contiguous available or unavailable
         * pages.
         */
        ps = CLMAP_PS(clmap);
        size = 0;
        unavail_size = 0;
        for (i = 0;
             (size < cnt) && (unavail_size < cnt) &&
             (i < CLMAP_NPGS(clmap)); i++) {
            if (CLMAP_ISSET(clmap, i)) {
                if (unavail_size != 0)
                    break;
                size += vm_page_size;
                BS_STAT(ps->ps_bs,
                    ps->ps_bs->bs_pages_in++);
            } else {
                if (size != 0)
                    break;
                unavail_size += vm_page_size;
            }
        }

        if (size == 0) {
            ASSERT(unavail_size);
            cnt -= unavail_size;
            offset += unavail_size;
            if ((offset & ((vm_page_size << vs->vs_clshift) - 1))
                    == 0) {
                /* There is no more to transfer in this
                   cluster
                 */
                *vsmap_ptr = write_vsmap;
                VSM_CLR(write_vsmap);
                VSM_CLR(original_read_vsmap);
            }
            continue;
        }

        if (VSM_ISCLR(original_read_vsmap))
            original_read_vsmap = *vsmap_ptr;

        if (ps->ps_segtype == PS_PARTITION) {
/*
            NEED TO ISSUE WITH SYNC & NO COMMIT
            error = ps_read_device(ps, actual_offset, &buffer,
                    size, &residual, flags);
*/
        } else {
            /* NEED TO ISSUE WITH SYNC & NO COMMIT */
            error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
                    size, &residual,
                    (UPL_IOSYNC | UPL_NOCOMMIT));
        }

        read_vsmap = *vsmap_ptr;

        /*
         * Adjust counts and put data in new BS.  Optimize for the
         * common case, i.e. no error and/or partial data.
         * If there was an error, then we need to error the entire
         * range, even if some data was successfully read.
         */
        if ((error == KERN_SUCCESS) && (residual == 0)) {
            int page_list_count = 0;

            /*
             * Got everything we asked for, supply the data to
             * the new BS.  Note that as a side effect of supplying
             * the data, the buffer holding the supplied data is
             * deallocated from the pager's address space unless
             * the write is unsuccessful.
             */

            /* note buffer will be cleaned up in all cases by */
            /* internal_cluster_write or if an error on write */
            /* the vm_map_copy_page_discard call */
            *vsmap_ptr = write_vsmap;

            if (vs_cluster_write(vs, upl, offset,
                    size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT) != KERN_SUCCESS) {
                error = KERN_FAILURE;
                if (!(VSM_ISCLR(*vsmap_ptr))) {
                    /* unmap the new backing store object */
                    ps_clunmap(vs, offset, size);
                }
                /* original vsmap */
                *vsmap_ptr = original_read_vsmap;
                VSM_CLR(write_vsmap);
            } else {
                if ((offset + size) &
                        ((vm_page_size << vs->vs_clshift)
                            - 1)) {
                    /* There is more to transfer in this
                       cluster
                     */
                    write_vsmap = *vsmap_ptr;
                    *vsmap_ptr = read_vsmap;
                } else {
                    /* discard the old backing object */
                    write_vsmap = *vsmap_ptr;
                    *vsmap_ptr = read_vsmap;
                    ps_clunmap(vs, offset, size);
                    *vsmap_ptr = write_vsmap;
                    VSM_CLR(write_vsmap);
                    VSM_CLR(original_read_vsmap);
                }
            }
        } else {
            if (error == KERN_SUCCESS) {
                if (residual == size) {
                    /*
                     * If a read operation returns no error
                     * and no data moved, we turn it into
                     * an error, assuming we're reading at
                     * or beyond EOF.
                     * Fall through and error the entire
                     * range.
                     */
                    error = KERN_FAILURE;
                    *vsmap_ptr = write_vsmap;
                    if (!(VSM_ISCLR(*vsmap_ptr))) {
                        /* unmap the new backing store object */
                        ps_clunmap(vs, offset, size);
                    }
                    *vsmap_ptr = original_read_vsmap;
                    VSM_CLR(write_vsmap);
                } else {
                    /*
                     * Otherwise, we have partial read.
                     * This is also considered an error
                     * for the purposes of cluster transfer
                     */
                    error = KERN_FAILURE;
                    *vsmap_ptr = write_vsmap;
                    if (!(VSM_ISCLR(*vsmap_ptr))) {
                        /* unmap the new backing store object */
                        ps_clunmap(vs, offset, size);
                    }
                    *vsmap_ptr = original_read_vsmap;
                    VSM_CLR(write_vsmap);
                }
            }
        }

        cnt -= size;
        offset += size;

    } /* END while (cnt && (error == 0)) */
    if (!VSM_ISCLR(write_vsmap))
        *vsmap_ptr = write_vsmap;

    return error;
}
kern_return_t
default_pager_add_file(MACH_PORT_FACE backing_store,
    int         *vp,
    int         record_size,
    vm_size_t   size)
{
    backing_store_t     bs;
    paging_segment_t    ps;
    int                 i;
    int                 error;

    if ((bs = backing_store_lookup(backing_store))
            == BACKING_STORE_NULL)
        return KERN_INVALID_ARGUMENT;

    for (i = 0; i <= paging_segment_max; i++) {
        ps = paging_segments[i];
        if (ps == PAGING_SEGMENT_NULL)
            continue;
        if (ps->ps_segtype != PS_FILE)
            continue;

        /*
         * Check for overlap on same device.
         */
        if (ps->ps_vnode == (struct vnode *)vp) {
            return KERN_INVALID_ARGUMENT;
        }
    }

    /*
     * Set up the paging segment
     */
    ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
    if (ps == PAGING_SEGMENT_NULL) {
        return KERN_RESOURCE_SHORTAGE;
    }

    ps->ps_segtype = PS_FILE;
    ps->ps_vnode = (struct vnode *)vp;

    ps->ps_record_shift = local_log2(vm_page_size / record_size);
    ps->ps_recnum = size;
    ps->ps_pgnum = size >> ps->ps_record_shift;

    ps->ps_pgcount = ps->ps_pgnum;
    ps->ps_clshift = local_log2(bs->bs_clsize);
    ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
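
    /*
     * Worked example (illustrative; assumes 4K pages and 512-byte
     * records): ps_record_shift = log2(4096/512) = 3, so ps_pgnum is
     * the record count divided by 8; with a backing-store cluster size
     * of 4 pages, ps_clcount = ps_pgcount / 4.
     */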
    ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
    if (!ps->ps_bmap) {
        kfree((vm_offset_t)ps, sizeof *ps);
        return KERN_RESOURCE_SHORTAGE;
    }
    for (i = 0; i < ps->ps_ncls; i++) {
        clrbit(ps->ps_bmap, i);
    }

    ps->ps_going_away = FALSE;
    ps->ps_bs = bs;

    if ((error = ps_enter(ps)) != 0) {
        kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
        kfree((vm_offset_t)ps, sizeof *ps);
        return KERN_RESOURCE_SHORTAGE;
    }

    bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
    bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;

    dp_pages_free += ps->ps_pgcount;

    bs_more_space(ps->ps_clcount);

    DEBUG(DEBUG_BS_INTERNAL,
          ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
           device, offset, size, record_size,
           ps->ps_record_shift, ps->ps_pgnum));

    return KERN_SUCCESS;
}
kern_return_t
ps_read_file(
    paging_segment_t    ps,
    upl_t               upl,
    vm_offset_t         upl_offset,
    vm_offset_t         offset,
    unsigned int        size,
    unsigned int        *residualp,
    int                 flags)
{
    vm_object_offset_t  f_offset;
    int                 error = 0;
    int                 result;

    clustered_reads[atop_32(size)]++;

    f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

    /* for transfer case we need to pass uploffset and flags */
    error = vnode_pagein(ps->ps_vnode,
        upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);

    /* The vnode_pagein semantic is somewhat at odds with the existing   */
    /* device_read semantic.  Partial reads are not experienced at this  */
    /* level.  It is up to the bit map code and cluster read code to     */
    /* check that requested data locations are actually backed, and the  */
    /* pagein code to either read all of the requested data or return an */
    /* error. */

    if (error)
        result = KERN_FAILURE;
    else {
        *residualp = 0;
        result = KERN_SUCCESS;
    }
    return result;
}
kern_return_t
ps_write_file(
    paging_segment_t    ps,
    upl_t               upl,
    vm_offset_t         upl_offset,
    vm_offset_t         offset,
    unsigned int        size,
    int                 flags)
{
    vm_object_offset_t  f_offset;
    kern_return_t       result;

    clustered_writes[atop_32(size)]++;

    f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

    if (vnode_pageout(ps->ps_vnode,
            upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
        result = KERN_FAILURE;
    else
        result = KERN_SUCCESS;

    return result;
}
kern_return_t
default_pager_triggers(MACH_PORT_FACE default_pager,
    int             hi_wat,
    int             lo_wat,
    int             flags,
    MACH_PORT_FACE  trigger_port)
{
    MACH_PORT_FACE  release;
    kern_return_t   kr;

    if (flags == HI_WAT_ALERT) {
        release = min_pages_trigger_port;
        min_pages_trigger_port = trigger_port;
        minimum_pages_remaining = hi_wat/vm_page_size;
        kr = KERN_SUCCESS;
    } else if (flags == LO_WAT_ALERT) {
        release = max_pages_trigger_port;
        max_pages_trigger_port = trigger_port;
        maximum_pages_free = lo_wat/vm_page_size;
        kr = KERN_SUCCESS;
    } else {
        release = trigger_port;
        kr = KERN_INVALID_ARGUMENT;
    }

    if (IP_VALID(release))
        ipc_port_release_send(release);

    return kr;
}
/*
 * Monitor the amount of available backing store vs. the amount of
 * required backing store, notify a listener (if present) when
 * backing store may safely be removed.
 *
 * We attempt to avoid the situation where backing store is
 * discarded en masse, as this can lead to thrashing as the
 * backing store is compacted.
 */

#define PF_INTERVAL     3       /* time between free level checks */
#define PF_LATENCY      10      /* number of intervals before release */
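
/*
 * Worked example (illustrative): with PF_INTERVAL of 3 seconds and
 * PF_LATENCY of 10 intervals, the free level must stay on the
 * release-safe side of maximum_pages_free for roughly 30 seconds of
 * consecutive checks before a notification is sent, and each
 * notification releases at most one backing store object.
 */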
static int dp_pages_free_low_count = 0;
void
default_pager_backing_store_monitor(thread_call_param_t p1, thread_call_param_t p2)
{
    unsigned long long  average;
    ipc_port_t          trigger;
    uint64_t            deadline;

    /*
     * We determine whether it will be safe to release some
     * backing store by watching the free page level.  If
     * it remains below the maximum_pages_free threshold for
     * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
     * then we deem it safe.
     *
     * Note that this establishes a maximum rate at which backing
     * store will be released, as each notification (currently)
     * only results in a single backing store object being
     * released.
     */
    if (dp_pages_free > maximum_pages_free) {
        dp_pages_free_low_count++;
    } else {
        dp_pages_free_low_count = 0;
    }

    /* decide whether to send notification */
    trigger = IP_NULL;
    if (max_pages_trigger_port &&
        (backing_store_release_trigger_disable == 0) &&
        (dp_pages_free_low_count > PF_LATENCY)) {
        trigger = max_pages_trigger_port;
        max_pages_trigger_port = NULL;
    }

    /* send notification */
    if (trigger != IP_NULL) {
        if (backing_store_release_trigger_disable != 0) {
            assert_wait((event_t)
                    &backing_store_release_trigger_disable,
                    THREAD_UNINT);
            thread_block(THREAD_CONTINUE_NULL);
        }
        default_pager_space_alert(trigger, LO_WAT_ALERT);
        ipc_port_release_send(trigger);
        dp_pages_free_low_count = 0;
    }

    clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
    thread_call_func_delayed(default_pager_backing_store_monitor, NULL, deadline);
}