git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/default_pager/dp_backing

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
	11	*
	12	* This Original Code and all software distributed under the License are
	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
	19	*
	20	* @APPLE_LICENSE_HEADER_END@
	21	*/
	22	/*
	23	* @OSF_COPYRIGHT@
	24	*/
	25	/*
	26	* Mach Operating System
	27	* Copyright (c) 1991,1990,1989 Carnegie Mellon University
	28	* All Rights Reserved.
	29	*
	30	* Permission to use, copy, modify and distribute this software and its
	31	* documentation is hereby granted, provided that both the copyright
	32	* notice and this permission notice appear in all copies of the
	33	* software, derivative works or modified versions, and any portions
	34	* thereof, and that both notices appear in supporting documentation.
	35	*
	36	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
	37	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
	38	* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
	39	*
	40	* Carnegie Mellon requests users of this software to return to
	41	*
	42	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
	43	* School of Computer Science
	44	* Carnegie Mellon University
	45	* Pittsburgh PA 15213-3890
	46	*
	47	* any improvements or extensions that they make and grant Carnegie Mellon
	48	* the rights to redistribute these changes.
	49	*/
	50
	51	/*
	52	* Default Pager.
	53	* Paging File Management.
	54	*/
	55
	56	#include <mach/memory_object_control.h>
	57	#include <mach/memory_object_server.h>
	58	#include "default_pager_internal.h"
	59	#include <default_pager/default_pager_alerts.h>
	60	#include <ipc/ipc_port.h>
	61	#include <ipc/ipc_space.h>
	62	#include <kern/queue.h>
	63	#include <kern/counters.h>
	64	#include <kern/sched_prim.h>
	65	#include <vm/vm_kern.h>
	66	#include <vm/vm_pageout.h>
	67	/* CDY CDY */
	68	#include <vm/vm_map.h>
	69
	70	/*
	71	* ALLOC_STRIDE... the maximum number of bytes allocated from
	72	* a swap file before moving on to the next swap file... if
	73	* all swap files reside on a single disk, this value should
	74	* be very large (this is the default assumption)... if the
	75	* swap files are spread across multiple disks, then this value
	76	* should be small (128 * 1024)...
	77	*
	78	* This should be determined dynamically in the future
	79	*/
	80
		/* Stride described above: 1024^3 bytes (1 GiB). */
	81	#define ALLOC_STRIDE (1024 * 1024 * 1024)
		/* NOTE(review): no reader or writer of this counter is visible in this part of the file. */
	82	int physical_transfer_cluster_count = 0;
	83
		/* 0x40000 bytes (256 KiB); ps_delete() uses it as the size of its transfer object and UPL. */
	84	#define VM_SUPER_CLUSTER 0x40000
		/* Dimension (plus one) of the clustered_writes[]/clustered_reads[] arrays below. */
	85	#define VM_SUPER_PAGES 64
	86
	87	/*
	88	* 0 means no shift to pages, so == 1 page/cluster. 1 would mean
	89	* 2 pages/cluster, 2 means 4 pages/cluster, and so on.
	90	*/
	91	#define VSTRUCT_DEF_CLSHIFT 2
		/* Current default cluster shift; may be changed by bs_set_default_clsize() or bs_get_global_clsize(). */
	92	int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
		/* Cluster size in pages; 0 means "not chosen yet" (see bs_get_global_clsize()). */
	93	int default_pager_clsize = 0;
	94
	95	/* statistics */
		/* Both arrays are zeroed in bs_initialize(). */
	96	unsigned int clustered_writes[VM_SUPER_PAGES+1];
	97	unsigned int clustered_reads[VM_SUPER_PAGES+1];
	98
	99	/*
	100	* Globals used for asynchronous paging operations:
	101	* vs_async_list: head of list of to-be-completed I/O ops
	102	* async_num_queued: number of pages completed, but not yet
	103	* processed by async thread.
	104	* async_requests_out: number of pages of requests not completed.
	105	*/
	106
		/* The three globals described above are compiled out. */
	107	#if 0
	108	struct vs_async *vs_async_list;
	109	int async_num_queued;
	110	int async_requests_out;
	111	#endif
	112
	113
	114	#define VS_ASYNC_REUSE 1
	115	struct vs_async *vs_async_free_list;
	116
	117	mutex_t default_pager_async_lock; /* Protects globals above */
	118
	119
	120	int vs_alloc_async_failed = 0; /* statistics */
	121	int vs_alloc_async_count = 0; /* statistics */
	122	struct vs_async vs_alloc_async(void); / forward */
	123	void vs_free_async(struct vs_async vsa); / forward */
	124
	125
	126	#define VS_ALLOC_ASYNC() vs_alloc_async()
	127	#define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
	128
	129	#define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
	130	#define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
	131	#define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, \
	132	ETAP_IO_DEV_PAGEH)
	133	#define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
	134	/*
	135	* Paging Space Hysteresis triggers and the target notification port
	136	*
	137	*/
	138
		/* NOTE(review): the code that compares against these two thresholds and fires the ports is not visible in this part of the file. */
	139	unsigned int minimum_pages_remaining = 0;
	140	unsigned int maximum_pages_free = 0;
	141	ipc_port_t min_pages_trigger_port = NULL;
	142	ipc_port_t max_pages_trigger_port = NULL;
	143
	144	boolean_t bs_low = FALSE;
		/* While this is nonzero, ps_delete() sleeps on its address before walking the vstruct list. */
	145	int backing_store_release_trigger_disable = 0;
	146
	147
	148
	149	/*
	150	* Object sizes are rounded up to the next power of 2,
	151	* unless they are bigger than a given maximum size.
	152	*/
	153	vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
	154
	155	/*
	156	* List of all backing store and segments.
	157	*/
	158	struct backing_store_list_head backing_store_list;
		/* Slot table; empty slots hold PAGING_SEGMENT_NULL. Scans in this file run from 0 through paging_segment_max inclusive. */
	159	paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
	160	mutex_t paging_segments_lock;
		/* Highest slot index in use, as maintained by ps_enter() and default_pager_backing_store_delete(). */
	161	int paging_segment_max = 0;
		/* Number of occupied slots: incremented in ps_enter(), decremented when a segment is freed. */
	162	int paging_segment_count = 0;
		/* Indexed by backing-store priority; ps_enter() resets an entry to 0 when it holds BS_NOPRI or BS_FULLPRI. */
	163	int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
	164
	165
	166	/*
	167	* Total pages free in system
	168	* This differs from clusters committed/avail which is a measure of the
	169	* over commitment of paging segments to backing store. An idea which is
	170	* likely to be deprecated.
	171	*/
	172	unsigned int dp_pages_free = 0;
		/* ps_delete() reports KERN_FAILURE when dp_pages_free is below this value. */
	173	unsigned int cluster_transfer_minimum = 100;
	174
	175	kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int); /* forward */
	176	kern_return_t ps_read_file (paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int , int); / forward */
	177
	178
	179	default_pager_thread_t *
	180	get_read_buffer()
	181	{
	182	int i;
	183
	184	DPT_LOCK(dpt_lock);
	185	while(TRUE) {
	186	for (i=0; i<default_pager_internal_count; i++) {
	187	if(dpt_array[i]->checked_out == FALSE) {
	188	dpt_array[i]->checked_out = TRUE;
	189	DPT_UNLOCK(dpt_lock);
	190	return dpt_array[i];
	191	}
	192	}
	193	DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
	194	}
	195	}
	196
	197	void
	198	bs_initialize(void)
	199	{
	200	int i;
	201
	202	/*
	203	* List of all backing store.
	204	*/
	205	BSL_LOCK_INIT();
	206	queue_init(&backing_store_list.bsl_queue);
	207	PSL_LOCK_INIT();
	208
	209	VS_ASYNC_LOCK_INIT();
	210	#if VS_ASYNC_REUSE
	211	vs_async_free_list = NULL;
	212	#endif /* VS_ASYNC_REUSE */
	213
	214	for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
	215	clustered_writes[i] = 0;
	216	clustered_reads[i] = 0;
	217	}
	218
	219	}
	220
	221	/*
	222	* When things do not quite work out...
	223	*/
	224	void bs_no_paging_space(boolean_t); /* forward */
	225
	226	void
	227	bs_no_paging_space(
	228	boolean_t out_of_memory)
	229	{
	230
	231	if (out_of_memory)
	232	dprintf(("* OUT OF MEMORY *\n"));
	233	panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
	234	}
	235
	236	void bs_more_space(int); /* forward */
	237	void bs_commit(int); /* forward */
	238
	239	boolean_t user_warned = FALSE;
	240	unsigned int clusters_committed = 0;
	241	unsigned int clusters_available = 0;
	242	unsigned int clusters_committed_peak = 0;
	243
	244	void
	245	bs_more_space(
	246	int nclusters)
	247	{
	248	BSL_LOCK();
	249	/*
	250	* Account for new paging space.
	251	*/
	252	clusters_available += nclusters;
	253
	254	if (clusters_available >= clusters_committed) {
	255	if (verbose && user_warned) {
	256	printf("%s%s - %d excess clusters now.\n",
	257	my_name,
	258	"paging space is OK now",
	259	clusters_available - clusters_committed);
	260	user_warned = FALSE;
	261	clusters_committed_peak = 0;
	262	}
	263	} else {
	264	if (verbose && user_warned) {
	265	printf("%s%s - still short of %d clusters.\n",
	266	my_name,
	267	"WARNING: paging space over-committed",
	268	clusters_committed - clusters_available);
	269	clusters_committed_peak -= nclusters;
	270	}
	271	}
	272	BSL_UNLOCK();
	273
	274	return;
	275	}
	276
	277	void
	278	bs_commit(
	279	int nclusters)
	280	{
	281	BSL_LOCK();
	282	clusters_committed += nclusters;
	283	if (clusters_committed > clusters_available) {
	284	if (verbose && !user_warned) {
	285	user_warned = TRUE;
	286	printf("%s%s - short of %d clusters.\n",
	287	my_name,
	288	"WARNING: paging space over-committed",
	289	clusters_committed - clusters_available);
	290	}
	291	if (clusters_committed > clusters_committed_peak) {
	292	clusters_committed_peak = clusters_committed;
	293	}
	294	} else {
	295	if (verbose && user_warned) {
	296	printf("%s%s - was short of up to %d clusters.\n",
	297	my_name,
	298	"paging space is OK now",
	299	clusters_committed_peak - clusters_available);
	300	user_warned = FALSE;
	301	clusters_committed_peak = 0;
	302	}
	303	}
	304	BSL_UNLOCK();
	305
	306	return;
	307	}
	308
	309	int default_pager_info_verbose = 1;
	310
	311	void
	312	bs_global_info(
	313	vm_size_t *totalp,
	314	vm_size_t *freep)
	315	{
	316	vm_size_t pages_total, pages_free;
	317	paging_segment_t ps;
	318	int i;
	319
	320	PSL_LOCK();
	321	pages_total = pages_free = 0;
	322	for (i = 0; i <= paging_segment_max; i++) {
	323	ps = paging_segments[i];
	324	if (ps == PAGING_SEGMENT_NULL)
	325	continue;
	326
	327	/*
	328	* no need to lock: by the time this data
	329	* gets back to any remote requestor it
	330	* will be obsolete anyways
	331	*/
	332	pages_total += ps->ps_pgnum;
	333	pages_free += ps->ps_clcount << ps->ps_clshift;
	334	DEBUG(DEBUG_BS_INTERNAL,
	335	("segment #%d: %d total, %d free\n",
	336	i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
	337	}
	338	*totalp = pages_total;
	339	*freep = pages_free;
	340	if (verbose && user_warned && default_pager_info_verbose) {
	341	if (clusters_available < clusters_committed) {
	342	printf("%s %d clusters committed, %d available.\n",
	343	my_name,
	344	clusters_committed,
	345	clusters_available);
	346	}
	347	}
	348	PSL_UNLOCK();
	349	}
	350
	351	backing_store_t backing_store_alloc(void); /* forward */
	352
	353	backing_store_t
	354	backing_store_alloc(void)
	355	{
	356	backing_store_t bs;
	357
	358	bs = (backing_store_t) kalloc(sizeof (struct backing_store));
	359	if (bs == BACKING_STORE_NULL)
	360	panic("backing_store_alloc: no memory");
	361
	362	BS_LOCK_INIT(bs);
	363	bs->bs_port = MACH_PORT_NULL;
	364	bs->bs_priority = 0;
	365	bs->bs_clsize = 0;
	366	bs->bs_pages_total = 0;
	367	bs->bs_pages_in = 0;
	368	bs->bs_pages_in_fail = 0;
	369	bs->bs_pages_out = 0;
	370	bs->bs_pages_out_fail = 0;
	371
	372	return bs;
	373	}
	374
	375	backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
	376
	377	/* Even in both the component space and external versions of this pager, */
	378	/* backing_store_lookup will be called from tasks in the application space */
	379	backing_store_t
	380	backing_store_lookup(
	381	MACH_PORT_FACE port)
	382	{
	383	backing_store_t bs;
	384
	385	/*
	386	port is currently backed with a vs structure in the alias field
	387	we could create an ISBS alias and a port_is_bs call but frankly
	388	I see no reason for the test, the bs->port == port check below
	389	will work properly on junk entries.
	390
	391	if ((port == MACH_PORT_NULL) \|\| port_is_vs(port))
	392	*/
	393	if ((port == MACH_PORT_NULL))
	394	return BACKING_STORE_NULL;
	395
	396	BSL_LOCK();
	397	queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
	398	bs_links) {
	399	BS_LOCK(bs);
	400	if (bs->bs_port == port) {
	401	BSL_UNLOCK();
	402	/* Success, return it locked. */
	403	return bs;
	404	}
	405	BS_UNLOCK(bs);
	406	}
	407	BSL_UNLOCK();
	408	return BACKING_STORE_NULL;
	409	}
	410
	411	void backing_store_add(backing_store_t); /* forward */
	412
	413	void
	414	backing_store_add(
	415	backing_store_t bs)
	416	{
	417	MACH_PORT_FACE port = bs->bs_port;
	418	MACH_PORT_FACE pset = default_pager_default_set;
	419	kern_return_t kr = KERN_SUCCESS;
	420
	421	if (kr != KERN_SUCCESS)
	422	panic("backing_store_add: add to set");
	423
	424	}
	425
	426	/*
	427	* Set up default page shift, but only if not already
	428	* set and argument is within range.
	429	*/
	430	boolean_t
	431	bs_set_default_clsize(unsigned int npages)
	432	{
	433	switch(npages){
	434	case 1:
	435	case 2:
	436	case 4:
	437	case 8:
	438	if (default_pager_clsize == 0) /* if not yet set */
	439	vstruct_def_clshift = local_log2(npages);
	440	return(TRUE);
	441	}
	442	return(FALSE);
	443	}
	444
	445	int bs_get_global_clsize(int clsize); /* forward */
	446
	447	int
	448	bs_get_global_clsize(
	449	int clsize)
	450	{
	451	int i;
	452	memory_object_default_t dmm;
	453	kern_return_t kr;
	454
	455	/*
	456	* Only allow setting of cluster size once. If called
	457	* with no cluster size (default), we use the compiled-in default
	458	* for the duration. The same cluster size is used for all
	459	* paging segments.
	460	*/
	461	if (default_pager_clsize == 0) {
	462	/*
	463	* Keep cluster size in bit shift because it's quicker
	464	* arithmetic, and easier to keep at a power of 2.
	465	*/
	466	if (clsize != NO_CLSIZE) {
	467	for (i = 0; (1 << i) < clsize; i++);
	468	if (i > MAX_CLUSTER_SHIFT)
	469	i = MAX_CLUSTER_SHIFT;
	470	vstruct_def_clshift = i;
	471	}
	472	default_pager_clsize = (1 << vstruct_def_clshift);
	473
	474	/*
	475	* Let the user know the new (and definitive) cluster size.
	476	*/
	477	if (verbose)
	478	printf("%scluster size = %d page%s\n",
	479	my_name, default_pager_clsize,
	480	(default_pager_clsize == 1) ? "" : "s");
	481
	482	/*
	483	* Let the kernel know too, in case it hasn't used the
	484	* default value provided in main() yet.
	485	*/
	486	dmm = default_pager_object;
	487	clsize = default_pager_clsize * vm_page_size; /* in bytes */
	488	kr = host_default_memory_manager(host_priv_self(),
	489	&dmm,
	490	clsize);
	491	memory_object_default_deallocate(dmm);
	492
	493	if (kr != KERN_SUCCESS) {
	494	panic("bs_get_global_cl_size:host_default_memory_manager");
	495	}
	496	if (dmm != default_pager_object) {
	497	panic("bs_get_global_cl_size:there is another default pager");
	498	}
	499	}
	500	ASSERT(default_pager_clsize > 0 &&
	501	(default_pager_clsize & (default_pager_clsize - 1)) == 0);
	502
	503	return default_pager_clsize;
	504	}
	505
	506	kern_return_t
	507	default_pager_backing_store_create(
	508	memory_object_default_t pager,
	509	int priority,
	510	int clsize, /* in bytes */
	511	MACH_PORT_FACE *backing_store)
	512	{
	513	backing_store_t bs;
	514	MACH_PORT_FACE port;
	515	kern_return_t kr;
	516	struct vstruct_alias *alias_struct;
	517
	518	if (pager != default_pager_object)
	519	return KERN_INVALID_ARGUMENT;
	520
	521	bs = backing_store_alloc();
	522	port = ipc_port_alloc_kernel();
	523	ipc_port_make_send(port);
	524	assert (port != IP_NULL);
	525
	526	DEBUG(DEBUG_BS_EXTERNAL,
	527	("priority=%d clsize=%d bs_port=0x%x\n",
	528	priority, clsize, (int) backing_store));
	529
	530	alias_struct = (struct vstruct_alias *)
	531	kalloc(sizeof (struct vstruct_alias));
	532	if(alias_struct != NULL) {
	533	alias_struct->vs = (struct vstruct *)bs;
	534	alias_struct->name = ISVS;
	535	port->alias = (int) alias_struct;
	536	}
	537	else {
	538	ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
	539	kfree((vm_offset_t)bs, sizeof (struct backing_store));
	540	return KERN_RESOURCE_SHORTAGE;
	541	}
	542
	543	bs->bs_port = port;
	544	if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
	545	priority = BS_MAXPRI;
	546	else if (priority == BS_NOPRI)
	547	priority = BS_MAXPRI;
	548	else
	549	priority = BS_MINPRI;
	550	bs->bs_priority = priority;
	551
	552	bs->bs_clsize = bs_get_global_clsize(atop(clsize));
	553
	554	BSL_LOCK();
	555	queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
	556	bs_links);
	557	BSL_UNLOCK();
	558
	559	backing_store_add(bs);
	560
	561	*backing_store = port;
	562	return KERN_SUCCESS;
	563	}
	564
	565	kern_return_t
	566	default_pager_backing_store_info(
	567	MACH_PORT_FACE backing_store,
	568	backing_store_flavor_t flavour,
	569	backing_store_info_t info,
	570	mach_msg_type_number_t *size)
	571	{
	572	backing_store_t bs;
	573	backing_store_basic_info_t basic;
	574	int i;
	575	paging_segment_t ps;
	576
	577	if (flavour != BACKING_STORE_BASIC_INFO \|\|
	578	*size < BACKING_STORE_BASIC_INFO_COUNT)
	579	return KERN_INVALID_ARGUMENT;
	580
	581	basic = (backing_store_basic_info_t)info;
	582	*size = BACKING_STORE_BASIC_INFO_COUNT;
	583
	584	VSTATS_LOCK(&global_stats.gs_lock);
	585	basic->pageout_calls = global_stats.gs_pageout_calls;
	586	basic->pagein_calls = global_stats.gs_pagein_calls;
	587	basic->pages_in = global_stats.gs_pages_in;
	588	basic->pages_out = global_stats.gs_pages_out;
	589	basic->pages_unavail = global_stats.gs_pages_unavail;
	590	basic->pages_init = global_stats.gs_pages_init;
	591	basic->pages_init_writes= global_stats.gs_pages_init_writes;
	592	VSTATS_UNLOCK(&global_stats.gs_lock);
	593
	594	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
	595	return KERN_INVALID_ARGUMENT;
	596
	597	basic->bs_pages_total = bs->bs_pages_total;
	598	PSL_LOCK();
	599	bs->bs_pages_free = 0;
	600	for (i = 0; i <= paging_segment_max; i++) {
	601	ps = paging_segments[i];
	602	if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
	603	PS_LOCK(ps);
	604	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
	605	PS_UNLOCK(ps);
	606	}
	607	}
	608	PSL_UNLOCK();
	609	basic->bs_pages_free = bs->bs_pages_free;
	610	basic->bs_pages_in = bs->bs_pages_in;
	611	basic->bs_pages_in_fail = bs->bs_pages_in_fail;
	612	basic->bs_pages_out = bs->bs_pages_out;
	613	basic->bs_pages_out_fail= bs->bs_pages_out_fail;
	614
	615	basic->bs_priority = bs->bs_priority;
	616	basic->bs_clsize = ptoa(bs->bs_clsize); /* in bytes */
	617
	618	BS_UNLOCK(bs);
	619
	620	return KERN_SUCCESS;
	621	}
	622
		/*
		 * ps_delete:
		 *
		 * Walk the list of vstructs and, for each one, call
		 * ps_vstruct_transfer_from_segment() to move its data off
		 * the paging segment "ps", using a freshly allocated
		 * VM_SUPER_CLUSTER-sized object and UPL as the transfer buffer.
		 *
		 * Returns KERN_SUCCESS when the list is empty or every transfer
		 * succeeded.  Returns KERN_FAILURE when dp_pages_free is below
		 * cluster_transfer_minimum, when the UPL request fails, or when a
		 * transfer reports an error.
		 *
		 * NOTE(review): declared to return int but returns kern_return_t
		 * values throughout.
		 */
	623	int ps_delete(paging_segment_t); /* forward */
	624
	625	int
	626	ps_delete(
	627	paging_segment_t ps)
	628	{
	629	vstruct_t vs;
	630	kern_return_t error = KERN_SUCCESS;
	631	int vs_count;
	632
	633	VSL_LOCK(); /* get the lock on the list of vs's */
	634
	635	/* The lock relationship and sequence is fairly complicated */
	636	/* this code looks at a live list, locking and unlocking the list */
	637	/* as it traverses it. It depends on the locking behavior of */
	638	/* default_pager_no_senders. no_senders always locks the vstruct */
	639	/* targeted for removal before locking the vstruct list. However */
	640	/* it will remove that member of the list without locking its */
	641	/* neighbors. We can be sure when we hold a lock on a vstruct */
	642	/* it cannot be removed from the list but we must hold the list */
	643	/* lock to be sure that its pointers to its neighbors are valid. */
	644	/* Also, we can hold off destruction of a vstruct when the list */
	645	/* lock and the vs locks are not being held by bumping the */
	646	/* vs_async_pending count. */
	647
	648
		/* Wait until the release trigger is no longer disabled. */
	649	while(backing_store_release_trigger_disable != 0) {
	650	VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
	651	}
	652
	653	/* we will choose instead to hold a send right */
		/* Snapshot the element count; the walk below visits at most this many entries. */
	654	vs_count = vstruct_list.vsl_count;
	655	vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
		/* First element equal to the list head: nothing to transfer. */
	656	if(vs == (vstruct_t)&vstruct_list) {
	657	VSL_UNLOCK();
	658	return KERN_SUCCESS;
	659	}
	660	VS_LOCK(vs);
	661	vs_async_wait(vs); /* wait for any pending async writes */
	662	if ((vs_count != 0) && (vs != NULL))
	663	vs->vs_async_pending += 1; /* hold parties calling */
	664	/* vs_async_wait */
	665	VS_UNLOCK(vs);
	666	VSL_UNLOCK();
	667	while((vs_count != 0) && (vs != NULL)) {
	668	/* We take the count of AMO's before beginning the */
	669	/* transfer of the target segment. */
	670	/* We are guaranteed that the target segment cannot get */
	671	/* more users. We also know that queue entries are */
	672	/* made at the back of the list. If some of the entries */
	673	/* we would check disappear while we are traversing the */
	674	/* list then we will either check new entries which */
	675	/* do not have any backing store in the target segment */
	676	/* or re-check old entries. This might not be optimal */
	677	/* but it will always be correct. The alternative is to */
	678	/* take a snapshot of the list. */
	679	vstruct_t next_vs;
	680
		/* Refuse to start a transfer when free paging space is below the minimum. */
	681	if(dp_pages_free < cluster_transfer_minimum)
	682	error = KERN_FAILURE;
	683	else {
	684	vm_object_t transfer_object;
	685	int count;
	686	upl_t upl;
	687
		/* Scratch object and UPL of VM_SUPER_CLUSTER bytes, released after this vs is processed. */
	688	transfer_object = vm_object_allocate(VM_SUPER_CLUSTER);
	689	count = 0;
	690	error = vm_object_upl_request(transfer_object,
	691	(vm_object_offset_t)0, VM_SUPER_CLUSTER,
	692	&upl, NULL, &count,
	693	UPL_NO_SYNC \| UPL_CLEAN_IN_PLACE
	694	\| UPL_SET_INTERNAL);
	695	if(error == KERN_SUCCESS) {
	696	error = ps_vstruct_transfer_from_segment(
	697	vs, ps, upl);
	698	upl_commit(upl, NULL);
	699	upl_deallocate(upl);
	700	} else {
		/* Any UPL request failure is reported as KERN_FAILURE. */
	701	error = KERN_FAILURE;
	702	}
	703	vm_object_deallocate(transfer_object);
	704	}
		/* Error path: drop the hold taken on vs, wake a waiter if this was the last hold, and fail. */
	705	if(error) {
	706	VS_LOCK(vs);
	707	vs->vs_async_pending -= 1; /* release vs_async_wait */
	708	if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
	709	vs->vs_waiting_async = FALSE;
	710	VS_UNLOCK(vs);
	711	thread_wakeup(&vs->vs_async_pending);
	712	} else {
	713	VS_UNLOCK(vs);
	714	}
	715	return KERN_FAILURE;
	716	}
	717
	718	VSL_LOCK();
	719
	720	while(backing_store_release_trigger_disable != 0) {
	721	VSL_SLEEP(&backing_store_release_trigger_disable,
	722	THREAD_UNINT);
	723	}
	724
		/* Take a hold on the successor (if there is one) before releasing the hold on vs. */
	725	next_vs = (vstruct_t) queue_next(&(vs->vs_links));
	726	if((next_vs != (vstruct_t)&vstruct_list) &&
	727	(vs != next_vs) && (vs_count != 1)) {
	728	VS_LOCK(next_vs);
	729	vs_async_wait(next_vs); /* wait for any */
	730	/* pending async writes */
	731	next_vs->vs_async_pending += 1; /* hold parties */
	732	/* calling vs_async_wait */
	733	VS_UNLOCK(next_vs);
	734	}
	735	VSL_UNLOCK();
	736	VS_LOCK(vs);
	737	vs->vs_async_pending -= 1;
	738	if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
	739	vs->vs_waiting_async = FALSE;
	740	VS_UNLOCK(vs);
	741	thread_wakeup(&vs->vs_async_pending);
	742	} else {
	743	VS_UNLOCK(vs);
	744	}
		/* End of list (or a self-linked entry) terminates the walk. */
	745	if((vs == next_vs) \|\| (next_vs == (vstruct_t)&vstruct_list))
	746	vs = NULL;
	747	else
	748	vs = next_vs;
	749	vs_count--;
	750	}
	751	return KERN_SUCCESS;
	752	}
	753
	754
		/*
		 * default_pager_backing_store_delete:
		 *
		 * Remove a backing store and all of its paging segments.
		 *
		 * Each segment belonging to the store is marked ps_going_away and
		 * handed to ps_delete(); after every successful ps_delete() the
		 * segment table scan restarts from the top.  If any segment cannot
		 * be emptied, every marked segment is re-enabled, dp_pages_free is
		 * restored and the error is returned.  Otherwise the segments, the
		 * port alias, the port and the backing store itself are freed.
		 *
		 * Returns KERN_INVALID_ARGUMENT if the port does not name a backing
		 * store, KERN_FAILURE (or the ps_delete() error) when a segment
		 * could not be removed, KERN_SUCCESS otherwise.
		 */
	755	kern_return_t
	756	default_pager_backing_store_delete(
	757	MACH_PORT_FACE backing_store)
	758	{
	759	backing_store_t bs;
	760	int i;
	761	paging_segment_t ps;
	762	int error;
	763	int interim_pages_removed = 0;
		/* NOTE(review): kr is never used in this function. */
	764	kern_return_t kr;
	765
		/* backing_store_lookup() returns the store locked; every exit below unlocks it. */
	766	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
	767	return KERN_INVALID_ARGUMENT;
	768
	769	#if 0
	770	/* not implemented */
	771	BS_UNLOCK(bs);
	772	return KERN_FAILURE;
	773	#endif
	774
	775	restart:
	776	PSL_LOCK();
	777	error = KERN_SUCCESS;
	778	for (i = 0; i <= paging_segment_max; i++) {
	779	ps = paging_segments[i];
	780	if (ps != PAGING_SEGMENT_NULL &&
	781	ps->ps_bs == bs &&
	782	! ps->ps_going_away) {
	783	PS_LOCK(ps);
	784	/* disable access to this segment */
	785	ps->ps_going_away = TRUE;
	786	PS_UNLOCK(ps);
	787	/*
	788	* The "ps" segment is "off-line" now,
	789	* we can try and delete it...
	790	*/
		/* Need room for this segment's pages plus the transfer minimum elsewhere. */
	791	if(dp_pages_free < (cluster_transfer_minimum
	792	+ ps->ps_pgcount)) {
	793	error = KERN_FAILURE;
	794	PSL_UNLOCK();
	795	}
	796	else {
	797	/* remove all pages associated with the */
	798	/* segment from the list of free pages */
	799	/* when transfer is through, all target */
	800	/* segment pages will appear to be free */
	801
	802	dp_pages_free -= ps->ps_pgcount;
	803	interim_pages_removed += ps->ps_pgcount;
		/* The list lock is dropped across ps_delete(). */
	804	PSL_UNLOCK();
	805	error = ps_delete(ps);
	806	}
	807	if (error != KERN_SUCCESS) {
	808	/*
	809	* We couldn't delete the segment,
	810	* probably because there's not enough
	811	* virtual memory left.
	812	* Re-enable all the segments.
	813	*/
		/* Both branches above released the list lock; retake it for the rollback below. */
	814	PSL_LOCK();
	815	break;
	816	}
		/* The lock was dropped, so rescan the table from the beginning. */
	817	goto restart;
	818	}
	819	}
	820
		/* Rollback: executed with the paging segment list lock held. */
	821	if (error != KERN_SUCCESS) {
	822	for (i = 0; i <= paging_segment_max; i++) {
	823	ps = paging_segments[i];
	824	if (ps != PAGING_SEGMENT_NULL &&
	825	ps->ps_bs == bs &&
	826	ps->ps_going_away) {
	827	PS_LOCK(ps);
	828	/* re-enable access to this segment */
	829	ps->ps_going_away = FALSE;
	830	PS_UNLOCK(ps);
	831	}
	832	}
	833	dp_pages_free += interim_pages_removed;
	834	PSL_UNLOCK();
	835	BS_UNLOCK(bs);
	836	return error;
	837	}
	838
		/* Every segment of this store has been emptied: release the segment structures. */
	839	for (i = 0; i <= paging_segment_max; i++) {
	840	ps = paging_segments[i];
	841	if (ps != PAGING_SEGMENT_NULL &&
	842	ps->ps_bs == bs) {
	843	if(ps->ps_going_away) {
	844	paging_segments[i] = PAGING_SEGMENT_NULL;
	845	paging_segment_count--;
		/* NOTE(review): the segment lock is taken and the structure is then freed without a matching unlock. */
	846	PS_LOCK(ps);
	847	kfree((vm_offset_t)ps->ps_bmap,
	848	RMAPSIZE(ps->ps_ncls));
	849	kfree((vm_offset_t)ps, sizeof *ps);
	850	}
	851	}
	852	}
	853
	854	/* Scan the entire ps array separately to make certain we find the */
	855	/* proper paging_segment_max */
		/* NOTE(review): if no slot remains occupied, paging_segment_max keeps its previous value. */
	856	for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
	857	if(paging_segments[i] != PAGING_SEGMENT_NULL)
	858	paging_segment_max = i;
	859	}
	860
	861	PSL_UNLOCK();
	862
	863	/*
	864	* All the segments have been deleted.
	865	* We can remove the backing store.
	866	*/
	867
	868	/*
	869	* Disable lookups of this backing store.
	870	*/
		/* Free the alias structure attached to the port in default_pager_backing_store_create(). */
	871	if((void *)bs->bs_port->alias != NULL)
	872	kfree((vm_offset_t) bs->bs_port->alias,
	873	sizeof (struct vstruct_alias));
	874	ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
	875	bs->bs_port = MACH_PORT_NULL;
	876	BS_UNLOCK(bs);
	877
	878	/*
	879	* Remove backing store from backing_store list.
	880	*/
	881	BSL_LOCK();
	882	queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
	883	bs_links);
	884	BSL_UNLOCK();
	885
	886	/*
	887	* Free the backing store structure.
	888	*/
	889	kfree((vm_offset_t)bs, sizeof *bs);
	890
	891	return KERN_SUCCESS;
	892	}
	893
	894	int ps_enter(paging_segment_t); /* forward */
	895
	896	int
	897	ps_enter(
	898	paging_segment_t ps)
	899	{
	900	int i;
	901
	902	PSL_LOCK();
	903
	904	for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
	905	if (paging_segments[i] == PAGING_SEGMENT_NULL)
	906	break;
	907	}
	908
	909	if (i < MAX_NUM_PAGING_SEGMENTS) {
	910	paging_segments[i] = ps;
	911	if (i > paging_segment_max)
	912	paging_segment_max = i;
	913	paging_segment_count++;
	914	if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) \|\|
	915	(ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
	916	ps_select_array[ps->ps_bs->bs_priority] = 0;
	917	i = 0;
	918	} else {
	919	PSL_UNLOCK();
	920	return KERN_RESOURCE_SHORTAGE;
	921	}
	922
	923	PSL_UNLOCK();
	924	return i;
	925	}
	926
	927	#ifdef DEVICE_PAGING
	928	kern_return_t
	929	default_pager_add_segment(
	930	MACH_PORT_FACE backing_store,
	931	MACH_PORT_FACE device,
	932	recnum_t offset,
	933	recnum_t count,
	934	int record_size)
	935	{
	936	backing_store_t bs;
	937	paging_segment_t ps;
	938	int i;
	939	int error;
	940
	941	if ((bs = backing_store_lookup(backing_store))
	942	== BACKING_STORE_NULL)
	943	return KERN_INVALID_ARGUMENT;
	944
	945	PSL_LOCK();
	946	for (i = 0; i <= paging_segment_max; i++) {
	947	ps = paging_segments[i];
	948	if (ps == PAGING_SEGMENT_NULL)
	949	continue;
	950
	951	/*
	952	* Check for overlap on same device.
	953	*/
	954	if (!(ps->ps_device != device
	955	\|\| offset >= ps->ps_offset + ps->ps_recnum
	956	\|\| offset + count <= ps->ps_offset)) {
	957	PSL_UNLOCK();
	958	BS_UNLOCK(bs);
	959	return KERN_INVALID_ARGUMENT;
	960	}
	961	}
	962	PSL_UNLOCK();
	963
	964	/*
	965	* Set up the paging segment
	966	*/
	967	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
	968	if (ps == PAGING_SEGMENT_NULL) {
	969	BS_UNLOCK(bs);
	970	return KERN_RESOURCE_SHORTAGE;
	971	}
	972
	973	ps->ps_segtype = PS_PARTITION;
	974	ps->ps_device = device;
	975	ps->ps_offset = offset;
	976	ps->ps_record_shift = local_log2(vm_page_size / record_size);
	977	ps->ps_recnum = count;
	978	ps->ps_pgnum = count >> ps->ps_record_shift;
	979
	980	ps->ps_pgcount = ps->ps_pgnum;
	981	ps->ps_clshift = local_log2(bs->bs_clsize);
	982	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
	983	ps->ps_hint = 0;
	984
	985	PS_LOCK_INIT(ps);
	986	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
	987	if (!ps->ps_bmap) {
	988	kfree((vm_offset_t)ps, sizeof *ps);
	989	BS_UNLOCK(bs);
	990	return KERN_RESOURCE_SHORTAGE;
	991	}
	992	for (i = 0; i < ps->ps_ncls; i++) {
	993	clrbit(ps->ps_bmap, i);
	994	}
	995
	996	ps->ps_going_away = FALSE;
	997	ps->ps_bs = bs;
	998
	999	if ((error = ps_enter(ps)) != 0) {
	1000	kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
	1001	kfree((vm_offset_t)ps, sizeof *ps);
	1002	BS_UNLOCK(bs);
	1003	return KERN_RESOURCE_SHORTAGE;
	1004	}
	1005
	1006	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
	1007	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
	1008	BS_UNLOCK(bs);
	1009
	1010	PSL_LOCK();
	1011	dp_pages_free += ps->ps_pgcount;
	1012	PSL_UNLOCK();
	1013
	1014	bs_more_space(ps->ps_clcount);
	1015
	1016	DEBUG(DEBUG_BS_INTERNAL,
	1017	("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
	1018	device, offset, count, record_size,
	1019	ps->ps_record_shift, ps->ps_pgnum));
	1020
	1021	return KERN_SUCCESS;
	1022	}
	1023
	1024	boolean_t
	1025	bs_add_device(
	1026	char *dev_name,
	1027	MACH_PORT_FACE master)
	1028	{
	1029	security_token_t null_security_token = {
	1030	{ 0, 0 }
	1031	};
	1032	MACH_PORT_FACE device;
	1033	int info[DEV_GET_SIZE_COUNT];
	1034	mach_msg_type_number_t info_count;
	1035	MACH_PORT_FACE bs = MACH_PORT_NULL;
	1036	unsigned int rec_size;
	1037	recnum_t count;
	1038	int clsize;
	1039	MACH_PORT_FACE reply_port;
	1040
	1041	if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ \| D_WRITE,
	1042	null_security_token, dev_name, &device))
	1043	return FALSE;
	1044
	1045	info_count = DEV_GET_SIZE_COUNT;
	1046	if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
	1047	rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
	1048	count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
	1049	clsize = bs_get_global_clsize(0);
	1050	if (!default_pager_backing_store_create(
	1051	default_pager_object,
	1052	DEFAULT_PAGER_BACKING_STORE_MAXPRI,
	1053	(clsize * vm_page_size),
	1054	&bs)) {
	1055	if (!default_pager_add_segment(bs, device,
	1056	0, count, rec_size)) {
	1057	return TRUE;
	1058	}
	1059	ipc_port_release_receive(bs);
	1060	}
	1061	}
	1062
	1063	ipc_port_release_send(device);
	1064	return FALSE;
	1065	}
	1066	#endif /* DEVICE_PAGING */
	1067
	1068	#if VS_ASYNC_REUSE
	1069
		/*
		 * vs_alloc_async (VS_ASYNC_REUSE variant)
		 *
		 * Hand out a vs_async record.  If vs_async_free_list is non-empty the
		 * head of the list is unlinked and returned.  Otherwise a new record
		 * is kalloc'ed together with a kernel reply port and a vstruct_alias;
		 * the port's alias field points at the alias structure, whose vs
		 * field points back at the new record.
		 *
		 * Returns the record, or NULL when either kalloc fails.
		 *
		 * NOTE(review): the result of ipc_port_alloc_kernel() is not checked
		 * before it is dereferenced, and the local 'kr' is never used.
		 */
	1070	struct vs_async *
	1071	vs_alloc_async(void)
	1072	{
	1073	struct vs_async *vsa;
	1074	MACH_PORT_FACE reply_port;
	1075	kern_return_t kr;
	1076
	1077	VS_ASYNC_LOCK();
	1078	if (vs_async_free_list == NULL) {
		/* Nothing to recycle: drop the lock before allocating. */
	1079	VS_ASYNC_UNLOCK();
	1080	vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
	1081	if (vsa != NULL) {
	1082	/*
	1083	* Try allocating a reply port named after the
	1084	* address of the vs_async structure.
	1085	*/
	1086	struct vstruct_alias *alias_struct;
	1087
	1088	reply_port = ipc_port_alloc_kernel();
	1089	alias_struct = (struct vstruct_alias *)
	1090	kalloc(sizeof (struct vstruct_alias));
	1091	if(alias_struct != NULL) {
	1092	alias_struct->vs = (struct vstruct *)vsa;
	1093	alias_struct->name = ISVS;
	1094	reply_port->alias = (int) alias_struct;
	1095	vsa->reply_port = reply_port;
	1096	vs_alloc_async_count++;
	1097	}
	1098	else {
		/* Alias allocation failed: undo the port and the record. */
	1099	vs_alloc_async_failed++;
	1100	ipc_port_dealloc_kernel((MACH_PORT_FACE)
	1101	(reply_port));
	1102	kfree((vm_offset_t)vsa,
	1103	sizeof (struct vs_async));
	1104	vsa = NULL;
	1105	}
	1106	}
	1107	} else {
		/* Pop the head of the free list while still holding the lock. */
	1108	vsa = vs_async_free_list;
	1109	vs_async_free_list = vs_async_free_list->vsa_next;
	1110	VS_ASYNC_UNLOCK();
	1111	}
	1112
	1113	return vsa;
	1114	}
	1115
		/*
		 * vs_free_async (VS_ASYNC_REUSE variant)
		 *
		 * Return a vs_async record to the head of vs_async_free_list under
		 * the async lock.  Nothing is deallocated here; the record is kept
		 * on the list for a later vs_alloc_async().
		 */
	1116	void
	1117	vs_free_async(
	1118	struct vs_async *vsa)
	1119	{
	1120	VS_ASYNC_LOCK();
	1121	vsa->vsa_next = vs_async_free_list;
	1122	vs_async_free_list = vsa;
	1123	VS_ASYNC_UNLOCK();
	1124	}
	1125
	1126	#else /* VS_ASYNC_REUSE */
	1127
	1128	struct vs_async *
	1129	vs_alloc_async(void)
	1130	{
	1131	struct vs_async *vsa;
	1132	MACH_PORT_FACE reply_port;
	1133	kern_return_t kr;
	1134
	1135	vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
	1136	if (vsa != NULL) {
	1137	/*
	1138	* Try allocating a reply port named after the
	1139	* address of the vs_async structure.
	1140	*/
	1141	reply_port = ipc_port_alloc_kernel();
	1142	alias_struct = (vstruct_alias *)
	1143	kalloc(sizeof (struct vstruct_alias));
	1144	if(alias_struct != NULL) {
	1145	alias_struct->vs = reply_port;
	1146	alias_struct->name = ISVS;
	1147	reply_port->alias = (int) vsa;
	1148	vsa->reply_port = reply_port;
	1149	vs_alloc_async_count++;
	1150	}
	1151	else {
	1152	vs_alloc_async_failed++;
	1153	ipc_port_dealloc_kernel((MACH_PORT_FACE)
	1154	(reply_port));
	1155	kfree((vm_offset_t) vsa,
	1156	sizeof (struct vs_async));
	1157	vsa = NULL;
	1158	}
	1159	}
	1160
	1161	return vsa;
	1162	}
	1163
	1164	void
	1165	vs_free_async(
	1166	struct vs_async *vsa)
	1167	{
	1168	MACH_PORT_FACE reply_port;
	1169	kern_return_t kr;
	1170
	1171	reply_port = vsa->reply_port;
	1172	kfree((vm_offset_t) reply_port->alias, sizeof (struct vstuct_alias));
	1173	kfree((vm_offset_t) vsa, sizeof (struct vs_async));
	1174	ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
	1175	#if 0
	1176	VS_ASYNC_LOCK();
	1177	vs_alloc_async_count--;
	1178	VS_ASYNC_UNLOCK();
	1179	#endif
	1180	}
	1181
	1182	#endif /* VS_ASYNC_REUSE */
	1183
	1184	zone_t vstruct_zone;
	1185
	1186	vstruct_t
	1187	ps_vstruct_create(
	1188	vm_size_t size)
	1189	{
	1190	vstruct_t vs;
	1191	int i;
	1192
	1193	vs = (vstruct_t) zalloc(vstruct_zone);
	1194	if (vs == VSTRUCT_NULL) {
	1195	return VSTRUCT_NULL;
	1196	}
	1197
	1198	VS_LOCK_INIT(vs);
	1199
	1200	/*
	1201	* The following fields will be provided later.
	1202	*/
	1203	vs->vs_mem_obj = NULL;
	1204	vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
	1205	vs->vs_references = 1;
	1206	vs->vs_seqno = 0;
	1207
	1208	#ifdef MACH_KERNEL
	1209	vs->vs_waiting_seqno = FALSE;
	1210	vs->vs_waiting_read = FALSE;
	1211	vs->vs_waiting_write = FALSE;
	1212	vs->vs_waiting_async = FALSE;
	1213	#else
	1214	mutex_init(&vs->vs_waiting_seqno, ETAP_DPAGE_VSSEQNO);
	1215	mutex_init(&vs->vs_waiting_read, ETAP_DPAGE_VSREAD);
	1216	mutex_init(&vs->vs_waiting_write, ETAP_DPAGE_VSWRITE);
	1217	mutex_init(&vs->vs_waiting_refs, ETAP_DPAGE_VSREFS);
	1218	mutex_init(&vs->vs_waiting_async, ETAP_DPAGE_VSASYNC);
	1219	#endif
	1220
	1221	vs->vs_readers = 0;
	1222	vs->vs_writers = 0;
	1223
	1224	vs->vs_errors = 0;
	1225
	1226	vs->vs_clshift = local_log2(bs_get_global_clsize(0));
	1227	vs->vs_size = ((atop(round_page(size)) - 1) >> vs->vs_clshift) + 1;
	1228	vs->vs_async_pending = 0;
	1229
	1230	/*
	1231	* Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
	1232	* depending on the size of the memory object.
	1233	*/
	1234	if (INDIRECT_CLMAP(vs->vs_size)) {
	1235	vs->vs_imap = (struct vs_map **)
	1236	kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
	1237	vs->vs_indirect = TRUE;
	1238	} else {
	1239	vs->vs_dmap = (struct vs_map *)
	1240	kalloc(CLMAP_SIZE(vs->vs_size));
	1241	vs->vs_indirect = FALSE;
	1242	}
	1243	vs->vs_xfer_pending = FALSE;
	1244	DEBUG(DEBUG_VS_INTERNAL,
	1245	("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
	1246
	1247	/*
	1248	* Check to see that we got the space.
	1249	*/
	1250	if (!vs->vs_dmap) {
	1251	kfree((vm_offset_t)vs, sizeof *vs);
	1252	return VSTRUCT_NULL;
	1253	}
	1254
	1255	/*
	1256	* Zero the indirect pointers, or clear the direct pointers.
	1257	*/
	1258	if (vs->vs_indirect)
	1259	memset(vs->vs_imap, 0,
	1260	INDIRECT_CLMAP_SIZE(vs->vs_size));
	1261	else
	1262	for (i = 0; i < vs->vs_size; i++)
	1263	VSM_CLR(vs->vs_dmap[i]);
	1264
	1265	VS_MAP_LOCK_INIT(vs);
	1266
	1267	bs_commit(vs->vs_size);
	1268
	1269	return vs;
	1270	}
	1271
	1272	paging_segment_t ps_select_segment(int, int *); /* forward */
	1273
		/*
		 * ps_select_segment
		 *
		 * Choose a paging segment that still has a free cluster and reserve
		 * one cluster on it (ps_clcount is decremented and dp_pages_free is
		 * reduced by the segment's cluster size in pages).
		 *
		 *   shift    cluster shift required by the caller; in the
		 *            multi-segment search only segments with
		 *            ps_clshift >= shift are accepted (the single-segment
		 *            path merely ASSERTs this).
		 *   psindex  out: index of the chosen segment in paging_segments[].
		 *
		 * Returns the chosen segment, or PAGING_SEGMENT_NULL when no segment
		 * can supply a cluster.  If the reservation drops dp_pages_free below
		 * minimum_pages_remaining and a min_pages_trigger_port is registered,
		 * the port is consumed and a HI_WAT_ALERT is sent after all locks
		 * have been released.
		 */
	1274	paging_segment_t
	1275	ps_select_segment(
	1276	int shift,
	1277	int *psindex)
	1278	{
	1279	paging_segment_t ps;
	1280	int i;
	1281	int j;
	1282
	1283	/*
	1284	* Optimize case where there's only one segment.
	1285	* paging_segment_max will index the one and only segment.
	1286	*/
	1287
	1288	PSL_LOCK();
	1289	if (paging_segment_count == 1) {
	1290	paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
	1291	ipc_port_t trigger = IP_NULL;
	1292
	1293	ps = paging_segments[paging_segment_max];
	1294	*psindex = paging_segment_max;
	1295	PS_LOCK(ps);
	1296	if (ps->ps_going_away) {
	1297	/* this segment is being turned off */
	1298	lps = PAGING_SEGMENT_NULL;
	1299	} else {
	1300	ASSERT(ps->ps_clshift >= shift);
	1301	if (ps->ps_clcount) {
	1302	ps->ps_clcount--;
	1303	dp_pages_free -= 1 << ps->ps_clshift;
	1304	if(min_pages_trigger_port &&
	1305	(dp_pages_free < minimum_pages_remaining)) {
	1306	trigger = min_pages_trigger_port;
	1307	min_pages_trigger_port = NULL;
	1308	bs_low = TRUE;
	1309	}
	1310	lps = ps;
	1311	} else
	1312	lps = PAGING_SEGMENT_NULL;
	1313	}
	1314	PS_UNLOCK(ps);
	1315	PSL_UNLOCK();
	1316
	1317	if (trigger != IP_NULL) {
	1318	default_pager_space_alert(trigger, HI_WAT_ALERT);
	1319	ipc_port_release_send(trigger);
	1320	}
	1321	return lps;
	1322	}
	1323
	1324	if (paging_segment_count == 0) {
	1325	PSL_UNLOCK();
	1326	return PAGING_SEGMENT_NULL;
	1327	}
	1328
		/* Several segments: walk priorities from highest to lowest. */
	1329	for (i = BS_MAXPRI;
	1330	i >= BS_MINPRI; i--) {
	1331	int start_index;
	1332
	1333	if ((ps_select_array[i] == BS_NOPRI) ||
	1334	(ps_select_array[i] == BS_FULLPRI))
	1335	continue;
	1336	start_index = ps_select_array[i];
	1337
		/*
		 * Decide where the scan starts: stay on the last-used segment
		 * until physical_transfer_cluster_count reaches the ALLOC_STRIDE
		 * derived limit, then move on to the next segment.
		 */
	1338	if(!(paging_segments[start_index])) {
	1339	j = start_index+1;
	1340	physical_transfer_cluster_count = 0;
	1341	}
	1342	else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
	1343	(((paging_segments[start_index])->ps_clshift)
	1344	+ vm_page_shift))) {
	1345	physical_transfer_cluster_count = 0;
	1346	j = start_index + 1;
	1347	} else {
	1348	physical_transfer_cluster_count+=1;
	1349	j = start_index;
	1350	if(start_index == 0)
	1351	start_index = paging_segment_max;
	1352	else
	1353	start_index = start_index - 1;
	1354	}
	1355
	1356	while (1) {
	1357	if (j > paging_segment_max)
	1358	j = 0;
	1359	if ((ps = paging_segments[j]) &&
	1360	(ps->ps_bs->bs_priority == i)) {
	1361	/*
	1362	* Force the ps cluster size to be
	1363	* >= that of the vstruct.
	1364	*/
	1365	PS_LOCK(ps);
	1366	if (ps->ps_going_away) {
	1367	/* this segment is being turned off */
	1368	} else if ((ps->ps_clcount) &&
	1369	(ps->ps_clshift >= shift)) {
	1370	ipc_port_t trigger = IP_NULL;
	1371
	1372	ps->ps_clcount--;
	1373	dp_pages_free -= 1 << ps->ps_clshift;
	1374	if(min_pages_trigger_port &&
	1375	(dp_pages_free <
	1376	minimum_pages_remaining)) {
	1377	trigger = min_pages_trigger_port;
	1378	min_pages_trigger_port = NULL;
	1379	}
	1380	PS_UNLOCK(ps);
	1381	/*
	1382	* found one, quit looking.
	1383	*/
	1384	ps_select_array[i] = j;
	1385	PSL_UNLOCK();
	1386
	1387	if (trigger != IP_NULL) {
	1388	default_pager_space_alert(
	1389	trigger,
	1390	HI_WAT_ALERT);
	1391	ipc_port_release_send(trigger);
	1392	}
	1393	*psindex = j;
	1394	return ps;
	1395	}
	1396	PS_UNLOCK(ps);
	1397	}
	1398	if (j == start_index) {
	1399	/*
	1400	* none at this priority -- mark it full
	1401	*/
	1402	ps_select_array[i] = BS_FULLPRI;
	1403	break;
	1404	}
	1405	j++;
	1406	}
	1407	}
	1408	PSL_UNLOCK();
	1409	return PAGING_SEGMENT_NULL;
	1410	}
	1411
	1412	vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
	1413
		/*
		 * ps_allocate_cluster
		 *
		 * Reserve one cluster on a paging segment and mark it used in the
		 * segment's bitmap.
		 *
		 *   vs       vstruct the cluster is for; its vs_clshift is passed to
		 *            ps_select_segment() when no segment is forced.
		 *   psindex  out: index of the segment picked by ps_select_segment().
		 *            It is not written when use_ps is supplied.
		 *   use_ps   segment to allocate from, or PAGING_SEGMENT_NULL to let
		 *            ps_select_segment() choose.
		 *
		 * Returns the cluster number within the segment, or (vm_offset_t) -1
		 * when no segment has space; in that case dp_pages_free is reset to
		 * zero.  A registered min_pages_trigger_port is consumed and sent a
		 * HI_WAT_ALERT when the pager runs out of space or, on the use_ps
		 * path, when dp_pages_free drops below minimum_pages_remaining.
		 */
	1414	vm_offset_t
	1415	ps_allocate_cluster(
	1416	vstruct_t vs,
	1417	int *psindex,
	1418	paging_segment_t use_ps)
	1419	{
	1420	int byte_num;
	1421	int bit_num = 0;
	1422	paging_segment_t ps;
	1423	vm_offset_t cluster;
	1424	ipc_port_t trigger = IP_NULL;
	1425
	1426	/*
	1427	* Find best paging segment.
	1428	* ps_select_segment will decrement cluster count on ps.
	1429	* Must pass cluster shift to find the most appropriate segment.
	1430	*/
	1431	/* NOTE: The addition of paging segment delete capability threatened
	1432	* to seriously complicate the treatment of paging segments in this
	1433	* module and the ones that call it (notably ps_clmap), because of the
	1434	* difficulty in assuring that the paging segment would continue to
	1435	* exist between being unlocked and locked. This was
	1436	* avoided because all calls to this module are based in either
	1437	* dp_memory_object calls which rely on the vs lock, or by
	1438	* the transfer function which is part of the segment delete path.
	1439	* The transfer function which is part of paging segment delete is
	1440	* protected from multiple callers by the backing store lock.
	1441	* The paging segment delete function treats mappings to a paging
	1442	* segment on a vstruct by vstruct basis, locking the vstruct targeted
	1443	* while data is transferred to the remaining segments. This is in
	1444	* line with the view that incomplete or in-transition mappings between
	1445	* data, a vstruct, and backing store are protected by the vs lock.
	1446	* This and the ordering of the paging segment "going_away" bit setting
	1447	* protects us.
	1448	*/
	1449	if (use_ps != PAGING_SEGMENT_NULL) {
		/* Caller forced a segment: do the reservation bookkeeping here. */
	1450	ps = use_ps;
	1451	PSL_LOCK();
	1452	PS_LOCK(ps);
	1453	ps->ps_clcount--;
	1454	dp_pages_free -= 1 << ps->ps_clshift;
	1455	if(min_pages_trigger_port &&
	1456	(dp_pages_free < minimum_pages_remaining)) {
	1457	trigger = min_pages_trigger_port;
	1458	min_pages_trigger_port = NULL;
	1459	}
	1460	PSL_UNLOCK();
	1461	PS_UNLOCK(ps);
	1462	if (trigger != IP_NULL) {
	1463	default_pager_space_alert(trigger, HI_WAT_ALERT);
	1464	ipc_port_release_send(trigger);
	1465	}
	1466
	1467	} else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
	1468	PAGING_SEGMENT_NULL) {
	1469	#if 0
	1470	bs_no_paging_space(TRUE);
	1471	#endif
	1472	#if 0
	1473	if (verbose)
	1474	#endif
	1475	dprintf(("no space in available paging segments; "
	1476	"swapon suggested\n"));
	1477	/* the count got off maybe, reset to zero */
	1478	PSL_LOCK();
	1479	dp_pages_free = 0;
	1480	if(min_pages_trigger_port) {
	1481	trigger = min_pages_trigger_port;
	1482	min_pages_trigger_port = NULL;
	1483	bs_low = TRUE;
	1484	}
	1485	PSL_UNLOCK();
	1486	if (trigger != IP_NULL) {
	1487	default_pager_space_alert(trigger, HI_WAT_ALERT);
	1488	ipc_port_release_send(trigger);
	1489	}
	1490	return (vm_offset_t) -1;
	1491	}
	1492	ASSERT(ps->ps_clcount != 0);
	1493
	1494	/*
	1495	* Look for an available cluster. At the end of the loop,
	1496	* byte_num is the byte offset and bit_num is the bit offset of the
	1497	* first zero bit in the paging segment bitmap.
	1498	*/
	1499	PS_LOCK(ps);
	1500	byte_num = ps->ps_hint;
	1501	for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
	1502	if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
	1503	for (bit_num = 0; bit_num < NBBY; bit_num++) {
	1504	if (isclr((ps->ps_bmap + byte_num), bit_num))
	1505	break;
	1506	}
	1507	ASSERT(bit_num != NBBY);
	1508	break;
	1509	}
	1510	}
		/* Remember where the search ended so the next one starts there. */
	1511	ps->ps_hint = byte_num;
	1512	cluster = (byte_num*NBBY) + bit_num;
	1513
	1514	/* Space was reserved, so this must be true */
	1515	ASSERT(cluster < ps->ps_ncls);
	1516
	1517	setbit(ps->ps_bmap, cluster);
	1518	PS_UNLOCK(ps);
	1519
	1520	return cluster;
	1521	}
	1522
	1523	void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
	1524
		/*
		 * ps_deallocate_cluster
		 *
		 * Give one cluster back to paging segment 'ps'.  Panics if 'cluster'
		 * is not below ps->ps_ncls.  The cluster's bitmap bit is cleared,
		 * ps_clcount and dp_pages_free are increased, and the segment's
		 * search hint is lowered if the freed cluster lies before it.  A
		 * priority marked BS_FULLPRI in ps_select_array is reset to 0 so that
		 * it is searched again.
		 *
		 * If dp_pages_free rises above maximum_pages_free while a
		 * max_pages_trigger_port is registered and release triggers are not
		 * disabled, the port is consumed and sent a LO_WAT_ALERT after the
		 * locks are dropped.
		 */
	1525	void
	1526	ps_deallocate_cluster(
	1527	paging_segment_t ps,
	1528	vm_offset_t cluster)
	1529	{
	1530	ipc_port_t trigger = IP_NULL;
	1531
	1532	if (cluster >= (vm_offset_t) ps->ps_ncls)
	1533	panic("ps_deallocate_cluster: Invalid cluster number");
	1534
	1535	/*
	1536	* Lock the paging segment, clear the cluster's bitmap and increment the
	1537	* number of free cluster.
	1538	*/
	1539	PSL_LOCK();
	1540	PS_LOCK(ps);
	1541	clrbit(ps->ps_bmap, cluster);
	1542	++ps->ps_clcount;
	1543	dp_pages_free += 1 << ps->ps_clshift;
	1544	if(max_pages_trigger_port
	1545	&& (backing_store_release_trigger_disable == 0)
	1546	&& (dp_pages_free > maximum_pages_free)) {
	1547	trigger = max_pages_trigger_port;
	1548	max_pages_trigger_port = NULL;
	1549	}
	1550	PSL_UNLOCK();
	1551
	1552	/*
	1553	* Move the hint down to the freed cluster if it is
	1554	* less than the current hint.
	1555	*/
	1556	if ((cluster/NBBY) < ps->ps_hint) {
	1557	ps->ps_hint = (cluster/NBBY);
	1558	}
	1559
	1560	PS_UNLOCK(ps);
	1561
	1562	/*
	1563	* If we're freeing space on a full priority, reset the array.
	1564	*/
	1565	PSL_LOCK();
	1566	if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
	1567	ps_select_array[ps->ps_bs->bs_priority] = 0;
	1568	PSL_UNLOCK();
	1569
	1570	if (trigger != IP_NULL) {
		/*
		 * If release triggers were disabled in the meantime, block on
		 * backing_store_release_trigger_disable before sending the alert.
		 */
	1571	VSL_LOCK();
	1572	if(backing_store_release_trigger_disable != 0) {
	1573	assert_wait((event_t)
	1574	&backing_store_release_trigger_disable,
	1575	THREAD_UNINT);
	1576	VSL_UNLOCK();
	1577	thread_block(THREAD_CONTINUE_NULL);
	1578	} else {
	1579	VSL_UNLOCK();
	1580	}
	1581	default_pager_space_alert(trigger, LO_WAT_ALERT);
	1582	ipc_port_release_send(trigger);
	1583	}
	1584
	1585	return;
	1586	}
	1587
	1588	void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
	1589
		/*
		 * ps_dealloc_vsmap
		 *
		 * Walk the first 'size' entries of a cluster map and hand every
		 * entry that is neither clear nor in error to
		 * ps_deallocate_cluster().  The map memory itself is not freed and
		 * the entries are not modified.
		 */
	1590	void
	1591	ps_dealloc_vsmap(
	1592	struct vs_map *vsmap,
	1593	vm_size_t size)
	1594	{
	1595	int i;
	1596	for (i = 0; i < size; i++)
	1597	if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
	1598	ps_deallocate_cluster(VSM_PS(vsmap[i]),
	1599	VSM_CLOFF(vsmap[i]));
	1600	}
	1601
		/*
		 * ps_vstruct_dealloc
		 *
		 * Tear down a vstruct: release every cluster recorded in its map
		 * (direct or indirect), free the map memory, subtract vs_size via
		 * bs_commit() and return the vstruct to vstruct_zone.
		 *
		 * NOTE(review): the local 's' is declared but never used.
		 */
	1602	void
	1603	ps_vstruct_dealloc(
	1604	vstruct_t vs)
	1605	{
	1606	int i;
	1607	spl_t s;
	1608
	1609	VS_MAP_LOCK(vs);
	1610
	1611	/*
	1612	* If this is an indirect structure, then we walk through the valid
	1613	* (non-zero) indirect pointers and deallocate the clusters
	1614	* associated with each used map entry (via ps_dealloc_vsmap).
	1615	* When all of the clusters in an indirect block have been
	1616	* freed, we deallocate the block. When all of the indirect
	1617	* blocks have been deallocated we deallocate the memory
	1618	* holding the indirect pointers.
	1619	*/
	1620	if (vs->vs_indirect) {
	1621	for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
	1622	if (vs->vs_imap[i] != NULL) {
	1623	ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
	1624	kfree((vm_offset_t)vs->vs_imap[i],
	1625	CLMAP_THRESHOLD);
	1626	}
	1627	}
	1628	kfree((vm_offset_t)vs->vs_imap,
	1629	INDIRECT_CLMAP_SIZE(vs->vs_size));
	1630	} else {
	1631	/*
	1632	* Direct map. Free used clusters, then memory.
	1633	*/
	1634	ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
	1635	kfree((vm_offset_t)vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
	1636	}
	1637	VS_MAP_UNLOCK(vs);
	1638
		/* Undo the bs_commit() accounting for this object's clusters. */
	1639	bs_commit(- vs->vs_size);
	1640
	1641	zfree(vstruct_zone, (vm_offset_t)vs);
	1642	}
	1643
	1644	int ps_map_extend(vstruct_t, int); /* forward */
	1645
		/*
		 * ps_map_extend
		 *
		 * Grow the cluster map of 'vs' so that it covers 'new_size'
		 * clusters.  Handles direct->direct, direct->indirect and
		 * indirect->indirect growth, copying the existing entries into the
		 * new map, clearing the new direct entries, and freeing the old map.
		 * The growth is reported through bs_commit().
		 *
		 * Returns 0 on success (including when the map is already large
		 * enough) and -1 if memory for the new map cannot be allocated, in
		 * which case the vstruct is left unchanged.
		 *
		 * NOTE(review): no locking is done here; the visible caller,
		 * ps_clmap(), holds the vs map lock around the call.
		 */
	1646	int ps_map_extend(
	1647	vstruct_t vs,
	1648	int new_size)
	1649	{
	1650	struct vs_map **new_imap;
	1651	struct vs_map *new_dmap = NULL;
	1652	int newdsize;
	1653	int i;
	1654	void *old_map = NULL;
	1655	int old_map_size = 0;
	1656
	1657	if (vs->vs_size >= new_size) {
	1658	/*
	1659	* Someone has already done the work.
	1660	*/
	1661	return 0;
	1662	}
	1663
	1664	/*
	1665	* If the new size extends into the indirect range, then we have one
	1666	* of two cases: we are going from indirect to indirect, or we are
	1667	* going from direct to indirect. If we are going from indirect to
	1668	* indirect, then it is possible that the new size will fit in the old
	1669	* indirect map. If this is the case, then just reset the size of the
	1670	* vstruct map and we are done. If the new size will not
	1671	* fit into the old indirect map, then we have to allocate a new
	1672	* indirect map and copy the old map pointers into this new map.
	1673	*
	1674	* If we are going from direct to indirect, then we have to allocate a
	1675	* new indirect map and copy the old direct pages into the first
	1676	* indirect page of the new map.
	1677	* NOTE: allocating memory here is dangerous, as we're in the
	1678	* pageout path.
	1679	*/
	1680	if (INDIRECT_CLMAP(new_size)) {
	1681	int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
	1682
	1683	/*
	1684	* Get a new indirect map and zero it.
	1685	*/
	1686	old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
	1687	if (vs->vs_indirect &&
	1688	(new_map_size == old_map_size)) {
	1689	bs_commit(new_size - vs->vs_size);
	1690	vs->vs_size = new_size;
	1691	return 0;
	1692	}
	1693
	1694	new_imap = (struct vs_map **)kalloc(new_map_size);
	1695	if (new_imap == NULL) {
	1696	return -1;
	1697	}
	1698	memset(new_imap, 0, new_map_size);
	1699
	1700	if (vs->vs_indirect) {
	1701	/* Copy old entries into new map */
	1702	memcpy(new_imap, vs->vs_imap, old_map_size);
	1703	/* Arrange to free the old map */
	1704	old_map = (void *) vs->vs_imap;
		/* newdsize == 0 skips the direct-map copy further down */
	1705	newdsize = 0;
	1706	} else { /* Old map was a direct map */
	1707	/* Allocate an indirect page */
	1708	if ((new_imap[0] = (struct vs_map *)
	1709	kalloc(CLMAP_THRESHOLD)) == NULL) {
	1710	kfree((vm_offset_t)new_imap, new_map_size);
	1711	return -1;
	1712	}
	1713	new_dmap = new_imap[0];
	1714	newdsize = CLMAP_ENTRIES;
	1715	}
	1716	} else {
	1717	new_imap = NULL;
	1718	newdsize = new_size;
	1719	/*
	1720	* If the new map is a direct map, then the old map must
	1721	* also have been a direct map. All we have to do is
	1722	* to allocate a new direct map, copy the old entries
	1723	* into it and free the old map.
	1724	*/
	1725	if ((new_dmap = (struct vs_map *)
	1726	kalloc(CLMAP_SIZE(new_size))) == NULL) {
	1727	return -1;
	1728	}
	1729	}
	1730	if (newdsize) {
	1731
	1732	/* Free the old map */
	1733	old_map = (void *) vs->vs_dmap;
	1734	old_map_size = CLMAP_SIZE(vs->vs_size);
	1735
	1736	/* Copy info from the old map into the new map */
	1737	memcpy(new_dmap, vs->vs_dmap, old_map_size);
	1738
	1739	/* Initialize the rest of the new map */
	1740	for (i = vs->vs_size; i < newdsize; i++)
	1741	VSM_CLR(new_dmap[i]);
	1742	}
		/* Publish the new map, then release the old one. */
	1743	if (new_imap) {
	1744	vs->vs_imap = new_imap;
	1745	vs->vs_indirect = TRUE;
	1746	} else
	1747	vs->vs_dmap = new_dmap;
	1748	bs_commit(new_size - vs->vs_size);
	1749	vs->vs_size = new_size;
	1750	if (old_map)
	1751	kfree((vm_offset_t)old_map, old_map_size);
	1752	return 0;
	1753	}
	1754
		/*
		 * ps_clmap
		 *
		 * Translate a byte offset within the object backed by 'vs' into a
		 * byte offset within a paging segment, filling in 'clmap' with the
		 * segment, the page count and the page/allocation bitmaps for the
		 * cluster.  All work is done under the vs map lock.
		 *
		 *   vs      vstruct whose cluster map is consulted.
		 *   offset  byte offset into the object (ASSERTed page aligned).
		 *   clmap   out: description of the cluster containing 'offset'.
		 *   flag    CL_FIND is a pure lookup that never allocates; any other
		 *           value extends the map and allocates a cluster as needed.
		 *           CL_ALLOC additionally sets allocation bits for the pages
		 *           covered by 'size'.
		 *   size    length in bytes.  With CL_FIND a non-zero size turns the
		 *           call into write-completion bookkeeping (see below).
		 *   error   error code to record on the cluster, or 0.
		 *
		 * Returns the paging-segment byte offset, (vm_offset_t) -1 when the
		 * lookup misses, the cluster is in error or an allocation fails, and
		 * (vm_offset_t) 0 after the write-completion bookkeeping path.
		 */
	1755	vm_offset_t
	1756	ps_clmap(
	1757	vstruct_t vs,
	1758	vm_offset_t offset,
	1759	struct clmap *clmap,
	1760	int flag,
	1761	vm_size_t size,
	1762	int error)
	1763	{
	1764	vm_offset_t cluster; /* The cluster of offset. */
	1765	vm_offset_t newcl; /* The new cluster allocated. */
	1766	vm_offset_t newoff;
	1767	int i;
	1768	struct vs_map *vsmap;
	1769
	1770	VS_MAP_LOCK(vs);
	1771
	1772	ASSERT(vs->vs_dmap);
	1773	cluster = atop(offset) >> vs->vs_clshift;
	1774
	1775	/*
	1776	* Initialize cluster error value
	1777	*/
	1778	clmap->cl_error = 0;
	1779
	1780	/*
	1781	* If the object has grown, extend the page map.
	1782	*/
	1783	if (cluster >= vs->vs_size) {
	1784	if (flag == CL_FIND) {
	1785	/* Do not allocate if just doing a lookup */
	1786	VS_MAP_UNLOCK(vs);
	1787	return (vm_offset_t) -1;
	1788	}
	1789	if (ps_map_extend(vs, cluster + 1)) {
	1790	VS_MAP_UNLOCK(vs);
	1791	return (vm_offset_t) -1;
	1792	}
	1793	}
	1794
	1795	/*
	1796	* Look for the desired cluster. If the map is indirect, then we
	1797	* have a two level lookup. First find the indirect block, then
	1798	* find the actual cluster. If the indirect block has not yet
	1799	* been allocated, then do so. If the cluster has not yet been
	1800	* allocated, then do so.
	1801	*
	1802	* If any of the allocations fail, then return an error.
	1803	* Don't allocate if just doing a lookup.
	1804	*/
	1805	if (vs->vs_indirect) {
	1806	long ind_block = cluster/CLMAP_ENTRIES;
	1807
	1808	/* Is the indirect block allocated? */
	1809	vsmap = vs->vs_imap[ind_block];
	1810	if (vsmap == NULL) {
	1811	if (flag == CL_FIND) {
	1812	VS_MAP_UNLOCK(vs);
	1813	return (vm_offset_t) -1;
	1814	}
	1815
	1816	/* Allocate the indirect block */
	1817	vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
	1818	if (vsmap == NULL) {
	1819	VS_MAP_UNLOCK(vs);
	1820	return (vm_offset_t) -1;
	1821	}
	1822	/* Initialize the cluster offsets */
	1823	for (i = 0; i < CLMAP_ENTRIES; i++)
	1824	VSM_CLR(vsmap[i]);
	1825	vs->vs_imap[ind_block] = vsmap;
	1826	}
	1827	} else
	1828	vsmap = vs->vs_dmap;
	1829
	1830	ASSERT(vsmap);
	1831	vsmap += cluster%CLMAP_ENTRIES;
	1832
	1833	/*
	1834	* At this point, vsmap points to the struct vs_map desired.
	1835	*
	1836	* Look in the map for the cluster, if there was an error on a
	1837	* previous write, flag it and return. If it is not yet
	1838	* allocated, then allocate it, if we're writing; if we're
	1839	* doing a lookup and the cluster's not allocated, return error.
	1840	*/
	1841	if (VSM_ISERR(*vsmap)) {
	1842	clmap->cl_error = VSM_GETERR(*vsmap);
	1843	VS_MAP_UNLOCK(vs);
	1844	return (vm_offset_t) -1;
	1845	} else if (VSM_ISCLR(*vsmap)) {
	1846	int psindex;
	1847
	1848	if (flag == CL_FIND) {
	1849	/*
	1850	* If there's an error and the entry is clear, then
	1851	* we've run out of swap space. Record the error
	1852	* here and return.
	1853	*/
	1854	if (error) {
	1855	VSM_SETERR(*vsmap, error);
	1856	}
	1857	VS_MAP_UNLOCK(vs);
	1858	return (vm_offset_t) -1;
	1859	} else {
	1860	/*
	1861	* Attempt to allocate a cluster from the paging segment
	1862	*/
	1863	newcl = ps_allocate_cluster(vs, &psindex,
	1864	PAGING_SEGMENT_NULL);
	1865	if (newcl == -1) {
	1866	VS_MAP_UNLOCK(vs);
	1867	return (vm_offset_t) -1;
	1868	}
	1869	VSM_CLR(*vsmap);
	1870	VSM_SETCLOFF(*vsmap, newcl);
	1871	VSM_SETPS(*vsmap, psindex);
	1872	}
	1873	} else
	1874	newcl = VSM_CLOFF(*vsmap);
	1875
	1876	/*
	1877	* Fill in pertinent fields of the clmap
	1878	*/
	1879	clmap->cl_ps = VSM_PS(*vsmap);
	1880	clmap->cl_numpages = VSCLSIZE(vs);
	1881	clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
	1882
	1883	/*
	1884	* Byte offset in paging segment is byte offset to cluster plus
	1885	* byte offset within cluster. It looks ugly, but should be
	1886	* relatively quick.
	1887	*/
	1888	ASSERT(trunc_page(offset) == offset);
		/* From here on newcl is a byte offset, no longer a cluster number. */
	1889	newcl = ptoa(newcl) << vs->vs_clshift;
	1890	newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
	1891	if (flag == CL_ALLOC) {
	1892	/*
	1893	* set bits in the allocation bitmap according to which
	1894	* pages were requested. size is in bytes.
	1895	*/
	1896	i = atop(newoff);
	1897	while ((size > 0) && (i < VSCLSIZE(vs))) {
	1898	VSM_SETALLOC(*vsmap, i);
	1899	i++;
	1900	size -= vm_page_size;
	1901	}
	1902	}
	1903	clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
	1904	if (newoff) {
	1905	/*
	1906	* Offset is not cluster aligned, so number of pages
	1907	* and bitmaps must be adjusted
	1908	*/
	1909	clmap->cl_numpages -= atop(newoff);
	1910	CLMAP_SHIFT(clmap, vs);
	1911	CLMAP_SHIFTALLOC(clmap, vs);
	1912	}
	1913
	1914	/*
	1915	*
	1916	* The setting of valid bits and handling of write errors
	1917	* must be done here, while we hold the lock on the map.
	1918	* It logically should be done in ps_vs_write_complete().
	1919	* The size and error information has been passed from
	1920	* ps_vs_write_complete(). If the size parameter is non-zero,
	1921	* then there is work to be done. If error is also non-zero,
	1922	* then the error number is recorded in the cluster and the
	1923	* entire cluster is in error.
	1924	*/
	1925	if (size && flag == CL_FIND) {
	1926	vm_offset_t off = (vm_offset_t) 0;
	1927
	1928	if (!error) {
	1929	for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
	1930	i++) {
	1931	VSM_SETPG(*vsmap, i);
	1932	size -= vm_page_size;
	1933	}
	1934	ASSERT(i <= VSCLSIZE(vs));
	1935	} else {
	1936	BS_STAT(clmap->cl_ps->ps_bs,
	1937	clmap->cl_ps->ps_bs->bs_pages_out_fail +=
	1938	atop(size));
	1939	off = VSM_CLOFF(*vsmap);
	1940	VSM_SETERR(*vsmap, error);
	1941	}
	1942	/*
	1943	* Deallocate cluster if error, and no valid pages
	1944	* already present.
	1945	*/
		/*
		 * NOTE(review): the only test made is off != 0, so a failed
		 * cluster whose offset is 0 is never handed back.
		 */
	1946	if (off != (vm_offset_t) 0)
	1947	ps_deallocate_cluster(clmap->cl_ps, off);
	1948	VS_MAP_UNLOCK(vs);
	1949	return (vm_offset_t) 0;
	1950	} else
	1951	VS_MAP_UNLOCK(vs);
	1952
	1953	DEBUG(DEBUG_VS_INTERNAL,
	1954	("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
	1955	newcl+newoff, (int) vs, (int) vsmap, flag));
	1956	DEBUG(DEBUG_VS_INTERNAL,
	1957	(" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
	1958	(int) clmap->cl_ps, clmap->cl_numpages,
	1959	(int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
	1960
	1961	return (newcl + newoff);
	1962	}
	1963
	1964	void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
	1965
		/*
		 * ps_clunmap
		 *
		 * Remove the mappings for 'length' bytes of the object starting at
		 * byte 'offset'.  For every mapped cluster in the range the page and
		 * allocation bits of the affected pages are cleared; once a cluster
		 * has no allocation bits left it is returned to its paging segment
		 * and its map entry is cleared.  Runs under the vs map lock and
		 * returns early if an indirect block in the range was never
		 * allocated.
		 *
		 * NOTE(review): 'length' is decremented a page at a time and tested
		 * with > 0; this presumably relies on callers passing a page
		 * multiple -- confirm.
		 */
	1966	void
	1967	ps_clunmap(
	1968	vstruct_t vs,
	1969	vm_offset_t offset,
	1970	vm_size_t length)
	1971	{
	1972	vm_offset_t cluster; /* The cluster number of offset */
	1973	struct vs_map *vsmap;
	1974
	1975	VS_MAP_LOCK(vs);
	1976
	1977	/*
	1978	* Loop through all clusters in this range, freeing paging segment
	1979	* clusters and map entries as encountered.
	1980	*/
	1981	while (length > 0) {
	1982	vm_offset_t newoff;
	1983	int i;
	1984
	1985	cluster = atop(offset) >> vs->vs_clshift;
	1986	if (vs->vs_indirect) /* indirect map */
	1987	vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
	1988	else
	1989	vsmap = vs->vs_dmap;
	1990	if (vsmap == NULL) {
	1991	VS_MAP_UNLOCK(vs);
	1992	return;
	1993	}
	1994	vsmap += cluster%CLMAP_ENTRIES;
	1995	if (VSM_ISCLR(*vsmap)) {
		/* Nothing mapped here: step forward one page and retry. */
	1996	length -= vm_page_size;
	1997	offset += vm_page_size;
	1998	continue;
	1999	}
	2000	/*
	2001	* We've got a valid mapping. Clear it and deallocate
	2002	* paging segment cluster pages.
	2003	* Optimize for entire cluster clearing.
	2004	*/
	2005	if (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) {
	2006	/*
	2007	* Not cluster aligned.
	2008	*/
	2009	ASSERT(trunc_page(newoff) == newoff);
	2010	i = atop(newoff);
	2011	} else
	2012	i = 0;
	2013	while ((i < VSCLSIZE(vs)) && (length > 0)) {
	2014	VSM_CLRPG(*vsmap, i);
	2015	VSM_CLRALLOC(*vsmap, i);
	2016	length -= vm_page_size;
	2017	offset += vm_page_size;
	2018	i++;
	2019	}
	2020
	2021	/*
	2022	* If map entry is empty, clear and deallocate cluster.
	2023	*/
	2024	if (!VSM_ALLOC(*vsmap)) {
	2025	ps_deallocate_cluster(VSM_PS(*vsmap),
	2026	VSM_CLOFF(*vsmap));
	2027	VSM_CLR(*vsmap);
	2028	}
	2029	}
	2030
	2031	VS_MAP_UNLOCK(vs);
	2032	}
	2033
	2034	void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
	2035
		/*
		 * ps_vs_write_complete
		 *
		 * Record the outcome of a write of 'size' bytes at 'offset' in the
		 * cluster map of 'vs'.  This is a thin wrapper: it calls ps_clmap()
		 * with CL_FIND plus the size and error, and discards both the
		 * returned offset and the filled-in clmap.
		 */
	2036	void
	2037	ps_vs_write_complete(
	2038	vstruct_t vs,
	2039	vm_offset_t offset,
	2040	vm_size_t size,
	2041	int error)
	2042	{
	2043	struct clmap clmap;
	2044
	2045	/*
	2046	* Get the struct vsmap for this cluster.
	2047	* Use READ, even though it was written, because the
	2048	* cluster MUST be present, unless there was an error
	2049	* in the original ps_clmap (e.g. no space), in which
	2050	* case, nothing happens.
	2051	*
	2052	* Must pass enough information to ps_clmap to allow it
	2053	* to set the vs_map structure bitmap under lock.
	2054	*/
	2055	(void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
	2056	}
	2057
	2058	void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
	2059
		/*
		 * vs_cl_write_complete
		 *
		 * Completion handling for a cluster write of 'size' bytes at
		 * 'offset' of 'vs'.  On success the global pages-out statistic is
		 * bumped; on error a message is printed.  In both cases
		 * ps_vs_write_complete() updates the cluster map.  For asynchronous
		 * writes vs_async_pending is reduced by 'size' under the vs lock,
		 * and when it reaches zero with vs_waiting_async set, the flag is
		 * cleared and waiters on &vs->vs_async_pending are woken.
		 *
		 * The 'ps' and 'addr' parameters and the local 'kr' are not used in
		 * this body.
		 */
	2060	void
	2061	vs_cl_write_complete(
	2062	vstruct_t vs,
	2063	paging_segment_t ps,
	2064	vm_offset_t offset,
	2065	vm_offset_t addr,
	2066	vm_size_t size,
	2067	boolean_t async,
	2068	int error)
	2069	{
	2070	kern_return_t kr;
	2071
	2072	if (error) {
	2073	/*
	2074	* For internal objects, the error is recorded on a
	2075	* per-cluster basis by ps_clmap() which is called
	2076	* by ps_vs_write_complete() below.
	2077	*/
	2078	dprintf(("write failed error = 0x%x\n", error));
	2079	/* add upl_abort code here */
	2080	} else
	2081	GSTAT(global_stats.gs_pages_out += atop(size));
	2082	/*
	2083	* Notify the vstruct mapping code, so it can do its accounting.
	2084	*/
	2085	ps_vs_write_complete(vs, offset, size, error);
	2086
	2087	if (async) {
	2088	VS_LOCK(vs);
	2089	ASSERT(vs->vs_async_pending > 0);
	2090	vs->vs_async_pending -= size;
	2091	if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
	2092	vs->vs_waiting_async = FALSE;
	2093	VS_UNLOCK(vs);
	2094	/* mutex_unlock(&vs->vs_waiting_async); */
	2095	thread_wakeup(&vs->vs_async_pending);
	2096	} else {
	2097	VS_UNLOCK(vs);
	2098	}
	2099	}
	2100	}
	2101
	2102	#ifdef DEVICE_PAGING
	2103	kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
	2104
		/*
		 * device_write_reply
		 *
		 * Reply handler for a device write.  The vs_async record is
		 * recovered through the reply port's alias structure.  A successful
		 * reply whose byte count differs from vsa_size is turned into
		 * KERN_FAILURE.  Transfer writes (VSA_TRANSFER) go to
		 * ps_vs_write_complete(); all others go to vs_cl_write_complete()
		 * as asynchronous writes.  The record is then released with
		 * VS_FREE_ASYNC().  Always returns KERN_SUCCESS.
		 */
	2105	kern_return_t
	2106	device_write_reply(
	2107	MACH_PORT_FACE reply_port,
	2108	kern_return_t device_code,
	2109	io_buf_len_t bytes_written)
	2110	{
	2111	struct vs_async *vsa;
	2112
	2113	vsa = (struct vs_async *)
	2114	((struct vstruct_alias *)(reply_port->alias))->vs;
	2115
		/* A short write is reported as a failure. */
	2116	if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
	2117	device_code = KERN_FAILURE;
	2118	}
	2119
	2120	vsa->vsa_error = device_code;
	2121
	2122
	2123	ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
	2124	if(vsa->vsa_flags & VSA_TRANSFER) {
	2125	/* revisit when async disk segments redone */
	2126	if(vsa->vsa_error) {
	2127	/* need to consider error condition. re-write data or */
	2128	/* throw it away here. */
	2129	vm_offset_t ioaddr;
	2130	if(vm_map_copyout(kernel_map, &ioaddr,
	2131	(vm_map_copy_t)vsa->vsa_addr) != KERN_SUCCESS)
	2132	panic("vs_cluster_write: unable to copy source list\n");
	2133	vm_deallocate(kernel_map, ioaddr, vsa->vsa_size);
	2134	}
	2135	ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
	2136	vsa->vsa_size, vsa->vsa_error);
	2137	} else {
	2138	vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
	2139	vsa->vsa_addr, vsa->vsa_size, TRUE,
	2140	vsa->vsa_error);
	2141	}
	2142	VS_FREE_ASYNC(vsa);
	2143
	2144	return KERN_SUCCESS;
	2145	}
	2146
		/*
		 * device_write_reply_inband
		 *
		 * Stub reply handler: calls panic().  The return statement after
		 * the panic call is kept to satisfy the function's return type.
		 */
	2147	kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
	2148	kern_return_t
	2149	device_write_reply_inband(
	2150	MACH_PORT_FACE reply_port,
	2151	kern_return_t return_code,
	2152	io_buf_len_t bytes_written)
	2153	{
	2154	panic("device_write_reply_inband: illegal");
	2155	return KERN_SUCCESS;
	2156	}
	2157
		/*
		 * device_read_reply
		 *
		 * Reply handler for a device read.  The vs_async record is recovered
		 * through the reply port's alias structure; the returned buffer,
		 * byte count and status are stored in it and any thread waiting on
		 * &vsa->vsa_lock is woken.  Always returns KERN_SUCCESS.
		 */
	2158	kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
	2159	kern_return_t
	2160	device_read_reply(
	2161	MACH_PORT_FACE reply_port,
	2162	kern_return_t return_code,
	2163	io_buf_ptr_t data,
	2164	mach_msg_type_number_t dataCnt)
	2165	{
	2166	struct vs_async *vsa;
	2167	vsa = (struct vs_async *)
	2168	((struct vstruct_alias *)(reply_port->alias))->vs;
	2169	vsa->vsa_addr = (vm_offset_t)data;
	2170	vsa->vsa_size = (vm_size_t)dataCnt;
	2171	vsa->vsa_error = return_code;
	2172	thread_wakeup(&vsa->vsa_lock);
	2173	return KERN_SUCCESS;
	2174	}
	2175
		/*
		 * device_read_reply_inband
		 *
		 * Stub reply handler: calls panic().  The return statement after
		 * the panic call is kept to satisfy the function's return type.
		 */
	2176	kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
	2177	kern_return_t
	2178	device_read_reply_inband(
	2179	MACH_PORT_FACE reply_port,
	2180	kern_return_t return_code,
	2181	io_buf_ptr_inband_t data,
	2182	mach_msg_type_number_t dataCnt)
	2183	{
	2184	panic("device_read_reply_inband: illegal");
	2185	return KERN_SUCCESS;
	2186	}
	2187
		/*
		 * device_read_reply_overwrite
		 *
		 * Stub reply handler: calls panic().  The return statement after
		 * the panic call is kept to satisfy the function's return type.
		 */
	2188	kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
	2189	kern_return_t
	2190	device_read_reply_overwrite(
	2191	MACH_PORT_FACE reply_port,
	2192	kern_return_t return_code,
	2193	io_buf_len_t bytes_read)
	2194	{
	2195	panic("device_read_reply_overwrite: illegal\n");
	2196	return KERN_SUCCESS;
	2197	}
	2198
		/*
		 * device_open_reply
		 *
		 * Stub reply handler: calls panic().  The return statement after
		 * the panic call is kept to satisfy the function's return type.
		 */
	2199	kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
	2200	kern_return_t
	2201	device_open_reply(
	2202	MACH_PORT_FACE reply_port,
	2203	kern_return_t return_code,
	2204	MACH_PORT_FACE device_port)
	2205	{
	2206	panic("device_open_reply: illegal\n");
	2207	return KERN_SUCCESS;
	2208	}
	2209
	2210	kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t , unsigned int, unsigned int , int); /* forward */
	2211
	2212	kern_return_t
	2213	ps_read_device(
	2214	paging_segment_t ps,
	2215	vm_offset_t offset,
	2216	vm_offset_t *bufferp,
	2217	unsigned int size,
	2218	unsigned int *residualp,
	2219	int flags)
	2220	{
	2221	kern_return_t kr;
	2222	recnum_t dev_offset;
	2223	unsigned int bytes_wanted;
	2224	unsigned int bytes_read;
	2225	unsigned int total_read;
	2226	vm_offset_t dev_buffer;
	2227	vm_offset_t buf_ptr;
	2228	unsigned int records_read;
	2229	struct vs_async *vsa;
	2230	mutex_t vs_waiting_read_reply;
	2231
	2232	device_t device;
	2233	vm_map_copy_t device_data = NULL;
	2234	default_pager_thread_t *dpt = NULL;
	2235
	2236	device = dev_port_lookup(ps->ps_device);
	2237	clustered_reads[atop(size)]++;
	2238
	2239	dev_offset = (ps->ps_offset +
	2240	(offset >> (vm_page_shift - ps->ps_record_shift)));
	2241	bytes_wanted = size;
	2242	total_read = 0;
	2243	*bufferp = (vm_offset_t)NULL;
	2244
	2245	do {
	2246	vsa = VS_ALLOC_ASYNC();
	2247	if (vsa) {
	2248	vsa->vsa_vs = NULL;
	2249	vsa->vsa_addr = 0;
	2250	vsa->vsa_offset = 0;
	2251	vsa->vsa_size = 0;
	2252	vsa->vsa_ps = NULL;
	2253	}
	2254	mutex_init(&vsa->vsa_lock, ETAP_DPAGE_VSSEQNO);
	2255	ip_lock(vsa->reply_port);
	2256	vsa->reply_port->ip_sorights++;
	2257	ip_reference(vsa->reply_port);
	2258	ip_unlock(vsa->reply_port);
	2259	kr = ds_device_read_common(device,
	2260	vsa->reply_port,
	2261	(mach_msg_type_name_t)
	2262	MACH_MSG_TYPE_MOVE_SEND_ONCE,
	2263	(dev_mode_t) 0,
	2264	dev_offset,
	2265	bytes_wanted,
	2266	(IO_READ \| IO_CALL),
	2267	(io_buf_ptr_t *) &dev_buffer,
	2268	(mach_msg_type_number_t *) &bytes_read);
	2269	if(kr == MIG_NO_REPLY) {
	2270	assert_wait(&vsa->vsa_lock, THREAD_UNINT);
	2271	thread_block(THREAD_CONTINUE_NULL);
	2272
	2273	dev_buffer = vsa->vsa_addr;
	2274	bytes_read = (unsigned int)vsa->vsa_size;
	2275	kr = vsa->vsa_error;
	2276	}
	2277	VS_FREE_ASYNC(vsa);
	2278	if (kr != KERN_SUCCESS \|\| bytes_read == 0) {
	2279	break;
	2280	}
	2281	total_read += bytes_read;
	2282
	2283	/*
	2284	* If we got the entire range, use the returned dev_buffer.
	2285	*/
	2286	if (bytes_read == size) {
	2287	*bufferp = (vm_offset_t)dev_buffer;
	2288	break;
	2289	}
	2290
	2291	#if 1
	2292	dprintf(("read only %d bytes out of %d\n",
	2293	bytes_read, bytes_wanted));
	2294	#endif
	2295	if(dpt == NULL) {
	2296	dpt = get_read_buffer();
	2297	buf_ptr = dpt->dpt_buffer;
	2298	*bufferp = (vm_offset_t)buf_ptr;
	2299	}
	2300	/*
	2301	* Otherwise, copy the data into the provided buffer (*bufferp)
	2302	* and append the rest of the range as it comes in.
	2303	*/
	2304	memcpy((void ) buf_ptr, (void ) dev_buffer, bytes_read);
	2305	buf_ptr += bytes_read;
	2306	bytes_wanted -= bytes_read;
	2307	records_read = (bytes_read >>
	2308	(vm_page_shift - ps->ps_record_shift));
	2309	dev_offset += records_read;
	2310	DEBUG(DEBUG_VS_INTERNAL,
	2311	("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
	2312	dev_buffer, bytes_read));
	2313	if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
	2314	!= KERN_SUCCESS)
	2315	Panic("dealloc buf");
	2316	} while (bytes_wanted);
	2317
	2318	*residualp = size - total_read;
	2319	if((dev_buffer != *bufferp) && (total_read != 0)) {
	2320	vm_offset_t temp_buffer;
	2321	vm_allocate(kernel_map, &temp_buffer, total_read, TRUE);
	2322	memcpy((void ) temp_buffer, (void ) *bufferp, total_read);
	2323	if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
	2324	VM_MAP_COPYIN_OPT_SRC_DESTROY \|
	2325	VM_MAP_COPYIN_OPT_STEAL_PAGES \|
	2326	VM_MAP_COPYIN_OPT_PMAP_ENTER,
	2327	(vm_map_copy_t *)&device_data, FALSE))
	2328	panic("ps_read_device: cannot copyin locally provided buffer\n");
	2329	}
	2330	else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
	2331	if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
	2332	VM_MAP_COPYIN_OPT_SRC_DESTROY \|
	2333	VM_MAP_COPYIN_OPT_STEAL_PAGES \|
	2334	VM_MAP_COPYIN_OPT_PMAP_ENTER,
	2335	(vm_map_copy_t *)&device_data, FALSE))
	2336	panic("ps_read_device: cannot copyin backing store provided buffer\n");
	2337	}
	2338	else {
	2339	device_data = NULL;
	2340	}
	2341	*bufferp = (vm_offset_t)device_data;
	2342
	2343	if(dpt != NULL) {
	2344	/* Free the receive buffer */
	2345	dpt->checked_out = 0;
	2346	thread_wakeup(&dpt_array);
	2347	}
	2348	return KERN_SUCCESS;
	2349	}
	2350
	2351	kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async ); / forward */
	2352
	2353	kern_return_t
	2354	ps_write_device(
	2355	paging_segment_t ps,
	2356	vm_offset_t offset,
	2357	vm_offset_t addr,
	2358	unsigned int size,
	2359	struct vs_async *vsa)
	2360	{
	2361	recnum_t dev_offset;
	2362	io_buf_len_t bytes_to_write, bytes_written;
	2363	recnum_t records_written;
	2364	kern_return_t kr;
	2365	MACH_PORT_FACE reply_port;
	2366
	2367
	2368
	2369	clustered_writes[atop(size)]++;
	2370
	2371	dev_offset = (ps->ps_offset +
	2372	(offset >> (vm_page_shift - ps->ps_record_shift)));
	2373	bytes_to_write = size;
	2374
	2375	if (vsa) {
	2376	/*
	2377	* Asynchronous write.
	2378	*/
	2379	reply_port = vsa->reply_port;
	2380	ip_lock(reply_port);
	2381	reply_port->ip_sorights++;
	2382	ip_reference(reply_port);
	2383	ip_unlock(reply_port);
	2384	{
	2385	device_t device;
	2386	device = dev_port_lookup(ps->ps_device);
	2387
	2388	vsa->vsa_addr = addr;
	2389	kr=ds_device_write_common(device,
	2390	reply_port,
	2391	(mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
	2392	(dev_mode_t) 0,
	2393	dev_offset,
	2394	(io_buf_ptr_t) addr,
	2395	size,
	2396	(IO_WRITE \| IO_CALL),
	2397	&bytes_written);
	2398	}
	2399	if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
	2400	if (verbose)
	2401	dprintf(("%s0x%x, addr=0x%x,"
	2402	"size=0x%x,offset=0x%x\n",
	2403	"device_write_request returned ",
	2404	kr, addr, size, offset));
	2405	BS_STAT(ps->ps_bs,
	2406	ps->ps_bs->bs_pages_out_fail += atop(size));
	2407	/* do the completion notification to free resources */
	2408	device_write_reply(reply_port, kr, 0);
	2409	return PAGER_ERROR;
	2410	}
	2411	} else do {
	2412	/*
	2413	* Synchronous write.
	2414	*/
	2415	{
	2416	device_t device;
	2417	device = dev_port_lookup(ps->ps_device);
	2418	kr=ds_device_write_common(device,
	2419	IP_NULL, 0,
	2420	(dev_mode_t) 0,
	2421	dev_offset,
	2422	(io_buf_ptr_t) addr,
	2423	size,
	2424	(IO_WRITE \| IO_SYNC \| IO_KERNEL_BUF),
	2425	&bytes_written);
	2426	}
	2427	if (kr != KERN_SUCCESS) {
	2428	dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
	2429	"device_write returned ",
	2430	kr, addr, size, offset));
	2431	BS_STAT(ps->ps_bs,
	2432	ps->ps_bs->bs_pages_out_fail += atop(size));
	2433	return PAGER_ERROR;
	2434	}
	2435	if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
	2436	Panic("fragmented write");
	2437	records_written = (bytes_written >>
	2438	(vm_page_shift - ps->ps_record_shift));
	2439	dev_offset += records_written;
	2440	#if 1
	2441	if (bytes_written != bytes_to_write) {
	2442	dprintf(("wrote only %d bytes out of %d\n",
	2443	bytes_written, bytes_to_write));
	2444	}
	2445	#endif
	2446	bytes_to_write -= bytes_written;
	2447	addr += bytes_written;
	2448	} while (bytes_to_write > 0);
	2449
	2450	return PAGER_SUCCESS;
	2451	}
	2452
	2453
	2454	#else /* !DEVICE_PAGING */
	2455
	2456	kern_return_t
	2457	ps_read_device(
	2458	paging_segment_t ps,
	2459	vm_offset_t offset,
	2460	vm_offset_t *bufferp,
	2461	unsigned int size,
	2462	unsigned int *residualp,
	2463	int flags)
	2464	{
	2465	panic("ps_read_device not supported");
	2466	}
	2467
	2468	ps_write_device(
	2469	paging_segment_t ps,
	2470	vm_offset_t offset,
	2471	vm_offset_t addr,
	2472	unsigned int size,
	2473	struct vs_async *vsa)
	2474	{
	2475	panic("ps_write_device not supported");
	2476	}
	2477
	2478	#endif /* DEVICE_PAGING */
	2479	void pvs_object_data_provided(vstruct_t, upl_t, vm_offset_t, vm_size_t); /* forward */
	2480
	2481	void
	2482	pvs_object_data_provided(
	2483	vstruct_t vs,
	2484	upl_t upl,
	2485	vm_offset_t offset,
	2486	vm_size_t size)
	2487	{
	2488
	2489	DEBUG(DEBUG_VS_INTERNAL,
	2490	("buffer=0x%x,offset=0x%x,size=0x%x\n",
	2491	upl, offset, size));
	2492
	2493	ASSERT(size > 0);
	2494	GSTAT(global_stats.gs_pages_in += atop(size));
	2495
	2496
	2497	#if USE_PRECIOUS
	2498	ps_clunmap(vs, offset, size);
	2499	#endif /* USE_PRECIOUS */
	2500
	2501	}
	2502
	2503	kern_return_t
	2504	pvs_cluster_read(
	2505	vstruct_t vs,
	2506	vm_offset_t vs_offset,
	2507	vm_size_t cnt)
	2508	{
	2509	upl_t upl;
	2510	kern_return_t error = KERN_SUCCESS;
	2511	int size;
	2512	unsigned int residual;
	2513	unsigned int request_flags;
	2514	int seg_index;
	2515	int pages_in_cl;
	2516	int cl_size;
	2517	int cl_mask;
	2518	int cl_index;
	2519	int xfer_size;
	2520	vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
	2521	paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
	2522	struct clmap clmap;
	2523
	2524	pages_in_cl = 1 << vs->vs_clshift;
	2525	cl_size = pages_in_cl * vm_page_size;
	2526	cl_mask = cl_size - 1;
	2527
	2528	/*
	2529	* This loop will be executed multiple times until the entire
	2530	* request has been satisfied... if the request spans cluster
	2531	* boundaries, the clusters will be checked for logical continunity,
	2532	* if contiguous the I/O request will span multiple clusters, otherwise
	2533	* it will be broken up into the minimal set of I/O's
	2534	*
	2535	* If there are holes in a request (either unallocated pages in a paging
	2536	* segment or an unallocated paging segment), we stop
	2537	* reading at the hole, inform the VM of any data read, inform
	2538	* the VM of an unavailable range, then loop again, hoping to
	2539	* find valid pages later in the requested range. This continues until
	2540	* the entire range has been examined, and read, if present.
	2541	*/
	2542
	2543	#if USE_PRECIOUS
	2544	request_flags = UPL_NO_SYNC \| UPL_CLEAN_IN_PLACE \| UPL_PRECIOUS \| UPL_RET_ONLY_ABSENT;
	2545	#else
	2546	request_flags = UPL_NO_SYNC \| UPL_CLEAN_IN_PLACE \| UPL_RET_ONLY_ABSENT;
	2547	#endif
	2548	while (cnt && (error == KERN_SUCCESS)) {
	2549	int ps_info_valid;
	2550	int page_list_count;
	2551
	2552	if((vs_offset & cl_mask) &&
	2553	(cnt > (VM_SUPER_CLUSTER -
	2554	(vs_offset & cl_mask)))) {
	2555	size = VM_SUPER_CLUSTER;
	2556	size -= vs_offset & cl_mask;
	2557	} else if (cnt > VM_SUPER_CLUSTER) {
	2558	size = VM_SUPER_CLUSTER;
	2559	} else {
	2560	size = cnt;
	2561	}
	2562	cnt -= size;
	2563
	2564	ps_info_valid = 0;
	2565	seg_index = 0;
	2566
	2567	while (size > 0 && error == KERN_SUCCESS) {
	2568	int abort_size;
	2569	int failed_size;
	2570	int beg_pseg;
	2571	int beg_indx;
	2572	vm_offset_t cur_offset;
	2573
	2574
	2575	if ( !ps_info_valid) {
	2576	ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
	2577	psp[seg_index] = CLMAP_PS(clmap);
	2578	ps_info_valid = 1;
	2579	}
	2580	/*
	2581	* skip over unallocated physical segments
	2582	*/
	2583	if (ps_offset[seg_index] == (vm_offset_t) -1) {
	2584	abort_size = cl_size - (vs_offset & cl_mask);
	2585	abort_size = MIN(abort_size, size);
	2586
	2587	page_list_count = 0;
	2588	memory_object_super_upl_request(
	2589	vs->vs_control,
	2590	(memory_object_offset_t)vs_offset,
	2591	abort_size, abort_size,
	2592	&upl, NULL, &page_list_count,
	2593	request_flags);
	2594
	2595	if (clmap.cl_error) {
	2596	upl_abort(upl, UPL_ABORT_ERROR);
	2597	} else {
	2598	upl_abort(upl, UPL_ABORT_UNAVAILABLE);
	2599	}
	2600	upl_deallocate(upl);
	2601
	2602	size -= abort_size;
	2603	vs_offset += abort_size;
	2604
	2605	seg_index++;
	2606	ps_info_valid = 0;
	2607	continue;
	2608	}
	2609	cl_index = (vs_offset & cl_mask) / vm_page_size;
	2610
	2611	for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
	2612	/*
	2613	* skip over unallocated pages
	2614	*/
	2615	if (CLMAP_ISSET(clmap, cl_index))
	2616	break;
	2617	abort_size += vm_page_size;
	2618	}
	2619	if (abort_size) {
	2620	/*
	2621	* Let VM system know about holes in clusters.
	2622	*/
	2623	GSTAT(global_stats.gs_pages_unavail += atop(abort_size));
	2624
	2625	page_list_count = 0;
	2626	memory_object_super_upl_request(
	2627	vs->vs_control,
	2628	(memory_object_offset_t)vs_offset,
	2629	abort_size, abort_size,
	2630	&upl, NULL, &page_list_count,
	2631	request_flags);
	2632
	2633	upl_abort(upl, UPL_ABORT_UNAVAILABLE);
	2634	upl_deallocate(upl);
	2635
	2636	size -= abort_size;
	2637	vs_offset += abort_size;
	2638
	2639	if (cl_index == pages_in_cl) {
	2640	/*
	2641	* if we're at the end of this physical cluster
	2642	* then bump to the next one and continue looking
	2643	*/
	2644	seg_index++;
	2645	ps_info_valid = 0;
	2646	continue;
	2647	}
	2648	if (size == 0)
	2649	break;
	2650	}
	2651	/*
	2652	* remember the starting point of the first allocated page
	2653	* for the I/O we're about to issue
	2654	*/
	2655	beg_pseg = seg_index;
	2656	beg_indx = cl_index;
	2657	cur_offset = vs_offset;
	2658
	2659	/*
	2660	* calculate the size of the I/O that we can do...
	2661	* this may span multiple physical segments if
	2662	* they are contiguous
	2663	*/
	2664	for (xfer_size = 0; xfer_size < size; ) {
	2665
	2666	while (cl_index < pages_in_cl
	2667	&& xfer_size < size) {
	2668	/*
	2669	* accumulate allocated pages within
	2670	* a physical segment
	2671	*/
	2672	if (CLMAP_ISSET(clmap, cl_index)) {
	2673	xfer_size += vm_page_size;
	2674	cur_offset += vm_page_size;
	2675	cl_index++;
	2676
	2677	BS_STAT(psp[seg_index]->ps_bs,
	2678	psp[seg_index]->ps_bs->bs_pages_in++);
	2679	} else
	2680	break;
	2681	}
	2682	if (cl_index < pages_in_cl
	2683	\|\| xfer_size >= size) {
	2684	/*
	2685	* we've hit an unallocated page or
	2686	* the end of this request... go fire
	2687	* the I/O
	2688	*/
	2689	break;
	2690	}
	2691	/*
	2692	* we've hit the end of the current physical
	2693	* segment and there's more to do, so try
	2694	* moving to the next one
	2695	*/
	2696	seg_index++;
	2697
	2698	ps_offset[seg_index] =
	2699	ps_clmap(vs,
	2700	cur_offset & ~cl_mask,
	2701	&clmap, CL_FIND, 0, 0);
	2702	psp[seg_index] = CLMAP_PS(clmap);
	2703	ps_info_valid = 1;
	2704
	2705	if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) \|\| (psp[seg_index - 1] != psp[seg_index])) {
	2706	/*
	2707	* if the physical segment we're about
	2708	* to step into is not contiguous to
	2709	* the one we're currently in, or it's
	2710	* in a different paging file, or
	2711	* it hasn't been allocated....
	2712	* we stop here and generate the I/O
	2713	*/
	2714	break;
	2715	}
	2716	/*
	2717	* start with first page of the next physical
	2718	* segment
	2719	*/
	2720	cl_index = 0;
	2721	}
	2722	if (xfer_size) {
	2723	/*
	2724	* we have a contiguous range of allocated pages
	2725	* to read from
	2726	*/
	2727	page_list_count = 0;
	2728	memory_object_super_upl_request(vs->vs_control,
	2729	(memory_object_offset_t)vs_offset,
	2730	xfer_size, xfer_size,
	2731	&upl, NULL, &page_list_count,
	2732	request_flags \| UPL_SET_INTERNAL);
	2733
	2734	error = ps_read_file(psp[beg_pseg],
	2735	upl, (vm_offset_t) 0,
	2736	ps_offset[beg_pseg] +
	2737	(beg_indx * vm_page_size),
	2738	xfer_size, &residual, 0);
	2739	} else
	2740	continue;
	2741
	2742	failed_size = 0;
	2743
	2744	/*
	2745	* Adjust counts and send response to VM. Optimize
	2746	* for the common case, i.e. no error and/or partial
	2747	* data. If there was an error, then we need to error
	2748	* the entire range, even if some data was successfully
	2749	* read. If there was a partial read we may supply some
	2750	* data and may error some as well. In all cases the
	2751	* VM must receive some notification for every page in the
	2752	* range.
	2753	*/
	2754	if ((error == KERN_SUCCESS) && (residual == 0)) {
	2755	/*
	2756	* Got everything we asked for, supply the data
	2757	* to the VM. Note that as a side effect of
	2758	* supplying * the data, the buffer holding the
	2759	* supplied data is * deallocated from the pager's
	2760	* address space.
	2761	*/
	2762	pvs_object_data_provided(
	2763	vs, upl, vs_offset, xfer_size);
	2764	} else {
	2765	failed_size = xfer_size;
	2766
	2767	if (error == KERN_SUCCESS) {
	2768	if (residual == xfer_size) {
	2769	/*
	2770	* If a read operation returns no error
	2771	* and no data moved, we turn it into
	2772	* an error, assuming we're reading at
	2773	* or beyong EOF.
	2774	* Fall through and error the entire
	2775	* range.
	2776	*/
	2777	error = KERN_FAILURE;
	2778	} else {
	2779	/*
	2780	* Otherwise, we have partial read. If
	2781	* the part read is a integral number
	2782	* of pages supply it. Otherwise round
	2783	* it up to a page boundary, zero fill
	2784	* the unread part, and supply it.
	2785	* Fall through and error the remainder
	2786	* of the range, if any.
	2787	*/
	2788	int fill, lsize;
	2789
	2790	fill = residual
	2791	& ~vm_page_size;
	2792	lsize = (xfer_size - residual)
	2793	+ fill;
	2794	pvs_object_data_provided(
	2795	vs, upl,
	2796	vs_offset, lsize);
	2797
	2798	if (lsize < xfer_size) {
	2799	failed_size =
	2800	xfer_size - lsize;
	2801	error = KERN_FAILURE;
	2802	}
	2803	}
	2804	}
	2805	}
	2806	/*
	2807	* If there was an error in any part of the range, tell
	2808	* the VM. Note that error is explicitly checked again
	2809	* since it can be modified above.
	2810	*/
	2811	if (error != KERN_SUCCESS) {
	2812	BS_STAT(psp[beg_pseg]->ps_bs,
	2813	psp[beg_pseg]->ps_bs->bs_pages_in_fail
	2814	+= atop(failed_size));
	2815	}
	2816	size -= xfer_size;
	2817	vs_offset += xfer_size;
	2818	}
	2819
	2820	} /* END while (cnt && (error == 0)) */
	2821	return error;
	2822	}
	2823
	2824	int vs_do_async_write = 1;
	2825
	2826	kern_return_t
	2827	vs_cluster_write(
	2828	vstruct_t vs,
	2829	upl_t internal_upl,
	2830	vm_offset_t offset,
	2831	vm_size_t cnt,
	2832	boolean_t dp_internal,
	2833	int flags)
	2834	{
	2835	vm_offset_t size;
	2836	vm_offset_t transfer_size;
	2837	int error = 0;
	2838	struct clmap clmap;
	2839
	2840	vm_offset_t actual_offset; /* Offset within paging segment */
	2841	paging_segment_t ps;
	2842	vm_offset_t subx_size;
	2843	vm_offset_t mobj_base_addr;
	2844	vm_offset_t mobj_target_addr;
	2845	int mobj_size;
	2846
	2847	struct vs_async *vsa;
	2848	vm_map_copy_t copy;
	2849
	2850	upl_t upl;
	2851	upl_page_info_t *pl;
	2852	int page_index;
	2853	int list_size;
	2854	int cl_size;
	2855
	2856	if (!dp_internal) {
	2857	int page_list_count;
	2858	int request_flags;
	2859	int super_size;
	2860	int first_dirty;
	2861	int num_dirty;
	2862	int num_of_pages;
	2863	int seg_index;
	2864	int pages_in_cl;
	2865	int must_abort;
	2866	vm_offset_t upl_offset;
	2867	vm_offset_t seg_offset;
	2868	vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
	2869	paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
	2870
	2871
	2872	pages_in_cl = 1 << vs->vs_clshift;
	2873	cl_size = pages_in_cl * vm_page_size;
	2874
	2875	if (bs_low) {
	2876	super_size = cl_size;
	2877
	2878	request_flags = UPL_NOBLOCK \|
	2879	UPL_RET_ONLY_DIRTY \| UPL_COPYOUT_FROM \|
	2880	UPL_NO_SYNC \| UPL_SET_INTERNAL;
	2881	} else {
	2882	super_size = VM_SUPER_CLUSTER;
	2883
	2884	request_flags = UPL_NOBLOCK \| UPL_CLEAN_IN_PLACE \|
	2885	UPL_RET_ONLY_DIRTY \| UPL_COPYOUT_FROM \|
	2886	UPL_NO_SYNC \| UPL_SET_INTERNAL;
	2887	}
	2888
	2889	page_list_count = 0;
	2890	memory_object_super_upl_request(vs->vs_control,
	2891	(memory_object_offset_t)offset,
	2892	cnt, super_size,
	2893	&upl, NULL, &page_list_count,
	2894	request_flags \| UPL_PAGEOUT);
	2895
	2896	pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	2897
	2898	for (seg_index = 0, transfer_size = upl->size;
	2899	transfer_size > 0; ) {
	2900
	2901	ps_offset[seg_index] =
	2902	ps_clmap(vs, upl->offset + (seg_index * cl_size),
	2903	&clmap, CL_ALLOC,
	2904	transfer_size < cl_size ?
	2905	transfer_size : cl_size, 0);
	2906
	2907	if (ps_offset[seg_index] == (vm_offset_t) -1) {
	2908	upl_abort(upl, 0);
	2909	upl_deallocate(upl);
	2910
	2911	return KERN_FAILURE;
	2912
	2913	}
	2914	psp[seg_index] = CLMAP_PS(clmap);
	2915
	2916	if (transfer_size > cl_size) {
	2917	transfer_size -= cl_size;
	2918	seg_index++;
	2919	} else
	2920	transfer_size = 0;
	2921	}
	2922	for (page_index = 0,
	2923	num_of_pages = upl->size / vm_page_size;
	2924	page_index < num_of_pages; ) {
	2925	/*
	2926	* skip over non-dirty pages
	2927	*/
	2928	for ( ; page_index < num_of_pages; page_index++) {
	2929	if (UPL_DIRTY_PAGE(pl, page_index)
	2930	\|\| UPL_PRECIOUS_PAGE(pl, page_index))
	2931	/*
	2932	* this is a page we need to write
	2933	* go see if we can buddy it up with
	2934	* others that are contiguous to it
	2935	*/
	2936	break;
	2937	/*
	2938	* if the page is not-dirty, but present we
	2939	* need to commit it... This is an unusual
	2940	* case since we only asked for dirty pages
	2941	*/
	2942	if (UPL_PAGE_PRESENT(pl, page_index)) {
	2943	boolean_t empty = FALSE;
	2944	upl_commit_range(upl,
	2945	page_index * vm_page_size,
	2946	vm_page_size,
	2947	UPL_COMMIT_NOTIFY_EMPTY,
	2948	pl,
	2949	page_list_count,
	2950	&empty);
	2951	if (empty)
	2952	upl_deallocate(upl);
	2953	}
	2954	}
	2955	if (page_index == num_of_pages)
	2956	/*
	2957	* no more pages to look at, we're out of here
	2958	*/
	2959	break;
	2960
	2961	/*
	2962	* gather up contiguous dirty pages... we have at
	2963	* least 1 otherwise we would have bailed above
	2964	* make sure that each physical segment that we step
	2965	* into is contiguous to the one we're currently in
	2966	* if it's not, we have to stop and write what we have
	2967	*/
	2968	for (first_dirty = page_index;
	2969	page_index < num_of_pages; ) {
	2970	if ( !UPL_DIRTY_PAGE(pl, page_index)
	2971	&& !UPL_PRECIOUS_PAGE(pl, page_index))
	2972	break;
	2973	page_index++;
	2974	/*
	2975	* if we just looked at the last page in the UPL
	2976	* we don't need to check for physical segment
	2977	* continuity
	2978	*/
	2979	if (page_index < num_of_pages) {
	2980	int cur_seg;
	2981	int nxt_seg;
	2982
	2983	cur_seg =
	2984	(page_index - 1) / pages_in_cl;
	2985	nxt_seg = page_index / pages_in_cl;
	2986
	2987	if (cur_seg != nxt_seg) {
	2988	if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) \|\| (psp[cur_seg] != psp[nxt_seg]))
	2989	/*
	2990	* if the segment we're about
	2991	* to step into is not
	2992	* contiguous to the one we're
	2993	* currently in, or it's in a
	2994	* different paging file....
	2995	* we stop here and generate
	2996	* the I/O
	2997	*/
	2998	break;
	2999	}
	3000	}
	3001	}
	3002	num_dirty = page_index - first_dirty;
	3003	must_abort = 1;
	3004
	3005	if (num_dirty) {
	3006	upl_offset = first_dirty * vm_page_size;
	3007	seg_index = first_dirty / pages_in_cl;
	3008	seg_offset = upl_offset - (seg_index * cl_size);
	3009	transfer_size = num_dirty * vm_page_size;
	3010
	3011
	3012	while (transfer_size) {
	3013	int seg_size;
	3014
	3015	if ((seg_size = cl_size -
	3016	(upl_offset % cl_size))
	3017	> transfer_size)
	3018	seg_size = transfer_size;
	3019
	3020	ps_vs_write_complete(vs,
	3021	upl->offset + upl_offset,
	3022	seg_size, error);
	3023
	3024	transfer_size -= seg_size;
	3025	upl_offset += seg_size;
	3026	}
	3027	upl_offset = first_dirty * vm_page_size;
	3028	transfer_size = num_dirty * vm_page_size;
	3029	error = ps_write_file(psp[seg_index],
	3030	upl, upl_offset,
	3031	ps_offset[seg_index]
	3032	+ seg_offset,
	3033	transfer_size, flags);
	3034	must_abort = 0;
	3035	}
	3036	if (must_abort) {
	3037	boolean_t empty = FALSE;
	3038	upl_abort_range(upl,
	3039	first_dirty * vm_page_size,
	3040	num_dirty * vm_page_size,
	3041	UPL_ABORT_NOTIFY_EMPTY,
	3042	&empty);
	3043	if (empty)
	3044	upl_deallocate(upl);
	3045	}
	3046	}
	3047
	3048	} else {
	3049	assert(cnt <= (vm_page_size << vs->vs_clshift));
	3050	list_size = cnt;
	3051
	3052	page_index = 0;
	3053	/* The caller provides a mapped_data which is derived */
	3054	/* from a temporary object. The targeted pages are */
	3055	/* guaranteed to be set at offset 0 in the mapped_data */
	3056	/* The actual offset however must still be derived */
	3057	/* from the offset in the vs in question */
	3058	mobj_base_addr = offset;
	3059	mobj_target_addr = mobj_base_addr;
	3060
	3061	for (transfer_size = list_size; transfer_size != 0;) {
	3062	actual_offset = ps_clmap(vs, mobj_target_addr,
	3063	&clmap, CL_ALLOC,
	3064	transfer_size < cl_size ?
	3065	transfer_size : cl_size, 0);
	3066	if(actual_offset == (vm_offset_t) -1) {
	3067	error = 1;
	3068	break;
	3069	}
	3070	cnt = MIN(transfer_size,
	3071	CLMAP_NPGS(clmap) * vm_page_size);
	3072	ps = CLMAP_PS(clmap);
	3073	/* Assume that the caller has given us contiguous */
	3074	/* pages */
	3075	if(cnt) {
	3076	ps_vs_write_complete(vs, mobj_target_addr,
	3077	cnt, error);
	3078	error = ps_write_file(ps, internal_upl,
	3079	0, actual_offset,
	3080	cnt, flags);
	3081	if (error)
	3082	break;
	3083	}
	3084	if (error)
	3085	break;
	3086	actual_offset += cnt;
	3087	mobj_target_addr += cnt;
	3088	transfer_size -= cnt;
	3089	cnt = 0;
	3090
	3091	if (error)
	3092	break;
	3093	}
	3094	}
	3095	if(error)
	3096	return KERN_FAILURE;
	3097	else
	3098	return KERN_SUCCESS;
	3099	}
	3100
	3101	vm_size_t
	3102	ps_vstruct_allocated_size(
	3103	vstruct_t vs)
	3104	{
	3105	int num_pages;
	3106	struct vs_map *vsmap;
	3107	int i, j, k;
	3108
	3109	num_pages = 0;
	3110	if (vs->vs_indirect) {
	3111	/* loop on indirect maps */
	3112	for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
	3113	vsmap = vs->vs_imap[i];
	3114	if (vsmap == NULL)
	3115	continue;
	3116	/* loop on clusters in this indirect map */
	3117	for (j = 0; j < CLMAP_ENTRIES; j++) {
	3118	if (VSM_ISCLR(vsmap[j]) \|\|
	3119	VSM_ISERR(vsmap[j]))
	3120	continue;
	3121	/* loop on pages in this cluster */
	3122	for (k = 0; k < VSCLSIZE(vs); k++) {
	3123	if ((VSM_BMAP(vsmap[j])) & (1 << k))
	3124	num_pages++;
	3125	}
	3126	}
	3127	}
	3128	} else {
	3129	vsmap = vs->vs_dmap;
	3130	if (vsmap == NULL)
	3131	return 0;
	3132	/* loop on clusters in the direct map */
	3133	for (j = 0; j < CLMAP_ENTRIES; j++) {
	3134	if (VSM_ISCLR(vsmap[j]) \|\|
	3135	VSM_ISERR(vsmap[j]))
	3136	continue;
	3137	/* loop on pages in this cluster */
	3138	for (k = 0; k < VSCLSIZE(vs); k++) {
	3139	if ((VSM_BMAP(vsmap[j])) & (1 << k))
	3140	num_pages++;
	3141	}
	3142	}
	3143	}
	3144
	3145	return ptoa(num_pages);
	3146	}
	3147
	3148	size_t
	3149	ps_vstruct_allocated_pages(
	3150	vstruct_t vs,
	3151	default_pager_page_t *pages,
	3152	size_t pages_size)
	3153	{
	3154	int num_pages;
	3155	struct vs_map *vsmap;
	3156	vm_offset_t offset;
	3157	int i, j, k;
	3158
	3159	num_pages = 0;
	3160	offset = 0;
	3161	if (vs->vs_indirect) {
	3162	/* loop on indirect maps */
	3163	for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
	3164	vsmap = vs->vs_imap[i];
	3165	if (vsmap == NULL) {
	3166	offset += (vm_page_size * CLMAP_ENTRIES *
	3167	VSCLSIZE(vs));
	3168	continue;
	3169	}
	3170	/* loop on clusters in this indirect map */
	3171	for (j = 0; j < CLMAP_ENTRIES; j++) {
	3172	if (VSM_ISCLR(vsmap[j]) \|\|
	3173	VSM_ISERR(vsmap[j])) {
	3174	offset += vm_page_size * VSCLSIZE(vs);
	3175	continue;
	3176	}
	3177	/* loop on pages in this cluster */
	3178	for (k = 0; k < VSCLSIZE(vs); k++) {
	3179	if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
	3180	num_pages++;
	3181	if (num_pages < pages_size)
	3182	pages++->dpp_offset =
	3183	offset;
	3184	}
	3185	offset += vm_page_size;
	3186	}
	3187	}
	3188	}
	3189	} else {
	3190	vsmap = vs->vs_dmap;
	3191	if (vsmap == NULL)
	3192	return 0;
	3193	/* loop on clusters in the direct map */
	3194	for (j = 0; j < CLMAP_ENTRIES; j++) {
	3195	if (VSM_ISCLR(vsmap[j]) \|\|
	3196	VSM_ISERR(vsmap[j])) {
	3197	offset += vm_page_size * VSCLSIZE(vs);
	3198	continue;
	3199	}
	3200	/* loop on pages in this cluster */
	3201	for (k = 0; k < VSCLSIZE(vs); k++) {
	3202	if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
	3203	num_pages++;
	3204	if (num_pages < pages_size)
	3205	pages++->dpp_offset = offset;
	3206	}
	3207	offset += vm_page_size;
	3208	}
	3209	}
	3210	}
	3211
	3212	return num_pages;
	3213	}
	3214
	3215
		/*
		 * ps_vstruct_transfer_from_segment:
		 *	Scan every cluster map entry of "vs" and, for each entry that
		 *	is neither clear nor in error and whose paging segment is
		 *	"segment", call vs_cluster_transfer() on that cluster, passing
		 *	"upl" through.
		 *
		 *	vs_xfer_pending is set, and the sync-writer / writer / reader
		 *	gates are taken, before scanning.  After each successful
		 *	transfer the gates are dropped and re-taken.  If vs_indirect
		 *	no longer matches the branch being executed at that point, the
		 *	scan restarts at vs_changed.
		 *
		 *	Returns KERN_FAILURE as soon as a vs_cluster_transfer() call
		 *	fails, KERN_SUCCESS otherwise.  On every return path
		 *	vs_xfer_pending is cleared and vs_finish_write() is called.
		 *
		 *	NOTE(review): old_vsmap, new_vsmap and k are declared but not
		 *	used in this function.
		 */
	3216	kern_return_t
	3217	ps_vstruct_transfer_from_segment(
	3218	vstruct_t vs,
	3219	paging_segment_t segment,
	3220	upl_t upl)
	3221	{
	3222	struct vs_map *vsmap;
	3223	struct vs_map old_vsmap;
	3224	struct vs_map new_vsmap;
	3225	int i, j, k;
	3226
	3227	VS_LOCK(vs); /* block all work on this vstruct */
	3228	/* can't allow the normal multiple write */
	3229	/* semantic because writes may conflict */
	3230	vs->vs_xfer_pending = TRUE;
	3231	vs_wait_for_sync_writers(vs);
	3232	vs_start_write(vs);
	3233	vs_wait_for_readers(vs);
	3234	/* we will unlock the vs to allow other writes while transferring */
	3235	/* and will be guaranteed of the persistance of the vs struct */
	3236	/* because the caller of ps_vstruct_transfer_from_segment bumped */
	3237	/* vs_async_pending */
	3238	/* OK we now have guaranteed no other parties are accessing this */
	3239	/* vs. Now that we are also supporting simple lock versions of */
	3240	/* vs_lock we cannot hold onto VS_LOCK as we may block below. */
	3241	/* our purpose in holding it before was the multiple write case */
	3242	/* we now use the boolean xfer_pending to do that. We can use */
	3243	/* a boolean instead of a count because we have guaranteed single */
	3244	/* file access to this code in its caller */
	3245	VS_UNLOCK(vs);
		/* restart point: taken when the map flips between direct and indirect */
	3246	vs_changed:
	3247	if (vs->vs_indirect) {
	3248	int vsmap_size;
	3249	int clmap_off;
	3250	/* loop on indirect maps */
	3251	for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
	3252	vsmap = vs->vs_imap[i];
	3253	if (vsmap == NULL)
	3254	continue;
	3255	/* loop on clusters in this indirect map */
		/* clmap_off: byte offset in the vstruct of the first cluster of block i */
	3256	clmap_off = (vm_page_size * CLMAP_ENTRIES *
	3257	VSCLSIZE(vs) * i);
		/* the last indirect block only holds the remaining vs_size entries */
	3258	if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
	3259	vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
	3260	else
	3261	vsmap_size = CLMAP_ENTRIES;
	3262	for (j = 0; j < vsmap_size; j++) {
	3263	if (VSM_ISCLR(vsmap[j]) \|\|
	3264	VSM_ISERR(vsmap[j]) \|\|
	3265	(VSM_PS(vsmap[j]) != segment))
	3266	continue;
	3267	if(vs_cluster_transfer(vs,
	3268	(vm_page_size * (j << vs->vs_clshift))
	3269	+ clmap_off,
	3270	vm_page_size << vs->vs_clshift,
	3271	upl)
	3272	!= KERN_SUCCESS) {
	3273	VS_LOCK(vs);
	3274	vs->vs_xfer_pending = FALSE;
	3275	VS_UNLOCK(vs);
	3276	vs_finish_write(vs);
	3277	return KERN_FAILURE;
	3278	}
	3279	/* allow other readers/writers during transfer*/
	3280	VS_LOCK(vs);
	3281	vs->vs_xfer_pending = FALSE;
	3282	VS_UNLOCK(vs);
	3283	vs_finish_write(vs);
		/* re-take the gates; here the waits run with VS_LOCK held */
	3284	VS_LOCK(vs);
	3285	vs->vs_xfer_pending = TRUE;
	3286	vs_wait_for_sync_writers(vs);
	3287	vs_start_write(vs);
	3288	vs_wait_for_readers(vs);
	3289	VS_UNLOCK(vs);
	3290	if (!(vs->vs_indirect)) {
	3291	goto vs_changed;
	3292	}
	3293	}
	3294	}
	3295	} else {
	3296	vsmap = vs->vs_dmap;
	3297	if (vsmap == NULL) {
	3298	VS_LOCK(vs);
	3299	vs->vs_xfer_pending = FALSE;
	3300	VS_UNLOCK(vs);
	3301	vs_finish_write(vs);
	3302	return KERN_SUCCESS;
	3303	}
	3304	/* loop on clusters in the direct map */
	3305	for (j = 0; j < vs->vs_size; j++) {
	3306	if (VSM_ISCLR(vsmap[j]) \|\|
	3307	VSM_ISERR(vsmap[j]) \|\|
	3308	(VSM_PS(vsmap[j]) != segment))
	3309	continue;
	3310	if(vs_cluster_transfer(vs,
	3311	vm_page_size * (j << vs->vs_clshift),
	3312	vm_page_size << vs->vs_clshift,
	3313	upl) != KERN_SUCCESS) {
	3314	VS_LOCK(vs);
	3315	vs->vs_xfer_pending = FALSE;
	3316	VS_UNLOCK(vs);
	3317	vs_finish_write(vs);
	3318	return KERN_FAILURE;
	3319	}
	3320	/* allow other readers/writers during transfer*/
	3321	VS_LOCK(vs);
	3322	vs->vs_xfer_pending = FALSE;
	3323	VS_UNLOCK(vs);
	3324	vs_finish_write(vs);
		/*
		 * NOTE(review): unlike the initial sequence and the indirect
		 * branch above, VS_UNLOCK is issued here BEFORE the three
		 * wait/start calls, so they run without VS_LOCK held.  Confirm
		 * which ordering those helpers expect.
		 */
	3325	VS_LOCK(vs);
	3326	vs->vs_xfer_pending = TRUE;
	3327	VS_UNLOCK(vs);
	3328	vs_wait_for_sync_writers(vs);
	3329	vs_start_write(vs);
	3330	vs_wait_for_readers(vs);
	3331	if (vs->vs_indirect) {
	3332	goto vs_changed;
	3333	}
	3334	}
	3335	}
	3336
		/* scan complete: drop the transfer flag and the write gate */
	3337	VS_LOCK(vs);
	3338	vs->vs_xfer_pending = FALSE;
	3339	VS_UNLOCK(vs);
	3340	vs_finish_write(vs);
	3341	return KERN_SUCCESS;
	3342	}
	3343
	3344
	3345
	3346	vs_map_t
	3347	vs_get_map_entry(
	3348	vstruct_t vs,
	3349	vm_offset_t offset)
	3350	{
	3351	struct vs_map *vsmap;
	3352	vm_offset_t cluster;
	3353
	3354	cluster = atop(offset) >> vs->vs_clshift;
	3355	if (vs->vs_indirect) {
	3356	long ind_block = cluster/CLMAP_ENTRIES;
	3357
	3358	/* Is the indirect block allocated? */
	3359	vsmap = vs->vs_imap[ind_block];
	3360	if(vsmap == (vs_map_t) NULL)
	3361	return vsmap;
	3362	} else
	3363	vsmap = vs->vs_dmap;
	3364	vsmap += cluster%CLMAP_ENTRIES;
	3365	return vsmap;
	3366	}
	3367
	3368	kern_return_t
	3369	vs_cluster_transfer(
	3370	vstruct_t vs,
	3371	vm_offset_t offset,
	3372	vm_size_t cnt,
	3373	upl_t upl)
	3374	{
	3375	vm_offset_t actual_offset;
	3376	paging_segment_t ps;
	3377	struct clmap clmap;
	3378	kern_return_t error = KERN_SUCCESS;
	3379	int size, size_wanted, i;
	3380	unsigned int residual;
	3381	int unavail_size;
	3382	default_pager_thread_t *dpt;
	3383	boolean_t dealloc;
	3384	struct vs_map *vsmap_ptr;
	3385	struct vs_map read_vsmap;
	3386	struct vs_map original_read_vsmap;
	3387	struct vs_map write_vsmap;
	3388	upl_t sync_upl;
	3389	vm_offset_t ioaddr;
	3390
	3391	/* vs_cluster_transfer reads in the pages of a cluster and
	3392	* then writes these pages back to new backing store. The
	3393	* segment the pages are being read from is assumed to have
	3394	* been taken off-line and is no longer considered for new
	3395	* space requests.
	3396	*/
	3397
	3398	/*
	3399	* This loop will be executed once per cluster referenced.
	3400	* Typically this means once, since it's unlikely that the
	3401	* VM system will ask for anything spanning cluster boundaries.
	3402	*
	3403	* If there are holes in a cluster (in a paging segment), we stop
	3404	* reading at the hole, then loop again, hoping to
	3405	* find valid pages later in the cluster. This continues until
	3406	* the entire range has been examined, and read, if present. The
	3407	* pages are written as they are read. If a failure occurs after
	3408	* some pages are written the unmap call at the bottom of the loop
	3409	* recovers the backing store and the old backing store remains
	3410	* in effect.
	3411	*/
	3412
	3413	VSM_CLR(write_vsmap);
	3414	VSM_CLR(original_read_vsmap);
	3415	/* grab the actual object's pages to sync with I/O */
	3416	while (cnt && (error == KERN_SUCCESS)) {
	3417	vsmap_ptr = vs_get_map_entry(vs, offset);
	3418	actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
	3419
	3420	if (actual_offset == (vm_offset_t) -1) {
	3421
	3422	/*
	3423	* Nothing left to write in this cluster at least
	3424	* set write cluster information for any previous
	3425	* write, clear for next cluster, if there is one
	3426	*/
	3427	unsigned int local_size, clmask, clsize;
	3428
	3429	clsize = vm_page_size << vs->vs_clshift;
	3430	clmask = clsize - 1;
	3431	local_size = clsize - (offset & clmask);
	3432	ASSERT(local_size);
	3433	local_size = MIN(local_size, cnt);
	3434
	3435	/* This cluster has no data in it beyond what may */
	3436	/* have been found on a previous iteration through */
	3437	/* the loop "write_vsmap" */
	3438	*vsmap_ptr = write_vsmap;
	3439	VSM_CLR(write_vsmap);
	3440	VSM_CLR(original_read_vsmap);
	3441
	3442	cnt -= local_size;
	3443	offset += local_size;
	3444	continue;
	3445	}
	3446
	3447	/*
	3448	* Count up contiguous available or unavailable
	3449	* pages.
	3450	*/
	3451	ps = CLMAP_PS(clmap);
	3452	ASSERT(ps);
	3453	size = 0;
	3454	unavail_size = 0;
	3455	for (i = 0;
	3456	(size < cnt) && (unavail_size < cnt) &&
	3457	(i < CLMAP_NPGS(clmap)); i++) {
	3458	if (CLMAP_ISSET(clmap, i)) {
	3459	if (unavail_size != 0)
	3460	break;
	3461	size += vm_page_size;
	3462	BS_STAT(ps->ps_bs,
	3463	ps->ps_bs->bs_pages_in++);
	3464	} else {
	3465	if (size != 0)
	3466	break;
	3467	unavail_size += vm_page_size;
	3468	}
	3469	}
	3470
	3471	if (size == 0) {
	3472	ASSERT(unavail_size);
	3473	cnt -= unavail_size;
	3474	offset += unavail_size;
	3475	if((offset & ((vm_page_size << vs->vs_clshift) - 1))
	3476	== 0) {
	3477	/* There is no more to transfer in this
	3478	cluster
	3479	*/
	3480	*vsmap_ptr = write_vsmap;
	3481	VSM_CLR(write_vsmap);
	3482	VSM_CLR(original_read_vsmap);
	3483	}
	3484	continue;
	3485	}
	3486
	3487	if(VSM_ISCLR(original_read_vsmap))
	3488	original_read_vsmap = *vsmap_ptr;
	3489
	3490	if(ps->ps_segtype == PS_PARTITION) {
	3491	/*
	3492	NEED TO ISSUE WITH SYNC & NO COMMIT
	3493	error = ps_read_device(ps, actual_offset, &buffer,
	3494	size, &residual, flags);
	3495	*/
	3496	} else {
	3497	/* NEED TO ISSUE WITH SYNC & NO COMMIT */
	3498	error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
	3499	size, &residual,
	3500	(UPL_IOSYNC \| UPL_NOCOMMIT));
	3501	}
	3502
	3503	read_vsmap = *vsmap_ptr;
	3504
	3505
	3506	/*
	3507	* Adjust counts and put data in new BS. Optimize for the
	3508	* common case, i.e. no error and/or partial data.
	3509	* If there was an error, then we need to error the entire
	3510	* range, even if some data was successfully read.
	3511	*
	3512	*/
	3513	if ((error == KERN_SUCCESS) && (residual == 0)) {
	3514	int page_list_count = 0;
	3515
	3516	/*
	3517	* Got everything we asked for, supply the data to
	3518	* the new BS. Note that as a side effect of supplying
	3519	* the data, the buffer holding the supplied data is
	3520	* deallocated from the pager's address space unless
	3521	* the write is unsuccessful.
	3522	*/
	3523
	3524	/* note buffer will be cleaned up in all cases by */
	3525	/* internal_cluster_write or if an error on write */
	3526	/* the vm_map_copy_page_discard call */
	3527	*vsmap_ptr = write_vsmap;
	3528
	3529	if(vs_cluster_write(vs, upl, offset,
	3530	size, TRUE, UPL_IOSYNC \| UPL_NOCOMMIT ) != KERN_SUCCESS) {
	3531	error = KERN_FAILURE;
	3532	if(!(VSM_ISCLR(*vsmap_ptr))) {
	3533	/* unmap the new backing store object */
	3534	ps_clunmap(vs, offset, size);
	3535	}
	3536	/* original vsmap */
	3537	*vsmap_ptr = original_read_vsmap;
	3538	VSM_CLR(write_vsmap);
	3539	} else {
	3540	if((offset + size) &
	3541	((vm_page_size << vs->vs_clshift)
	3542	- 1)) {
	3543	/* There is more to transfer in this
	3544	cluster
	3545	*/
	3546	write_vsmap = *vsmap_ptr;
	3547	*vsmap_ptr = read_vsmap;
	3548	} else {
	3549	/* discard the old backing object */
	3550	write_vsmap = *vsmap_ptr;
	3551	*vsmap_ptr = read_vsmap;
	3552	ps_clunmap(vs, offset, size);
	3553	*vsmap_ptr = write_vsmap;
	3554	VSM_CLR(write_vsmap);
	3555	VSM_CLR(original_read_vsmap);
	3556	}
	3557	}
	3558	} else {
	3559	size_wanted = size;
	3560	if (error == KERN_SUCCESS) {
	3561	if (residual == size) {
	3562	/*
	3563	* If a read operation returns no error
	3564	* and no data moved, we turn it into
	3565	* an error, assuming we're reading at
	3566	* or beyond EOF.
	3567	* Fall through and error the entire
	3568	* range.
	3569	*/
	3570	error = KERN_FAILURE;
	3571	*vsmap_ptr = write_vsmap;
	3572	if(!(VSM_ISCLR(*vsmap_ptr))) {
	3573	/* unmap the new backing store object */
	3574	ps_clunmap(vs, offset, size);
	3575	}
	3576	*vsmap_ptr = original_read_vsmap;
	3577	VSM_CLR(write_vsmap);
	3578	continue;
	3579	} else {
	3580	/*
	3581	* Otherwise, we have partial read.
	3582	* This is also considered an error
	3583	* for the purposes of cluster transfer
	3584	*/
	3585	error = KERN_FAILURE;
	3586	*vsmap_ptr = write_vsmap;
	3587	if(!(VSM_ISCLR(*vsmap_ptr))) {
	3588	/* unmap the new backing store object */
	3589	ps_clunmap(vs, offset, size);
	3590	}
	3591	*vsmap_ptr = original_read_vsmap;
	3592	VSM_CLR(write_vsmap);
	3593	continue;
	3594	}
	3595	}
	3596
	3597	}
	3598	cnt -= size;
	3599	offset += size;
	3600
	3601	} /* END while (cnt && (error == 0)) */
	3602	if(!VSM_ISCLR(write_vsmap))
	3603	*vsmap_ptr = write_vsmap;
	3604
	3605	return error;
	3606	}
	3607
	3608	kern_return_t
	3609	default_pager_add_file(MACH_PORT_FACE backing_store,
	3610	int *vp,
	3611	int record_size,
	3612	long size)
	3613	{
	3614	backing_store_t bs;
	3615	paging_segment_t ps;
	3616	int i;
	3617	int error;
	3618
	3619	if ((bs = backing_store_lookup(backing_store))
	3620	== BACKING_STORE_NULL)
	3621	return KERN_INVALID_ARGUMENT;
	3622
	3623	PSL_LOCK();
	3624	for (i = 0; i <= paging_segment_max; i++) {
	3625	ps = paging_segments[i];
	3626	if (ps == PAGING_SEGMENT_NULL)
	3627	continue;
	3628	if (ps->ps_segtype != PS_FILE)
	3629	continue;
	3630
	3631	/*
	3632	* Check for overlap on same device.
	3633	*/
	3634	if (ps->ps_vnode == (struct vnode *)vp) {
	3635	PSL_UNLOCK();
	3636	BS_UNLOCK(bs);
	3637	return KERN_INVALID_ARGUMENT;
	3638	}
	3639	}
	3640	PSL_UNLOCK();
	3641
	3642	/*
	3643	* Set up the paging segment
	3644	*/
	3645	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
	3646	if (ps == PAGING_SEGMENT_NULL) {
	3647	BS_UNLOCK(bs);
	3648	return KERN_RESOURCE_SHORTAGE;
	3649	}
	3650
	3651	ps->ps_segtype = PS_FILE;
	3652	ps->ps_vnode = (struct vnode *)vp;
	3653	ps->ps_offset = 0;
	3654	ps->ps_record_shift = local_log2(vm_page_size / record_size);
	3655	ps->ps_recnum = size;
	3656	ps->ps_pgnum = size >> ps->ps_record_shift;
	3657
	3658	ps->ps_pgcount = ps->ps_pgnum;
	3659	ps->ps_clshift = local_log2(bs->bs_clsize);
	3660	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
	3661	ps->ps_hint = 0;
	3662
	3663	PS_LOCK_INIT(ps);
	3664	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
	3665	if (!ps->ps_bmap) {
	3666	kfree((vm_offset_t)ps, sizeof *ps);
	3667	BS_UNLOCK(bs);
	3668	return KERN_RESOURCE_SHORTAGE;
	3669	}
	3670	for (i = 0; i < ps->ps_ncls; i++) {
	3671	clrbit(ps->ps_bmap, i);
	3672	}
	3673
	3674	ps->ps_going_away = FALSE;
	3675	ps->ps_bs = bs;
	3676
	3677	if ((error = ps_enter(ps)) != 0) {
	3678	kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
	3679	kfree((vm_offset_t)ps, sizeof *ps);
	3680	BS_UNLOCK(bs);
	3681	return KERN_RESOURCE_SHORTAGE;
	3682	}
	3683
	3684	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
	3685	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
	3686	PSL_LOCK();
	3687	dp_pages_free += ps->ps_pgcount;
	3688	PSL_UNLOCK();
	3689
	3690	BS_UNLOCK(bs);
	3691
	3692	bs_more_space(ps->ps_clcount);
	3693
	3694	DEBUG(DEBUG_BS_INTERNAL,
	3695	("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
	3696	device, offset, size, record_size,
	3697	ps->ps_record_shift, ps->ps_pgnum));
	3698
	3699	return KERN_SUCCESS;
	3700	}
	3701
	3702
	3703
	3704	kern_return_t
	3705	ps_read_file(
	3706	paging_segment_t ps,
	3707	upl_t upl,
	3708	vm_offset_t upl_offset,
	3709	vm_offset_t offset,
	3710	unsigned int size,
	3711	unsigned int *residualp,
	3712	int flags)
	3713	{
	3714	vm_object_offset_t f_offset;
	3715	int error = 0;
	3716	int result;
	3717
	3718
	3719	clustered_reads[atop(size)]++;
	3720
	3721	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
	3722
	3723	/* for transfer case we need to pass uploffset and flags */
	3724	error = vnode_pagein(ps->ps_vnode,
	3725	upl, upl_offset, f_offset, (vm_size_t)size, flags \| UPL_NORDAHEAD, NULL);
	3726
	3727	/* The vnode_pagein semantic is somewhat at odds with the existing */
	3728	/* device_read semantic. Partial reads are not experienced at this */
	3729	/* level. It is up to the bit map code and cluster read code to */
	3730	/* check that requested data locations are actually backed, and the */
	3731	/* pagein code to either read all of the requested data or return an */
	3732	/* error. */
	3733
	3734	if (error)
	3735	result = KERN_FAILURE;
	3736	else {
	3737	*residualp = 0;
	3738	result = KERN_SUCCESS;
	3739	}
	3740	return result;
	3741	}
	3742
	3743	kern_return_t
	3744	ps_write_file(
	3745	paging_segment_t ps,
	3746	upl_t upl,
	3747	vm_offset_t upl_offset,
	3748	vm_offset_t offset,
	3749	unsigned int size,
	3750	int flags)
	3751	{
	3752	vm_object_offset_t f_offset;
	3753	kern_return_t result;
	3754
	3755	int error = 0;
	3756
	3757	clustered_writes[atop(size)]++;
	3758	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
	3759
	3760	if (vnode_pageout(ps->ps_vnode,
	3761	upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
	3762	result = KERN_FAILURE;
	3763	else
	3764	result = KERN_SUCCESS;
	3765
	3766	return result;
	3767	}
	3768
	3769	kern_return_t
	3770	default_pager_triggers(MACH_PORT_FACE default_pager,
	3771	int hi_wat,
	3772	int lo_wat,
	3773	int flags,
	3774	MACH_PORT_FACE trigger_port)
	3775	{
	3776	MACH_PORT_FACE release;
	3777	kern_return_t kr;
	3778
	3779	PSL_LOCK();
	3780	if (flags == HI_WAT_ALERT) {
	3781	release = min_pages_trigger_port;
	3782	min_pages_trigger_port = trigger_port;
	3783	minimum_pages_remaining = hi_wat/vm_page_size;
	3784	bs_low = FALSE;
	3785	kr = KERN_SUCCESS;
	3786	} else if (flags == LO_WAT_ALERT) {
	3787	release = max_pages_trigger_port;
	3788	max_pages_trigger_port = trigger_port;
	3789	maximum_pages_free = lo_wat/vm_page_size;
	3790	kr = KERN_SUCCESS;
	3791	} else {
	3792	release = trigger_port;
	3793	kr = KERN_INVALID_ARGUMENT;
	3794	}
	3795	PSL_UNLOCK();
	3796
	3797	if (IP_VALID(release))
	3798	ipc_port_release_send(release);
	3799
	3800	return kr;
	3801	}