]> git.saurik.com Git - apple/xnu.git/blame - osfmk/default_pager/dp_backing_store.c
xnu-1504.15.3.tar.gz
[apple/xnu.git] / osfmk / default_pager / dp_backing_store.c
CommitLineData
1c79356b 1/*
b0d623f7 2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
1c79356b 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57/*
58 * Default Pager.
59 * Paging File Management.
60 */
61
91447636 62#include <mach/host_priv.h>
0b4e3aa0 63#include <mach/memory_object_control.h>
1c79356b 64#include <mach/memory_object_server.h>
91447636
A
65#include <mach/upl.h>
66#include <default_pager/default_pager_internal.h>
1c79356b 67#include <default_pager/default_pager_alerts.h>
91447636
A
68#include <default_pager/default_pager_object_server.h>
69
70#include <ipc/ipc_types.h>
1c79356b
A
71#include <ipc/ipc_port.h>
72#include <ipc/ipc_space.h>
91447636
A
73
74#include <kern/kern_types.h>
75#include <kern/host.h>
1c79356b
A
76#include <kern/queue.h>
77#include <kern/counters.h>
78#include <kern/sched_prim.h>
91447636 79
1c79356b
A
80#include <vm/vm_kern.h>
81#include <vm/vm_pageout.h>
1c79356b 82#include <vm/vm_map.h>
91447636
A
83#include <vm/vm_object.h>
84#include <vm/vm_protos.h>
85
2d21ac55 86
b0d623f7 87/* todo - need large internal object support */
1c79356b 88
0b4e3aa0
A
89/*
90 * ALLOC_STRIDE... the maximum number of bytes allocated from
91 * a swap file before moving on to the next swap file... if
92 * all swap files reside on a single disk, this value should
93 * be very large (this is the default assumption)... if the
94 * swap files are spread across multiple disks, than this value
95 * should be small (128 * 1024)...
96 *
97 * This should be determined dynamically in the future
98 */
1c79356b 99
0b4e3aa0 100#define ALLOC_STRIDE (1024 * 1024 * 1024)
1c79356b
A
101int physical_transfer_cluster_count = 0;
102
9bccf70c 103#define VM_SUPER_CLUSTER 0x40000
0b4c1975 104#define VM_SUPER_PAGES (VM_SUPER_CLUSTER / PAGE_SIZE)
1c79356b
A
105
106/*
107 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
108 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
109 */
0b4c1975
A
110#define VSTRUCT_MIN_CLSHIFT 0
111
1c79356b 112#define VSTRUCT_DEF_CLSHIFT 2
1c79356b
A
113int default_pager_clsize = 0;
114
0b4c1975
A
115int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
116
1c79356b 117/* statistics */
0b4e3aa0
A
118unsigned int clustered_writes[VM_SUPER_PAGES+1];
119unsigned int clustered_reads[VM_SUPER_PAGES+1];
1c79356b
A
120
121/*
122 * Globals used for asynchronous paging operations:
123 * vs_async_list: head of list of to-be-completed I/O ops
124 * async_num_queued: number of pages completed, but not yet
125 * processed by async thread.
126 * async_requests_out: number of pages of requests not completed.
127 */
128
129#if 0
130struct vs_async *vs_async_list;
131int async_num_queued;
132int async_requests_out;
133#endif
134
135
136#define VS_ASYNC_REUSE 1
137struct vs_async *vs_async_free_list;
138
b0d623f7 139lck_mtx_t default_pager_async_lock; /* Protects globals above */
1c79356b
A
140
141
142int vs_alloc_async_failed = 0; /* statistics */
143int vs_alloc_async_count = 0; /* statistics */
144struct vs_async *vs_alloc_async(void); /* forward */
145void vs_free_async(struct vs_async *vsa); /* forward */
146
147
148#define VS_ALLOC_ASYNC() vs_alloc_async()
149#define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
150
b0d623f7
A
151#define VS_ASYNC_LOCK() lck_mtx_lock(&default_pager_async_lock)
152#define VS_ASYNC_UNLOCK() lck_mtx_unlock(&default_pager_async_lock)
153#define VS_ASYNC_LOCK_INIT() lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
1c79356b
A
154#define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
155/*
156 * Paging Space Hysteresis triggers and the target notification port
157 *
158 */
b0d623f7
A
159unsigned int dp_pages_free_drift_count = 0;
160unsigned int dp_pages_free_drifted_max = 0;
1c79356b
A
161unsigned int minimum_pages_remaining = 0;
162unsigned int maximum_pages_free = 0;
163ipc_port_t min_pages_trigger_port = NULL;
164ipc_port_t max_pages_trigger_port = NULL;
165
b0d623f7 166boolean_t use_emergency_swap_file_first = FALSE;
1c79356b 167boolean_t bs_low = FALSE;
0b4e3aa0 168int backing_store_release_trigger_disable = 0;
b0d623f7 169boolean_t backing_store_stop_compaction = FALSE;
91447636
A
170
171
172/* Have we decided if swap needs to be encrypted yet ? */
173boolean_t dp_encryption_inited = FALSE;
174/* Should we encrypt swap ? */
175boolean_t dp_encryption = FALSE;
1c79356b 176
0b4c1975
A
177boolean_t dp_isssd = FALSE;
178
1c79356b
A
179
180/*
181 * Object sizes are rounded up to the next power of 2,
182 * unless they are bigger than a given maximum size.
183 */
184vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
185
186/*
187 * List of all backing store and segments.
188 */
b0d623f7 189MACH_PORT_FACE emergency_segment_backing_store;
1c79356b
A
190struct backing_store_list_head backing_store_list;
191paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
b0d623f7 192lck_mtx_t paging_segments_lock;
1c79356b
A
193int paging_segment_max = 0;
194int paging_segment_count = 0;
195int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
196
197
198/*
199 * Total pages free in system
200 * This differs from clusters committed/avail which is a measure of the
201 * over commitment of paging segments to backing store. An idea which is
202 * likely to be deprecated.
203 */
204unsigned int dp_pages_free = 0;
b0d623f7 205unsigned int dp_pages_reserve = 0;
1c79356b
A
206unsigned int cluster_transfer_minimum = 100;
207
91447636 208/* forward declarations */
b0d623f7
A
209kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int); /* forward */
210kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int); /* forward */
91447636
A
211default_pager_thread_t *get_read_buffer( void );
212kern_return_t ps_vstruct_transfer_from_segment(
213 vstruct_t vs,
214 paging_segment_t segment,
215 upl_t upl);
b0d623f7
A
216kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
217kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
91447636
A
218kern_return_t vs_cluster_transfer(
219 vstruct_t vs,
b0d623f7
A
220 dp_offset_t offset,
221 dp_size_t cnt,
91447636
A
222 upl_t upl);
223vs_map_t vs_get_map_entry(
224 vstruct_t vs,
b0d623f7 225 dp_offset_t offset);
0b4e3aa0 226
b0d623f7
A
227kern_return_t
228default_pager_backing_store_delete_internal( MACH_PORT_FACE );
1c79356b
A
229
230default_pager_thread_t *
91447636 231get_read_buffer( void )
1c79356b
A
232{
233 int i;
234
235 DPT_LOCK(dpt_lock);
236 while(TRUE) {
237 for (i=0; i<default_pager_internal_count; i++) {
238 if(dpt_array[i]->checked_out == FALSE) {
239 dpt_array[i]->checked_out = TRUE;
240 DPT_UNLOCK(dpt_lock);
241 return dpt_array[i];
242 }
243 }
9bccf70c 244 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
1c79356b
A
245 }
246}
247
248void
249bs_initialize(void)
250{
251 int i;
252
253 /*
254 * List of all backing store.
255 */
256 BSL_LOCK_INIT();
257 queue_init(&backing_store_list.bsl_queue);
258 PSL_LOCK_INIT();
259
260 VS_ASYNC_LOCK_INIT();
261#if VS_ASYNC_REUSE
262 vs_async_free_list = NULL;
263#endif /* VS_ASYNC_REUSE */
264
0b4e3aa0 265 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
1c79356b
A
266 clustered_writes[i] = 0;
267 clustered_reads[i] = 0;
268 }
269
270}
271
272/*
273 * When things do not quite workout...
274 */
275void bs_no_paging_space(boolean_t); /* forward */
276
277void
278bs_no_paging_space(
279 boolean_t out_of_memory)
280{
1c79356b
A
281
282 if (out_of_memory)
283 dprintf(("*** OUT OF MEMORY ***\n"));
284 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
285}
286
287void bs_more_space(int); /* forward */
288void bs_commit(int); /* forward */
289
290boolean_t user_warned = FALSE;
291unsigned int clusters_committed = 0;
292unsigned int clusters_available = 0;
293unsigned int clusters_committed_peak = 0;
294
295void
296bs_more_space(
297 int nclusters)
298{
299 BSL_LOCK();
300 /*
301 * Account for new paging space.
302 */
303 clusters_available += nclusters;
304
305 if (clusters_available >= clusters_committed) {
306 if (verbose && user_warned) {
307 printf("%s%s - %d excess clusters now.\n",
308 my_name,
309 "paging space is OK now",
310 clusters_available - clusters_committed);
311 user_warned = FALSE;
312 clusters_committed_peak = 0;
313 }
314 } else {
315 if (verbose && user_warned) {
316 printf("%s%s - still short of %d clusters.\n",
317 my_name,
318 "WARNING: paging space over-committed",
319 clusters_committed - clusters_available);
320 clusters_committed_peak -= nclusters;
321 }
322 }
323 BSL_UNLOCK();
324
325 return;
326}
327
328void
329bs_commit(
330 int nclusters)
331{
332 BSL_LOCK();
333 clusters_committed += nclusters;
334 if (clusters_committed > clusters_available) {
335 if (verbose && !user_warned) {
336 user_warned = TRUE;
337 printf("%s%s - short of %d clusters.\n",
338 my_name,
339 "WARNING: paging space over-committed",
340 clusters_committed - clusters_available);
341 }
342 if (clusters_committed > clusters_committed_peak) {
343 clusters_committed_peak = clusters_committed;
344 }
345 } else {
346 if (verbose && user_warned) {
347 printf("%s%s - was short of up to %d clusters.\n",
348 my_name,
349 "paging space is OK now",
350 clusters_committed_peak - clusters_available);
351 user_warned = FALSE;
352 clusters_committed_peak = 0;
353 }
354 }
355 BSL_UNLOCK();
356
357 return;
358}
359
360int default_pager_info_verbose = 1;
361
362void
363bs_global_info(
b0d623f7
A
364 uint64_t *totalp,
365 uint64_t *freep)
1c79356b 366{
b0d623f7 367 uint64_t pages_total, pages_free;
1c79356b
A
368 paging_segment_t ps;
369 int i;
1c79356b
A
370
371 PSL_LOCK();
372 pages_total = pages_free = 0;
373 for (i = 0; i <= paging_segment_max; i++) {
374 ps = paging_segments[i];
375 if (ps == PAGING_SEGMENT_NULL)
376 continue;
377
378 /*
379 * no need to lock: by the time this data
380 * gets back to any remote requestor it
381 * will be obsolete anyways
382 */
383 pages_total += ps->ps_pgnum;
384 pages_free += ps->ps_clcount << ps->ps_clshift;
91447636
A
385 DP_DEBUG(DEBUG_BS_INTERNAL,
386 ("segment #%d: %d total, %d free\n",
387 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
1c79356b
A
388 }
389 *totalp = pages_total;
390 *freep = pages_free;
391 if (verbose && user_warned && default_pager_info_verbose) {
392 if (clusters_available < clusters_committed) {
393 printf("%s %d clusters committed, %d available.\n",
394 my_name,
395 clusters_committed,
396 clusters_available);
397 }
398 }
399 PSL_UNLOCK();
400}
401
402backing_store_t backing_store_alloc(void); /* forward */
403
404backing_store_t
405backing_store_alloc(void)
406{
407 backing_store_t bs;
1c79356b
A
408
409 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
410 if (bs == BACKING_STORE_NULL)
411 panic("backing_store_alloc: no memory");
412
413 BS_LOCK_INIT(bs);
414 bs->bs_port = MACH_PORT_NULL;
415 bs->bs_priority = 0;
416 bs->bs_clsize = 0;
417 bs->bs_pages_total = 0;
418 bs->bs_pages_in = 0;
419 bs->bs_pages_in_fail = 0;
420 bs->bs_pages_out = 0;
421 bs->bs_pages_out_fail = 0;
422
423 return bs;
424}
425
426backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
427
428/* Even in both the component space and external versions of this pager, */
429/* backing_store_lookup will be called from tasks in the application space */
430backing_store_t
431backing_store_lookup(
432 MACH_PORT_FACE port)
433{
434 backing_store_t bs;
435
436/*
437 port is currently backed with a vs structure in the alias field
438 we could create an ISBS alias and a port_is_bs call but frankly
439 I see no reason for the test, the bs->port == port check below
440 will work properly on junk entries.
441
442 if ((port == MACH_PORT_NULL) || port_is_vs(port))
443*/
444 if ((port == MACH_PORT_NULL))
445 return BACKING_STORE_NULL;
446
447 BSL_LOCK();
448 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
449 bs_links) {
450 BS_LOCK(bs);
451 if (bs->bs_port == port) {
452 BSL_UNLOCK();
453 /* Success, return it locked. */
454 return bs;
455 }
456 BS_UNLOCK(bs);
457 }
458 BSL_UNLOCK();
459 return BACKING_STORE_NULL;
460}
461
462void backing_store_add(backing_store_t); /* forward */
463
464void
465backing_store_add(
91447636 466 __unused backing_store_t bs)
1c79356b 467{
91447636
A
468// MACH_PORT_FACE port = bs->bs_port;
469// MACH_PORT_FACE pset = default_pager_default_set;
1c79356b 470 kern_return_t kr = KERN_SUCCESS;
1c79356b
A
471
472 if (kr != KERN_SUCCESS)
473 panic("backing_store_add: add to set");
474
475}
476
477/*
478 * Set up default page shift, but only if not already
479 * set and argument is within range.
480 */
481boolean_t
482bs_set_default_clsize(unsigned int npages)
483{
484 switch(npages){
485 case 1:
486 case 2:
487 case 4:
488 case 8:
489 if (default_pager_clsize == 0) /* if not yet set */
490 vstruct_def_clshift = local_log2(npages);
491 return(TRUE);
492 }
493 return(FALSE);
494}
495
496int bs_get_global_clsize(int clsize); /* forward */
497
498int
499bs_get_global_clsize(
500 int clsize)
501{
502 int i;
0b4e3aa0 503 memory_object_default_t dmm;
1c79356b 504 kern_return_t kr;
1c79356b
A
505
506 /*
507 * Only allow setting of cluster size once. If called
508 * with no cluster size (default), we use the compiled-in default
509 * for the duration. The same cluster size is used for all
510 * paging segments.
511 */
512 if (default_pager_clsize == 0) {
1c79356b
A
513 /*
514 * Keep cluster size in bit shift because it's quicker
515 * arithmetic, and easier to keep at a power of 2.
516 */
517 if (clsize != NO_CLSIZE) {
518 for (i = 0; (1 << i) < clsize; i++);
519 if (i > MAX_CLUSTER_SHIFT)
520 i = MAX_CLUSTER_SHIFT;
521 vstruct_def_clshift = i;
522 }
523 default_pager_clsize = (1 << vstruct_def_clshift);
524
525 /*
526 * Let the user know the new (and definitive) cluster size.
527 */
528 if (verbose)
529 printf("%scluster size = %d page%s\n",
530 my_name, default_pager_clsize,
531 (default_pager_clsize == 1) ? "" : "s");
0b4e3aa0 532
1c79356b
A
533 /*
534 * Let the kernel know too, in case it hasn't used the
535 * default value provided in main() yet.
536 */
0b4e3aa0 537 dmm = default_pager_object;
1c79356b
A
538 clsize = default_pager_clsize * vm_page_size; /* in bytes */
539 kr = host_default_memory_manager(host_priv_self(),
0b4e3aa0 540 &dmm,
1c79356b 541 clsize);
0b4e3aa0
A
542 memory_object_default_deallocate(dmm);
543
1c79356b
A
544 if (kr != KERN_SUCCESS) {
545 panic("bs_get_global_cl_size:host_default_memory_manager");
546 }
0b4e3aa0 547 if (dmm != default_pager_object) {
1c79356b
A
548 panic("bs_get_global_cl_size:there is another default pager");
549 }
550 }
551 ASSERT(default_pager_clsize > 0 &&
552 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
553
554 return default_pager_clsize;
555}
556
557kern_return_t
558default_pager_backing_store_create(
0b4e3aa0
A
559 memory_object_default_t pager,
560 int priority,
561 int clsize, /* in bytes */
562 MACH_PORT_FACE *backing_store)
1c79356b
A
563{
564 backing_store_t bs;
565 MACH_PORT_FACE port;
91447636 566// kern_return_t kr;
1c79356b 567 struct vstruct_alias *alias_struct;
1c79356b 568
0b4e3aa0 569 if (pager != default_pager_object)
1c79356b
A
570 return KERN_INVALID_ARGUMENT;
571
572 bs = backing_store_alloc();
573 port = ipc_port_alloc_kernel();
574 ipc_port_make_send(port);
575 assert (port != IP_NULL);
576
91447636
A
577 DP_DEBUG(DEBUG_BS_EXTERNAL,
578 ("priority=%d clsize=%d bs_port=0x%x\n",
579 priority, clsize, (int) backing_store));
1c79356b
A
580
581 alias_struct = (struct vstruct_alias *)
582 kalloc(sizeof (struct vstruct_alias));
583 if(alias_struct != NULL) {
584 alias_struct->vs = (struct vstruct *)bs;
0c530ab8 585 alias_struct->name = &default_pager_ops;
b0d623f7 586 port->alias = (uintptr_t) alias_struct;
1c79356b
A
587 }
588 else {
589 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
91447636 590 kfree(bs, sizeof (struct backing_store));
1c79356b
A
591 return KERN_RESOURCE_SHORTAGE;
592 }
593
594 bs->bs_port = port;
595 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
596 priority = BS_MAXPRI;
597 else if (priority == BS_NOPRI)
598 priority = BS_MAXPRI;
599 else
600 priority = BS_MINPRI;
601 bs->bs_priority = priority;
602
55e303ae 603 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
1c79356b
A
604
605 BSL_LOCK();
606 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
607 bs_links);
608 BSL_UNLOCK();
609
610 backing_store_add(bs);
611
612 *backing_store = port;
613 return KERN_SUCCESS;
614}
615
616kern_return_t
617default_pager_backing_store_info(
618 MACH_PORT_FACE backing_store,
619 backing_store_flavor_t flavour,
620 backing_store_info_t info,
621 mach_msg_type_number_t *size)
622{
623 backing_store_t bs;
624 backing_store_basic_info_t basic;
625 int i;
626 paging_segment_t ps;
627
628 if (flavour != BACKING_STORE_BASIC_INFO ||
629 *size < BACKING_STORE_BASIC_INFO_COUNT)
630 return KERN_INVALID_ARGUMENT;
631
632 basic = (backing_store_basic_info_t)info;
633 *size = BACKING_STORE_BASIC_INFO_COUNT;
634
635 VSTATS_LOCK(&global_stats.gs_lock);
636 basic->pageout_calls = global_stats.gs_pageout_calls;
637 basic->pagein_calls = global_stats.gs_pagein_calls;
638 basic->pages_in = global_stats.gs_pages_in;
639 basic->pages_out = global_stats.gs_pages_out;
640 basic->pages_unavail = global_stats.gs_pages_unavail;
641 basic->pages_init = global_stats.gs_pages_init;
642 basic->pages_init_writes= global_stats.gs_pages_init_writes;
643 VSTATS_UNLOCK(&global_stats.gs_lock);
644
645 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
646 return KERN_INVALID_ARGUMENT;
647
648 basic->bs_pages_total = bs->bs_pages_total;
649 PSL_LOCK();
650 bs->bs_pages_free = 0;
651 for (i = 0; i <= paging_segment_max; i++) {
652 ps = paging_segments[i];
653 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
654 PS_LOCK(ps);
655 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
656 PS_UNLOCK(ps);
657 }
658 }
659 PSL_UNLOCK();
660 basic->bs_pages_free = bs->bs_pages_free;
661 basic->bs_pages_in = bs->bs_pages_in;
662 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
663 basic->bs_pages_out = bs->bs_pages_out;
664 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
665
666 basic->bs_priority = bs->bs_priority;
55e303ae 667 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
1c79356b
A
668
669 BS_UNLOCK(bs);
670
671 return KERN_SUCCESS;
672}
673
674int ps_delete(paging_segment_t); /* forward */
b0d623f7 675boolean_t current_thread_aborted(void);
1c79356b
A
676
677int
678ps_delete(
679 paging_segment_t ps)
680{
681 vstruct_t vs;
682 kern_return_t error = KERN_SUCCESS;
683 int vs_count;
684
685 VSL_LOCK(); /* get the lock on the list of vs's */
686
687 /* The lock relationship and sequence is farily complicated */
688 /* this code looks at a live list, locking and unlocking the list */
689 /* as it traverses it. It depends on the locking behavior of */
690 /* default_pager_no_senders. no_senders always locks the vstruct */
691 /* targeted for removal before locking the vstruct list. However */
692 /* it will remove that member of the list without locking its */
693 /* neighbors. We can be sure when we hold a lock on a vstruct */
694 /* it cannot be removed from the list but we must hold the list */
695 /* lock to be sure that its pointers to its neighbors are valid. */
696 /* Also, we can hold off destruction of a vstruct when the list */
697 /* lock and the vs locks are not being held by bumping the */
698 /* vs_async_pending count. */
699
0b4e3aa0
A
700
701 while(backing_store_release_trigger_disable != 0) {
9bccf70c 702 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
0b4e3aa0
A
703 }
704
1c79356b
A
705 /* we will choose instead to hold a send right */
706 vs_count = vstruct_list.vsl_count;
707 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
708 if(vs == (vstruct_t)&vstruct_list) {
709 VSL_UNLOCK();
710 return KERN_SUCCESS;
711 }
712 VS_LOCK(vs);
713 vs_async_wait(vs); /* wait for any pending async writes */
714 if ((vs_count != 0) && (vs != NULL))
715 vs->vs_async_pending += 1; /* hold parties calling */
716 /* vs_async_wait */
717 VS_UNLOCK(vs);
718 VSL_UNLOCK();
719 while((vs_count != 0) && (vs != NULL)) {
720 /* We take the count of AMO's before beginning the */
721 /* transfer of of the target segment. */
722 /* We are guaranteed that the target segment cannot get */
723 /* more users. We also know that queue entries are */
724 /* made at the back of the list. If some of the entries */
725 /* we would check disappear while we are traversing the */
726 /* list then we will either check new entries which */
727 /* do not have any backing store in the target segment */
728 /* or re-check old entries. This might not be optimal */
729 /* but it will always be correct. The alternative is to */
730 /* take a snapshot of the list. */
731 vstruct_t next_vs;
732
733 if(dp_pages_free < cluster_transfer_minimum)
734 error = KERN_FAILURE;
735 else {
736 vm_object_t transfer_object;
0c530ab8 737 unsigned int count;
1c79356b
A
738 upl_t upl;
739
91447636 740 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
0b4e3aa0
A
741 count = 0;
742 error = vm_object_upl_request(transfer_object,
743 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
744 &upl, NULL, &count,
2d21ac55
A
745 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_LITE | UPL_SET_INTERNAL);
746
1c79356b 747 if(error == KERN_SUCCESS) {
1c79356b
A
748 error = ps_vstruct_transfer_from_segment(
749 vs, ps, upl);
91447636 750 upl_commit(upl, NULL, 0);
0b4e3aa0 751 upl_deallocate(upl);
1c79356b 752 } else {
1c79356b
A
753 error = KERN_FAILURE;
754 }
9bccf70c 755 vm_object_deallocate(transfer_object);
1c79356b 756 }
b0d623f7 757 if(error || current_thread_aborted() || backing_store_stop_compaction) {
1c79356b
A
758 VS_LOCK(vs);
759 vs->vs_async_pending -= 1; /* release vs_async_wait */
0b4e3aa0
A
760 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
761 vs->vs_waiting_async = FALSE;
1c79356b 762 VS_UNLOCK(vs);
0b4e3aa0 763 thread_wakeup(&vs->vs_async_pending);
1c79356b
A
764 } else {
765 VS_UNLOCK(vs);
766 }
767 return KERN_FAILURE;
768 }
769
770 VSL_LOCK();
0b4e3aa0
A
771
772 while(backing_store_release_trigger_disable != 0) {
9bccf70c
A
773 VSL_SLEEP(&backing_store_release_trigger_disable,
774 THREAD_UNINT);
0b4e3aa0
A
775 }
776
1c79356b
A
777 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
778 if((next_vs != (vstruct_t)&vstruct_list) &&
779 (vs != next_vs) && (vs_count != 1)) {
780 VS_LOCK(next_vs);
781 vs_async_wait(next_vs); /* wait for any */
782 /* pending async writes */
783 next_vs->vs_async_pending += 1; /* hold parties */
784 /* calling vs_async_wait */
785 VS_UNLOCK(next_vs);
786 }
787 VSL_UNLOCK();
788 VS_LOCK(vs);
789 vs->vs_async_pending -= 1;
0b4e3aa0
A
790 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
791 vs->vs_waiting_async = FALSE;
1c79356b 792 VS_UNLOCK(vs);
0b4e3aa0 793 thread_wakeup(&vs->vs_async_pending);
1c79356b
A
794 } else {
795 VS_UNLOCK(vs);
796 }
797 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
798 vs = NULL;
799 else
800 vs = next_vs;
801 vs_count--;
802 }
803 return KERN_SUCCESS;
804}
805
806
807kern_return_t
b0d623f7 808default_pager_backing_store_delete_internal(
1c79356b
A
809 MACH_PORT_FACE backing_store)
810{
811 backing_store_t bs;
812 int i;
813 paging_segment_t ps;
814 int error;
815 int interim_pages_removed = 0;
b0d623f7 816 boolean_t dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );
1c79356b
A
817
818 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
819 return KERN_INVALID_ARGUMENT;
820
b0d623f7 821restart:
1c79356b
A
822 PSL_LOCK();
823 error = KERN_SUCCESS;
824 for (i = 0; i <= paging_segment_max; i++) {
825 ps = paging_segments[i];
826 if (ps != PAGING_SEGMENT_NULL &&
827 ps->ps_bs == bs &&
b0d623f7 828 ! IS_PS_GOING_AWAY(ps)) {
1c79356b 829 PS_LOCK(ps);
b0d623f7
A
830
831 if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
832 /*
833 * Someone is already busy reclamining this paging segment.
834 * If it's the emergency segment we are looking at then check
835 * that someone has not already recovered it and set the right
836 * state i.e. online but not activated.
837 */
838 PS_UNLOCK(ps);
839 continue;
840 }
841
1c79356b 842 /* disable access to this segment */
b0d623f7
A
843 ps->ps_state &= ~PS_CAN_USE;
844 ps->ps_state |= PS_GOING_AWAY;
1c79356b
A
845 PS_UNLOCK(ps);
846 /*
847 * The "ps" segment is "off-line" now,
848 * we can try and delete it...
849 */
850 if(dp_pages_free < (cluster_transfer_minimum
851 + ps->ps_pgcount)) {
852 error = KERN_FAILURE;
853 PSL_UNLOCK();
854 }
855 else {
856 /* remove all pages associated with the */
857 /* segment from the list of free pages */
858 /* when transfer is through, all target */
859 /* segment pages will appear to be free */
860
861 dp_pages_free -= ps->ps_pgcount;
862 interim_pages_removed += ps->ps_pgcount;
863 PSL_UNLOCK();
864 error = ps_delete(ps);
865 }
866 if (error != KERN_SUCCESS) {
867 /*
868 * We couldn't delete the segment,
869 * probably because there's not enough
870 * virtual memory left.
871 * Re-enable all the segments.
872 */
873 PSL_LOCK();
874 break;
875 }
876 goto restart;
877 }
878 }
879
880 if (error != KERN_SUCCESS) {
881 for (i = 0; i <= paging_segment_max; i++) {
882 ps = paging_segments[i];
883 if (ps != PAGING_SEGMENT_NULL &&
884 ps->ps_bs == bs &&
b0d623f7 885 IS_PS_GOING_AWAY(ps)) {
1c79356b 886 PS_LOCK(ps);
b0d623f7
A
887
888 if( !IS_PS_GOING_AWAY(ps)) {
889 PS_UNLOCK(ps);
890 continue;
891 }
892 /* Handle the special clusters that came in while we let go the lock*/
893 if( ps->ps_special_clusters) {
894 dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
895 ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
896 ps->ps_clcount += ps->ps_special_clusters;
897 if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
898 ps_select_array[ps->ps_bs->bs_priority] = 0;
899 }
900 ps->ps_special_clusters = 0;
901 }
1c79356b 902 /* re-enable access to this segment */
b0d623f7
A
903 ps->ps_state &= ~PS_GOING_AWAY;
904 ps->ps_state |= PS_CAN_USE;
1c79356b
A
905 PS_UNLOCK(ps);
906 }
907 }
908 dp_pages_free += interim_pages_removed;
909 PSL_UNLOCK();
910 BS_UNLOCK(bs);
911 return error;
912 }
913
914 for (i = 0; i <= paging_segment_max; i++) {
915 ps = paging_segments[i];
916 if (ps != PAGING_SEGMENT_NULL &&
917 ps->ps_bs == bs) {
b0d623f7
A
918 if(IS_PS_GOING_AWAY(ps)) {
919 if(IS_PS_EMERGENCY_SEGMENT(ps)) {
920 PS_LOCK(ps);
921 ps->ps_state &= ~PS_GOING_AWAY;
922 ps->ps_special_clusters = 0;
923 ps->ps_pgcount = ps->ps_pgnum;
924 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
d1ecb069 925 dp_pages_reserve += ps->ps_pgcount;
b0d623f7 926 PS_UNLOCK(ps);
b0d623f7
A
927 } else {
928 paging_segments[i] = PAGING_SEGMENT_NULL;
929 paging_segment_count--;
930 PS_LOCK(ps);
931 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
932 kfree(ps, sizeof *ps);
933 }
1c79356b
A
934 }
935 }
936 }
937
938 /* Scan the entire ps array separately to make certain we find the */
939 /* proper paging_segment_max */
940 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
941 if(paging_segments[i] != PAGING_SEGMENT_NULL)
942 paging_segment_max = i;
943 }
944
945 PSL_UNLOCK();
946
b0d623f7
A
947 if( dealing_with_emergency_segment ) {
948 BS_UNLOCK(bs);
949 return KERN_SUCCESS;
950 }
951
1c79356b
A
952 /*
953 * All the segments have been deleted.
954 * We can remove the backing store.
955 */
956
957 /*
958 * Disable lookups of this backing store.
959 */
960 if((void *)bs->bs_port->alias != NULL)
91447636
A
961 kfree((void *) bs->bs_port->alias,
962 sizeof (struct vstruct_alias));
1c79356b
A
963 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
964 bs->bs_port = MACH_PORT_NULL;
965 BS_UNLOCK(bs);
966
967 /*
968 * Remove backing store from backing_store list.
969 */
970 BSL_LOCK();
971 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
972 bs_links);
973 BSL_UNLOCK();
974
975 /*
976 * Free the backing store structure.
977 */
91447636 978 kfree(bs, sizeof *bs);
1c79356b
A
979
980 return KERN_SUCCESS;
981}
982
b0d623f7
A
983kern_return_t
984default_pager_backing_store_delete(
985 MACH_PORT_FACE backing_store)
986{
987 if( backing_store != emergency_segment_backing_store ) {
988 default_pager_backing_store_delete_internal(emergency_segment_backing_store);
989 }
990 return(default_pager_backing_store_delete_internal(backing_store));
991}
992
1c79356b
A
993int ps_enter(paging_segment_t); /* forward */
994
995int
996ps_enter(
997 paging_segment_t ps)
998{
999 int i;
1000
1001 PSL_LOCK();
1002
1003 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
1004 if (paging_segments[i] == PAGING_SEGMENT_NULL)
1005 break;
1006 }
1007
1008 if (i < MAX_NUM_PAGING_SEGMENTS) {
1009 paging_segments[i] = ps;
1010 if (i > paging_segment_max)
1011 paging_segment_max = i;
1012 paging_segment_count++;
1013 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
1014 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
1015 ps_select_array[ps->ps_bs->bs_priority] = 0;
1016 i = 0;
1017 } else {
1018 PSL_UNLOCK();
1019 return KERN_RESOURCE_SHORTAGE;
1020 }
1021
1022 PSL_UNLOCK();
1023 return i;
1024}
1025
1026#ifdef DEVICE_PAGING
1027kern_return_t
1028default_pager_add_segment(
1029 MACH_PORT_FACE backing_store,
1030 MACH_PORT_FACE device,
1031 recnum_t offset,
1032 recnum_t count,
1033 int record_size)
1034{
1035 backing_store_t bs;
1036 paging_segment_t ps;
1037 int i;
1038 int error;
1c79356b
A
1039
1040 if ((bs = backing_store_lookup(backing_store))
1041 == BACKING_STORE_NULL)
1042 return KERN_INVALID_ARGUMENT;
1043
1044 PSL_LOCK();
1045 for (i = 0; i <= paging_segment_max; i++) {
1046 ps = paging_segments[i];
1047 if (ps == PAGING_SEGMENT_NULL)
1048 continue;
1049
1050 /*
1051 * Check for overlap on same device.
1052 */
1053 if (!(ps->ps_device != device
1054 || offset >= ps->ps_offset + ps->ps_recnum
1055 || offset + count <= ps->ps_offset)) {
1056 PSL_UNLOCK();
1057 BS_UNLOCK(bs);
1058 return KERN_INVALID_ARGUMENT;
1059 }
1060 }
1061 PSL_UNLOCK();
1062
1063 /*
1064 * Set up the paging segment
1065 */
1066 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1067 if (ps == PAGING_SEGMENT_NULL) {
1068 BS_UNLOCK(bs);
1069 return KERN_RESOURCE_SHORTAGE;
1070 }
1071
1072 ps->ps_segtype = PS_PARTITION;
1073 ps->ps_device = device;
1074 ps->ps_offset = offset;
1075 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1076 ps->ps_recnum = count;
1077 ps->ps_pgnum = count >> ps->ps_record_shift;
1078
1079 ps->ps_pgcount = ps->ps_pgnum;
1080 ps->ps_clshift = local_log2(bs->bs_clsize);
1081 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1082 ps->ps_hint = 0;
1083
1084 PS_LOCK_INIT(ps);
1085 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1086 if (!ps->ps_bmap) {
91447636 1087 kfree(ps, sizeof *ps);
1c79356b
A
1088 BS_UNLOCK(bs);
1089 return KERN_RESOURCE_SHORTAGE;
1090 }
1091 for (i = 0; i < ps->ps_ncls; i++) {
1092 clrbit(ps->ps_bmap, i);
1093 }
1094
b0d623f7
A
1095 if(paging_segment_count == 0) {
1096 ps->ps_state = PS_EMERGENCY_SEGMENT;
1097 if(use_emergency_swap_file_first) {
1098 ps->ps_state |= PS_CAN_USE;
1099 }
1100 } else {
1101 ps->ps_state = PS_CAN_USE;
1102 }
1103
1c79356b
A
1104 ps->ps_bs = bs;
1105
1106 if ((error = ps_enter(ps)) != 0) {
91447636
A
1107 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1108 kfree(ps, sizeof *ps);
1c79356b
A
1109 BS_UNLOCK(bs);
1110 return KERN_RESOURCE_SHORTAGE;
1111 }
1112
1113 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1114 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1115 BS_UNLOCK(bs);
1116
1117 PSL_LOCK();
b0d623f7
A
1118 if(IS_PS_OK_TO_USE(ps)) {
1119 dp_pages_free += ps->ps_pgcount;
1120 } else {
1121 dp_pages_reserve += ps->ps_pgcount;
1122 }
1c79356b
A
1123 PSL_UNLOCK();
1124
1125 bs_more_space(ps->ps_clcount);
1126
91447636
A
1127 DP_DEBUG(DEBUG_BS_INTERNAL,
1128 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1129 device, offset, count, record_size,
1130 ps->ps_record_shift, ps->ps_pgnum));
1c79356b
A
1131
1132 return KERN_SUCCESS;
1133}
1134
1135boolean_t
1136bs_add_device(
1137 char *dev_name,
1138 MACH_PORT_FACE master)
1139{
1140 security_token_t null_security_token = {
1141 { 0, 0 }
1142 };
1143 MACH_PORT_FACE device;
1144 int info[DEV_GET_SIZE_COUNT];
1145 mach_msg_type_number_t info_count;
1146 MACH_PORT_FACE bs = MACH_PORT_NULL;
1147 unsigned int rec_size;
1148 recnum_t count;
1149 int clsize;
1150 MACH_PORT_FACE reply_port;
1151
1152 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1153 null_security_token, dev_name, &device))
1154 return FALSE;
1155
1156 info_count = DEV_GET_SIZE_COUNT;
1157 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1158 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1159 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1160 clsize = bs_get_global_clsize(0);
1161 if (!default_pager_backing_store_create(
0b4e3aa0 1162 default_pager_object,
1c79356b
A
1163 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1164 (clsize * vm_page_size),
1165 &bs)) {
1166 if (!default_pager_add_segment(bs, device,
1167 0, count, rec_size)) {
1168 return TRUE;
1169 }
1170 ipc_port_release_receive(bs);
1171 }
1172 }
1173
1174 ipc_port_release_send(device);
1175 return FALSE;
1176}
1177#endif /* DEVICE_PAGING */
1178
1179#if VS_ASYNC_REUSE
1180
1181struct vs_async *
1182vs_alloc_async(void)
1183{
1184 struct vs_async *vsa;
1185 MACH_PORT_FACE reply_port;
91447636 1186// kern_return_t kr;
1c79356b
A
1187
1188 VS_ASYNC_LOCK();
1189 if (vs_async_free_list == NULL) {
1190 VS_ASYNC_UNLOCK();
1191 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1192 if (vsa != NULL) {
1193 /*
1194 * Try allocating a reply port named after the
1195 * address of the vs_async structure.
1196 */
1197 struct vstruct_alias *alias_struct;
1198
1199 reply_port = ipc_port_alloc_kernel();
1200 alias_struct = (struct vstruct_alias *)
1201 kalloc(sizeof (struct vstruct_alias));
1202 if(alias_struct != NULL) {
1203 alias_struct->vs = (struct vstruct *)vsa;
0c530ab8 1204 alias_struct->name = &default_pager_ops;
b0d623f7 1205 reply_port->alias = (uintptr_t) alias_struct;
1c79356b
A
1206 vsa->reply_port = reply_port;
1207 vs_alloc_async_count++;
1208 }
1209 else {
1210 vs_alloc_async_failed++;
1211 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1212 (reply_port));
91447636 1213 kfree(vsa, sizeof (struct vs_async));
1c79356b
A
1214 vsa = NULL;
1215 }
1216 }
1217 } else {
1218 vsa = vs_async_free_list;
1219 vs_async_free_list = vs_async_free_list->vsa_next;
1220 VS_ASYNC_UNLOCK();
1221 }
1222
1223 return vsa;
1224}
1225
1226void
1227vs_free_async(
1228 struct vs_async *vsa)
1229{
1230 VS_ASYNC_LOCK();
1231 vsa->vsa_next = vs_async_free_list;
1232 vs_async_free_list = vsa;
1233 VS_ASYNC_UNLOCK();
1234}
1235
1236#else /* VS_ASYNC_REUSE */
1237
1238struct vs_async *
1239vs_alloc_async(void)
1240{
1241 struct vs_async *vsa;
1242 MACH_PORT_FACE reply_port;
1243 kern_return_t kr;
1244
1245 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1246 if (vsa != NULL) {
1247 /*
1248 * Try allocating a reply port named after the
1249 * address of the vs_async structure.
1250 */
1251 reply_port = ipc_port_alloc_kernel();
1252 alias_struct = (vstruct_alias *)
1253 kalloc(sizeof (struct vstruct_alias));
1254 if(alias_struct != NULL) {
1255 alias_struct->vs = reply_port;
0c530ab8 1256 alias_struct->name = &default_pager_ops;
1c79356b
A
1257 reply_port->alias = (int) vsa;
1258 vsa->reply_port = reply_port;
1259 vs_alloc_async_count++;
1260 }
1261 else {
1262 vs_alloc_async_failed++;
1263 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1264 (reply_port));
91447636 1265 kfree(vsa, sizeof (struct vs_async));
1c79356b
A
1266 vsa = NULL;
1267 }
1268 }
1269
1270 return vsa;
1271}
1272
1273void
1274vs_free_async(
1275 struct vs_async *vsa)
1276{
1c79356b
A
1277 MACH_PORT_FACE reply_port;
1278 kern_return_t kr;
1279
1280 reply_port = vsa->reply_port;
91447636
A
1281 kfree(reply_port->alias, sizeof (struct vstuct_alias));
1282 kfree(vsa, sizeof (struct vs_async));
1c79356b
A
1283 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1284#if 0
1285 VS_ASYNC_LOCK();
1286 vs_alloc_async_count--;
1287 VS_ASYNC_UNLOCK();
1288#endif
1289}
1290
1291#endif /* VS_ASYNC_REUSE */
1292
0b4e3aa0
A
1293zone_t vstruct_zone;
1294
1c79356b
A
1295vstruct_t
1296ps_vstruct_create(
b0d623f7 1297 dp_size_t size)
1c79356b
A
1298{
1299 vstruct_t vs;
91447636 1300 unsigned int i;
1c79356b 1301
0b4e3aa0 1302 vs = (vstruct_t) zalloc(vstruct_zone);
1c79356b
A
1303 if (vs == VSTRUCT_NULL) {
1304 return VSTRUCT_NULL;
1305 }
1306
1307 VS_LOCK_INIT(vs);
1308
1309 /*
1310 * The following fields will be provided later.
1311 */
0c530ab8 1312 vs->vs_pager_ops = NULL;
0b4e3aa0
A
1313 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1314 vs->vs_references = 1;
1c79356b 1315 vs->vs_seqno = 0;
1c79356b 1316
1c79356b
A
1317 vs->vs_waiting_seqno = FALSE;
1318 vs->vs_waiting_read = FALSE;
1319 vs->vs_waiting_write = FALSE;
1c79356b 1320 vs->vs_waiting_async = FALSE;
1c79356b
A
1321
1322 vs->vs_readers = 0;
1323 vs->vs_writers = 0;
1324
1325 vs->vs_errors = 0;
1326
1327 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
55e303ae 1328 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1c79356b
A
1329 vs->vs_async_pending = 0;
1330
1331 /*
1332 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1333 * depending on the size of the memory object.
1334 */
1335 if (INDIRECT_CLMAP(vs->vs_size)) {
1336 vs->vs_imap = (struct vs_map **)
1337 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1338 vs->vs_indirect = TRUE;
1339 } else {
1340 vs->vs_dmap = (struct vs_map *)
1341 kalloc(CLMAP_SIZE(vs->vs_size));
1342 vs->vs_indirect = FALSE;
1343 }
1344 vs->vs_xfer_pending = FALSE;
91447636
A
1345 DP_DEBUG(DEBUG_VS_INTERNAL,
1346 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1c79356b
A
1347
1348 /*
1349 * Check to see that we got the space.
1350 */
1351 if (!vs->vs_dmap) {
91447636 1352 kfree(vs, sizeof *vs);
1c79356b
A
1353 return VSTRUCT_NULL;
1354 }
1355
1356 /*
1357 * Zero the indirect pointers, or clear the direct pointers.
1358 */
1359 if (vs->vs_indirect)
1360 memset(vs->vs_imap, 0,
1361 INDIRECT_CLMAP_SIZE(vs->vs_size));
1362 else
1363 for (i = 0; i < vs->vs_size; i++)
1364 VSM_CLR(vs->vs_dmap[i]);
1365
1366 VS_MAP_LOCK_INIT(vs);
1367
1368 bs_commit(vs->vs_size);
1369
1370 return vs;
1371}
1372
91447636 1373paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1c79356b
A
1374
1375paging_segment_t
1376ps_select_segment(
91447636
A
1377 unsigned int shift,
1378 int *psindex)
1c79356b
A
1379{
1380 paging_segment_t ps;
1381 int i;
1382 int j;
1c79356b
A
1383
1384 /*
1385 * Optimize case where there's only one segment.
1386 * paging_segment_max will index the one and only segment.
1387 */
1388
1389 PSL_LOCK();
1390 if (paging_segment_count == 1) {
b0d623f7 1391 paging_segment_t lps = PAGING_SEGMENT_NULL; /* used to avoid extra PS_UNLOCK */
0b4e3aa0 1392 ipc_port_t trigger = IP_NULL;
1c79356b
A
1393
1394 ps = paging_segments[paging_segment_max];
1395 *psindex = paging_segment_max;
1396 PS_LOCK(ps);
b0d623f7
A
1397 if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
1398 panic("Emergency paging segment missing\n");
1399 }
1400 ASSERT(ps->ps_clshift >= shift);
1401 if(IS_PS_OK_TO_USE(ps)) {
1c79356b
A
1402 if (ps->ps_clcount) {
1403 ps->ps_clcount--;
1404 dp_pages_free -= 1 << ps->ps_clshift;
b0d623f7 1405 ps->ps_pgcount -= 1 << ps->ps_clshift;
1c79356b
A
1406 if(min_pages_trigger_port &&
1407 (dp_pages_free < minimum_pages_remaining)) {
0b4e3aa0 1408 trigger = min_pages_trigger_port;
1c79356b
A
1409 min_pages_trigger_port = NULL;
1410 bs_low = TRUE;
1411 }
1412 lps = ps;
b0d623f7
A
1413 }
1414 }
1c79356b 1415 PS_UNLOCK(ps);
b0d623f7
A
1416
1417 if( lps == PAGING_SEGMENT_NULL ) {
1418 if(dp_pages_free) {
1419 dp_pages_free_drift_count++;
1420 if(dp_pages_free > dp_pages_free_drifted_max) {
1421 dp_pages_free_drifted_max = dp_pages_free;
1422 }
1423 dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1424 }
1425 dp_pages_free = 0;
1426 }
1427
1c79356b 1428 PSL_UNLOCK();
0b4e3aa0
A
1429
1430 if (trigger != IP_NULL) {
1431 default_pager_space_alert(trigger, HI_WAT_ALERT);
1432 ipc_port_release_send(trigger);
1433 }
1c79356b
A
1434 return lps;
1435 }
1436
1437 if (paging_segment_count == 0) {
b0d623f7
A
1438 if(dp_pages_free) {
1439 dp_pages_free_drift_count++;
1440 if(dp_pages_free > dp_pages_free_drifted_max) {
1441 dp_pages_free_drifted_max = dp_pages_free;
1442 }
1443 dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1444 }
1445 dp_pages_free = 0;
1c79356b
A
1446 PSL_UNLOCK();
1447 return PAGING_SEGMENT_NULL;
1448 }
1449
1450 for (i = BS_MAXPRI;
1451 i >= BS_MINPRI; i--) {
1452 int start_index;
1453
1454 if ((ps_select_array[i] == BS_NOPRI) ||
1455 (ps_select_array[i] == BS_FULLPRI))
1456 continue;
1457 start_index = ps_select_array[i];
1458
1459 if(!(paging_segments[start_index])) {
1460 j = start_index+1;
1461 physical_transfer_cluster_count = 0;
1462 }
0b4e3aa0 1463 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1c79356b 1464 (((paging_segments[start_index])->ps_clshift)
0b4e3aa0 1465 + vm_page_shift))) {
1c79356b
A
1466 physical_transfer_cluster_count = 0;
1467 j = start_index + 1;
1468 } else {
1469 physical_transfer_cluster_count+=1;
1470 j = start_index;
1471 if(start_index == 0)
1472 start_index = paging_segment_max;
1473 else
1474 start_index = start_index - 1;
1475 }
1476
1477 while (1) {
1478 if (j > paging_segment_max)
1479 j = 0;
1480 if ((ps = paging_segments[j]) &&
1481 (ps->ps_bs->bs_priority == i)) {
1482 /*
1483 * Force the ps cluster size to be
1484 * >= that of the vstruct.
1485 */
1486 PS_LOCK(ps);
b0d623f7
A
1487 if (IS_PS_OK_TO_USE(ps)) {
1488 if ((ps->ps_clcount) &&
1489 (ps->ps_clshift >= shift)) {
1490 ipc_port_t trigger = IP_NULL;
1491
1492 ps->ps_clcount--;
1493 dp_pages_free -= 1 << ps->ps_clshift;
1494 ps->ps_pgcount -= 1 << ps->ps_clshift;
1495 if(min_pages_trigger_port &&
1496 (dp_pages_free <
1497 minimum_pages_remaining)) {
1498 trigger = min_pages_trigger_port;
1499 min_pages_trigger_port = NULL;
1500 }
1501 PS_UNLOCK(ps);
1502 /*
1503 * found one, quit looking.
1504 */
1505 ps_select_array[i] = j;
1506 PSL_UNLOCK();
1507
1508 if (trigger != IP_NULL) {
1509 default_pager_space_alert(
1510 trigger,
1511 HI_WAT_ALERT);
1512 ipc_port_release_send(trigger);
1513 }
1514 *psindex = j;
1515 return ps;
0b4e3aa0 1516 }
1c79356b
A
1517 }
1518 PS_UNLOCK(ps);
1519 }
1520 if (j == start_index) {
1521 /*
1522 * none at this priority -- mark it full
1523 */
1524 ps_select_array[i] = BS_FULLPRI;
1525 break;
1526 }
1527 j++;
1528 }
1529 }
b0d623f7
A
1530
1531 if(dp_pages_free) {
1532 dp_pages_free_drift_count++;
1533 if(dp_pages_free > dp_pages_free_drifted_max) {
1534 dp_pages_free_drifted_max = dp_pages_free;
1535 }
1536 dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
1537 }
1538 dp_pages_free = 0;
1c79356b
A
1539 PSL_UNLOCK();
1540 return PAGING_SEGMENT_NULL;
1541}
1542
b0d623f7 1543dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1c79356b 1544
b0d623f7 1545dp_offset_t
1c79356b
A
1546ps_allocate_cluster(
1547 vstruct_t vs,
1548 int *psindex,
1549 paging_segment_t use_ps)
1550{
91447636 1551 unsigned int byte_num;
1c79356b
A
1552 int bit_num = 0;
1553 paging_segment_t ps;
b0d623f7 1554 dp_offset_t cluster;
0b4e3aa0 1555 ipc_port_t trigger = IP_NULL;
1c79356b
A
1556
1557 /*
1558 * Find best paging segment.
1559 * ps_select_segment will decrement cluster count on ps.
1560 * Must pass cluster shift to find the most appropriate segment.
1561 */
1562 /* NOTE: The addition of paging segment delete capability threatened
1563 * to seriously complicate the treatment of paging segments in this
1564 * module and the ones that call it (notably ps_clmap), because of the
1565 * difficulty in assuring that the paging segment would continue to
1566 * exist between being unlocked and locked. This was
1567 * avoided because all calls to this module are based in either
1568 * dp_memory_object calls which rely on the vs lock, or by
1569 * the transfer function which is part of the segment delete path.
1570 * The transfer function which is part of paging segment delete is
1571 * protected from multiple callers by the backing store lock.
1572 * The paging segment delete function treats mappings to a paging
1573 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1574 * while data is transferred to the remaining segments. This is in
1575 * line with the view that incomplete or in-transition mappings between
1576 * data, a vstruct, and backing store are protected by the vs lock.
1577 * This and the ordering of the paging segment "going_away" bit setting
1578 * protects us.
1579 */
b0d623f7 1580retry:
1c79356b
A
1581 if (use_ps != PAGING_SEGMENT_NULL) {
1582 ps = use_ps;
1583 PSL_LOCK();
1584 PS_LOCK(ps);
55e303ae
A
1585
1586 ASSERT(ps->ps_clcount != 0);
1587
1c79356b
A
1588 ps->ps_clcount--;
1589 dp_pages_free -= 1 << ps->ps_clshift;
b0d623f7 1590 ps->ps_pgcount -= 1 << ps->ps_clshift;
1c79356b
A
1591 if(min_pages_trigger_port &&
1592 (dp_pages_free < minimum_pages_remaining)) {
0b4e3aa0 1593 trigger = min_pages_trigger_port;
1c79356b
A
1594 min_pages_trigger_port = NULL;
1595 }
0b4e3aa0 1596 PSL_UNLOCK();
1c79356b 1597 PS_UNLOCK(ps);
0b4e3aa0
A
1598 if (trigger != IP_NULL) {
1599 default_pager_space_alert(trigger, HI_WAT_ALERT);
1600 ipc_port_release_send(trigger);
1601 }
1602
1c79356b
A
1603 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1604 PAGING_SEGMENT_NULL) {
b0d623f7
A
1605 static clock_sec_t lastnotify = 0;
1606 clock_sec_t now;
1607 clock_nsec_t nanoseconds_dummy;
1608
1609 /*
1610 * Don't immediately jump to the emergency segment. Give the
1611 * dynamic pager a chance to create it's first normal swap file.
1612 * Unless, of course the very first normal swap file can't be
1613 * created due to some problem and we didn't expect that problem
1614 * i.e. use_emergency_swap_file_first was never set to true initially.
1615 * It then gets set in the swap file creation error handling.
1616 */
1617 if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {
1618
1619 ps = paging_segments[EMERGENCY_PSEG_INDEX];
1620 if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
1621 PSL_LOCK();
1622 PS_LOCK(ps);
1623
1624 if(IS_PS_GOING_AWAY(ps)) {
1625 /* Someone de-activated the emergency paging segment*/
1626 PS_UNLOCK(ps);
1627 PSL_UNLOCK();
91447636 1628
b0d623f7
A
1629 } else if(dp_pages_free) {
1630 /*
1631 * Someone has already activated the emergency paging segment
1632 * OR
1633 * Between us having rec'd a NULL segment from ps_select_segment
1634 * and reaching here a new normal segment could have been added.
1635 * E.g. we get NULL segment and another thread just added the
1636 * new swap file. Hence check to see if we have more dp_pages_free
1637 * before activating the emergency segment.
1638 */
1639 PS_UNLOCK(ps);
1640 PSL_UNLOCK();
1641 goto retry;
1642
1643 } else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
1644 /*
1645 * PS_CAN_USE is only reset from the emergency segment when it's
1646 * been successfully recovered. So it's legal to have an emergency
1647 * segment that has PS_CAN_USE but no clusters because it's recovery
1648 * failed.
1649 */
1650 backing_store_t bs = ps->ps_bs;
1651 ps->ps_state |= PS_CAN_USE;
1652 if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
1653 ps_select_array[bs->bs_priority] == BS_NOPRI) {
1654 ps_select_array[bs->bs_priority] = 0;
1655 }
1656 dp_pages_free += ps->ps_pgcount;
1657 dp_pages_reserve -= ps->ps_pgcount;
1658 PS_UNLOCK(ps);
1659 PSL_UNLOCK();
1660 dprintf(("Switching ON Emergency paging segment\n"));
1661 goto retry;
1662 }
1663
1664 PS_UNLOCK(ps);
1665 PSL_UNLOCK();
1666 }
1667 }
1668
91447636
A
1669 /*
1670 * Emit a notification of the low-paging resource condition
1671 * but don't issue it more than once every five seconds. This
1672 * prevents us from overflowing logs with thousands of
1673 * repetitions of the message.
1674 */
1675 clock_get_system_nanotime(&now, &nanoseconds_dummy);
b0d623f7
A
1676 if (paging_segment_count > 1 && (now > lastnotify + 5)) {
1677 /* With an activated emergency paging segment we still
1678 * didn't get any clusters. This could mean that the
1679 * emergency paging segment is exhausted.
1680 */
1681 dprintf(("System is out of paging space.\n"));
91447636
A
1682 lastnotify = now;
1683 }
1684
0b4e3aa0 1685 PSL_LOCK();
b0d623f7 1686
1c79356b 1687 if(min_pages_trigger_port) {
0b4e3aa0 1688 trigger = min_pages_trigger_port;
1c79356b
A
1689 min_pages_trigger_port = NULL;
1690 bs_low = TRUE;
1691 }
0b4e3aa0
A
1692 PSL_UNLOCK();
1693 if (trigger != IP_NULL) {
1694 default_pager_space_alert(trigger, HI_WAT_ALERT);
1695 ipc_port_release_send(trigger);
1696 }
b0d623f7 1697 return (dp_offset_t) -1;
1c79356b 1698 }
1c79356b
A
1699
1700 /*
1701 * Look for an available cluster. At the end of the loop,
1702 * byte_num is the byte offset and bit_num is the bit offset of the
1703 * first zero bit in the paging segment bitmap.
1704 */
1705 PS_LOCK(ps);
1706 byte_num = ps->ps_hint;
1707 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1708 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1709 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1710 if (isclr((ps->ps_bmap + byte_num), bit_num))
1711 break;
1712 }
1713 ASSERT(bit_num != NBBY);
1714 break;
1715 }
1716 }
1717 ps->ps_hint = byte_num;
1718 cluster = (byte_num*NBBY) + bit_num;
1719
1720 /* Space was reserved, so this must be true */
1721 ASSERT(cluster < ps->ps_ncls);
1722
1723 setbit(ps->ps_bmap, cluster);
1724 PS_UNLOCK(ps);
1725
1726 return cluster;
1727}
1728
b0d623f7 1729void ps_deallocate_cluster(paging_segment_t, dp_offset_t); /* forward */
1c79356b
A
1730
1731void
1732ps_deallocate_cluster(
1733 paging_segment_t ps,
b0d623f7 1734 dp_offset_t cluster)
1c79356b
A
1735{
1736
b0d623f7 1737 if (cluster >= ps->ps_ncls)
1c79356b
A
1738 panic("ps_deallocate_cluster: Invalid cluster number");
1739
1740 /*
1741 * Lock the paging segment, clear the cluster's bitmap and increment the
1742 * number of free cluster.
1743 */
1744 PSL_LOCK();
1745 PS_LOCK(ps);
1746 clrbit(ps->ps_bmap, cluster);
b0d623f7
A
1747 if( IS_PS_OK_TO_USE(ps)) {
1748 ++ps->ps_clcount;
1749 ps->ps_pgcount += 1 << ps->ps_clshift;
1750 dp_pages_free += 1 << ps->ps_clshift;
1751 } else {
1752 ps->ps_special_clusters += 1;
1753 }
1c79356b
A
1754
1755 /*
1756 * Move the hint down to the freed cluster if it is
1757 * less than the current hint.
1758 */
1759 if ((cluster/NBBY) < ps->ps_hint) {
1760 ps->ps_hint = (cluster/NBBY);
1761 }
1762
1c79356b
A
1763
1764 /*
1765 * If we're freeing space on a full priority, reset the array.
1766 */
b0d623f7 1767 if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1c79356b 1768 ps_select_array[ps->ps_bs->bs_priority] = 0;
b0d623f7 1769 PS_UNLOCK(ps);
1c79356b
A
1770 PSL_UNLOCK();
1771
1772 return;
1773}
1774
b0d623f7 1775void ps_dealloc_vsmap(struct vs_map *, dp_size_t); /* forward */
1c79356b
A
1776
1777void
1778ps_dealloc_vsmap(
1779 struct vs_map *vsmap,
b0d623f7 1780 dp_size_t size)
1c79356b 1781{
91447636 1782 unsigned int i;
1c79356b
A
1783 for (i = 0; i < size; i++)
1784 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1785 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1786 VSM_CLOFF(vsmap[i]));
1787}
1788
1789void
1790ps_vstruct_dealloc(
1791 vstruct_t vs)
1792{
91447636
A
1793 unsigned int i;
1794// spl_t s;
1c79356b
A
1795
1796 VS_MAP_LOCK(vs);
1797
1798 /*
1799 * If this is an indirect structure, then we walk through the valid
1800 * (non-zero) indirect pointers and deallocate the clusters
1801 * associated with each used map entry (via ps_dealloc_vsmap).
1802 * When all of the clusters in an indirect block have been
1803 * freed, we deallocate the block. When all of the indirect
1804 * blocks have been deallocated we deallocate the memory
1805 * holding the indirect pointers.
1806 */
1807 if (vs->vs_indirect) {
1808 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1809 if (vs->vs_imap[i] != NULL) {
1810 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
91447636 1811 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1c79356b
A
1812 }
1813 }
91447636 1814 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1c79356b
A
1815 } else {
1816 /*
1817 * Direct map. Free used clusters, then memory.
1818 */
1819 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
91447636 1820 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1c79356b
A
1821 }
1822 VS_MAP_UNLOCK(vs);
1823
1824 bs_commit(- vs->vs_size);
1825
91447636 1826 zfree(vstruct_zone, vs);
1c79356b
A
1827}
1828
91447636 1829int ps_map_extend(vstruct_t, unsigned int); /* forward */
1c79356b
A
1830
1831int ps_map_extend(
1832 vstruct_t vs,
91447636 1833 unsigned int new_size)
1c79356b
A
1834{
1835 struct vs_map **new_imap;
1836 struct vs_map *new_dmap = NULL;
1837 int newdsize;
1838 int i;
1839 void *old_map = NULL;
1840 int old_map_size = 0;
1841
1842 if (vs->vs_size >= new_size) {
1843 /*
1844 * Someone has already done the work.
1845 */
1846 return 0;
1847 }
1848
1849 /*
1850 * If the new size extends into the indirect range, then we have one
1851 * of two cases: we are going from indirect to indirect, or we are
1852 * going from direct to indirect. If we are going from indirect to
1853 * indirect, then it is possible that the new size will fit in the old
1854 * indirect map. If this is the case, then just reset the size of the
1855 * vstruct map and we are done. If the new size will not
1856 * fit into the old indirect map, then we have to allocate a new
1857 * indirect map and copy the old map pointers into this new map.
1858 *
1859 * If we are going from direct to indirect, then we have to allocate a
1860 * new indirect map and copy the old direct pages into the first
1861 * indirect page of the new map.
1862 * NOTE: allocating memory here is dangerous, as we're in the
1863 * pageout path.
1864 */
1865 if (INDIRECT_CLMAP(new_size)) {
1866 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1867
1868 /*
1869 * Get a new indirect map and zero it.
1870 */
1871 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1872 if (vs->vs_indirect &&
1873 (new_map_size == old_map_size)) {
1874 bs_commit(new_size - vs->vs_size);
1875 vs->vs_size = new_size;
1876 return 0;
1877 }
1878
1879 new_imap = (struct vs_map **)kalloc(new_map_size);
1880 if (new_imap == NULL) {
1881 return -1;
1882 }
1883 memset(new_imap, 0, new_map_size);
1884
1885 if (vs->vs_indirect) {
1886 /* Copy old entries into new map */
1887 memcpy(new_imap, vs->vs_imap, old_map_size);
1888 /* Arrange to free the old map */
1889 old_map = (void *) vs->vs_imap;
1890 newdsize = 0;
1891 } else { /* Old map was a direct map */
1892 /* Allocate an indirect page */
1893 if ((new_imap[0] = (struct vs_map *)
1894 kalloc(CLMAP_THRESHOLD)) == NULL) {
91447636 1895 kfree(new_imap, new_map_size);
1c79356b
A
1896 return -1;
1897 }
1898 new_dmap = new_imap[0];
1899 newdsize = CLMAP_ENTRIES;
1900 }
1901 } else {
1902 new_imap = NULL;
1903 newdsize = new_size;
1904 /*
1905 * If the new map is a direct map, then the old map must
1906 * also have been a direct map. All we have to do is
1907 * to allocate a new direct map, copy the old entries
1908 * into it and free the old map.
1909 */
1910 if ((new_dmap = (struct vs_map *)
1911 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1912 return -1;
1913 }
1914 }
1915 if (newdsize) {
1916
1917 /* Free the old map */
1918 old_map = (void *) vs->vs_dmap;
1919 old_map_size = CLMAP_SIZE(vs->vs_size);
1920
1921 /* Copy info from the old map into the new map */
1922 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1923
1924 /* Initialize the rest of the new map */
1925 for (i = vs->vs_size; i < newdsize; i++)
1926 VSM_CLR(new_dmap[i]);
1927 }
1928 if (new_imap) {
1929 vs->vs_imap = new_imap;
1930 vs->vs_indirect = TRUE;
1931 } else
1932 vs->vs_dmap = new_dmap;
1933 bs_commit(new_size - vs->vs_size);
1934 vs->vs_size = new_size;
1935 if (old_map)
91447636 1936 kfree(old_map, old_map_size);
1c79356b
A
1937 return 0;
1938}
1939
b0d623f7 1940dp_offset_t
1c79356b
A
1941ps_clmap(
1942 vstruct_t vs,
b0d623f7 1943 dp_offset_t offset,
1c79356b
A
1944 struct clmap *clmap,
1945 int flag,
b0d623f7 1946 dp_size_t size,
1c79356b
A
1947 int error)
1948{
b0d623f7
A
1949 dp_offset_t cluster; /* The cluster of offset. */
1950 dp_offset_t newcl; /* The new cluster allocated. */
1951 dp_offset_t newoff;
91447636 1952 unsigned int i;
1c79356b 1953 struct vs_map *vsmap;
1c79356b
A
1954
1955 VS_MAP_LOCK(vs);
1956
1957 ASSERT(vs->vs_dmap);
55e303ae 1958 cluster = atop_32(offset) >> vs->vs_clshift;
1c79356b
A
1959
1960 /*
1961 * Initialize cluster error value
1962 */
1963 clmap->cl_error = 0;
1964
1965 /*
1966 * If the object has grown, extend the page map.
1967 */
1968 if (cluster >= vs->vs_size) {
1969 if (flag == CL_FIND) {
1970 /* Do not allocate if just doing a lookup */
1971 VS_MAP_UNLOCK(vs);
b0d623f7 1972 return (dp_offset_t) -1;
1c79356b
A
1973 }
1974 if (ps_map_extend(vs, cluster + 1)) {
1975 VS_MAP_UNLOCK(vs);
b0d623f7 1976 return (dp_offset_t) -1;
1c79356b
A
1977 }
1978 }
1979
1980 /*
1981 * Look for the desired cluster. If the map is indirect, then we
1982 * have a two level lookup. First find the indirect block, then
1983 * find the actual cluster. If the indirect block has not yet
1984 * been allocated, then do so. If the cluster has not yet been
1985 * allocated, then do so.
1986 *
1987 * If any of the allocations fail, then return an error.
1988 * Don't allocate if just doing a lookup.
1989 */
1990 if (vs->vs_indirect) {
1991 long ind_block = cluster/CLMAP_ENTRIES;
1992
1993 /* Is the indirect block allocated? */
1994 vsmap = vs->vs_imap[ind_block];
1995 if (vsmap == NULL) {
1996 if (flag == CL_FIND) {
1997 VS_MAP_UNLOCK(vs);
b0d623f7 1998 return (dp_offset_t) -1;
1c79356b
A
1999 }
2000
2001 /* Allocate the indirect block */
2002 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
2003 if (vsmap == NULL) {
2004 VS_MAP_UNLOCK(vs);
b0d623f7 2005 return (dp_offset_t) -1;
1c79356b
A
2006 }
2007 /* Initialize the cluster offsets */
2008 for (i = 0; i < CLMAP_ENTRIES; i++)
2009 VSM_CLR(vsmap[i]);
2010 vs->vs_imap[ind_block] = vsmap;
2011 }
2012 } else
2013 vsmap = vs->vs_dmap;
2014
2015 ASSERT(vsmap);
2016 vsmap += cluster%CLMAP_ENTRIES;
2017
2018 /*
2019 * At this point, vsmap points to the struct vs_map desired.
2020 *
2021 * Look in the map for the cluster, if there was an error on a
2022 * previous write, flag it and return. If it is not yet
2023 * allocated, then allocate it, if we're writing; if we're
2024 * doing a lookup and the cluster's not allocated, return error.
2025 */
2026 if (VSM_ISERR(*vsmap)) {
2027 clmap->cl_error = VSM_GETERR(*vsmap);
2028 VS_MAP_UNLOCK(vs);
b0d623f7 2029 return (dp_offset_t) -1;
1c79356b
A
2030 } else if (VSM_ISCLR(*vsmap)) {
2031 int psindex;
2032
2033 if (flag == CL_FIND) {
2034 /*
2035 * If there's an error and the entry is clear, then
2036 * we've run out of swap space. Record the error
2037 * here and return.
2038 */
2039 if (error) {
2040 VSM_SETERR(*vsmap, error);
2041 }
2042 VS_MAP_UNLOCK(vs);
b0d623f7 2043 return (dp_offset_t) -1;
1c79356b
A
2044 } else {
2045 /*
2046 * Attempt to allocate a cluster from the paging segment
2047 */
2048 newcl = ps_allocate_cluster(vs, &psindex,
2049 PAGING_SEGMENT_NULL);
b0d623f7 2050 if (newcl == (dp_offset_t) -1) {
1c79356b 2051 VS_MAP_UNLOCK(vs);
b0d623f7 2052 return (dp_offset_t) -1;
1c79356b
A
2053 }
2054 VSM_CLR(*vsmap);
2055 VSM_SETCLOFF(*vsmap, newcl);
2056 VSM_SETPS(*vsmap, psindex);
2057 }
2058 } else
2059 newcl = VSM_CLOFF(*vsmap);
2060
2061 /*
2062 * Fill in pertinent fields of the clmap
2063 */
2064 clmap->cl_ps = VSM_PS(*vsmap);
2065 clmap->cl_numpages = VSCLSIZE(vs);
2066 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
2067
2068 /*
2069 * Byte offset in paging segment is byte offset to cluster plus
2070 * byte offset within cluster. It looks ugly, but should be
2071 * relatively quick.
2072 */
2073 ASSERT(trunc_page(offset) == offset);
55e303ae 2074 newcl = ptoa_32(newcl) << vs->vs_clshift;
1c79356b
A
2075 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
2076 if (flag == CL_ALLOC) {
2077 /*
2078 * set bits in the allocation bitmap according to which
2079 * pages were requested. size is in bytes.
2080 */
55e303ae 2081 i = atop_32(newoff);
1c79356b
A
2082 while ((size > 0) && (i < VSCLSIZE(vs))) {
2083 VSM_SETALLOC(*vsmap, i);
2084 i++;
2085 size -= vm_page_size;
2086 }
2087 }
2088 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
2089 if (newoff) {
2090 /*
2091 * Offset is not cluster aligned, so number of pages
2092 * and bitmaps must be adjusted
2093 */
55e303ae 2094 clmap->cl_numpages -= atop_32(newoff);
1c79356b
A
2095 CLMAP_SHIFT(clmap, vs);
2096 CLMAP_SHIFTALLOC(clmap, vs);
2097 }
2098
2099 /*
2100 *
2101 * The setting of valid bits and handling of write errors
2102 * must be done here, while we hold the lock on the map.
2103 * It logically should be done in ps_vs_write_complete().
2104 * The size and error information has been passed from
2105 * ps_vs_write_complete(). If the size parameter is non-zero,
2106 * then there is work to be done. If error is also non-zero,
2107 * then the error number is recorded in the cluster and the
2108 * entire cluster is in error.
2109 */
2110 if (size && flag == CL_FIND) {
b0d623f7 2111 dp_offset_t off = (dp_offset_t) 0;
1c79356b
A
2112
2113 if (!error) {
2114 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
2115 i++) {
2116 VSM_SETPG(*vsmap, i);
2117 size -= vm_page_size;
2118 }
2119 ASSERT(i <= VSCLSIZE(vs));
2120 } else {
2121 BS_STAT(clmap->cl_ps->ps_bs,
2122 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
55e303ae 2123 atop_32(size));
1c79356b
A
2124 off = VSM_CLOFF(*vsmap);
2125 VSM_SETERR(*vsmap, error);
2126 }
2127 /*
2128 * Deallocate cluster if error, and no valid pages
2129 * already present.
2130 */
b0d623f7 2131 if (off != (dp_offset_t) 0)
1c79356b
A
2132 ps_deallocate_cluster(clmap->cl_ps, off);
2133 VS_MAP_UNLOCK(vs);
b0d623f7 2134 return (dp_offset_t) 0;
1c79356b
A
2135 } else
2136 VS_MAP_UNLOCK(vs);
2137
91447636
A
2138 DP_DEBUG(DEBUG_VS_INTERNAL,
2139 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
2140 newcl+newoff, (int) vs, (int) vsmap, flag));
2141 DP_DEBUG(DEBUG_VS_INTERNAL,
2142 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
2143 (int) clmap->cl_ps, clmap->cl_numpages,
2144 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1c79356b
A
2145
2146 return (newcl + newoff);
2147}
2148
b0d623f7 2149void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t); /* forward */
1c79356b
A
2150
2151void
2152ps_clunmap(
2153 vstruct_t vs,
b0d623f7
A
2154 dp_offset_t offset,
2155 dp_size_t length)
1c79356b 2156{
b0d623f7 2157 dp_offset_t cluster; /* The cluster number of offset */
1c79356b 2158 struct vs_map *vsmap;
1c79356b
A
2159
2160 VS_MAP_LOCK(vs);
2161
2162 /*
2163 * Loop through all clusters in this range, freeing paging segment
2164 * clusters and map entries as encountered.
2165 */
2166 while (length > 0) {
b0d623f7 2167 dp_offset_t newoff;
91447636 2168 unsigned int i;
1c79356b 2169
55e303ae 2170 cluster = atop_32(offset) >> vs->vs_clshift;
1c79356b
A
2171 if (vs->vs_indirect) /* indirect map */
2172 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2173 else
2174 vsmap = vs->vs_dmap;
2175 if (vsmap == NULL) {
2176 VS_MAP_UNLOCK(vs);
2177 return;
2178 }
2179 vsmap += cluster%CLMAP_ENTRIES;
2180 if (VSM_ISCLR(*vsmap)) {
2181 length -= vm_page_size;
2182 offset += vm_page_size;
2183 continue;
2184 }
2185 /*
2186 * We've got a valid mapping. Clear it and deallocate
2187 * paging segment cluster pages.
2188 * Optimize for entire cluster cleraing.
2189 */
91447636 2190 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
1c79356b
A
2191 /*
2192 * Not cluster aligned.
2193 */
2194 ASSERT(trunc_page(newoff) == newoff);
55e303ae 2195 i = atop_32(newoff);
1c79356b
A
2196 } else
2197 i = 0;
2198 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2199 VSM_CLRPG(*vsmap, i);
2200 VSM_CLRALLOC(*vsmap, i);
2201 length -= vm_page_size;
2202 offset += vm_page_size;
2203 i++;
2204 }
2205
2206 /*
2207 * If map entry is empty, clear and deallocate cluster.
2208 */
2209 if (!VSM_ALLOC(*vsmap)) {
2210 ps_deallocate_cluster(VSM_PS(*vsmap),
2211 VSM_CLOFF(*vsmap));
2212 VSM_CLR(*vsmap);
2213 }
2214 }
2215
2216 VS_MAP_UNLOCK(vs);
2217}
2218
b0d623f7 2219void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */
1c79356b
A
2220
2221void
2222ps_vs_write_complete(
2223 vstruct_t vs,
b0d623f7
A
2224 dp_offset_t offset,
2225 dp_size_t size,
1c79356b
A
2226 int error)
2227{
2228 struct clmap clmap;
2229
2230 /*
2231 * Get the struct vsmap for this cluster.
2232 * Use READ, even though it was written, because the
2233 * cluster MUST be present, unless there was an error
2234 * in the original ps_clmap (e.g. no space), in which
2235 * case, nothing happens.
2236 *
2237 * Must pass enough information to ps_clmap to allow it
2238 * to set the vs_map structure bitmap under lock.
2239 */
2240 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2241}
2242
b0d623f7 2243void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int); /* forward */
1c79356b
A
2244
2245void
2246vs_cl_write_complete(
b0d623f7 2247 vstruct_t vs,
91447636 2248 __unused paging_segment_t ps,
b0d623f7 2249 dp_offset_t offset,
91447636 2250 __unused vm_offset_t addr,
b0d623f7
A
2251 dp_size_t size,
2252 boolean_t async,
2253 int error)
1c79356b 2254{
91447636 2255// kern_return_t kr;
1c79356b
A
2256
2257 if (error) {
2258 /*
2259 * For internal objects, the error is recorded on a
2260 * per-cluster basis by ps_clmap() which is called
2261 * by ps_vs_write_complete() below.
2262 */
2263 dprintf(("write failed error = 0x%x\n", error));
2264 /* add upl_abort code here */
2265 } else
55e303ae 2266 GSTAT(global_stats.gs_pages_out += atop_32(size));
1c79356b
A
2267 /*
2268 * Notify the vstruct mapping code, so it can do its accounting.
2269 */
2270 ps_vs_write_complete(vs, offset, size, error);
2271
2272 if (async) {
2273 VS_LOCK(vs);
2274 ASSERT(vs->vs_async_pending > 0);
2275 vs->vs_async_pending -= size;
0b4e3aa0
A
2276 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2277 vs->vs_waiting_async = FALSE;
1c79356b 2278 VS_UNLOCK(vs);
0b4e3aa0 2279 thread_wakeup(&vs->vs_async_pending);
1c79356b
A
2280 } else {
2281 VS_UNLOCK(vs);
2282 }
2283 }
2284}
2285
2286#ifdef DEVICE_PAGING
2287kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2288
2289kern_return_t
2290device_write_reply(
2291 MACH_PORT_FACE reply_port,
2292 kern_return_t device_code,
2293 io_buf_len_t bytes_written)
2294{
2295 struct vs_async *vsa;
1c79356b
A
2296
2297 vsa = (struct vs_async *)
2298 ((struct vstruct_alias *)(reply_port->alias))->vs;
2299
2300 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2301 device_code = KERN_FAILURE;
2302 }
2303
2304 vsa->vsa_error = device_code;
2305
2306
2307 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2308 if(vsa->vsa_flags & VSA_TRANSFER) {
2309 /* revisit when async disk segments redone */
2310 if(vsa->vsa_error) {
2311 /* need to consider error condition. re-write data or */
2312 /* throw it away here. */
91447636 2313 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
1c79356b
A
2314 }
2315 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2316 vsa->vsa_size, vsa->vsa_error);
2317 } else {
2318 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2319 vsa->vsa_addr, vsa->vsa_size, TRUE,
2320 vsa->vsa_error);
2321 }
2322 VS_FREE_ASYNC(vsa);
2323
2324 return KERN_SUCCESS;
2325}
2326
2327kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2328kern_return_t
2329device_write_reply_inband(
2330 MACH_PORT_FACE reply_port,
2331 kern_return_t return_code,
2332 io_buf_len_t bytes_written)
2333{
2334 panic("device_write_reply_inband: illegal");
2335 return KERN_SUCCESS;
2336}
2337
2338kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2339kern_return_t
2340device_read_reply(
2341 MACH_PORT_FACE reply_port,
2342 kern_return_t return_code,
2343 io_buf_ptr_t data,
2344 mach_msg_type_number_t dataCnt)
2345{
2346 struct vs_async *vsa;
2347 vsa = (struct vs_async *)
2348 ((struct vstruct_alias *)(reply_port->alias))->vs;
2349 vsa->vsa_addr = (vm_offset_t)data;
2350 vsa->vsa_size = (vm_size_t)dataCnt;
2351 vsa->vsa_error = return_code;
b0d623f7 2352 thread_wakeup(&vsa);
1c79356b
A
2353 return KERN_SUCCESS;
2354}
2355
2356kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2357kern_return_t
2358device_read_reply_inband(
2359 MACH_PORT_FACE reply_port,
2360 kern_return_t return_code,
2361 io_buf_ptr_inband_t data,
2362 mach_msg_type_number_t dataCnt)
2363{
2364 panic("device_read_reply_inband: illegal");
2365 return KERN_SUCCESS;
2366}
2367
2368kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2369kern_return_t
2370device_read_reply_overwrite(
2371 MACH_PORT_FACE reply_port,
2372 kern_return_t return_code,
2373 io_buf_len_t bytes_read)
2374{
2375 panic("device_read_reply_overwrite: illegal\n");
2376 return KERN_SUCCESS;
2377}
2378
2379kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2380kern_return_t
2381device_open_reply(
2382 MACH_PORT_FACE reply_port,
2383 kern_return_t return_code,
2384 MACH_PORT_FACE device_port)
2385{
2386 panic("device_open_reply: illegal\n");
2387 return KERN_SUCCESS;
2388}
2389
1c79356b
A
2390kern_return_t
2391ps_read_device(
2392 paging_segment_t ps,
b0d623f7 2393 dp_offset_t offset,
1c79356b
A
2394 vm_offset_t *bufferp,
2395 unsigned int size,
2396 unsigned int *residualp,
2397 int flags)
2398{
2399 kern_return_t kr;
2400 recnum_t dev_offset;
2401 unsigned int bytes_wanted;
2402 unsigned int bytes_read;
2403 unsigned int total_read;
2404 vm_offset_t dev_buffer;
2405 vm_offset_t buf_ptr;
2406 unsigned int records_read;
1c79356b 2407 struct vs_async *vsa;
1c79356b
A
2408
2409 device_t device;
2410 vm_map_copy_t device_data = NULL;
2411 default_pager_thread_t *dpt = NULL;
2412
2413 device = dev_port_lookup(ps->ps_device);
55e303ae 2414 clustered_reads[atop_32(size)]++;
1c79356b
A
2415
2416 dev_offset = (ps->ps_offset +
2417 (offset >> (vm_page_shift - ps->ps_record_shift)));
2418 bytes_wanted = size;
2419 total_read = 0;
2420 *bufferp = (vm_offset_t)NULL;
2421
2422 do {
2423 vsa = VS_ALLOC_ASYNC();
2424 if (vsa) {
2425 vsa->vsa_vs = NULL;
2426 vsa->vsa_addr = 0;
2427 vsa->vsa_offset = 0;
2428 vsa->vsa_size = 0;
2429 vsa->vsa_ps = NULL;
2430 }
1c79356b
A
2431 ip_lock(vsa->reply_port);
2432 vsa->reply_port->ip_sorights++;
2433 ip_reference(vsa->reply_port);
2434 ip_unlock(vsa->reply_port);
2435 kr = ds_device_read_common(device,
2436 vsa->reply_port,
2437 (mach_msg_type_name_t)
2438 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2439 (dev_mode_t) 0,
2440 dev_offset,
2441 bytes_wanted,
2442 (IO_READ | IO_CALL),
2443 (io_buf_ptr_t *) &dev_buffer,
2444 (mach_msg_type_number_t *) &bytes_read);
2445 if(kr == MIG_NO_REPLY) {
b0d623f7 2446 assert_wait(&vsa, THREAD_UNINT);
9bccf70c 2447 thread_block(THREAD_CONTINUE_NULL);
1c79356b
A
2448
2449 dev_buffer = vsa->vsa_addr;
2450 bytes_read = (unsigned int)vsa->vsa_size;
2451 kr = vsa->vsa_error;
2452 }
2453 VS_FREE_ASYNC(vsa);
2454 if (kr != KERN_SUCCESS || bytes_read == 0) {
2455 break;
2456 }
2457 total_read += bytes_read;
2458
2459 /*
2460 * If we got the entire range, use the returned dev_buffer.
2461 */
2462 if (bytes_read == size) {
2463 *bufferp = (vm_offset_t)dev_buffer;
2464 break;
2465 }
2466
2467#if 1
2468 dprintf(("read only %d bytes out of %d\n",
2469 bytes_read, bytes_wanted));
2470#endif
2471 if(dpt == NULL) {
2472 dpt = get_read_buffer();
2473 buf_ptr = dpt->dpt_buffer;
2474 *bufferp = (vm_offset_t)buf_ptr;
2475 }
2476 /*
2477 * Otherwise, copy the data into the provided buffer (*bufferp)
2478 * and append the rest of the range as it comes in.
2479 */
2480 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2481 buf_ptr += bytes_read;
2482 bytes_wanted -= bytes_read;
2483 records_read = (bytes_read >>
2484 (vm_page_shift - ps->ps_record_shift));
2485 dev_offset += records_read;
91447636
A
2486 DP_DEBUG(DEBUG_VS_INTERNAL,
2487 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2488 dev_buffer, bytes_read));
1c79356b
A
2489 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2490 != KERN_SUCCESS)
2491 Panic("dealloc buf");
2492 } while (bytes_wanted);
2493
2494 *residualp = size - total_read;
2495 if((dev_buffer != *bufferp) && (total_read != 0)) {
2496 vm_offset_t temp_buffer;
91447636 2497 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
1c79356b
A
2498 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2499 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2500 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2501 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2502 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2503 (vm_map_copy_t *)&device_data, FALSE))
2504 panic("ps_read_device: cannot copyin locally provided buffer\n");
2505 }
2506 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2507 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2508 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2509 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2510 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2511 (vm_map_copy_t *)&device_data, FALSE))
2512 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2513 }
2514 else {
2515 device_data = NULL;
2516 }
2517 *bufferp = (vm_offset_t)device_data;
2518
2519 if(dpt != NULL) {
2520 /* Free the receive buffer */
2521 dpt->checked_out = 0;
2522 thread_wakeup(&dpt_array);
2523 }
2524 return KERN_SUCCESS;
2525}
2526
1c79356b
A
2527kern_return_t
2528ps_write_device(
2529 paging_segment_t ps,
b0d623f7 2530 dp_offset_t offset,
1c79356b
A
2531 vm_offset_t addr,
2532 unsigned int size,
2533 struct vs_async *vsa)
2534{
2535 recnum_t dev_offset;
2536 io_buf_len_t bytes_to_write, bytes_written;
2537 recnum_t records_written;
2538 kern_return_t kr;
2539 MACH_PORT_FACE reply_port;
1c79356b
A
2540
2541
2542
55e303ae 2543 clustered_writes[atop_32(size)]++;
1c79356b
A
2544
2545 dev_offset = (ps->ps_offset +
2546 (offset >> (vm_page_shift - ps->ps_record_shift)));
2547 bytes_to_write = size;
2548
2549 if (vsa) {
2550 /*
2551 * Asynchronous write.
2552 */
2553 reply_port = vsa->reply_port;
2554 ip_lock(reply_port);
2555 reply_port->ip_sorights++;
2556 ip_reference(reply_port);
2557 ip_unlock(reply_port);
2558 {
2559 device_t device;
2560 device = dev_port_lookup(ps->ps_device);
2561
2562 vsa->vsa_addr = addr;
2563 kr=ds_device_write_common(device,
2564 reply_port,
2565 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2566 (dev_mode_t) 0,
2567 dev_offset,
2568 (io_buf_ptr_t) addr,
2569 size,
2570 (IO_WRITE | IO_CALL),
2571 &bytes_written);
2572 }
2573 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2574 if (verbose)
2575 dprintf(("%s0x%x, addr=0x%x,"
2576 "size=0x%x,offset=0x%x\n",
2577 "device_write_request returned ",
2578 kr, addr, size, offset));
2579 BS_STAT(ps->ps_bs,
55e303ae 2580 ps->ps_bs->bs_pages_out_fail += atop_32(size));
1c79356b
A
2581 /* do the completion notification to free resources */
2582 device_write_reply(reply_port, kr, 0);
2583 return PAGER_ERROR;
2584 }
2585 } else do {
2586 /*
2587 * Synchronous write.
2588 */
2589 {
2590 device_t device;
2591 device = dev_port_lookup(ps->ps_device);
2592 kr=ds_device_write_common(device,
2593 IP_NULL, 0,
2594 (dev_mode_t) 0,
2595 dev_offset,
2596 (io_buf_ptr_t) addr,
2597 size,
2598 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2599 &bytes_written);
2600 }
2601 if (kr != KERN_SUCCESS) {
2602 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2603 "device_write returned ",
2604 kr, addr, size, offset));
2605 BS_STAT(ps->ps_bs,
55e303ae 2606 ps->ps_bs->bs_pages_out_fail += atop_32(size));
1c79356b
A
2607 return PAGER_ERROR;
2608 }
2609 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2610 Panic("fragmented write");
2611 records_written = (bytes_written >>
2612 (vm_page_shift - ps->ps_record_shift));
2613 dev_offset += records_written;
2614#if 1
2615 if (bytes_written != bytes_to_write) {
2616 dprintf(("wrote only %d bytes out of %d\n",
2617 bytes_written, bytes_to_write));
2618 }
2619#endif
2620 bytes_to_write -= bytes_written;
2621 addr += bytes_written;
2622 } while (bytes_to_write > 0);
2623
2624 return PAGER_SUCCESS;
2625}
2626
2627
2628#else /* !DEVICE_PAGING */
2629
2630kern_return_t
2631ps_read_device(
91447636 2632 __unused paging_segment_t ps,
b0d623f7 2633 __unused dp_offset_t offset,
91447636
A
2634 __unused vm_offset_t *bufferp,
2635 __unused unsigned int size,
2636 __unused unsigned int *residualp,
2637 __unused int flags)
1c79356b
A
2638{
2639 panic("ps_read_device not supported");
0c530ab8 2640 return KERN_FAILURE;
1c79356b
A
2641}
2642
91447636 2643kern_return_t
1c79356b 2644ps_write_device(
91447636 2645 __unused paging_segment_t ps,
b0d623f7 2646 __unused dp_offset_t offset,
91447636
A
2647 __unused vm_offset_t addr,
2648 __unused unsigned int size,
2649 __unused struct vs_async *vsa)
1c79356b
A
2650{
2651 panic("ps_write_device not supported");
0c530ab8 2652 return KERN_FAILURE;
1c79356b
A
2653}
2654
2655#endif /* DEVICE_PAGING */
91447636 2656void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
1c79356b
A
2657
2658void
2659pvs_object_data_provided(
91447636
A
2660 __unused vstruct_t vs,
2661 __unused upl_t upl,
2662 __unused upl_offset_t offset,
2663 upl_size_t size)
1c79356b 2664{
1c79356b 2665
91447636
A
2666 DP_DEBUG(DEBUG_VS_INTERNAL,
2667 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2668 upl, offset, size));
1c79356b
A
2669
2670 ASSERT(size > 0);
55e303ae 2671 GSTAT(global_stats.gs_pages_in += atop_32(size));
1c79356b
A
2672
2673
2674#if USE_PRECIOUS
2675 ps_clunmap(vs, offset, size);
2676#endif /* USE_PRECIOUS */
2677
2678}
2679
2d21ac55
A
2680static memory_object_offset_t last_start;
2681static vm_size_t last_length;
2682
1c79356b
A
2683kern_return_t
2684pvs_cluster_read(
2685 vstruct_t vs,
b0d623f7
A
2686 dp_offset_t vs_offset,
2687 dp_size_t cnt,
2d21ac55 2688 void *fault_info)
1c79356b 2689{
1c79356b 2690 kern_return_t error = KERN_SUCCESS;
2d21ac55 2691 unsigned int size;
0c530ab8 2692 unsigned int residual;
1c79356b 2693 unsigned int request_flags;
b0d623f7 2694 int io_flags = 0;
2d21ac55
A
2695 int seg_index;
2696 int pages_in_cl;
0b4e3aa0
A
2697 int cl_size;
2698 int cl_mask;
2d21ac55
A
2699 int cl_index;
2700 unsigned int xfer_size;
b0d623f7 2701 dp_offset_t orig_vs_offset;
0b4c1975
A
2702 dp_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2703 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
0b4e3aa0 2704 struct clmap clmap;
2d21ac55
A
2705 upl_t upl;
2706 unsigned int page_list_count;
b0d623f7
A
2707 memory_object_offset_t cluster_start;
2708 vm_size_t cluster_length;
2709 uint32_t io_streaming;
0b4e3aa0
A
2710
2711 pages_in_cl = 1 << vs->vs_clshift;
2712 cl_size = pages_in_cl * vm_page_size;
2713 cl_mask = cl_size - 1;
1c79356b 2714
1c79356b 2715#if USE_PRECIOUS
2d21ac55 2716 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
1c79356b 2717#else
2d21ac55 2718 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
1c79356b 2719#endif
2d21ac55
A
2720 cl_index = (vs_offset & cl_mask) / vm_page_size;
2721
b0d623f7 2722 if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) ||
2d21ac55
A
2723 !CLMAP_ISSET(clmap, cl_index)) {
2724 /*
2725 * the needed page doesn't exist in the backing store...
2726 * we don't want to try to do any I/O, just abort the
2727 * page and let the fault handler provide a zero-fill
2728 */
2729 if (cnt == 0) {
2730 /*
2731 * The caller was just poking at us to see if
2732 * the page has been paged out. No need to
2733 * mess with the page at all.
2734 * Just let the caller know we don't have that page.
2735 */
2736 return KERN_FAILURE;
2737 }
2738
2739 page_list_count = 0;
2740
2741 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2742 PAGE_SIZE, PAGE_SIZE,
2743 &upl, NULL, &page_list_count,
2744 request_flags);
2745
2746 if (clmap.cl_error)
2747 upl_abort(upl, UPL_ABORT_ERROR);
2748 else
2749 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2750 upl_deallocate(upl);
91447636 2751
2d21ac55
A
2752 return KERN_SUCCESS;
2753 }
2754
2755 if (cnt == 0) {
2756 /*
2757 * The caller was just poking at us to see if
2758 * the page has been paged out. No need to
2759 * mess with the page at all.
2760 * Just let the caller know we do have that page.
2761 */
2762 return KERN_SUCCESS;
2763 }
2764
91447636
A
2765 assert(dp_encryption_inited);
2766 if (dp_encryption) {
2767 /*
2768 * ENCRYPTED SWAP:
2769 * request that the UPL be prepared for
2770 * decryption.
2771 */
2772 request_flags |= UPL_ENCRYPT;
2773 }
2d21ac55 2774 orig_vs_offset = vs_offset;
91447636 2775
2d21ac55
A
2776 assert(cnt != 0);
2777 cnt = VM_SUPER_CLUSTER;
b0d623f7
A
2778 cluster_start = (memory_object_offset_t) vs_offset;
2779 cluster_length = (vm_size_t) cnt;
2780 io_streaming = 0;
1c79356b 2781
2d21ac55
A
2782 /*
2783 * determine how big a speculative I/O we should try for...
2784 */
b0d623f7
A
2785 if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
2786 assert(vs_offset >= (dp_offset_t) cluster_start &&
2787 vs_offset < (dp_offset_t) (cluster_start + cluster_length));
2788 vs_offset = (dp_offset_t) cluster_start;
2789 cnt = (dp_size_t) cluster_length;
2790 } else {
2791 cluster_length = PAGE_SIZE;
2d21ac55 2792 cnt = PAGE_SIZE;
b0d623f7
A
2793 }
2794
2795 if (io_streaming)
2796 io_flags |= UPL_IOSTREAMING;
2d21ac55 2797
b0d623f7
A
2798 last_start = cluster_start;
2799 last_length = cluster_length;
2d21ac55
A
2800
2801 /*
2802 * This loop will be executed multiple times until the entire
2803 * range has been looked at or we issue an I/O... if the request spans cluster
2804 * boundaries, the clusters will be checked for logical continunity,
2805 * if contiguous the I/O request will span multiple clusters...
2806 * at most only 1 I/O will be issued... it will encompass the original offset
2807 */
2808 while (cnt && error == KERN_SUCCESS) {
2809 int ps_info_valid;
2810
2811 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
d12e1678
A
2812 size = VM_SUPER_CLUSTER;
2813 size -= vs_offset & cl_mask;
2d21ac55 2814 } else if (cnt > VM_SUPER_CLUSTER)
0b4e3aa0 2815 size = VM_SUPER_CLUSTER;
2d21ac55 2816 else
0b4e3aa0 2817 size = cnt;
2d21ac55 2818
0b4e3aa0 2819 cnt -= size;
1c79356b 2820
0b4e3aa0
A
2821 ps_info_valid = 0;
2822 seg_index = 0;
1c79356b 2823
0b4e3aa0 2824 while (size > 0 && error == KERN_SUCCESS) {
2d21ac55 2825 unsigned int abort_size;
0b4e3aa0
A
2826 int failed_size;
2827 int beg_pseg;
2828 int beg_indx;
b0d623f7 2829 dp_offset_t cur_offset;
1c79356b 2830
0b4e3aa0
A
2831 if ( !ps_info_valid) {
2832 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2833 psp[seg_index] = CLMAP_PS(clmap);
2834 ps_info_valid = 1;
1c79356b 2835 }
0b4e3aa0
A
2836 /*
2837 * skip over unallocated physical segments
2838 */
b0d623f7 2839 if (ps_offset[seg_index] == (dp_offset_t) -1) {
0b4e3aa0
A
2840 abort_size = cl_size - (vs_offset & cl_mask);
2841 abort_size = MIN(abort_size, size);
2842
2d21ac55
A
2843 size -= abort_size;
2844 vs_offset += abort_size;
1c79356b 2845
0b4e3aa0
A
2846 seg_index++;
2847 ps_info_valid = 0;
2d21ac55 2848
0b4e3aa0 2849 continue;
1c79356b 2850 }
0b4e3aa0
A
2851 cl_index = (vs_offset & cl_mask) / vm_page_size;
2852
2853 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2854 /*
2855 * skip over unallocated pages
2856 */
2857 if (CLMAP_ISSET(clmap, cl_index))
2858 break;
2859 abort_size += vm_page_size;
2860 }
2861 if (abort_size) {
2d21ac55
A
2862 size -= abort_size;
2863 vs_offset += abort_size;
0b4e3aa0
A
2864
2865 if (cl_index == pages_in_cl) {
2866 /*
2867 * if we're at the end of this physical cluster
2868 * then bump to the next one and continue looking
2869 */
2870 seg_index++;
2871 ps_info_valid = 0;
2d21ac55 2872
0b4e3aa0
A
2873 continue;
2874 }
2875 if (size == 0)
2876 break;
2877 }
1c79356b 2878 /*
0b4e3aa0
A
2879 * remember the starting point of the first allocated page
2880 * for the I/O we're about to issue
1c79356b 2881 */
0b4e3aa0
A
2882 beg_pseg = seg_index;
2883 beg_indx = cl_index;
2884 cur_offset = vs_offset;
2885
2886 /*
2887 * calculate the size of the I/O that we can do...
2888 * this may span multiple physical segments if
2889 * they are contiguous
2890 */
2891 for (xfer_size = 0; xfer_size < size; ) {
2892
2d21ac55 2893 while (cl_index < pages_in_cl && xfer_size < size) {
0b4e3aa0 2894 /*
55e303ae 2895 * accumulate allocated pages within
d12e1678 2896 * a physical segment
1c79356b 2897 */
0b4e3aa0
A
2898 if (CLMAP_ISSET(clmap, cl_index)) {
2899 xfer_size += vm_page_size;
2900 cur_offset += vm_page_size;
2901 cl_index++;
2902
2903 BS_STAT(psp[seg_index]->ps_bs,
2904 psp[seg_index]->ps_bs->bs_pages_in++);
2905 } else
2906 break;
2907 }
2d21ac55 2908 if (cl_index < pages_in_cl || xfer_size >= size) {
0b4e3aa0 2909 /*
55e303ae 2910 * we've hit an unallocated page or
2d21ac55
A
2911 * the end of this request... see if
2912 * it's time to fire the I/O
1c79356b 2913 */
0b4e3aa0
A
2914 break;
2915 }
2916 /*
d12e1678 2917 * we've hit the end of the current physical
55e303ae 2918 * segment and there's more to do, so try
d12e1678 2919 * moving to the next one
0b4e3aa0
A
2920 */
2921 seg_index++;
2922
2d21ac55 2923 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
d12e1678 2924 psp[seg_index] = CLMAP_PS(clmap);
0b4e3aa0
A
2925 ps_info_valid = 1;
2926
2927 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2928 /*
55e303ae
A
2929 * if the physical segment we're about
2930 * to step into is not contiguous to
2931 * the one we're currently in, or it's
d12e1678 2932 * in a different paging file, or
0b4e3aa0 2933 * it hasn't been allocated....
2d21ac55
A
2934 * we stop this run and go check
2935 * to see if it's time to fire the I/O
0b4e3aa0
A
2936 */
2937 break;
1c79356b 2938 }
0b4e3aa0 2939 /*
d12e1678 2940 * start with first page of the next physical
2d21ac55 2941 * segment
0b4e3aa0
A
2942 */
2943 cl_index = 0;
1c79356b 2944 }
2d21ac55 2945 if (xfer_size == 0) {
0b4e3aa0 2946 /*
2d21ac55 2947 * no I/O to generate for this segment
0b4e3aa0 2948 */
0b4e3aa0 2949 continue;
2d21ac55
A
2950 }
2951 if (cur_offset <= orig_vs_offset) {
2952 /*
2953 * we've hit a hole in our speculative cluster
2954 * before the offset that we're really after...
2955 * don't issue the I/O since it doesn't encompass
2956 * the original offset and we're looking to only
2957 * pull in the speculative pages if they can be
2958 * made part of a single I/O
2959 */
2960 size -= xfer_size;
2961 vs_offset += xfer_size;
1c79356b 2962
2d21ac55
A
2963 continue;
2964 }
2965 /*
2966 * we have a contiguous range of allocated pages
2967 * to read from that encompasses the original offset
2968 */
2969 page_list_count = 0;
2970 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2971 xfer_size, xfer_size,
2972 &upl, NULL, &page_list_count,
2973 request_flags | UPL_SET_INTERNAL | UPL_NOBLOCK);
2974
2975 error = ps_read_file(psp[beg_pseg],
2976 upl, (upl_offset_t) 0,
2977 ps_offset[beg_pseg] + (beg_indx * vm_page_size),
b0d623f7 2978 xfer_size, &residual, io_flags);
2d21ac55 2979
0b4e3aa0
A
2980 failed_size = 0;
2981
2982 /*
55e303ae 2983 * Adjust counts and send response to VM. Optimize
d12e1678 2984 * for the common case, i.e. no error and/or partial
55e303ae 2985 * data. If there was an error, then we need to error
d12e1678 2986 * the entire range, even if some data was successfully
55e303ae 2987 * read. If there was a partial read we may supply some
0b4e3aa0 2988 * data and may error some as well. In all cases the
55e303ae
A
2989 * VM must receive some notification for every page
2990 * in the range.
0b4e3aa0
A
2991 */
2992 if ((error == KERN_SUCCESS) && (residual == 0)) {
2993 /*
d12e1678 2994 * Got everything we asked for, supply the data
55e303ae
A
2995 * to the VM. Note that as a side effect of
2996 * supplying the data, the buffer holding the
2997 * supplied data is deallocated from the pager's
2998 * address space.
0b4e3aa0 2999 */
2d21ac55 3000 pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
0b4e3aa0
A
3001 } else {
3002 failed_size = xfer_size;
3003
3004 if (error == KERN_SUCCESS) {
2d21ac55
A
3005 if (residual == xfer_size) {
3006 /*
3007 * If a read operation returns no error
3008 * and no data moved, we turn it into
3009 * an error, assuming we're reading at
3010 * or beyong EOF.
3011 * Fall through and error the entire range.
3012 */
0b4e3aa0
A
3013 error = KERN_FAILURE;
3014 } else {
2d21ac55
A
3015 /*
3016 * Otherwise, we have partial read. If
3017 * the part read is a integral number
3018 * of pages supply it. Otherwise round
3019 * it up to a page boundary, zero fill
3020 * the unread part, and supply it.
3021 * Fall through and error the remainder
3022 * of the range, if any.
3023 */
3024 int fill;
3025 unsigned int lsize;
3026
3027 fill = residual & ~vm_page_size;
3028 lsize = (xfer_size - residual) + fill;
0b4e3aa0 3029
2d21ac55 3030 pvs_object_data_provided(vs, upl, vs_offset, lsize);
0b4e3aa0
A
3031
3032 if (lsize < xfer_size) {
2d21ac55 3033 failed_size = xfer_size - lsize;
0b4e3aa0
A
3034 error = KERN_FAILURE;
3035 }
3036 }
3037 }
3038 }
1c79356b 3039 if (error != KERN_SUCCESS) {
2d21ac55
A
3040 /*
3041 * There was an error in some part of the range, tell
3042 * the VM. Note that error is explicitly checked again
3043 * since it can be modified above.
3044 */
0b4e3aa0 3045 BS_STAT(psp[beg_pseg]->ps_bs,
2d21ac55 3046 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
1c79356b 3047 }
2d21ac55
A
3048 /*
3049 * we've issued a single I/O that encompassed the original offset
3050 * at this point we either met our speculative request length or
3051 * we ran into a 'hole' (i.e. page not present in the cluster, cluster
3052 * not present or not physically contiguous to the previous one), so
3053 * we're done issuing I/O at this point
3054 */
3055 return (error);
1c79356b 3056 }
2d21ac55 3057 }
1c79356b
A
3058 return error;
3059}
3060
3061int vs_do_async_write = 1;
3062
3063kern_return_t
3064vs_cluster_write(
3065 vstruct_t vs,
3066 upl_t internal_upl,
91447636
A
3067 upl_offset_t offset,
3068 upl_size_t cnt,
1c79356b
A
3069 boolean_t dp_internal,
3070 int flags)
3071{
91447636 3072 upl_size_t transfer_size;
1c79356b
A
3073 int error = 0;
3074 struct clmap clmap;
0b4e3aa0 3075
b0d623f7 3076 dp_offset_t actual_offset; /* Offset within paging segment */
1c79356b 3077 paging_segment_t ps;
b0d623f7
A
3078 dp_offset_t mobj_base_addr;
3079 dp_offset_t mobj_target_addr;
1c79356b
A
3080
3081 upl_t upl;
0b4e3aa0 3082 upl_page_info_t *pl;
1c79356b 3083 int page_index;
0b4c1975 3084 unsigned int page_max_index;
1c79356b 3085 int list_size;
55e303ae 3086 int pages_in_cl;
91447636 3087 unsigned int cl_size;
55e303ae 3088 int base_index;
91447636 3089 unsigned int seg_size;
b0d623f7 3090 unsigned int upl_offset_in_object;
0b4c1975
A
3091 boolean_t minimal_clustering = FALSE;
3092 boolean_t found_dirty;
55e303ae
A
3093
3094 pages_in_cl = 1 << vs->vs_clshift;
3095 cl_size = pages_in_cl * vm_page_size;
1c79356b 3096
0b4c1975
A
3097#if CONFIG_FREEZE
3098 minimal_clustering = TRUE;
3099#endif
3100 if (dp_isssd == TRUE)
3101 minimal_clustering = TRUE;
3102
1c79356b 3103 if (!dp_internal) {
0c530ab8 3104 unsigned int page_list_count;
1c79356b 3105 int request_flags;
91447636 3106 unsigned int super_size;
0b4e3aa0
A
3107 int first_dirty;
3108 int num_dirty;
3109 int num_of_pages;
3110 int seg_index;
91447636 3111 upl_offset_t upl_offset;
0b4c1975 3112 upl_offset_t upl_offset_aligned;
b0d623f7 3113 dp_offset_t seg_offset;
0b4c1975
A
3114 dp_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3115 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
0b4e3aa0 3116
1c79356b 3117
0b4c1975 3118 if (bs_low)
1c79356b 3119 super_size = cl_size;
0b4c1975 3120 else
1c79356b 3121 super_size = VM_SUPER_CLUSTER;
0b4e3aa0 3122
0b4c1975
A
3123 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
3124 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2d21ac55 3125 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
1c79356b 3126
91447636
A
3127 if (!dp_encryption_inited) {
3128 /*
3129 * ENCRYPTED SWAP:
3130 * Once we've started using swap, we
3131 * can't change our mind on whether
3132 * it needs to be encrypted or
3133 * not.
3134 */
3135 dp_encryption_inited = TRUE;
3136 }
3137 if (dp_encryption) {
3138 /*
3139 * ENCRYPTED SWAP:
3140 * request that the UPL be prepared for
3141 * encryption.
3142 */
3143 request_flags |= UPL_ENCRYPT;
3144 flags |= UPL_PAGING_ENCRYPTED;
3145 }
0b4e3aa0
A
3146 page_list_count = 0;
3147 memory_object_super_upl_request(vs->vs_control,
3148 (memory_object_offset_t)offset,
3149 cnt, super_size,
3150 &upl, NULL, &page_list_count,
55e303ae 3151 request_flags | UPL_FOR_PAGEOUT);
1c79356b 3152
b0d623f7
A
3153 /*
3154 * The default pager does not handle objects larger than
3155 * 4GB, so it does not deal with offset that don't fit in
3156 * 32-bit. Cast down upl->offset now and make sure we
3157 * did not lose any valuable bits.
3158 */
3159 upl_offset_in_object = (unsigned int) upl->offset;
3160 assert(upl->offset == upl_offset_in_object);
3161
0b4e3aa0 3162 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1c79356b 3163
b0d623f7 3164 seg_size = cl_size - (upl_offset_in_object % cl_size);
0b4c1975
A
3165 upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1);
3166 page_index = 0;
3167 page_max_index = upl->size / PAGE_SIZE;
3168 found_dirty = TRUE;
55e303ae 3169
0b4c1975
A
3170 for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
3171 unsigned int seg_pgcnt;
1c79356b 3172
0b4c1975 3173 seg_pgcnt = seg_size / PAGE_SIZE;
1c79356b 3174
0b4c1975
A
3175 if (minimal_clustering == TRUE) {
3176 unsigned int non_dirty;
1c79356b 3177
0b4c1975
A
3178 non_dirty = 0;
3179 found_dirty = FALSE;
3180
3181 for (; non_dirty < seg_pgcnt; non_dirty++) {
3182 if ((page_index + non_dirty) >= page_max_index)
3183 break;
3184
3185 if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) ||
3186 UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) {
3187 found_dirty = TRUE;
3188 break;
3189 }
3190 }
3191 }
3192 if (found_dirty == TRUE) {
3193 ps_offset[seg_index] =
3194 ps_clmap(vs,
3195 upl_offset_aligned,
3196 &clmap, CL_ALLOC,
3197 cl_size, 0);
3198
3199 if (ps_offset[seg_index] == (dp_offset_t) -1) {
3200 upl_abort(upl, 0);
3201 upl_deallocate(upl);
3202
3203 return KERN_FAILURE;
3204 }
3205 psp[seg_index] = CLMAP_PS(clmap);
3206 }
55e303ae 3207 if (transfer_size > seg_size) {
0b4c1975 3208 page_index += seg_pgcnt;
55e303ae 3209 transfer_size -= seg_size;
0b4c1975 3210 upl_offset_aligned += cl_size;
55e303ae 3211 seg_size = cl_size;
0b4e3aa0
A
3212 seg_index++;
3213 } else
3214 transfer_size = 0;
3215 }
55e303ae
A
3216 /*
3217 * Ignore any non-present pages at the end of the
3218 * UPL.
3219 */
3220 for (page_index = upl->size / vm_page_size; page_index > 0;)
3221 if (UPL_PAGE_PRESENT(pl, --page_index))
3222 break;
3223 num_of_pages = page_index + 1;
3224
b0d623f7 3225 base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE;
55e303ae
A
3226
3227 for (page_index = 0; page_index < num_of_pages; ) {
0b4e3aa0
A
3228 /*
3229 * skip over non-dirty pages
3230 */
3231 for ( ; page_index < num_of_pages; page_index++) {
55e303ae 3232 if (UPL_DIRTY_PAGE(pl, page_index)
d12e1678 3233 || UPL_PRECIOUS_PAGE(pl, page_index))
0b4e3aa0
A
3234 /*
3235 * this is a page we need to write
55e303ae 3236 * go see if we can buddy it up with
d12e1678 3237 * others that are contiguous to it
0b4e3aa0
A
3238 */
3239 break;
3240 /*
d12e1678 3241 * if the page is not-dirty, but present we
55e303ae 3242 * need to commit it... This is an unusual
d12e1678 3243 * case since we only asked for dirty pages
0b4e3aa0
A
3244 */
3245 if (UPL_PAGE_PRESENT(pl, page_index)) {
3246 boolean_t empty = FALSE;
3247 upl_commit_range(upl,
3248 page_index * vm_page_size,
3249 vm_page_size,
3250 UPL_COMMIT_NOTIFY_EMPTY,
3251 pl,
d52fe63f 3252 page_list_count,
0b4e3aa0 3253 &empty);
55e303ae
A
3254 if (empty) {
3255 assert(page_index ==
3256 num_of_pages - 1);
0b4e3aa0 3257 upl_deallocate(upl);
55e303ae 3258 }
1c79356b 3259 }
1c79356b 3260 }
0b4e3aa0
A
3261 if (page_index == num_of_pages)
3262 /*
3263 * no more pages to look at, we're out of here
3264 */
3265 break;
1c79356b 3266
0b4e3aa0 3267 /*
55e303ae
A
3268 * gather up contiguous dirty pages... we have at
3269 * least 1 * otherwise we would have bailed above
0b4e3aa0
A
3270 * make sure that each physical segment that we step
3271 * into is contiguous to the one we're currently in
3272 * if it's not, we have to stop and write what we have
3273 */
55e303ae 3274 for (first_dirty = page_index;
d12e1678 3275 page_index < num_of_pages; ) {
55e303ae 3276 if ( !UPL_DIRTY_PAGE(pl, page_index)
d12e1678 3277 && !UPL_PRECIOUS_PAGE(pl, page_index))
0b4e3aa0
A
3278 break;
3279 page_index++;
3280 /*
3281 * if we just looked at the last page in the UPL
3282 * we don't need to check for physical segment
3283 * continuity
3284 */
3285 if (page_index < num_of_pages) {
3286 int cur_seg;
3287 int nxt_seg;
3288
55e303ae
A
3289 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3290 nxt_seg = (base_index + page_index)/pages_in_cl;
0b4e3aa0
A
3291
3292 if (cur_seg != nxt_seg) {
3293 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
55e303ae
A
3294 /*
3295 * if the segment we're about
3296 * to step into is not
3297 * contiguous to the one we're
3298 * currently in, or it's in a
d12e1678 3299 * different paging file....
55e303ae 3300 * we stop here and generate
d12e1678
A
3301 * the I/O
3302 */
0b4e3aa0 3303 break;
1c79356b 3304 }
1c79356b 3305 }
0b4e3aa0
A
3306 }
3307 num_dirty = page_index - first_dirty;
1c79356b 3308
0b4e3aa0
A
3309 if (num_dirty) {
3310 upl_offset = first_dirty * vm_page_size;
0b4e3aa0
A
3311 transfer_size = num_dirty * vm_page_size;
3312
d12e1678 3313 while (transfer_size) {
1c79356b 3314
d12e1678 3315 if ((seg_size = cl_size -
b0d623f7
A
3316 ((upl_offset_in_object +
3317 upl_offset) % cl_size))
d12e1678
A
3318 > transfer_size)
3319 seg_size = transfer_size;
0b4e3aa0 3320
b0d623f7
A
3321 ps_vs_write_complete(
3322 vs,
3323 (upl_offset_in_object +
3324 upl_offset),
d12e1678 3325 seg_size, error);
0b4e3aa0 3326
d12e1678
A
3327 transfer_size -= seg_size;
3328 upl_offset += seg_size;
0b4e3aa0 3329 }
d12e1678
A
3330 upl_offset = first_dirty * vm_page_size;
3331 transfer_size = num_dirty * vm_page_size;
55e303ae
A
3332
3333 seg_index = (base_index + first_dirty) / pages_in_cl;
b0d623f7 3334 seg_offset = (upl_offset_in_object + upl_offset) % cl_size;
55e303ae 3335
d12e1678
A
3336 error = ps_write_file(psp[seg_index],
3337 upl, upl_offset,
3338 ps_offset[seg_index]
3339 + seg_offset,
3340 transfer_size, flags);
55e303ae 3341 } else {
0b4e3aa0
A
3342 boolean_t empty = FALSE;
3343 upl_abort_range(upl,
3344 first_dirty * vm_page_size,
3345 num_dirty * vm_page_size,
3346 UPL_ABORT_NOTIFY_EMPTY,
3347 &empty);
55e303ae
A
3348 if (empty) {
3349 assert(page_index == num_of_pages);
0b4e3aa0 3350 upl_deallocate(upl);
55e303ae 3351 }
1c79356b 3352 }
1c79356b 3353 }
0b4e3aa0 3354
1c79356b 3355 } else {
b0d623f7 3356 assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift));
1c79356b
A
3357 list_size = cnt;
3358
3359 page_index = 0;
3360 /* The caller provides a mapped_data which is derived */
3361 /* from a temporary object. The targeted pages are */
3362 /* guaranteed to be set at offset 0 in the mapped_data */
3363 /* The actual offset however must still be derived */
3364 /* from the offset in the vs in question */
3365 mobj_base_addr = offset;
3366 mobj_target_addr = mobj_base_addr;
3367
3368 for (transfer_size = list_size; transfer_size != 0;) {
3369 actual_offset = ps_clmap(vs, mobj_target_addr,
3370 &clmap, CL_ALLOC,
3371 transfer_size < cl_size ?
3372 transfer_size : cl_size, 0);
b0d623f7 3373 if(actual_offset == (dp_offset_t) -1) {
1c79356b
A
3374 error = 1;
3375 break;
3376 }
3377 cnt = MIN(transfer_size,
b0d623f7 3378 (unsigned) CLMAP_NPGS(clmap) * vm_page_size);
1c79356b
A
3379 ps = CLMAP_PS(clmap);
3380 /* Assume that the caller has given us contiguous */
3381 /* pages */
3382 if(cnt) {
d12e1678
A
3383 ps_vs_write_complete(vs, mobj_target_addr,
3384 cnt, error);
1c79356b
A
3385 error = ps_write_file(ps, internal_upl,
3386 0, actual_offset,
3387 cnt, flags);
3388 if (error)
3389 break;
55e303ae 3390 }
1c79356b
A
3391 if (error)
3392 break;
3393 actual_offset += cnt;
3394 mobj_target_addr += cnt;
3395 transfer_size -= cnt;
3396 cnt = 0;
3397
3398 if (error)
3399 break;
3400 }
3401 }
3402 if(error)
3403 return KERN_FAILURE;
3404 else
3405 return KERN_SUCCESS;
3406}
3407
3408vm_size_t
3409ps_vstruct_allocated_size(
3410 vstruct_t vs)
3411{
3412 int num_pages;
3413 struct vs_map *vsmap;
91447636 3414 unsigned int i, j, k;
1c79356b
A
3415
3416 num_pages = 0;
3417 if (vs->vs_indirect) {
3418 /* loop on indirect maps */
3419 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3420 vsmap = vs->vs_imap[i];
3421 if (vsmap == NULL)
3422 continue;
3423 /* loop on clusters in this indirect map */
3424 for (j = 0; j < CLMAP_ENTRIES; j++) {
3425 if (VSM_ISCLR(vsmap[j]) ||
3426 VSM_ISERR(vsmap[j]))
3427 continue;
3428 /* loop on pages in this cluster */
3429 for (k = 0; k < VSCLSIZE(vs); k++) {
3430 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3431 num_pages++;
3432 }
3433 }
3434 }
3435 } else {
3436 vsmap = vs->vs_dmap;
3437 if (vsmap == NULL)
3438 return 0;
3439 /* loop on clusters in the direct map */
3440 for (j = 0; j < CLMAP_ENTRIES; j++) {
3441 if (VSM_ISCLR(vsmap[j]) ||
3442 VSM_ISERR(vsmap[j]))
3443 continue;
3444 /* loop on pages in this cluster */
3445 for (k = 0; k < VSCLSIZE(vs); k++) {
3446 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3447 num_pages++;
3448 }
3449 }
3450 }
3451
55e303ae 3452 return ptoa_32(num_pages);
1c79356b
A
3453}
3454
b0d623f7 3455unsigned int
1c79356b
A
3456ps_vstruct_allocated_pages(
3457 vstruct_t vs,
3458 default_pager_page_t *pages,
b0d623f7 3459 unsigned int pages_size)
1c79356b 3460{
91447636 3461 unsigned int num_pages;
1c79356b 3462 struct vs_map *vsmap;
b0d623f7 3463 dp_offset_t offset;
91447636 3464 unsigned int i, j, k;
1c79356b
A
3465
3466 num_pages = 0;
3467 offset = 0;
3468 if (vs->vs_indirect) {
3469 /* loop on indirect maps */
3470 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3471 vsmap = vs->vs_imap[i];
3472 if (vsmap == NULL) {
3473 offset += (vm_page_size * CLMAP_ENTRIES *
3474 VSCLSIZE(vs));
3475 continue;
3476 }
3477 /* loop on clusters in this indirect map */
3478 for (j = 0; j < CLMAP_ENTRIES; j++) {
3479 if (VSM_ISCLR(vsmap[j]) ||
3480 VSM_ISERR(vsmap[j])) {
3481 offset += vm_page_size * VSCLSIZE(vs);
3482 continue;
3483 }
3484 /* loop on pages in this cluster */
3485 for (k = 0; k < VSCLSIZE(vs); k++) {
3486 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3487 num_pages++;
3488 if (num_pages < pages_size)
3489 pages++->dpp_offset =
3490 offset;
3491 }
3492 offset += vm_page_size;
3493 }
3494 }
3495 }
3496 } else {
3497 vsmap = vs->vs_dmap;
3498 if (vsmap == NULL)
3499 return 0;
3500 /* loop on clusters in the direct map */
3501 for (j = 0; j < CLMAP_ENTRIES; j++) {
3502 if (VSM_ISCLR(vsmap[j]) ||
3503 VSM_ISERR(vsmap[j])) {
3504 offset += vm_page_size * VSCLSIZE(vs);
3505 continue;
3506 }
3507 /* loop on pages in this cluster */
3508 for (k = 0; k < VSCLSIZE(vs); k++) {
3509 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3510 num_pages++;
3511 if (num_pages < pages_size)
3512 pages++->dpp_offset = offset;
3513 }
3514 offset += vm_page_size;
3515 }
3516 }
3517 }
3518
3519 return num_pages;
3520}
3521
3522
3523kern_return_t
3524ps_vstruct_transfer_from_segment(
3525 vstruct_t vs,
3526 paging_segment_t segment,
1c79356b 3527 upl_t upl)
1c79356b
A
3528{
3529 struct vs_map *vsmap;
91447636
A
3530// struct vs_map old_vsmap;
3531// struct vs_map new_vsmap;
3532 unsigned int i, j;
1c79356b
A
3533
3534 VS_LOCK(vs); /* block all work on this vstruct */
3535 /* can't allow the normal multiple write */
3536 /* semantic because writes may conflict */
3537 vs->vs_xfer_pending = TRUE;
3538 vs_wait_for_sync_writers(vs);
3539 vs_start_write(vs);
3540 vs_wait_for_readers(vs);
3541 /* we will unlock the vs to allow other writes while transferring */
3542 /* and will be guaranteed of the persistance of the vs struct */
3543 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3544 /* vs_async_pending */
3545 /* OK we now have guaranteed no other parties are accessing this */
3546 /* vs. Now that we are also supporting simple lock versions of */
3547 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3548 /* our purpose in holding it before was the multiple write case */
3549 /* we now use the boolean xfer_pending to do that. We can use */
3550 /* a boolean instead of a count because we have guaranteed single */
3551 /* file access to this code in its caller */
3552 VS_UNLOCK(vs);
3553vs_changed:
3554 if (vs->vs_indirect) {
91447636
A
3555 unsigned int vsmap_size;
3556 int clmap_off;
1c79356b
A
3557 /* loop on indirect maps */
3558 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3559 vsmap = vs->vs_imap[i];
3560 if (vsmap == NULL)
3561 continue;
3562 /* loop on clusters in this indirect map */
3563 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3564 VSCLSIZE(vs) * i);
3565 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3566 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3567 else
3568 vsmap_size = CLMAP_ENTRIES;
3569 for (j = 0; j < vsmap_size; j++) {
3570 if (VSM_ISCLR(vsmap[j]) ||
3571 VSM_ISERR(vsmap[j]) ||
3572 (VSM_PS(vsmap[j]) != segment))
3573 continue;
3574 if(vs_cluster_transfer(vs,
3575 (vm_page_size * (j << vs->vs_clshift))
3576 + clmap_off,
3577 vm_page_size << vs->vs_clshift,
1c79356b 3578 upl)
1c79356b
A
3579 != KERN_SUCCESS) {
3580 VS_LOCK(vs);
3581 vs->vs_xfer_pending = FALSE;
3582 VS_UNLOCK(vs);
3583 vs_finish_write(vs);
3584 return KERN_FAILURE;
3585 }
3586 /* allow other readers/writers during transfer*/
3587 VS_LOCK(vs);
3588 vs->vs_xfer_pending = FALSE;
3589 VS_UNLOCK(vs);
3590 vs_finish_write(vs);
3591 VS_LOCK(vs);
3592 vs->vs_xfer_pending = TRUE;
1c79356b
A
3593 vs_wait_for_sync_writers(vs);
3594 vs_start_write(vs);
3595 vs_wait_for_readers(vs);
0b4e3aa0 3596 VS_UNLOCK(vs);
1c79356b
A
3597 if (!(vs->vs_indirect)) {
3598 goto vs_changed;
3599 }
3600 }
3601 }
3602 } else {
3603 vsmap = vs->vs_dmap;
3604 if (vsmap == NULL) {
3605 VS_LOCK(vs);
3606 vs->vs_xfer_pending = FALSE;
3607 VS_UNLOCK(vs);
3608 vs_finish_write(vs);
3609 return KERN_SUCCESS;
3610 }
3611 /* loop on clusters in the direct map */
3612 for (j = 0; j < vs->vs_size; j++) {
3613 if (VSM_ISCLR(vsmap[j]) ||
3614 VSM_ISERR(vsmap[j]) ||
3615 (VSM_PS(vsmap[j]) != segment))
3616 continue;
3617 if(vs_cluster_transfer(vs,
3618 vm_page_size * (j << vs->vs_clshift),
3619 vm_page_size << vs->vs_clshift,
1c79356b 3620 upl) != KERN_SUCCESS) {
1c79356b
A
3621 VS_LOCK(vs);
3622 vs->vs_xfer_pending = FALSE;
3623 VS_UNLOCK(vs);
3624 vs_finish_write(vs);
3625 return KERN_FAILURE;
3626 }
3627 /* allow other readers/writers during transfer*/
3628 VS_LOCK(vs);
3629 vs->vs_xfer_pending = FALSE;
3630 VS_UNLOCK(vs);
3631 vs_finish_write(vs);
3632 VS_LOCK(vs);
3633 vs->vs_xfer_pending = TRUE;
1c79356b
A
3634 vs_wait_for_sync_writers(vs);
3635 vs_start_write(vs);
3636 vs_wait_for_readers(vs);
b0d623f7 3637 VS_UNLOCK(vs);
1c79356b
A
3638 if (vs->vs_indirect) {
3639 goto vs_changed;
3640 }
3641 }
3642 }
3643
3644 VS_LOCK(vs);
3645 vs->vs_xfer_pending = FALSE;
3646 VS_UNLOCK(vs);
3647 vs_finish_write(vs);
3648 return KERN_SUCCESS;
3649}
3650
3651
3652
3653vs_map_t
3654vs_get_map_entry(
3655 vstruct_t vs,
b0d623f7 3656 dp_offset_t offset)
1c79356b
A
3657{
3658 struct vs_map *vsmap;
b0d623f7 3659 dp_offset_t cluster;
1c79356b 3660
55e303ae 3661 cluster = atop_32(offset) >> vs->vs_clshift;
1c79356b
A
3662 if (vs->vs_indirect) {
3663 long ind_block = cluster/CLMAP_ENTRIES;
3664
3665 /* Is the indirect block allocated? */
3666 vsmap = vs->vs_imap[ind_block];
3667 if(vsmap == (vs_map_t) NULL)
3668 return vsmap;
3669 } else
3670 vsmap = vs->vs_dmap;
3671 vsmap += cluster%CLMAP_ENTRIES;
3672 return vsmap;
3673}
3674
3675kern_return_t
3676vs_cluster_transfer(
3677 vstruct_t vs,
b0d623f7
A
3678 dp_offset_t offset,
3679 dp_size_t cnt,
1c79356b 3680 upl_t upl)
1c79356b 3681{
b0d623f7 3682 dp_offset_t actual_offset;
1c79356b
A
3683 paging_segment_t ps;
3684 struct clmap clmap;
3685 kern_return_t error = KERN_SUCCESS;
91447636
A
3686 unsigned int size, size_wanted;
3687 int i;
0c530ab8 3688 unsigned int residual = 0;
91447636
A
3689 unsigned int unavail_size;
3690// default_pager_thread_t *dpt;
3691// boolean_t dealloc;
3692 struct vs_map *vsmap_ptr = NULL;
1c79356b
A
3693 struct vs_map read_vsmap;
3694 struct vs_map original_read_vsmap;
3695 struct vs_map write_vsmap;
91447636
A
3696// upl_t sync_upl;
3697// vm_offset_t ioaddr;
1c79356b 3698
1c79356b
A
3699 /* vs_cluster_transfer reads in the pages of a cluster and
3700 * then writes these pages back to new backing store. The
3701 * segment the pages are being read from is assumed to have
3702 * been taken off-line and is no longer considered for new
3703 * space requests.
3704 */
3705
3706 /*
3707 * This loop will be executed once per cluster referenced.
3708 * Typically this means once, since it's unlikely that the
3709 * VM system will ask for anything spanning cluster boundaries.
3710 *
3711 * If there are holes in a cluster (in a paging segment), we stop
3712 * reading at the hole, then loop again, hoping to
3713 * find valid pages later in the cluster. This continues until
3714 * the entire range has been examined, and read, if present. The
3715 * pages are written as they are read. If a failure occurs after
3716 * some pages are written the unmap call at the bottom of the loop
3717 * recovers the backing store and the old backing store remains
3718 * in effect.
3719 */
3720
1c79356b
A
3721 VSM_CLR(write_vsmap);
3722 VSM_CLR(original_read_vsmap);
3723 /* grab the actual object's pages to sync with I/O */
3724 while (cnt && (error == KERN_SUCCESS)) {
3725 vsmap_ptr = vs_get_map_entry(vs, offset);
3726 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3727
b0d623f7 3728 if (actual_offset == (dp_offset_t) -1) {
1c79356b
A
3729
3730 /*
3731 * Nothing left to write in this cluster at least
3732 * set write cluster information for any previous
3733 * write, clear for next cluster, if there is one
3734 */
3735 unsigned int local_size, clmask, clsize;
3736
3737 clsize = vm_page_size << vs->vs_clshift;
3738 clmask = clsize - 1;
3739 local_size = clsize - (offset & clmask);
3740 ASSERT(local_size);
3741 local_size = MIN(local_size, cnt);
3742
3743 /* This cluster has no data in it beyond what may */
3744 /* have been found on a previous iteration through */
3745 /* the loop "write_vsmap" */
3746 *vsmap_ptr = write_vsmap;
3747 VSM_CLR(write_vsmap);
3748 VSM_CLR(original_read_vsmap);
3749
3750 cnt -= local_size;
3751 offset += local_size;
3752 continue;
3753 }
3754
3755 /*
3756 * Count up contiguous available or unavailable
3757 * pages.
3758 */
3759 ps = CLMAP_PS(clmap);
3760 ASSERT(ps);
3761 size = 0;
3762 unavail_size = 0;
3763 for (i = 0;
3764 (size < cnt) && (unavail_size < cnt) &&
3765 (i < CLMAP_NPGS(clmap)); i++) {
3766 if (CLMAP_ISSET(clmap, i)) {
3767 if (unavail_size != 0)
3768 break;
3769 size += vm_page_size;
3770 BS_STAT(ps->ps_bs,
3771 ps->ps_bs->bs_pages_in++);
3772 } else {
3773 if (size != 0)
3774 break;
3775 unavail_size += vm_page_size;
3776 }
3777 }
3778
3779 if (size == 0) {
3780 ASSERT(unavail_size);
593a1d5f 3781 ps_clunmap(vs, offset, unavail_size);
1c79356b
A
3782 cnt -= unavail_size;
3783 offset += unavail_size;
3784 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3785 == 0) {
3786 /* There is no more to transfer in this
3787 cluster
3788 */
3789 *vsmap_ptr = write_vsmap;
3790 VSM_CLR(write_vsmap);
3791 VSM_CLR(original_read_vsmap);
3792 }
3793 continue;
3794 }
3795
3796 if(VSM_ISCLR(original_read_vsmap))
3797 original_read_vsmap = *vsmap_ptr;
3798
3799 if(ps->ps_segtype == PS_PARTITION) {
0c530ab8
A
3800 panic("swap partition not supported\n");
3801 /*NOTREACHED*/
3802 error = KERN_FAILURE;
3803 residual = size;
1c79356b 3804/*
9bccf70c 3805 NEED TO ISSUE WITH SYNC & NO COMMIT
1c79356b
A
3806 error = ps_read_device(ps, actual_offset, &buffer,
3807 size, &residual, flags);
3808*/
3809 } else {
9bccf70c 3810 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
91447636 3811 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
1c79356b 3812 size, &residual,
9bccf70c 3813 (UPL_IOSYNC | UPL_NOCOMMIT));
1c79356b
A
3814 }
3815
3816 read_vsmap = *vsmap_ptr;
3817
3818
3819 /*
3820 * Adjust counts and put data in new BS. Optimize for the
3821 * common case, i.e. no error and/or partial data.
3822 * If there was an error, then we need to error the entire
3823 * range, even if some data was successfully read.
3824 *
3825 */
3826 if ((error == KERN_SUCCESS) && (residual == 0)) {
0b4e3aa0 3827
1c79356b
A
3828 /*
3829 * Got everything we asked for, supply the data to
3830 * the new BS. Note that as a side effect of supplying
3831 * the data, the buffer holding the supplied data is
3832 * deallocated from the pager's address space unless
3833 * the write is unsuccessful.
3834 */
3835
3836 /* note buffer will be cleaned up in all cases by */
3837 /* internal_cluster_write or if an error on write */
3838 /* the vm_map_copy_page_discard call */
3839 *vsmap_ptr = write_vsmap;
3840
1c79356b
A
3841 if(vs_cluster_write(vs, upl, offset,
3842 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
1c79356b
A
3843 error = KERN_FAILURE;
3844 if(!(VSM_ISCLR(*vsmap_ptr))) {
3845 /* unmap the new backing store object */
3846 ps_clunmap(vs, offset, size);
3847 }
3848 /* original vsmap */
3849 *vsmap_ptr = original_read_vsmap;
3850 VSM_CLR(write_vsmap);
3851 } else {
3852 if((offset + size) &
3853 ((vm_page_size << vs->vs_clshift)
3854 - 1)) {
3855 /* There is more to transfer in this
3856 cluster
3857 */
3858 write_vsmap = *vsmap_ptr;
3859 *vsmap_ptr = read_vsmap;
593a1d5f 3860 ps_clunmap(vs, offset, size);
1c79356b
A
3861 } else {
3862 /* discard the old backing object */
3863 write_vsmap = *vsmap_ptr;
3864 *vsmap_ptr = read_vsmap;
3865 ps_clunmap(vs, offset, size);
3866 *vsmap_ptr = write_vsmap;
3867 VSM_CLR(write_vsmap);
3868 VSM_CLR(original_read_vsmap);
3869 }
3870 }
3871 } else {
3872 size_wanted = size;
3873 if (error == KERN_SUCCESS) {
3874 if (residual == size) {
3875 /*
3876 * If a read operation returns no error
3877 * and no data moved, we turn it into
3878 * an error, assuming we're reading at
3879 * or beyond EOF.
3880 * Fall through and error the entire
3881 * range.
3882 */
3883 error = KERN_FAILURE;
3884 *vsmap_ptr = write_vsmap;
3885 if(!(VSM_ISCLR(*vsmap_ptr))) {
3886 /* unmap the new backing store object */
3887 ps_clunmap(vs, offset, size);
3888 }
3889 *vsmap_ptr = original_read_vsmap;
3890 VSM_CLR(write_vsmap);
3891 continue;
3892 } else {
3893 /*
3894 * Otherwise, we have partial read.
3895 * This is also considered an error
3896 * for the purposes of cluster transfer
3897 */
3898 error = KERN_FAILURE;
3899 *vsmap_ptr = write_vsmap;
3900 if(!(VSM_ISCLR(*vsmap_ptr))) {
3901 /* unmap the new backing store object */
3902 ps_clunmap(vs, offset, size);
3903 }
3904 *vsmap_ptr = original_read_vsmap;
3905 VSM_CLR(write_vsmap);
3906 continue;
3907 }
3908 }
3909
3910 }
3911 cnt -= size;
3912 offset += size;
3913
3914 } /* END while (cnt && (error == 0)) */
3915 if(!VSM_ISCLR(write_vsmap))
3916 *vsmap_ptr = write_vsmap;
3917
1c79356b
A
3918 return error;
3919}
3920
3921kern_return_t
91447636
A
3922default_pager_add_file(
3923 MACH_PORT_FACE backing_store,
3924 vnode_ptr_t vp,
1c79356b 3925 int record_size,
91447636 3926 vm_size_t size)
1c79356b
A
3927{
3928 backing_store_t bs;
3929 paging_segment_t ps;
3930 int i;
91447636 3931 unsigned int j;
1c79356b 3932 int error;
1c79356b
A
3933
3934 if ((bs = backing_store_lookup(backing_store))
3935 == BACKING_STORE_NULL)
3936 return KERN_INVALID_ARGUMENT;
3937
3938 PSL_LOCK();
3939 for (i = 0; i <= paging_segment_max; i++) {
3940 ps = paging_segments[i];
3941 if (ps == PAGING_SEGMENT_NULL)
3942 continue;
3943 if (ps->ps_segtype != PS_FILE)
3944 continue;
3945
3946 /*
3947 * Check for overlap on same device.
3948 */
3949 if (ps->ps_vnode == (struct vnode *)vp) {
3950 PSL_UNLOCK();
3951 BS_UNLOCK(bs);
3952 return KERN_INVALID_ARGUMENT;
3953 }
3954 }
3955 PSL_UNLOCK();
3956
3957 /*
3958 * Set up the paging segment
3959 */
3960 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3961 if (ps == PAGING_SEGMENT_NULL) {
3962 BS_UNLOCK(bs);
3963 return KERN_RESOURCE_SHORTAGE;
3964 }
3965
3966 ps->ps_segtype = PS_FILE;
3967 ps->ps_vnode = (struct vnode *)vp;
3968 ps->ps_offset = 0;
3969 ps->ps_record_shift = local_log2(vm_page_size / record_size);
b0d623f7
A
3970 assert((dp_size_t) size == size);
3971 ps->ps_recnum = (dp_size_t) size;
3972 ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;
1c79356b
A
3973
3974 ps->ps_pgcount = ps->ps_pgnum;
3975 ps->ps_clshift = local_log2(bs->bs_clsize);
3976 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
b0d623f7 3977 ps->ps_special_clusters = 0;
1c79356b
A
3978 ps->ps_hint = 0;
3979
3980 PS_LOCK_INIT(ps);
3981 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3982 if (!ps->ps_bmap) {
91447636 3983 kfree(ps, sizeof *ps);
1c79356b
A
3984 BS_UNLOCK(bs);
3985 return KERN_RESOURCE_SHORTAGE;
3986 }
91447636
A
3987 for (j = 0; j < ps->ps_ncls; j++) {
3988 clrbit(ps->ps_bmap, j);
1c79356b
A
3989 }
3990
b0d623f7
A
3991 if(paging_segment_count == 0) {
3992 ps->ps_state = PS_EMERGENCY_SEGMENT;
3993 if(use_emergency_swap_file_first) {
3994 ps->ps_state |= PS_CAN_USE;
3995 }
3996 emergency_segment_backing_store = backing_store;
3997 } else {
3998 ps->ps_state = PS_CAN_USE;
3999 }
4000
1c79356b
A
4001 ps->ps_bs = bs;
4002
4003 if ((error = ps_enter(ps)) != 0) {
91447636
A
4004 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
4005 kfree(ps, sizeof *ps);
1c79356b
A
4006 BS_UNLOCK(bs);
4007 return KERN_RESOURCE_SHORTAGE;
4008 }
4009
4010 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
4011 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
4012 PSL_LOCK();
b0d623f7
A
4013 if(IS_PS_OK_TO_USE(ps)) {
4014 dp_pages_free += ps->ps_pgcount;
4015 } else {
4016 dp_pages_reserve += ps->ps_pgcount;
4017 }
1c79356b
A
4018 PSL_UNLOCK();
4019
4020 BS_UNLOCK(bs);
4021
4022 bs_more_space(ps->ps_clcount);
4023
b0d623f7
A
4024 /*
4025 * If the paging segment being activated is not the emergency
4026 * segment and we notice that the emergency segment is being
4027 * used then we help recover it. If all goes well, the
4028 * emergency segment will be back to its original state of
4029 * online but not activated (till it's needed the next time).
4030 */
4031 ps = paging_segments[EMERGENCY_PSEG_INDEX];
4032 if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
4033 if(default_pager_backing_store_delete(emergency_segment_backing_store)) {
4034 dprintf(("Failed to recover emergency paging segment\n"));
4035 } else {
4036 dprintf(("Recovered emergency paging segment\n"));
4037 }
4038 }
4039
91447636
A
4040 DP_DEBUG(DEBUG_BS_INTERNAL,
4041 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
b0d623f7 4042 device, offset, (dp_size_t) size, record_size,
91447636 4043 ps->ps_record_shift, ps->ps_pgnum));
1c79356b
A
4044
4045 return KERN_SUCCESS;
4046}
4047
4048
4049
1c79356b
A
4050kern_return_t
4051ps_read_file(
4052 paging_segment_t ps,
4053 upl_t upl,
91447636 4054 upl_offset_t upl_offset,
b0d623f7 4055 dp_offset_t offset,
91447636 4056 upl_size_t size,
1c79356b
A
4057 unsigned int *residualp,
4058 int flags)
4059{
4060 vm_object_offset_t f_offset;
4061 int error = 0;
4062 int result;
1c79356b 4063
91447636 4064 assert(dp_encryption_inited);
1c79356b 4065
55e303ae 4066 clustered_reads[atop_32(size)]++;
1c79356b
A
4067
4068 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4069
2d21ac55
A
4070 /*
4071 * for transfer case we need to pass uploffset and flags
4072 */
b0d623f7
A
4073 assert((upl_size_t) size == size);
4074 error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);
1c79356b
A
4075
4076 /* The vnode_pagein semantic is somewhat at odds with the existing */
4077 /* device_read semantic. Partial reads are not experienced at this */
4078 /* level. It is up to the bit map code and cluster read code to */
4079 /* check that requested data locations are actually backed, and the */
4080 /* pagein code to either read all of the requested data or return an */
4081 /* error. */
4082
4083 if (error)
4084 result = KERN_FAILURE;
4085 else {
4086 *residualp = 0;
4087 result = KERN_SUCCESS;
4088 }
4089 return result;
1c79356b
A
4090}
4091
4092kern_return_t
4093ps_write_file(
4094 paging_segment_t ps,
4095 upl_t upl,
91447636 4096 upl_offset_t upl_offset,
b0d623f7 4097 dp_offset_t offset,
1c79356b
A
4098 unsigned int size,
4099 int flags)
4100{
4101 vm_object_offset_t f_offset;
4102 kern_return_t result;
1c79356b 4103
91447636 4104 assert(dp_encryption_inited);
1c79356b 4105
55e303ae 4106 clustered_writes[atop_32(size)]++;
1c79356b
A
4107 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4108
91447636
A
4109 if (flags & UPL_PAGING_ENCRYPTED) {
4110 /*
4111 * ENCRYPTED SWAP:
4112 * encrypt all the pages that we're going
4113 * to pageout.
4114 */
4115 upl_encrypt(upl, upl_offset, size);
4116 }
b0d623f7
A
4117 assert((upl_size_t) size == size);
4118 if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
1c79356b
A
4119 result = KERN_FAILURE;
4120 else
4121 result = KERN_SUCCESS;
4122
4123 return result;
4124}
4125
4126kern_return_t
91447636 4127default_pager_triggers( __unused MACH_PORT_FACE default_pager,
1c79356b
A
4128 int hi_wat,
4129 int lo_wat,
4130 int flags,
4131 MACH_PORT_FACE trigger_port)
4132{
0b4e3aa0
A
4133 MACH_PORT_FACE release;
4134 kern_return_t kr;
b0d623f7
A
4135 clock_sec_t now;
4136 clock_nsec_t nanoseconds_dummy;
4137 static clock_sec_t error_notify = 0;
1c79356b 4138
0b4e3aa0 4139 PSL_LOCK();
91447636
A
4140 if (flags == SWAP_ENCRYPT_ON) {
4141 /* ENCRYPTED SWAP: turn encryption on */
4142 release = trigger_port;
4143 if (!dp_encryption_inited) {
4144 dp_encryption_inited = TRUE;
4145 dp_encryption = TRUE;
4146 kr = KERN_SUCCESS;
4147 } else {
4148 kr = KERN_FAILURE;
4149 }
4150 } else if (flags == SWAP_ENCRYPT_OFF) {
4151 /* ENCRYPTED SWAP: turn encryption off */
4152 release = trigger_port;
4153 if (!dp_encryption_inited) {
4154 dp_encryption_inited = TRUE;
4155 dp_encryption = FALSE;
4156 kr = KERN_SUCCESS;
4157 } else {
4158 kr = KERN_FAILURE;
4159 }
4160 } else if (flags == HI_WAT_ALERT) {
0b4e3aa0 4161 release = min_pages_trigger_port;
1c79356b
A
4162 min_pages_trigger_port = trigger_port;
4163 minimum_pages_remaining = hi_wat/vm_page_size;
4164 bs_low = FALSE;
0b4e3aa0
A
4165 kr = KERN_SUCCESS;
4166 } else if (flags == LO_WAT_ALERT) {
4167 release = max_pages_trigger_port;
1c79356b
A
4168 max_pages_trigger_port = trigger_port;
4169 maximum_pages_free = lo_wat/vm_page_size;
0b4e3aa0 4170 kr = KERN_SUCCESS;
b0d623f7
A
4171 } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
4172 use_emergency_swap_file_first = TRUE;
4173 release = trigger_port;
4174 kr = KERN_SUCCESS;
4175 } else if (flags == SWAP_FILE_CREATION_ERROR) {
4176 release = trigger_port;
4177 kr = KERN_SUCCESS;
4178 if( paging_segment_count == 1) {
4179 use_emergency_swap_file_first = TRUE;
4180 }
4181 no_paging_space_action();
4182 clock_get_system_nanotime(&now, &nanoseconds_dummy);
4183 if (now > error_notify + 5) {
4184 dprintf(("Swap File Error.\n"));
4185 error_notify = now;
4186 }
0b4e3aa0
A
4187 } else {
4188 release = trigger_port;
4189 kr = KERN_INVALID_ARGUMENT;
1c79356b 4190 }
0b4e3aa0
A
4191 PSL_UNLOCK();
4192
4193 if (IP_VALID(release))
4194 ipc_port_release_send(release);
4195
4196 return kr;
1c79356b 4197}
55e303ae
A
4198
4199/*
4200 * Monitor the amount of available backing store vs. the amount of
4201 * required backing store, notify a listener (if present) when
4202 * backing store may safely be removed.
4203 *
4204 * We attempt to avoid the situation where backing store is
4205 * discarded en masse, as this can lead to thrashing as the
4206 * backing store is compacted.
4207 */
4208
4209#define PF_INTERVAL 3 /* time between free level checks */
4210#define PF_LATENCY 10 /* number of intervals before release */
4211
4212static int dp_pages_free_low_count = 0;
91447636 4213thread_call_t default_pager_backing_store_monitor_callout;
55e303ae
A
4214
4215void
91447636
A
4216default_pager_backing_store_monitor(__unused thread_call_param_t p1,
4217 __unused thread_call_param_t p2)
55e303ae 4218{
91447636 4219// unsigned long long average;
55e303ae
A
4220 ipc_port_t trigger;
4221 uint64_t deadline;
4222
4223 /*
4224 * We determine whether it will be safe to release some
4225 * backing store by watching the free page level. If
4226 * it remains below the maximum_pages_free threshold for
4227 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
4228 * then we deem it safe.
4229 *
4230 * Note that this establishes a maximum rate at which backing
4231 * store will be released, as each notification (currently)
4232 * only results in a single backing store object being
4233 * released.
4234 */
4235 if (dp_pages_free > maximum_pages_free) {
4236 dp_pages_free_low_count++;
4237 } else {
4238 dp_pages_free_low_count = 0;
4239 }
4240
4241 /* decide whether to send notification */
4242 trigger = IP_NULL;
4243 if (max_pages_trigger_port &&
4244 (backing_store_release_trigger_disable == 0) &&
4245 (dp_pages_free_low_count > PF_LATENCY)) {
4246 trigger = max_pages_trigger_port;
4247 max_pages_trigger_port = NULL;
4248 }
4249
4250 /* send notification */
4251 if (trigger != IP_NULL) {
4252 VSL_LOCK();
4253 if(backing_store_release_trigger_disable != 0) {
4254 assert_wait((event_t)
4255 &backing_store_release_trigger_disable,
4256 THREAD_UNINT);
4257 VSL_UNLOCK();
4258 thread_block(THREAD_CONTINUE_NULL);
4259 } else {
4260 VSL_UNLOCK();
4261 }
4262 default_pager_space_alert(trigger, LO_WAT_ALERT);
4263 ipc_port_release_send(trigger);
4264 dp_pages_free_low_count = 0;
4265 }
4266
4267 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
91447636 4268 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
55e303ae 4269}