]> git.saurik.com Git - apple/xnu.git/blame - osfmk/default_pager/dp_backing_store.c
xnu-1699.32.7.tar.gz
[apple/xnu.git] / osfmk / default_pager / dp_backing_store.c
CommitLineData
1c79356b 1/*
b0d623f7 2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
1c79356b 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57/*
58 * Default Pager.
59 * Paging File Management.
60 */
61
91447636 62#include <mach/host_priv.h>
0b4e3aa0 63#include <mach/memory_object_control.h>
1c79356b 64#include <mach/memory_object_server.h>
91447636
A
65#include <mach/upl.h>
66#include <default_pager/default_pager_internal.h>
1c79356b 67#include <default_pager/default_pager_alerts.h>
91447636
A
68#include <default_pager/default_pager_object_server.h>
69
70#include <ipc/ipc_types.h>
1c79356b
A
71#include <ipc/ipc_port.h>
72#include <ipc/ipc_space.h>
91447636
A
73
74#include <kern/kern_types.h>
75#include <kern/host.h>
1c79356b
A
76#include <kern/queue.h>
77#include <kern/counters.h>
78#include <kern/sched_prim.h>
91447636 79
1c79356b
A
80#include <vm/vm_kern.h>
81#include <vm/vm_pageout.h>
1c79356b 82#include <vm/vm_map.h>
91447636
A
83#include <vm/vm_object.h>
84#include <vm/vm_protos.h>
85
2d21ac55 86
b0d623f7 87/* todo - need large internal object support */
1c79356b 88
0b4e3aa0
A
89/*
90 * ALLOC_STRIDE... the maximum number of bytes allocated from
91 * a swap file before moving on to the next swap file... if
92 * all swap files reside on a single disk, this value should
93 * be very large (this is the default assumption)... if the
94 * swap files are spread across multiple disks, than this value
95 * should be small (128 * 1024)...
96 *
97 * This should be determined dynamically in the future
98 */
1c79356b 99
0b4e3aa0 100#define ALLOC_STRIDE (1024 * 1024 * 1024)
1c79356b
A
101int physical_transfer_cluster_count = 0;
102
9bccf70c 103#define VM_SUPER_CLUSTER 0x40000
0b4c1975 104#define VM_SUPER_PAGES (VM_SUPER_CLUSTER / PAGE_SIZE)
1c79356b
A
105
106/*
107 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
108 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
109 */
0b4c1975
A
110#define VSTRUCT_MIN_CLSHIFT 0
111
1c79356b 112#define VSTRUCT_DEF_CLSHIFT 2
1c79356b
A
113int default_pager_clsize = 0;
114
0b4c1975
A
115int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
116
1c79356b 117/* statistics */
0b4e3aa0
A
118unsigned int clustered_writes[VM_SUPER_PAGES+1];
119unsigned int clustered_reads[VM_SUPER_PAGES+1];
1c79356b
A
120
121/*
122 * Globals used for asynchronous paging operations:
123 * vs_async_list: head of list of to-be-completed I/O ops
124 * async_num_queued: number of pages completed, but not yet
125 * processed by async thread.
126 * async_requests_out: number of pages of requests not completed.
127 */
128
129#if 0
130struct vs_async *vs_async_list;
131int async_num_queued;
132int async_requests_out;
133#endif
134
135
136#define VS_ASYNC_REUSE 1
137struct vs_async *vs_async_free_list;
138
b0d623f7 139lck_mtx_t default_pager_async_lock; /* Protects globals above */
1c79356b
A
140
141
142int vs_alloc_async_failed = 0; /* statistics */
143int vs_alloc_async_count = 0; /* statistics */
144struct vs_async *vs_alloc_async(void); /* forward */
145void vs_free_async(struct vs_async *vsa); /* forward */
146
147
148#define VS_ALLOC_ASYNC() vs_alloc_async()
149#define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
150
b0d623f7
A
151#define VS_ASYNC_LOCK() lck_mtx_lock(&default_pager_async_lock)
152#define VS_ASYNC_UNLOCK() lck_mtx_unlock(&default_pager_async_lock)
153#define VS_ASYNC_LOCK_INIT() lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
1c79356b
A
154#define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
155/*
156 * Paging Space Hysteresis triggers and the target notification port
157 *
158 */
b0d623f7
A
159unsigned int dp_pages_free_drift_count = 0;
160unsigned int dp_pages_free_drifted_max = 0;
1c79356b
A
161unsigned int minimum_pages_remaining = 0;
162unsigned int maximum_pages_free = 0;
163ipc_port_t min_pages_trigger_port = NULL;
164ipc_port_t max_pages_trigger_port = NULL;
165
6d2010ae
A
166#if CONFIG_FREEZE
167boolean_t use_emergency_swap_file_first = TRUE;
168#else
b0d623f7 169boolean_t use_emergency_swap_file_first = FALSE;
6d2010ae 170#endif
1c79356b 171boolean_t bs_low = FALSE;
0b4e3aa0 172int backing_store_release_trigger_disable = 0;
b0d623f7 173boolean_t backing_store_stop_compaction = FALSE;
6d2010ae 174boolean_t backing_store_abort_compaction = FALSE;
91447636
A
175
176/* Have we decided if swap needs to be encrypted yet ? */
177boolean_t dp_encryption_inited = FALSE;
178/* Should we encrypt swap ? */
179boolean_t dp_encryption = FALSE;
1c79356b 180
0b4c1975
A
181boolean_t dp_isssd = FALSE;
182
1c79356b
A
183/*
184 * Object sizes are rounded up to the next power of 2,
185 * unless they are bigger than a given maximum size.
186 */
187vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
188
189/*
190 * List of all backing store and segments.
191 */
b0d623f7 192MACH_PORT_FACE emergency_segment_backing_store;
1c79356b
A
193struct backing_store_list_head backing_store_list;
194paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
b0d623f7 195lck_mtx_t paging_segments_lock;
1c79356b
A
196int paging_segment_max = 0;
197int paging_segment_count = 0;
198int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
199
200
201/*
202 * Total pages free in system
203 * This differs from clusters committed/avail which is a measure of the
204 * over commitment of paging segments to backing store. An idea which is
205 * likely to be deprecated.
206 */
207unsigned int dp_pages_free = 0;
b0d623f7 208unsigned int dp_pages_reserve = 0;
1c79356b
A
209unsigned int cluster_transfer_minimum = 100;
210
6d2010ae
A
211/*
212 * Trim state
213 */
214struct ps_vnode_trim_data {
215 struct vnode *vp;
216 dp_offset_t offset;
217 dp_size_t length;
218};
219
91447636 220/* forward declarations */
b0d623f7
A
221kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int); /* forward */
222kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int); /* forward */
91447636
A
223default_pager_thread_t *get_read_buffer( void );
224kern_return_t ps_vstruct_transfer_from_segment(
225 vstruct_t vs,
226 paging_segment_t segment,
227 upl_t upl);
b0d623f7
A
228kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
229kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
91447636
A
230kern_return_t vs_cluster_transfer(
231 vstruct_t vs,
b0d623f7
A
232 dp_offset_t offset,
233 dp_size_t cnt,
91447636
A
234 upl_t upl);
235vs_map_t vs_get_map_entry(
236 vstruct_t vs,
b0d623f7 237 dp_offset_t offset);
0b4e3aa0 238
b0d623f7
A
239kern_return_t
240default_pager_backing_store_delete_internal( MACH_PORT_FACE );
1c79356b 241
6d2010ae
A
242static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data);
243static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data);
244static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length);
245
1c79356b 246default_pager_thread_t *
91447636 247get_read_buffer( void )
1c79356b
A
248{
249 int i;
250
251 DPT_LOCK(dpt_lock);
252 while(TRUE) {
253 for (i=0; i<default_pager_internal_count; i++) {
254 if(dpt_array[i]->checked_out == FALSE) {
255 dpt_array[i]->checked_out = TRUE;
256 DPT_UNLOCK(dpt_lock);
257 return dpt_array[i];
258 }
259 }
9bccf70c 260 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
1c79356b
A
261 }
262}
263
264void
265bs_initialize(void)
266{
267 int i;
268
269 /*
270 * List of all backing store.
271 */
272 BSL_LOCK_INIT();
273 queue_init(&backing_store_list.bsl_queue);
274 PSL_LOCK_INIT();
275
276 VS_ASYNC_LOCK_INIT();
277#if VS_ASYNC_REUSE
278 vs_async_free_list = NULL;
279#endif /* VS_ASYNC_REUSE */
280
0b4e3aa0 281 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
1c79356b
A
282 clustered_writes[i] = 0;
283 clustered_reads[i] = 0;
284 }
285
286}
287
288/*
289 * When things do not quite workout...
290 */
291void bs_no_paging_space(boolean_t); /* forward */
292
293void
294bs_no_paging_space(
295 boolean_t out_of_memory)
296{
1c79356b
A
297
298 if (out_of_memory)
299 dprintf(("*** OUT OF MEMORY ***\n"));
300 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
301}
302
303void bs_more_space(int); /* forward */
304void bs_commit(int); /* forward */
305
306boolean_t user_warned = FALSE;
307unsigned int clusters_committed = 0;
308unsigned int clusters_available = 0;
309unsigned int clusters_committed_peak = 0;
310
311void
312bs_more_space(
313 int nclusters)
314{
315 BSL_LOCK();
316 /*
317 * Account for new paging space.
318 */
319 clusters_available += nclusters;
320
321 if (clusters_available >= clusters_committed) {
322 if (verbose && user_warned) {
323 printf("%s%s - %d excess clusters now.\n",
324 my_name,
325 "paging space is OK now",
326 clusters_available - clusters_committed);
327 user_warned = FALSE;
328 clusters_committed_peak = 0;
329 }
330 } else {
331 if (verbose && user_warned) {
332 printf("%s%s - still short of %d clusters.\n",
333 my_name,
334 "WARNING: paging space over-committed",
335 clusters_committed - clusters_available);
336 clusters_committed_peak -= nclusters;
337 }
338 }
339 BSL_UNLOCK();
340
341 return;
342}
343
344void
345bs_commit(
346 int nclusters)
347{
348 BSL_LOCK();
349 clusters_committed += nclusters;
350 if (clusters_committed > clusters_available) {
351 if (verbose && !user_warned) {
352 user_warned = TRUE;
353 printf("%s%s - short of %d clusters.\n",
354 my_name,
355 "WARNING: paging space over-committed",
356 clusters_committed - clusters_available);
357 }
358 if (clusters_committed > clusters_committed_peak) {
359 clusters_committed_peak = clusters_committed;
360 }
361 } else {
362 if (verbose && user_warned) {
363 printf("%s%s - was short of up to %d clusters.\n",
364 my_name,
365 "paging space is OK now",
366 clusters_committed_peak - clusters_available);
367 user_warned = FALSE;
368 clusters_committed_peak = 0;
369 }
370 }
371 BSL_UNLOCK();
372
373 return;
374}
375
376int default_pager_info_verbose = 1;
377
378void
379bs_global_info(
b0d623f7
A
380 uint64_t *totalp,
381 uint64_t *freep)
1c79356b 382{
b0d623f7 383 uint64_t pages_total, pages_free;
1c79356b
A
384 paging_segment_t ps;
385 int i;
1c79356b
A
386
387 PSL_LOCK();
388 pages_total = pages_free = 0;
389 for (i = 0; i <= paging_segment_max; i++) {
390 ps = paging_segments[i];
391 if (ps == PAGING_SEGMENT_NULL)
392 continue;
393
394 /*
395 * no need to lock: by the time this data
396 * gets back to any remote requestor it
397 * will be obsolete anyways
398 */
399 pages_total += ps->ps_pgnum;
400 pages_free += ps->ps_clcount << ps->ps_clshift;
91447636
A
401 DP_DEBUG(DEBUG_BS_INTERNAL,
402 ("segment #%d: %d total, %d free\n",
403 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
1c79356b
A
404 }
405 *totalp = pages_total;
406 *freep = pages_free;
407 if (verbose && user_warned && default_pager_info_verbose) {
408 if (clusters_available < clusters_committed) {
409 printf("%s %d clusters committed, %d available.\n",
410 my_name,
411 clusters_committed,
412 clusters_available);
413 }
414 }
415 PSL_UNLOCK();
416}
417
418backing_store_t backing_store_alloc(void); /* forward */
419
420backing_store_t
421backing_store_alloc(void)
422{
423 backing_store_t bs;
1c79356b
A
424
425 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
426 if (bs == BACKING_STORE_NULL)
427 panic("backing_store_alloc: no memory");
428
429 BS_LOCK_INIT(bs);
430 bs->bs_port = MACH_PORT_NULL;
431 bs->bs_priority = 0;
432 bs->bs_clsize = 0;
433 bs->bs_pages_total = 0;
434 bs->bs_pages_in = 0;
435 bs->bs_pages_in_fail = 0;
436 bs->bs_pages_out = 0;
437 bs->bs_pages_out_fail = 0;
438
439 return bs;
440}
441
442backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
443
444/* Even in both the component space and external versions of this pager, */
445/* backing_store_lookup will be called from tasks in the application space */
446backing_store_t
447backing_store_lookup(
448 MACH_PORT_FACE port)
449{
450 backing_store_t bs;
451
452/*
453 port is currently backed with a vs structure in the alias field
454 we could create an ISBS alias and a port_is_bs call but frankly
455 I see no reason for the test, the bs->port == port check below
456 will work properly on junk entries.
457
458 if ((port == MACH_PORT_NULL) || port_is_vs(port))
459*/
6d2010ae 460 if (port == MACH_PORT_NULL)
1c79356b
A
461 return BACKING_STORE_NULL;
462
463 BSL_LOCK();
464 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
465 bs_links) {
466 BS_LOCK(bs);
467 if (bs->bs_port == port) {
468 BSL_UNLOCK();
469 /* Success, return it locked. */
470 return bs;
471 }
472 BS_UNLOCK(bs);
473 }
474 BSL_UNLOCK();
475 return BACKING_STORE_NULL;
476}
477
478void backing_store_add(backing_store_t); /* forward */
479
480void
481backing_store_add(
91447636 482 __unused backing_store_t bs)
1c79356b 483{
91447636
A
484// MACH_PORT_FACE port = bs->bs_port;
485// MACH_PORT_FACE pset = default_pager_default_set;
1c79356b 486 kern_return_t kr = KERN_SUCCESS;
1c79356b
A
487
488 if (kr != KERN_SUCCESS)
489 panic("backing_store_add: add to set");
490
491}
492
493/*
494 * Set up default page shift, but only if not already
495 * set and argument is within range.
496 */
497boolean_t
498bs_set_default_clsize(unsigned int npages)
499{
500 switch(npages){
501 case 1:
502 case 2:
503 case 4:
504 case 8:
505 if (default_pager_clsize == 0) /* if not yet set */
506 vstruct_def_clshift = local_log2(npages);
507 return(TRUE);
508 }
509 return(FALSE);
510}
511
512int bs_get_global_clsize(int clsize); /* forward */
513
514int
515bs_get_global_clsize(
516 int clsize)
517{
518 int i;
0b4e3aa0 519 memory_object_default_t dmm;
1c79356b 520 kern_return_t kr;
1c79356b
A
521
522 /*
523 * Only allow setting of cluster size once. If called
524 * with no cluster size (default), we use the compiled-in default
525 * for the duration. The same cluster size is used for all
526 * paging segments.
527 */
528 if (default_pager_clsize == 0) {
1c79356b
A
529 /*
530 * Keep cluster size in bit shift because it's quicker
531 * arithmetic, and easier to keep at a power of 2.
532 */
533 if (clsize != NO_CLSIZE) {
534 for (i = 0; (1 << i) < clsize; i++);
535 if (i > MAX_CLUSTER_SHIFT)
536 i = MAX_CLUSTER_SHIFT;
537 vstruct_def_clshift = i;
538 }
539 default_pager_clsize = (1 << vstruct_def_clshift);
540
541 /*
542 * Let the user know the new (and definitive) cluster size.
543 */
544 if (verbose)
545 printf("%scluster size = %d page%s\n",
546 my_name, default_pager_clsize,
547 (default_pager_clsize == 1) ? "" : "s");
0b4e3aa0 548
1c79356b
A
549 /*
550 * Let the kernel know too, in case it hasn't used the
551 * default value provided in main() yet.
552 */
0b4e3aa0 553 dmm = default_pager_object;
1c79356b
A
554 clsize = default_pager_clsize * vm_page_size; /* in bytes */
555 kr = host_default_memory_manager(host_priv_self(),
0b4e3aa0 556 &dmm,
1c79356b 557 clsize);
0b4e3aa0
A
558 memory_object_default_deallocate(dmm);
559
1c79356b
A
560 if (kr != KERN_SUCCESS) {
561 panic("bs_get_global_cl_size:host_default_memory_manager");
562 }
0b4e3aa0 563 if (dmm != default_pager_object) {
1c79356b
A
564 panic("bs_get_global_cl_size:there is another default pager");
565 }
566 }
567 ASSERT(default_pager_clsize > 0 &&
568 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
569
570 return default_pager_clsize;
571}
572
573kern_return_t
574default_pager_backing_store_create(
0b4e3aa0
A
575 memory_object_default_t pager,
576 int priority,
577 int clsize, /* in bytes */
578 MACH_PORT_FACE *backing_store)
1c79356b
A
579{
580 backing_store_t bs;
581 MACH_PORT_FACE port;
91447636 582// kern_return_t kr;
1c79356b 583 struct vstruct_alias *alias_struct;
1c79356b 584
0b4e3aa0 585 if (pager != default_pager_object)
1c79356b
A
586 return KERN_INVALID_ARGUMENT;
587
588 bs = backing_store_alloc();
589 port = ipc_port_alloc_kernel();
590 ipc_port_make_send(port);
591 assert (port != IP_NULL);
592
91447636
A
593 DP_DEBUG(DEBUG_BS_EXTERNAL,
594 ("priority=%d clsize=%d bs_port=0x%x\n",
595 priority, clsize, (int) backing_store));
1c79356b
A
596
597 alias_struct = (struct vstruct_alias *)
598 kalloc(sizeof (struct vstruct_alias));
599 if(alias_struct != NULL) {
600 alias_struct->vs = (struct vstruct *)bs;
0c530ab8 601 alias_struct->name = &default_pager_ops;
b0d623f7 602 port->alias = (uintptr_t) alias_struct;
1c79356b
A
603 }
604 else {
605 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
91447636 606 kfree(bs, sizeof (struct backing_store));
1c79356b
A
607 return KERN_RESOURCE_SHORTAGE;
608 }
609
610 bs->bs_port = port;
611 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
612 priority = BS_MAXPRI;
613 else if (priority == BS_NOPRI)
614 priority = BS_MAXPRI;
615 else
616 priority = BS_MINPRI;
617 bs->bs_priority = priority;
618
55e303ae 619 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
1c79356b
A
620
621 BSL_LOCK();
622 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
623 bs_links);
624 BSL_UNLOCK();
625
626 backing_store_add(bs);
627
628 *backing_store = port;
629 return KERN_SUCCESS;
630}
631
632kern_return_t
633default_pager_backing_store_info(
634 MACH_PORT_FACE backing_store,
635 backing_store_flavor_t flavour,
636 backing_store_info_t info,
637 mach_msg_type_number_t *size)
638{
639 backing_store_t bs;
640 backing_store_basic_info_t basic;
641 int i;
642 paging_segment_t ps;
643
644 if (flavour != BACKING_STORE_BASIC_INFO ||
645 *size < BACKING_STORE_BASIC_INFO_COUNT)
646 return KERN_INVALID_ARGUMENT;
647
648 basic = (backing_store_basic_info_t)info;
649 *size = BACKING_STORE_BASIC_INFO_COUNT;
650
651 VSTATS_LOCK(&global_stats.gs_lock);
652 basic->pageout_calls = global_stats.gs_pageout_calls;
653 basic->pagein_calls = global_stats.gs_pagein_calls;
654 basic->pages_in = global_stats.gs_pages_in;
655 basic->pages_out = global_stats.gs_pages_out;
656 basic->pages_unavail = global_stats.gs_pages_unavail;
657 basic->pages_init = global_stats.gs_pages_init;
658 basic->pages_init_writes= global_stats.gs_pages_init_writes;
659 VSTATS_UNLOCK(&global_stats.gs_lock);
660
661 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
662 return KERN_INVALID_ARGUMENT;
663
664 basic->bs_pages_total = bs->bs_pages_total;
665 PSL_LOCK();
666 bs->bs_pages_free = 0;
667 for (i = 0; i <= paging_segment_max; i++) {
668 ps = paging_segments[i];
669 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
670 PS_LOCK(ps);
671 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
672 PS_UNLOCK(ps);
673 }
674 }
675 PSL_UNLOCK();
676 basic->bs_pages_free = bs->bs_pages_free;
677 basic->bs_pages_in = bs->bs_pages_in;
678 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
679 basic->bs_pages_out = bs->bs_pages_out;
680 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
681
682 basic->bs_priority = bs->bs_priority;
55e303ae 683 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
1c79356b
A
684
685 BS_UNLOCK(bs);
686
687 return KERN_SUCCESS;
688}
689
690int ps_delete(paging_segment_t); /* forward */
b0d623f7 691boolean_t current_thread_aborted(void);
1c79356b
A
692
693int
694ps_delete(
695 paging_segment_t ps)
696{
697 vstruct_t vs;
698 kern_return_t error = KERN_SUCCESS;
699 int vs_count;
700
701 VSL_LOCK(); /* get the lock on the list of vs's */
702
703 /* The lock relationship and sequence is farily complicated */
704 /* this code looks at a live list, locking and unlocking the list */
705 /* as it traverses it. It depends on the locking behavior of */
706 /* default_pager_no_senders. no_senders always locks the vstruct */
707 /* targeted for removal before locking the vstruct list. However */
708 /* it will remove that member of the list without locking its */
709 /* neighbors. We can be sure when we hold a lock on a vstruct */
710 /* it cannot be removed from the list but we must hold the list */
711 /* lock to be sure that its pointers to its neighbors are valid. */
712 /* Also, we can hold off destruction of a vstruct when the list */
713 /* lock and the vs locks are not being held by bumping the */
714 /* vs_async_pending count. */
715
0b4e3aa0
A
716
717 while(backing_store_release_trigger_disable != 0) {
9bccf70c 718 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
0b4e3aa0
A
719 }
720
1c79356b
A
721 /* we will choose instead to hold a send right */
722 vs_count = vstruct_list.vsl_count;
723 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
724 if(vs == (vstruct_t)&vstruct_list) {
725 VSL_UNLOCK();
726 return KERN_SUCCESS;
727 }
728 VS_LOCK(vs);
729 vs_async_wait(vs); /* wait for any pending async writes */
730 if ((vs_count != 0) && (vs != NULL))
731 vs->vs_async_pending += 1; /* hold parties calling */
732 /* vs_async_wait */
6d2010ae
A
733
734 if (bs_low == FALSE)
735 backing_store_abort_compaction = FALSE;
736
1c79356b
A
737 VS_UNLOCK(vs);
738 VSL_UNLOCK();
739 while((vs_count != 0) && (vs != NULL)) {
740 /* We take the count of AMO's before beginning the */
741 /* transfer of of the target segment. */
742 /* We are guaranteed that the target segment cannot get */
743 /* more users. We also know that queue entries are */
744 /* made at the back of the list. If some of the entries */
745 /* we would check disappear while we are traversing the */
746 /* list then we will either check new entries which */
747 /* do not have any backing store in the target segment */
748 /* or re-check old entries. This might not be optimal */
749 /* but it will always be correct. The alternative is to */
750 /* take a snapshot of the list. */
751 vstruct_t next_vs;
752
753 if(dp_pages_free < cluster_transfer_minimum)
754 error = KERN_FAILURE;
755 else {
756 vm_object_t transfer_object;
0c530ab8 757 unsigned int count;
1c79356b 758 upl_t upl;
6d2010ae 759 int upl_flags;
1c79356b 760
91447636 761 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
0b4e3aa0 762 count = 0;
6d2010ae
A
763 upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE |
764 UPL_SET_LITE | UPL_SET_INTERNAL);
765 if (dp_encryption) {
766 /* mark the pages as "encrypted" when they come in */
767 upl_flags |= UPL_ENCRYPT;
768 }
0b4e3aa0
A
769 error = vm_object_upl_request(transfer_object,
770 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
6d2010ae 771 &upl, NULL, &count, upl_flags);
2d21ac55 772
1c79356b 773 if(error == KERN_SUCCESS) {
1c79356b
A
774 error = ps_vstruct_transfer_from_segment(
775 vs, ps, upl);
91447636 776 upl_commit(upl, NULL, 0);
0b4e3aa0 777 upl_deallocate(upl);
1c79356b 778 } else {
1c79356b
A
779 error = KERN_FAILURE;
780 }
9bccf70c 781 vm_object_deallocate(transfer_object);
1c79356b 782 }
6d2010ae 783 if(error || current_thread_aborted()) {
1c79356b
A
784 VS_LOCK(vs);
785 vs->vs_async_pending -= 1; /* release vs_async_wait */
0b4e3aa0
A
786 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
787 vs->vs_waiting_async = FALSE;
1c79356b 788 VS_UNLOCK(vs);
0b4e3aa0 789 thread_wakeup(&vs->vs_async_pending);
1c79356b
A
790 } else {
791 VS_UNLOCK(vs);
792 }
793 return KERN_FAILURE;
794 }
795
796 VSL_LOCK();
0b4e3aa0
A
797
798 while(backing_store_release_trigger_disable != 0) {
9bccf70c
A
799 VSL_SLEEP(&backing_store_release_trigger_disable,
800 THREAD_UNINT);
0b4e3aa0
A
801 }
802
1c79356b
A
803 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
804 if((next_vs != (vstruct_t)&vstruct_list) &&
805 (vs != next_vs) && (vs_count != 1)) {
806 VS_LOCK(next_vs);
807 vs_async_wait(next_vs); /* wait for any */
808 /* pending async writes */
809 next_vs->vs_async_pending += 1; /* hold parties */
810 /* calling vs_async_wait */
811 VS_UNLOCK(next_vs);
812 }
813 VSL_UNLOCK();
814 VS_LOCK(vs);
815 vs->vs_async_pending -= 1;
0b4e3aa0
A
816 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
817 vs->vs_waiting_async = FALSE;
1c79356b 818 VS_UNLOCK(vs);
0b4e3aa0 819 thread_wakeup(&vs->vs_async_pending);
1c79356b
A
820 } else {
821 VS_UNLOCK(vs);
822 }
823 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
824 vs = NULL;
825 else
826 vs = next_vs;
827 vs_count--;
828 }
829 return KERN_SUCCESS;
830}
831
832
833kern_return_t
b0d623f7 834default_pager_backing_store_delete_internal(
1c79356b
A
835 MACH_PORT_FACE backing_store)
836{
837 backing_store_t bs;
838 int i;
839 paging_segment_t ps;
840 int error;
841 int interim_pages_removed = 0;
b0d623f7 842 boolean_t dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );
1c79356b
A
843
844 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
845 return KERN_INVALID_ARGUMENT;
846
b0d623f7 847restart:
1c79356b
A
848 PSL_LOCK();
849 error = KERN_SUCCESS;
850 for (i = 0; i <= paging_segment_max; i++) {
851 ps = paging_segments[i];
852 if (ps != PAGING_SEGMENT_NULL &&
853 ps->ps_bs == bs &&
b0d623f7 854 ! IS_PS_GOING_AWAY(ps)) {
1c79356b 855 PS_LOCK(ps);
b0d623f7
A
856
857 if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
858 /*
859 * Someone is already busy reclamining this paging segment.
860 * If it's the emergency segment we are looking at then check
861 * that someone has not already recovered it and set the right
862 * state i.e. online but not activated.
863 */
864 PS_UNLOCK(ps);
865 continue;
866 }
867
1c79356b 868 /* disable access to this segment */
b0d623f7
A
869 ps->ps_state &= ~PS_CAN_USE;
870 ps->ps_state |= PS_GOING_AWAY;
1c79356b
A
871 PS_UNLOCK(ps);
872 /*
873 * The "ps" segment is "off-line" now,
874 * we can try and delete it...
875 */
876 if(dp_pages_free < (cluster_transfer_minimum
877 + ps->ps_pgcount)) {
878 error = KERN_FAILURE;
879 PSL_UNLOCK();
880 }
881 else {
882 /* remove all pages associated with the */
883 /* segment from the list of free pages */
884 /* when transfer is through, all target */
885 /* segment pages will appear to be free */
886
887 dp_pages_free -= ps->ps_pgcount;
888 interim_pages_removed += ps->ps_pgcount;
889 PSL_UNLOCK();
890 error = ps_delete(ps);
891 }
892 if (error != KERN_SUCCESS) {
893 /*
894 * We couldn't delete the segment,
895 * probably because there's not enough
896 * virtual memory left.
897 * Re-enable all the segments.
898 */
899 PSL_LOCK();
900 break;
901 }
902 goto restart;
903 }
904 }
905
906 if (error != KERN_SUCCESS) {
907 for (i = 0; i <= paging_segment_max; i++) {
908 ps = paging_segments[i];
909 if (ps != PAGING_SEGMENT_NULL &&
910 ps->ps_bs == bs &&
b0d623f7 911 IS_PS_GOING_AWAY(ps)) {
1c79356b 912 PS_LOCK(ps);
b0d623f7
A
913
914 if( !IS_PS_GOING_AWAY(ps)) {
915 PS_UNLOCK(ps);
916 continue;
917 }
918 /* Handle the special clusters that came in while we let go the lock*/
919 if( ps->ps_special_clusters) {
920 dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
921 ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
922 ps->ps_clcount += ps->ps_special_clusters;
923 if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
924 ps_select_array[ps->ps_bs->bs_priority] = 0;
925 }
926 ps->ps_special_clusters = 0;
927 }
1c79356b 928 /* re-enable access to this segment */
b0d623f7
A
929 ps->ps_state &= ~PS_GOING_AWAY;
930 ps->ps_state |= PS_CAN_USE;
1c79356b
A
931 PS_UNLOCK(ps);
932 }
933 }
934 dp_pages_free += interim_pages_removed;
935 PSL_UNLOCK();
936 BS_UNLOCK(bs);
937 return error;
938 }
939
940 for (i = 0; i <= paging_segment_max; i++) {
941 ps = paging_segments[i];
942 if (ps != PAGING_SEGMENT_NULL &&
943 ps->ps_bs == bs) {
b0d623f7
A
944 if(IS_PS_GOING_AWAY(ps)) {
945 if(IS_PS_EMERGENCY_SEGMENT(ps)) {
946 PS_LOCK(ps);
947 ps->ps_state &= ~PS_GOING_AWAY;
948 ps->ps_special_clusters = 0;
949 ps->ps_pgcount = ps->ps_pgnum;
950 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
d1ecb069 951 dp_pages_reserve += ps->ps_pgcount;
b0d623f7 952 PS_UNLOCK(ps);
b0d623f7
A
953 } else {
954 paging_segments[i] = PAGING_SEGMENT_NULL;
955 paging_segment_count--;
956 PS_LOCK(ps);
957 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
958 kfree(ps, sizeof *ps);
959 }
1c79356b
A
960 }
961 }
962 }
963
964 /* Scan the entire ps array separately to make certain we find the */
965 /* proper paging_segment_max */
966 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
967 if(paging_segments[i] != PAGING_SEGMENT_NULL)
968 paging_segment_max = i;
969 }
970
971 PSL_UNLOCK();
972
b0d623f7
A
973 if( dealing_with_emergency_segment ) {
974 BS_UNLOCK(bs);
975 return KERN_SUCCESS;
976 }
977
1c79356b
A
978 /*
979 * All the segments have been deleted.
980 * We can remove the backing store.
981 */
982
983 /*
984 * Disable lookups of this backing store.
985 */
986 if((void *)bs->bs_port->alias != NULL)
91447636
A
987 kfree((void *) bs->bs_port->alias,
988 sizeof (struct vstruct_alias));
1c79356b
A
989 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
990 bs->bs_port = MACH_PORT_NULL;
991 BS_UNLOCK(bs);
992
993 /*
994 * Remove backing store from backing_store list.
995 */
996 BSL_LOCK();
997 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
998 bs_links);
999 BSL_UNLOCK();
1000
1001 /*
1002 * Free the backing store structure.
1003 */
91447636 1004 kfree(bs, sizeof *bs);
1c79356b
A
1005
1006 return KERN_SUCCESS;
1007}
1008
b0d623f7
A
1009kern_return_t
1010default_pager_backing_store_delete(
1011 MACH_PORT_FACE backing_store)
1012{
1013 if( backing_store != emergency_segment_backing_store ) {
1014 default_pager_backing_store_delete_internal(emergency_segment_backing_store);
1015 }
1016 return(default_pager_backing_store_delete_internal(backing_store));
1017}
1018
1c79356b
A
1019int ps_enter(paging_segment_t); /* forward */
1020
1021int
1022ps_enter(
1023 paging_segment_t ps)
1024{
1025 int i;
1026
1027 PSL_LOCK();
1028
1029 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
1030 if (paging_segments[i] == PAGING_SEGMENT_NULL)
1031 break;
1032 }
1033
1034 if (i < MAX_NUM_PAGING_SEGMENTS) {
1035 paging_segments[i] = ps;
1036 if (i > paging_segment_max)
1037 paging_segment_max = i;
1038 paging_segment_count++;
1039 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
1040 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
1041 ps_select_array[ps->ps_bs->bs_priority] = 0;
1042 i = 0;
1043 } else {
1044 PSL_UNLOCK();
1045 return KERN_RESOURCE_SHORTAGE;
1046 }
1047
1048 PSL_UNLOCK();
1049 return i;
1050}
1051
1052#ifdef DEVICE_PAGING
1053kern_return_t
1054default_pager_add_segment(
1055 MACH_PORT_FACE backing_store,
1056 MACH_PORT_FACE device,
1057 recnum_t offset,
1058 recnum_t count,
1059 int record_size)
1060{
1061 backing_store_t bs;
1062 paging_segment_t ps;
1063 int i;
1064 int error;
1c79356b
A
1065
1066 if ((bs = backing_store_lookup(backing_store))
1067 == BACKING_STORE_NULL)
1068 return KERN_INVALID_ARGUMENT;
1069
1070 PSL_LOCK();
1071 for (i = 0; i <= paging_segment_max; i++) {
1072 ps = paging_segments[i];
1073 if (ps == PAGING_SEGMENT_NULL)
1074 continue;
1075
1076 /*
1077 * Check for overlap on same device.
1078 */
1079 if (!(ps->ps_device != device
1080 || offset >= ps->ps_offset + ps->ps_recnum
1081 || offset + count <= ps->ps_offset)) {
1082 PSL_UNLOCK();
1083 BS_UNLOCK(bs);
1084 return KERN_INVALID_ARGUMENT;
1085 }
1086 }
1087 PSL_UNLOCK();
1088
1089 /*
1090 * Set up the paging segment
1091 */
1092 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1093 if (ps == PAGING_SEGMENT_NULL) {
1094 BS_UNLOCK(bs);
1095 return KERN_RESOURCE_SHORTAGE;
1096 }
1097
1098 ps->ps_segtype = PS_PARTITION;
1099 ps->ps_device = device;
1100 ps->ps_offset = offset;
1101 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1102 ps->ps_recnum = count;
1103 ps->ps_pgnum = count >> ps->ps_record_shift;
1104
1105 ps->ps_pgcount = ps->ps_pgnum;
1106 ps->ps_clshift = local_log2(bs->bs_clsize);
1107 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1108 ps->ps_hint = 0;
1109
1110 PS_LOCK_INIT(ps);
1111 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1112 if (!ps->ps_bmap) {
91447636 1113 kfree(ps, sizeof *ps);
1c79356b
A
1114 BS_UNLOCK(bs);
1115 return KERN_RESOURCE_SHORTAGE;
1116 }
1117 for (i = 0; i < ps->ps_ncls; i++) {
1118 clrbit(ps->ps_bmap, i);
1119 }
1120
b0d623f7
A
1121 if(paging_segment_count == 0) {
1122 ps->ps_state = PS_EMERGENCY_SEGMENT;
1123 if(use_emergency_swap_file_first) {
1124 ps->ps_state |= PS_CAN_USE;
1125 }
1126 } else {
1127 ps->ps_state = PS_CAN_USE;
1128 }
1129
1c79356b
A
1130 ps->ps_bs = bs;
1131
1132 if ((error = ps_enter(ps)) != 0) {
91447636
A
1133 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1134 kfree(ps, sizeof *ps);
1c79356b
A
1135 BS_UNLOCK(bs);
1136 return KERN_RESOURCE_SHORTAGE;
1137 }
1138
1139 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1140 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1141 BS_UNLOCK(bs);
1142
1143 PSL_LOCK();
b0d623f7
A
1144 if(IS_PS_OK_TO_USE(ps)) {
1145 dp_pages_free += ps->ps_pgcount;
1146 } else {
1147 dp_pages_reserve += ps->ps_pgcount;
1148 }
1c79356b
A
1149 PSL_UNLOCK();
1150
1151 bs_more_space(ps->ps_clcount);
1152
91447636
A
1153 DP_DEBUG(DEBUG_BS_INTERNAL,
1154 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1155 device, offset, count, record_size,
1156 ps->ps_record_shift, ps->ps_pgnum));
1c79356b
A
1157
1158 return KERN_SUCCESS;
1159}
1160
1161boolean_t
1162bs_add_device(
1163 char *dev_name,
1164 MACH_PORT_FACE master)
1165{
1166 security_token_t null_security_token = {
1167 { 0, 0 }
1168 };
1169 MACH_PORT_FACE device;
1170 int info[DEV_GET_SIZE_COUNT];
1171 mach_msg_type_number_t info_count;
1172 MACH_PORT_FACE bs = MACH_PORT_NULL;
1173 unsigned int rec_size;
1174 recnum_t count;
1175 int clsize;
1176 MACH_PORT_FACE reply_port;
1177
1178 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1179 null_security_token, dev_name, &device))
1180 return FALSE;
1181
1182 info_count = DEV_GET_SIZE_COUNT;
1183 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1184 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1185 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1186 clsize = bs_get_global_clsize(0);
1187 if (!default_pager_backing_store_create(
0b4e3aa0 1188 default_pager_object,
1c79356b
A
1189 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1190 (clsize * vm_page_size),
1191 &bs)) {
1192 if (!default_pager_add_segment(bs, device,
1193 0, count, rec_size)) {
1194 return TRUE;
1195 }
1196 ipc_port_release_receive(bs);
1197 }
1198 }
1199
1200 ipc_port_release_send(device);
1201 return FALSE;
1202}
1203#endif /* DEVICE_PAGING */
1204
1205#if VS_ASYNC_REUSE
1206
1207struct vs_async *
1208vs_alloc_async(void)
1209{
1210 struct vs_async *vsa;
1211 MACH_PORT_FACE reply_port;
91447636 1212// kern_return_t kr;
1c79356b
A
1213
1214 VS_ASYNC_LOCK();
1215 if (vs_async_free_list == NULL) {
1216 VS_ASYNC_UNLOCK();
1217 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1218 if (vsa != NULL) {
1219 /*
1220 * Try allocating a reply port named after the
1221 * address of the vs_async structure.
1222 */
1223 struct vstruct_alias *alias_struct;
1224
1225 reply_port = ipc_port_alloc_kernel();
1226 alias_struct = (struct vstruct_alias *)
1227 kalloc(sizeof (struct vstruct_alias));
1228 if(alias_struct != NULL) {
1229 alias_struct->vs = (struct vstruct *)vsa;
0c530ab8 1230 alias_struct->name = &default_pager_ops;
b0d623f7 1231 reply_port->alias = (uintptr_t) alias_struct;
1c79356b
A
1232 vsa->reply_port = reply_port;
1233 vs_alloc_async_count++;
1234 }
1235 else {
1236 vs_alloc_async_failed++;
1237 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1238 (reply_port));
91447636 1239 kfree(vsa, sizeof (struct vs_async));
1c79356b
A
1240 vsa = NULL;
1241 }
1242 }
1243 } else {
1244 vsa = vs_async_free_list;
1245 vs_async_free_list = vs_async_free_list->vsa_next;
1246 VS_ASYNC_UNLOCK();
1247 }
1248
1249 return vsa;
1250}
1251
1252void
1253vs_free_async(
1254 struct vs_async *vsa)
1255{
1256 VS_ASYNC_LOCK();
1257 vsa->vsa_next = vs_async_free_list;
1258 vs_async_free_list = vsa;
1259 VS_ASYNC_UNLOCK();
1260}
1261
1262#else /* VS_ASYNC_REUSE */
1263
1264struct vs_async *
1265vs_alloc_async(void)
1266{
1267 struct vs_async *vsa;
1268 MACH_PORT_FACE reply_port;
1269 kern_return_t kr;
1270
1271 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1272 if (vsa != NULL) {
1273 /*
1274 * Try allocating a reply port named after the
1275 * address of the vs_async structure.
1276 */
1277 reply_port = ipc_port_alloc_kernel();
1278 alias_struct = (vstruct_alias *)
1279 kalloc(sizeof (struct vstruct_alias));
1280 if(alias_struct != NULL) {
1281 alias_struct->vs = reply_port;
0c530ab8 1282 alias_struct->name = &default_pager_ops;
1c79356b
A
1283 reply_port->alias = (int) vsa;
1284 vsa->reply_port = reply_port;
1285 vs_alloc_async_count++;
1286 }
1287 else {
1288 vs_alloc_async_failed++;
1289 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1290 (reply_port));
91447636 1291 kfree(vsa, sizeof (struct vs_async));
1c79356b
A
1292 vsa = NULL;
1293 }
1294 }
1295
1296 return vsa;
1297}
1298
1299void
1300vs_free_async(
1301 struct vs_async *vsa)
1302{
1c79356b
A
1303 MACH_PORT_FACE reply_port;
1304 kern_return_t kr;
1305
1306 reply_port = vsa->reply_port;
91447636
A
1307 kfree(reply_port->alias, sizeof (struct vstuct_alias));
1308 kfree(vsa, sizeof (struct vs_async));
1c79356b
A
1309 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1310#if 0
1311 VS_ASYNC_LOCK();
1312 vs_alloc_async_count--;
1313 VS_ASYNC_UNLOCK();
1314#endif
1315}
1316
1317#endif /* VS_ASYNC_REUSE */
1318
0b4e3aa0
A
1319zone_t vstruct_zone;
1320
1c79356b
A
1321vstruct_t
1322ps_vstruct_create(
b0d623f7 1323 dp_size_t size)
1c79356b
A
1324{
1325 vstruct_t vs;
91447636 1326 unsigned int i;
1c79356b 1327
0b4e3aa0 1328 vs = (vstruct_t) zalloc(vstruct_zone);
1c79356b
A
1329 if (vs == VSTRUCT_NULL) {
1330 return VSTRUCT_NULL;
1331 }
1332
1333 VS_LOCK_INIT(vs);
1334
1335 /*
1336 * The following fields will be provided later.
1337 */
0c530ab8 1338 vs->vs_pager_ops = NULL;
0b4e3aa0
A
1339 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1340 vs->vs_references = 1;
1c79356b 1341 vs->vs_seqno = 0;
1c79356b 1342
1c79356b
A
1343 vs->vs_waiting_seqno = FALSE;
1344 vs->vs_waiting_read = FALSE;
1345 vs->vs_waiting_write = FALSE;
1c79356b 1346 vs->vs_waiting_async = FALSE;
1c79356b
A
1347
1348 vs->vs_readers = 0;
1349 vs->vs_writers = 0;
1350
1351 vs->vs_errors = 0;
1352
1353 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
55e303ae 1354 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1c79356b
A
1355 vs->vs_async_pending = 0;
1356
1357 /*
1358 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1359 * depending on the size of the memory object.
1360 */
1361 if (INDIRECT_CLMAP(vs->vs_size)) {
1362 vs->vs_imap = (struct vs_map **)
1363 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1364 vs->vs_indirect = TRUE;
1365 } else {
1366 vs->vs_dmap = (struct vs_map *)
1367 kalloc(CLMAP_SIZE(vs->vs_size));
1368 vs->vs_indirect = FALSE;
1369 }
1370 vs->vs_xfer_pending = FALSE;
91447636
A
1371 DP_DEBUG(DEBUG_VS_INTERNAL,
1372 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1c79356b
A
1373
1374 /*
1375 * Check to see that we got the space.
1376 */
1377 if (!vs->vs_dmap) {
91447636 1378 kfree(vs, sizeof *vs);
1c79356b
A
1379 return VSTRUCT_NULL;
1380 }
1381
1382 /*
1383 * Zero the indirect pointers, or clear the direct pointers.
1384 */
1385 if (vs->vs_indirect)
1386 memset(vs->vs_imap, 0,
1387 INDIRECT_CLMAP_SIZE(vs->vs_size));
1388 else
1389 for (i = 0; i < vs->vs_size; i++)
1390 VSM_CLR(vs->vs_dmap[i]);
1391
1392 VS_MAP_LOCK_INIT(vs);
1393
1394 bs_commit(vs->vs_size);
1395
1396 return vs;
1397}
1398
91447636 1399paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1c79356b
A
1400
1401paging_segment_t
1402ps_select_segment(
91447636
A
1403 unsigned int shift,
1404 int *psindex)
1c79356b
A
1405{
1406 paging_segment_t ps;
1407 int i;
1408 int j;
1c79356b
A
1409
1410 /*
1411 * Optimize case where there's only one segment.
1412 * paging_segment_max will index the one and only segment.
1413 */
1414
1415 PSL_LOCK();
1416 if (paging_segment_count == 1) {
b0d623f7 1417 paging_segment_t lps = PAGING_SEGMENT_NULL; /* used to avoid extra PS_UNLOCK */
0b4e3aa0 1418 ipc_port_t trigger = IP_NULL;
1c79356b
A
1419
1420 ps = paging_segments[paging_segment_max];
1421 *psindex = paging_segment_max;
1422 PS_LOCK(ps);
b0d623f7
A
1423 if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
1424 panic("Emergency paging segment missing\n");
1425 }
1426 ASSERT(ps->ps_clshift >= shift);
1427 if(IS_PS_OK_TO_USE(ps)) {
1c79356b
A
1428 if (ps->ps_clcount) {
1429 ps->ps_clcount--;
1430 dp_pages_free -= 1 << ps->ps_clshift;
b0d623f7 1431 ps->ps_pgcount -= 1 << ps->ps_clshift;
1c79356b
A
1432 if(min_pages_trigger_port &&
1433 (dp_pages_free < minimum_pages_remaining)) {
0b4e3aa0 1434 trigger = min_pages_trigger_port;
1c79356b
A
1435 min_pages_trigger_port = NULL;
1436 bs_low = TRUE;
6d2010ae 1437 backing_store_abort_compaction = TRUE;
1c79356b
A
1438 }
1439 lps = ps;
b0d623f7
A
1440 }
1441 }
1c79356b 1442 PS_UNLOCK(ps);
b0d623f7
A
1443
1444 if( lps == PAGING_SEGMENT_NULL ) {
1445 if(dp_pages_free) {
1446 dp_pages_free_drift_count++;
1447 if(dp_pages_free > dp_pages_free_drifted_max) {
1448 dp_pages_free_drifted_max = dp_pages_free;
1449 }
1450 dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1451 }
1452 dp_pages_free = 0;
1453 }
1454
1c79356b 1455 PSL_UNLOCK();
0b4e3aa0
A
1456
1457 if (trigger != IP_NULL) {
6d2010ae
A
1458 dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1459
0b4e3aa0
A
1460 default_pager_space_alert(trigger, HI_WAT_ALERT);
1461 ipc_port_release_send(trigger);
1462 }
1c79356b
A
1463 return lps;
1464 }
1465
1466 if (paging_segment_count == 0) {
b0d623f7
A
1467 if(dp_pages_free) {
1468 dp_pages_free_drift_count++;
1469 if(dp_pages_free > dp_pages_free_drifted_max) {
1470 dp_pages_free_drifted_max = dp_pages_free;
1471 }
1472 dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1473 }
1474 dp_pages_free = 0;
1c79356b
A
1475 PSL_UNLOCK();
1476 return PAGING_SEGMENT_NULL;
1477 }
1478
1479 for (i = BS_MAXPRI;
1480 i >= BS_MINPRI; i--) {
1481 int start_index;
1482
1483 if ((ps_select_array[i] == BS_NOPRI) ||
1484 (ps_select_array[i] == BS_FULLPRI))
1485 continue;
1486 start_index = ps_select_array[i];
1487
1488 if(!(paging_segments[start_index])) {
1489 j = start_index+1;
1490 physical_transfer_cluster_count = 0;
1491 }
0b4e3aa0 1492 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1c79356b 1493 (((paging_segments[start_index])->ps_clshift)
0b4e3aa0 1494 + vm_page_shift))) {
1c79356b
A
1495 physical_transfer_cluster_count = 0;
1496 j = start_index + 1;
1497 } else {
1498 physical_transfer_cluster_count+=1;
1499 j = start_index;
1500 if(start_index == 0)
1501 start_index = paging_segment_max;
1502 else
1503 start_index = start_index - 1;
1504 }
1505
1506 while (1) {
1507 if (j > paging_segment_max)
1508 j = 0;
1509 if ((ps = paging_segments[j]) &&
1510 (ps->ps_bs->bs_priority == i)) {
1511 /*
1512 * Force the ps cluster size to be
1513 * >= that of the vstruct.
1514 */
1515 PS_LOCK(ps);
b0d623f7
A
1516 if (IS_PS_OK_TO_USE(ps)) {
1517 if ((ps->ps_clcount) &&
1518 (ps->ps_clshift >= shift)) {
1519 ipc_port_t trigger = IP_NULL;
1520
1521 ps->ps_clcount--;
1522 dp_pages_free -= 1 << ps->ps_clshift;
1523 ps->ps_pgcount -= 1 << ps->ps_clshift;
1524 if(min_pages_trigger_port &&
1525 (dp_pages_free <
1526 minimum_pages_remaining)) {
1527 trigger = min_pages_trigger_port;
1528 min_pages_trigger_port = NULL;
6d2010ae
A
1529 bs_low = TRUE;
1530 backing_store_abort_compaction = TRUE;
b0d623f7
A
1531 }
1532 PS_UNLOCK(ps);
1533 /*
1534 * found one, quit looking.
1535 */
1536 ps_select_array[i] = j;
1537 PSL_UNLOCK();
1538
1539 if (trigger != IP_NULL) {
6d2010ae
A
1540 dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1541
b0d623f7
A
1542 default_pager_space_alert(
1543 trigger,
1544 HI_WAT_ALERT);
1545 ipc_port_release_send(trigger);
1546 }
1547 *psindex = j;
1548 return ps;
0b4e3aa0 1549 }
1c79356b
A
1550 }
1551 PS_UNLOCK(ps);
1552 }
1553 if (j == start_index) {
1554 /*
1555 * none at this priority -- mark it full
1556 */
1557 ps_select_array[i] = BS_FULLPRI;
1558 break;
1559 }
1560 j++;
1561 }
1562 }
b0d623f7
A
1563
1564 if(dp_pages_free) {
1565 dp_pages_free_drift_count++;
1566 if(dp_pages_free > dp_pages_free_drifted_max) {
1567 dp_pages_free_drifted_max = dp_pages_free;
1568 }
1569 dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
1570 }
1571 dp_pages_free = 0;
1c79356b
A
1572 PSL_UNLOCK();
1573 return PAGING_SEGMENT_NULL;
1574}
1575
b0d623f7 1576dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1c79356b 1577
b0d623f7 1578dp_offset_t
1c79356b
A
1579ps_allocate_cluster(
1580 vstruct_t vs,
1581 int *psindex,
1582 paging_segment_t use_ps)
1583{
91447636 1584 unsigned int byte_num;
1c79356b
A
1585 int bit_num = 0;
1586 paging_segment_t ps;
b0d623f7 1587 dp_offset_t cluster;
0b4e3aa0 1588 ipc_port_t trigger = IP_NULL;
1c79356b
A
1589
1590 /*
1591 * Find best paging segment.
1592 * ps_select_segment will decrement cluster count on ps.
1593 * Must pass cluster shift to find the most appropriate segment.
1594 */
1595 /* NOTE: The addition of paging segment delete capability threatened
1596 * to seriously complicate the treatment of paging segments in this
1597 * module and the ones that call it (notably ps_clmap), because of the
1598 * difficulty in assuring that the paging segment would continue to
1599 * exist between being unlocked and locked. This was
1600 * avoided because all calls to this module are based in either
1601 * dp_memory_object calls which rely on the vs lock, or by
1602 * the transfer function which is part of the segment delete path.
1603 * The transfer function which is part of paging segment delete is
1604 * protected from multiple callers by the backing store lock.
1605 * The paging segment delete function treats mappings to a paging
1606 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1607 * while data is transferred to the remaining segments. This is in
1608 * line with the view that incomplete or in-transition mappings between
1609 * data, a vstruct, and backing store are protected by the vs lock.
1610 * This and the ordering of the paging segment "going_away" bit setting
1611 * protects us.
1612 */
b0d623f7 1613retry:
1c79356b
A
1614 if (use_ps != PAGING_SEGMENT_NULL) {
1615 ps = use_ps;
1616 PSL_LOCK();
1617 PS_LOCK(ps);
55e303ae
A
1618
1619 ASSERT(ps->ps_clcount != 0);
1620
1c79356b
A
1621 ps->ps_clcount--;
1622 dp_pages_free -= 1 << ps->ps_clshift;
b0d623f7 1623 ps->ps_pgcount -= 1 << ps->ps_clshift;
1c79356b
A
1624 if(min_pages_trigger_port &&
1625 (dp_pages_free < minimum_pages_remaining)) {
0b4e3aa0 1626 trigger = min_pages_trigger_port;
1c79356b 1627 min_pages_trigger_port = NULL;
6d2010ae
A
1628 bs_low = TRUE;
1629 backing_store_abort_compaction = TRUE;
1c79356b 1630 }
0b4e3aa0 1631 PSL_UNLOCK();
1c79356b 1632 PS_UNLOCK(ps);
0b4e3aa0 1633 if (trigger != IP_NULL) {
6d2010ae
A
1634 dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1635
0b4e3aa0
A
1636 default_pager_space_alert(trigger, HI_WAT_ALERT);
1637 ipc_port_release_send(trigger);
1638 }
1639
1c79356b
A
1640 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1641 PAGING_SEGMENT_NULL) {
b0d623f7
A
1642 static clock_sec_t lastnotify = 0;
1643 clock_sec_t now;
1644 clock_nsec_t nanoseconds_dummy;
1645
1646 /*
1647 * Don't immediately jump to the emergency segment. Give the
1648 * dynamic pager a chance to create it's first normal swap file.
1649 * Unless, of course the very first normal swap file can't be
1650 * created due to some problem and we didn't expect that problem
1651 * i.e. use_emergency_swap_file_first was never set to true initially.
1652 * It then gets set in the swap file creation error handling.
1653 */
1654 if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {
1655
1656 ps = paging_segments[EMERGENCY_PSEG_INDEX];
1657 if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
1658 PSL_LOCK();
1659 PS_LOCK(ps);
1660
1661 if(IS_PS_GOING_AWAY(ps)) {
1662 /* Someone de-activated the emergency paging segment*/
1663 PS_UNLOCK(ps);
1664 PSL_UNLOCK();
91447636 1665
b0d623f7
A
1666 } else if(dp_pages_free) {
1667 /*
1668 * Someone has already activated the emergency paging segment
1669 * OR
1670 * Between us having rec'd a NULL segment from ps_select_segment
1671 * and reaching here a new normal segment could have been added.
1672 * E.g. we get NULL segment and another thread just added the
1673 * new swap file. Hence check to see if we have more dp_pages_free
1674 * before activating the emergency segment.
1675 */
1676 PS_UNLOCK(ps);
1677 PSL_UNLOCK();
1678 goto retry;
1679
1680 } else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
1681 /*
1682 * PS_CAN_USE is only reset from the emergency segment when it's
1683 * been successfully recovered. So it's legal to have an emergency
1684 * segment that has PS_CAN_USE but no clusters because it's recovery
1685 * failed.
1686 */
1687 backing_store_t bs = ps->ps_bs;
1688 ps->ps_state |= PS_CAN_USE;
1689 if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
1690 ps_select_array[bs->bs_priority] == BS_NOPRI) {
1691 ps_select_array[bs->bs_priority] = 0;
1692 }
1693 dp_pages_free += ps->ps_pgcount;
1694 dp_pages_reserve -= ps->ps_pgcount;
1695 PS_UNLOCK(ps);
1696 PSL_UNLOCK();
1697 dprintf(("Switching ON Emergency paging segment\n"));
1698 goto retry;
1699 }
1700
1701 PS_UNLOCK(ps);
1702 PSL_UNLOCK();
1703 }
1704 }
1705
91447636
A
1706 /*
1707 * Emit a notification of the low-paging resource condition
1708 * but don't issue it more than once every five seconds. This
1709 * prevents us from overflowing logs with thousands of
1710 * repetitions of the message.
1711 */
1712 clock_get_system_nanotime(&now, &nanoseconds_dummy);
b0d623f7
A
1713 if (paging_segment_count > 1 && (now > lastnotify + 5)) {
1714 /* With an activated emergency paging segment we still
1715 * didn't get any clusters. This could mean that the
1716 * emergency paging segment is exhausted.
1717 */
1718 dprintf(("System is out of paging space.\n"));
91447636
A
1719 lastnotify = now;
1720 }
1721
0b4e3aa0 1722 PSL_LOCK();
b0d623f7 1723
1c79356b 1724 if(min_pages_trigger_port) {
0b4e3aa0 1725 trigger = min_pages_trigger_port;
1c79356b
A
1726 min_pages_trigger_port = NULL;
1727 bs_low = TRUE;
6d2010ae 1728 backing_store_abort_compaction = TRUE;
1c79356b 1729 }
0b4e3aa0
A
1730 PSL_UNLOCK();
1731 if (trigger != IP_NULL) {
6d2010ae
A
1732 dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1733
0b4e3aa0
A
1734 default_pager_space_alert(trigger, HI_WAT_ALERT);
1735 ipc_port_release_send(trigger);
1736 }
b0d623f7 1737 return (dp_offset_t) -1;
1c79356b 1738 }
1c79356b
A
1739
1740 /*
1741 * Look for an available cluster. At the end of the loop,
1742 * byte_num is the byte offset and bit_num is the bit offset of the
1743 * first zero bit in the paging segment bitmap.
1744 */
1745 PS_LOCK(ps);
1746 byte_num = ps->ps_hint;
1747 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1748 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1749 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1750 if (isclr((ps->ps_bmap + byte_num), bit_num))
1751 break;
1752 }
1753 ASSERT(bit_num != NBBY);
1754 break;
1755 }
1756 }
1757 ps->ps_hint = byte_num;
1758 cluster = (byte_num*NBBY) + bit_num;
1759
1760 /* Space was reserved, so this must be true */
1761 ASSERT(cluster < ps->ps_ncls);
1762
1763 setbit(ps->ps_bmap, cluster);
1764 PS_UNLOCK(ps);
1765
1766 return cluster;
1767}
1768
b0d623f7 1769void ps_deallocate_cluster(paging_segment_t, dp_offset_t); /* forward */
1c79356b
A
1770
1771void
1772ps_deallocate_cluster(
1773 paging_segment_t ps,
b0d623f7 1774 dp_offset_t cluster)
1c79356b
A
1775{
1776
b0d623f7 1777 if (cluster >= ps->ps_ncls)
1c79356b
A
1778 panic("ps_deallocate_cluster: Invalid cluster number");
1779
1780 /*
1781 * Lock the paging segment, clear the cluster's bitmap and increment the
1782 * number of free cluster.
1783 */
1784 PSL_LOCK();
1785 PS_LOCK(ps);
1786 clrbit(ps->ps_bmap, cluster);
b0d623f7
A
1787 if( IS_PS_OK_TO_USE(ps)) {
1788 ++ps->ps_clcount;
1789 ps->ps_pgcount += 1 << ps->ps_clshift;
1790 dp_pages_free += 1 << ps->ps_clshift;
1791 } else {
1792 ps->ps_special_clusters += 1;
1793 }
1c79356b
A
1794
1795 /*
1796 * Move the hint down to the freed cluster if it is
1797 * less than the current hint.
1798 */
1799 if ((cluster/NBBY) < ps->ps_hint) {
1800 ps->ps_hint = (cluster/NBBY);
1801 }
1802
1c79356b
A
1803
1804 /*
1805 * If we're freeing space on a full priority, reset the array.
1806 */
b0d623f7 1807 if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1c79356b 1808 ps_select_array[ps->ps_bs->bs_priority] = 0;
b0d623f7 1809 PS_UNLOCK(ps);
1c79356b
A
1810 PSL_UNLOCK();
1811
1812 return;
1813}
1814
b0d623f7 1815void ps_dealloc_vsmap(struct vs_map *, dp_size_t); /* forward */
1c79356b
A
1816
1817void
1818ps_dealloc_vsmap(
1819 struct vs_map *vsmap,
b0d623f7 1820 dp_size_t size)
1c79356b 1821{
91447636 1822 unsigned int i;
6d2010ae
A
1823 struct ps_vnode_trim_data trim_data;
1824
1825 ps_vnode_trim_init(&trim_data);
1826
1827 for (i = 0; i < size; i++) {
1828 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) {
1829 ps_vnode_trim_more(&trim_data,
1830 &vsmap[i],
1831 VSM_PS(vsmap[i])->ps_clshift,
1832 vm_page_size << VSM_PS(vsmap[i])->ps_clshift);
1c79356b
A
1833 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1834 VSM_CLOFF(vsmap[i]));
6d2010ae
A
1835 } else {
1836 ps_vnode_trim_now(&trim_data);
1837 }
1838 }
1839 ps_vnode_trim_now(&trim_data);
1c79356b
A
1840}
1841
1842void
1843ps_vstruct_dealloc(
1844 vstruct_t vs)
1845{
91447636
A
1846 unsigned int i;
1847// spl_t s;
1c79356b
A
1848
1849 VS_MAP_LOCK(vs);
1850
1851 /*
1852 * If this is an indirect structure, then we walk through the valid
1853 * (non-zero) indirect pointers and deallocate the clusters
1854 * associated with each used map entry (via ps_dealloc_vsmap).
1855 * When all of the clusters in an indirect block have been
1856 * freed, we deallocate the block. When all of the indirect
1857 * blocks have been deallocated we deallocate the memory
1858 * holding the indirect pointers.
1859 */
1860 if (vs->vs_indirect) {
1861 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1862 if (vs->vs_imap[i] != NULL) {
1863 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
91447636 1864 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1c79356b
A
1865 }
1866 }
91447636 1867 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1c79356b
A
1868 } else {
1869 /*
1870 * Direct map. Free used clusters, then memory.
1871 */
1872 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
91447636 1873 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1c79356b
A
1874 }
1875 VS_MAP_UNLOCK(vs);
1876
1877 bs_commit(- vs->vs_size);
1878
91447636 1879 zfree(vstruct_zone, vs);
1c79356b
A
1880}
1881
6d2010ae
A
1882void
1883ps_vstruct_reclaim(
1884 vstruct_t vs,
1885 boolean_t return_to_vm,
1886 boolean_t reclaim_backing_store)
1887{
1888 unsigned int i, j;
1889// spl_t s;
1890 unsigned int request_flags;
1891 struct vs_map *vsmap;
1892 boolean_t vsmap_all_clear, vsimap_all_clear;
1893 struct vm_object_fault_info fault_info;
1894 int clmap_off;
1895 unsigned int vsmap_size;
1896 kern_return_t kr;
1897
1898 request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
1899 if (reclaim_backing_store) {
1900#if USE_PRECIOUS
1901 request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE;
1902#else /* USE_PRECIOUS */
1903 request_flags |= UPL_REQUEST_SET_DIRTY;
1904#endif /* USE_PRECIOUS */
1905 }
1906
1907 VS_MAP_LOCK(vs);
1908
1909 fault_info.cluster_size = VM_SUPER_CLUSTER;
1910 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
1911 fault_info.user_tag = 0;
1912 fault_info.lo_offset = 0;
1913 fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift);
1914 fault_info.io_sync = reclaim_backing_store;
1915
1916 /*
1917 * If this is an indirect structure, then we walk through the valid
1918 * (non-zero) indirect pointers and deallocate the clusters
1919 * associated with each used map entry (via ps_dealloc_vsmap).
1920 * When all of the clusters in an indirect block have been
1921 * freed, we deallocate the block. When all of the indirect
1922 * blocks have been deallocated we deallocate the memory
1923 * holding the indirect pointers.
1924 */
1925 if (vs->vs_indirect) {
1926 vsimap_all_clear = TRUE;
1927 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1928 vsmap = vs->vs_imap[i];
1929 if (vsmap == NULL)
1930 continue;
1931 /* loop on clusters in this indirect map */
1932 clmap_off = (vm_page_size * CLMAP_ENTRIES *
1933 VSCLSIZE(vs) * i);
1934 if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
1935 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
1936 else
1937 vsmap_size = CLMAP_ENTRIES;
1938 vsmap_all_clear = TRUE;
1939 if (return_to_vm) {
1940 for (j = 0; j < vsmap_size;) {
1941 if (VSM_ISCLR(vsmap[j]) ||
1942 VSM_ISERR(vsmap[j])) {
1943 j++;
1944 clmap_off += vm_page_size * VSCLSIZE(vs);
1945 continue;
1946 }
1947 VS_MAP_UNLOCK(vs);
1948 kr = pvs_cluster_read(
1949 vs,
1950 clmap_off,
1951 (dp_size_t) -1, /* read whole cluster */
1952 &fault_info);
1953 VS_MAP_LOCK(vs); /* XXX what if it changed ? */
1954 if (kr != KERN_SUCCESS) {
1955 vsmap_all_clear = FALSE;
1956 vsimap_all_clear = FALSE;
1957 }
1958 }
1959 }
1960 if (vsmap_all_clear) {
1961 ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES);
1962 kfree(vsmap, CLMAP_THRESHOLD);
1963 vs->vs_imap[i] = NULL;
1964 }
1965 }
1966 if (vsimap_all_clear) {
1967// kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1968 }
1969 } else {
1970 /*
1971 * Direct map. Free used clusters, then memory.
1972 */
1973 vsmap = vs->vs_dmap;
1974 if (vsmap == NULL) {
1975 goto out;
1976 }
1977 vsmap_all_clear = TRUE;
1978 /* loop on clusters in the direct map */
1979 if (return_to_vm) {
1980 for (j = 0; j < vs->vs_size;) {
1981 if (VSM_ISCLR(vsmap[j]) ||
1982 VSM_ISERR(vsmap[j])) {
1983 j++;
1984 continue;
1985 }
1986 clmap_off = vm_page_size * (j << vs->vs_clshift);
1987 VS_MAP_UNLOCK(vs);
1988 kr = pvs_cluster_read(
1989 vs,
1990 clmap_off,
1991 (dp_size_t) -1, /* read whole cluster */
1992 &fault_info);
1993 VS_MAP_LOCK(vs); /* XXX what if it changed ? */
1994 if (kr != KERN_SUCCESS) {
1995 vsmap_all_clear = FALSE;
1996 } else {
1997// VSM_CLR(vsmap[j]);
1998 }
1999 }
2000 }
2001 if (vsmap_all_clear) {
2002 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
2003// kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
2004 }
2005 }
2006out:
2007 VS_MAP_UNLOCK(vs);
2008}
2009
91447636 2010int ps_map_extend(vstruct_t, unsigned int); /* forward */
1c79356b
A
2011
2012int ps_map_extend(
2013 vstruct_t vs,
91447636 2014 unsigned int new_size)
1c79356b
A
2015{
2016 struct vs_map **new_imap;
2017 struct vs_map *new_dmap = NULL;
2018 int newdsize;
2019 int i;
2020 void *old_map = NULL;
2021 int old_map_size = 0;
2022
2023 if (vs->vs_size >= new_size) {
2024 /*
2025 * Someone has already done the work.
2026 */
2027 return 0;
2028 }
2029
2030 /*
2031 * If the new size extends into the indirect range, then we have one
2032 * of two cases: we are going from indirect to indirect, or we are
2033 * going from direct to indirect. If we are going from indirect to
2034 * indirect, then it is possible that the new size will fit in the old
2035 * indirect map. If this is the case, then just reset the size of the
2036 * vstruct map and we are done. If the new size will not
2037 * fit into the old indirect map, then we have to allocate a new
2038 * indirect map and copy the old map pointers into this new map.
2039 *
2040 * If we are going from direct to indirect, then we have to allocate a
2041 * new indirect map and copy the old direct pages into the first
2042 * indirect page of the new map.
2043 * NOTE: allocating memory here is dangerous, as we're in the
2044 * pageout path.
2045 */
2046 if (INDIRECT_CLMAP(new_size)) {
2047 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
2048
2049 /*
2050 * Get a new indirect map and zero it.
2051 */
2052 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
2053 if (vs->vs_indirect &&
2054 (new_map_size == old_map_size)) {
2055 bs_commit(new_size - vs->vs_size);
2056 vs->vs_size = new_size;
2057 return 0;
2058 }
2059
2060 new_imap = (struct vs_map **)kalloc(new_map_size);
2061 if (new_imap == NULL) {
2062 return -1;
2063 }
2064 memset(new_imap, 0, new_map_size);
2065
2066 if (vs->vs_indirect) {
2067 /* Copy old entries into new map */
2068 memcpy(new_imap, vs->vs_imap, old_map_size);
2069 /* Arrange to free the old map */
2070 old_map = (void *) vs->vs_imap;
2071 newdsize = 0;
2072 } else { /* Old map was a direct map */
2073 /* Allocate an indirect page */
2074 if ((new_imap[0] = (struct vs_map *)
2075 kalloc(CLMAP_THRESHOLD)) == NULL) {
91447636 2076 kfree(new_imap, new_map_size);
1c79356b
A
2077 return -1;
2078 }
2079 new_dmap = new_imap[0];
2080 newdsize = CLMAP_ENTRIES;
2081 }
2082 } else {
2083 new_imap = NULL;
2084 newdsize = new_size;
2085 /*
2086 * If the new map is a direct map, then the old map must
2087 * also have been a direct map. All we have to do is
2088 * to allocate a new direct map, copy the old entries
2089 * into it and free the old map.
2090 */
2091 if ((new_dmap = (struct vs_map *)
2092 kalloc(CLMAP_SIZE(new_size))) == NULL) {
2093 return -1;
2094 }
2095 }
2096 if (newdsize) {
2097
2098 /* Free the old map */
2099 old_map = (void *) vs->vs_dmap;
2100 old_map_size = CLMAP_SIZE(vs->vs_size);
2101
2102 /* Copy info from the old map into the new map */
2103 memcpy(new_dmap, vs->vs_dmap, old_map_size);
2104
2105 /* Initialize the rest of the new map */
2106 for (i = vs->vs_size; i < newdsize; i++)
2107 VSM_CLR(new_dmap[i]);
2108 }
2109 if (new_imap) {
2110 vs->vs_imap = new_imap;
2111 vs->vs_indirect = TRUE;
2112 } else
2113 vs->vs_dmap = new_dmap;
2114 bs_commit(new_size - vs->vs_size);
2115 vs->vs_size = new_size;
2116 if (old_map)
91447636 2117 kfree(old_map, old_map_size);
1c79356b
A
2118 return 0;
2119}
2120
b0d623f7 2121dp_offset_t
1c79356b
A
2122ps_clmap(
2123 vstruct_t vs,
b0d623f7 2124 dp_offset_t offset,
1c79356b
A
2125 struct clmap *clmap,
2126 int flag,
b0d623f7 2127 dp_size_t size,
1c79356b
A
2128 int error)
2129{
b0d623f7
A
2130 dp_offset_t cluster; /* The cluster of offset. */
2131 dp_offset_t newcl; /* The new cluster allocated. */
2132 dp_offset_t newoff;
91447636 2133 unsigned int i;
1c79356b 2134 struct vs_map *vsmap;
1c79356b
A
2135
2136 VS_MAP_LOCK(vs);
2137
2138 ASSERT(vs->vs_dmap);
55e303ae 2139 cluster = atop_32(offset) >> vs->vs_clshift;
1c79356b
A
2140
2141 /*
2142 * Initialize cluster error value
2143 */
2144 clmap->cl_error = 0;
2145
2146 /*
2147 * If the object has grown, extend the page map.
2148 */
2149 if (cluster >= vs->vs_size) {
2150 if (flag == CL_FIND) {
2151 /* Do not allocate if just doing a lookup */
2152 VS_MAP_UNLOCK(vs);
b0d623f7 2153 return (dp_offset_t) -1;
1c79356b
A
2154 }
2155 if (ps_map_extend(vs, cluster + 1)) {
2156 VS_MAP_UNLOCK(vs);
b0d623f7 2157 return (dp_offset_t) -1;
1c79356b
A
2158 }
2159 }
2160
2161 /*
2162 * Look for the desired cluster. If the map is indirect, then we
2163 * have a two level lookup. First find the indirect block, then
2164 * find the actual cluster. If the indirect block has not yet
2165 * been allocated, then do so. If the cluster has not yet been
2166 * allocated, then do so.
2167 *
2168 * If any of the allocations fail, then return an error.
2169 * Don't allocate if just doing a lookup.
2170 */
2171 if (vs->vs_indirect) {
2172 long ind_block = cluster/CLMAP_ENTRIES;
2173
2174 /* Is the indirect block allocated? */
2175 vsmap = vs->vs_imap[ind_block];
2176 if (vsmap == NULL) {
2177 if (flag == CL_FIND) {
2178 VS_MAP_UNLOCK(vs);
b0d623f7 2179 return (dp_offset_t) -1;
1c79356b
A
2180 }
2181
2182 /* Allocate the indirect block */
2183 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
2184 if (vsmap == NULL) {
2185 VS_MAP_UNLOCK(vs);
b0d623f7 2186 return (dp_offset_t) -1;
1c79356b
A
2187 }
2188 /* Initialize the cluster offsets */
2189 for (i = 0; i < CLMAP_ENTRIES; i++)
2190 VSM_CLR(vsmap[i]);
2191 vs->vs_imap[ind_block] = vsmap;
2192 }
2193 } else
2194 vsmap = vs->vs_dmap;
2195
2196 ASSERT(vsmap);
2197 vsmap += cluster%CLMAP_ENTRIES;
2198
2199 /*
2200 * At this point, vsmap points to the struct vs_map desired.
2201 *
2202 * Look in the map for the cluster, if there was an error on a
2203 * previous write, flag it and return. If it is not yet
2204 * allocated, then allocate it, if we're writing; if we're
2205 * doing a lookup and the cluster's not allocated, return error.
2206 */
2207 if (VSM_ISERR(*vsmap)) {
2208 clmap->cl_error = VSM_GETERR(*vsmap);
2209 VS_MAP_UNLOCK(vs);
b0d623f7 2210 return (dp_offset_t) -1;
1c79356b
A
2211 } else if (VSM_ISCLR(*vsmap)) {
2212 int psindex;
2213
2214 if (flag == CL_FIND) {
2215 /*
2216 * If there's an error and the entry is clear, then
2217 * we've run out of swap space. Record the error
2218 * here and return.
2219 */
2220 if (error) {
2221 VSM_SETERR(*vsmap, error);
2222 }
2223 VS_MAP_UNLOCK(vs);
b0d623f7 2224 return (dp_offset_t) -1;
1c79356b
A
2225 } else {
2226 /*
2227 * Attempt to allocate a cluster from the paging segment
2228 */
2229 newcl = ps_allocate_cluster(vs, &psindex,
2230 PAGING_SEGMENT_NULL);
b0d623f7 2231 if (newcl == (dp_offset_t) -1) {
1c79356b 2232 VS_MAP_UNLOCK(vs);
b0d623f7 2233 return (dp_offset_t) -1;
1c79356b
A
2234 }
2235 VSM_CLR(*vsmap);
2236 VSM_SETCLOFF(*vsmap, newcl);
2237 VSM_SETPS(*vsmap, psindex);
2238 }
2239 } else
2240 newcl = VSM_CLOFF(*vsmap);
2241
2242 /*
2243 * Fill in pertinent fields of the clmap
2244 */
2245 clmap->cl_ps = VSM_PS(*vsmap);
2246 clmap->cl_numpages = VSCLSIZE(vs);
2247 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
2248
2249 /*
2250 * Byte offset in paging segment is byte offset to cluster plus
2251 * byte offset within cluster. It looks ugly, but should be
2252 * relatively quick.
2253 */
2254 ASSERT(trunc_page(offset) == offset);
55e303ae 2255 newcl = ptoa_32(newcl) << vs->vs_clshift;
1c79356b
A
2256 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
2257 if (flag == CL_ALLOC) {
2258 /*
2259 * set bits in the allocation bitmap according to which
2260 * pages were requested. size is in bytes.
2261 */
55e303ae 2262 i = atop_32(newoff);
1c79356b
A
2263 while ((size > 0) && (i < VSCLSIZE(vs))) {
2264 VSM_SETALLOC(*vsmap, i);
2265 i++;
2266 size -= vm_page_size;
2267 }
2268 }
2269 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
2270 if (newoff) {
2271 /*
2272 * Offset is not cluster aligned, so number of pages
2273 * and bitmaps must be adjusted
2274 */
55e303ae 2275 clmap->cl_numpages -= atop_32(newoff);
1c79356b
A
2276 CLMAP_SHIFT(clmap, vs);
2277 CLMAP_SHIFTALLOC(clmap, vs);
2278 }
2279
2280 /*
2281 *
2282 * The setting of valid bits and handling of write errors
2283 * must be done here, while we hold the lock on the map.
2284 * It logically should be done in ps_vs_write_complete().
2285 * The size and error information has been passed from
2286 * ps_vs_write_complete(). If the size parameter is non-zero,
2287 * then there is work to be done. If error is also non-zero,
2288 * then the error number is recorded in the cluster and the
2289 * entire cluster is in error.
2290 */
2291 if (size && flag == CL_FIND) {
b0d623f7 2292 dp_offset_t off = (dp_offset_t) 0;
1c79356b
A
2293
2294 if (!error) {
2295 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
2296 i++) {
2297 VSM_SETPG(*vsmap, i);
2298 size -= vm_page_size;
2299 }
2300 ASSERT(i <= VSCLSIZE(vs));
2301 } else {
2302 BS_STAT(clmap->cl_ps->ps_bs,
2303 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
55e303ae 2304 atop_32(size));
1c79356b
A
2305 off = VSM_CLOFF(*vsmap);
2306 VSM_SETERR(*vsmap, error);
2307 }
2308 /*
2309 * Deallocate cluster if error, and no valid pages
2310 * already present.
2311 */
b0d623f7 2312 if (off != (dp_offset_t) 0)
1c79356b
A
2313 ps_deallocate_cluster(clmap->cl_ps, off);
2314 VS_MAP_UNLOCK(vs);
b0d623f7 2315 return (dp_offset_t) 0;
1c79356b
A
2316 } else
2317 VS_MAP_UNLOCK(vs);
2318
91447636
A
2319 DP_DEBUG(DEBUG_VS_INTERNAL,
2320 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
2321 newcl+newoff, (int) vs, (int) vsmap, flag));
2322 DP_DEBUG(DEBUG_VS_INTERNAL,
2323 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
2324 (int) clmap->cl_ps, clmap->cl_numpages,
2325 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1c79356b
A
2326
2327 return (newcl + newoff);
2328}
2329
b0d623f7 2330void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t); /* forward */
1c79356b
A
2331
2332void
2333ps_clunmap(
2334 vstruct_t vs,
b0d623f7
A
2335 dp_offset_t offset,
2336 dp_size_t length)
1c79356b 2337{
b0d623f7 2338 dp_offset_t cluster; /* The cluster number of offset */
1c79356b 2339 struct vs_map *vsmap;
6d2010ae
A
2340 struct ps_vnode_trim_data trim_data;
2341
2342 ps_vnode_trim_init(&trim_data);
1c79356b
A
2343
2344 VS_MAP_LOCK(vs);
2345
2346 /*
2347 * Loop through all clusters in this range, freeing paging segment
2348 * clusters and map entries as encountered.
2349 */
2350 while (length > 0) {
b0d623f7 2351 dp_offset_t newoff;
91447636 2352 unsigned int i;
1c79356b 2353
55e303ae 2354 cluster = atop_32(offset) >> vs->vs_clshift;
1c79356b
A
2355 if (vs->vs_indirect) /* indirect map */
2356 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2357 else
2358 vsmap = vs->vs_dmap;
2359 if (vsmap == NULL) {
6d2010ae 2360 ps_vnode_trim_now(&trim_data);
1c79356b
A
2361 VS_MAP_UNLOCK(vs);
2362 return;
2363 }
2364 vsmap += cluster%CLMAP_ENTRIES;
2365 if (VSM_ISCLR(*vsmap)) {
6d2010ae 2366 ps_vnode_trim_now(&trim_data);
1c79356b
A
2367 length -= vm_page_size;
2368 offset += vm_page_size;
2369 continue;
2370 }
2371 /*
2372 * We've got a valid mapping. Clear it and deallocate
2373 * paging segment cluster pages.
2374 * Optimize for entire cluster cleraing.
2375 */
91447636 2376 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
1c79356b
A
2377 /*
2378 * Not cluster aligned.
2379 */
2380 ASSERT(trunc_page(newoff) == newoff);
55e303ae 2381 i = atop_32(newoff);
1c79356b
A
2382 } else
2383 i = 0;
2384 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2385 VSM_CLRPG(*vsmap, i);
2386 VSM_CLRALLOC(*vsmap, i);
2387 length -= vm_page_size;
2388 offset += vm_page_size;
2389 i++;
2390 }
2391
2392 /*
2393 * If map entry is empty, clear and deallocate cluster.
2394 */
6d2010ae
A
2395 if (!VSM_BMAP(*vsmap)) {
2396 ps_vnode_trim_more(&trim_data,
2397 vsmap,
2398 vs->vs_clshift,
2399 VSCLSIZE(vs) * vm_page_size);
1c79356b
A
2400 ps_deallocate_cluster(VSM_PS(*vsmap),
2401 VSM_CLOFF(*vsmap));
2402 VSM_CLR(*vsmap);
6d2010ae
A
2403 } else {
2404 ps_vnode_trim_now(&trim_data);
1c79356b
A
2405 }
2406 }
6d2010ae 2407 ps_vnode_trim_now(&trim_data);
1c79356b
A
2408
2409 VS_MAP_UNLOCK(vs);
2410}
2411
b0d623f7 2412void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */
1c79356b
A
2413
2414void
2415ps_vs_write_complete(
2416 vstruct_t vs,
b0d623f7
A
2417 dp_offset_t offset,
2418 dp_size_t size,
1c79356b
A
2419 int error)
2420{
2421 struct clmap clmap;
2422
2423 /*
2424 * Get the struct vsmap for this cluster.
2425 * Use READ, even though it was written, because the
2426 * cluster MUST be present, unless there was an error
2427 * in the original ps_clmap (e.g. no space), in which
2428 * case, nothing happens.
2429 *
2430 * Must pass enough information to ps_clmap to allow it
2431 * to set the vs_map structure bitmap under lock.
2432 */
2433 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2434}
2435
b0d623f7 2436void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int); /* forward */
1c79356b
A
2437
2438void
2439vs_cl_write_complete(
b0d623f7 2440 vstruct_t vs,
91447636 2441 __unused paging_segment_t ps,
b0d623f7 2442 dp_offset_t offset,
91447636 2443 __unused vm_offset_t addr,
b0d623f7
A
2444 dp_size_t size,
2445 boolean_t async,
2446 int error)
1c79356b 2447{
91447636 2448// kern_return_t kr;
1c79356b
A
2449
2450 if (error) {
2451 /*
2452 * For internal objects, the error is recorded on a
2453 * per-cluster basis by ps_clmap() which is called
2454 * by ps_vs_write_complete() below.
2455 */
2456 dprintf(("write failed error = 0x%x\n", error));
2457 /* add upl_abort code here */
2458 } else
55e303ae 2459 GSTAT(global_stats.gs_pages_out += atop_32(size));
1c79356b
A
2460 /*
2461 * Notify the vstruct mapping code, so it can do its accounting.
2462 */
2463 ps_vs_write_complete(vs, offset, size, error);
2464
2465 if (async) {
2466 VS_LOCK(vs);
2467 ASSERT(vs->vs_async_pending > 0);
2468 vs->vs_async_pending -= size;
0b4e3aa0
A
2469 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2470 vs->vs_waiting_async = FALSE;
1c79356b 2471 VS_UNLOCK(vs);
0b4e3aa0 2472 thread_wakeup(&vs->vs_async_pending);
1c79356b
A
2473 } else {
2474 VS_UNLOCK(vs);
2475 }
2476 }
2477}
2478
2479#ifdef DEVICE_PAGING
2480kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2481
2482kern_return_t
2483device_write_reply(
2484 MACH_PORT_FACE reply_port,
2485 kern_return_t device_code,
2486 io_buf_len_t bytes_written)
2487{
2488 struct vs_async *vsa;
1c79356b
A
2489
2490 vsa = (struct vs_async *)
2491 ((struct vstruct_alias *)(reply_port->alias))->vs;
2492
2493 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2494 device_code = KERN_FAILURE;
2495 }
2496
2497 vsa->vsa_error = device_code;
2498
2499
2500 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2501 if(vsa->vsa_flags & VSA_TRANSFER) {
2502 /* revisit when async disk segments redone */
2503 if(vsa->vsa_error) {
2504 /* need to consider error condition. re-write data or */
2505 /* throw it away here. */
91447636 2506 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
1c79356b
A
2507 }
2508 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2509 vsa->vsa_size, vsa->vsa_error);
2510 } else {
2511 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2512 vsa->vsa_addr, vsa->vsa_size, TRUE,
2513 vsa->vsa_error);
2514 }
2515 VS_FREE_ASYNC(vsa);
2516
2517 return KERN_SUCCESS;
2518}
2519
2520kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2521kern_return_t
2522device_write_reply_inband(
2523 MACH_PORT_FACE reply_port,
2524 kern_return_t return_code,
2525 io_buf_len_t bytes_written)
2526{
2527 panic("device_write_reply_inband: illegal");
2528 return KERN_SUCCESS;
2529}
2530
2531kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2532kern_return_t
2533device_read_reply(
2534 MACH_PORT_FACE reply_port,
2535 kern_return_t return_code,
2536 io_buf_ptr_t data,
2537 mach_msg_type_number_t dataCnt)
2538{
2539 struct vs_async *vsa;
2540 vsa = (struct vs_async *)
2541 ((struct vstruct_alias *)(reply_port->alias))->vs;
2542 vsa->vsa_addr = (vm_offset_t)data;
2543 vsa->vsa_size = (vm_size_t)dataCnt;
2544 vsa->vsa_error = return_code;
b0d623f7 2545 thread_wakeup(&vsa);
1c79356b
A
2546 return KERN_SUCCESS;
2547}
2548
2549kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2550kern_return_t
2551device_read_reply_inband(
2552 MACH_PORT_FACE reply_port,
2553 kern_return_t return_code,
2554 io_buf_ptr_inband_t data,
2555 mach_msg_type_number_t dataCnt)
2556{
2557 panic("device_read_reply_inband: illegal");
2558 return KERN_SUCCESS;
2559}
2560
2561kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2562kern_return_t
2563device_read_reply_overwrite(
2564 MACH_PORT_FACE reply_port,
2565 kern_return_t return_code,
2566 io_buf_len_t bytes_read)
2567{
2568 panic("device_read_reply_overwrite: illegal\n");
2569 return KERN_SUCCESS;
2570}
2571
2572kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2573kern_return_t
2574device_open_reply(
2575 MACH_PORT_FACE reply_port,
2576 kern_return_t return_code,
2577 MACH_PORT_FACE device_port)
2578{
2579 panic("device_open_reply: illegal\n");
2580 return KERN_SUCCESS;
2581}
2582
1c79356b
A
2583kern_return_t
2584ps_read_device(
2585 paging_segment_t ps,
b0d623f7 2586 dp_offset_t offset,
1c79356b
A
2587 vm_offset_t *bufferp,
2588 unsigned int size,
2589 unsigned int *residualp,
2590 int flags)
2591{
2592 kern_return_t kr;
2593 recnum_t dev_offset;
2594 unsigned int bytes_wanted;
2595 unsigned int bytes_read;
2596 unsigned int total_read;
2597 vm_offset_t dev_buffer;
2598 vm_offset_t buf_ptr;
2599 unsigned int records_read;
1c79356b 2600 struct vs_async *vsa;
1c79356b
A
2601
2602 device_t device;
2603 vm_map_copy_t device_data = NULL;
2604 default_pager_thread_t *dpt = NULL;
2605
2606 device = dev_port_lookup(ps->ps_device);
55e303ae 2607 clustered_reads[atop_32(size)]++;
1c79356b
A
2608
2609 dev_offset = (ps->ps_offset +
2610 (offset >> (vm_page_shift - ps->ps_record_shift)));
2611 bytes_wanted = size;
2612 total_read = 0;
2613 *bufferp = (vm_offset_t)NULL;
2614
2615 do {
2616 vsa = VS_ALLOC_ASYNC();
2617 if (vsa) {
2618 vsa->vsa_vs = NULL;
2619 vsa->vsa_addr = 0;
2620 vsa->vsa_offset = 0;
2621 vsa->vsa_size = 0;
2622 vsa->vsa_ps = NULL;
2623 }
1c79356b
A
2624 ip_lock(vsa->reply_port);
2625 vsa->reply_port->ip_sorights++;
2626 ip_reference(vsa->reply_port);
2627 ip_unlock(vsa->reply_port);
2628 kr = ds_device_read_common(device,
2629 vsa->reply_port,
2630 (mach_msg_type_name_t)
2631 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2632 (dev_mode_t) 0,
2633 dev_offset,
2634 bytes_wanted,
2635 (IO_READ | IO_CALL),
2636 (io_buf_ptr_t *) &dev_buffer,
2637 (mach_msg_type_number_t *) &bytes_read);
2638 if(kr == MIG_NO_REPLY) {
b0d623f7 2639 assert_wait(&vsa, THREAD_UNINT);
9bccf70c 2640 thread_block(THREAD_CONTINUE_NULL);
1c79356b
A
2641
2642 dev_buffer = vsa->vsa_addr;
2643 bytes_read = (unsigned int)vsa->vsa_size;
2644 kr = vsa->vsa_error;
2645 }
2646 VS_FREE_ASYNC(vsa);
2647 if (kr != KERN_SUCCESS || bytes_read == 0) {
2648 break;
2649 }
2650 total_read += bytes_read;
2651
2652 /*
2653 * If we got the entire range, use the returned dev_buffer.
2654 */
2655 if (bytes_read == size) {
2656 *bufferp = (vm_offset_t)dev_buffer;
2657 break;
2658 }
2659
2660#if 1
2661 dprintf(("read only %d bytes out of %d\n",
2662 bytes_read, bytes_wanted));
2663#endif
2664 if(dpt == NULL) {
2665 dpt = get_read_buffer();
2666 buf_ptr = dpt->dpt_buffer;
2667 *bufferp = (vm_offset_t)buf_ptr;
2668 }
2669 /*
2670 * Otherwise, copy the data into the provided buffer (*bufferp)
2671 * and append the rest of the range as it comes in.
2672 */
2673 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2674 buf_ptr += bytes_read;
2675 bytes_wanted -= bytes_read;
2676 records_read = (bytes_read >>
2677 (vm_page_shift - ps->ps_record_shift));
2678 dev_offset += records_read;
91447636
A
2679 DP_DEBUG(DEBUG_VS_INTERNAL,
2680 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2681 dev_buffer, bytes_read));
1c79356b
A
2682 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2683 != KERN_SUCCESS)
2684 Panic("dealloc buf");
2685 } while (bytes_wanted);
2686
2687 *residualp = size - total_read;
2688 if((dev_buffer != *bufferp) && (total_read != 0)) {
2689 vm_offset_t temp_buffer;
91447636 2690 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
1c79356b
A
2691 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2692 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2693 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2694 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2695 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2696 (vm_map_copy_t *)&device_data, FALSE))
2697 panic("ps_read_device: cannot copyin locally provided buffer\n");
2698 }
2699 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2700 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2701 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2702 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2703 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2704 (vm_map_copy_t *)&device_data, FALSE))
2705 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2706 }
2707 else {
2708 device_data = NULL;
2709 }
2710 *bufferp = (vm_offset_t)device_data;
2711
2712 if(dpt != NULL) {
2713 /* Free the receive buffer */
2714 dpt->checked_out = 0;
2715 thread_wakeup(&dpt_array);
2716 }
2717 return KERN_SUCCESS;
2718}
2719
1c79356b
A
2720kern_return_t
2721ps_write_device(
2722 paging_segment_t ps,
b0d623f7 2723 dp_offset_t offset,
1c79356b
A
2724 vm_offset_t addr,
2725 unsigned int size,
2726 struct vs_async *vsa)
2727{
2728 recnum_t dev_offset;
2729 io_buf_len_t bytes_to_write, bytes_written;
2730 recnum_t records_written;
2731 kern_return_t kr;
2732 MACH_PORT_FACE reply_port;
1c79356b
A
2733
2734
2735
55e303ae 2736 clustered_writes[atop_32(size)]++;
1c79356b
A
2737
2738 dev_offset = (ps->ps_offset +
2739 (offset >> (vm_page_shift - ps->ps_record_shift)));
2740 bytes_to_write = size;
2741
2742 if (vsa) {
2743 /*
2744 * Asynchronous write.
2745 */
2746 reply_port = vsa->reply_port;
2747 ip_lock(reply_port);
2748 reply_port->ip_sorights++;
2749 ip_reference(reply_port);
2750 ip_unlock(reply_port);
2751 {
2752 device_t device;
2753 device = dev_port_lookup(ps->ps_device);
2754
2755 vsa->vsa_addr = addr;
2756 kr=ds_device_write_common(device,
2757 reply_port,
2758 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2759 (dev_mode_t) 0,
2760 dev_offset,
2761 (io_buf_ptr_t) addr,
2762 size,
2763 (IO_WRITE | IO_CALL),
2764 &bytes_written);
2765 }
2766 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2767 if (verbose)
2768 dprintf(("%s0x%x, addr=0x%x,"
2769 "size=0x%x,offset=0x%x\n",
2770 "device_write_request returned ",
2771 kr, addr, size, offset));
2772 BS_STAT(ps->ps_bs,
55e303ae 2773 ps->ps_bs->bs_pages_out_fail += atop_32(size));
1c79356b
A
2774 /* do the completion notification to free resources */
2775 device_write_reply(reply_port, kr, 0);
2776 return PAGER_ERROR;
2777 }
2778 } else do {
2779 /*
2780 * Synchronous write.
2781 */
2782 {
2783 device_t device;
2784 device = dev_port_lookup(ps->ps_device);
2785 kr=ds_device_write_common(device,
2786 IP_NULL, 0,
2787 (dev_mode_t) 0,
2788 dev_offset,
2789 (io_buf_ptr_t) addr,
2790 size,
2791 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2792 &bytes_written);
2793 }
2794 if (kr != KERN_SUCCESS) {
2795 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2796 "device_write returned ",
2797 kr, addr, size, offset));
2798 BS_STAT(ps->ps_bs,
55e303ae 2799 ps->ps_bs->bs_pages_out_fail += atop_32(size));
1c79356b
A
2800 return PAGER_ERROR;
2801 }
2802 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2803 Panic("fragmented write");
2804 records_written = (bytes_written >>
2805 (vm_page_shift - ps->ps_record_shift));
2806 dev_offset += records_written;
2807#if 1
2808 if (bytes_written != bytes_to_write) {
2809 dprintf(("wrote only %d bytes out of %d\n",
2810 bytes_written, bytes_to_write));
2811 }
2812#endif
2813 bytes_to_write -= bytes_written;
2814 addr += bytes_written;
2815 } while (bytes_to_write > 0);
2816
2817 return PAGER_SUCCESS;
2818}
2819
2820
2821#else /* !DEVICE_PAGING */
2822
2823kern_return_t
2824ps_read_device(
91447636 2825 __unused paging_segment_t ps,
b0d623f7 2826 __unused dp_offset_t offset,
91447636
A
2827 __unused vm_offset_t *bufferp,
2828 __unused unsigned int size,
2829 __unused unsigned int *residualp,
2830 __unused int flags)
1c79356b
A
2831{
2832 panic("ps_read_device not supported");
0c530ab8 2833 return KERN_FAILURE;
1c79356b
A
2834}
2835
91447636 2836kern_return_t
1c79356b 2837ps_write_device(
91447636 2838 __unused paging_segment_t ps,
b0d623f7 2839 __unused dp_offset_t offset,
91447636
A
2840 __unused vm_offset_t addr,
2841 __unused unsigned int size,
2842 __unused struct vs_async *vsa)
1c79356b
A
2843{
2844 panic("ps_write_device not supported");
0c530ab8 2845 return KERN_FAILURE;
1c79356b
A
2846}
2847
2848#endif /* DEVICE_PAGING */
91447636 2849void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
1c79356b
A
2850
2851void
2852pvs_object_data_provided(
91447636
A
2853 __unused vstruct_t vs,
2854 __unused upl_t upl,
2855 __unused upl_offset_t offset,
2856 upl_size_t size)
1c79356b 2857{
1c79356b 2858
91447636
A
2859 DP_DEBUG(DEBUG_VS_INTERNAL,
2860 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2861 upl, offset, size));
1c79356b
A
2862
2863 ASSERT(size > 0);
55e303ae 2864 GSTAT(global_stats.gs_pages_in += atop_32(size));
1c79356b 2865
6d2010ae
A
2866/* check upl iosync flag instead of using RECLAIM_SWAP*/
2867#if RECLAIM_SWAP
2868 if (size != upl->size) {
2869 upl_abort(upl, UPL_ABORT_ERROR);
2870 upl_deallocate(upl);
2871 } else {
2872 ps_clunmap(vs, offset, size);
2873 upl_commit(upl, NULL, 0);
2874 upl_deallocate(upl);
2875 }
2876#endif /* RECLAIM_SWAP */
1c79356b
A
2877
2878}
2879
2d21ac55
A
2880static memory_object_offset_t last_start;
2881static vm_size_t last_length;
2882
6d2010ae
A
2883/*
2884 * A "cnt" of 0 means that the caller just wants to check if the page at
2885 * offset "vs_offset" exists in the backing store. That page hasn't been
2886 * prepared, so no need to release it.
2887 *
2888 * A "cnt" of -1 means that the caller wants to bring back from the backing
2889 * store all existing pages in the cluster containing "vs_offset".
2890 */
1c79356b
A
2891kern_return_t
2892pvs_cluster_read(
2893 vstruct_t vs,
b0d623f7
A
2894 dp_offset_t vs_offset,
2895 dp_size_t cnt,
2d21ac55 2896 void *fault_info)
1c79356b 2897{
1c79356b 2898 kern_return_t error = KERN_SUCCESS;
2d21ac55 2899 unsigned int size;
0c530ab8 2900 unsigned int residual;
1c79356b 2901 unsigned int request_flags;
b0d623f7 2902 int io_flags = 0;
2d21ac55
A
2903 int seg_index;
2904 int pages_in_cl;
0b4e3aa0
A
2905 int cl_size;
2906 int cl_mask;
2d21ac55
A
2907 int cl_index;
2908 unsigned int xfer_size;
b0d623f7 2909 dp_offset_t orig_vs_offset;
0b4c1975
A
2910 dp_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2911 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
0b4e3aa0 2912 struct clmap clmap;
2d21ac55
A
2913 upl_t upl;
2914 unsigned int page_list_count;
b0d623f7
A
2915 memory_object_offset_t cluster_start;
2916 vm_size_t cluster_length;
2917 uint32_t io_streaming;
6d2010ae
A
2918 int i;
2919 boolean_t io_sync = FALSE;
0b4e3aa0
A
2920
2921 pages_in_cl = 1 << vs->vs_clshift;
2922 cl_size = pages_in_cl * vm_page_size;
2923 cl_mask = cl_size - 1;
1c79356b 2924
6d2010ae
A
2925 request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2926
2927 if (cnt == (dp_size_t) -1) {
2928 /*
2929 * We've been called from ps_vstruct_reclaim() to move all
2930 * the object's swapped pages back to VM pages.
2931 * This can put memory pressure on the system, so we do want
2932 * to wait for free pages, to avoid getting in the way of the
2933 * vm_pageout_scan() thread.
2934 * Let's not use UPL_NOBLOCK in this case.
2935 */
2936 vs_offset &= ~cl_mask;
2937 i = pages_in_cl;
2938 } else {
2939 i = 1;
2940 request_flags |= UPL_NOBLOCK;
2941 }
2942
2943again:
2d21ac55
A
2944 cl_index = (vs_offset & cl_mask) / vm_page_size;
2945
b0d623f7 2946 if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) ||
2d21ac55
A
2947 !CLMAP_ISSET(clmap, cl_index)) {
2948 /*
2949 * the needed page doesn't exist in the backing store...
2950 * we don't want to try to do any I/O, just abort the
2951 * page and let the fault handler provide a zero-fill
2952 */
2953 if (cnt == 0) {
2954 /*
2955 * The caller was just poking at us to see if
2956 * the page has been paged out. No need to
2957 * mess with the page at all.
2958 * Just let the caller know we don't have that page.
2959 */
2960 return KERN_FAILURE;
2961 }
6d2010ae
A
2962 if (cnt == (dp_size_t) -1) {
2963 i--;
2964 if (i == 0) {
2965 /* no more pages in this cluster */
2966 return KERN_FAILURE;
2967 }
2968 /* try the next page in this cluster */
2969 vs_offset += vm_page_size;
2970 goto again;
2971 }
2d21ac55
A
2972
2973 page_list_count = 0;
2974
2975 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2976 PAGE_SIZE, PAGE_SIZE,
2977 &upl, NULL, &page_list_count,
2978 request_flags);
2979
2980 if (clmap.cl_error)
2981 upl_abort(upl, UPL_ABORT_ERROR);
2982 else
2983 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2984 upl_deallocate(upl);
91447636 2985
2d21ac55
A
2986 return KERN_SUCCESS;
2987 }
2988
2989 if (cnt == 0) {
2990 /*
2991 * The caller was just poking at us to see if
2992 * the page has been paged out. No need to
2993 * mess with the page at all.
2994 * Just let the caller know we do have that page.
2995 */
2996 return KERN_SUCCESS;
2997 }
2998
6d2010ae
A
2999 if(((vm_object_fault_info_t)fault_info)->io_sync == TRUE ) {
3000 io_sync = TRUE;
3001 } else {
3002#if RECLAIM_SWAP
3003 io_sync = TRUE;
3004#endif /* RECLAIM_SWAP */
3005 }
3006
3007 if( io_sync == TRUE ) {
3008
3009 io_flags |= UPL_IOSYNC | UPL_NOCOMMIT;
3010#if USE_PRECIOUS
3011 request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE;
3012#else /* USE_PRECIOUS */
3013 request_flags |= UPL_REQUEST_SET_DIRTY;
3014#endif /* USE_PRECIOUS */
3015 }
3016
91447636
A
3017 assert(dp_encryption_inited);
3018 if (dp_encryption) {
3019 /*
3020 * ENCRYPTED SWAP:
3021 * request that the UPL be prepared for
3022 * decryption.
3023 */
3024 request_flags |= UPL_ENCRYPT;
6d2010ae 3025 io_flags |= UPL_PAGING_ENCRYPTED;
91447636 3026 }
2d21ac55 3027 orig_vs_offset = vs_offset;
91447636 3028
2d21ac55
A
3029 assert(cnt != 0);
3030 cnt = VM_SUPER_CLUSTER;
b0d623f7
A
3031 cluster_start = (memory_object_offset_t) vs_offset;
3032 cluster_length = (vm_size_t) cnt;
3033 io_streaming = 0;
1c79356b 3034
2d21ac55
A
3035 /*
3036 * determine how big a speculative I/O we should try for...
3037 */
b0d623f7
A
3038 if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
3039 assert(vs_offset >= (dp_offset_t) cluster_start &&
3040 vs_offset < (dp_offset_t) (cluster_start + cluster_length));
3041 vs_offset = (dp_offset_t) cluster_start;
3042 cnt = (dp_size_t) cluster_length;
3043 } else {
3044 cluster_length = PAGE_SIZE;
2d21ac55 3045 cnt = PAGE_SIZE;
b0d623f7
A
3046 }
3047
3048 if (io_streaming)
3049 io_flags |= UPL_IOSTREAMING;
2d21ac55 3050
b0d623f7
A
3051 last_start = cluster_start;
3052 last_length = cluster_length;
2d21ac55
A
3053
3054 /*
3055 * This loop will be executed multiple times until the entire
3056 * range has been looked at or we issue an I/O... if the request spans cluster
3057 * boundaries, the clusters will be checked for logical continunity,
3058 * if contiguous the I/O request will span multiple clusters...
3059 * at most only 1 I/O will be issued... it will encompass the original offset
3060 */
3061 while (cnt && error == KERN_SUCCESS) {
3062 int ps_info_valid;
3063
3064 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
d12e1678
A
3065 size = VM_SUPER_CLUSTER;
3066 size -= vs_offset & cl_mask;
2d21ac55 3067 } else if (cnt > VM_SUPER_CLUSTER)
0b4e3aa0 3068 size = VM_SUPER_CLUSTER;
2d21ac55 3069 else
0b4e3aa0 3070 size = cnt;
2d21ac55 3071
0b4e3aa0 3072 cnt -= size;
1c79356b 3073
0b4e3aa0
A
3074 ps_info_valid = 0;
3075 seg_index = 0;
1c79356b 3076
0b4e3aa0 3077 while (size > 0 && error == KERN_SUCCESS) {
2d21ac55 3078 unsigned int abort_size;
0b4e3aa0
A
3079 int failed_size;
3080 int beg_pseg;
3081 int beg_indx;
b0d623f7 3082 dp_offset_t cur_offset;
1c79356b 3083
0b4e3aa0
A
3084 if ( !ps_info_valid) {
3085 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3086 psp[seg_index] = CLMAP_PS(clmap);
3087 ps_info_valid = 1;
1c79356b 3088 }
0b4e3aa0
A
3089 /*
3090 * skip over unallocated physical segments
3091 */
b0d623f7 3092 if (ps_offset[seg_index] == (dp_offset_t) -1) {
0b4e3aa0
A
3093 abort_size = cl_size - (vs_offset & cl_mask);
3094 abort_size = MIN(abort_size, size);
3095
2d21ac55
A
3096 size -= abort_size;
3097 vs_offset += abort_size;
1c79356b 3098
0b4e3aa0
A
3099 seg_index++;
3100 ps_info_valid = 0;
2d21ac55 3101
0b4e3aa0 3102 continue;
1c79356b 3103 }
0b4e3aa0
A
3104 cl_index = (vs_offset & cl_mask) / vm_page_size;
3105
3106 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
3107 /*
3108 * skip over unallocated pages
3109 */
3110 if (CLMAP_ISSET(clmap, cl_index))
3111 break;
3112 abort_size += vm_page_size;
3113 }
3114 if (abort_size) {
2d21ac55
A
3115 size -= abort_size;
3116 vs_offset += abort_size;
0b4e3aa0
A
3117
3118 if (cl_index == pages_in_cl) {
3119 /*
3120 * if we're at the end of this physical cluster
3121 * then bump to the next one and continue looking
3122 */
3123 seg_index++;
3124 ps_info_valid = 0;
2d21ac55 3125
0b4e3aa0
A
3126 continue;
3127 }
3128 if (size == 0)
3129 break;
3130 }
1c79356b 3131 /*
0b4e3aa0
A
3132 * remember the starting point of the first allocated page
3133 * for the I/O we're about to issue
1c79356b 3134 */
0b4e3aa0
A
3135 beg_pseg = seg_index;
3136 beg_indx = cl_index;
3137 cur_offset = vs_offset;
3138
3139 /*
3140 * calculate the size of the I/O that we can do...
3141 * this may span multiple physical segments if
3142 * they are contiguous
3143 */
3144 for (xfer_size = 0; xfer_size < size; ) {
3145
2d21ac55 3146 while (cl_index < pages_in_cl && xfer_size < size) {
0b4e3aa0 3147 /*
55e303ae 3148 * accumulate allocated pages within
d12e1678 3149 * a physical segment
1c79356b 3150 */
0b4e3aa0
A
3151 if (CLMAP_ISSET(clmap, cl_index)) {
3152 xfer_size += vm_page_size;
3153 cur_offset += vm_page_size;
3154 cl_index++;
3155
3156 BS_STAT(psp[seg_index]->ps_bs,
3157 psp[seg_index]->ps_bs->bs_pages_in++);
3158 } else
3159 break;
3160 }
2d21ac55 3161 if (cl_index < pages_in_cl || xfer_size >= size) {
0b4e3aa0 3162 /*
55e303ae 3163 * we've hit an unallocated page or
2d21ac55
A
3164 * the end of this request... see if
3165 * it's time to fire the I/O
1c79356b 3166 */
0b4e3aa0
A
3167 break;
3168 }
3169 /*
d12e1678 3170 * we've hit the end of the current physical
55e303ae 3171 * segment and there's more to do, so try
d12e1678 3172 * moving to the next one
0b4e3aa0
A
3173 */
3174 seg_index++;
3175
2d21ac55 3176 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
d12e1678 3177 psp[seg_index] = CLMAP_PS(clmap);
0b4e3aa0
A
3178 ps_info_valid = 1;
3179
3180 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
3181 /*
55e303ae
A
3182 * if the physical segment we're about
3183 * to step into is not contiguous to
3184 * the one we're currently in, or it's
d12e1678 3185 * in a different paging file, or
0b4e3aa0 3186 * it hasn't been allocated....
2d21ac55
A
3187 * we stop this run and go check
3188 * to see if it's time to fire the I/O
0b4e3aa0
A
3189 */
3190 break;
1c79356b 3191 }
0b4e3aa0 3192 /*
d12e1678 3193 * start with first page of the next physical
2d21ac55 3194 * segment
0b4e3aa0
A
3195 */
3196 cl_index = 0;
1c79356b 3197 }
2d21ac55 3198 if (xfer_size == 0) {
0b4e3aa0 3199 /*
2d21ac55 3200 * no I/O to generate for this segment
0b4e3aa0 3201 */
0b4e3aa0 3202 continue;
2d21ac55
A
3203 }
3204 if (cur_offset <= orig_vs_offset) {
3205 /*
3206 * we've hit a hole in our speculative cluster
3207 * before the offset that we're really after...
3208 * don't issue the I/O since it doesn't encompass
3209 * the original offset and we're looking to only
3210 * pull in the speculative pages if they can be
3211 * made part of a single I/O
3212 */
3213 size -= xfer_size;
3214 vs_offset += xfer_size;
1c79356b 3215
2d21ac55
A
3216 continue;
3217 }
3218 /*
3219 * we have a contiguous range of allocated pages
3220 * to read from that encompasses the original offset
3221 */
3222 page_list_count = 0;
3223 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
3224 xfer_size, xfer_size,
3225 &upl, NULL, &page_list_count,
6d2010ae 3226 request_flags | UPL_SET_INTERNAL);
2d21ac55
A
3227
3228 error = ps_read_file(psp[beg_pseg],
3229 upl, (upl_offset_t) 0,
3230 ps_offset[beg_pseg] + (beg_indx * vm_page_size),
b0d623f7 3231 xfer_size, &residual, io_flags);
2d21ac55 3232
0b4e3aa0
A
3233 failed_size = 0;
3234
3235 /*
55e303ae 3236 * Adjust counts and send response to VM. Optimize
d12e1678 3237 * for the common case, i.e. no error and/or partial
55e303ae 3238 * data. If there was an error, then we need to error
d12e1678 3239 * the entire range, even if some data was successfully
55e303ae 3240 * read. If there was a partial read we may supply some
0b4e3aa0 3241 * data and may error some as well. In all cases the
55e303ae
A
3242 * VM must receive some notification for every page
3243 * in the range.
0b4e3aa0
A
3244 */
3245 if ((error == KERN_SUCCESS) && (residual == 0)) {
3246 /*
d12e1678 3247 * Got everything we asked for, supply the data
55e303ae
A
3248 * to the VM. Note that as a side effect of
3249 * supplying the data, the buffer holding the
3250 * supplied data is deallocated from the pager's
3251 * address space.
0b4e3aa0 3252 */
2d21ac55 3253 pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
0b4e3aa0
A
3254 } else {
3255 failed_size = xfer_size;
3256
3257 if (error == KERN_SUCCESS) {
2d21ac55
A
3258 if (residual == xfer_size) {
3259 /*
3260 * If a read operation returns no error
3261 * and no data moved, we turn it into
3262 * an error, assuming we're reading at
3263 * or beyong EOF.
3264 * Fall through and error the entire range.
3265 */
0b4e3aa0
A
3266 error = KERN_FAILURE;
3267 } else {
2d21ac55
A
3268 /*
3269 * Otherwise, we have partial read. If
3270 * the part read is a integral number
3271 * of pages supply it. Otherwise round
3272 * it up to a page boundary, zero fill
3273 * the unread part, and supply it.
3274 * Fall through and error the remainder
3275 * of the range, if any.
3276 */
3277 int fill;
3278 unsigned int lsize;
3279
3280 fill = residual & ~vm_page_size;
3281 lsize = (xfer_size - residual) + fill;
0b4e3aa0 3282
2d21ac55 3283 pvs_object_data_provided(vs, upl, vs_offset, lsize);
0b4e3aa0
A
3284
3285 if (lsize < xfer_size) {
2d21ac55 3286 failed_size = xfer_size - lsize;
0b4e3aa0
A
3287 error = KERN_FAILURE;
3288 }
3289 }
3290 }
3291 }
1c79356b 3292 if (error != KERN_SUCCESS) {
2d21ac55
A
3293 /*
3294 * There was an error in some part of the range, tell
3295 * the VM. Note that error is explicitly checked again
3296 * since it can be modified above.
3297 */
0b4e3aa0 3298 BS_STAT(psp[beg_pseg]->ps_bs,
2d21ac55 3299 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
1c79356b 3300 }
2d21ac55
A
3301 /*
3302 * we've issued a single I/O that encompassed the original offset
3303 * at this point we either met our speculative request length or
3304 * we ran into a 'hole' (i.e. page not present in the cluster, cluster
3305 * not present or not physically contiguous to the previous one), so
3306 * we're done issuing I/O at this point
3307 */
3308 return (error);
1c79356b 3309 }
2d21ac55 3310 }
1c79356b
A
3311 return error;
3312}
3313
3314int vs_do_async_write = 1;
3315
3316kern_return_t
3317vs_cluster_write(
3318 vstruct_t vs,
3319 upl_t internal_upl,
91447636
A
3320 upl_offset_t offset,
3321 upl_size_t cnt,
1c79356b
A
3322 boolean_t dp_internal,
3323 int flags)
3324{
91447636 3325 upl_size_t transfer_size;
1c79356b
A
3326 int error = 0;
3327 struct clmap clmap;
0b4e3aa0 3328
b0d623f7 3329 dp_offset_t actual_offset; /* Offset within paging segment */
1c79356b 3330 paging_segment_t ps;
b0d623f7
A
3331 dp_offset_t mobj_base_addr;
3332 dp_offset_t mobj_target_addr;
1c79356b
A
3333
3334 upl_t upl;
0b4e3aa0 3335 upl_page_info_t *pl;
1c79356b 3336 int page_index;
0b4c1975 3337 unsigned int page_max_index;
1c79356b 3338 int list_size;
55e303ae 3339 int pages_in_cl;
91447636 3340 unsigned int cl_size;
55e303ae 3341 int base_index;
91447636 3342 unsigned int seg_size;
b0d623f7 3343 unsigned int upl_offset_in_object;
0b4c1975
A
3344 boolean_t minimal_clustering = FALSE;
3345 boolean_t found_dirty;
55e303ae 3346
6d2010ae
A
3347 if (!dp_encryption_inited) {
3348 /*
3349 * ENCRYPTED SWAP:
3350 * Once we've started using swap, we
3351 * can't change our mind on whether
3352 * it needs to be encrypted or
3353 * not.
3354 */
3355 dp_encryption_inited = TRUE;
3356 }
3357 if (dp_encryption) {
3358 /*
3359 * ENCRYPTED SWAP:
3360 * the UPL will need to be encrypted...
3361 */
3362 flags |= UPL_PAGING_ENCRYPTED;
3363 }
3364
55e303ae
A
3365 pages_in_cl = 1 << vs->vs_clshift;
3366 cl_size = pages_in_cl * vm_page_size;
1c79356b 3367
0b4c1975
A
3368#if CONFIG_FREEZE
3369 minimal_clustering = TRUE;
6d2010ae 3370#else
0b4c1975
A
3371 if (dp_isssd == TRUE)
3372 minimal_clustering = TRUE;
6d2010ae 3373#endif
1c79356b 3374 if (!dp_internal) {
0c530ab8 3375 unsigned int page_list_count;
1c79356b 3376 int request_flags;
91447636 3377 unsigned int super_size;
0b4e3aa0
A
3378 int first_dirty;
3379 int num_dirty;
3380 int num_of_pages;
3381 int seg_index;
91447636 3382 upl_offset_t upl_offset;
0b4c1975 3383 upl_offset_t upl_offset_aligned;
b0d623f7 3384 dp_offset_t seg_offset;
0b4c1975
A
3385 dp_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3386 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
0b4e3aa0 3387
1c79356b 3388
0b4c1975 3389 if (bs_low)
1c79356b 3390 super_size = cl_size;
0b4c1975 3391 else
1c79356b 3392 super_size = VM_SUPER_CLUSTER;
0b4e3aa0 3393
0b4c1975
A
3394 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
3395 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2d21ac55 3396 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
1c79356b 3397
91447636
A
3398 if (dp_encryption) {
3399 /*
3400 * ENCRYPTED SWAP:
3401 * request that the UPL be prepared for
3402 * encryption.
3403 */
3404 request_flags |= UPL_ENCRYPT;
3405 flags |= UPL_PAGING_ENCRYPTED;
3406 }
6d2010ae 3407
0b4e3aa0
A
3408 page_list_count = 0;
3409 memory_object_super_upl_request(vs->vs_control,
3410 (memory_object_offset_t)offset,
3411 cnt, super_size,
3412 &upl, NULL, &page_list_count,
55e303ae 3413 request_flags | UPL_FOR_PAGEOUT);
1c79356b 3414
b0d623f7
A
3415 /*
3416 * The default pager does not handle objects larger than
3417 * 4GB, so it does not deal with offset that don't fit in
3418 * 32-bit. Cast down upl->offset now and make sure we
3419 * did not lose any valuable bits.
3420 */
3421 upl_offset_in_object = (unsigned int) upl->offset;
3422 assert(upl->offset == upl_offset_in_object);
3423
0b4e3aa0 3424 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1c79356b 3425
b0d623f7 3426 seg_size = cl_size - (upl_offset_in_object % cl_size);
0b4c1975
A
3427 upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1);
3428 page_index = 0;
3429 page_max_index = upl->size / PAGE_SIZE;
3430 found_dirty = TRUE;
55e303ae 3431
0b4c1975 3432 for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
6d2010ae 3433
0b4c1975 3434 unsigned int seg_pgcnt;
1c79356b 3435
0b4c1975 3436 seg_pgcnt = seg_size / PAGE_SIZE;
1c79356b 3437
0b4c1975
A
3438 if (minimal_clustering == TRUE) {
3439 unsigned int non_dirty;
1c79356b 3440
0b4c1975
A
3441 non_dirty = 0;
3442 found_dirty = FALSE;
3443
3444 for (; non_dirty < seg_pgcnt; non_dirty++) {
3445 if ((page_index + non_dirty) >= page_max_index)
3446 break;
3447
3448 if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) ||
3449 UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) {
3450 found_dirty = TRUE;
3451 break;
3452 }
3453 }
3454 }
3455 if (found_dirty == TRUE) {
3456 ps_offset[seg_index] =
3457 ps_clmap(vs,
3458 upl_offset_aligned,
3459 &clmap, CL_ALLOC,
3460 cl_size, 0);
3461
3462 if (ps_offset[seg_index] == (dp_offset_t) -1) {
3463 upl_abort(upl, 0);
3464 upl_deallocate(upl);
3465
3466 return KERN_FAILURE;
3467 }
3468 psp[seg_index] = CLMAP_PS(clmap);
3469 }
55e303ae 3470 if (transfer_size > seg_size) {
0b4c1975 3471 page_index += seg_pgcnt;
55e303ae 3472 transfer_size -= seg_size;
0b4c1975 3473 upl_offset_aligned += cl_size;
6d2010ae 3474 seg_size = cl_size;
0b4e3aa0
A
3475 seg_index++;
3476 } else
3477 transfer_size = 0;
3478 }
55e303ae
A
3479 /*
3480 * Ignore any non-present pages at the end of the
3481 * UPL.
3482 */
3483 for (page_index = upl->size / vm_page_size; page_index > 0;)
3484 if (UPL_PAGE_PRESENT(pl, --page_index))
3485 break;
3486 num_of_pages = page_index + 1;
3487
b0d623f7 3488 base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE;
55e303ae
A
3489
3490 for (page_index = 0; page_index < num_of_pages; ) {
0b4e3aa0
A
3491 /*
3492 * skip over non-dirty pages
3493 */
3494 for ( ; page_index < num_of_pages; page_index++) {
55e303ae 3495 if (UPL_DIRTY_PAGE(pl, page_index)
d12e1678 3496 || UPL_PRECIOUS_PAGE(pl, page_index))
0b4e3aa0
A
3497 /*
3498 * this is a page we need to write
55e303ae 3499 * go see if we can buddy it up with
d12e1678 3500 * others that are contiguous to it
0b4e3aa0
A
3501 */
3502 break;
3503 /*
d12e1678 3504 * if the page is not-dirty, but present we
55e303ae 3505 * need to commit it... This is an unusual
d12e1678 3506 * case since we only asked for dirty pages
0b4e3aa0
A
3507 */
3508 if (UPL_PAGE_PRESENT(pl, page_index)) {
3509 boolean_t empty = FALSE;
3510 upl_commit_range(upl,
3511 page_index * vm_page_size,
3512 vm_page_size,
3513 UPL_COMMIT_NOTIFY_EMPTY,
3514 pl,
d52fe63f 3515 page_list_count,
0b4e3aa0 3516 &empty);
55e303ae
A
3517 if (empty) {
3518 assert(page_index ==
3519 num_of_pages - 1);
0b4e3aa0 3520 upl_deallocate(upl);
55e303ae 3521 }
1c79356b 3522 }
1c79356b 3523 }
0b4e3aa0
A
3524 if (page_index == num_of_pages)
3525 /*
3526 * no more pages to look at, we're out of here
3527 */
3528 break;
1c79356b 3529
0b4e3aa0 3530 /*
55e303ae
A
3531 * gather up contiguous dirty pages... we have at
3532 * least 1 * otherwise we would have bailed above
0b4e3aa0
A
3533 * make sure that each physical segment that we step
3534 * into is contiguous to the one we're currently in
3535 * if it's not, we have to stop and write what we have
3536 */
55e303ae 3537 for (first_dirty = page_index;
d12e1678 3538 page_index < num_of_pages; ) {
55e303ae 3539 if ( !UPL_DIRTY_PAGE(pl, page_index)
d12e1678 3540 && !UPL_PRECIOUS_PAGE(pl, page_index))
0b4e3aa0
A
3541 break;
3542 page_index++;
3543 /*
3544 * if we just looked at the last page in the UPL
3545 * we don't need to check for physical segment
3546 * continuity
3547 */
3548 if (page_index < num_of_pages) {
3549 int cur_seg;
3550 int nxt_seg;
3551
55e303ae
A
3552 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3553 nxt_seg = (base_index + page_index)/pages_in_cl;
0b4e3aa0
A
3554
3555 if (cur_seg != nxt_seg) {
3556 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
55e303ae
A
3557 /*
3558 * if the segment we're about
3559 * to step into is not
3560 * contiguous to the one we're
3561 * currently in, or it's in a
d12e1678 3562 * different paging file....
55e303ae 3563 * we stop here and generate
d12e1678
A
3564 * the I/O
3565 */
0b4e3aa0 3566 break;
1c79356b 3567 }
1c79356b 3568 }
0b4e3aa0
A
3569 }
3570 num_dirty = page_index - first_dirty;
1c79356b 3571
0b4e3aa0
A
3572 if (num_dirty) {
3573 upl_offset = first_dirty * vm_page_size;
0b4e3aa0
A
3574 transfer_size = num_dirty * vm_page_size;
3575
d12e1678 3576 while (transfer_size) {
1c79356b 3577
d12e1678 3578 if ((seg_size = cl_size -
b0d623f7
A
3579 ((upl_offset_in_object +
3580 upl_offset) % cl_size))
d12e1678
A
3581 > transfer_size)
3582 seg_size = transfer_size;
0b4e3aa0 3583
b0d623f7
A
3584 ps_vs_write_complete(
3585 vs,
3586 (upl_offset_in_object +
3587 upl_offset),
d12e1678 3588 seg_size, error);
0b4e3aa0 3589
d12e1678
A
3590 transfer_size -= seg_size;
3591 upl_offset += seg_size;
0b4e3aa0 3592 }
d12e1678
A
3593 upl_offset = first_dirty * vm_page_size;
3594 transfer_size = num_dirty * vm_page_size;
55e303ae
A
3595
3596 seg_index = (base_index + first_dirty) / pages_in_cl;
b0d623f7 3597 seg_offset = (upl_offset_in_object + upl_offset) % cl_size;
55e303ae 3598
d12e1678
A
3599 error = ps_write_file(psp[seg_index],
3600 upl, upl_offset,
3601 ps_offset[seg_index]
3602 + seg_offset,
3603 transfer_size, flags);
55e303ae 3604 } else {
0b4e3aa0
A
3605 boolean_t empty = FALSE;
3606 upl_abort_range(upl,
3607 first_dirty * vm_page_size,
3608 num_dirty * vm_page_size,
3609 UPL_ABORT_NOTIFY_EMPTY,
3610 &empty);
55e303ae
A
3611 if (empty) {
3612 assert(page_index == num_of_pages);
0b4e3aa0 3613 upl_deallocate(upl);
55e303ae 3614 }
1c79356b 3615 }
1c79356b 3616 }
0b4e3aa0 3617
1c79356b 3618 } else {
b0d623f7 3619 assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift));
1c79356b
A
3620 list_size = cnt;
3621
3622 page_index = 0;
3623 /* The caller provides a mapped_data which is derived */
3624 /* from a temporary object. The targeted pages are */
3625 /* guaranteed to be set at offset 0 in the mapped_data */
3626 /* The actual offset however must still be derived */
3627 /* from the offset in the vs in question */
3628 mobj_base_addr = offset;
3629 mobj_target_addr = mobj_base_addr;
3630
3631 for (transfer_size = list_size; transfer_size != 0;) {
3632 actual_offset = ps_clmap(vs, mobj_target_addr,
3633 &clmap, CL_ALLOC,
3634 transfer_size < cl_size ?
3635 transfer_size : cl_size, 0);
b0d623f7 3636 if(actual_offset == (dp_offset_t) -1) {
1c79356b
A
3637 error = 1;
3638 break;
3639 }
3640 cnt = MIN(transfer_size,
b0d623f7 3641 (unsigned) CLMAP_NPGS(clmap) * vm_page_size);
1c79356b
A
3642 ps = CLMAP_PS(clmap);
3643 /* Assume that the caller has given us contiguous */
3644 /* pages */
3645 if(cnt) {
d12e1678
A
3646 ps_vs_write_complete(vs, mobj_target_addr,
3647 cnt, error);
1c79356b
A
3648 error = ps_write_file(ps, internal_upl,
3649 0, actual_offset,
3650 cnt, flags);
3651 if (error)
3652 break;
55e303ae 3653 }
1c79356b
A
3654 if (error)
3655 break;
3656 actual_offset += cnt;
3657 mobj_target_addr += cnt;
3658 transfer_size -= cnt;
3659 cnt = 0;
3660
3661 if (error)
3662 break;
3663 }
3664 }
3665 if(error)
3666 return KERN_FAILURE;
3667 else
3668 return KERN_SUCCESS;
3669}
3670
3671vm_size_t
3672ps_vstruct_allocated_size(
3673 vstruct_t vs)
3674{
3675 int num_pages;
3676 struct vs_map *vsmap;
91447636 3677 unsigned int i, j, k;
1c79356b
A
3678
3679 num_pages = 0;
3680 if (vs->vs_indirect) {
3681 /* loop on indirect maps */
3682 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3683 vsmap = vs->vs_imap[i];
3684 if (vsmap == NULL)
3685 continue;
3686 /* loop on clusters in this indirect map */
3687 for (j = 0; j < CLMAP_ENTRIES; j++) {
3688 if (VSM_ISCLR(vsmap[j]) ||
3689 VSM_ISERR(vsmap[j]))
3690 continue;
3691 /* loop on pages in this cluster */
3692 for (k = 0; k < VSCLSIZE(vs); k++) {
3693 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3694 num_pages++;
3695 }
3696 }
3697 }
3698 } else {
3699 vsmap = vs->vs_dmap;
3700 if (vsmap == NULL)
3701 return 0;
3702 /* loop on clusters in the direct map */
3703 for (j = 0; j < CLMAP_ENTRIES; j++) {
3704 if (VSM_ISCLR(vsmap[j]) ||
3705 VSM_ISERR(vsmap[j]))
3706 continue;
3707 /* loop on pages in this cluster */
3708 for (k = 0; k < VSCLSIZE(vs); k++) {
3709 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3710 num_pages++;
3711 }
3712 }
3713 }
3714
55e303ae 3715 return ptoa_32(num_pages);
1c79356b
A
3716}
3717
b0d623f7 3718unsigned int
1c79356b
A
3719ps_vstruct_allocated_pages(
3720 vstruct_t vs,
3721 default_pager_page_t *pages,
b0d623f7 3722 unsigned int pages_size)
1c79356b 3723{
91447636 3724 unsigned int num_pages;
1c79356b 3725 struct vs_map *vsmap;
b0d623f7 3726 dp_offset_t offset;
91447636 3727 unsigned int i, j, k;
1c79356b
A
3728
3729 num_pages = 0;
3730 offset = 0;
3731 if (vs->vs_indirect) {
3732 /* loop on indirect maps */
3733 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3734 vsmap = vs->vs_imap[i];
3735 if (vsmap == NULL) {
3736 offset += (vm_page_size * CLMAP_ENTRIES *
3737 VSCLSIZE(vs));
3738 continue;
3739 }
3740 /* loop on clusters in this indirect map */
3741 for (j = 0; j < CLMAP_ENTRIES; j++) {
3742 if (VSM_ISCLR(vsmap[j]) ||
3743 VSM_ISERR(vsmap[j])) {
3744 offset += vm_page_size * VSCLSIZE(vs);
3745 continue;
3746 }
3747 /* loop on pages in this cluster */
3748 for (k = 0; k < VSCLSIZE(vs); k++) {
3749 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3750 num_pages++;
3751 if (num_pages < pages_size)
3752 pages++->dpp_offset =
3753 offset;
3754 }
3755 offset += vm_page_size;
3756 }
3757 }
3758 }
3759 } else {
3760 vsmap = vs->vs_dmap;
3761 if (vsmap == NULL)
3762 return 0;
3763 /* loop on clusters in the direct map */
3764 for (j = 0; j < CLMAP_ENTRIES; j++) {
3765 if (VSM_ISCLR(vsmap[j]) ||
3766 VSM_ISERR(vsmap[j])) {
3767 offset += vm_page_size * VSCLSIZE(vs);
3768 continue;
3769 }
3770 /* loop on pages in this cluster */
3771 for (k = 0; k < VSCLSIZE(vs); k++) {
3772 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3773 num_pages++;
3774 if (num_pages < pages_size)
3775 pages++->dpp_offset = offset;
3776 }
3777 offset += vm_page_size;
3778 }
3779 }
3780 }
3781
3782 return num_pages;
3783}
3784
3785
3786kern_return_t
3787ps_vstruct_transfer_from_segment(
3788 vstruct_t vs,
3789 paging_segment_t segment,
1c79356b 3790 upl_t upl)
1c79356b
A
3791{
3792 struct vs_map *vsmap;
91447636
A
3793// struct vs_map old_vsmap;
3794// struct vs_map new_vsmap;
3795 unsigned int i, j;
1c79356b
A
3796
3797 VS_LOCK(vs); /* block all work on this vstruct */
3798 /* can't allow the normal multiple write */
3799 /* semantic because writes may conflict */
3800 vs->vs_xfer_pending = TRUE;
3801 vs_wait_for_sync_writers(vs);
3802 vs_start_write(vs);
3803 vs_wait_for_readers(vs);
3804 /* we will unlock the vs to allow other writes while transferring */
3805 /* and will be guaranteed of the persistance of the vs struct */
3806 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3807 /* vs_async_pending */
3808 /* OK we now have guaranteed no other parties are accessing this */
3809 /* vs. Now that we are also supporting simple lock versions of */
3810 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3811 /* our purpose in holding it before was the multiple write case */
3812 /* we now use the boolean xfer_pending to do that. We can use */
3813 /* a boolean instead of a count because we have guaranteed single */
3814 /* file access to this code in its caller */
3815 VS_UNLOCK(vs);
3816vs_changed:
3817 if (vs->vs_indirect) {
91447636
A
3818 unsigned int vsmap_size;
3819 int clmap_off;
1c79356b
A
3820 /* loop on indirect maps */
3821 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3822 vsmap = vs->vs_imap[i];
3823 if (vsmap == NULL)
3824 continue;
3825 /* loop on clusters in this indirect map */
3826 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3827 VSCLSIZE(vs) * i);
3828 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3829 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3830 else
3831 vsmap_size = CLMAP_ENTRIES;
3832 for (j = 0; j < vsmap_size; j++) {
3833 if (VSM_ISCLR(vsmap[j]) ||
3834 VSM_ISERR(vsmap[j]) ||
3835 (VSM_PS(vsmap[j]) != segment))
3836 continue;
3837 if(vs_cluster_transfer(vs,
3838 (vm_page_size * (j << vs->vs_clshift))
3839 + clmap_off,
3840 vm_page_size << vs->vs_clshift,
1c79356b 3841 upl)
1c79356b
A
3842 != KERN_SUCCESS) {
3843 VS_LOCK(vs);
3844 vs->vs_xfer_pending = FALSE;
3845 VS_UNLOCK(vs);
3846 vs_finish_write(vs);
3847 return KERN_FAILURE;
3848 }
3849 /* allow other readers/writers during transfer*/
3850 VS_LOCK(vs);
3851 vs->vs_xfer_pending = FALSE;
3852 VS_UNLOCK(vs);
3853 vs_finish_write(vs);
6d2010ae
A
3854
3855 if (backing_store_abort_compaction || backing_store_stop_compaction) {
3856 backing_store_abort_compaction = FALSE;
3857 dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n"));
3858 return KERN_FAILURE;
3859 }
3860 vnode_pager_throttle();
3861
1c79356b
A
3862 VS_LOCK(vs);
3863 vs->vs_xfer_pending = TRUE;
1c79356b
A
3864 vs_wait_for_sync_writers(vs);
3865 vs_start_write(vs);
3866 vs_wait_for_readers(vs);
0b4e3aa0 3867 VS_UNLOCK(vs);
1c79356b
A
3868 if (!(vs->vs_indirect)) {
3869 goto vs_changed;
3870 }
3871 }
3872 }
3873 } else {
3874 vsmap = vs->vs_dmap;
3875 if (vsmap == NULL) {
3876 VS_LOCK(vs);
3877 vs->vs_xfer_pending = FALSE;
3878 VS_UNLOCK(vs);
3879 vs_finish_write(vs);
3880 return KERN_SUCCESS;
3881 }
3882 /* loop on clusters in the direct map */
3883 for (j = 0; j < vs->vs_size; j++) {
3884 if (VSM_ISCLR(vsmap[j]) ||
3885 VSM_ISERR(vsmap[j]) ||
3886 (VSM_PS(vsmap[j]) != segment))
3887 continue;
3888 if(vs_cluster_transfer(vs,
3889 vm_page_size * (j << vs->vs_clshift),
3890 vm_page_size << vs->vs_clshift,
1c79356b 3891 upl) != KERN_SUCCESS) {
1c79356b
A
3892 VS_LOCK(vs);
3893 vs->vs_xfer_pending = FALSE;
3894 VS_UNLOCK(vs);
3895 vs_finish_write(vs);
3896 return KERN_FAILURE;
3897 }
3898 /* allow other readers/writers during transfer*/
3899 VS_LOCK(vs);
3900 vs->vs_xfer_pending = FALSE;
3901 VS_UNLOCK(vs);
3902 vs_finish_write(vs);
3903 VS_LOCK(vs);
3904 vs->vs_xfer_pending = TRUE;
1c79356b
A
3905 vs_wait_for_sync_writers(vs);
3906 vs_start_write(vs);
3907 vs_wait_for_readers(vs);
b0d623f7 3908 VS_UNLOCK(vs);
1c79356b
A
3909 if (vs->vs_indirect) {
3910 goto vs_changed;
3911 }
3912 }
3913 }
3914
3915 VS_LOCK(vs);
3916 vs->vs_xfer_pending = FALSE;
3917 VS_UNLOCK(vs);
3918 vs_finish_write(vs);
3919 return KERN_SUCCESS;
3920}
3921
3922
3923
3924vs_map_t
3925vs_get_map_entry(
3926 vstruct_t vs,
b0d623f7 3927 dp_offset_t offset)
1c79356b
A
3928{
3929 struct vs_map *vsmap;
b0d623f7 3930 dp_offset_t cluster;
1c79356b 3931
55e303ae 3932 cluster = atop_32(offset) >> vs->vs_clshift;
1c79356b
A
3933 if (vs->vs_indirect) {
3934 long ind_block = cluster/CLMAP_ENTRIES;
3935
3936 /* Is the indirect block allocated? */
3937 vsmap = vs->vs_imap[ind_block];
3938 if(vsmap == (vs_map_t) NULL)
3939 return vsmap;
3940 } else
3941 vsmap = vs->vs_dmap;
3942 vsmap += cluster%CLMAP_ENTRIES;
3943 return vsmap;
3944}
3945
3946kern_return_t
3947vs_cluster_transfer(
3948 vstruct_t vs,
b0d623f7
A
3949 dp_offset_t offset,
3950 dp_size_t cnt,
1c79356b 3951 upl_t upl)
1c79356b 3952{
b0d623f7 3953 dp_offset_t actual_offset;
1c79356b
A
3954 paging_segment_t ps;
3955 struct clmap clmap;
3956 kern_return_t error = KERN_SUCCESS;
91447636
A
3957 unsigned int size, size_wanted;
3958 int i;
0c530ab8 3959 unsigned int residual = 0;
91447636
A
3960 unsigned int unavail_size;
3961// default_pager_thread_t *dpt;
3962// boolean_t dealloc;
3963 struct vs_map *vsmap_ptr = NULL;
1c79356b
A
3964 struct vs_map read_vsmap;
3965 struct vs_map original_read_vsmap;
3966 struct vs_map write_vsmap;
91447636
A
3967// upl_t sync_upl;
3968// vm_offset_t ioaddr;
1c79356b 3969
1c79356b
A
3970 /* vs_cluster_transfer reads in the pages of a cluster and
3971 * then writes these pages back to new backing store. The
3972 * segment the pages are being read from is assumed to have
3973 * been taken off-line and is no longer considered for new
3974 * space requests.
3975 */
3976
3977 /*
3978 * This loop will be executed once per cluster referenced.
3979 * Typically this means once, since it's unlikely that the
3980 * VM system will ask for anything spanning cluster boundaries.
3981 *
3982 * If there are holes in a cluster (in a paging segment), we stop
3983 * reading at the hole, then loop again, hoping to
3984 * find valid pages later in the cluster. This continues until
3985 * the entire range has been examined, and read, if present. The
3986 * pages are written as they are read. If a failure occurs after
3987 * some pages are written the unmap call at the bottom of the loop
3988 * recovers the backing store and the old backing store remains
3989 * in effect.
3990 */
3991
1c79356b
A
3992 VSM_CLR(write_vsmap);
3993 VSM_CLR(original_read_vsmap);
3994 /* grab the actual object's pages to sync with I/O */
3995 while (cnt && (error == KERN_SUCCESS)) {
3996 vsmap_ptr = vs_get_map_entry(vs, offset);
3997 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3998
b0d623f7 3999 if (actual_offset == (dp_offset_t) -1) {
1c79356b
A
4000
4001 /*
4002 * Nothing left to write in this cluster at least
4003 * set write cluster information for any previous
4004 * write, clear for next cluster, if there is one
4005 */
4006 unsigned int local_size, clmask, clsize;
4007
4008 clsize = vm_page_size << vs->vs_clshift;
4009 clmask = clsize - 1;
4010 local_size = clsize - (offset & clmask);
4011 ASSERT(local_size);
4012 local_size = MIN(local_size, cnt);
4013
4014 /* This cluster has no data in it beyond what may */
4015 /* have been found on a previous iteration through */
4016 /* the loop "write_vsmap" */
4017 *vsmap_ptr = write_vsmap;
4018 VSM_CLR(write_vsmap);
4019 VSM_CLR(original_read_vsmap);
4020
4021 cnt -= local_size;
4022 offset += local_size;
4023 continue;
4024 }
4025
4026 /*
4027 * Count up contiguous available or unavailable
4028 * pages.
4029 */
4030 ps = CLMAP_PS(clmap);
4031 ASSERT(ps);
4032 size = 0;
4033 unavail_size = 0;
4034 for (i = 0;
4035 (size < cnt) && (unavail_size < cnt) &&
4036 (i < CLMAP_NPGS(clmap)); i++) {
4037 if (CLMAP_ISSET(clmap, i)) {
4038 if (unavail_size != 0)
4039 break;
4040 size += vm_page_size;
4041 BS_STAT(ps->ps_bs,
4042 ps->ps_bs->bs_pages_in++);
4043 } else {
4044 if (size != 0)
4045 break;
4046 unavail_size += vm_page_size;
4047 }
4048 }
4049
4050 if (size == 0) {
4051 ASSERT(unavail_size);
593a1d5f 4052 ps_clunmap(vs, offset, unavail_size);
1c79356b
A
4053 cnt -= unavail_size;
4054 offset += unavail_size;
4055 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
4056 == 0) {
4057 /* There is no more to transfer in this
4058 cluster
4059 */
4060 *vsmap_ptr = write_vsmap;
4061 VSM_CLR(write_vsmap);
4062 VSM_CLR(original_read_vsmap);
4063 }
4064 continue;
4065 }
4066
4067 if(VSM_ISCLR(original_read_vsmap))
4068 original_read_vsmap = *vsmap_ptr;
4069
4070 if(ps->ps_segtype == PS_PARTITION) {
0c530ab8
A
4071 panic("swap partition not supported\n");
4072 /*NOTREACHED*/
4073 error = KERN_FAILURE;
4074 residual = size;
1c79356b 4075/*
9bccf70c 4076 NEED TO ISSUE WITH SYNC & NO COMMIT
1c79356b
A
4077 error = ps_read_device(ps, actual_offset, &buffer,
4078 size, &residual, flags);
4079*/
4080 } else {
9bccf70c 4081 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
91447636 4082 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
1c79356b 4083 size, &residual,
6d2010ae 4084 (UPL_IOSYNC | UPL_NOCOMMIT | (dp_encryption ? UPL_PAGING_ENCRYPTED : 0)));
1c79356b
A
4085 }
4086
4087 read_vsmap = *vsmap_ptr;
4088
4089
4090 /*
4091 * Adjust counts and put data in new BS. Optimize for the
4092 * common case, i.e. no error and/or partial data.
4093 * If there was an error, then we need to error the entire
4094 * range, even if some data was successfully read.
4095 *
4096 */
4097 if ((error == KERN_SUCCESS) && (residual == 0)) {
0b4e3aa0 4098
1c79356b
A
4099 /*
4100 * Got everything we asked for, supply the data to
4101 * the new BS. Note that as a side effect of supplying
4102 * the data, the buffer holding the supplied data is
4103 * deallocated from the pager's address space unless
4104 * the write is unsuccessful.
4105 */
4106
4107 /* note buffer will be cleaned up in all cases by */
4108 /* internal_cluster_write or if an error on write */
4109 /* the vm_map_copy_page_discard call */
4110 *vsmap_ptr = write_vsmap;
4111
1c79356b
A
4112 if(vs_cluster_write(vs, upl, offset,
4113 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
1c79356b
A
4114 error = KERN_FAILURE;
4115 if(!(VSM_ISCLR(*vsmap_ptr))) {
4116 /* unmap the new backing store object */
4117 ps_clunmap(vs, offset, size);
4118 }
4119 /* original vsmap */
4120 *vsmap_ptr = original_read_vsmap;
4121 VSM_CLR(write_vsmap);
4122 } else {
4123 if((offset + size) &
4124 ((vm_page_size << vs->vs_clshift)
4125 - 1)) {
4126 /* There is more to transfer in this
4127 cluster
4128 */
4129 write_vsmap = *vsmap_ptr;
4130 *vsmap_ptr = read_vsmap;
593a1d5f 4131 ps_clunmap(vs, offset, size);
1c79356b
A
4132 } else {
4133 /* discard the old backing object */
4134 write_vsmap = *vsmap_ptr;
4135 *vsmap_ptr = read_vsmap;
4136 ps_clunmap(vs, offset, size);
4137 *vsmap_ptr = write_vsmap;
4138 VSM_CLR(write_vsmap);
4139 VSM_CLR(original_read_vsmap);
4140 }
4141 }
4142 } else {
4143 size_wanted = size;
4144 if (error == KERN_SUCCESS) {
4145 if (residual == size) {
4146 /*
4147 * If a read operation returns no error
4148 * and no data moved, we turn it into
4149 * an error, assuming we're reading at
4150 * or beyond EOF.
4151 * Fall through and error the entire
4152 * range.
4153 */
4154 error = KERN_FAILURE;
4155 *vsmap_ptr = write_vsmap;
4156 if(!(VSM_ISCLR(*vsmap_ptr))) {
4157 /* unmap the new backing store object */
4158 ps_clunmap(vs, offset, size);
4159 }
4160 *vsmap_ptr = original_read_vsmap;
4161 VSM_CLR(write_vsmap);
4162 continue;
4163 } else {
4164 /*
4165 * Otherwise, we have partial read.
4166 * This is also considered an error
4167 * for the purposes of cluster transfer
4168 */
4169 error = KERN_FAILURE;
4170 *vsmap_ptr = write_vsmap;
4171 if(!(VSM_ISCLR(*vsmap_ptr))) {
4172 /* unmap the new backing store object */
4173 ps_clunmap(vs, offset, size);
4174 }
4175 *vsmap_ptr = original_read_vsmap;
4176 VSM_CLR(write_vsmap);
4177 continue;
4178 }
4179 }
4180
4181 }
4182 cnt -= size;
4183 offset += size;
4184
4185 } /* END while (cnt && (error == 0)) */
4186 if(!VSM_ISCLR(write_vsmap))
4187 *vsmap_ptr = write_vsmap;
4188
1c79356b
A
4189 return error;
4190}
4191
4192kern_return_t
91447636
A
4193default_pager_add_file(
4194 MACH_PORT_FACE backing_store,
4195 vnode_ptr_t vp,
1c79356b 4196 int record_size,
91447636 4197 vm_size_t size)
1c79356b
A
4198{
4199 backing_store_t bs;
4200 paging_segment_t ps;
4201 int i;
91447636 4202 unsigned int j;
1c79356b 4203 int error;
1c79356b
A
4204
4205 if ((bs = backing_store_lookup(backing_store))
4206 == BACKING_STORE_NULL)
4207 return KERN_INVALID_ARGUMENT;
4208
4209 PSL_LOCK();
4210 for (i = 0; i <= paging_segment_max; i++) {
4211 ps = paging_segments[i];
4212 if (ps == PAGING_SEGMENT_NULL)
4213 continue;
4214 if (ps->ps_segtype != PS_FILE)
4215 continue;
4216
4217 /*
4218 * Check for overlap on same device.
4219 */
4220 if (ps->ps_vnode == (struct vnode *)vp) {
4221 PSL_UNLOCK();
4222 BS_UNLOCK(bs);
4223 return KERN_INVALID_ARGUMENT;
4224 }
4225 }
4226 PSL_UNLOCK();
4227
4228 /*
4229 * Set up the paging segment
4230 */
4231 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
4232 if (ps == PAGING_SEGMENT_NULL) {
4233 BS_UNLOCK(bs);
4234 return KERN_RESOURCE_SHORTAGE;
4235 }
4236
4237 ps->ps_segtype = PS_FILE;
4238 ps->ps_vnode = (struct vnode *)vp;
4239 ps->ps_offset = 0;
4240 ps->ps_record_shift = local_log2(vm_page_size / record_size);
b0d623f7
A
4241 assert((dp_size_t) size == size);
4242 ps->ps_recnum = (dp_size_t) size;
4243 ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;
1c79356b
A
4244
4245 ps->ps_pgcount = ps->ps_pgnum;
4246 ps->ps_clshift = local_log2(bs->bs_clsize);
4247 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
b0d623f7 4248 ps->ps_special_clusters = 0;
1c79356b
A
4249 ps->ps_hint = 0;
4250
4251 PS_LOCK_INIT(ps);
4252 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
4253 if (!ps->ps_bmap) {
91447636 4254 kfree(ps, sizeof *ps);
1c79356b
A
4255 BS_UNLOCK(bs);
4256 return KERN_RESOURCE_SHORTAGE;
4257 }
91447636
A
4258 for (j = 0; j < ps->ps_ncls; j++) {
4259 clrbit(ps->ps_bmap, j);
1c79356b
A
4260 }
4261
b0d623f7
A
4262 if(paging_segment_count == 0) {
4263 ps->ps_state = PS_EMERGENCY_SEGMENT;
4264 if(use_emergency_swap_file_first) {
4265 ps->ps_state |= PS_CAN_USE;
4266 }
4267 emergency_segment_backing_store = backing_store;
4268 } else {
4269 ps->ps_state = PS_CAN_USE;
4270 }
4271
1c79356b
A
4272 ps->ps_bs = bs;
4273
4274 if ((error = ps_enter(ps)) != 0) {
91447636
A
4275 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
4276 kfree(ps, sizeof *ps);
1c79356b
A
4277 BS_UNLOCK(bs);
4278 return KERN_RESOURCE_SHORTAGE;
4279 }
4280
4281 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
4282 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
4283 PSL_LOCK();
b0d623f7
A
4284 if(IS_PS_OK_TO_USE(ps)) {
4285 dp_pages_free += ps->ps_pgcount;
4286 } else {
4287 dp_pages_reserve += ps->ps_pgcount;
4288 }
1c79356b
A
4289 PSL_UNLOCK();
4290
4291 BS_UNLOCK(bs);
4292
4293 bs_more_space(ps->ps_clcount);
4294
b0d623f7
A
4295 /*
4296 * If the paging segment being activated is not the emergency
4297 * segment and we notice that the emergency segment is being
4298 * used then we help recover it. If all goes well, the
4299 * emergency segment will be back to its original state of
4300 * online but not activated (till it's needed the next time).
4301 */
6d2010ae
A
4302#if CONFIG_FREEZE
4303 if (!vm_freeze_enabled)
4304#endif
4305 {
4306 ps = paging_segments[EMERGENCY_PSEG_INDEX];
4307 if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
4308 if(default_pager_backing_store_delete(emergency_segment_backing_store)) {
4309 dprintf(("Failed to recover emergency paging segment\n"));
4310 } else {
4311 dprintf(("Recovered emergency paging segment\n"));
4312 }
b0d623f7
A
4313 }
4314 }
4315
91447636
A
4316 DP_DEBUG(DEBUG_BS_INTERNAL,
4317 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
b0d623f7 4318 device, offset, (dp_size_t) size, record_size,
91447636 4319 ps->ps_record_shift, ps->ps_pgnum));
1c79356b
A
4320
4321 return KERN_SUCCESS;
4322}
4323
4324
4325
1c79356b
A
4326kern_return_t
4327ps_read_file(
4328 paging_segment_t ps,
4329 upl_t upl,
91447636 4330 upl_offset_t upl_offset,
b0d623f7 4331 dp_offset_t offset,
91447636 4332 upl_size_t size,
1c79356b
A
4333 unsigned int *residualp,
4334 int flags)
4335{
4336 vm_object_offset_t f_offset;
4337 int error = 0;
4338 int result;
1c79356b 4339
91447636 4340 assert(dp_encryption_inited);
1c79356b 4341
55e303ae 4342 clustered_reads[atop_32(size)]++;
1c79356b
A
4343
4344 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4345
2d21ac55
A
4346 /*
4347 * for transfer case we need to pass uploffset and flags
4348 */
b0d623f7
A
4349 assert((upl_size_t) size == size);
4350 error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);
1c79356b
A
4351
4352 /* The vnode_pagein semantic is somewhat at odds with the existing */
4353 /* device_read semantic. Partial reads are not experienced at this */
4354 /* level. It is up to the bit map code and cluster read code to */
4355 /* check that requested data locations are actually backed, and the */
4356 /* pagein code to either read all of the requested data or return an */
4357 /* error. */
4358
4359 if (error)
4360 result = KERN_FAILURE;
4361 else {
4362 *residualp = 0;
4363 result = KERN_SUCCESS;
4364 }
4365 return result;
1c79356b
A
4366}
4367
4368kern_return_t
4369ps_write_file(
4370 paging_segment_t ps,
4371 upl_t upl,
91447636 4372 upl_offset_t upl_offset,
b0d623f7 4373 dp_offset_t offset,
1c79356b
A
4374 unsigned int size,
4375 int flags)
4376{
4377 vm_object_offset_t f_offset;
4378 kern_return_t result;
1c79356b 4379
91447636 4380 assert(dp_encryption_inited);
1c79356b 4381
55e303ae 4382 clustered_writes[atop_32(size)]++;
1c79356b
A
4383 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4384
91447636
A
4385 if (flags & UPL_PAGING_ENCRYPTED) {
4386 /*
4387 * ENCRYPTED SWAP:
4388 * encrypt all the pages that we're going
4389 * to pageout.
4390 */
4391 upl_encrypt(upl, upl_offset, size);
4392 }
b0d623f7
A
4393 assert((upl_size_t) size == size);
4394 if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
1c79356b
A
4395 result = KERN_FAILURE;
4396 else
4397 result = KERN_SUCCESS;
4398
4399 return result;
4400}
4401
6d2010ae
A
4402static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data)
4403{
4404#if CONFIG_EMBEDDED
4405 data->vp = NULL;
4406 data->offset = 0;
4407 data->length = 0;
4408#else
4409#pragma unused(data)
4410#endif
4411}
4412
4413static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data)
4414{
4415#if CONFIG_EMBEDDED
4416 if ((data->vp) != NULL) {
4417 vnode_trim(data->vp,
4418 data->offset,
4419 data->length);
4420 ps_vnode_trim_init(data);
4421 }
4422#else
4423#pragma unused(data)
4424#endif
4425}
4426
4427static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length)
4428{
4429#if CONFIG_EMBEDDED
4430 struct vnode *vp = VSM_PS(*map)->ps_vnode;
4431 dp_offset_t offset = ptoa_32(VSM_CLOFF(*map)) << shift;
4432
4433 if ((vp != data->vp) || (offset) != (data->offset + data->length)) {
4434 ps_vnode_trim_now(data);
4435 data->vp = vp;
4436 data->offset = offset;
4437 data->length = 0;
4438 }
4439 data->length += (length);
4440#else
4441#pragma unused(data, map, shift, length)
4442#endif
4443}
4444
1c79356b 4445kern_return_t
91447636 4446default_pager_triggers( __unused MACH_PORT_FACE default_pager,
1c79356b
A
4447 int hi_wat,
4448 int lo_wat,
4449 int flags,
4450 MACH_PORT_FACE trigger_port)
4451{
6d2010ae 4452 MACH_PORT_FACE release = IPC_PORT_NULL;
0b4e3aa0 4453 kern_return_t kr;
b0d623f7
A
4454 clock_sec_t now;
4455 clock_nsec_t nanoseconds_dummy;
4456 static clock_sec_t error_notify = 0;
1c79356b 4457
0b4e3aa0 4458 PSL_LOCK();
91447636
A
4459 if (flags == SWAP_ENCRYPT_ON) {
4460 /* ENCRYPTED SWAP: turn encryption on */
4461 release = trigger_port;
4462 if (!dp_encryption_inited) {
4463 dp_encryption_inited = TRUE;
4464 dp_encryption = TRUE;
4465 kr = KERN_SUCCESS;
4466 } else {
4467 kr = KERN_FAILURE;
4468 }
4469 } else if (flags == SWAP_ENCRYPT_OFF) {
4470 /* ENCRYPTED SWAP: turn encryption off */
4471 release = trigger_port;
4472 if (!dp_encryption_inited) {
4473 dp_encryption_inited = TRUE;
4474 dp_encryption = FALSE;
4475 kr = KERN_SUCCESS;
4476 } else {
4477 kr = KERN_FAILURE;
4478 }
4479 } else if (flags == HI_WAT_ALERT) {
0b4e3aa0 4480 release = min_pages_trigger_port;
6d2010ae
A
4481#if CONFIG_FREEZE
4482 /* High and low water signals aren't applicable when freeze is */
4483 /* enabled, so release the trigger ports here and return */
4484 /* KERN_FAILURE. */
4485 if (vm_freeze_enabled) {
4486 if (IP_VALID( trigger_port )){
4487 ipc_port_release_send( trigger_port );
4488 }
4489 min_pages_trigger_port = IPC_PORT_NULL;
4490 kr = KERN_FAILURE;
4491 }
4492 else
4493#endif
4494 {
4495 min_pages_trigger_port = trigger_port;
4496 minimum_pages_remaining = hi_wat/vm_page_size;
4497 bs_low = FALSE;
4498 kr = KERN_SUCCESS;
4499 }
0b4e3aa0
A
4500 } else if (flags == LO_WAT_ALERT) {
4501 release = max_pages_trigger_port;
6d2010ae
A
4502#if CONFIG_FREEZE
4503 if (vm_freeze_enabled) {
4504 if (IP_VALID( trigger_port )){
4505 ipc_port_release_send( trigger_port );
4506 }
4507 max_pages_trigger_port = IPC_PORT_NULL;
4508 kr = KERN_FAILURE;
4509 }
4510 else
4511#endif
4512 {
4513 max_pages_trigger_port = trigger_port;
4514 maximum_pages_free = lo_wat/vm_page_size;
4515 kr = KERN_SUCCESS;
4516 }
b0d623f7
A
4517 } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
4518 use_emergency_swap_file_first = TRUE;
4519 release = trigger_port;
4520 kr = KERN_SUCCESS;
4521 } else if (flags == SWAP_FILE_CREATION_ERROR) {
4522 release = trigger_port;
4523 kr = KERN_SUCCESS;
4524 if( paging_segment_count == 1) {
4525 use_emergency_swap_file_first = TRUE;
4526 }
4527 no_paging_space_action();
4528 clock_get_system_nanotime(&now, &nanoseconds_dummy);
4529 if (now > error_notify + 5) {
4530 dprintf(("Swap File Error.\n"));
4531 error_notify = now;
4532 }
0b4e3aa0
A
4533 } else {
4534 release = trigger_port;
4535 kr = KERN_INVALID_ARGUMENT;
1c79356b 4536 }
0b4e3aa0
A
4537 PSL_UNLOCK();
4538
4539 if (IP_VALID(release))
4540 ipc_port_release_send(release);
4541
4542 return kr;
1c79356b 4543}
55e303ae
A
4544
4545/*
4546 * Monitor the amount of available backing store vs. the amount of
4547 * required backing store, notify a listener (if present) when
4548 * backing store may safely be removed.
4549 *
4550 * We attempt to avoid the situation where backing store is
4551 * discarded en masse, as this can lead to thrashing as the
4552 * backing store is compacted.
4553 */
4554
4555#define PF_INTERVAL 3 /* time between free level checks */
4556#define PF_LATENCY 10 /* number of intervals before release */
4557
4558static int dp_pages_free_low_count = 0;
91447636 4559thread_call_t default_pager_backing_store_monitor_callout;
55e303ae
A
4560
4561void
91447636
A
4562default_pager_backing_store_monitor(__unused thread_call_param_t p1,
4563 __unused thread_call_param_t p2)
55e303ae 4564{
91447636 4565// unsigned long long average;
55e303ae
A
4566 ipc_port_t trigger;
4567 uint64_t deadline;
4568
4569 /*
4570 * We determine whether it will be safe to release some
4571 * backing store by watching the free page level. If
4572 * it remains below the maximum_pages_free threshold for
4573 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
4574 * then we deem it safe.
4575 *
4576 * Note that this establishes a maximum rate at which backing
4577 * store will be released, as each notification (currently)
4578 * only results in a single backing store object being
4579 * released.
4580 */
4581 if (dp_pages_free > maximum_pages_free) {
4582 dp_pages_free_low_count++;
4583 } else {
4584 dp_pages_free_low_count = 0;
4585 }
4586
4587 /* decide whether to send notification */
4588 trigger = IP_NULL;
4589 if (max_pages_trigger_port &&
4590 (backing_store_release_trigger_disable == 0) &&
4591 (dp_pages_free_low_count > PF_LATENCY)) {
4592 trigger = max_pages_trigger_port;
4593 max_pages_trigger_port = NULL;
4594 }
4595
4596 /* send notification */
4597 if (trigger != IP_NULL) {
4598 VSL_LOCK();
4599 if(backing_store_release_trigger_disable != 0) {
4600 assert_wait((event_t)
4601 &backing_store_release_trigger_disable,
4602 THREAD_UNINT);
4603 VSL_UNLOCK();
4604 thread_block(THREAD_CONTINUE_NULL);
4605 } else {
4606 VSL_UNLOCK();
4607 }
6d2010ae
A
4608 dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n"));
4609
55e303ae
A
4610 default_pager_space_alert(trigger, LO_WAT_ALERT);
4611 ipc_port_release_send(trigger);
4612 dp_pages_free_low_count = 0;
4613 }
4614
4615 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
91447636 4616 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
55e303ae 4617}
6d2010ae
A
4618
4619#if CONFIG_FREEZE
4620unsigned int default_pager_swap_pages_free() {
4621 return dp_pages_free;
4622}
4623#endif