1 /*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57 /*
58 * Default Pager.
59 * Paging File Management.
60 */
61
62 #include <mach/host_priv.h>
63 #include <mach/memory_object_control.h>
64 #include <mach/memory_object_server.h>
65 #include <mach/upl.h>
66 #include <default_pager/default_pager_internal.h>
67 #include <default_pager/default_pager_alerts.h>
68 #include <default_pager/default_pager_object_server.h>
69
70 #include <ipc/ipc_types.h>
71 #include <ipc/ipc_port.h>
72 #include <ipc/ipc_space.h>
73
74 #include <kern/kern_types.h>
75 #include <kern/host.h>
76 #include <kern/queue.h>
77 #include <kern/counters.h>
78 #include <kern/sched_prim.h>
79
80 #include <vm/vm_kern.h>
81 #include <vm/vm_pageout.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_protos.h>
85
86
87 /* todo - need large internal object support */
88
89 /*
90 * ALLOC_STRIDE... the maximum number of bytes allocated from
91 * a swap file before moving on to the next swap file... if
92 * all swap files reside on a single disk, this value should
93 * be very large (this is the default assumption)... if the
94 * swap files are spread across multiple disks, then this value
95 * should be small (128 * 1024)...
96 *
97 * This should be determined dynamically in the future
98 */
99
100 #define ALLOC_STRIDE (1024 * 1024 * 1024)
101 int physical_transfer_cluster_count = 0;
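
/*
 * Rough arithmetic sketch, assuming 4K pages (vm_page_shift == 12) and
 * the default cluster shift of 2: ps_select_segment() below moves on to
 * the next segment after ALLOC_STRIDE >> (ps_clshift + vm_page_shift)
 * clusters, i.e. (1 GB) >> 14 == 65536 clusters of 16 KB each, so a full
 * gigabyte is written to one swap file before the stride advances.
 */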
102
103 #define VM_SUPER_CLUSTER 0x40000
104 #define VM_SUPER_PAGES 64
105
106 /*
107 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
108 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
109 */
110 #define VSTRUCT_DEF_CLSHIFT 2
111 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
112 int default_pager_clsize = 0;
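
/*
 * Worked example: with the default VSTRUCT_DEF_CLSHIFT of 2, each cluster
 * covers 1 << 2 == 4 pages, so bs_get_global_clsize() will set
 * default_pager_clsize to 4 unless a different size is requested;
 * assuming 4K pages, that is 16 KB of backing store per cluster.
 */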
113
114 /* statistics */
115 unsigned int clustered_writes[VM_SUPER_PAGES+1];
116 unsigned int clustered_reads[VM_SUPER_PAGES+1];
117
118 /*
119 * Globals used for asynchronous paging operations:
120 * vs_async_list: head of list of to-be-completed I/O ops
121 * async_num_queued: number of pages completed, but not yet
122 * processed by async thread.
123 * async_requests_out: number of pages of requests not completed.
124 */
125
126 #if 0
127 struct vs_async *vs_async_list;
128 int async_num_queued;
129 int async_requests_out;
130 #endif
131
132
133 #define VS_ASYNC_REUSE 1
134 struct vs_async *vs_async_free_list;
135
136 lck_mtx_t default_pager_async_lock; /* Protects globals above */
137
138
139 int vs_alloc_async_failed = 0; /* statistics */
140 int vs_alloc_async_count = 0; /* statistics */
141 struct vs_async *vs_alloc_async(void); /* forward */
142 void vs_free_async(struct vs_async *vsa); /* forward */
143
144
145 #define VS_ALLOC_ASYNC() vs_alloc_async()
146 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
147
148 #define VS_ASYNC_LOCK() lck_mtx_lock(&default_pager_async_lock)
149 #define VS_ASYNC_UNLOCK() lck_mtx_unlock(&default_pager_async_lock)
150 #define VS_ASYNC_LOCK_INIT() lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
151 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
152 /*
153 * Paging Space Hysteresis triggers and the target notification port
154 *
155 */
156 unsigned int dp_pages_free_drift_count = 0;
157 unsigned int dp_pages_free_drifted_max = 0;
158 unsigned int minimum_pages_remaining = 0;
159 unsigned int maximum_pages_free = 0;
160 ipc_port_t min_pages_trigger_port = NULL;
161 ipc_port_t max_pages_trigger_port = NULL;
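
/*
 * Low-space notification pattern used by the allocation paths below
 * (ps_select_segment / ps_allocate_cluster), in outline:
 *
 *	if (min_pages_trigger_port &&
 *	    (dp_pages_free < minimum_pages_remaining)) {
 *		trigger = min_pages_trigger_port;
 *		min_pages_trigger_port = NULL;
 *		bs_low = TRUE;
 *	}
 *	...
 *	if (trigger != IP_NULL) {
 *		default_pager_space_alert(trigger, HI_WAT_ALERT);
 *		ipc_port_release_send(trigger);
 *	}
 *
 * The registered port is consumed by the alert, so it must be
 * re-registered before another HI_WAT_ALERT can be sent.
 */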
162
163 boolean_t use_emergency_swap_file_first = FALSE;
164 boolean_t bs_low = FALSE;
165 int backing_store_release_trigger_disable = 0;
166 boolean_t backing_store_stop_compaction = FALSE;
167
168
169 /* Have we decided if swap needs to be encrypted yet ? */
170 boolean_t dp_encryption_inited = FALSE;
171 /* Should we encrypt swap ? */
172 boolean_t dp_encryption = FALSE;
173
174
175 /*
176 * Object sizes are rounded up to the next power of 2,
177 * unless they are bigger than a given maximum size.
178 */
179 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
180
181 /*
182 * List of all backing store and segments.
183 */
184 MACH_PORT_FACE emergency_segment_backing_store;
185 struct backing_store_list_head backing_store_list;
186 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
187 lck_mtx_t paging_segments_lock;
188 int paging_segment_max = 0;
189 int paging_segment_count = 0;
190 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
191
192
193 /*
194 * Total pages free in system
195 * This differs from clusters committed/avail, which is a measure of the
196 * over-commitment of paging segments to backing store (an idea which is
197 * likely to be deprecated).
198 */
199 unsigned int dp_pages_free = 0;
200 unsigned int dp_pages_reserve = 0;
201 unsigned int cluster_transfer_minimum = 100;
202
203 /* forward declarations */
204 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int); /* forward */
205 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int); /* forward */
206 default_pager_thread_t *get_read_buffer( void );
207 kern_return_t ps_vstruct_transfer_from_segment(
208 vstruct_t vs,
209 paging_segment_t segment,
210 upl_t upl);
211 kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
212 kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
213 kern_return_t vs_cluster_transfer(
214 vstruct_t vs,
215 dp_offset_t offset,
216 dp_size_t cnt,
217 upl_t upl);
218 vs_map_t vs_get_map_entry(
219 vstruct_t vs,
220 dp_offset_t offset);
221
222 kern_return_t
223 default_pager_backing_store_delete_internal( MACH_PORT_FACE );
224
225 default_pager_thread_t *
226 get_read_buffer( void )
227 {
228 int i;
229
230 DPT_LOCK(dpt_lock);
231 while(TRUE) {
232 for (i=0; i<default_pager_internal_count; i++) {
233 if(dpt_array[i]->checked_out == FALSE) {
234 dpt_array[i]->checked_out = TRUE;
235 DPT_UNLOCK(dpt_lock);
236 return dpt_array[i];
237 }
238 }
239 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
240 }
241 }
242
243 void
244 bs_initialize(void)
245 {
246 int i;
247
248 /*
249 * List of all backing store.
250 */
251 BSL_LOCK_INIT();
252 queue_init(&backing_store_list.bsl_queue);
253 PSL_LOCK_INIT();
254
255 VS_ASYNC_LOCK_INIT();
256 #if VS_ASYNC_REUSE
257 vs_async_free_list = NULL;
258 #endif /* VS_ASYNC_REUSE */
259
260 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
261 clustered_writes[i] = 0;
262 clustered_reads[i] = 0;
263 }
264
265 }
266
267 /*
268 * When things do not quite work out...
269 */
270 void bs_no_paging_space(boolean_t); /* forward */
271
272 void
273 bs_no_paging_space(
274 boolean_t out_of_memory)
275 {
276
277 if (out_of_memory)
278 dprintf(("*** OUT OF MEMORY ***\n"));
279 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
280 }
281
282 void bs_more_space(int); /* forward */
283 void bs_commit(int); /* forward */
284
285 boolean_t user_warned = FALSE;
286 unsigned int clusters_committed = 0;
287 unsigned int clusters_available = 0;
288 unsigned int clusters_committed_peak = 0;
289
290 void
291 bs_more_space(
292 int nclusters)
293 {
294 BSL_LOCK();
295 /*
296 * Account for new paging space.
297 */
298 clusters_available += nclusters;
299
300 if (clusters_available >= clusters_committed) {
301 if (verbose && user_warned) {
302 printf("%s%s - %d excess clusters now.\n",
303 my_name,
304 "paging space is OK now",
305 clusters_available - clusters_committed);
306 user_warned = FALSE;
307 clusters_committed_peak = 0;
308 }
309 } else {
310 if (verbose && user_warned) {
311 printf("%s%s - still short of %d clusters.\n",
312 my_name,
313 "WARNING: paging space over-committed",
314 clusters_committed - clusters_available);
315 clusters_committed_peak -= nclusters;
316 }
317 }
318 BSL_UNLOCK();
319
320 return;
321 }
322
323 void
324 bs_commit(
325 int nclusters)
326 {
327 BSL_LOCK();
328 clusters_committed += nclusters;
329 if (clusters_committed > clusters_available) {
330 if (verbose && !user_warned) {
331 user_warned = TRUE;
332 printf("%s%s - short of %d clusters.\n",
333 my_name,
334 "WARNING: paging space over-committed",
335 clusters_committed - clusters_available);
336 }
337 if (clusters_committed > clusters_committed_peak) {
338 clusters_committed_peak = clusters_committed;
339 }
340 } else {
341 if (verbose && user_warned) {
342 printf("%s%s - was short of up to %d clusters.\n",
343 my_name,
344 "paging space is OK now",
345 clusters_committed_peak - clusters_available);
346 user_warned = FALSE;
347 clusters_committed_peak = 0;
348 }
349 }
350 BSL_UNLOCK();
351
352 return;
353 }
354
355 int default_pager_info_verbose = 1;
356
357 void
358 bs_global_info(
359 uint64_t *totalp,
360 uint64_t *freep)
361 {
362 uint64_t pages_total, pages_free;
363 paging_segment_t ps;
364 int i;
365
366 PSL_LOCK();
367 pages_total = pages_free = 0;
368 for (i = 0; i <= paging_segment_max; i++) {
369 ps = paging_segments[i];
370 if (ps == PAGING_SEGMENT_NULL)
371 continue;
372
373 /*
374 * no need to lock: by the time this data
375 * gets back to any remote requestor it
376 * will be obsolete anyways
377 */
378 pages_total += ps->ps_pgnum;
379 pages_free += ps->ps_clcount << ps->ps_clshift;
380 DP_DEBUG(DEBUG_BS_INTERNAL,
381 ("segment #%d: %d total, %d free\n",
382 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
383 }
384 *totalp = pages_total;
385 *freep = pages_free;
386 if (verbose && user_warned && default_pager_info_verbose) {
387 if (clusters_available < clusters_committed) {
388 printf("%s %d clusters committed, %d available.\n",
389 my_name,
390 clusters_committed,
391 clusters_available);
392 }
393 }
394 PSL_UNLOCK();
395 }
396
397 backing_store_t backing_store_alloc(void); /* forward */
398
399 backing_store_t
400 backing_store_alloc(void)
401 {
402 backing_store_t bs;
403
404 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
405 if (bs == BACKING_STORE_NULL)
406 panic("backing_store_alloc: no memory");
407
408 BS_LOCK_INIT(bs);
409 bs->bs_port = MACH_PORT_NULL;
410 bs->bs_priority = 0;
411 bs->bs_clsize = 0;
412 bs->bs_pages_total = 0;
413 bs->bs_pages_in = 0;
414 bs->bs_pages_in_fail = 0;
415 bs->bs_pages_out = 0;
416 bs->bs_pages_out_fail = 0;
417
418 return bs;
419 }
420
421 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
422
423 /* In both the component space and external versions of this pager, */
424 /* backing_store_lookup will be called from tasks in the application space */
425 backing_store_t
426 backing_store_lookup(
427 MACH_PORT_FACE port)
428 {
429 backing_store_t bs;
430
431 /*
432 The port is currently backed with a vs structure in the alias field.
433 We could create an ISBS alias and a port_is_bs call, but frankly
434 I see no reason for the test; the bs->port == port check below
435 will work properly on junk entries.
436
437 if ((port == MACH_PORT_NULL) || port_is_vs(port))
438 */
439 if ((port == MACH_PORT_NULL))
440 return BACKING_STORE_NULL;
441
442 BSL_LOCK();
443 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
444 bs_links) {
445 BS_LOCK(bs);
446 if (bs->bs_port == port) {
447 BSL_UNLOCK();
448 /* Success, return it locked. */
449 return bs;
450 }
451 BS_UNLOCK(bs);
452 }
453 BSL_UNLOCK();
454 return BACKING_STORE_NULL;
455 }
456
457 void backing_store_add(backing_store_t); /* forward */
458
459 void
460 backing_store_add(
461 __unused backing_store_t bs)
462 {
463 // MACH_PORT_FACE port = bs->bs_port;
464 // MACH_PORT_FACE pset = default_pager_default_set;
465 kern_return_t kr = KERN_SUCCESS;
466
467 if (kr != KERN_SUCCESS)
468 panic("backing_store_add: add to set");
469
470 }
471
472 /*
473 * Set up default page shift, but only if not already
474 * set and argument is within range.
475 */
476 boolean_t
477 bs_set_default_clsize(unsigned int npages)
478 {
479 switch(npages){
480 case 1:
481 case 2:
482 case 4:
483 case 8:
484 if (default_pager_clsize == 0) /* if not yet set */
485 vstruct_def_clshift = local_log2(npages);
486 return(TRUE);
487 }
488 return(FALSE);
489 }
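
/*
 * Usage sketch (illustrative only, not compiled): only power-of-two
 * values up to 8 pages are accepted, and the shift only sticks while
 * default_pager_clsize is still 0.
 */
#if 0
	boolean_t ok;

	ok = bs_set_default_clsize(8);	/* TRUE: vstruct_def_clshift becomes 3,
					   provided default_pager_clsize was 0 */
	ok = bs_set_default_clsize(3);	/* FALSE: 3 is not 1, 2, 4 or 8 */
#endif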
490
491 int bs_get_global_clsize(int clsize); /* forward */
492
493 int
494 bs_get_global_clsize(
495 int clsize)
496 {
497 int i;
498 memory_object_default_t dmm;
499 kern_return_t kr;
500
501 /*
502 * Only allow setting of cluster size once. If called
503 * with no cluster size (default), we use the compiled-in default
504 * for the duration. The same cluster size is used for all
505 * paging segments.
506 */
507 if (default_pager_clsize == 0) {
508 /*
509 * Keep cluster size in bit shift because it's quicker
510 * arithmetic, and easier to keep at a power of 2.
511 */
512 if (clsize != NO_CLSIZE) {
513 for (i = 0; (1 << i) < clsize; i++);
514 if (i > MAX_CLUSTER_SHIFT)
515 i = MAX_CLUSTER_SHIFT;
516 vstruct_def_clshift = i;
517 }
518 default_pager_clsize = (1 << vstruct_def_clshift);
519
520 /*
521 * Let the user know the new (and definitive) cluster size.
522 */
523 if (verbose)
524 printf("%scluster size = %d page%s\n",
525 my_name, default_pager_clsize,
526 (default_pager_clsize == 1) ? "" : "s");
527
528 /*
529 * Let the kernel know too, in case it hasn't used the
530 * default value provided in main() yet.
531 */
532 dmm = default_pager_object;
533 clsize = default_pager_clsize * vm_page_size; /* in bytes */
534 kr = host_default_memory_manager(host_priv_self(),
535 &dmm,
536 clsize);
537 memory_object_default_deallocate(dmm);
538
539 if (kr != KERN_SUCCESS) {
540 panic("bs_get_global_cl_size:host_default_memory_manager");
541 }
542 if (dmm != default_pager_object) {
543 panic("bs_get_global_cl_size:there is another default pager");
544 }
545 }
546 ASSERT(default_pager_clsize > 0 &&
547 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
548
549 return default_pager_clsize;
550 }
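
/*
 * Worked example of the rounding above: a request of clsize = 5 pages
 * leaves the "(1 << i) < clsize" loop at i = 3, so the cluster size is
 * rounded up to 8 pages (and capped at MAX_CLUSTER_SHIFT); a request of
 * NO_CLSIZE keeps the compiled-in default of 1 << vstruct_def_clshift.
 */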
551
552 kern_return_t
553 default_pager_backing_store_create(
554 memory_object_default_t pager,
555 int priority,
556 int clsize, /* in bytes */
557 MACH_PORT_FACE *backing_store)
558 {
559 backing_store_t bs;
560 MACH_PORT_FACE port;
561 // kern_return_t kr;
562 struct vstruct_alias *alias_struct;
563
564 if (pager != default_pager_object)
565 return KERN_INVALID_ARGUMENT;
566
567 bs = backing_store_alloc();
568 port = ipc_port_alloc_kernel();
569 ipc_port_make_send(port);
570 assert (port != IP_NULL);
571
572 DP_DEBUG(DEBUG_BS_EXTERNAL,
573 ("priority=%d clsize=%d bs_port=0x%x\n",
574 priority, clsize, (int) backing_store));
575
576 alias_struct = (struct vstruct_alias *)
577 kalloc(sizeof (struct vstruct_alias));
578 if(alias_struct != NULL) {
579 alias_struct->vs = (struct vstruct *)bs;
580 alias_struct->name = &default_pager_ops;
581 port->alias = (uintptr_t) alias_struct;
582 }
583 else {
584 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
585 kfree(bs, sizeof (struct backing_store));
586 return KERN_RESOURCE_SHORTAGE;
587 }
588
589 bs->bs_port = port;
590 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
591 priority = BS_MAXPRI;
592 else if (priority == BS_NOPRI)
593 priority = BS_MAXPRI;
594 else
595 priority = BS_MINPRI;
596 bs->bs_priority = priority;
597
598 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
599
600 BSL_LOCK();
601 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
602 bs_links);
603 BSL_UNLOCK();
604
605 backing_store_add(bs);
606
607 *backing_store = port;
608 return KERN_SUCCESS;
609 }
610
611 kern_return_t
612 default_pager_backing_store_info(
613 MACH_PORT_FACE backing_store,
614 backing_store_flavor_t flavour,
615 backing_store_info_t info,
616 mach_msg_type_number_t *size)
617 {
618 backing_store_t bs;
619 backing_store_basic_info_t basic;
620 int i;
621 paging_segment_t ps;
622
623 if (flavour != BACKING_STORE_BASIC_INFO ||
624 *size < BACKING_STORE_BASIC_INFO_COUNT)
625 return KERN_INVALID_ARGUMENT;
626
627 basic = (backing_store_basic_info_t)info;
628 *size = BACKING_STORE_BASIC_INFO_COUNT;
629
630 VSTATS_LOCK(&global_stats.gs_lock);
631 basic->pageout_calls = global_stats.gs_pageout_calls;
632 basic->pagein_calls = global_stats.gs_pagein_calls;
633 basic->pages_in = global_stats.gs_pages_in;
634 basic->pages_out = global_stats.gs_pages_out;
635 basic->pages_unavail = global_stats.gs_pages_unavail;
636 basic->pages_init = global_stats.gs_pages_init;
637 basic->pages_init_writes= global_stats.gs_pages_init_writes;
638 VSTATS_UNLOCK(&global_stats.gs_lock);
639
640 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
641 return KERN_INVALID_ARGUMENT;
642
643 basic->bs_pages_total = bs->bs_pages_total;
644 PSL_LOCK();
645 bs->bs_pages_free = 0;
646 for (i = 0; i <= paging_segment_max; i++) {
647 ps = paging_segments[i];
648 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
649 PS_LOCK(ps);
650 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
651 PS_UNLOCK(ps);
652 }
653 }
654 PSL_UNLOCK();
655 basic->bs_pages_free = bs->bs_pages_free;
656 basic->bs_pages_in = bs->bs_pages_in;
657 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
658 basic->bs_pages_out = bs->bs_pages_out;
659 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
660
661 basic->bs_priority = bs->bs_priority;
662 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
663
664 BS_UNLOCK(bs);
665
666 return KERN_SUCCESS;
667 }
668
669 int ps_delete(paging_segment_t); /* forward */
670 boolean_t current_thread_aborted(void);
671
672 int
673 ps_delete(
674 paging_segment_t ps)
675 {
676 vstruct_t vs;
677 kern_return_t error = KERN_SUCCESS;
678 int vs_count;
679
680 VSL_LOCK(); /* get the lock on the list of vs's */
681
682 /* The lock relationship and sequence are fairly complicated. */
683 /* this code looks at a live list, locking and unlocking the list */
684 /* as it traverses it. It depends on the locking behavior of */
685 /* default_pager_no_senders. no_senders always locks the vstruct */
686 /* targeted for removal before locking the vstruct list. However */
687 /* it will remove that member of the list without locking its */
688 /* neighbors. We can be sure when we hold a lock on a vstruct */
689 /* it cannot be removed from the list but we must hold the list */
690 /* lock to be sure that its pointers to its neighbors are valid. */
691 /* Also, we can hold off destruction of a vstruct when the list */
692 /* lock and the vs locks are not being held by bumping the */
693 /* vs_async_pending count. */
694
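/*
 * The hold/release protocol described above, in outline (this is the
 * pattern repeated in the loop below):
 *
 *	VS_LOCK(vs);
 *	vs_async_wait(vs);		wait for pending async writes
 *	vs->vs_async_pending += 1;	pin vs against destruction
 *	VS_UNLOCK(vs);
 *	... work on vs with no locks held ...
 *	VS_LOCK(vs);
 *	vs->vs_async_pending -= 1;
 *	if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
 *		vs->vs_waiting_async = FALSE;
 *		VS_UNLOCK(vs);
 *		thread_wakeup(&vs->vs_async_pending);
 *	} else
 *		VS_UNLOCK(vs);
 */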
695
696 while(backing_store_release_trigger_disable != 0) {
697 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
698 }
699
700 /* we will choose instead to hold a send right */
701 vs_count = vstruct_list.vsl_count;
702 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
703 if(vs == (vstruct_t)&vstruct_list) {
704 VSL_UNLOCK();
705 return KERN_SUCCESS;
706 }
707 VS_LOCK(vs);
708 vs_async_wait(vs); /* wait for any pending async writes */
709 if ((vs_count != 0) && (vs != NULL))
710 vs->vs_async_pending += 1; /* hold parties calling */
711 /* vs_async_wait */
712 VS_UNLOCK(vs);
713 VSL_UNLOCK();
714 while((vs_count != 0) && (vs != NULL)) {
715 /* We take the count of AMO's before beginning the */
716 /* transfer of the target segment. */
717 /* We are guaranteed that the target segment cannot get */
718 /* more users. We also know that queue entries are */
719 /* made at the back of the list. If some of the entries */
720 /* we would check disappear while we are traversing the */
721 /* list then we will either check new entries which */
722 /* do not have any backing store in the target segment */
723 /* or re-check old entries. This might not be optimal */
724 /* but it will always be correct. The alternative is to */
725 /* take a snapshot of the list. */
726 vstruct_t next_vs;
727
728 if(dp_pages_free < cluster_transfer_minimum)
729 error = KERN_FAILURE;
730 else {
731 vm_object_t transfer_object;
732 unsigned int count;
733 upl_t upl;
734
735 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
736 count = 0;
737 error = vm_object_upl_request(transfer_object,
738 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
739 &upl, NULL, &count,
740 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_LITE | UPL_SET_INTERNAL);
741
742 if(error == KERN_SUCCESS) {
743 error = ps_vstruct_transfer_from_segment(
744 vs, ps, upl);
745 upl_commit(upl, NULL, 0);
746 upl_deallocate(upl);
747 } else {
748 error = KERN_FAILURE;
749 }
750 vm_object_deallocate(transfer_object);
751 }
752 if(error || current_thread_aborted() || backing_store_stop_compaction) {
753 VS_LOCK(vs);
754 vs->vs_async_pending -= 1; /* release vs_async_wait */
755 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
756 vs->vs_waiting_async = FALSE;
757 VS_UNLOCK(vs);
758 thread_wakeup(&vs->vs_async_pending);
759 } else {
760 VS_UNLOCK(vs);
761 }
762 return KERN_FAILURE;
763 }
764
765 VSL_LOCK();
766
767 while(backing_store_release_trigger_disable != 0) {
768 VSL_SLEEP(&backing_store_release_trigger_disable,
769 THREAD_UNINT);
770 }
771
772 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
773 if((next_vs != (vstruct_t)&vstruct_list) &&
774 (vs != next_vs) && (vs_count != 1)) {
775 VS_LOCK(next_vs);
776 vs_async_wait(next_vs); /* wait for any */
777 /* pending async writes */
778 next_vs->vs_async_pending += 1; /* hold parties */
779 /* calling vs_async_wait */
780 VS_UNLOCK(next_vs);
781 }
782 VSL_UNLOCK();
783 VS_LOCK(vs);
784 vs->vs_async_pending -= 1;
785 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
786 vs->vs_waiting_async = FALSE;
787 VS_UNLOCK(vs);
788 thread_wakeup(&vs->vs_async_pending);
789 } else {
790 VS_UNLOCK(vs);
791 }
792 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
793 vs = NULL;
794 else
795 vs = next_vs;
796 vs_count--;
797 }
798 return KERN_SUCCESS;
799 }
800
801
802 kern_return_t
803 default_pager_backing_store_delete_internal(
804 MACH_PORT_FACE backing_store)
805 {
806 backing_store_t bs;
807 int i;
808 paging_segment_t ps;
809 int error;
810 int interim_pages_removed = 0;
811 boolean_t dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );
812
813 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
814 return KERN_INVALID_ARGUMENT;
815
816 restart:
817 PSL_LOCK();
818 error = KERN_SUCCESS;
819 for (i = 0; i <= paging_segment_max; i++) {
820 ps = paging_segments[i];
821 if (ps != PAGING_SEGMENT_NULL &&
822 ps->ps_bs == bs &&
823 ! IS_PS_GOING_AWAY(ps)) {
824 PS_LOCK(ps);
825
826 if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
827 /*
828 * Someone is already busy reclaiming this paging segment.
829 * If it's the emergency segment we are looking at, then check
830 * that someone has not already recovered it and set the right
831 * state, i.e. online but not activated.
832 */
833 PS_UNLOCK(ps);
834 continue;
835 }
836
837 /* disable access to this segment */
838 ps->ps_state &= ~PS_CAN_USE;
839 ps->ps_state |= PS_GOING_AWAY;
840 PS_UNLOCK(ps);
841 /*
842 * The "ps" segment is "off-line" now,
843 * we can try and delete it...
844 */
845 if(dp_pages_free < (cluster_transfer_minimum
846 + ps->ps_pgcount)) {
847 error = KERN_FAILURE;
848 PSL_UNLOCK();
849 }
850 else {
851 /* remove all pages associated with the */
852 /* segment from the list of free pages */
853 /* when transfer is through, all target */
854 /* segment pages will appear to be free */
855
856 dp_pages_free -= ps->ps_pgcount;
857 interim_pages_removed += ps->ps_pgcount;
858 PSL_UNLOCK();
859 error = ps_delete(ps);
860 }
861 if (error != KERN_SUCCESS) {
862 /*
863 * We couldn't delete the segment,
864 * probably because there's not enough
865 * virtual memory left.
866 * Re-enable all the segments.
867 */
868 PSL_LOCK();
869 break;
870 }
871 goto restart;
872 }
873 }
874
875 if (error != KERN_SUCCESS) {
876 for (i = 0; i <= paging_segment_max; i++) {
877 ps = paging_segments[i];
878 if (ps != PAGING_SEGMENT_NULL &&
879 ps->ps_bs == bs &&
880 IS_PS_GOING_AWAY(ps)) {
881 PS_LOCK(ps);
882
883 if( !IS_PS_GOING_AWAY(ps)) {
884 PS_UNLOCK(ps);
885 continue;
886 }
887 /* Handle the special clusters that came in while we let go of the lock */
888 if( ps->ps_special_clusters) {
889 dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
890 ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
891 ps->ps_clcount += ps->ps_special_clusters;
892 if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
893 ps_select_array[ps->ps_bs->bs_priority] = 0;
894 }
895 ps->ps_special_clusters = 0;
896 }
897 /* re-enable access to this segment */
898 ps->ps_state &= ~PS_GOING_AWAY;
899 ps->ps_state |= PS_CAN_USE;
900 PS_UNLOCK(ps);
901 }
902 }
903 dp_pages_free += interim_pages_removed;
904 PSL_UNLOCK();
905 BS_UNLOCK(bs);
906 return error;
907 }
908
909 for (i = 0; i <= paging_segment_max; i++) {
910 ps = paging_segments[i];
911 if (ps != PAGING_SEGMENT_NULL &&
912 ps->ps_bs == bs) {
913 if(IS_PS_GOING_AWAY(ps)) {
914 if(IS_PS_EMERGENCY_SEGMENT(ps)) {
915 PS_LOCK(ps);
916 ps->ps_state &= ~PS_GOING_AWAY;
917 ps->ps_special_clusters = 0;
918 ps->ps_pgcount = ps->ps_pgnum;
919 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
920 PS_UNLOCK(ps);
921 dp_pages_reserve += interim_pages_removed;
922 } else {
923 paging_segments[i] = PAGING_SEGMENT_NULL;
924 paging_segment_count--;
925 PS_LOCK(ps);
926 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
927 kfree(ps, sizeof *ps);
928 }
929 }
930 }
931 }
932
933 /* Scan the entire ps array separately to make certain we find the */
934 /* proper paging_segment_max */
935 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
936 if(paging_segments[i] != PAGING_SEGMENT_NULL)
937 paging_segment_max = i;
938 }
939
940 PSL_UNLOCK();
941
942 if( dealing_with_emergency_segment ) {
943 BS_UNLOCK(bs);
944 return KERN_SUCCESS;
945 }
946
947 /*
948 * All the segments have been deleted.
949 * We can remove the backing store.
950 */
951
952 /*
953 * Disable lookups of this backing store.
954 */
955 if((void *)bs->bs_port->alias != NULL)
956 kfree((void *) bs->bs_port->alias,
957 sizeof (struct vstruct_alias));
958 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
959 bs->bs_port = MACH_PORT_NULL;
960 BS_UNLOCK(bs);
961
962 /*
963 * Remove backing store from backing_store list.
964 */
965 BSL_LOCK();
966 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
967 bs_links);
968 BSL_UNLOCK();
969
970 /*
971 * Free the backing store structure.
972 */
973 kfree(bs, sizeof *bs);
974
975 return KERN_SUCCESS;
976 }
977
978 kern_return_t
979 default_pager_backing_store_delete(
980 MACH_PORT_FACE backing_store)
981 {
982 if( backing_store != emergency_segment_backing_store ) {
983 default_pager_backing_store_delete_internal(emergency_segment_backing_store);
984 }
985 return(default_pager_backing_store_delete_internal(backing_store));
986 }
987
988 int ps_enter(paging_segment_t); /* forward */
989
990 int
991 ps_enter(
992 paging_segment_t ps)
993 {
994 int i;
995
996 PSL_LOCK();
997
998 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
999 if (paging_segments[i] == PAGING_SEGMENT_NULL)
1000 break;
1001 }
1002
1003 if (i < MAX_NUM_PAGING_SEGMENTS) {
1004 paging_segments[i] = ps;
1005 if (i > paging_segment_max)
1006 paging_segment_max = i;
1007 paging_segment_count++;
1008 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
1009 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
1010 ps_select_array[ps->ps_bs->bs_priority] = 0;
1011 i = 0;
1012 } else {
1013 PSL_UNLOCK();
1014 return KERN_RESOURCE_SHORTAGE;
1015 }
1016
1017 PSL_UNLOCK();
1018 return i;
1019 }
1020
1021 #ifdef DEVICE_PAGING
1022 kern_return_t
1023 default_pager_add_segment(
1024 MACH_PORT_FACE backing_store,
1025 MACH_PORT_FACE device,
1026 recnum_t offset,
1027 recnum_t count,
1028 int record_size)
1029 {
1030 backing_store_t bs;
1031 paging_segment_t ps;
1032 int i;
1033 int error;
1034
1035 if ((bs = backing_store_lookup(backing_store))
1036 == BACKING_STORE_NULL)
1037 return KERN_INVALID_ARGUMENT;
1038
1039 PSL_LOCK();
1040 for (i = 0; i <= paging_segment_max; i++) {
1041 ps = paging_segments[i];
1042 if (ps == PAGING_SEGMENT_NULL)
1043 continue;
1044
1045 /*
1046 * Check for overlap on same device.
1047 */
1048 if (!(ps->ps_device != device
1049 || offset >= ps->ps_offset + ps->ps_recnum
1050 || offset + count <= ps->ps_offset)) {
1051 PSL_UNLOCK();
1052 BS_UNLOCK(bs);
1053 return KERN_INVALID_ARGUMENT;
1054 }
1055 }
1056 PSL_UNLOCK();
1057
1058 /*
1059 * Set up the paging segment
1060 */
1061 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1062 if (ps == PAGING_SEGMENT_NULL) {
1063 BS_UNLOCK(bs);
1064 return KERN_RESOURCE_SHORTAGE;
1065 }
1066
1067 ps->ps_segtype = PS_PARTITION;
1068 ps->ps_device = device;
1069 ps->ps_offset = offset;
1070 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1071 ps->ps_recnum = count;
1072 ps->ps_pgnum = count >> ps->ps_record_shift;
1073
1074 ps->ps_pgcount = ps->ps_pgnum;
1075 ps->ps_clshift = local_log2(bs->bs_clsize);
1076 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1077 ps->ps_hint = 0;
1078
1079 PS_LOCK_INIT(ps);
1080 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1081 if (!ps->ps_bmap) {
1082 kfree(ps, sizeof *ps);
1083 BS_UNLOCK(bs);
1084 return KERN_RESOURCE_SHORTAGE;
1085 }
1086 for (i = 0; i < ps->ps_ncls; i++) {
1087 clrbit(ps->ps_bmap, i);
1088 }
1089
1090 if(paging_segment_count == 0) {
1091 ps->ps_state = PS_EMERGENCY_SEGMENT;
1092 if(use_emergency_swap_file_first) {
1093 ps->ps_state |= PS_CAN_USE;
1094 }
1095 } else {
1096 ps->ps_state = PS_CAN_USE;
1097 }
1098
1099 ps->ps_bs = bs;
1100
1101 if ((error = ps_enter(ps)) != 0) {
1102 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1103 kfree(ps, sizeof *ps);
1104 BS_UNLOCK(bs);
1105 return KERN_RESOURCE_SHORTAGE;
1106 }
1107
1108 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1109 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1110 BS_UNLOCK(bs);
1111
1112 PSL_LOCK();
1113 if(IS_PS_OK_TO_USE(ps)) {
1114 dp_pages_free += ps->ps_pgcount;
1115 } else {
1116 dp_pages_reserve += ps->ps_pgcount;
1117 }
1118 PSL_UNLOCK();
1119
1120 bs_more_space(ps->ps_clcount);
1121
1122 DP_DEBUG(DEBUG_BS_INTERNAL,
1123 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1124 device, offset, count, record_size,
1125 ps->ps_record_shift, ps->ps_pgnum));
1126
1127 return KERN_SUCCESS;
1128 }
1129
1130 boolean_t
1131 bs_add_device(
1132 char *dev_name,
1133 MACH_PORT_FACE master)
1134 {
1135 security_token_t null_security_token = {
1136 { 0, 0 }
1137 };
1138 MACH_PORT_FACE device;
1139 int info[DEV_GET_SIZE_COUNT];
1140 mach_msg_type_number_t info_count;
1141 MACH_PORT_FACE bs = MACH_PORT_NULL;
1142 unsigned int rec_size;
1143 recnum_t count;
1144 int clsize;
1145 MACH_PORT_FACE reply_port;
1146
1147 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1148 null_security_token, dev_name, &device))
1149 return FALSE;
1150
1151 info_count = DEV_GET_SIZE_COUNT;
1152 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1153 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1154 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1155 clsize = bs_get_global_clsize(0);
1156 if (!default_pager_backing_store_create(
1157 default_pager_object,
1158 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1159 (clsize * vm_page_size),
1160 &bs)) {
1161 if (!default_pager_add_segment(bs, device,
1162 0, count, rec_size)) {
1163 return TRUE;
1164 }
1165 ipc_port_release_receive(bs);
1166 }
1167 }
1168
1169 ipc_port_release_send(device);
1170 return FALSE;
1171 }
1172 #endif /* DEVICE_PAGING */
1173
1174 #if VS_ASYNC_REUSE
1175
1176 struct vs_async *
1177 vs_alloc_async(void)
1178 {
1179 struct vs_async *vsa;
1180 MACH_PORT_FACE reply_port;
1181 // kern_return_t kr;
1182
1183 VS_ASYNC_LOCK();
1184 if (vs_async_free_list == NULL) {
1185 VS_ASYNC_UNLOCK();
1186 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1187 if (vsa != NULL) {
1188 /*
1189 * Try allocating a reply port named after the
1190 * address of the vs_async structure.
1191 */
1192 struct vstruct_alias *alias_struct;
1193
1194 reply_port = ipc_port_alloc_kernel();
1195 alias_struct = (struct vstruct_alias *)
1196 kalloc(sizeof (struct vstruct_alias));
1197 if(alias_struct != NULL) {
1198 alias_struct->vs = (struct vstruct *)vsa;
1199 alias_struct->name = &default_pager_ops;
1200 reply_port->alias = (uintptr_t) alias_struct;
1201 vsa->reply_port = reply_port;
1202 vs_alloc_async_count++;
1203 }
1204 else {
1205 vs_alloc_async_failed++;
1206 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1207 (reply_port));
1208 kfree(vsa, sizeof (struct vs_async));
1209 vsa = NULL;
1210 }
1211 }
1212 } else {
1213 vsa = vs_async_free_list;
1214 vs_async_free_list = vs_async_free_list->vsa_next;
1215 VS_ASYNC_UNLOCK();
1216 }
1217
1218 return vsa;
1219 }
1220
1221 void
1222 vs_free_async(
1223 struct vs_async *vsa)
1224 {
1225 VS_ASYNC_LOCK();
1226 vsa->vsa_next = vs_async_free_list;
1227 vs_async_free_list = vsa;
1228 VS_ASYNC_UNLOCK();
1229 }
1230
1231 #else /* VS_ASYNC_REUSE */
1232
1233 struct vs_async *
1234 vs_alloc_async(void)
1235 {
1236 struct vs_async *vsa;
1237 MACH_PORT_FACE reply_port;
1238 struct vstruct_alias *alias_struct;
1239
1240 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1241 if (vsa != NULL) {
1242 /*
1243 * Try allocating a reply port named after the
1244 * address of the vs_async structure.
1245 */
1246 reply_port = ipc_port_alloc_kernel();
1247 alias_struct = (struct vstruct_alias *)
1248 kalloc(sizeof (struct vstruct_alias));
1249 if(alias_struct != NULL) {
1250 alias_struct->vs = (struct vstruct *)vsa;
1251 alias_struct->name = &default_pager_ops;
1252 reply_port->alias = (uintptr_t) alias_struct;
1253 vsa->reply_port = reply_port;
1254 vs_alloc_async_count++;
1255 }
1256 else {
1257 vs_alloc_async_failed++;
1258 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1259 (reply_port));
1260 kfree(vsa, sizeof (struct vs_async));
1261 vsa = NULL;
1262 }
1263 }
1264
1265 return vsa;
1266 }
1267
1268 void
1269 vs_free_async(
1270 struct vs_async *vsa)
1271 {
1272 MACH_PORT_FACE reply_port;
1273 kern_return_t kr;
1274
1275 reply_port = vsa->reply_port;
1276 kfree((void *) reply_port->alias, sizeof (struct vstruct_alias));
1277 kfree(vsa, sizeof (struct vs_async));
1278 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1279 #if 0
1280 VS_ASYNC_LOCK();
1281 vs_alloc_async_count--;
1282 VS_ASYNC_UNLOCK();
1283 #endif
1284 }
1285
1286 #endif /* VS_ASYNC_REUSE */
1287
1288 zone_t vstruct_zone;
1289
1290 vstruct_t
1291 ps_vstruct_create(
1292 dp_size_t size)
1293 {
1294 vstruct_t vs;
1295 unsigned int i;
1296
1297 vs = (vstruct_t) zalloc(vstruct_zone);
1298 if (vs == VSTRUCT_NULL) {
1299 return VSTRUCT_NULL;
1300 }
1301
1302 VS_LOCK_INIT(vs);
1303
1304 /*
1305 * The following fields will be provided later.
1306 */
1307 vs->vs_pager_ops = NULL;
1308 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1309 vs->vs_references = 1;
1310 vs->vs_seqno = 0;
1311
1312 vs->vs_waiting_seqno = FALSE;
1313 vs->vs_waiting_read = FALSE;
1314 vs->vs_waiting_write = FALSE;
1315 vs->vs_waiting_async = FALSE;
1316
1317 vs->vs_readers = 0;
1318 vs->vs_writers = 0;
1319
1320 vs->vs_errors = 0;
1321
1322 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1323 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1324 vs->vs_async_pending = 0;
1325
1326 /*
1327 * Allocate the cluster map, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1328 * depending on the size of the memory object.
1329 */
1330 if (INDIRECT_CLMAP(vs->vs_size)) {
1331 vs->vs_imap = (struct vs_map **)
1332 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1333 vs->vs_indirect = TRUE;
1334 } else {
1335 vs->vs_dmap = (struct vs_map *)
1336 kalloc(CLMAP_SIZE(vs->vs_size));
1337 vs->vs_indirect = FALSE;
1338 }
1339 vs->vs_xfer_pending = FALSE;
1340 DP_DEBUG(DEBUG_VS_INTERNAL,
1341 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1342
1343 /*
1344 * Check to see that we got the space.
1345 */
1346 if (!vs->vs_dmap) {
1347 kfree(vs, sizeof *vs);
1348 return VSTRUCT_NULL;
1349 }
1350
1351 /*
1352 * Zero the indirect pointers, or clear the direct pointers.
1353 */
1354 if (vs->vs_indirect)
1355 memset(vs->vs_imap, 0,
1356 INDIRECT_CLMAP_SIZE(vs->vs_size));
1357 else
1358 for (i = 0; i < vs->vs_size; i++)
1359 VSM_CLR(vs->vs_dmap[i]);
1360
1361 VS_MAP_LOCK_INIT(vs);
1362
1363 bs_commit(vs->vs_size);
1364
1365 return vs;
1366 }
1367
1368 paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1369
1370 paging_segment_t
1371 ps_select_segment(
1372 unsigned int shift,
1373 int *psindex)
1374 {
1375 paging_segment_t ps;
1376 int i;
1377 int j;
1378
1379 /*
1380 * Optimize case where there's only one segment.
1381 * paging_segment_max will index the one and only segment.
1382 */
1383
1384 PSL_LOCK();
1385 if (paging_segment_count == 1) {
1386 paging_segment_t lps = PAGING_SEGMENT_NULL; /* used to avoid extra PS_UNLOCK */
1387 ipc_port_t trigger = IP_NULL;
1388
1389 ps = paging_segments[paging_segment_max];
1390 *psindex = paging_segment_max;
1391 PS_LOCK(ps);
1392 if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
1393 panic("Emergency paging segment missing\n");
1394 }
1395 ASSERT(ps->ps_clshift >= shift);
1396 if(IS_PS_OK_TO_USE(ps)) {
1397 if (ps->ps_clcount) {
1398 ps->ps_clcount--;
1399 dp_pages_free -= 1 << ps->ps_clshift;
1400 ps->ps_pgcount -= 1 << ps->ps_clshift;
1401 if(min_pages_trigger_port &&
1402 (dp_pages_free < minimum_pages_remaining)) {
1403 trigger = min_pages_trigger_port;
1404 min_pages_trigger_port = NULL;
1405 bs_low = TRUE;
1406 }
1407 lps = ps;
1408 }
1409 }
1410 PS_UNLOCK(ps);
1411
1412 if( lps == PAGING_SEGMENT_NULL ) {
1413 if(dp_pages_free) {
1414 dp_pages_free_drift_count++;
1415 if(dp_pages_free > dp_pages_free_drifted_max) {
1416 dp_pages_free_drifted_max = dp_pages_free;
1417 }
1418 dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1419 }
1420 dp_pages_free = 0;
1421 }
1422
1423 PSL_UNLOCK();
1424
1425 if (trigger != IP_NULL) {
1426 default_pager_space_alert(trigger, HI_WAT_ALERT);
1427 ipc_port_release_send(trigger);
1428 }
1429 return lps;
1430 }
1431
1432 if (paging_segment_count == 0) {
1433 if(dp_pages_free) {
1434 dp_pages_free_drift_count++;
1435 if(dp_pages_free > dp_pages_free_drifted_max) {
1436 dp_pages_free_drifted_max = dp_pages_free;
1437 }
1438 dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1439 }
1440 dp_pages_free = 0;
1441 PSL_UNLOCK();
1442 return PAGING_SEGMENT_NULL;
1443 }
1444
1445 for (i = BS_MAXPRI;
1446 i >= BS_MINPRI; i--) {
1447 int start_index;
1448
1449 if ((ps_select_array[i] == BS_NOPRI) ||
1450 (ps_select_array[i] == BS_FULLPRI))
1451 continue;
1452 start_index = ps_select_array[i];
1453
1454 if(!(paging_segments[start_index])) {
1455 j = start_index+1;
1456 physical_transfer_cluster_count = 0;
1457 }
1458 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1459 (((paging_segments[start_index])->ps_clshift)
1460 + vm_page_shift))) {
1461 physical_transfer_cluster_count = 0;
1462 j = start_index + 1;
1463 } else {
1464 physical_transfer_cluster_count+=1;
1465 j = start_index;
1466 if(start_index == 0)
1467 start_index = paging_segment_max;
1468 else
1469 start_index = start_index - 1;
1470 }
1471
1472 while (1) {
1473 if (j > paging_segment_max)
1474 j = 0;
1475 if ((ps = paging_segments[j]) &&
1476 (ps->ps_bs->bs_priority == i)) {
1477 /*
1478 * Force the ps cluster size to be
1479 * >= that of the vstruct.
1480 */
1481 PS_LOCK(ps);
1482 if (IS_PS_OK_TO_USE(ps)) {
1483 if ((ps->ps_clcount) &&
1484 (ps->ps_clshift >= shift)) {
1485 ipc_port_t trigger = IP_NULL;
1486
1487 ps->ps_clcount--;
1488 dp_pages_free -= 1 << ps->ps_clshift;
1489 ps->ps_pgcount -= 1 << ps->ps_clshift;
1490 if(min_pages_trigger_port &&
1491 (dp_pages_free <
1492 minimum_pages_remaining)) {
1493 trigger = min_pages_trigger_port;
1494 min_pages_trigger_port = NULL;
1495 }
1496 PS_UNLOCK(ps);
1497 /*
1498 * found one, quit looking.
1499 */
1500 ps_select_array[i] = j;
1501 PSL_UNLOCK();
1502
1503 if (trigger != IP_NULL) {
1504 default_pager_space_alert(
1505 trigger,
1506 HI_WAT_ALERT);
1507 ipc_port_release_send(trigger);
1508 }
1509 *psindex = j;
1510 return ps;
1511 }
1512 }
1513 PS_UNLOCK(ps);
1514 }
1515 if (j == start_index) {
1516 /*
1517 * none at this priority -- mark it full
1518 */
1519 ps_select_array[i] = BS_FULLPRI;
1520 break;
1521 }
1522 j++;
1523 }
1524 }
1525
1526 if(dp_pages_free) {
1527 dp_pages_free_drift_count++;
1528 if(dp_pages_free > dp_pages_free_drifted_max) {
1529 dp_pages_free_drifted_max = dp_pages_free;
1530 }
1531 dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
1532 }
1533 dp_pages_free = 0;
1534 PSL_UNLOCK();
1535 return PAGING_SEGMENT_NULL;
1536 }
1537
1538 dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1539
1540 dp_offset_t
1541 ps_allocate_cluster(
1542 vstruct_t vs,
1543 int *psindex,
1544 paging_segment_t use_ps)
1545 {
1546 unsigned int byte_num;
1547 int bit_num = 0;
1548 paging_segment_t ps;
1549 dp_offset_t cluster;
1550 ipc_port_t trigger = IP_NULL;
1551
1552 /*
1553 * Find best paging segment.
1554 * ps_select_segment will decrement cluster count on ps.
1555 * Must pass cluster shift to find the most appropriate segment.
1556 */
1557 /* NOTE: The addition of paging segment delete capability threatened
1558 * to seriously complicate the treatment of paging segments in this
1559 * module and the ones that call it (notably ps_clmap), because of the
1560 * difficulty in assuring that the paging segment would continue to
1561 * exist between being unlocked and locked. This was
1562 * avoided because all calls to this module are based either in
1563 * dp_memory_object calls, which rely on the vs lock, or in
1564 * the transfer function, which is part of the segment delete path.
1565 * The transfer function which is part of paging segment delete is
1566 * protected from multiple callers by the backing store lock.
1567 * The paging segment delete function treats mappings to a paging
1568 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1569 * while data is transferred to the remaining segments. This is in
1570 * line with the view that incomplete or in-transition mappings between
1571 * data, a vstruct, and backing store are protected by the vs lock.
1572 * This and the ordering of the paging segment "going_away" bit setting
1573 * protects us.
1574 */
1575 retry:
1576 if (use_ps != PAGING_SEGMENT_NULL) {
1577 ps = use_ps;
1578 PSL_LOCK();
1579 PS_LOCK(ps);
1580
1581 ASSERT(ps->ps_clcount != 0);
1582
1583 ps->ps_clcount--;
1584 dp_pages_free -= 1 << ps->ps_clshift;
1585 ps->ps_pgcount -= 1 << ps->ps_clshift;
1586 if(min_pages_trigger_port &&
1587 (dp_pages_free < minimum_pages_remaining)) {
1588 trigger = min_pages_trigger_port;
1589 min_pages_trigger_port = NULL;
1590 }
1591 PSL_UNLOCK();
1592 PS_UNLOCK(ps);
1593 if (trigger != IP_NULL) {
1594 default_pager_space_alert(trigger, HI_WAT_ALERT);
1595 ipc_port_release_send(trigger);
1596 }
1597
1598 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1599 PAGING_SEGMENT_NULL) {
1600 static clock_sec_t lastnotify = 0;
1601 clock_sec_t now;
1602 clock_nsec_t nanoseconds_dummy;
1603
1604 /*
1605 * Don't immediately jump to the emergency segment. Give the
1606 * dynamic pager a chance to create its first normal swap file.
1607 * Unless, of course, the very first normal swap file can't be
1608 * created due to some problem that we didn't expect, i.e.
1609 * use_emergency_swap_file_first was never set to true initially;
1610 * it then gets set in the swap file creation error handling.
1611 */
1612 if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {
1613
1614 ps = paging_segments[EMERGENCY_PSEG_INDEX];
1615 if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
1616 PSL_LOCK();
1617 PS_LOCK(ps);
1618
1619 if(IS_PS_GOING_AWAY(ps)) {
1620 /* Someone de-activated the emergency paging segment*/
1621 PS_UNLOCK(ps);
1622 PSL_UNLOCK();
1623
1624 } else if(dp_pages_free) {
1625 /*
1626 * Someone has already activated the emergency paging segment
1627 * OR
1628 * Between us having received a NULL segment from ps_select_segment
1629 * and reaching here, a new normal segment could have been added.
1630 * E.g. we get NULL segment and another thread just added the
1631 * new swap file. Hence check to see if we have more dp_pages_free
1632 * before activating the emergency segment.
1633 */
1634 PS_UNLOCK(ps);
1635 PSL_UNLOCK();
1636 goto retry;
1637
1638 } else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
1639 /*
1640 * PS_CAN_USE is only reset from the emergency segment when it's
1641 * been successfully recovered. So it's legal to have an emergency
1642 * segment that has PS_CAN_USE but no clusters because it's recovery
1643 * failed.
1644 */
1645 backing_store_t bs = ps->ps_bs;
1646 ps->ps_state |= PS_CAN_USE;
1647 if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
1648 ps_select_array[bs->bs_priority] == BS_NOPRI) {
1649 ps_select_array[bs->bs_priority] = 0;
1650 }
1651 dp_pages_free += ps->ps_pgcount;
1652 dp_pages_reserve -= ps->ps_pgcount;
1653 PS_UNLOCK(ps);
1654 PSL_UNLOCK();
1655 dprintf(("Switching ON Emergency paging segment\n"));
1656 goto retry;
1657 }
1658
1659 PS_UNLOCK(ps);
1660 PSL_UNLOCK();
1661 }
1662 }
1663
1664 /*
1665 * Emit a notification of the low-paging resource condition
1666 * but don't issue it more than once every five seconds. This
1667 * prevents us from overflowing logs with thousands of
1668 * repetitions of the message.
1669 */
1670 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1671 if (paging_segment_count > 1 && (now > lastnotify + 5)) {
1672 /* With an activated emergency paging segment we still
1673 * didn't get any clusters. This could mean that the
1674 * emergency paging segment is exhausted.
1675 */
1676 dprintf(("System is out of paging space.\n"));
1677 lastnotify = now;
1678 }
1679
1680 PSL_LOCK();
1681
1682 if(min_pages_trigger_port) {
1683 trigger = min_pages_trigger_port;
1684 min_pages_trigger_port = NULL;
1685 bs_low = TRUE;
1686 }
1687 PSL_UNLOCK();
1688 if (trigger != IP_NULL) {
1689 default_pager_space_alert(trigger, HI_WAT_ALERT);
1690 ipc_port_release_send(trigger);
1691 }
1692 return (dp_offset_t) -1;
1693 }
1694
1695 /*
1696 * Look for an available cluster. At the end of the loop,
1697 * byte_num is the byte offset and bit_num is the bit offset of the
1698 * first zero bit in the paging segment bitmap.
1699 */
1700 PS_LOCK(ps);
1701 byte_num = ps->ps_hint;
1702 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1703 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1704 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1705 if (isclr((ps->ps_bmap + byte_num), bit_num))
1706 break;
1707 }
1708 ASSERT(bit_num != NBBY);
1709 break;
1710 }
1711 }
1712 ps->ps_hint = byte_num;
1713 cluster = (byte_num*NBBY) + bit_num;
1714
1715 /* Space was reserved, so this must be true */
1716 ASSERT(cluster < ps->ps_ncls);
1717
1718 setbit(ps->ps_bmap, cluster);
1719 PS_UNLOCK(ps);
1720
1721 return cluster;
1722 }
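
/*
 * Worked example of the bitmap search above: if the first byte of
 * ps_bmap with a clear bit is byte_num = 3 and the first clear bit in it
 * is bit_num = 5, the allocated cluster is 3 * NBBY + 5 == 29 (NBBY being
 * the number of bits per byte, 8). A byte equal to BYTEMASK has all of
 * its clusters in use and is skipped.
 */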
1723
1724 void ps_deallocate_cluster(paging_segment_t, dp_offset_t); /* forward */
1725
1726 void
1727 ps_deallocate_cluster(
1728 paging_segment_t ps,
1729 dp_offset_t cluster)
1730 {
1731
1732 if (cluster >= ps->ps_ncls)
1733 panic("ps_deallocate_cluster: Invalid cluster number");
1734
1735 /*
1736 * Lock the paging segment, clear the cluster's bit in the bitmap and
1737 * increment the number of free clusters.
1738 */
1739 PSL_LOCK();
1740 PS_LOCK(ps);
1741 clrbit(ps->ps_bmap, cluster);
1742 if( IS_PS_OK_TO_USE(ps)) {
1743 ++ps->ps_clcount;
1744 ps->ps_pgcount += 1 << ps->ps_clshift;
1745 dp_pages_free += 1 << ps->ps_clshift;
1746 } else {
1747 ps->ps_special_clusters += 1;
1748 }
1749
1750 /*
1751 * Move the hint down to the freed cluster if it is
1752 * less than the current hint.
1753 */
1754 if ((cluster/NBBY) < ps->ps_hint) {
1755 ps->ps_hint = (cluster/NBBY);
1756 }
1757
1758
1759 /*
1760 * If we're freeing space on a full priority, reset the array.
1761 */
1762 if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1763 ps_select_array[ps->ps_bs->bs_priority] = 0;
1764 PS_UNLOCK(ps);
1765 PSL_UNLOCK();
1766
1767 return;
1768 }
1769
1770 void ps_dealloc_vsmap(struct vs_map *, dp_size_t); /* forward */
1771
1772 void
1773 ps_dealloc_vsmap(
1774 struct vs_map *vsmap,
1775 dp_size_t size)
1776 {
1777 unsigned int i;
1778 for (i = 0; i < size; i++)
1779 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1780 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1781 VSM_CLOFF(vsmap[i]));
1782 }
1783
1784 void
1785 ps_vstruct_dealloc(
1786 vstruct_t vs)
1787 {
1788 unsigned int i;
1789 // spl_t s;
1790
1791 VS_MAP_LOCK(vs);
1792
1793 /*
1794 * If this is an indirect structure, then we walk through the valid
1795 * (non-zero) indirect pointers and deallocate the clusters
1796 * associated with each used map entry (via ps_dealloc_vsmap).
1797 * When all of the clusters in an indirect block have been
1798 * freed, we deallocate the block. When all of the indirect
1799 * blocks have been deallocated we deallocate the memory
1800 * holding the indirect pointers.
1801 */
1802 if (vs->vs_indirect) {
1803 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1804 if (vs->vs_imap[i] != NULL) {
1805 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1806 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1807 }
1808 }
1809 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1810 } else {
1811 /*
1812 * Direct map. Free used clusters, then memory.
1813 */
1814 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1815 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1816 }
1817 VS_MAP_UNLOCK(vs);
1818
1819 bs_commit(- vs->vs_size);
1820
1821 zfree(vstruct_zone, vs);
1822 }
1823
1824 int ps_map_extend(vstruct_t, unsigned int); /* forward */
1825
1826 int ps_map_extend(
1827 vstruct_t vs,
1828 unsigned int new_size)
1829 {
1830 struct vs_map **new_imap;
1831 struct vs_map *new_dmap = NULL;
1832 int newdsize;
1833 int i;
1834 void *old_map = NULL;
1835 int old_map_size = 0;
1836
1837 if (vs->vs_size >= new_size) {
1838 /*
1839 * Someone has already done the work.
1840 */
1841 return 0;
1842 }
1843
1844 /*
1845 * If the new size extends into the indirect range, then we have one
1846 * of two cases: we are going from indirect to indirect, or we are
1847 * going from direct to indirect. If we are going from indirect to
1848 * indirect, then it is possible that the new size will fit in the old
1849 * indirect map. If this is the case, then just reset the size of the
1850 * vstruct map and we are done. If the new size will not
1851 * fit into the old indirect map, then we have to allocate a new
1852 * indirect map and copy the old map pointers into this new map.
1853 *
1854 * If we are going from direct to indirect, then we have to allocate a
1855 * new indirect map and copy the old direct pages into the first
1856 * indirect page of the new map.
1857 * NOTE: allocating memory here is dangerous, as we're in the
1858 * pageout path.
1859 */
1860 if (INDIRECT_CLMAP(new_size)) {
1861 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1862
1863 /*
1864 * Get a new indirect map and zero it.
1865 */
1866 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1867 if (vs->vs_indirect &&
1868 (new_map_size == old_map_size)) {
1869 bs_commit(new_size - vs->vs_size);
1870 vs->vs_size = new_size;
1871 return 0;
1872 }
1873
1874 new_imap = (struct vs_map **)kalloc(new_map_size);
1875 if (new_imap == NULL) {
1876 return -1;
1877 }
1878 memset(new_imap, 0, new_map_size);
1879
1880 if (vs->vs_indirect) {
1881 /* Copy old entries into new map */
1882 memcpy(new_imap, vs->vs_imap, old_map_size);
1883 /* Arrange to free the old map */
1884 old_map = (void *) vs->vs_imap;
1885 newdsize = 0;
1886 } else { /* Old map was a direct map */
1887 /* Allocate an indirect page */
1888 if ((new_imap[0] = (struct vs_map *)
1889 kalloc(CLMAP_THRESHOLD)) == NULL) {
1890 kfree(new_imap, new_map_size);
1891 return -1;
1892 }
1893 new_dmap = new_imap[0];
1894 newdsize = CLMAP_ENTRIES;
1895 }
1896 } else {
1897 new_imap = NULL;
1898 newdsize = new_size;
1899 /*
1900 * If the new map is a direct map, then the old map must
1901 * also have been a direct map. All we have to do is
1902 * to allocate a new direct map, copy the old entries
1903 * into it and free the old map.
1904 */
1905 if ((new_dmap = (struct vs_map *)
1906 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1907 return -1;
1908 }
1909 }
1910 if (newdsize) {
1911
1912 /* Free the old map */
1913 old_map = (void *) vs->vs_dmap;
1914 old_map_size = CLMAP_SIZE(vs->vs_size);
1915
1916 /* Copy info from the old map into the new map */
1917 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1918
1919 /* Initialize the rest of the new map */
1920 for (i = vs->vs_size; i < newdsize; i++)
1921 VSM_CLR(new_dmap[i]);
1922 }
1923 if (new_imap) {
1924 vs->vs_imap = new_imap;
1925 vs->vs_indirect = TRUE;
1926 } else
1927 vs->vs_dmap = new_dmap;
1928 bs_commit(new_size - vs->vs_size);
1929 vs->vs_size = new_size;
1930 if (old_map)
1931 kfree(old_map, old_map_size);
1932 return 0;
1933 }
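
/*
 * Caller's sketch (cf. ps_clmap() below): the map is grown lazily when a
 * write targets a cluster index beyond the current map size.
 */
#if 0
	if (cluster >= vs->vs_size) {
		if (ps_map_extend(vs, cluster + 1)) {
			/* out of memory: the caller fails the operation */
			return (dp_offset_t) -1;
		}
	}
#endif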
1934
1935 dp_offset_t
1936 ps_clmap(
1937 vstruct_t vs,
1938 dp_offset_t offset,
1939 struct clmap *clmap,
1940 int flag,
1941 dp_size_t size,
1942 int error)
1943 {
1944 dp_offset_t cluster; /* The cluster of offset. */
1945 dp_offset_t newcl; /* The new cluster allocated. */
1946 dp_offset_t newoff;
1947 unsigned int i;
1948 struct vs_map *vsmap;
1949
1950 VS_MAP_LOCK(vs);
1951
1952 ASSERT(vs->vs_dmap);
1953 cluster = atop_32(offset) >> vs->vs_clshift;
1954
1955 /*
1956 * Initialize cluster error value
1957 */
1958 clmap->cl_error = 0;
1959
1960 /*
1961 * If the object has grown, extend the page map.
1962 */
1963 if (cluster >= vs->vs_size) {
1964 if (flag == CL_FIND) {
1965 /* Do not allocate if just doing a lookup */
1966 VS_MAP_UNLOCK(vs);
1967 return (dp_offset_t) -1;
1968 }
1969 if (ps_map_extend(vs, cluster + 1)) {
1970 VS_MAP_UNLOCK(vs);
1971 return (dp_offset_t) -1;
1972 }
1973 }
1974
1975 /*
1976 * Look for the desired cluster. If the map is indirect, then we
1977 * have a two level lookup. First find the indirect block, then
1978 * find the actual cluster. If the indirect block has not yet
1979 * been allocated, then do so. If the cluster has not yet been
1980 * allocated, then do so.
1981 *
1982 * If any of the allocations fail, then return an error.
1983 * Don't allocate if just doing a lookup.
1984 */
1985 if (vs->vs_indirect) {
1986 long ind_block = cluster/CLMAP_ENTRIES;
1987
1988 /* Is the indirect block allocated? */
1989 vsmap = vs->vs_imap[ind_block];
1990 if (vsmap == NULL) {
1991 if (flag == CL_FIND) {
1992 VS_MAP_UNLOCK(vs);
1993 return (dp_offset_t) -1;
1994 }
1995
1996 /* Allocate the indirect block */
1997 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1998 if (vsmap == NULL) {
1999 VS_MAP_UNLOCK(vs);
2000 return (dp_offset_t) -1;
2001 }
2002 /* Initialize the cluster offsets */
2003 for (i = 0; i < CLMAP_ENTRIES; i++)
2004 VSM_CLR(vsmap[i]);
2005 vs->vs_imap[ind_block] = vsmap;
2006 }
2007 } else
2008 vsmap = vs->vs_dmap;
2009
2010 ASSERT(vsmap);
2011 vsmap += cluster%CLMAP_ENTRIES;
2012
2013 /*
2014 * At this point, vsmap points to the struct vs_map desired.
2015 *
2016 * Look in the map for the cluster, if there was an error on a
2017 * previous write, flag it and return. If it is not yet
2018 * allocated, then allocate it, if we're writing; if we're
2019 * doing a lookup and the cluster's not allocated, return error.
2020 */
2021 if (VSM_ISERR(*vsmap)) {
2022 clmap->cl_error = VSM_GETERR(*vsmap);
2023 VS_MAP_UNLOCK(vs);
2024 return (dp_offset_t) -1;
2025 } else if (VSM_ISCLR(*vsmap)) {
2026 int psindex;
2027
2028 if (flag == CL_FIND) {
2029 /*
2030 * If there's an error and the entry is clear, then
2031 * we've run out of swap space. Record the error
2032 * here and return.
2033 */
2034 if (error) {
2035 VSM_SETERR(*vsmap, error);
2036 }
2037 VS_MAP_UNLOCK(vs);
2038 return (dp_offset_t) -1;
2039 } else {
2040 /*
2041 * Attempt to allocate a cluster from the paging segment
2042 */
2043 newcl = ps_allocate_cluster(vs, &psindex,
2044 PAGING_SEGMENT_NULL);
2045 if (newcl == (dp_offset_t) -1) {
2046 VS_MAP_UNLOCK(vs);
2047 return (dp_offset_t) -1;
2048 }
2049 VSM_CLR(*vsmap);
2050 VSM_SETCLOFF(*vsmap, newcl);
2051 VSM_SETPS(*vsmap, psindex);
2052 }
2053 } else
2054 newcl = VSM_CLOFF(*vsmap);
2055
2056 /*
2057 * Fill in pertinent fields of the clmap
2058 */
2059 clmap->cl_ps = VSM_PS(*vsmap);
2060 clmap->cl_numpages = VSCLSIZE(vs);
2061 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
2062
2063 /*
2064 * Byte offset in paging segment is byte offset to cluster plus
2065 * byte offset within cluster. It looks ugly, but should be
2066 * relatively quick.
2067 */
2068 ASSERT(trunc_page(offset) == offset);
2069 newcl = ptoa_32(newcl) << vs->vs_clshift;
2070 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
2071 if (flag == CL_ALLOC) {
2072 /*
2073 * set bits in the allocation bitmap according to which
2074 * pages were requested. size is in bytes.
2075 */
2076 i = atop_32(newoff);
2077 while ((size > 0) && (i < VSCLSIZE(vs))) {
2078 VSM_SETALLOC(*vsmap, i);
2079 i++;
2080 size -= vm_page_size;
2081 }
2082 }
2083 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
2084 if (newoff) {
2085 /*
2086 * Offset is not cluster aligned, so number of pages
2087 * and bitmaps must be adjusted
2088 */
2089 clmap->cl_numpages -= atop_32(newoff);
2090 CLMAP_SHIFT(clmap, vs);
2091 CLMAP_SHIFTALLOC(clmap, vs);
2092 }
2093
2094 /*
2095 *
2096 * The setting of valid bits and handling of write errors
2097 * must be done here, while we hold the lock on the map.
2098 * It logically should be done in ps_vs_write_complete().
2099 * The size and error information has been passed from
2100 * ps_vs_write_complete(). If the size parameter is non-zero,
2101 * then there is work to be done. If error is also non-zero,
2102 * then the error number is recorded in the cluster and the
2103 * entire cluster is in error.
2104 */
2105 if (size && flag == CL_FIND) {
2106 dp_offset_t off = (dp_offset_t) 0;
2107
2108 if (!error) {
2109 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
2110 i++) {
2111 VSM_SETPG(*vsmap, i);
2112 size -= vm_page_size;
2113 }
2114 ASSERT(i <= VSCLSIZE(vs));
2115 } else {
2116 BS_STAT(clmap->cl_ps->ps_bs,
2117 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
2118 atop_32(size));
2119 off = VSM_CLOFF(*vsmap);
2120 VSM_SETERR(*vsmap, error);
2121 }
2122 /*
2123 * Deallocate cluster if error, and no valid pages
2124 * already present.
2125 */
2126 if (off != (dp_offset_t) 0)
2127 ps_deallocate_cluster(clmap->cl_ps, off);
2128 VS_MAP_UNLOCK(vs);
2129 return (dp_offset_t) 0;
2130 } else
2131 VS_MAP_UNLOCK(vs);
2132
2133 DP_DEBUG(DEBUG_VS_INTERNAL,
2134 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
2135 newcl+newoff, (int) vs, (int) vsmap, flag));
2136 DP_DEBUG(DEBUG_VS_INTERNAL,
2137 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
2138 (int) clmap->cl_ps, clmap->cl_numpages,
2139 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
2140
2141 return (newcl + newoff);
2142 }
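/*
 * Worked example for ps_clmap() (illustrative only; assumes 4 KB pages,
 * i.e. vm_page_shift == 12, and vs_clshift == 2, i.e. 16 KB clusters):
 *
 *	offset  = 0x6000
 *	cluster = atop_32(0x6000) >> 2           = 6 >> 2 = 1
 *	newoff  = 0x6000 & ((1 << (12+2)) - 1)   = 0x2000
 *
 * The page therefore lives in cluster 1 of the vstruct, 0x2000 bytes into
 * that cluster's backing-store allocation, and in the normal lookup or
 * allocation case the routine returns the cluster's byte offset within
 * the paging segment plus 0x2000.
 */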
2143
2144 void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t); /* forward */
2145
2146 void
2147 ps_clunmap(
2148 vstruct_t vs,
2149 dp_offset_t offset,
2150 dp_size_t length)
2151 {
2152 dp_offset_t cluster; /* The cluster number of offset */
2153 struct vs_map *vsmap;
2154
2155 VS_MAP_LOCK(vs);
2156
2157 /*
2158 * Loop through all clusters in this range, freeing paging segment
2159 * clusters and map entries as encountered.
2160 */
2161 while (length > 0) {
2162 dp_offset_t newoff;
2163 unsigned int i;
2164
2165 cluster = atop_32(offset) >> vs->vs_clshift;
2166 if (vs->vs_indirect) /* indirect map */
2167 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2168 else
2169 vsmap = vs->vs_dmap;
2170 if (vsmap == NULL) {
2171 VS_MAP_UNLOCK(vs);
2172 return;
2173 }
2174 vsmap += cluster%CLMAP_ENTRIES;
2175 if (VSM_ISCLR(*vsmap)) {
2176 length -= vm_page_size;
2177 offset += vm_page_size;
2178 continue;
2179 }
2180 /*
2181 * We've got a valid mapping. Clear it and deallocate
2182 * paging segment cluster pages.
2183 * Optimize for clearing an entire cluster.
2184 */
2185 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2186 /*
2187 * Not cluster aligned.
2188 */
2189 ASSERT(trunc_page(newoff) == newoff);
2190 i = atop_32(newoff);
2191 } else
2192 i = 0;
2193 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2194 VSM_CLRPG(*vsmap, i);
2195 VSM_CLRALLOC(*vsmap, i);
2196 length -= vm_page_size;
2197 offset += vm_page_size;
2198 i++;
2199 }
2200
2201 /*
2202 * If map entry is empty, clear and deallocate cluster.
2203 */
2204 if (!VSM_ALLOC(*vsmap)) {
2205 ps_deallocate_cluster(VSM_PS(*vsmap),
2206 VSM_CLOFF(*vsmap));
2207 VSM_CLR(*vsmap);
2208 }
2209 }
2210
2211 VS_MAP_UNLOCK(vs);
2212 }
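/*
 * Worked example for ps_clunmap() (illustrative only; same 4 KB page /
 * 16 KB cluster assumptions as above): a call with offset == 0x5000 and
 * length == 0x2000 lands in cluster 1 with an intra-cluster offset of
 * 0x1000, so pages 1 and 2 of that cluster have their "present" and
 * "allocated" bits cleared.  The backing-store cluster itself is returned
 * to the paging segment only when no allocated pages remain in the entry.
 */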
2213
2214 void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */
2215
2216 void
2217 ps_vs_write_complete(
2218 vstruct_t vs,
2219 dp_offset_t offset,
2220 dp_size_t size,
2221 int error)
2222 {
2223 struct clmap clmap;
2224
2225 /*
2226 * Get the struct vsmap for this cluster.
2227 * Use CL_FIND (a lookup), even though it was written, because the
2228 * cluster MUST be present, unless there was an error
2229 * in the original ps_clmap (e.g. no space), in which
2230 * case, nothing happens.
2231 *
2232 * Must pass enough information to ps_clmap to allow it
2233 * to set the vs_map structure bitmap under lock.
2234 */
2235 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2236 }
2237
2238 void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int); /* forward */
2239
2240 void
2241 vs_cl_write_complete(
2242 vstruct_t vs,
2243 __unused paging_segment_t ps,
2244 dp_offset_t offset,
2245 __unused vm_offset_t addr,
2246 dp_size_t size,
2247 boolean_t async,
2248 int error)
2249 {
2250 // kern_return_t kr;
2251
2252 if (error) {
2253 /*
2254 * For internal objects, the error is recorded on a
2255 * per-cluster basis by ps_clmap() which is called
2256 * by ps_vs_write_complete() below.
2257 */
2258 dprintf(("write failed error = 0x%x\n", error));
2259 /* add upl_abort code here */
2260 } else
2261 GSTAT(global_stats.gs_pages_out += atop_32(size));
2262 /*
2263 * Notify the vstruct mapping code, so it can do its accounting.
2264 */
2265 ps_vs_write_complete(vs, offset, size, error);
2266
2267 if (async) {
2268 VS_LOCK(vs);
2269 ASSERT(vs->vs_async_pending > 0);
2270 vs->vs_async_pending -= size;
2271 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2272 vs->vs_waiting_async = FALSE;
2273 VS_UNLOCK(vs);
2274 thread_wakeup(&vs->vs_async_pending);
2275 } else {
2276 VS_UNLOCK(vs);
2277 }
2278 }
2279 }
2280
2281 #ifdef DEVICE_PAGING
2282 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2283
2284 kern_return_t
2285 device_write_reply(
2286 MACH_PORT_FACE reply_port,
2287 kern_return_t device_code,
2288 io_buf_len_t bytes_written)
2289 {
2290 struct vs_async *vsa;
2291
2292 vsa = (struct vs_async *)
2293 ((struct vstruct_alias *)(reply_port->alias))->vs;
2294
2295 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2296 device_code = KERN_FAILURE;
2297 }
2298
2299 vsa->vsa_error = device_code;
2300
2301
2302 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2303 if(vsa->vsa_flags & VSA_TRANSFER) {
2304 /* revisit when async disk segments redone */
2305 if(vsa->vsa_error) {
2306 /* need to consider error condition. re-write data or */
2307 /* throw it away here. */
2308 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2309 }
2310 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2311 vsa->vsa_size, vsa->vsa_error);
2312 } else {
2313 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2314 vsa->vsa_addr, vsa->vsa_size, TRUE,
2315 vsa->vsa_error);
2316 }
2317 VS_FREE_ASYNC(vsa);
2318
2319 return KERN_SUCCESS;
2320 }
2321
2322 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2323 kern_return_t
2324 device_write_reply_inband(
2325 MACH_PORT_FACE reply_port,
2326 kern_return_t return_code,
2327 io_buf_len_t bytes_written)
2328 {
2329 panic("device_write_reply_inband: illegal");
2330 return KERN_SUCCESS;
2331 }
2332
2333 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2334 kern_return_t
2335 device_read_reply(
2336 MACH_PORT_FACE reply_port,
2337 kern_return_t return_code,
2338 io_buf_ptr_t data,
2339 mach_msg_type_number_t dataCnt)
2340 {
2341 struct vs_async *vsa;
2342 vsa = (struct vs_async *)
2343 ((struct vstruct_alias *)(reply_port->alias))->vs;
2344 vsa->vsa_addr = (vm_offset_t)data;
2345 vsa->vsa_size = (vm_size_t)dataCnt;
2346 vsa->vsa_error = return_code;
2347 thread_wakeup(&vsa);
2348 return KERN_SUCCESS;
2349 }
2350
2351 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2352 kern_return_t
2353 device_read_reply_inband(
2354 MACH_PORT_FACE reply_port,
2355 kern_return_t return_code,
2356 io_buf_ptr_inband_t data,
2357 mach_msg_type_number_t dataCnt)
2358 {
2359 panic("device_read_reply_inband: illegal");
2360 return KERN_SUCCESS;
2361 }
2362
2363 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2364 kern_return_t
2365 device_read_reply_overwrite(
2366 MACH_PORT_FACE reply_port,
2367 kern_return_t return_code,
2368 io_buf_len_t bytes_read)
2369 {
2370 panic("device_read_reply_overwrite: illegal\n");
2371 return KERN_SUCCESS;
2372 }
2373
2374 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2375 kern_return_t
2376 device_open_reply(
2377 MACH_PORT_FACE reply_port,
2378 kern_return_t return_code,
2379 MACH_PORT_FACE device_port)
2380 {
2381 panic("device_open_reply: illegal\n");
2382 return KERN_SUCCESS;
2383 }
2384
2385 kern_return_t
2386 ps_read_device(
2387 paging_segment_t ps,
2388 dp_offset_t offset,
2389 vm_offset_t *bufferp,
2390 unsigned int size,
2391 unsigned int *residualp,
2392 int flags)
2393 {
2394 kern_return_t kr;
2395 recnum_t dev_offset;
2396 unsigned int bytes_wanted;
2397 unsigned int bytes_read;
2398 unsigned int total_read;
2399 vm_offset_t dev_buffer;
2400 vm_offset_t buf_ptr;
2401 unsigned int records_read;
2402 struct vs_async *vsa;
2403
2404 device_t device;
2405 vm_map_copy_t device_data = NULL;
2406 default_pager_thread_t *dpt = NULL;
2407
2408 device = dev_port_lookup(ps->ps_device);
2409 clustered_reads[atop_32(size)]++;
2410
2411 dev_offset = (ps->ps_offset +
2412 (offset >> (vm_page_shift - ps->ps_record_shift)));
2413 bytes_wanted = size;
2414 total_read = 0;
2415 *bufferp = (vm_offset_t)NULL;
2416
2417 do {
2418 vsa = VS_ALLOC_ASYNC();
2419 if (vsa) {
2420 vsa->vsa_vs = NULL;
2421 vsa->vsa_addr = 0;
2422 vsa->vsa_offset = 0;
2423 vsa->vsa_size = 0;
2424 vsa->vsa_ps = NULL;
2425 }
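		/*
		 * Note: the code below assumes VS_ALLOC_ASYNC() returned a
		 * valid vs_async; vsa->reply_port is dereferenced without
		 * re-checking vsa for NULL.
		 */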
2426 ip_lock(vsa->reply_port);
2427 vsa->reply_port->ip_sorights++;
2428 ip_reference(vsa->reply_port);
2429 ip_unlock(vsa->reply_port);
2430 kr = ds_device_read_common(device,
2431 vsa->reply_port,
2432 (mach_msg_type_name_t)
2433 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2434 (dev_mode_t) 0,
2435 dev_offset,
2436 bytes_wanted,
2437 (IO_READ | IO_CALL),
2438 (io_buf_ptr_t *) &dev_buffer,
2439 (mach_msg_type_number_t *) &bytes_read);
2440 if(kr == MIG_NO_REPLY) {
2441 assert_wait(&vsa, THREAD_UNINT);
2442 thread_block(THREAD_CONTINUE_NULL);
2443
2444 dev_buffer = vsa->vsa_addr;
2445 bytes_read = (unsigned int)vsa->vsa_size;
2446 kr = vsa->vsa_error;
2447 }
2448 VS_FREE_ASYNC(vsa);
2449 if (kr != KERN_SUCCESS || bytes_read == 0) {
2450 break;
2451 }
2452 total_read += bytes_read;
2453
2454 /*
2455 * If we got the entire range, use the returned dev_buffer.
2456 */
2457 if (bytes_read == size) {
2458 *bufferp = (vm_offset_t)dev_buffer;
2459 break;
2460 }
2461
2462 #if 1
2463 dprintf(("read only %d bytes out of %d\n",
2464 bytes_read, bytes_wanted));
2465 #endif
2466 if(dpt == NULL) {
2467 dpt = get_read_buffer();
2468 buf_ptr = dpt->dpt_buffer;
2469 *bufferp = (vm_offset_t)buf_ptr;
2470 }
2471 /*
2472 * Otherwise, copy the data into the provided buffer (*bufferp)
2473 * and append the rest of the range as it comes in.
2474 */
2475 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2476 buf_ptr += bytes_read;
2477 bytes_wanted -= bytes_read;
2478 records_read = (bytes_read >>
2479 (vm_page_shift - ps->ps_record_shift));
2480 dev_offset += records_read;
2481 DP_DEBUG(DEBUG_VS_INTERNAL,
2482 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2483 dev_buffer, bytes_read));
2484 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2485 != KERN_SUCCESS)
2486 Panic("dealloc buf");
2487 } while (bytes_wanted);
2488
2489 *residualp = size - total_read;
2490 if((dev_buffer != *bufferp) && (total_read != 0)) {
2491 vm_offset_t temp_buffer;
2492 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2493 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2494 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2495 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2496 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2497 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2498 (vm_map_copy_t *)&device_data, FALSE))
2499 panic("ps_read_device: cannot copyin locally provided buffer\n");
2500 }
2501 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2502 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2503 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2504 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2505 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2506 (vm_map_copy_t *)&device_data, FALSE))
2507 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2508 }
2509 else {
2510 device_data = NULL;
2511 }
2512 *bufferp = (vm_offset_t)device_data;
2513
2514 if(dpt != NULL) {
2515 /* Free the receive buffer */
2516 dpt->checked_out = 0;
2517 thread_wakeup(&dpt_array);
2518 }
2519 return KERN_SUCCESS;
2520 }
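/*
 * Worked example for the record-offset arithmetic above (illustrative
 * only; assumes 512-byte device records and 4 KB pages, so
 * ps_record_shift == 3):
 *
 *	dev_offset = ps_offset + (offset >> (12 - 3))
 *	           = ps_offset + offset / 512
 *
 * i.e. the byte offset within the paging segment is converted into a
 * device record number before being handed to ds_device_read_common().
 */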
2521
2522 kern_return_t
2523 ps_write_device(
2524 paging_segment_t ps,
2525 dp_offset_t offset,
2526 vm_offset_t addr,
2527 unsigned int size,
2528 struct vs_async *vsa)
2529 {
2530 recnum_t dev_offset;
2531 io_buf_len_t bytes_to_write, bytes_written;
2532 recnum_t records_written;
2533 kern_return_t kr;
2534 MACH_PORT_FACE reply_port;
2535
2536
2537
2538 clustered_writes[atop_32(size)]++;
2539
2540 dev_offset = (ps->ps_offset +
2541 (offset >> (vm_page_shift - ps->ps_record_shift)));
2542 bytes_to_write = size;
2543
2544 if (vsa) {
2545 /*
2546 * Asynchronous write.
2547 */
2548 reply_port = vsa->reply_port;
2549 ip_lock(reply_port);
2550 reply_port->ip_sorights++;
2551 ip_reference(reply_port);
2552 ip_unlock(reply_port);
2553 {
2554 device_t device;
2555 device = dev_port_lookup(ps->ps_device);
2556
2557 vsa->vsa_addr = addr;
2558 kr=ds_device_write_common(device,
2559 reply_port,
2560 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2561 (dev_mode_t) 0,
2562 dev_offset,
2563 (io_buf_ptr_t) addr,
2564 size,
2565 (IO_WRITE | IO_CALL),
2566 &bytes_written);
2567 }
2568 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2569 if (verbose)
2570 dprintf(("%s0x%x, addr=0x%x,"
2571 "size=0x%x,offset=0x%x\n",
2572 "device_write_request returned ",
2573 kr, addr, size, offset));
2574 BS_STAT(ps->ps_bs,
2575 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2576 /* do the completion notification to free resources */
2577 device_write_reply(reply_port, kr, 0);
2578 return PAGER_ERROR;
2579 }
2580 } else do {
2581 /*
2582 * Synchronous write.
2583 */
2584 {
2585 device_t device;
2586 device = dev_port_lookup(ps->ps_device);
2587 kr=ds_device_write_common(device,
2588 IP_NULL, 0,
2589 (dev_mode_t) 0,
2590 dev_offset,
2591 (io_buf_ptr_t) addr,
2592 size,
2593 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2594 &bytes_written);
2595 }
2596 if (kr != KERN_SUCCESS) {
2597 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2598 "device_write returned ",
2599 kr, addr, size, offset));
2600 BS_STAT(ps->ps_bs,
2601 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2602 return PAGER_ERROR;
2603 }
2604 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2605 Panic("fragmented write");
2606 records_written = (bytes_written >>
2607 (vm_page_shift - ps->ps_record_shift));
2608 dev_offset += records_written;
2609 #if 1
2610 if (bytes_written != bytes_to_write) {
2611 dprintf(("wrote only %d bytes out of %d\n",
2612 bytes_written, bytes_to_write));
2613 }
2614 #endif
2615 bytes_to_write -= bytes_written;
2616 addr += bytes_written;
2617 } while (bytes_to_write > 0);
2618
2619 return PAGER_SUCCESS;
2620 }
2621
2622
2623 #else /* !DEVICE_PAGING */
2624
2625 kern_return_t
2626 ps_read_device(
2627 __unused paging_segment_t ps,
2628 __unused dp_offset_t offset,
2629 __unused vm_offset_t *bufferp,
2630 __unused unsigned int size,
2631 __unused unsigned int *residualp,
2632 __unused int flags)
2633 {
2634 panic("ps_read_device not supported");
2635 return KERN_FAILURE;
2636 }
2637
2638 kern_return_t
2639 ps_write_device(
2640 __unused paging_segment_t ps,
2641 __unused dp_offset_t offset,
2642 __unused vm_offset_t addr,
2643 __unused unsigned int size,
2644 __unused struct vs_async *vsa)
2645 {
2646 panic("ps_write_device not supported");
2647 return KERN_FAILURE;
2648 }
2649
2650 #endif /* DEVICE_PAGING */
2651 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
2652
2653 void
2654 pvs_object_data_provided(
2655 __unused vstruct_t vs,
2656 __unused upl_t upl,
2657 __unused upl_offset_t offset,
2658 upl_size_t size)
2659 {
2660
2661 DP_DEBUG(DEBUG_VS_INTERNAL,
2662 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2663 upl, offset, size));
2664
2665 ASSERT(size > 0);
2666 GSTAT(global_stats.gs_pages_in += atop_32(size));
2667
2668
2669 #if USE_PRECIOUS
2670 ps_clunmap(vs, offset, size);
2671 #endif /* USE_PRECIOUS */
2672
2673 }
2674
2675 static memory_object_offset_t last_start;
2676 static vm_size_t last_length;
2677
2678 kern_return_t
2679 pvs_cluster_read(
2680 vstruct_t vs,
2681 dp_offset_t vs_offset,
2682 dp_size_t cnt,
2683 void *fault_info)
2684 {
2685 kern_return_t error = KERN_SUCCESS;
2686 unsigned int size;
2687 unsigned int residual;
2688 unsigned int request_flags;
2689 int io_flags = 0;
2690 int seg_index;
2691 int pages_in_cl;
2692 int cl_size;
2693 int cl_mask;
2694 int cl_index;
2695 unsigned int xfer_size;
2696 dp_offset_t orig_vs_offset;
2697 dp_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2698 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2699 struct clmap clmap;
2700 upl_t upl;
2701 unsigned int page_list_count;
2702 memory_object_offset_t cluster_start;
2703 vm_size_t cluster_length;
2704 uint32_t io_streaming;
2705
2706 pages_in_cl = 1 << vs->vs_clshift;
2707 cl_size = pages_in_cl * vm_page_size;
2708 cl_mask = cl_size - 1;
2709
2710 #if USE_PRECIOUS
2711 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2712 #else
2713 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2714 #endif
2715 cl_index = (vs_offset & cl_mask) / vm_page_size;
2716
2717 if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) ||
2718 !CLMAP_ISSET(clmap, cl_index)) {
2719 /*
2720 * the needed page doesn't exist in the backing store...
2721 * we don't want to try to do any I/O, just abort the
2722 * page and let the fault handler provide a zero-fill
2723 */
2724 if (cnt == 0) {
2725 /*
2726 * The caller was just poking at us to see if
2727 * the page has been paged out. No need to
2728 * mess with the page at all.
2729 * Just let the caller know we don't have that page.
2730 */
2731 return KERN_FAILURE;
2732 }
2733
2734 page_list_count = 0;
2735
2736 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2737 PAGE_SIZE, PAGE_SIZE,
2738 &upl, NULL, &page_list_count,
2739 request_flags);
2740
2741 if (clmap.cl_error)
2742 upl_abort(upl, UPL_ABORT_ERROR);
2743 else
2744 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2745 upl_deallocate(upl);
2746
2747 return KERN_SUCCESS;
2748 }
2749
2750 if (cnt == 0) {
2751 /*
2752 * The caller was just poking at us to see if
2753 * the page has been paged out. No need to
2754 * mess with the page at all.
2755 * Just let the caller know we do have that page.
2756 */
2757 return KERN_SUCCESS;
2758 }
2759
2760 assert(dp_encryption_inited);
2761 if (dp_encryption) {
2762 /*
2763 * ENCRYPTED SWAP:
2764 * request that the UPL be prepared for
2765 * decryption.
2766 */
2767 request_flags |= UPL_ENCRYPT;
2768 }
2769 orig_vs_offset = vs_offset;
2770
2771 assert(cnt != 0);
2772 cnt = VM_SUPER_CLUSTER;
2773 cluster_start = (memory_object_offset_t) vs_offset;
2774 cluster_length = (vm_size_t) cnt;
2775 io_streaming = 0;
2776
2777 /*
2778 * determine how big a speculative I/O we should try for...
2779 */
2780 if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
2781 assert(vs_offset >= (dp_offset_t) cluster_start &&
2782 vs_offset < (dp_offset_t) (cluster_start + cluster_length));
2783 vs_offset = (dp_offset_t) cluster_start;
2784 cnt = (dp_size_t) cluster_length;
2785 } else {
2786 cluster_length = PAGE_SIZE;
2787 cnt = PAGE_SIZE;
2788 }
2789
2790 if (io_streaming)
2791 io_flags |= UPL_IOSTREAMING;
2792
2793 last_start = cluster_start;
2794 last_length = cluster_length;
2795
2796 /*
2797 * This loop will be executed multiple times until the entire
2798 * range has been looked at or we issue an I/O... if the request spans cluster
2799 * boundaries, the clusters will be checked for logical continuity;
2800 * if contiguous, the I/O request will span multiple clusters...
2801 * at most only 1 I/O will be issued... it will encompass the original offset
2802 */
2803 while (cnt && error == KERN_SUCCESS) {
2804 int ps_info_valid;
2805
2806 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
2807 size = VM_SUPER_CLUSTER;
2808 size -= vs_offset & cl_mask;
2809 } else if (cnt > VM_SUPER_CLUSTER)
2810 size = VM_SUPER_CLUSTER;
2811 else
2812 size = cnt;
2813
2814 cnt -= size;
2815
2816 ps_info_valid = 0;
2817 seg_index = 0;
2818
2819 while (size > 0 && error == KERN_SUCCESS) {
2820 unsigned int abort_size;
2821 int failed_size;
2822 int beg_pseg;
2823 int beg_indx;
2824 dp_offset_t cur_offset;
2825
2826 if ( !ps_info_valid) {
2827 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2828 psp[seg_index] = CLMAP_PS(clmap);
2829 ps_info_valid = 1;
2830 }
2831 /*
2832 * skip over unallocated physical segments
2833 */
2834 if (ps_offset[seg_index] == (dp_offset_t) -1) {
2835 abort_size = cl_size - (vs_offset & cl_mask);
2836 abort_size = MIN(abort_size, size);
2837
2838 size -= abort_size;
2839 vs_offset += abort_size;
2840
2841 seg_index++;
2842 ps_info_valid = 0;
2843
2844 continue;
2845 }
2846 cl_index = (vs_offset & cl_mask) / vm_page_size;
2847
2848 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2849 /*
2850 * skip over unallocated pages
2851 */
2852 if (CLMAP_ISSET(clmap, cl_index))
2853 break;
2854 abort_size += vm_page_size;
2855 }
2856 if (abort_size) {
2857 size -= abort_size;
2858 vs_offset += abort_size;
2859
2860 if (cl_index == pages_in_cl) {
2861 /*
2862 * if we're at the end of this physical cluster
2863 * then bump to the next one and continue looking
2864 */
2865 seg_index++;
2866 ps_info_valid = 0;
2867
2868 continue;
2869 }
2870 if (size == 0)
2871 break;
2872 }
2873 /*
2874 * remember the starting point of the first allocated page
2875 * for the I/O we're about to issue
2876 */
2877 beg_pseg = seg_index;
2878 beg_indx = cl_index;
2879 cur_offset = vs_offset;
2880
2881 /*
2882 * calculate the size of the I/O that we can do...
2883 * this may span multiple physical segments if
2884 * they are contiguous
2885 */
2886 for (xfer_size = 0; xfer_size < size; ) {
2887
2888 while (cl_index < pages_in_cl && xfer_size < size) {
2889 /*
2890 * accumulate allocated pages within
2891 * a physical segment
2892 */
2893 if (CLMAP_ISSET(clmap, cl_index)) {
2894 xfer_size += vm_page_size;
2895 cur_offset += vm_page_size;
2896 cl_index++;
2897
2898 BS_STAT(psp[seg_index]->ps_bs,
2899 psp[seg_index]->ps_bs->bs_pages_in++);
2900 } else
2901 break;
2902 }
2903 if (cl_index < pages_in_cl || xfer_size >= size) {
2904 /*
2905 * we've hit an unallocated page or
2906 * the end of this request... see if
2907 * it's time to fire the I/O
2908 */
2909 break;
2910 }
2911 /*
2912 * we've hit the end of the current physical
2913 * segment and there's more to do, so try
2914 * moving to the next one
2915 */
2916 seg_index++;
2917
2918 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2919 psp[seg_index] = CLMAP_PS(clmap);
2920 ps_info_valid = 1;
2921
2922 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2923 /*
2924 * if the physical segment we're about
2925 * to step into is not contiguous to
2926 * the one we're currently in, or it's
2927 * in a different paging file, or
2928 * it hasn't been allocated....
2929 * we stop this run and go check
2930 * to see if it's time to fire the I/O
2931 */
2932 break;
2933 }
2934 /*
2935 * start with first page of the next physical
2936 * segment
2937 */
2938 cl_index = 0;
2939 }
2940 if (xfer_size == 0) {
2941 /*
2942 * no I/O to generate for this segment
2943 */
2944 continue;
2945 }
2946 if (cur_offset <= orig_vs_offset) {
2947 /*
2948 * we've hit a hole in our speculative cluster
2949 * before the offset that we're really after...
2950 * don't issue the I/O since it doesn't encompass
2951 * the original offset and we're looking to only
2952 * pull in the speculative pages if they can be
2953 * made part of a single I/O
2954 */
2955 size -= xfer_size;
2956 vs_offset += xfer_size;
2957
2958 continue;
2959 }
2960 /*
2961 * we have a contiguous range of allocated pages
2962 * to read from that encompasses the original offset
2963 */
2964 page_list_count = 0;
2965 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2966 xfer_size, xfer_size,
2967 &upl, NULL, &page_list_count,
2968 request_flags | UPL_SET_INTERNAL | UPL_NOBLOCK);
2969
2970 error = ps_read_file(psp[beg_pseg],
2971 upl, (upl_offset_t) 0,
2972 ps_offset[beg_pseg] + (beg_indx * vm_page_size),
2973 xfer_size, &residual, io_flags);
2974
2975 failed_size = 0;
2976
2977 /*
2978 * Adjust counts and send response to VM. Optimize
2979 * for the common case, i.e. no error and/or partial
2980 * data. If there was an error, then we need to error
2981 * the entire range, even if some data was successfully
2982 * read. If there was a partial read we may supply some
2983 * data and may error some as well. In all cases the
2984 * VM must receive some notification for every page
2985 * in the range.
2986 */
2987 if ((error == KERN_SUCCESS) && (residual == 0)) {
2988 /*
2989 * Got everything we asked for, supply the data
2990 * to the VM. Note that as a side effect of
2991 * supplying the data, the buffer holding the
2992 * supplied data is deallocated from the pager's
2993 * address space.
2994 */
2995 pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
2996 } else {
2997 failed_size = xfer_size;
2998
2999 if (error == KERN_SUCCESS) {
3000 if (residual == xfer_size) {
3001 /*
3002 * If a read operation returns no error
3003 * and no data moved, we turn it into
3004 * an error, assuming we're reading at
3005 * or beyond EOF.
3006 * Fall through and error the entire range.
3007 */
3008 error = KERN_FAILURE;
3009 } else {
3010 /*
3011 * Otherwise, we have partial read. If
3012 * the part read is a integral number
3013 * of pages supply it. Otherwise round
3014 * it up to a page boundary, zero fill
3015 * the unread part, and supply it.
3016 * Fall through and error the remainder
3017 * of the range, if any.
3018 */
3019 int fill;
3020 unsigned int lsize;
3021
3022 fill = residual & ~vm_page_size;
3023 lsize = (xfer_size - residual) + fill;
3024
3025 pvs_object_data_provided(vs, upl, vs_offset, lsize);
3026
3027 if (lsize < xfer_size) {
3028 failed_size = xfer_size - lsize;
3029 error = KERN_FAILURE;
3030 }
3031 }
3032 }
3033 }
3034 if (error != KERN_SUCCESS) {
3035 /*
3036 * There was an error in some part of the range, tell
3037 * the VM. Note that error is explicitly checked again
3038 * since it can be modified above.
3039 */
3040 BS_STAT(psp[beg_pseg]->ps_bs,
3041 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
3042 }
3043 /*
3044 * we've issued a single I/O that encompassed the original offset
3045 * at this point we either met our speculative request length or
3046 * we ran into a 'hole' (i.e. page not present in the cluster, cluster
3047 * not present or not physically contiguous to the previous one), so
3048 * we're done issuing I/O at this point
3049 */
3050 return (error);
3051 }
3052 }
3053 return error;
3054 }
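/*
 * Illustrative outline of pvs_cluster_read() (not part of the original
 * source):
 *
 *	1. probe the cluster map for the faulting page; if it is absent,
 *	   report KERN_FAILURE when the caller is only probing (cnt == 0),
 *	   otherwise abort a one-page UPL and return
 *	2. ask memory_object_cluster_size() how large a speculative read
 *	   is worth attempting around the faulting offset
 *	3. walk the cluster map, skipping unallocated clusters and pages,
 *	   accumulating a run of contiguous allocated pages that covers
 *	   the original offset
 *	4. issue a single ps_read_file() for that run and supply (or
 *	   error) the pages via pvs_object_data_provided()
 */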
3055
3056 int vs_do_async_write = 1;
3057
3058 kern_return_t
3059 vs_cluster_write(
3060 vstruct_t vs,
3061 upl_t internal_upl,
3062 upl_offset_t offset,
3063 upl_size_t cnt,
3064 boolean_t dp_internal,
3065 int flags)
3066 {
3067 upl_size_t transfer_size;
3068 int error = 0;
3069 struct clmap clmap;
3070
3071 dp_offset_t actual_offset; /* Offset within paging segment */
3072 paging_segment_t ps;
3073 dp_offset_t mobj_base_addr;
3074 dp_offset_t mobj_target_addr;
3075
3076 upl_t upl;
3077 upl_page_info_t *pl;
3078 int page_index;
3079 int list_size;
3080 int pages_in_cl;
3081 unsigned int cl_size;
3082 int base_index;
3083 unsigned int seg_size;
3084 unsigned int upl_offset_in_object;
3085
3086 pages_in_cl = 1 << vs->vs_clshift;
3087 cl_size = pages_in_cl * vm_page_size;
3088
3089 if (!dp_internal) {
3090 unsigned int page_list_count;
3091 int request_flags;
3092 unsigned int super_size;
3093 int first_dirty;
3094 int num_dirty;
3095 int num_of_pages;
3096 int seg_index;
3097 upl_offset_t upl_offset;
3098 dp_offset_t seg_offset;
3099 dp_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
3100 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
3101
3102
3103 if (bs_low) {
3104 super_size = cl_size;
3105
3106 request_flags = UPL_NOBLOCK |
3107 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
3108 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
3109 } else {
3110 super_size = VM_SUPER_CLUSTER;
3111
3112 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
3113 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
3114 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
3115 }
3116
3117 if (!dp_encryption_inited) {
3118 /*
3119 * ENCRYPTED SWAP:
3120 * Once we've started using swap, we
3121 * can't change our mind on whether
3122 * it needs to be encrypted or
3123 * not.
3124 */
3125 dp_encryption_inited = TRUE;
3126 }
3127 if (dp_encryption) {
3128 /*
3129 * ENCRYPTED SWAP:
3130 * request that the UPL be prepared for
3131 * encryption.
3132 */
3133 request_flags |= UPL_ENCRYPT;
3134 flags |= UPL_PAGING_ENCRYPTED;
3135 }
3136
3137 page_list_count = 0;
3138 memory_object_super_upl_request(vs->vs_control,
3139 (memory_object_offset_t)offset,
3140 cnt, super_size,
3141 &upl, NULL, &page_list_count,
3142 request_flags | UPL_FOR_PAGEOUT);
3143
3144 /*
3145 * The default pager does not handle objects larger than
3146 * 4GB, so it does not deal with offsets that don't fit in
3147 * 32 bits. Cast down upl->offset now and make sure we
3148 * did not lose any valuable bits.
3149 */
3150 upl_offset_in_object = (unsigned int) upl->offset;
3151 assert(upl->offset == upl_offset_in_object);
3152
3153 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3154
3155 seg_size = cl_size - (upl_offset_in_object % cl_size);
3156 upl_offset = upl_offset_in_object & ~(cl_size - 1);
3157
3158 for (seg_index = 0, transfer_size = upl->size;
3159 transfer_size > 0; ) {
3160 ps_offset[seg_index] =
3161 ps_clmap(vs,
3162 upl_offset,
3163 &clmap, CL_ALLOC,
3164 cl_size, 0);
3165
3166 if (ps_offset[seg_index] == (dp_offset_t) -1) {
3167 upl_abort(upl, 0);
3168 upl_deallocate(upl);
3169
3170 return KERN_FAILURE;
3171
3172 }
3173 psp[seg_index] = CLMAP_PS(clmap);
3174
3175 if (transfer_size > seg_size) {
3176 transfer_size -= seg_size;
3177 upl_offset += cl_size;
3178 seg_size = cl_size;
3179 seg_index++;
3180 } else
3181 transfer_size = 0;
3182 }
3183 /*
3184 * Ignore any non-present pages at the end of the
3185 * UPL.
3186 */
3187 for (page_index = upl->size / vm_page_size; page_index > 0;)
3188 if (UPL_PAGE_PRESENT(pl, --page_index))
3189 break;
3190 num_of_pages = page_index + 1;
3191
3192 base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE;
3193
3194 for (page_index = 0; page_index < num_of_pages; ) {
3195 /*
3196 * skip over non-dirty pages
3197 */
3198 for ( ; page_index < num_of_pages; page_index++) {
3199 if (UPL_DIRTY_PAGE(pl, page_index)
3200 || UPL_PRECIOUS_PAGE(pl, page_index))
3201 /*
3202 * this is a page we need to write
3203 * go see if we can buddy it up with
3204 * others that are contiguous to it
3205 */
3206 break;
3207 /*
3208 * if the page is not dirty, but present, we
3209 * need to commit it... This is an unusual
3210 * case since we only asked for dirty pages
3211 */
3212 if (UPL_PAGE_PRESENT(pl, page_index)) {
3213 boolean_t empty = FALSE;
3214 upl_commit_range(upl,
3215 page_index * vm_page_size,
3216 vm_page_size,
3217 UPL_COMMIT_NOTIFY_EMPTY,
3218 pl,
3219 page_list_count,
3220 &empty);
3221 if (empty) {
3222 assert(page_index ==
3223 num_of_pages - 1);
3224 upl_deallocate(upl);
3225 }
3226 }
3227 }
3228 if (page_index == num_of_pages)
3229 /*
3230 * no more pages to look at, we're out of here
3231 */
3232 break;
3233
3234 /*
3235 * gather up contiguous dirty pages... we have at
3236 * least 1, otherwise we would have bailed above;
3237 * make sure that each physical segment that we step
3238 * into is contiguous to the one we're currently in
3239 * if it's not, we have to stop and write what we have
3240 */
3241 for (first_dirty = page_index;
3242 page_index < num_of_pages; ) {
3243 if ( !UPL_DIRTY_PAGE(pl, page_index)
3244 && !UPL_PRECIOUS_PAGE(pl, page_index))
3245 break;
3246 page_index++;
3247 /*
3248 * if we just looked at the last page in the UPL
3249 * we don't need to check for physical segment
3250 * continuity
3251 */
3252 if (page_index < num_of_pages) {
3253 int cur_seg;
3254 int nxt_seg;
3255
3256 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3257 nxt_seg = (base_index + page_index)/pages_in_cl;
3258
3259 if (cur_seg != nxt_seg) {
3260 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3261 /*
3262 * if the segment we're about
3263 * to step into is not
3264 * contiguous to the one we're
3265 * currently in, or it's in a
3266 * different paging file....
3267 * we stop here and generate
3268 * the I/O
3269 */
3270 break;
3271 }
3272 }
3273 }
3274 num_dirty = page_index - first_dirty;
3275
3276 if (num_dirty) {
3277 upl_offset = first_dirty * vm_page_size;
3278 transfer_size = num_dirty * vm_page_size;
3279
3280 while (transfer_size) {
3281
3282 if ((seg_size = cl_size -
3283 ((upl_offset_in_object +
3284 upl_offset) % cl_size))
3285 > transfer_size)
3286 seg_size = transfer_size;
3287
3288 ps_vs_write_complete(
3289 vs,
3290 (upl_offset_in_object +
3291 upl_offset),
3292 seg_size, error);
3293
3294 transfer_size -= seg_size;
3295 upl_offset += seg_size;
3296 }
3297 upl_offset = first_dirty * vm_page_size;
3298 transfer_size = num_dirty * vm_page_size;
3299
3300 seg_index = (base_index + first_dirty) / pages_in_cl;
3301 seg_offset = (upl_offset_in_object + upl_offset) % cl_size;
3302
3303 error = ps_write_file(psp[seg_index],
3304 upl, upl_offset,
3305 ps_offset[seg_index]
3306 + seg_offset,
3307 transfer_size, flags);
3308 } else {
3309 boolean_t empty = FALSE;
3310 upl_abort_range(upl,
3311 first_dirty * vm_page_size,
3312 num_dirty * vm_page_size,
3313 UPL_ABORT_NOTIFY_EMPTY,
3314 &empty);
3315 if (empty) {
3316 assert(page_index == num_of_pages);
3317 upl_deallocate(upl);
3318 }
3319 }
3320 }
3321
3322 } else {
3323 assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift));
3324 list_size = cnt;
3325
3326 page_index = 0;
3327 /* The caller provides a mapped_data which is derived */
3328 /* from a temporary object. The targeted pages are */
3329 /* guaranteed to be set at offset 0 in the mapped_data */
3330 /* The actual offset however must still be derived */
3331 /* from the offset in the vs in question */
3332 mobj_base_addr = offset;
3333 mobj_target_addr = mobj_base_addr;
3334
3335 for (transfer_size = list_size; transfer_size != 0;) {
3336 actual_offset = ps_clmap(vs, mobj_target_addr,
3337 &clmap, CL_ALLOC,
3338 transfer_size < cl_size ?
3339 transfer_size : cl_size, 0);
3340 if(actual_offset == (dp_offset_t) -1) {
3341 error = 1;
3342 break;
3343 }
3344 cnt = MIN(transfer_size,
3345 (unsigned) CLMAP_NPGS(clmap) * vm_page_size);
3346 ps = CLMAP_PS(clmap);
3347 /* Assume that the caller has given us contiguous */
3348 /* pages */
3349 if(cnt) {
3350 ps_vs_write_complete(vs, mobj_target_addr,
3351 cnt, error);
3352 error = ps_write_file(ps, internal_upl,
3353 0, actual_offset,
3354 cnt, flags);
3355 if (error)
3356 break;
3357 }
3358 if (error)
3359 break;
3360 actual_offset += cnt;
3361 mobj_target_addr += cnt;
3362 transfer_size -= cnt;
3363 cnt = 0;
3364
3365 if (error)
3366 break;
3367 }
3368 }
3369 if(error)
3370 return KERN_FAILURE;
3371 else
3372 return KERN_SUCCESS;
3373 }
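/*
 * Worked example for the cluster segmentation in vs_cluster_write()
 * (illustrative only; assumes 4 KB pages and vs_clshift == 2, so
 * cl_size == 0x4000):
 *
 *	upl_offset_in_object = 0x5000
 *	first seg_size       = 0x4000 - (0x5000 % 0x4000) = 0x3000
 *
 * so the first 12 KB of the UPL completes cluster 1; every subsequent
 * segment is a full 16 KB cluster until the transfer is exhausted.
 */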
3374
3375 vm_size_t
3376 ps_vstruct_allocated_size(
3377 vstruct_t vs)
3378 {
3379 int num_pages;
3380 struct vs_map *vsmap;
3381 unsigned int i, j, k;
3382
3383 num_pages = 0;
3384 if (vs->vs_indirect) {
3385 /* loop on indirect maps */
3386 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3387 vsmap = vs->vs_imap[i];
3388 if (vsmap == NULL)
3389 continue;
3390 /* loop on clusters in this indirect map */
3391 for (j = 0; j < CLMAP_ENTRIES; j++) {
3392 if (VSM_ISCLR(vsmap[j]) ||
3393 VSM_ISERR(vsmap[j]))
3394 continue;
3395 /* loop on pages in this cluster */
3396 for (k = 0; k < VSCLSIZE(vs); k++) {
3397 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3398 num_pages++;
3399 }
3400 }
3401 }
3402 } else {
3403 vsmap = vs->vs_dmap;
3404 if (vsmap == NULL)
3405 return 0;
3406 /* loop on clusters in the direct map */
3407 for (j = 0; j < CLMAP_ENTRIES; j++) {
3408 if (VSM_ISCLR(vsmap[j]) ||
3409 VSM_ISERR(vsmap[j]))
3410 continue;
3411 /* loop on pages in this cluster */
3412 for (k = 0; k < VSCLSIZE(vs); k++) {
3413 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3414 num_pages++;
3415 }
3416 }
3417 }
3418
3419 return ptoa_32(num_pages);
3420 }
3421
3422 unsigned int
3423 ps_vstruct_allocated_pages(
3424 vstruct_t vs,
3425 default_pager_page_t *pages,
3426 unsigned int pages_size)
3427 {
3428 unsigned int num_pages;
3429 struct vs_map *vsmap;
3430 dp_offset_t offset;
3431 unsigned int i, j, k;
3432
3433 num_pages = 0;
3434 offset = 0;
3435 if (vs->vs_indirect) {
3436 /* loop on indirect maps */
3437 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3438 vsmap = vs->vs_imap[i];
3439 if (vsmap == NULL) {
3440 offset += (vm_page_size * CLMAP_ENTRIES *
3441 VSCLSIZE(vs));
3442 continue;
3443 }
3444 /* loop on clusters in this indirect map */
3445 for (j = 0; j < CLMAP_ENTRIES; j++) {
3446 if (VSM_ISCLR(vsmap[j]) ||
3447 VSM_ISERR(vsmap[j])) {
3448 offset += vm_page_size * VSCLSIZE(vs);
3449 continue;
3450 }
3451 /* loop on pages in this cluster */
3452 for (k = 0; k < VSCLSIZE(vs); k++) {
3453 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3454 num_pages++;
3455 if (num_pages < pages_size)
3456 pages++->dpp_offset =
3457 offset;
3458 }
3459 offset += vm_page_size;
3460 }
3461 }
3462 }
3463 } else {
3464 vsmap = vs->vs_dmap;
3465 if (vsmap == NULL)
3466 return 0;
3467 /* loop on clusters in the direct map */
3468 for (j = 0; j < CLMAP_ENTRIES; j++) {
3469 if (VSM_ISCLR(vsmap[j]) ||
3470 VSM_ISERR(vsmap[j])) {
3471 offset += vm_page_size * VSCLSIZE(vs);
3472 continue;
3473 }
3474 /* loop on pages in this cluster */
3475 for (k = 0; k < VSCLSIZE(vs); k++) {
3476 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3477 num_pages++;
3478 if (num_pages < pages_size)
3479 pages++->dpp_offset = offset;
3480 }
3481 offset += vm_page_size;
3482 }
3483 }
3484 }
3485
3486 return num_pages;
3487 }
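/*
 * Bookkeeping note for ps_vstruct_allocated_pages() above (illustrative
 * only): a missing indirect block advances the running offset by
 *
 *	vm_page_size * CLMAP_ENTRIES * VSCLSIZE(vs)
 *
 * bytes (a whole indirect block's worth of clusters), a clear or errored
 * cluster by vm_page_size * VSCLSIZE(vs), and each page within a present
 * cluster by vm_page_size, with the offsets of pages whose VSM_BMAP()
 * bit is set reported back to the caller.
 */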
3488
3489
3490 kern_return_t
3491 ps_vstruct_transfer_from_segment(
3492 vstruct_t vs,
3493 paging_segment_t segment,
3494 upl_t upl)
3495 {
3496 struct vs_map *vsmap;
3497 // struct vs_map old_vsmap;
3498 // struct vs_map new_vsmap;
3499 unsigned int i, j;
3500
3501 VS_LOCK(vs); /* block all work on this vstruct */
3502 /* can't allow the normal multiple write */
3503 /* semantic because writes may conflict */
3504 vs->vs_xfer_pending = TRUE;
3505 vs_wait_for_sync_writers(vs);
3506 vs_start_write(vs);
3507 vs_wait_for_readers(vs);
3508 /* we will unlock the vs to allow other writes while transferring */
3509 /* and will be guaranteed the persistence of the vs struct */
3510 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3511 /* vs_async_pending */
3512 /* OK we now have guaranteed no other parties are accessing this */
3513 /* vs. Now that we are also supporting simple lock versions of */
3514 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3515 /* our purpose in holding it before was the multiple write case */
3516 /* we now use the boolean xfer_pending to do that. We can use */
3517 /* a boolean instead of a count because we have guaranteed single */
3518 /* file access to this code in its caller */
3519 VS_UNLOCK(vs);
3520 vs_changed:
3521 if (vs->vs_indirect) {
3522 unsigned int vsmap_size;
3523 int clmap_off;
3524 /* loop on indirect maps */
3525 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3526 vsmap = vs->vs_imap[i];
3527 if (vsmap == NULL)
3528 continue;
3529 /* loop on clusters in this indirect map */
3530 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3531 VSCLSIZE(vs) * i);
3532 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3533 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3534 else
3535 vsmap_size = CLMAP_ENTRIES;
3536 for (j = 0; j < vsmap_size; j++) {
3537 if (VSM_ISCLR(vsmap[j]) ||
3538 VSM_ISERR(vsmap[j]) ||
3539 (VSM_PS(vsmap[j]) != segment))
3540 continue;
3541 if(vs_cluster_transfer(vs,
3542 (vm_page_size * (j << vs->vs_clshift))
3543 + clmap_off,
3544 vm_page_size << vs->vs_clshift,
3545 upl)
3546 != KERN_SUCCESS) {
3547 VS_LOCK(vs);
3548 vs->vs_xfer_pending = FALSE;
3549 VS_UNLOCK(vs);
3550 vs_finish_write(vs);
3551 return KERN_FAILURE;
3552 }
3553 /* allow other readers/writers during transfer*/
3554 VS_LOCK(vs);
3555 vs->vs_xfer_pending = FALSE;
3556 VS_UNLOCK(vs);
3557 vs_finish_write(vs);
3558 VS_LOCK(vs);
3559 vs->vs_xfer_pending = TRUE;
3560 vs_wait_for_sync_writers(vs);
3561 vs_start_write(vs);
3562 vs_wait_for_readers(vs);
3563 VS_UNLOCK(vs);
3564 if (!(vs->vs_indirect)) {
3565 goto vs_changed;
3566 }
3567 }
3568 }
3569 } else {
3570 vsmap = vs->vs_dmap;
3571 if (vsmap == NULL) {
3572 VS_LOCK(vs);
3573 vs->vs_xfer_pending = FALSE;
3574 VS_UNLOCK(vs);
3575 vs_finish_write(vs);
3576 return KERN_SUCCESS;
3577 }
3578 /* loop on clusters in the direct map */
3579 for (j = 0; j < vs->vs_size; j++) {
3580 if (VSM_ISCLR(vsmap[j]) ||
3581 VSM_ISERR(vsmap[j]) ||
3582 (VSM_PS(vsmap[j]) != segment))
3583 continue;
3584 if(vs_cluster_transfer(vs,
3585 vm_page_size * (j << vs->vs_clshift),
3586 vm_page_size << vs->vs_clshift,
3587 upl) != KERN_SUCCESS) {
3588 VS_LOCK(vs);
3589 vs->vs_xfer_pending = FALSE;
3590 VS_UNLOCK(vs);
3591 vs_finish_write(vs);
3592 return KERN_FAILURE;
3593 }
3594 /* allow other readers/writers during transfer*/
3595 VS_LOCK(vs);
3596 vs->vs_xfer_pending = FALSE;
3597 VS_UNLOCK(vs);
3598 vs_finish_write(vs);
3599 VS_LOCK(vs);
3600 vs->vs_xfer_pending = TRUE;
3601 vs_wait_for_sync_writers(vs);
3602 vs_start_write(vs);
3603 vs_wait_for_readers(vs);
3604 VS_UNLOCK(vs);
3605 if (vs->vs_indirect) {
3606 goto vs_changed;
3607 }
3608 }
3609 }
3610
3611 VS_LOCK(vs);
3612 vs->vs_xfer_pending = FALSE;
3613 VS_UNLOCK(vs);
3614 vs_finish_write(vs);
3615 return KERN_SUCCESS;
3616 }
3617
3618
3619
3620 vs_map_t
3621 vs_get_map_entry(
3622 vstruct_t vs,
3623 dp_offset_t offset)
3624 {
3625 struct vs_map *vsmap;
3626 dp_offset_t cluster;
3627
3628 cluster = atop_32(offset) >> vs->vs_clshift;
3629 if (vs->vs_indirect) {
3630 long ind_block = cluster/CLMAP_ENTRIES;
3631
3632 /* Is the indirect block allocated? */
3633 vsmap = vs->vs_imap[ind_block];
3634 if(vsmap == (vs_map_t) NULL)
3635 return vsmap;
3636 } else
3637 vsmap = vs->vs_dmap;
3638 vsmap += cluster%CLMAP_ENTRIES;
3639 return vsmap;
3640 }
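/*
 * Minimal sketch of the lookup above (illustrative only, mirroring the
 * code rather than adding to it):
 *
 *	cluster = atop_32(offset) >> vs->vs_clshift;
 *	base    = vs->vs_indirect ? vs->vs_imap[cluster / CLMAP_ENTRIES]
 *	                          : vs->vs_dmap;
 *	entry   = base ? &base[cluster % CLMAP_ENTRIES] : NULL;
 *
 * i.e. indirect vstructs take a two-level path, and a NULL indirect
 * block means the cluster was never allocated.
 */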
3641
3642 kern_return_t
3643 vs_cluster_transfer(
3644 vstruct_t vs,
3645 dp_offset_t offset,
3646 dp_size_t cnt,
3647 upl_t upl)
3648 {
3649 dp_offset_t actual_offset;
3650 paging_segment_t ps;
3651 struct clmap clmap;
3652 kern_return_t error = KERN_SUCCESS;
3653 unsigned int size, size_wanted;
3654 int i;
3655 unsigned int residual = 0;
3656 unsigned int unavail_size;
3657 // default_pager_thread_t *dpt;
3658 // boolean_t dealloc;
3659 struct vs_map *vsmap_ptr = NULL;
3660 struct vs_map read_vsmap;
3661 struct vs_map original_read_vsmap;
3662 struct vs_map write_vsmap;
3663 // upl_t sync_upl;
3664 // vm_offset_t ioaddr;
3665
3666 /* vs_cluster_transfer reads in the pages of a cluster and
3667 * then writes these pages back to new backing store. The
3668 * segment the pages are being read from is assumed to have
3669 * been taken off-line and is no longer considered for new
3670 * space requests.
3671 */
3672
3673 /*
3674 * This loop will be executed once per cluster referenced.
3675 * Typically this means once, since it's unlikely that the
3676 * VM system will ask for anything spanning cluster boundaries.
3677 *
3678 * If there are holes in a cluster (in a paging segment), we stop
3679 * reading at the hole, then loop again, hoping to
3680 * find valid pages later in the cluster. This continues until
3681 * the entire range has been examined, and read, if present. The
3682 * pages are written as they are read. If a failure occurs after
3683 * some pages are written the unmap call at the bottom of the loop
3684 * recovers the backing store and the old backing store remains
3685 * in effect.
3686 */
3687
3688 VSM_CLR(write_vsmap);
3689 VSM_CLR(original_read_vsmap);
3690 /* grab the actual object's pages to sync with I/O */
3691 while (cnt && (error == KERN_SUCCESS)) {
3692 vsmap_ptr = vs_get_map_entry(vs, offset);
3693 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3694
3695 if (actual_offset == (dp_offset_t) -1) {
3696
3697 /*
3698 * Nothing left to write in this cluster; at least
3699 * set the write cluster information for any previous
3700 * write, and clear it for the next cluster, if there is one
3701 */
3702 unsigned int local_size, clmask, clsize;
3703
3704 clsize = vm_page_size << vs->vs_clshift;
3705 clmask = clsize - 1;
3706 local_size = clsize - (offset & clmask);
3707 ASSERT(local_size);
3708 local_size = MIN(local_size, cnt);
3709
3710 /* This cluster has no data in it beyond what may */
3711 /* have been found on a previous iteration through */
3712 /* the loop "write_vsmap" */
3713 *vsmap_ptr = write_vsmap;
3714 VSM_CLR(write_vsmap);
3715 VSM_CLR(original_read_vsmap);
3716
3717 cnt -= local_size;
3718 offset += local_size;
3719 continue;
3720 }
3721
3722 /*
3723 * Count up contiguous available or unavailable
3724 * pages.
3725 */
3726 ps = CLMAP_PS(clmap);
3727 ASSERT(ps);
3728 size = 0;
3729 unavail_size = 0;
3730 for (i = 0;
3731 (size < cnt) && (unavail_size < cnt) &&
3732 (i < CLMAP_NPGS(clmap)); i++) {
3733 if (CLMAP_ISSET(clmap, i)) {
3734 if (unavail_size != 0)
3735 break;
3736 size += vm_page_size;
3737 BS_STAT(ps->ps_bs,
3738 ps->ps_bs->bs_pages_in++);
3739 } else {
3740 if (size != 0)
3741 break;
3742 unavail_size += vm_page_size;
3743 }
3744 }
3745
3746 if (size == 0) {
3747 ASSERT(unavail_size);
3748 ps_clunmap(vs, offset, unavail_size);
3749 cnt -= unavail_size;
3750 offset += unavail_size;
3751 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3752 == 0) {
3753 /* There is no more to transfer in this
3754 cluster
3755 */
3756 *vsmap_ptr = write_vsmap;
3757 VSM_CLR(write_vsmap);
3758 VSM_CLR(original_read_vsmap);
3759 }
3760 continue;
3761 }
3762
3763 if(VSM_ISCLR(original_read_vsmap))
3764 original_read_vsmap = *vsmap_ptr;
3765
3766 if(ps->ps_segtype == PS_PARTITION) {
3767 panic("swap partition not supported\n");
3768 /*NOTREACHED*/
3769 error = KERN_FAILURE;
3770 residual = size;
3771 /*
3772 NEED TO ISSUE WITH SYNC & NO COMMIT
3773 error = ps_read_device(ps, actual_offset, &buffer,
3774 size, &residual, flags);
3775 */
3776 } else {
3777 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3778 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
3779 size, &residual,
3780 (UPL_IOSYNC | UPL_NOCOMMIT));
3781 }
3782
3783 read_vsmap = *vsmap_ptr;
3784
3785
3786 /*
3787 * Adjust counts and put data in new BS. Optimize for the
3788 * common case, i.e. no error and/or partial data.
3789 * If there was an error, then we need to error the entire
3790 * range, even if some data was successfully read.
3791 *
3792 */
3793 if ((error == KERN_SUCCESS) && (residual == 0)) {
3794
3795 /*
3796 * Got everything we asked for, supply the data to
3797 * the new BS. Note that as a side effect of supplying
3798 * the data, the buffer holding the supplied data is
3799 * deallocated from the pager's address space unless
3800 * the write is unsuccessful.
3801 */
3802
3803 /* note: the buffer will be cleaned up in all cases by */
3804 /* internal_cluster_write or, if there is an error on */
3805 /* the write, by the vm_map_copy_page_discard call */
3806 *vsmap_ptr = write_vsmap;
3807
3808 if(vs_cluster_write(vs, upl, offset,
3809 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3810 error = KERN_FAILURE;
3811 if(!(VSM_ISCLR(*vsmap_ptr))) {
3812 /* unmap the new backing store object */
3813 ps_clunmap(vs, offset, size);
3814 }
3815 /* original vsmap */
3816 *vsmap_ptr = original_read_vsmap;
3817 VSM_CLR(write_vsmap);
3818 } else {
3819 if((offset + size) &
3820 ((vm_page_size << vs->vs_clshift)
3821 - 1)) {
3822 /* There is more to transfer in this
3823 cluster
3824 */
3825 write_vsmap = *vsmap_ptr;
3826 *vsmap_ptr = read_vsmap;
3827 ps_clunmap(vs, offset, size);
3828 } else {
3829 /* discard the old backing object */
3830 write_vsmap = *vsmap_ptr;
3831 *vsmap_ptr = read_vsmap;
3832 ps_clunmap(vs, offset, size);
3833 *vsmap_ptr = write_vsmap;
3834 VSM_CLR(write_vsmap);
3835 VSM_CLR(original_read_vsmap);
3836 }
3837 }
3838 } else {
3839 size_wanted = size;
3840 if (error == KERN_SUCCESS) {
3841 if (residual == size) {
3842 /*
3843 * If a read operation returns no error
3844 * and no data moved, we turn it into
3845 * an error, assuming we're reading at
3846 * or beyond EOF.
3847 * Fall through and error the entire
3848 * range.
3849 */
3850 error = KERN_FAILURE;
3851 *vsmap_ptr = write_vsmap;
3852 if(!(VSM_ISCLR(*vsmap_ptr))) {
3853 /* unmap the new backing store object */
3854 ps_clunmap(vs, offset, size);
3855 }
3856 *vsmap_ptr = original_read_vsmap;
3857 VSM_CLR(write_vsmap);
3858 continue;
3859 } else {
3860 /*
3861 * Otherwise, we have a partial read.
3862 * This is also considered an error
3863 * for the purposes of cluster transfer
3864 */
3865 error = KERN_FAILURE;
3866 *vsmap_ptr = write_vsmap;
3867 if(!(VSM_ISCLR(*vsmap_ptr))) {
3868 /* unmap the new backing store object */
3869 ps_clunmap(vs, offset, size);
3870 }
3871 *vsmap_ptr = original_read_vsmap;
3872 VSM_CLR(write_vsmap);
3873 continue;
3874 }
3875 }
3876
3877 }
3878 cnt -= size;
3879 offset += size;
3880
3881 } /* END while (cnt && (error == 0)) */
3882 if(!VSM_ISCLR(write_vsmap))
3883 *vsmap_ptr = write_vsmap;
3884
3885 return error;
3886 }
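/*
 * Illustrative summary of the map-entry juggling in vs_cluster_transfer()
 * (not part of the original source): original_read_vsmap snapshots the
 * entry as it referred to the old segment, read_vsmap is the entry as
 * seen just before each write, and write_vsmap accumulates the mapping
 * built up in the new segment by vs_cluster_write().  As each chunk
 * succeeds, the old backing store for it is released with ps_clunmap(),
 * and once a cluster is complete write_vsmap is installed; on any failure
 * the entry is restored from original_read_vsmap so the old backing store
 * remains in effect.
 */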
3887
3888 kern_return_t
3889 default_pager_add_file(
3890 MACH_PORT_FACE backing_store,
3891 vnode_ptr_t vp,
3892 int record_size,
3893 vm_size_t size)
3894 {
3895 backing_store_t bs;
3896 paging_segment_t ps;
3897 int i;
3898 unsigned int j;
3899 int error;
3900
3901 if ((bs = backing_store_lookup(backing_store))
3902 == BACKING_STORE_NULL)
3903 return KERN_INVALID_ARGUMENT;
3904
3905 PSL_LOCK();
3906 for (i = 0; i <= paging_segment_max; i++) {
3907 ps = paging_segments[i];
3908 if (ps == PAGING_SEGMENT_NULL)
3909 continue;
3910 if (ps->ps_segtype != PS_FILE)
3911 continue;
3912
3913 /*
3914 * Check for overlap on same device.
3915 */
3916 if (ps->ps_vnode == (struct vnode *)vp) {
3917 PSL_UNLOCK();
3918 BS_UNLOCK(bs);
3919 return KERN_INVALID_ARGUMENT;
3920 }
3921 }
3922 PSL_UNLOCK();
3923
3924 /*
3925 * Set up the paging segment
3926 */
3927 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3928 if (ps == PAGING_SEGMENT_NULL) {
3929 BS_UNLOCK(bs);
3930 return KERN_RESOURCE_SHORTAGE;
3931 }
3932
3933 ps->ps_segtype = PS_FILE;
3934 ps->ps_vnode = (struct vnode *)vp;
3935 ps->ps_offset = 0;
3936 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3937 assert((dp_size_t) size == size);
3938 ps->ps_recnum = (dp_size_t) size;
3939 ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;
3940
3941 ps->ps_pgcount = ps->ps_pgnum;
3942 ps->ps_clshift = local_log2(bs->bs_clsize);
3943 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3944 ps->ps_special_clusters = 0;
3945 ps->ps_hint = 0;
3946
3947 PS_LOCK_INIT(ps);
3948 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3949 if (!ps->ps_bmap) {
3950 kfree(ps, sizeof *ps);
3951 BS_UNLOCK(bs);
3952 return KERN_RESOURCE_SHORTAGE;
3953 }
3954 for (j = 0; j < ps->ps_ncls; j++) {
3955 clrbit(ps->ps_bmap, j);
3956 }
3957
3958 if(paging_segment_count == 0) {
3959 ps->ps_state = PS_EMERGENCY_SEGMENT;
3960 if(use_emergency_swap_file_first) {
3961 ps->ps_state |= PS_CAN_USE;
3962 }
3963 emergency_segment_backing_store = backing_store;
3964 } else {
3965 ps->ps_state = PS_CAN_USE;
3966 }
3967
3968 ps->ps_bs = bs;
3969
3970 if ((error = ps_enter(ps)) != 0) {
3971 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3972 kfree(ps, sizeof *ps);
3973 BS_UNLOCK(bs);
3974 return KERN_RESOURCE_SHORTAGE;
3975 }
3976
3977 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3978 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3979 PSL_LOCK();
3980 if(IS_PS_OK_TO_USE(ps)) {
3981 dp_pages_free += ps->ps_pgcount;
3982 } else {
3983 dp_pages_reserve += ps->ps_pgcount;
3984 }
3985 PSL_UNLOCK();
3986
3987 BS_UNLOCK(bs);
3988
3989 bs_more_space(ps->ps_clcount);
3990
3991 DP_DEBUG(DEBUG_BS_INTERNAL,
3992 ("vnode=%p,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3993 ps->ps_vnode, ps->ps_offset, (dp_size_t) size, record_size,
3994 ps->ps_record_shift, ps->ps_pgnum));
3995
3996 /*
3997 * If the paging segment being activated is not the emergency
3998 * segment and we notice that the emergency segment is being
3999 * used, then we help recover it.  If all goes well, the
4000 * emergency segment will be back to its original state of
4001 * online but not activated (until it is needed again).
4002 */
4003 ps = paging_segments[EMERGENCY_PSEG_INDEX];
4004 if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
4005 if(default_pager_backing_store_delete(emergency_segment_backing_store)) {
4006 dprintf(("Failed to recover emergency paging segment\n"));
4007 } else {
4008 dprintf(("Recovered emergency paging segment\n"));
4009 }
4010 }
4011
4012 return KERN_SUCCESS;
4013 }
4014
4015
4016
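/*
 * ps_read_file: page data in from the paging file backing "ps".
 * "offset" is relative to the segment; the file offset is formed by
 * adding ps_offset.  The read goes through vnode_pagein(), which
 * either satisfies the whole request or fails, so *residualp is
 * always zero on success.
 */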
4017 kern_return_t
4018 ps_read_file(
4019 paging_segment_t ps,
4020 upl_t upl,
4021 upl_offset_t upl_offset,
4022 dp_offset_t offset,
4023 upl_size_t size,
4024 unsigned int *residualp,
4025 int flags)
4026 {
4027 vm_object_offset_t f_offset;
4028 int error = 0;
4029 int result;
4030
4031 assert(dp_encryption_inited);
4032
4033 clustered_reads[atop_32(size)]++;
4034
4035 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4036
4037 /*
4038 * For the transfer case we need to pass the upl_offset and the flags.
4039 */
4040 assert((upl_size_t) size == size);
4041 error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);
4042
4043 /*
4044 * The vnode_pagein semantic is somewhat at odds with the existing
4045 * device_read semantic: partial reads do not occur at this level.
4046 * The bitmap and cluster-read code check that requested locations are
4047 * backed; the pagein code reads all requested data or returns an error.
4048 */
4049
4050 if (error)
4051 result = KERN_FAILURE;
4052 else {
4053 *residualp = 0;
4054 result = KERN_SUCCESS;
4055 }
4056 return result;
4057 }
4058
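/*
 * ps_write_file: page data out to the paging file backing "ps" via
 * vnode_pageout().  If the UPL is marked UPL_PAGING_ENCRYPTED, its
 * pages are encrypted before the write is issued.
 */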
4059 kern_return_t
4060 ps_write_file(
4061 paging_segment_t ps,
4062 upl_t upl,
4063 upl_offset_t upl_offset,
4064 dp_offset_t offset,
4065 unsigned int size,
4066 int flags)
4067 {
4068 vm_object_offset_t f_offset;
4069 kern_return_t result;
4070
4071 assert(dp_encryption_inited);
4072
4073 clustered_writes[atop_32(size)]++;
4074 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4075
4076 if (flags & UPL_PAGING_ENCRYPTED) {
4077 /*
4078 * ENCRYPTED SWAP:
4079 * encrypt all the pages that we're going
4080 * to pageout.
4081 */
4082 upl_encrypt(upl, upl_offset, size);
4083 }
4084 assert((upl_size_t) size == size);
4085 if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
4086 result = KERN_FAILURE;
4087 else
4088 result = KERN_SUCCESS;
4089
4090 return result;
4091 }
4092
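/*
 * default_pager_triggers: control interface used by the backing-store
 * manager (normally the user-space dynamic_pager daemon) to register
 * the high/low watermark notification ports, to turn swap encryption
 * on or off before the encryption state has been established, and to
 * report swap file creation errors.  Whichever send right is no longer
 * needed (the caller's trigger_port or a previously registered port)
 * is collected in "release" and dropped once the lock is released.
 * Illustrative only: a manager would typically reach this through the
 * macx_triggers() trap, e.g.
 *	macx_triggers(hi_water, low_water, HI_WAT_ALERT, alert_port);
 */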
4093 kern_return_t
4094 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
4095 int hi_wat,
4096 int lo_wat,
4097 int flags,
4098 MACH_PORT_FACE trigger_port)
4099 {
4100 MACH_PORT_FACE release;
4101 kern_return_t kr;
4102 clock_sec_t now;
4103 clock_nsec_t nanoseconds_dummy;
4104 static clock_sec_t error_notify = 0;
4105
4106 PSL_LOCK();
4107 if (flags == SWAP_ENCRYPT_ON) {
4108 /* ENCRYPTED SWAP: turn encryption on */
4109 release = trigger_port;
4110 if (!dp_encryption_inited) {
4111 dp_encryption_inited = TRUE;
4112 dp_encryption = TRUE;
4113 kr = KERN_SUCCESS;
4114 } else {
4115 kr = KERN_FAILURE;
4116 }
4117 } else if (flags == SWAP_ENCRYPT_OFF) {
4118 /* ENCRYPTED SWAP: turn encryption off */
4119 release = trigger_port;
4120 if (!dp_encryption_inited) {
4121 dp_encryption_inited = TRUE;
4122 dp_encryption = FALSE;
4123 kr = KERN_SUCCESS;
4124 } else {
4125 kr = KERN_FAILURE;
4126 }
4127 } else if (flags == HI_WAT_ALERT) {
4128 release = min_pages_trigger_port;
4129 min_pages_trigger_port = trigger_port;
4130 minimum_pages_remaining = hi_wat/vm_page_size;
4131 bs_low = FALSE;
4132 kr = KERN_SUCCESS;
4133 } else if (flags == LO_WAT_ALERT) {
4134 release = max_pages_trigger_port;
4135 max_pages_trigger_port = trigger_port;
4136 maximum_pages_free = lo_wat/vm_page_size;
4137 kr = KERN_SUCCESS;
4138 } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
4139 use_emergency_swap_file_first = TRUE;
4140 release = trigger_port;
4141 kr = KERN_SUCCESS;
4142 } else if (flags == SWAP_FILE_CREATION_ERROR) {
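/*
 * A swap file could not be created.  If the emergency segment is
 * all we have, allow it to be used, then run the no-paging-space
 * handling; the warning is rate-limited to one every 5 seconds.
 */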
4143 release = trigger_port;
4144 kr = KERN_SUCCESS;
4145 if( paging_segment_count == 1) {
4146 use_emergency_swap_file_first = TRUE;
4147 }
4148 no_paging_space_action();
4149 clock_get_system_nanotime(&now, &nanoseconds_dummy);
4150 if (now > error_notify + 5) {
4151 dprintf(("Swap File Error.\n"));
4152 error_notify = now;
4153 }
4154 } else {
4155 release = trigger_port;
4156 kr = KERN_INVALID_ARGUMENT;
4157 }
4158 PSL_UNLOCK();
4159
4160 if (IP_VALID(release))
4161 ipc_port_release_send(release);
4162
4163 return kr;
4164 }
4165
4166 /*
4167 * Monitor the amount of available backing store vs. the amount of
4168 * required backing store, notify a listener (if present) when
4169 * backing store may safely be removed.
4170 *
4171 * We attempt to avoid the situation where backing store is
4172 * discarded en masse, as this can lead to thrashing as the
4173 * backing store is compacted.
4174 */
4175
4176 #define PF_INTERVAL 3 /* time between free level checks */
4177 #define PF_LATENCY 10 /* number of intervals before release */
4178
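/* consecutive checks that found dp_pages_free above maximum_pages_free */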
4179 static int dp_pages_free_low_count = 0;
4180 thread_call_t default_pager_backing_store_monitor_callout;
4181
4182 void
4183 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
4184 __unused thread_call_param_t p2)
4185 {
4186 // unsigned long long average;
4187 ipc_port_t trigger;
4188 uint64_t deadline;
4189
4190 /*
4191 * We determine whether it is safe to release some backing
4192 * store by watching the free page level.  If it remains
4193 * above the maximum_pages_free threshold for at least
4194 * PF_LATENCY consecutive checks (taken every PF_INTERVAL
4195 * seconds), we deem it safe and send the LO_WAT_ALERT.
4196 *
4197 * Note that this establishes a maximum rate at which backing
4198 * store will be released, as each notification (currently)
4199 * only results in a single backing store object being
4200 * released.
4201 */
4202 if (dp_pages_free > maximum_pages_free) {
4203 dp_pages_free_low_count++;
4204 } else {
4205 dp_pages_free_low_count = 0;
4206 }
4207
4208 /* decide whether to send notification */
4209 trigger = IP_NULL;
4210 if (max_pages_trigger_port &&
4211 (backing_store_release_trigger_disable == 0) &&
4212 (dp_pages_free_low_count > PF_LATENCY)) {
4213 trigger = max_pages_trigger_port;
4214 max_pages_trigger_port = NULL;
4215 }
4216
4217 /* send notification */
4218 if (trigger != IP_NULL) {
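/*
 * If a backing-store release is already in progress
 * (backing_store_release_trigger_disable is non-zero), wait
 * for it to finish before sending the alert.
 */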
4219 VSL_LOCK();
4220 if(backing_store_release_trigger_disable != 0) {
4221 assert_wait((event_t)
4222 &backing_store_release_trigger_disable,
4223 THREAD_UNINT);
4224 VSL_UNLOCK();
4225 thread_block(THREAD_CONTINUE_NULL);
4226 } else {
4227 VSL_UNLOCK();
4228 }
4229 default_pager_space_alert(trigger, LO_WAT_ALERT);
4230 ipc_port_release_send(trigger);
4231 dp_pages_free_low_count = 0;
4232 }
4233
4234 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
4235 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
4236 }