osfmk/default_pager/dp_backing_store.c  [xnu-792.13.8]
1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /*
31 * @OSF_COPYRIGHT@
32 */
33 /*
34 * Mach Operating System
35 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
36 * All Rights Reserved.
37 *
38 * Permission to use, copy, modify and distribute this software and its
39 * documentation is hereby granted, provided that both the copyright
40 * notice and this permission notice appear in all copies of the
41 * software, derivative works or modified versions, and any portions
42 * thereof, and that both notices appear in supporting documentation.
43 *
44 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
45 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
46 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
47 *
48 * Carnegie Mellon requests users of this software to return to
49 *
50 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
51 * School of Computer Science
52 * Carnegie Mellon University
53 * Pittsburgh PA 15213-3890
54 *
55 * any improvements or extensions that they make and grant Carnegie Mellon
56 * the rights to redistribute these changes.
57 */
58
59 /*
60 * Default Pager.
61 * Paging File Management.
62 */
63
64 #include <mach/host_priv.h>
65 #include <mach/memory_object_control.h>
66 #include <mach/memory_object_server.h>
67 #include <mach/upl.h>
68 #include <default_pager/default_pager_internal.h>
69 #include <default_pager/default_pager_alerts.h>
70 #include <default_pager/default_pager_object_server.h>
71
72 #include <ipc/ipc_types.h>
73 #include <ipc/ipc_port.h>
74 #include <ipc/ipc_space.h>
75
76 #include <kern/kern_types.h>
77 #include <kern/host.h>
78 #include <kern/queue.h>
79 #include <kern/counters.h>
80 #include <kern/sched_prim.h>
81
82 #include <vm/vm_kern.h>
83 #include <vm/vm_pageout.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_protos.h>
87
88 /* LP64todo - need large internal object support */
89
90 /*
91 * ALLOC_STRIDE... the maximum number of bytes allocated from
92 * a swap file before moving on to the next swap file... if
93 * all swap files reside on a single disk, this value should
94 * be very large (this is the default assumption)... if the
95 * swap files are spread across multiple disks, then this value
96 * should be small (128 * 1024)...
97 *
98 * This should be determined dynamically in the future
99 */
100
101 #define ALLOC_STRIDE (1024 * 1024 * 1024)
102 int physical_transfer_cluster_count = 0;
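/*
 * Illustrative sketch (not part of the original source): ps_select_segment()
 * below advances to the next same-priority segment once
 * physical_transfer_cluster_count reaches
 *
 *	ALLOC_STRIDE >> (ps_clshift + vm_page_shift)
 *
 * clusters.  Assuming 4K pages (vm_page_shift == 12) and the default
 * cluster shift of 2, that is (1 << 30) >> 14 == 65536 clusters, i.e.
 * roughly 1 GB of allocations per segment before striding on.
 */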
103
104 #define VM_SUPER_CLUSTER 0x40000
105 #define VM_SUPER_PAGES 64
106
107 /*
108 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
109 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
110 */
111 #define VSTRUCT_DEF_CLSHIFT 2
112 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
113 int default_pager_clsize = 0;
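/*
 * Worked example (illustrative only): with VSTRUCT_DEF_CLSHIFT == 2 a
 * cluster is 1 << 2 == 4 pages; assuming 4K pages that is 16 KB per
 * cluster.  VM_SUPER_CLUSTER (0x40000 bytes) then spans 64 such pages,
 * matching VM_SUPER_PAGES above.
 */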
114
115 /* statistics */
116 unsigned int clustered_writes[VM_SUPER_PAGES+1];
117 unsigned int clustered_reads[VM_SUPER_PAGES+1];
118
119 /*
120 * Globals used for asynchronous paging operations:
121 * vs_async_list: head of list of to-be-completed I/O ops
122 * async_num_queued: number of pages completed, but not yet
123 * processed by async thread.
124 * async_requests_out: number of pages of requests not completed.
125 */
126
127 #if 0
128 struct vs_async *vs_async_list;
129 int async_num_queued;
130 int async_requests_out;
131 #endif
132
133
134 #define VS_ASYNC_REUSE 1
135 struct vs_async *vs_async_free_list;
136
137 mutex_t default_pager_async_lock; /* Protects globals above */
138
139
140 int vs_alloc_async_failed = 0; /* statistics */
141 int vs_alloc_async_count = 0; /* statistics */
142 struct vs_async *vs_alloc_async(void); /* forward */
143 void vs_free_async(struct vs_async *vsa); /* forward */
144
145
146 #define VS_ALLOC_ASYNC() vs_alloc_async()
147 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
148
149 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
150 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
151 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, 0)
152 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
153 /*
154 * Paging Space Hysteresis triggers and the target notification port
155 *
156 */
157
158 unsigned int minimum_pages_remaining = 0;
159 unsigned int maximum_pages_free = 0;
160 ipc_port_t min_pages_trigger_port = NULL;
161 ipc_port_t max_pages_trigger_port = NULL;
162
163 boolean_t bs_low = FALSE;
164 int backing_store_release_trigger_disable = 0;
165
166
167 /* Have we decided if swap needs to be encrypted yet ? */
168 boolean_t dp_encryption_inited = FALSE;
169 /* Should we encrypt swap ? */
170 boolean_t dp_encryption = FALSE;
171
172
173 /*
174 * Object sizes are rounded up to the next power of 2,
175 * unless they are bigger than a given maximum size.
176 */
177 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
178
179 /*
180 * List of all backing store and segments.
181 */
182 struct backing_store_list_head backing_store_list;
183 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
184 mutex_t paging_segments_lock;
185 int paging_segment_max = 0;
186 int paging_segment_count = 0;
187 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
188
189
190 /*
191 * Total pages free in system
192 * This differs from clusters committed/avail, which is a measure of the
193 * over-commitment of paging segments to backing store, an idea which is
194 * likely to be deprecated.
195 */
196 unsigned int dp_pages_free = 0;
197 unsigned int cluster_transfer_minimum = 100;
198
199 /* forward declarations */
200 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, int); /* forward */
201 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
202 default_pager_thread_t *get_read_buffer( void );
203 kern_return_t ps_vstruct_transfer_from_segment(
204 vstruct_t vs,
205 paging_segment_t segment,
206 upl_t upl);
207 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
208 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
209 kern_return_t vs_cluster_transfer(
210 vstruct_t vs,
211 upl_offset_t offset,
212 upl_size_t cnt,
213 upl_t upl);
214 vs_map_t vs_get_map_entry(
215 vstruct_t vs,
216 vm_offset_t offset);
217
218
219 default_pager_thread_t *
220 get_read_buffer( void )
221 {
222 int i;
223
224 DPT_LOCK(dpt_lock);
225 while(TRUE) {
226 for (i=0; i<default_pager_internal_count; i++) {
227 if(dpt_array[i]->checked_out == FALSE) {
228 dpt_array[i]->checked_out = TRUE;
229 DPT_UNLOCK(dpt_lock);
230 return dpt_array[i];
231 }
232 }
233 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
234 }
235 }
236
237 void
238 bs_initialize(void)
239 {
240 int i;
241
242 /*
243 * List of all backing store.
244 */
245 BSL_LOCK_INIT();
246 queue_init(&backing_store_list.bsl_queue);
247 PSL_LOCK_INIT();
248
249 VS_ASYNC_LOCK_INIT();
250 #if VS_ASYNC_REUSE
251 vs_async_free_list = NULL;
252 #endif /* VS_ASYNC_REUSE */
253
254 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
255 clustered_writes[i] = 0;
256 clustered_reads[i] = 0;
257 }
258
259 }
260
261 /*
262 * When things do not quite work out...
263 */
264 void bs_no_paging_space(boolean_t); /* forward */
265
266 void
267 bs_no_paging_space(
268 boolean_t out_of_memory)
269 {
270
271 if (out_of_memory)
272 dprintf(("*** OUT OF MEMORY ***\n"));
273 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
274 }
275
276 void bs_more_space(int); /* forward */
277 void bs_commit(int); /* forward */
278
279 boolean_t user_warned = FALSE;
280 unsigned int clusters_committed = 0;
281 unsigned int clusters_available = 0;
282 unsigned int clusters_committed_peak = 0;
283
284 void
285 bs_more_space(
286 int nclusters)
287 {
288 BSL_LOCK();
289 /*
290 * Account for new paging space.
291 */
292 clusters_available += nclusters;
293
294 if (clusters_available >= clusters_committed) {
295 if (verbose && user_warned) {
296 printf("%s%s - %d excess clusters now.\n",
297 my_name,
298 "paging space is OK now",
299 clusters_available - clusters_committed);
300 user_warned = FALSE;
301 clusters_committed_peak = 0;
302 }
303 } else {
304 if (verbose && user_warned) {
305 printf("%s%s - still short of %d clusters.\n",
306 my_name,
307 "WARNING: paging space over-committed",
308 clusters_committed - clusters_available);
309 clusters_committed_peak -= nclusters;
310 }
311 }
312 BSL_UNLOCK();
313
314 return;
315 }
316
317 void
318 bs_commit(
319 int nclusters)
320 {
321 BSL_LOCK();
322 clusters_committed += nclusters;
323 if (clusters_committed > clusters_available) {
324 if (verbose && !user_warned) {
325 user_warned = TRUE;
326 printf("%s%s - short of %d clusters.\n",
327 my_name,
328 "WARNING: paging space over-committed",
329 clusters_committed - clusters_available);
330 }
331 if (clusters_committed > clusters_committed_peak) {
332 clusters_committed_peak = clusters_committed;
333 }
334 } else {
335 if (verbose && user_warned) {
336 printf("%s%s - was short of up to %d clusters.\n",
337 my_name,
338 "paging space is OK now",
339 clusters_committed_peak - clusters_available);
340 user_warned = FALSE;
341 clusters_committed_peak = 0;
342 }
343 }
344 BSL_UNLOCK();
345
346 return;
347 }
348
349 int default_pager_info_verbose = 1;
350
351 void
352 bs_global_info(
353 vm_size_t *totalp,
354 vm_size_t *freep)
355 {
356 vm_size_t pages_total, pages_free;
357 paging_segment_t ps;
358 int i;
359
360 PSL_LOCK();
361 pages_total = pages_free = 0;
362 for (i = 0; i <= paging_segment_max; i++) {
363 ps = paging_segments[i];
364 if (ps == PAGING_SEGMENT_NULL)
365 continue;
366
367 /*
368 * no need to lock: by the time this data
369 * gets back to any remote requestor it
370 * will be obsolete anyway
371 */
372 pages_total += ps->ps_pgnum;
373 pages_free += ps->ps_clcount << ps->ps_clshift;
374 DP_DEBUG(DEBUG_BS_INTERNAL,
375 ("segment #%d: %d total, %d free\n",
376 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
377 }
378 *totalp = pages_total;
379 *freep = pages_free;
380 if (verbose && user_warned && default_pager_info_verbose) {
381 if (clusters_available < clusters_committed) {
382 printf("%s %d clusters committed, %d available.\n",
383 my_name,
384 clusters_committed,
385 clusters_available);
386 }
387 }
388 PSL_UNLOCK();
389 }
390
391 backing_store_t backing_store_alloc(void); /* forward */
392
393 backing_store_t
394 backing_store_alloc(void)
395 {
396 backing_store_t bs;
397
398 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
399 if (bs == BACKING_STORE_NULL)
400 panic("backing_store_alloc: no memory");
401
402 BS_LOCK_INIT(bs);
403 bs->bs_port = MACH_PORT_NULL;
404 bs->bs_priority = 0;
405 bs->bs_clsize = 0;
406 bs->bs_pages_total = 0;
407 bs->bs_pages_in = 0;
408 bs->bs_pages_in_fail = 0;
409 bs->bs_pages_out = 0;
410 bs->bs_pages_out_fail = 0;
411
412 return bs;
413 }
414
415 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
416
417 /* In both the component space and external versions of this pager, */
418 /* backing_store_lookup will be called from tasks in the application space */
419 backing_store_t
420 backing_store_lookup(
421 MACH_PORT_FACE port)
422 {
423 backing_store_t bs;
424
425 /*
426 The port is currently backed with a vs structure in the alias field.
427 We could create an ISBS alias and a port_is_bs call, but frankly
428 I see no reason for the test; the bs->port == port check below
429 will work properly on junk entries.
430
431 if ((port == MACH_PORT_NULL) || port_is_vs(port))
432 */
433 if ((port == MACH_PORT_NULL))
434 return BACKING_STORE_NULL;
435
436 BSL_LOCK();
437 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
438 bs_links) {
439 BS_LOCK(bs);
440 if (bs->bs_port == port) {
441 BSL_UNLOCK();
442 /* Success, return it locked. */
443 return bs;
444 }
445 BS_UNLOCK(bs);
446 }
447 BSL_UNLOCK();
448 return BACKING_STORE_NULL;
449 }
450
451 void backing_store_add(backing_store_t); /* forward */
452
453 void
454 backing_store_add(
455 __unused backing_store_t bs)
456 {
457 // MACH_PORT_FACE port = bs->bs_port;
458 // MACH_PORT_FACE pset = default_pager_default_set;
459 kern_return_t kr = KERN_SUCCESS;
460
461 if (kr != KERN_SUCCESS)
462 panic("backing_store_add: add to set");
463
464 }
465
466 /*
467 * Set up the default cluster shift, but only if not already
468 * set and argument is within range.
469 */
470 boolean_t
471 bs_set_default_clsize(unsigned int npages)
472 {
473 switch(npages){
474 case 1:
475 case 2:
476 case 4:
477 case 8:
478 if (default_pager_clsize == 0) /* if not yet set */
479 vstruct_def_clshift = local_log2(npages);
480 return(TRUE);
481 }
482 return(FALSE);
483 }
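/*
 * Usage sketch (illustrative): bs_set_default_clsize(4) sets
 * vstruct_def_clshift to local_log2(4) == 2, provided the cluster size
 * has not been pinned yet; it returns TRUE for any of the accepted
 * values (1, 2, 4 or 8) and FALSE otherwise.
 */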
484
485 int bs_get_global_clsize(int clsize); /* forward */
486
487 int
488 bs_get_global_clsize(
489 int clsize)
490 {
491 int i;
492 memory_object_default_t dmm;
493 kern_return_t kr;
494
495 /*
496 * Only allow setting of cluster size once. If called
497 * with no cluster size (default), we use the compiled-in default
498 * for the duration. The same cluster size is used for all
499 * paging segments.
500 */
501 if (default_pager_clsize == 0) {
502 /*
503 * Keep cluster size in bit shift because it's quicker
504 * arithmetic, and easier to keep at a power of 2.
505 */
506 if (clsize != NO_CLSIZE) {
507 for (i = 0; (1 << i) < clsize; i++);
508 if (i > MAX_CLUSTER_SHIFT)
509 i = MAX_CLUSTER_SHIFT;
510 vstruct_def_clshift = i;
511 }
512 default_pager_clsize = (1 << vstruct_def_clshift);
513
514 /*
515 * Let the user know the new (and definitive) cluster size.
516 */
517 if (verbose)
518 printf("%scluster size = %d page%s\n",
519 my_name, default_pager_clsize,
520 (default_pager_clsize == 1) ? "" : "s");
521
522 /*
523 * Let the kernel know too, in case it hasn't used the
524 * default value provided in main() yet.
525 */
526 dmm = default_pager_object;
527 clsize = default_pager_clsize * vm_page_size; /* in bytes */
528 kr = host_default_memory_manager(host_priv_self(),
529 &dmm,
530 clsize);
531 memory_object_default_deallocate(dmm);
532
533 if (kr != KERN_SUCCESS) {
534 panic("bs_get_global_cl_size:host_default_memory_manager");
535 }
536 if (dmm != default_pager_object) {
537 panic("bs_get_global_cl_size:there is another default pager");
538 }
539 }
540 ASSERT(default_pager_clsize > 0 &&
541 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
542
543 return default_pager_clsize;
544 }
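/*
 * Usage sketch (illustrative): the first call pins the cluster size for
 * the life of the pager; later callers simply read it back, e.g.
 *
 *	vs->vs_clshift = local_log2(bs_get_global_clsize(0));
 *
 * as ps_vstruct_create() does below.
 */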
545
546 kern_return_t
547 default_pager_backing_store_create(
548 memory_object_default_t pager,
549 int priority,
550 int clsize, /* in bytes */
551 MACH_PORT_FACE *backing_store)
552 {
553 backing_store_t bs;
554 MACH_PORT_FACE port;
555 // kern_return_t kr;
556 struct vstruct_alias *alias_struct;
557
558 if (pager != default_pager_object)
559 return KERN_INVALID_ARGUMENT;
560
561 bs = backing_store_alloc();
562 port = ipc_port_alloc_kernel();
563 ipc_port_make_send(port);
564 assert (port != IP_NULL);
565
566 DP_DEBUG(DEBUG_BS_EXTERNAL,
567 ("priority=%d clsize=%d bs_port=0x%x\n",
568 priority, clsize, (int) backing_store));
569
570 alias_struct = (struct vstruct_alias *)
571 kalloc(sizeof (struct vstruct_alias));
572 if(alias_struct != NULL) {
573 alias_struct->vs = (struct vstruct *)bs;
574 alias_struct->name = &default_pager_ops;
575 port->alias = (int) alias_struct;
576 }
577 else {
578 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
579 kfree(bs, sizeof (struct backing_store));
580 return KERN_RESOURCE_SHORTAGE;
581 }
582
583 bs->bs_port = port;
584 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
585 priority = BS_MAXPRI;
586 else if (priority == BS_NOPRI)
587 priority = BS_MAXPRI;
588 else
589 priority = BS_MINPRI;
590 bs->bs_priority = priority;
591
592 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
593
594 BSL_LOCK();
595 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
596 bs_links);
597 BSL_UNLOCK();
598
599 backing_store_add(bs);
600
601 *backing_store = port;
602 return KERN_SUCCESS;
603 }
604
605 kern_return_t
606 default_pager_backing_store_info(
607 MACH_PORT_FACE backing_store,
608 backing_store_flavor_t flavour,
609 backing_store_info_t info,
610 mach_msg_type_number_t *size)
611 {
612 backing_store_t bs;
613 backing_store_basic_info_t basic;
614 int i;
615 paging_segment_t ps;
616
617 if (flavour != BACKING_STORE_BASIC_INFO ||
618 *size < BACKING_STORE_BASIC_INFO_COUNT)
619 return KERN_INVALID_ARGUMENT;
620
621 basic = (backing_store_basic_info_t)info;
622 *size = BACKING_STORE_BASIC_INFO_COUNT;
623
624 VSTATS_LOCK(&global_stats.gs_lock);
625 basic->pageout_calls = global_stats.gs_pageout_calls;
626 basic->pagein_calls = global_stats.gs_pagein_calls;
627 basic->pages_in = global_stats.gs_pages_in;
628 basic->pages_out = global_stats.gs_pages_out;
629 basic->pages_unavail = global_stats.gs_pages_unavail;
630 basic->pages_init = global_stats.gs_pages_init;
631 basic->pages_init_writes= global_stats.gs_pages_init_writes;
632 VSTATS_UNLOCK(&global_stats.gs_lock);
633
634 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
635 return KERN_INVALID_ARGUMENT;
636
637 basic->bs_pages_total = bs->bs_pages_total;
638 PSL_LOCK();
639 bs->bs_pages_free = 0;
640 for (i = 0; i <= paging_segment_max; i++) {
641 ps = paging_segments[i];
642 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
643 PS_LOCK(ps);
644 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
645 PS_UNLOCK(ps);
646 }
647 }
648 PSL_UNLOCK();
649 basic->bs_pages_free = bs->bs_pages_free;
650 basic->bs_pages_in = bs->bs_pages_in;
651 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
652 basic->bs_pages_out = bs->bs_pages_out;
653 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
654
655 basic->bs_priority = bs->bs_priority;
656 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
657
658 BS_UNLOCK(bs);
659
660 return KERN_SUCCESS;
661 }
662
663 int ps_delete(paging_segment_t); /* forward */
664
665 int
666 ps_delete(
667 paging_segment_t ps)
668 {
669 vstruct_t vs;
670 kern_return_t error = KERN_SUCCESS;
671 int vs_count;
672
673 VSL_LOCK(); /* get the lock on the list of vs's */
674
675 /* The lock relationship and sequence is fairly complicated: */
676 /* this code looks at a live list, locking and unlocking the list */
677 /* as it traverses it. It depends on the locking behavior of */
678 /* default_pager_no_senders. no_senders always locks the vstruct */
679 /* targeted for removal before locking the vstruct list. However */
680 /* it will remove that member of the list without locking its */
681 /* neighbors. We can be sure when we hold a lock on a vstruct */
682 /* it cannot be removed from the list but we must hold the list */
683 /* lock to be sure that its pointers to its neighbors are valid. */
684 /* Also, we can hold off destruction of a vstruct when the list */
685 /* lock and the vs locks are not being held by bumping the */
686 /* vs_async_pending count. */
687
688
689 while(backing_store_release_trigger_disable != 0) {
690 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
691 }
692
693 /* we will choose instead to hold a send right */
694 vs_count = vstruct_list.vsl_count;
695 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
696 if(vs == (vstruct_t)&vstruct_list) {
697 VSL_UNLOCK();
698 return KERN_SUCCESS;
699 }
700 VS_LOCK(vs);
701 vs_async_wait(vs); /* wait for any pending async writes */
702 if ((vs_count != 0) && (vs != NULL))
703 vs->vs_async_pending += 1; /* hold parties calling */
704 /* vs_async_wait */
705 VS_UNLOCK(vs);
706 VSL_UNLOCK();
707 while((vs_count != 0) && (vs != NULL)) {
708 /* We take the count of AMO's before beginning the */
709 /* transfer of the target segment. */
710 /* We are guaranteed that the target segment cannot get */
711 /* more users. We also know that queue entries are */
712 /* made at the back of the list. If some of the entries */
713 /* we would check disappear while we are traversing the */
714 /* list then we will either check new entries which */
715 /* do not have any backing store in the target segment */
716 /* or re-check old entries. This might not be optimal */
717 /* but it will always be correct. The alternative is to */
718 /* take a snapshot of the list. */
719 vstruct_t next_vs;
720
721 if(dp_pages_free < cluster_transfer_minimum)
722 error = KERN_FAILURE;
723 else {
724 vm_object_t transfer_object;
725 unsigned int count;
726 upl_t upl;
727
728 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
729 count = 0;
730 error = vm_object_upl_request(transfer_object,
731 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
732 &upl, NULL, &count,
733 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
734 | UPL_SET_INTERNAL);
735 if(error == KERN_SUCCESS) {
736 error = ps_vstruct_transfer_from_segment(
737 vs, ps, upl);
738 upl_commit(upl, NULL, 0);
739 upl_deallocate(upl);
740 } else {
741 error = KERN_FAILURE;
742 }
743 vm_object_deallocate(transfer_object);
744 }
745 if(error) {
746 VS_LOCK(vs);
747 vs->vs_async_pending -= 1; /* release vs_async_wait */
748 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
749 vs->vs_waiting_async = FALSE;
750 VS_UNLOCK(vs);
751 thread_wakeup(&vs->vs_async_pending);
752 } else {
753 VS_UNLOCK(vs);
754 }
755 return KERN_FAILURE;
756 }
757
758 VSL_LOCK();
759
760 while(backing_store_release_trigger_disable != 0) {
761 VSL_SLEEP(&backing_store_release_trigger_disable,
762 THREAD_UNINT);
763 }
764
765 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
766 if((next_vs != (vstruct_t)&vstruct_list) &&
767 (vs != next_vs) && (vs_count != 1)) {
768 VS_LOCK(next_vs);
769 vs_async_wait(next_vs); /* wait for any */
770 /* pending async writes */
771 next_vs->vs_async_pending += 1; /* hold parties */
772 /* calling vs_async_wait */
773 VS_UNLOCK(next_vs);
774 }
775 VSL_UNLOCK();
776 VS_LOCK(vs);
777 vs->vs_async_pending -= 1;
778 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
779 vs->vs_waiting_async = FALSE;
780 VS_UNLOCK(vs);
781 thread_wakeup(&vs->vs_async_pending);
782 } else {
783 VS_UNLOCK(vs);
784 }
785 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
786 vs = NULL;
787 else
788 vs = next_vs;
789 vs_count--;
790 }
791 return KERN_SUCCESS;
792 }
793
794
795 kern_return_t
796 default_pager_backing_store_delete(
797 MACH_PORT_FACE backing_store)
798 {
799 backing_store_t bs;
800 int i;
801 paging_segment_t ps;
802 int error;
803 int interim_pages_removed = 0;
804 // kern_return_t kr;
805
806 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
807 return KERN_INVALID_ARGUMENT;
808
809 #if 0
810 /* not implemented */
811 BS_UNLOCK(bs);
812 return KERN_FAILURE;
813 #endif
814
815 restart:
816 PSL_LOCK();
817 error = KERN_SUCCESS;
818 for (i = 0; i <= paging_segment_max; i++) {
819 ps = paging_segments[i];
820 if (ps != PAGING_SEGMENT_NULL &&
821 ps->ps_bs == bs &&
822 ! ps->ps_going_away) {
823 PS_LOCK(ps);
824 /* disable access to this segment */
825 ps->ps_going_away = TRUE;
826 PS_UNLOCK(ps);
827 /*
828 * The "ps" segment is "off-line" now,
829 * we can try and delete it...
830 */
831 if(dp_pages_free < (cluster_transfer_minimum
832 + ps->ps_pgcount)) {
833 error = KERN_FAILURE;
834 PSL_UNLOCK();
835 }
836 else {
837 /* remove all pages associated with the */
838 /* segment from the list of free pages */
839 /* when transfer is through, all target */
840 /* segment pages will appear to be free */
841
842 dp_pages_free -= ps->ps_pgcount;
843 interim_pages_removed += ps->ps_pgcount;
844 PSL_UNLOCK();
845 error = ps_delete(ps);
846 }
847 if (error != KERN_SUCCESS) {
848 /*
849 * We couldn't delete the segment,
850 * probably because there's not enough
851 * virtual memory left.
852 * Re-enable all the segments.
853 */
854 PSL_LOCK();
855 break;
856 }
857 goto restart;
858 }
859 }
860
861 if (error != KERN_SUCCESS) {
862 for (i = 0; i <= paging_segment_max; i++) {
863 ps = paging_segments[i];
864 if (ps != PAGING_SEGMENT_NULL &&
865 ps->ps_bs == bs &&
866 ps->ps_going_away) {
867 PS_LOCK(ps);
868 /* re-enable access to this segment */
869 ps->ps_going_away = FALSE;
870 PS_UNLOCK(ps);
871 }
872 }
873 dp_pages_free += interim_pages_removed;
874 PSL_UNLOCK();
875 BS_UNLOCK(bs);
876 return error;
877 }
878
879 for (i = 0; i <= paging_segment_max; i++) {
880 ps = paging_segments[i];
881 if (ps != PAGING_SEGMENT_NULL &&
882 ps->ps_bs == bs) {
883 if(ps->ps_going_away) {
884 paging_segments[i] = PAGING_SEGMENT_NULL;
885 paging_segment_count--;
886 PS_LOCK(ps);
887 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
888 kfree(ps, sizeof *ps);
889 }
890 }
891 }
892
893 /* Scan the entire ps array separately to make certain we find the */
894 /* proper paging_segment_max */
895 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
896 if(paging_segments[i] != PAGING_SEGMENT_NULL)
897 paging_segment_max = i;
898 }
899
900 PSL_UNLOCK();
901
902 /*
903 * All the segments have been deleted.
904 * We can remove the backing store.
905 */
906
907 /*
908 * Disable lookups of this backing store.
909 */
910 if((void *)bs->bs_port->alias != NULL)
911 kfree((void *) bs->bs_port->alias,
912 sizeof (struct vstruct_alias));
913 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
914 bs->bs_port = MACH_PORT_NULL;
915 BS_UNLOCK(bs);
916
917 /*
918 * Remove backing store from backing_store list.
919 */
920 BSL_LOCK();
921 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
922 bs_links);
923 BSL_UNLOCK();
924
925 /*
926 * Free the backing store structure.
927 */
928 kfree(bs, sizeof *bs);
929
930 return KERN_SUCCESS;
931 }
932
933 int ps_enter(paging_segment_t); /* forward */
934
935 int
936 ps_enter(
937 paging_segment_t ps)
938 {
939 int i;
940
941 PSL_LOCK();
942
943 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
944 if (paging_segments[i] == PAGING_SEGMENT_NULL)
945 break;
946 }
947
948 if (i < MAX_NUM_PAGING_SEGMENTS) {
949 paging_segments[i] = ps;
950 if (i > paging_segment_max)
951 paging_segment_max = i;
952 paging_segment_count++;
953 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
954 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
955 ps_select_array[ps->ps_bs->bs_priority] = 0;
956 i = 0;
957 } else {
958 PSL_UNLOCK();
959 return KERN_RESOURCE_SHORTAGE;
960 }
961
962 PSL_UNLOCK();
963 return i;
964 }
965
966 #ifdef DEVICE_PAGING
967 kern_return_t
968 default_pager_add_segment(
969 MACH_PORT_FACE backing_store,
970 MACH_PORT_FACE device,
971 recnum_t offset,
972 recnum_t count,
973 int record_size)
974 {
975 backing_store_t bs;
976 paging_segment_t ps;
977 int i;
978 int error;
979
980 if ((bs = backing_store_lookup(backing_store))
981 == BACKING_STORE_NULL)
982 return KERN_INVALID_ARGUMENT;
983
984 PSL_LOCK();
985 for (i = 0; i <= paging_segment_max; i++) {
986 ps = paging_segments[i];
987 if (ps == PAGING_SEGMENT_NULL)
988 continue;
989
990 /*
991 * Check for overlap on same device.
992 */
993 if (!(ps->ps_device != device
994 || offset >= ps->ps_offset + ps->ps_recnum
995 || offset + count <= ps->ps_offset)) {
996 PSL_UNLOCK();
997 BS_UNLOCK(bs);
998 return KERN_INVALID_ARGUMENT;
999 }
1000 }
1001 PSL_UNLOCK();
1002
1003 /*
1004 * Set up the paging segment
1005 */
1006 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1007 if (ps == PAGING_SEGMENT_NULL) {
1008 BS_UNLOCK(bs);
1009 return KERN_RESOURCE_SHORTAGE;
1010 }
1011
1012 ps->ps_segtype = PS_PARTITION;
1013 ps->ps_device = device;
1014 ps->ps_offset = offset;
1015 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1016 ps->ps_recnum = count;
1017 ps->ps_pgnum = count >> ps->ps_record_shift;
1018
1019 ps->ps_pgcount = ps->ps_pgnum;
1020 ps->ps_clshift = local_log2(bs->bs_clsize);
1021 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
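/*
 * Worked example (illustrative, assuming 4K pages and 512-byte
 * records): ps_record_shift == local_log2(4096 / 512) == 3, so a
 * device of count records holds count >> 3 pages; with a cluster
 * shift of 2 that gives (count >> 3) >> 2 clusters.
 */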
1022 ps->ps_hint = 0;
1023
1024 PS_LOCK_INIT(ps);
1025 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1026 if (!ps->ps_bmap) {
1027 kfree(ps, sizeof *ps);
1028 BS_UNLOCK(bs);
1029 return KERN_RESOURCE_SHORTAGE;
1030 }
1031 for (i = 0; i < ps->ps_ncls; i++) {
1032 clrbit(ps->ps_bmap, i);
1033 }
1034
1035 ps->ps_going_away = FALSE;
1036 ps->ps_bs = bs;
1037
1038 if ((error = ps_enter(ps)) != 0) {
1039 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1040 kfree(ps, sizeof *ps);
1041 BS_UNLOCK(bs);
1042 return KERN_RESOURCE_SHORTAGE;
1043 }
1044
1045 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1046 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1047 BS_UNLOCK(bs);
1048
1049 PSL_LOCK();
1050 dp_pages_free += ps->ps_pgcount;
1051 PSL_UNLOCK();
1052
1053 bs_more_space(ps->ps_clcount);
1054
1055 DP_DEBUG(DEBUG_BS_INTERNAL,
1056 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1057 device, offset, count, record_size,
1058 ps->ps_record_shift, ps->ps_pgnum));
1059
1060 return KERN_SUCCESS;
1061 }
1062
1063 boolean_t
1064 bs_add_device(
1065 char *dev_name,
1066 MACH_PORT_FACE master)
1067 {
1068 security_token_t null_security_token = {
1069 { 0, 0 }
1070 };
1071 MACH_PORT_FACE device;
1072 int info[DEV_GET_SIZE_COUNT];
1073 mach_msg_type_number_t info_count;
1074 MACH_PORT_FACE bs = MACH_PORT_NULL;
1075 unsigned int rec_size;
1076 recnum_t count;
1077 int clsize;
1078 MACH_PORT_FACE reply_port;
1079
1080 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1081 null_security_token, dev_name, &device))
1082 return FALSE;
1083
1084 info_count = DEV_GET_SIZE_COUNT;
1085 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1086 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1087 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1088 clsize = bs_get_global_clsize(0);
1089 if (!default_pager_backing_store_create(
1090 default_pager_object,
1091 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1092 (clsize * vm_page_size),
1093 &bs)) {
1094 if (!default_pager_add_segment(bs, device,
1095 0, count, rec_size)) {
1096 return TRUE;
1097 }
1098 ipc_port_release_receive(bs);
1099 }
1100 }
1101
1102 ipc_port_release_send(device);
1103 return FALSE;
1104 }
1105 #endif /* DEVICE_PAGING */
1106
1107 #if VS_ASYNC_REUSE
1108
1109 struct vs_async *
1110 vs_alloc_async(void)
1111 {
1112 struct vs_async *vsa;
1113 MACH_PORT_FACE reply_port;
1114 // kern_return_t kr;
1115
1116 VS_ASYNC_LOCK();
1117 if (vs_async_free_list == NULL) {
1118 VS_ASYNC_UNLOCK();
1119 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1120 if (vsa != NULL) {
1121 /*
1122 * Try allocating a reply port named after the
1123 * address of the vs_async structure.
1124 */
1125 struct vstruct_alias *alias_struct;
1126
1127 reply_port = ipc_port_alloc_kernel();
1128 alias_struct = (struct vstruct_alias *)
1129 kalloc(sizeof (struct vstruct_alias));
1130 if(alias_struct != NULL) {
1131 alias_struct->vs = (struct vstruct *)vsa;
1132 alias_struct->name = &default_pager_ops;
1133 reply_port->alias = (int) alias_struct;
1134 vsa->reply_port = reply_port;
1135 vs_alloc_async_count++;
1136 }
1137 else {
1138 vs_alloc_async_failed++;
1139 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1140 (reply_port));
1141 kfree(vsa, sizeof (struct vs_async));
1142 vsa = NULL;
1143 }
1144 }
1145 } else {
1146 vsa = vs_async_free_list;
1147 vs_async_free_list = vs_async_free_list->vsa_next;
1148 VS_ASYNC_UNLOCK();
1149 }
1150
1151 return vsa;
1152 }
1153
1154 void
1155 vs_free_async(
1156 struct vs_async *vsa)
1157 {
1158 VS_ASYNC_LOCK();
1159 vsa->vsa_next = vs_async_free_list;
1160 vs_async_free_list = vsa;
1161 VS_ASYNC_UNLOCK();
1162 }
1163
1164 #else /* VS_ASYNC_REUSE */
1165
1166 struct vs_async *
1167 vs_alloc_async(void)
1168 {
1169 struct vs_async *vsa;
struct vstruct_alias *alias_struct;
1170 MACH_PORT_FACE reply_port;
1171 kern_return_t kr;
1172
1173 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1174 if (vsa != NULL) {
1175 /*
1176 * Try allocating a reply port named after the
1177 * address of the vs_async structure.
1178 */
1179 reply_port = ipc_port_alloc_kernel();
1180 alias_struct = (struct vstruct_alias *)
1181 kalloc(sizeof (struct vstruct_alias));
1182 if(alias_struct != NULL) {
1183 alias_struct->vs = (struct vstruct *) vsa;
1184 alias_struct->name = &default_pager_ops;
1185 reply_port->alias = (int) alias_struct;
1186 vsa->reply_port = reply_port;
1187 vs_alloc_async_count++;
1188 }
1189 else {
1190 vs_alloc_async_failed++;
1191 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1192 (reply_port));
1193 kfree(vsa, sizeof (struct vs_async));
1194 vsa = NULL;
1195 }
1196 }
1197
1198 return vsa;
1199 }
1200
1201 void
1202 vs_free_async(
1203 struct vs_async *vsa)
1204 {
1205 MACH_PORT_FACE reply_port;
1206 kern_return_t kr;
1207
1208 reply_port = vsa->reply_port;
1209 kfree((void *) reply_port->alias, sizeof (struct vstruct_alias));
1210 kfree(vsa, sizeof (struct vs_async));
1211 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1212 #if 0
1213 VS_ASYNC_LOCK();
1214 vs_alloc_async_count--;
1215 VS_ASYNC_UNLOCK();
1216 #endif
1217 }
1218
1219 #endif /* VS_ASYNC_REUSE */
1220
1221 zone_t vstruct_zone;
1222
1223 vstruct_t
1224 ps_vstruct_create(
1225 vm_size_t size)
1226 {
1227 vstruct_t vs;
1228 unsigned int i;
1229
1230 vs = (vstruct_t) zalloc(vstruct_zone);
1231 if (vs == VSTRUCT_NULL) {
1232 return VSTRUCT_NULL;
1233 }
1234
1235 VS_LOCK_INIT(vs);
1236
1237 /*
1238 * The following fields will be provided later.
1239 */
1240 vs->vs_pager_ops = NULL;
1241 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1242 vs->vs_references = 1;
1243 vs->vs_seqno = 0;
1244
1245 #ifdef MACH_KERNEL
1246 vs->vs_waiting_seqno = FALSE;
1247 vs->vs_waiting_read = FALSE;
1248 vs->vs_waiting_write = FALSE;
1249 vs->vs_waiting_async = FALSE;
1250 #else
1251 mutex_init(&vs->vs_waiting_seqno, 0);
1252 mutex_init(&vs->vs_waiting_read, 0);
1253 mutex_init(&vs->vs_waiting_write, 0);
1254 mutex_init(&vs->vs_waiting_refs, 0);
1255 mutex_init(&vs->vs_waiting_async, 0);
1256 #endif
1257
1258 vs->vs_readers = 0;
1259 vs->vs_writers = 0;
1260
1261 vs->vs_errors = 0;
1262
1263 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1264 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
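/*
 * Worked example (illustrative): for an object of 9 pages and the
 * default vs_clshift of 2 (4 pages/cluster), vs_size becomes
 * ((9 - 1) >> 2) + 1 == 3 clusters, i.e. the page count rounded up
 * to whole clusters.
 */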
1265 vs->vs_async_pending = 0;
1266
1267 /*
1268 * Allocate the cluster map, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE bytes,
1269 * depending on the size of the memory object.
1270 */
1271 if (INDIRECT_CLMAP(vs->vs_size)) {
1272 vs->vs_imap = (struct vs_map **)
1273 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1274 vs->vs_indirect = TRUE;
1275 } else {
1276 vs->vs_dmap = (struct vs_map *)
1277 kalloc(CLMAP_SIZE(vs->vs_size));
1278 vs->vs_indirect = FALSE;
1279 }
1280 vs->vs_xfer_pending = FALSE;
1281 DP_DEBUG(DEBUG_VS_INTERNAL,
1282 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1283
1284 /*
1285 * Check to see that we got the space.
1286 */
1287 if (!vs->vs_dmap) {
1288 kfree(vs, sizeof *vs);
1289 return VSTRUCT_NULL;
1290 }
1291
1292 /*
1293 * Zero the indirect pointers, or clear the direct pointers.
1294 */
1295 if (vs->vs_indirect)
1296 memset(vs->vs_imap, 0,
1297 INDIRECT_CLMAP_SIZE(vs->vs_size));
1298 else
1299 for (i = 0; i < vs->vs_size; i++)
1300 VSM_CLR(vs->vs_dmap[i]);
1301
1302 VS_MAP_LOCK_INIT(vs);
1303
1304 bs_commit(vs->vs_size);
1305
1306 return vs;
1307 }
1308
1309 paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1310
1311 paging_segment_t
1312 ps_select_segment(
1313 unsigned int shift,
1314 int *psindex)
1315 {
1316 paging_segment_t ps;
1317 int i;
1318 int j;
1319
1320 /*
1321 * Optimize case where there's only one segment.
1322 * paging_segment_max will index the one and only segment.
1323 */
1324
1325 PSL_LOCK();
1326 if (paging_segment_count == 1) {
1327 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1328 ipc_port_t trigger = IP_NULL;
1329
1330 ps = paging_segments[paging_segment_max];
1331 *psindex = paging_segment_max;
1332 PS_LOCK(ps);
1333 if (ps->ps_going_away) {
1334 /* this segment is being turned off */
1335 lps = PAGING_SEGMENT_NULL;
1336 } else {
1337 ASSERT(ps->ps_clshift >= shift);
1338 if (ps->ps_clcount) {
1339 ps->ps_clcount--;
1340 dp_pages_free -= 1 << ps->ps_clshift;
1341 if(min_pages_trigger_port &&
1342 (dp_pages_free < minimum_pages_remaining)) {
1343 trigger = min_pages_trigger_port;
1344 min_pages_trigger_port = NULL;
1345 bs_low = TRUE;
1346 }
1347 lps = ps;
1348 } else
1349 lps = PAGING_SEGMENT_NULL;
1350 }
1351 PS_UNLOCK(ps);
1352 PSL_UNLOCK();
1353
1354 if (trigger != IP_NULL) {
1355 default_pager_space_alert(trigger, HI_WAT_ALERT);
1356 ipc_port_release_send(trigger);
1357 }
1358 return lps;
1359 }
1360
1361 if (paging_segment_count == 0) {
1362 PSL_UNLOCK();
1363 return PAGING_SEGMENT_NULL;
1364 }
1365
1366 for (i = BS_MAXPRI;
1367 i >= BS_MINPRI; i--) {
1368 int start_index;
1369
1370 if ((ps_select_array[i] == BS_NOPRI) ||
1371 (ps_select_array[i] == BS_FULLPRI))
1372 continue;
1373 start_index = ps_select_array[i];
1374
1375 if(!(paging_segments[start_index])) {
1376 j = start_index+1;
1377 physical_transfer_cluster_count = 0;
1378 }
1379 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1380 (((paging_segments[start_index])->ps_clshift)
1381 + vm_page_shift))) {
1382 physical_transfer_cluster_count = 0;
1383 j = start_index + 1;
1384 } else {
1385 physical_transfer_cluster_count+=1;
1386 j = start_index;
1387 if(start_index == 0)
1388 start_index = paging_segment_max;
1389 else
1390 start_index = start_index - 1;
1391 }
1392
1393 while (1) {
1394 if (j > paging_segment_max)
1395 j = 0;
1396 if ((ps = paging_segments[j]) &&
1397 (ps->ps_bs->bs_priority == i)) {
1398 /*
1399 * Force the ps cluster size to be
1400 * >= that of the vstruct.
1401 */
1402 PS_LOCK(ps);
1403 if (ps->ps_going_away) {
1404 /* this segment is being turned off */
1405 } else if ((ps->ps_clcount) &&
1406 (ps->ps_clshift >= shift)) {
1407 ipc_port_t trigger = IP_NULL;
1408
1409 ps->ps_clcount--;
1410 dp_pages_free -= 1 << ps->ps_clshift;
1411 if(min_pages_trigger_port &&
1412 (dp_pages_free <
1413 minimum_pages_remaining)) {
1414 trigger = min_pages_trigger_port;
1415 min_pages_trigger_port = NULL;
1416 }
1417 PS_UNLOCK(ps);
1418 /*
1419 * found one, quit looking.
1420 */
1421 ps_select_array[i] = j;
1422 PSL_UNLOCK();
1423
1424 if (trigger != IP_NULL) {
1425 default_pager_space_alert(
1426 trigger,
1427 HI_WAT_ALERT);
1428 ipc_port_release_send(trigger);
1429 }
1430 *psindex = j;
1431 return ps;
1432 }
1433 PS_UNLOCK(ps);
1434 }
1435 if (j == start_index) {
1436 /*
1437 * none at this priority -- mark it full
1438 */
1439 ps_select_array[i] = BS_FULLPRI;
1440 break;
1441 }
1442 j++;
1443 }
1444 }
1445 PSL_UNLOCK();
1446 return PAGING_SEGMENT_NULL;
1447 }
1448
1449 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1450
1451 vm_offset_t
1452 ps_allocate_cluster(
1453 vstruct_t vs,
1454 int *psindex,
1455 paging_segment_t use_ps)
1456 {
1457 unsigned int byte_num;
1458 int bit_num = 0;
1459 paging_segment_t ps;
1460 vm_offset_t cluster;
1461 ipc_port_t trigger = IP_NULL;
1462
1463 /*
1464 * Find best paging segment.
1465 * ps_select_segment will decrement cluster count on ps.
1466 * Must pass cluster shift to find the most appropriate segment.
1467 */
1468 /* NOTE: The addition of paging segment delete capability threatened
1469 * to seriously complicate the treatment of paging segments in this
1470 * module and the ones that call it (notably ps_clmap), because of the
1471 * difficulty in assuring that the paging segment would continue to
1472 * exist between being unlocked and locked. This was
1473 * avoided because all calls to this module are based either on
1474 * dp_memory_object calls, which rely on the vs lock, or on
1475 * the transfer function, which is part of the segment delete path.
1476 * The transfer function which is part of paging segment delete is
1477 * protected from multiple callers by the backing store lock.
1478 * The paging segment delete function treats mappings to a paging
1479 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1480 * while data is transferred to the remaining segments. This is in
1481 * line with the view that incomplete or in-transition mappings between
1482 * data, a vstruct, and backing store are protected by the vs lock.
1483 * This and the ordering of the paging segment "going_away" bit setting
1484 * protects us.
1485 */
1486 if (use_ps != PAGING_SEGMENT_NULL) {
1487 ps = use_ps;
1488 PSL_LOCK();
1489 PS_LOCK(ps);
1490
1491 ASSERT(ps->ps_clcount != 0);
1492
1493 ps->ps_clcount--;
1494 dp_pages_free -= 1 << ps->ps_clshift;
1495 if(min_pages_trigger_port &&
1496 (dp_pages_free < minimum_pages_remaining)) {
1497 trigger = min_pages_trigger_port;
1498 min_pages_trigger_port = NULL;
1499 }
1500 PSL_UNLOCK();
1501 PS_UNLOCK(ps);
1502 if (trigger != IP_NULL) {
1503 default_pager_space_alert(trigger, HI_WAT_ALERT);
1504 ipc_port_release_send(trigger);
1505 }
1506
1507 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1508 PAGING_SEGMENT_NULL) {
1509 static uint32_t lastnotify = 0;
1510 uint32_t now, nanoseconds_dummy;
1511
1512 /*
1513 * Emit a notification of the low-paging resource condition
1514 * but don't issue it more than once every five seconds. This
1515 * prevents us from overflowing logs with thousands of
1516 * repetitions of the message.
1517 */
1518 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1519 if (now > lastnotify + 5) {
1520 dprintf(("no space in available paging segments\n"));
1521 lastnotify = now;
1522 }
1523
1524 /* the count may have drifted; reset it to zero */
1525 PSL_LOCK();
1526 dp_pages_free = 0;
1527 if(min_pages_trigger_port) {
1528 trigger = min_pages_trigger_port;
1529 min_pages_trigger_port = NULL;
1530 bs_low = TRUE;
1531 }
1532 PSL_UNLOCK();
1533 if (trigger != IP_NULL) {
1534 default_pager_space_alert(trigger, HI_WAT_ALERT);
1535 ipc_port_release_send(trigger);
1536 }
1537 return (vm_offset_t) -1;
1538 }
1539
1540 /*
1541 * Look for an available cluster. At the end of the loop,
1542 * byte_num is the byte offset and bit_num is the bit offset of the
1543 * first zero bit in the paging segment bitmap.
1544 */
1545 PS_LOCK(ps);
1546 byte_num = ps->ps_hint;
1547 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1548 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1549 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1550 if (isclr((ps->ps_bmap + byte_num), bit_num))
1551 break;
1552 }
1553 ASSERT(bit_num != NBBY);
1554 break;
1555 }
1556 }
1557 ps->ps_hint = byte_num;
1558 cluster = (byte_num*NBBY) + bit_num;
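/*
 * Illustrative example: with NBBY == 8, byte_num == 3 and bit_num == 5
 * select bit 5 of bitmap byte 3, i.e. cluster (3 * 8) + 5 == 29.
 */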
1559
1560 /* Space was reserved, so this must be true */
1561 ASSERT(cluster < ps->ps_ncls);
1562
1563 setbit(ps->ps_bmap, cluster);
1564 PS_UNLOCK(ps);
1565
1566 return cluster;
1567 }
1568
1569 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1570
1571 void
1572 ps_deallocate_cluster(
1573 paging_segment_t ps,
1574 vm_offset_t cluster)
1575 {
1576
1577 if (cluster >= (vm_offset_t) ps->ps_ncls)
1578 panic("ps_deallocate_cluster: Invalid cluster number");
1579
1580 /*
1581 * Lock the paging segment, clear the cluster's bit in the bitmap and
1582 * increment the number of free clusters.
1583 */
1584 PSL_LOCK();
1585 PS_LOCK(ps);
1586 clrbit(ps->ps_bmap, cluster);
1587 ++ps->ps_clcount;
1588 dp_pages_free += 1 << ps->ps_clshift;
1589 PSL_UNLOCK();
1590
1591 /*
1592 * Move the hint down to the freed cluster if it is
1593 * less than the current hint.
1594 */
1595 if ((cluster/NBBY) < ps->ps_hint) {
1596 ps->ps_hint = (cluster/NBBY);
1597 }
1598
1599 PS_UNLOCK(ps);
1600
1601 /*
1602 * If we're freeing space on a full priority, reset the array.
1603 */
1604 PSL_LOCK();
1605 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1606 ps_select_array[ps->ps_bs->bs_priority] = 0;
1607 PSL_UNLOCK();
1608
1609 return;
1610 }
1611
1612 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1613
1614 void
1615 ps_dealloc_vsmap(
1616 struct vs_map *vsmap,
1617 vm_size_t size)
1618 {
1619 unsigned int i;
1620 for (i = 0; i < size; i++)
1621 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1622 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1623 VSM_CLOFF(vsmap[i]));
1624 }
1625
1626 void
1627 ps_vstruct_dealloc(
1628 vstruct_t vs)
1629 {
1630 unsigned int i;
1631 // spl_t s;
1632
1633 VS_MAP_LOCK(vs);
1634
1635 /*
1636 * If this is an indirect structure, then we walk through the valid
1637 * (non-zero) indirect pointers and deallocate the clusters
1638 * associated with each used map entry (via ps_dealloc_vsmap).
1639 * When all of the clusters in an indirect block have been
1640 * freed, we deallocate the block. When all of the indirect
1641 * blocks have been deallocated we deallocate the memory
1642 * holding the indirect pointers.
1643 */
1644 if (vs->vs_indirect) {
1645 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1646 if (vs->vs_imap[i] != NULL) {
1647 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1648 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1649 }
1650 }
1651 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1652 } else {
1653 /*
1654 * Direct map. Free used clusters, then memory.
1655 */
1656 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1657 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1658 }
1659 VS_MAP_UNLOCK(vs);
1660
1661 bs_commit(- vs->vs_size);
1662
1663 zfree(vstruct_zone, vs);
1664 }
1665
1666 int ps_map_extend(vstruct_t, unsigned int); /* forward */
1667
1668 int ps_map_extend(
1669 vstruct_t vs,
1670 unsigned int new_size)
1671 {
1672 struct vs_map **new_imap;
1673 struct vs_map *new_dmap = NULL;
1674 int newdsize;
1675 int i;
1676 void *old_map = NULL;
1677 int old_map_size = 0;
1678
1679 if (vs->vs_size >= new_size) {
1680 /*
1681 * Someone has already done the work.
1682 */
1683 return 0;
1684 }
1685
1686 /*
1687 * If the new size extends into the indirect range, then we have one
1688 * of two cases: we are going from indirect to indirect, or we are
1689 * going from direct to indirect. If we are going from indirect to
1690 * indirect, then it is possible that the new size will fit in the old
1691 * indirect map. If this is the case, then just reset the size of the
1692 * vstruct map and we are done. If the new size will not
1693 * fit into the old indirect map, then we have to allocate a new
1694 * indirect map and copy the old map pointers into this new map.
1695 *
1696 * If we are going from direct to indirect, then we have to allocate a
1697 * new indirect map and copy the old direct pages into the first
1698 * indirect page of the new map.
1699 * NOTE: allocating memory here is dangerous, as we're in the
1700 * pageout path.
1701 */
1702 if (INDIRECT_CLMAP(new_size)) {
1703 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1704
1705 /*
1706 * Get a new indirect map and zero it.
1707 */
1708 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1709 if (vs->vs_indirect &&
1710 (new_map_size == old_map_size)) {
1711 bs_commit(new_size - vs->vs_size);
1712 vs->vs_size = new_size;
1713 return 0;
1714 }
1715
1716 new_imap = (struct vs_map **)kalloc(new_map_size);
1717 if (new_imap == NULL) {
1718 return -1;
1719 }
1720 memset(new_imap, 0, new_map_size);
1721
1722 if (vs->vs_indirect) {
1723 /* Copy old entries into new map */
1724 memcpy(new_imap, vs->vs_imap, old_map_size);
1725 /* Arrange to free the old map */
1726 old_map = (void *) vs->vs_imap;
1727 newdsize = 0;
1728 } else { /* Old map was a direct map */
1729 /* Allocate an indirect page */
1730 if ((new_imap[0] = (struct vs_map *)
1731 kalloc(CLMAP_THRESHOLD)) == NULL) {
1732 kfree(new_imap, new_map_size);
1733 return -1;
1734 }
1735 new_dmap = new_imap[0];
1736 newdsize = CLMAP_ENTRIES;
1737 }
1738 } else {
1739 new_imap = NULL;
1740 newdsize = new_size;
1741 /*
1742 * If the new map is a direct map, then the old map must
1743 * also have been a direct map. All we have to do is
1744 * to allocate a new direct map, copy the old entries
1745 * into it and free the old map.
1746 */
1747 if ((new_dmap = (struct vs_map *)
1748 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1749 return -1;
1750 }
1751 }
1752 if (newdsize) {
1753
1754 /* Free the old map */
1755 old_map = (void *) vs->vs_dmap;
1756 old_map_size = CLMAP_SIZE(vs->vs_size);
1757
1758 /* Copy info from the old map into the new map */
1759 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1760
1761 /* Initialize the rest of the new map */
1762 for (i = vs->vs_size; i < newdsize; i++)
1763 VSM_CLR(new_dmap[i]);
1764 }
1765 if (new_imap) {
1766 vs->vs_imap = new_imap;
1767 vs->vs_indirect = TRUE;
1768 } else
1769 vs->vs_dmap = new_dmap;
1770 bs_commit(new_size - vs->vs_size);
1771 vs->vs_size = new_size;
1772 if (old_map)
1773 kfree(old_map, old_map_size);
1774 return 0;
1775 }
1776
1777 vm_offset_t
1778 ps_clmap(
1779 vstruct_t vs,
1780 vm_offset_t offset,
1781 struct clmap *clmap,
1782 int flag,
1783 vm_size_t size,
1784 int error)
1785 {
1786 vm_offset_t cluster; /* The cluster of offset. */
1787 vm_offset_t newcl; /* The new cluster allocated. */
1788 vm_offset_t newoff;
1789 unsigned int i;
1790 struct vs_map *vsmap;
1791
1792 VS_MAP_LOCK(vs);
1793
1794 ASSERT(vs->vs_dmap);
1795 cluster = atop_32(offset) >> vs->vs_clshift;
1796
1797 /*
1798 * Initialize cluster error value
1799 */
1800 clmap->cl_error = 0;
1801
1802 /*
1803 * If the object has grown, extend the page map.
1804 */
1805 if (cluster >= vs->vs_size) {
1806 if (flag == CL_FIND) {
1807 /* Do not allocate if just doing a lookup */
1808 VS_MAP_UNLOCK(vs);
1809 return (vm_offset_t) -1;
1810 }
1811 if (ps_map_extend(vs, cluster + 1)) {
1812 VS_MAP_UNLOCK(vs);
1813 return (vm_offset_t) -1;
1814 }
1815 }
1816
1817 /*
1818 * Look for the desired cluster. If the map is indirect, then we
1819 * have a two level lookup. First find the indirect block, then
1820 * find the actual cluster. If the indirect block has not yet
1821 * been allocated, then do so. If the cluster has not yet been
1822 * allocated, then do so.
1823 *
1824 * If any of the allocations fail, then return an error.
1825 * Don't allocate if just doing a lookup.
1826 */
1827 if (vs->vs_indirect) {
1828 long ind_block = cluster/CLMAP_ENTRIES;
1829
1830 /* Is the indirect block allocated? */
1831 vsmap = vs->vs_imap[ind_block];
1832 if (vsmap == NULL) {
1833 if (flag == CL_FIND) {
1834 VS_MAP_UNLOCK(vs);
1835 return (vm_offset_t) -1;
1836 }
1837
1838 /* Allocate the indirect block */
1839 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1840 if (vsmap == NULL) {
1841 VS_MAP_UNLOCK(vs);
1842 return (vm_offset_t) -1;
1843 }
1844 /* Initialize the cluster offsets */
1845 for (i = 0; i < CLMAP_ENTRIES; i++)
1846 VSM_CLR(vsmap[i]);
1847 vs->vs_imap[ind_block] = vsmap;
1848 }
1849 } else
1850 vsmap = vs->vs_dmap;
1851
1852 ASSERT(vsmap);
1853 vsmap += cluster%CLMAP_ENTRIES;
1854
1855 /*
1856 * At this point, vsmap points to the struct vs_map desired.
1857 *
1858 * Look in the map for the cluster; if there was an error on a
1859 * previous write, flag it and return. If it is not yet
1860 * allocated, then allocate it, if we're writing; if we're
1861 * doing a lookup and the cluster's not allocated, return error.
1862 */
1863 if (VSM_ISERR(*vsmap)) {
1864 clmap->cl_error = VSM_GETERR(*vsmap);
1865 VS_MAP_UNLOCK(vs);
1866 return (vm_offset_t) -1;
1867 } else if (VSM_ISCLR(*vsmap)) {
1868 int psindex;
1869
1870 if (flag == CL_FIND) {
1871 /*
1872 * If there's an error and the entry is clear, then
1873 * we've run out of swap space. Record the error
1874 * here and return.
1875 */
1876 if (error) {
1877 VSM_SETERR(*vsmap, error);
1878 }
1879 VS_MAP_UNLOCK(vs);
1880 return (vm_offset_t) -1;
1881 } else {
1882 /*
1883 * Attempt to allocate a cluster from the paging segment
1884 */
1885 newcl = ps_allocate_cluster(vs, &psindex,
1886 PAGING_SEGMENT_NULL);
1887 if (newcl == (vm_offset_t) -1) {
1888 VS_MAP_UNLOCK(vs);
1889 return (vm_offset_t) -1;
1890 }
1891 VSM_CLR(*vsmap);
1892 VSM_SETCLOFF(*vsmap, newcl);
1893 VSM_SETPS(*vsmap, psindex);
1894 }
1895 } else
1896 newcl = VSM_CLOFF(*vsmap);
1897
1898 /*
1899 * Fill in pertinent fields of the clmap
1900 */
1901 clmap->cl_ps = VSM_PS(*vsmap);
1902 clmap->cl_numpages = VSCLSIZE(vs);
1903 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1904
1905 /*
1906 * Byte offset in paging segment is byte offset to cluster plus
1907 * byte offset within cluster. It looks ugly, but should be
1908 * relatively quick.
1909 */
1910 ASSERT(trunc_page(offset) == offset);
1911 newcl = ptoa_32(newcl) << vs->vs_clshift;
1912 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
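/*
 * Worked example (illustrative, assuming 4K pages and a cluster
 * shift of 2): the cluster covers 1 << (12 + 2) == 16384 bytes,
 * so for offset 0x5000, newoff == 0x5000 & 0x3fff == 0x1000,
 * i.e. the second page within the cluster; newcl is the byte
 * offset of the cluster's start in the paging segment.
 */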
1913 if (flag == CL_ALLOC) {
1914 /*
1915 * set bits in the allocation bitmap according to which
1916 * pages were requested. size is in bytes.
1917 */
1918 i = atop_32(newoff);
1919 while ((size > 0) && (i < VSCLSIZE(vs))) {
1920 VSM_SETALLOC(*vsmap, i);
1921 i++;
1922 size -= vm_page_size;
1923 }
1924 }
1925 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1926 if (newoff) {
1927 /*
1928 * Offset is not cluster aligned, so number of pages
1929 * and bitmaps must be adjusted
1930 */
1931 clmap->cl_numpages -= atop_32(newoff);
1932 CLMAP_SHIFT(clmap, vs);
1933 CLMAP_SHIFTALLOC(clmap, vs);
1934 }
1935
1936 /*
1937 *
1938 * The setting of valid bits and handling of write errors
1939 * must be done here, while we hold the lock on the map.
1940 * It logically should be done in ps_vs_write_complete().
1941 * The size and error information has been passed from
1942 * ps_vs_write_complete(). If the size parameter is non-zero,
1943 * then there is work to be done. If error is also non-zero,
1944 * then the error number is recorded in the cluster and the
1945 * entire cluster is in error.
1946 */
1947 if (size && flag == CL_FIND) {
1948 vm_offset_t off = (vm_offset_t) 0;
1949
1950 if (!error) {
1951 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1952 i++) {
1953 VSM_SETPG(*vsmap, i);
1954 size -= vm_page_size;
1955 }
1956 ASSERT(i <= VSCLSIZE(vs));
1957 } else {
1958 BS_STAT(clmap->cl_ps->ps_bs,
1959 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1960 atop_32(size));
1961 off = VSM_CLOFF(*vsmap);
1962 VSM_SETERR(*vsmap, error);
1963 }
1964 /*
1965 * Deallocate cluster if error, and no valid pages
1966 * already present.
1967 */
1968 if (off != (vm_offset_t) 0)
1969 ps_deallocate_cluster(clmap->cl_ps, off);
1970 VS_MAP_UNLOCK(vs);
1971 return (vm_offset_t) 0;
1972 } else
1973 VS_MAP_UNLOCK(vs);
1974
1975 DP_DEBUG(DEBUG_VS_INTERNAL,
1976 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1977 newcl+newoff, (int) vs, (int) vsmap, flag));
1978 DP_DEBUG(DEBUG_VS_INTERNAL,
1979 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1980 (int) clmap->cl_ps, clmap->cl_numpages,
1981 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1982
1983 return (newcl + newoff);
1984 }
1985
1986 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1987
1988 void
1989 ps_clunmap(
1990 vstruct_t vs,
1991 vm_offset_t offset,
1992 vm_size_t length)
1993 {
1994 vm_offset_t cluster; /* The cluster number of offset */
1995 struct vs_map *vsmap;
1996
1997 VS_MAP_LOCK(vs);
1998
1999 /*
2000 * Loop through all clusters in this range, freeing paging segment
2001 * clusters and map entries as encountered.
2002 */
2003 while (length > 0) {
2004 vm_offset_t newoff;
2005 unsigned int i;
2006
2007 cluster = atop_32(offset) >> vs->vs_clshift;
2008 if (vs->vs_indirect) /* indirect map */
2009 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2010 else
2011 vsmap = vs->vs_dmap;
2012 if (vsmap == NULL) {
2013 VS_MAP_UNLOCK(vs);
2014 return;
2015 }
2016 vsmap += cluster%CLMAP_ENTRIES;
2017 if (VSM_ISCLR(*vsmap)) {
2018 length -= vm_page_size;
2019 offset += vm_page_size;
2020 continue;
2021 }
2022 /*
2023 * We've got a valid mapping. Clear it and deallocate
2024 * paging segment cluster pages.
2025 * Optimize for entire cluster clearing.
2026 */
2027 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2028 /*
2029 * Not cluster aligned.
2030 */
2031 ASSERT(trunc_page(newoff) == newoff);
2032 i = atop_32(newoff);
2033 } else
2034 i = 0;
2035 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2036 VSM_CLRPG(*vsmap, i);
2037 VSM_CLRALLOC(*vsmap, i);
2038 length -= vm_page_size;
2039 offset += vm_page_size;
2040 i++;
2041 }
2042
2043 /*
2044 * If map entry is empty, clear and deallocate cluster.
2045 */
2046 if (!VSM_ALLOC(*vsmap)) {
2047 ps_deallocate_cluster(VSM_PS(*vsmap),
2048 VSM_CLOFF(*vsmap));
2049 VSM_CLR(*vsmap);
2050 }
2051 }
2052
2053 VS_MAP_UNLOCK(vs);
2054 }
2055
2056 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2057
2058 void
2059 ps_vs_write_complete(
2060 vstruct_t vs,
2061 vm_offset_t offset,
2062 vm_size_t size,
2063 int error)
2064 {
2065 struct clmap clmap;
2066
2067 /*
2068 * Get the struct vsmap for this cluster.
2069 * Use CL_FIND, even though the data was written, because the
2070 * cluster MUST be present, unless there was an error
2071 * in the original ps_clmap (e.g. no space), in which
2072 * case, nothing happens.
2073 *
2074 * Must pass enough information to ps_clmap to allow it
2075 * to set the vs_map structure bitmap under lock.
2076 */
2077 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2078 }
2079
2080 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2081
2082 void
2083 vs_cl_write_complete(
2084 vstruct_t vs,
2085 __unused paging_segment_t ps,
2086 vm_offset_t offset,
2087 __unused vm_offset_t addr,
2088 vm_size_t size,
2089 boolean_t async,
2090 int error)
2091 {
2092 // kern_return_t kr;
2093
2094 if (error) {
2095 /*
2096 * For internal objects, the error is recorded on a
2097 * per-cluster basis by ps_clmap() which is called
2098 * by ps_vs_write_complete() below.
2099 */
2100 dprintf(("write failed error = 0x%x\n", error));
2101 /* add upl_abort code here */
2102 } else
2103 GSTAT(global_stats.gs_pages_out += atop_32(size));
2104 /*
2105 * Notify the vstruct mapping code, so it can do its accounting.
2106 */
2107 ps_vs_write_complete(vs, offset, size, error);
2108
2109 if (async) {
2110 VS_LOCK(vs);
2111 ASSERT(vs->vs_async_pending > 0);
2112 vs->vs_async_pending -= size;
2113 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2114 vs->vs_waiting_async = FALSE;
2115 VS_UNLOCK(vs);
2116 /* mutex_unlock(&vs->vs_waiting_async); */
2117 thread_wakeup(&vs->vs_async_pending);
2118 } else {
2119 VS_UNLOCK(vs);
2120 }
2121 }
2122 }
2123
2124 #ifdef DEVICE_PAGING
2125 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2126
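/*
 * Completion handler for an asynchronous device write.  A short write
 * is treated as a failure; the status is recorded in the vs_async and
 * completion is routed either to the transfer path or to the normal
 * cluster-write completion path.
 */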
2127 kern_return_t
2128 device_write_reply(
2129 MACH_PORT_FACE reply_port,
2130 kern_return_t device_code,
2131 io_buf_len_t bytes_written)
2132 {
2133 struct vs_async *vsa;
2134
2135 vsa = (struct vs_async *)
2136 ((struct vstruct_alias *)(reply_port->alias))->vs;
2137
2138 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2139 device_code = KERN_FAILURE;
2140 }
2141
2142 vsa->vsa_error = device_code;
2143
2144
2145 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2146 if(vsa->vsa_flags & VSA_TRANSFER) {
2147 /* revisit when async disk segments redone */
2148 if(vsa->vsa_error) {
2149 /* need to consider error condition. re-write data or */
2150 /* throw it away here. */
2151 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2152 }
2153 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2154 vsa->vsa_size, vsa->vsa_error);
2155 } else {
2156 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2157 vsa->vsa_addr, vsa->vsa_size, TRUE,
2158 vsa->vsa_error);
2159 }
2160 VS_FREE_ASYNC(vsa);
2161
2162 return KERN_SUCCESS;
2163 }
2164
2165 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2166 kern_return_t
2167 device_write_reply_inband(
2168 MACH_PORT_FACE reply_port,
2169 kern_return_t return_code,
2170 io_buf_len_t bytes_written)
2171 {
2172 panic("device_write_reply_inband: illegal");
2173 return KERN_SUCCESS;
2174 }
2175
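/*
 * Completion handler for an asynchronous device read: stash the
 * returned buffer, size and status in the vs_async and wake the
 * thread sleeping in ps_read_device().
 */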
2176 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2177 kern_return_t
2178 device_read_reply(
2179 MACH_PORT_FACE reply_port,
2180 kern_return_t return_code,
2181 io_buf_ptr_t data,
2182 mach_msg_type_number_t dataCnt)
2183 {
2184 struct vs_async *vsa;
2185 vsa = (struct vs_async *)
2186 ((struct vstruct_alias *)(reply_port->alias))->vs;
2187 vsa->vsa_addr = (vm_offset_t)data;
2188 vsa->vsa_size = (vm_size_t)dataCnt;
2189 vsa->vsa_error = return_code;
2190 thread_wakeup(&vsa->vsa_lock);
2191 return KERN_SUCCESS;
2192 }
2193
2194 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2195 kern_return_t
2196 device_read_reply_inband(
2197 MACH_PORT_FACE reply_port,
2198 kern_return_t return_code,
2199 io_buf_ptr_inband_t data,
2200 mach_msg_type_number_t dataCnt)
2201 {
2202 panic("device_read_reply_inband: illegal");
2203 return KERN_SUCCESS;
2204 }
2205
2206 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2207 kern_return_t
2208 device_read_reply_overwrite(
2209 MACH_PORT_FACE reply_port,
2210 kern_return_t return_code,
2211 io_buf_len_t bytes_read)
2212 {
2213 panic("device_read_reply_overwrite: illegal\n");
2214 return KERN_SUCCESS;
2215 }
2216
2217 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2218 kern_return_t
2219 device_open_reply(
2220 MACH_PORT_FACE reply_port,
2221 kern_return_t return_code,
2222 MACH_PORT_FACE device_port)
2223 {
2224 panic("device_open_reply: illegal\n");
2225 return KERN_SUCCESS;
2226 }
2227
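/*
 * ps_read_device: read `size' bytes from the paging device, starting
 * at the device record that backs `offset'.  Short reads are continued
 * into a bounce buffer obtained from get_read_buffer(); the data is
 * returned through *bufferp as a page-list copy object and the number
 * of bytes that could not be read is returned in *residualp.
 */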
2228 kern_return_t
2229 ps_read_device(
2230 paging_segment_t ps,
2231 vm_offset_t offset,
2232 vm_offset_t *bufferp,
2233 unsigned int size,
2234 unsigned int *residualp,
2235 int flags)
2236 {
2237 kern_return_t kr;
2238 recnum_t dev_offset;
2239 unsigned int bytes_wanted;
2240 unsigned int bytes_read;
2241 unsigned int total_read;
2242 vm_offset_t dev_buffer;
2243 vm_offset_t buf_ptr;
2244 unsigned int records_read;
2245 struct vs_async *vsa;
2246 mutex_t vs_waiting_read_reply;
2247
2248 device_t device;
2249 vm_map_copy_t device_data = NULL;
2250 default_pager_thread_t *dpt = NULL;
2251
2252 device = dev_port_lookup(ps->ps_device);
2253 clustered_reads[atop_32(size)]++;
2254
2255 dev_offset = (ps->ps_offset +
2256 (offset >> (vm_page_shift - ps->ps_record_shift)));
2257 bytes_wanted = size;
2258 total_read = 0;
2259 *bufferp = (vm_offset_t)NULL;
2260
2261 do {
2262 vsa = VS_ALLOC_ASYNC();
2263 if (vsa) {
2264 vsa->vsa_vs = NULL;
2265 vsa->vsa_addr = 0;
2266 vsa->vsa_offset = 0;
2267 vsa->vsa_size = 0;
2268 vsa->vsa_ps = NULL;
2269 }
2270 mutex_init(&vsa->vsa_lock, 0);
2271 ip_lock(vsa->reply_port);
2272 vsa->reply_port->ip_sorights++;
2273 ip_reference(vsa->reply_port);
2274 ip_unlock(vsa->reply_port);
2275 kr = ds_device_read_common(device,
2276 vsa->reply_port,
2277 (mach_msg_type_name_t)
2278 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2279 (dev_mode_t) 0,
2280 dev_offset,
2281 bytes_wanted,
2282 (IO_READ | IO_CALL),
2283 (io_buf_ptr_t *) &dev_buffer,
2284 (mach_msg_type_number_t *) &bytes_read);
2285 if(kr == MIG_NO_REPLY) {
2286 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2287 thread_block(THREAD_CONTINUE_NULL);
2288
2289 dev_buffer = vsa->vsa_addr;
2290 bytes_read = (unsigned int)vsa->vsa_size;
2291 kr = vsa->vsa_error;
2292 }
2293 VS_FREE_ASYNC(vsa);
2294 if (kr != KERN_SUCCESS || bytes_read == 0) {
2295 break;
2296 }
2297 total_read += bytes_read;
2298
2299 /*
2300 * If we got the entire range, use the returned dev_buffer.
2301 */
2302 if (bytes_read == size) {
2303 *bufferp = (vm_offset_t)dev_buffer;
2304 break;
2305 }
2306
2307 #if 1
2308 dprintf(("read only %d bytes out of %d\n",
2309 bytes_read, bytes_wanted));
2310 #endif
2311 if(dpt == NULL) {
2312 dpt = get_read_buffer();
2313 buf_ptr = dpt->dpt_buffer;
2314 *bufferp = (vm_offset_t)buf_ptr;
2315 }
2316 /*
2317 * Otherwise, copy the data into the provided buffer (*bufferp)
2318 * and append the rest of the range as it comes in.
2319 */
2320 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2321 buf_ptr += bytes_read;
2322 bytes_wanted -= bytes_read;
2323 records_read = (bytes_read >>
2324 (vm_page_shift - ps->ps_record_shift));
2325 dev_offset += records_read;
2326 DP_DEBUG(DEBUG_VS_INTERNAL,
2327 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2328 dev_buffer, bytes_read));
2329 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2330 != KERN_SUCCESS)
2331 Panic("dealloc buf");
2332 } while (bytes_wanted);
2333
2334 *residualp = size - total_read;
2335 if((dev_buffer != *bufferp) && (total_read != 0)) {
2336 vm_offset_t temp_buffer;
2337 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2338 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2339 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2340 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2341 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2342 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2343 (vm_map_copy_t *)&device_data, FALSE))
2344 panic("ps_read_device: cannot copyin locally provided buffer\n");
2345 }
2346 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2347 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2348 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2349 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2350 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2351 (vm_map_copy_t *)&device_data, FALSE))
2352 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2353 }
2354 else {
2355 device_data = NULL;
2356 }
2357 *bufferp = (vm_offset_t)device_data;
2358
2359 if(dpt != NULL) {
2360 /* Free the receive buffer */
2361 dpt->checked_out = 0;
2362 thread_wakeup(&dpt_array);
2363 }
2364 return KERN_SUCCESS;
2365 }
2366
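/*
 * ps_write_device: write `size' bytes at `addr' to the paging device.
 * If a vs_async is supplied, the write is issued asynchronously and
 * completion arrives via device_write_reply(); otherwise the write is
 * performed synchronously, looping until the entire range is written.
 */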
2367 kern_return_t
2368 ps_write_device(
2369 paging_segment_t ps,
2370 vm_offset_t offset,
2371 vm_offset_t addr,
2372 unsigned int size,
2373 struct vs_async *vsa)
2374 {
2375 recnum_t dev_offset;
2376 io_buf_len_t bytes_to_write, bytes_written;
2377 recnum_t records_written;
2378 kern_return_t kr;
2379 MACH_PORT_FACE reply_port;
2380
2381
2382
2383 clustered_writes[atop_32(size)]++;
2384
2385 dev_offset = (ps->ps_offset +
2386 (offset >> (vm_page_shift - ps->ps_record_shift)));
2387 bytes_to_write = size;
2388
2389 if (vsa) {
2390 /*
2391 * Asynchronous write.
2392 */
2393 reply_port = vsa->reply_port;
2394 ip_lock(reply_port);
2395 reply_port->ip_sorights++;
2396 ip_reference(reply_port);
2397 ip_unlock(reply_port);
2398 {
2399 device_t device;
2400 device = dev_port_lookup(ps->ps_device);
2401
2402 vsa->vsa_addr = addr;
2403 kr=ds_device_write_common(device,
2404 reply_port,
2405 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2406 (dev_mode_t) 0,
2407 dev_offset,
2408 (io_buf_ptr_t) addr,
2409 size,
2410 (IO_WRITE | IO_CALL),
2411 &bytes_written);
2412 }
2413 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2414 if (verbose)
2415 dprintf(("%s0x%x, addr=0x%x,"
2416 "size=0x%x,offset=0x%x\n",
2417 "device_write_request returned ",
2418 kr, addr, size, offset));
2419 BS_STAT(ps->ps_bs,
2420 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2421 /* do the completion notification to free resources */
2422 device_write_reply(reply_port, kr, 0);
2423 return PAGER_ERROR;
2424 }
2425 } else do {
2426 /*
2427 * Synchronous write.
2428 */
2429 {
2430 device_t device;
2431 device = dev_port_lookup(ps->ps_device);
2432 kr=ds_device_write_common(device,
2433 IP_NULL, 0,
2434 (dev_mode_t) 0,
2435 dev_offset,
2436 (io_buf_ptr_t) addr,
2437 size,
2438 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2439 &bytes_written);
2440 }
2441 if (kr != KERN_SUCCESS) {
2442 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2443 "device_write returned ",
2444 kr, addr, size, offset));
2445 BS_STAT(ps->ps_bs,
2446 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2447 return PAGER_ERROR;
2448 }
2449 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2450 Panic("fragmented write");
2451 records_written = (bytes_written >>
2452 (vm_page_shift - ps->ps_record_shift));
2453 dev_offset += records_written;
2454 #if 1
2455 if (bytes_written != bytes_to_write) {
2456 dprintf(("wrote only %d bytes out of %d\n",
2457 bytes_written, bytes_to_write));
2458 }
2459 #endif
2460 bytes_to_write -= bytes_written;
2461 addr += bytes_written;
2462 } while (bytes_to_write > 0);
2463
2464 return PAGER_SUCCESS;
2465 }
2466
2467
2468 #else /* !DEVICE_PAGING */
2469
2470 kern_return_t
2471 ps_read_device(
2472 __unused paging_segment_t ps,
2473 __unused vm_offset_t offset,
2474 __unused vm_offset_t *bufferp,
2475 __unused unsigned int size,
2476 __unused unsigned int *residualp,
2477 __unused int flags)
2478 {
2479 panic("ps_read_device not supported");
2480 return KERN_FAILURE;
2481 }
2482
2483 kern_return_t
2484 ps_write_device(
2485 __unused paging_segment_t ps,
2486 __unused vm_offset_t offset,
2487 __unused vm_offset_t addr,
2488 __unused unsigned int size,
2489 __unused struct vs_async *vsa)
2490 {
2491 panic("ps_write_device not supported");
2492 return KERN_FAILURE;
2493 }
2494
2495 #endif /* DEVICE_PAGING */
2496 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
2497
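/*
 * Accounting hook called once page-in data has been supplied to the
 * VM.  With USE_PRECIOUS the backing store for the range is also
 * released here via ps_clunmap().
 */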
2498 void
2499 pvs_object_data_provided(
2500 __unused vstruct_t vs,
2501 __unused upl_t upl,
2502 __unused upl_offset_t offset,
2503 upl_size_t size)
2504 {
2505
2506 DP_DEBUG(DEBUG_VS_INTERNAL,
2507 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2508 upl, offset, size));
2509
2510 ASSERT(size > 0);
2511 GSTAT(global_stats.gs_pages_in += atop_32(size));
2512
2513
2514 #if USE_PRECIOUS
2515 ps_clunmap(vs, offset, size);
2516 #endif /* USE_PRECIOUS */
2517
2518 }
2519
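/*
 * pvs_cluster_read: satisfy a page-in request of `cnt' bytes starting
 * at `vs_offset'.  Allocated, physically contiguous runs are read with
 * ps_read_file(); holes (unallocated clusters or pages) are aborted
 * back to the VM as unavailable.
 */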
2520 kern_return_t
2521 pvs_cluster_read(
2522 vstruct_t vs,
2523 vm_offset_t vs_offset,
2524 vm_size_t cnt)
2525 {
2526 upl_t upl;
2527 kern_return_t error = KERN_SUCCESS;
2528 int size;
2529 unsigned int residual;
2530 unsigned int request_flags;
2531 int seg_index;
2532 int pages_in_cl;
2533 int cl_size;
2534 int cl_mask;
2535 int cl_index;
2536 int xfer_size;
2537 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2538 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2539 struct clmap clmap;
2540
2541 pages_in_cl = 1 << vs->vs_clshift;
2542 cl_size = pages_in_cl * vm_page_size;
2543 cl_mask = cl_size - 1;
2544
2545 /*
2546 * This loop will be executed multiple times until the entire
2547 * request has been satisfied... if the request spans cluster
2548 * boundaries, the clusters will be checked for logical continuity;
2549 * if contiguous, the I/O request will span multiple clusters, otherwise
2550 * it will be broken up into the minimal set of I/Os
2551 *
2552 * If there are holes in a request (either unallocated pages in a paging
2553 * segment or an unallocated paging segment), we stop
2554 * reading at the hole, inform the VM of any data read, inform
2555 * the VM of an unavailable range, then loop again, hoping to
2556 * find valid pages later in the requested range. This continues until
2557 * the entire range has been examined, and read, if present.
2558 */
2559
2560 #if USE_PRECIOUS
2561 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
2562 #else
2563 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
2564 #endif
2565
2566 assert(dp_encryption_inited);
2567 if (dp_encryption) {
2568 /*
2569 * ENCRYPTED SWAP:
2570 * request that the UPL be prepared for
2571 * decryption.
2572 */
2573 request_flags |= UPL_ENCRYPT;
2574 }
2575
2576 while (cnt && (error == KERN_SUCCESS)) {
2577 int ps_info_valid;
2578 unsigned int page_list_count;
2579
2580 if((vs_offset & cl_mask) &&
2581 (cnt > (VM_SUPER_CLUSTER -
2582 (vs_offset & cl_mask)))) {
2583 size = VM_SUPER_CLUSTER;
2584 size -= vs_offset & cl_mask;
2585 } else if (cnt > VM_SUPER_CLUSTER) {
2586 size = VM_SUPER_CLUSTER;
2587 } else {
2588 size = cnt;
2589 }
2590 cnt -= size;
2591
2592 ps_info_valid = 0;
2593 seg_index = 0;
2594
2595 while (size > 0 && error == KERN_SUCCESS) {
2596 int abort_size;
2597 int failed_size;
2598 int beg_pseg;
2599 int beg_indx;
2600 vm_offset_t cur_offset;
2601
2602
2603 if ( !ps_info_valid) {
2604 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2605 psp[seg_index] = CLMAP_PS(clmap);
2606 ps_info_valid = 1;
2607 }
2608 /*
2609 * skip over unallocated physical segments
2610 */
2611 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2612 abort_size = cl_size - (vs_offset & cl_mask);
2613 abort_size = MIN(abort_size, size);
2614
2615 page_list_count = 0;
2616 memory_object_super_upl_request(
2617 vs->vs_control,
2618 (memory_object_offset_t)vs_offset,
2619 abort_size, abort_size,
2620 &upl, NULL, &page_list_count,
2621 request_flags);
2622
2623 if (clmap.cl_error) {
2624 upl_abort(upl, UPL_ABORT_ERROR);
2625 } else {
2626 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2627 }
2628 upl_deallocate(upl);
2629
2630 size -= abort_size;
2631 vs_offset += abort_size;
2632
2633 seg_index++;
2634 ps_info_valid = 0;
2635 continue;
2636 }
2637 cl_index = (vs_offset & cl_mask) / vm_page_size;
2638
2639 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2640 /*
2641 * skip over unallocated pages
2642 */
2643 if (CLMAP_ISSET(clmap, cl_index))
2644 break;
2645 abort_size += vm_page_size;
2646 }
2647 if (abort_size) {
2648 /*
2649 * Let VM system know about holes in clusters.
2650 */
2651 GSTAT(global_stats.gs_pages_unavail += atop_32(abort_size));
2652
2653 page_list_count = 0;
2654 memory_object_super_upl_request(
2655 vs->vs_control,
2656 (memory_object_offset_t)vs_offset,
2657 abort_size, abort_size,
2658 &upl, NULL, &page_list_count,
2659 request_flags);
2660
2661 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2662 upl_deallocate(upl);
2663
2664 size -= abort_size;
2665 vs_offset += abort_size;
2666
2667 if (cl_index == pages_in_cl) {
2668 /*
2669 * if we're at the end of this physical cluster
2670 * then bump to the next one and continue looking
2671 */
2672 seg_index++;
2673 ps_info_valid = 0;
2674 continue;
2675 }
2676 if (size == 0)
2677 break;
2678 }
2679 /*
2680 * remember the starting point of the first allocated page
2681 * for the I/O we're about to issue
2682 */
2683 beg_pseg = seg_index;
2684 beg_indx = cl_index;
2685 cur_offset = vs_offset;
2686
2687 /*
2688 * calculate the size of the I/O that we can do...
2689 * this may span multiple physical segments if
2690 * they are contiguous
2691 */
2692 for (xfer_size = 0; xfer_size < size; ) {
2693
2694 while (cl_index < pages_in_cl
2695 && xfer_size < size) {
2696 /*
2697 * accumulate allocated pages within
2698 * a physical segment
2699 */
2700 if (CLMAP_ISSET(clmap, cl_index)) {
2701 xfer_size += vm_page_size;
2702 cur_offset += vm_page_size;
2703 cl_index++;
2704
2705 BS_STAT(psp[seg_index]->ps_bs,
2706 psp[seg_index]->ps_bs->bs_pages_in++);
2707 } else
2708 break;
2709 }
2710 if (cl_index < pages_in_cl
2711 || xfer_size >= size) {
2712 /*
2713 * we've hit an unallocated page or
2714 * the end of this request... go fire
2715 * the I/O
2716 */
2717 break;
2718 }
2719 /*
2720 * we've hit the end of the current physical
2721 * segment and there's more to do, so try
2722 * moving to the next one
2723 */
2724 seg_index++;
2725
2726 ps_offset[seg_index] =
2727 ps_clmap(vs,
2728 cur_offset & ~cl_mask,
2729 &clmap, CL_FIND, 0, 0);
2730 psp[seg_index] = CLMAP_PS(clmap);
2731 ps_info_valid = 1;
2732
2733 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2734 /*
2735 * if the physical segment we're about
2736 * to step into is not contiguous to
2737 * the one we're currently in, or it's
2738 * in a different paging file, or
2739 * it hasn't been allocated....
2740 * we stop here and generate the I/O
2741 */
2742 break;
2743 }
2744 /*
2745 * start with first page of the next physical
2746 * segment
2747 */
2748 cl_index = 0;
2749 }
2750 if (xfer_size) {
2751 /*
2752 * we have a contiguous range of allocated pages
2753 * to read from
2754 */
2755 page_list_count = 0;
2756 memory_object_super_upl_request(vs->vs_control,
2757 (memory_object_offset_t)vs_offset,
2758 xfer_size, xfer_size,
2759 &upl, NULL, &page_list_count,
2760 request_flags | UPL_SET_INTERNAL);
2761
2762 error = ps_read_file(psp[beg_pseg],
2763 upl, (upl_offset_t) 0,
2764 ps_offset[beg_pseg] +
2765 (beg_indx * vm_page_size),
2766 xfer_size, &residual, 0);
2767 } else
2768 continue;
2769
2770 failed_size = 0;
2771
2772 /*
2773 * Adjust counts and send response to VM. Optimize
2774 * for the common case, i.e. no error and/or partial
2775 * data. If there was an error, then we need to error
2776 * the entire range, even if some data was successfully
2777 * read. If there was a partial read we may supply some
2778 * data and may error some as well. In all cases the
2779 * VM must receive some notification for every page
2780 * in the range.
2781 */
2782 if ((error == KERN_SUCCESS) && (residual == 0)) {
2783 /*
2784 * Got everything we asked for, supply the data
2785 * to the VM. Note that as a side effect of
2786 * supplying the data, the buffer holding the
2787 * supplied data is deallocated from the pager's
2788 * address space.
2789 */
2790 pvs_object_data_provided(
2791 vs, upl, vs_offset, xfer_size);
2792 } else {
2793 failed_size = xfer_size;
2794
2795 if (error == KERN_SUCCESS) {
2796 if ((signed) residual == xfer_size) {
2797 /*
2798 * If a read operation returns no error
2799 * and no data moved, we turn it into
2800 * an error, assuming we're reading at
2801 * or beyond EOF.
2802 * Fall through and error the entire
2803 * range.
2804 */
2805 error = KERN_FAILURE;
2806 } else {
2807 /*
2808 * Otherwise, we have a partial read. If
2809 * the part read is an integral number
2810 * of pages, supply it. Otherwise round
2811 * it up to a page boundary, zero fill
2812 * the unread part, and supply it.
2813 * Fall through and error the remainder
2814 * of the range, if any.
2815 */
2816 int fill, lsize;
2817
2818 fill = residual
2819 & ~vm_page_size;
2820 lsize = (xfer_size - residual)
2821 + fill;
2822 pvs_object_data_provided(
2823 vs, upl,
2824 vs_offset, lsize);
2825
2826 if (lsize < xfer_size) {
2827 failed_size =
2828 xfer_size - lsize;
2829 error = KERN_FAILURE;
2830 }
2831 }
2832 }
2833 }
2834 /*
2835 * If there was an error in any part of the range, tell
2836 * the VM. Note that error is explicitly checked again
2837 * since it can be modified above.
2838 */
2839 if (error != KERN_SUCCESS) {
2840 BS_STAT(psp[beg_pseg]->ps_bs,
2841 psp[beg_pseg]->ps_bs->bs_pages_in_fail
2842 += atop_32(failed_size));
2843 }
2844 size -= xfer_size;
2845 vs_offset += xfer_size;
2846 }
2847
2848 } /* END while (cnt && (error == 0)) */
2849 return error;
2850 }
2851
2852 int vs_do_async_write = 1;
2853
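/*
 * vs_cluster_write: page out `cnt' bytes starting at `offset'.  In the
 * normal (!dp_internal) case a super-cluster UPL of dirty/precious
 * pages is requested from the VM, contiguous runs whose backing
 * clusters are also contiguous are written with ps_write_file(), and
 * clean-but-present pages are simply committed.  In the dp_internal
 * case the caller-supplied UPL is written out cluster by cluster.
 */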
2854 kern_return_t
2855 vs_cluster_write(
2856 vstruct_t vs,
2857 upl_t internal_upl,
2858 upl_offset_t offset,
2859 upl_size_t cnt,
2860 boolean_t dp_internal,
2861 int flags)
2862 {
2863 upl_size_t transfer_size;
2864 int error = 0;
2865 struct clmap clmap;
2866
2867 vm_offset_t actual_offset; /* Offset within paging segment */
2868 paging_segment_t ps;
2869 vm_offset_t mobj_base_addr;
2870 vm_offset_t mobj_target_addr;
2871
2872 upl_t upl;
2873 upl_page_info_t *pl;
2874 int page_index;
2875 int list_size;
2876 int pages_in_cl;
2877 unsigned int cl_size;
2878 int base_index;
2879 unsigned int seg_size;
2880
2881 pages_in_cl = 1 << vs->vs_clshift;
2882 cl_size = pages_in_cl * vm_page_size;
2883
2884 if (!dp_internal) {
2885 unsigned int page_list_count;
2886 int request_flags;
2887 unsigned int super_size;
2888 int first_dirty;
2889 int num_dirty;
2890 int num_of_pages;
2891 int seg_index;
2892 upl_offset_t upl_offset;
2893 vm_offset_t seg_offset;
2894 vm_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2895 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2896
2897
2898 if (bs_low) {
2899 super_size = cl_size;
2900
2901 request_flags = UPL_NOBLOCK |
2902 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2903 UPL_NO_SYNC | UPL_SET_INTERNAL;
2904 } else {
2905 super_size = VM_SUPER_CLUSTER;
2906
2907 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2908 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2909 UPL_NO_SYNC | UPL_SET_INTERNAL;
2910 }
2911
2912 if (!dp_encryption_inited) {
2913 /*
2914 * ENCRYPTED SWAP:
2915 * Once we've started using swap, we
2916 * can't change our mind on whether
2917 * it needs to be encrypted or
2918 * not.
2919 */
2920 dp_encryption_inited = TRUE;
2921 }
2922 if (dp_encryption) {
2923 /*
2924 * ENCRYPTED SWAP:
2925 * request that the UPL be prepared for
2926 * encryption.
2927 */
2928 request_flags |= UPL_ENCRYPT;
2929 flags |= UPL_PAGING_ENCRYPTED;
2930 }
2931
2932 page_list_count = 0;
2933 memory_object_super_upl_request(vs->vs_control,
2934 (memory_object_offset_t)offset,
2935 cnt, super_size,
2936 &upl, NULL, &page_list_count,
2937 request_flags | UPL_FOR_PAGEOUT);
2938
2939 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2940
2941 seg_size = cl_size - (upl->offset % cl_size);
2942 upl_offset = upl->offset & ~(cl_size - 1);
2943
2944 for (seg_index = 0, transfer_size = upl->size;
2945 transfer_size > 0; ) {
2946 ps_offset[seg_index] =
2947 ps_clmap(vs,
2948 upl_offset,
2949 &clmap, CL_ALLOC,
2950 cl_size, 0);
2951
2952 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2953 upl_abort(upl, 0);
2954 upl_deallocate(upl);
2955
2956 return KERN_FAILURE;
2957
2958 }
2959 psp[seg_index] = CLMAP_PS(clmap);
2960
2961 if (transfer_size > seg_size) {
2962 transfer_size -= seg_size;
2963 upl_offset += cl_size;
2964 seg_size = cl_size;
2965 seg_index++;
2966 } else
2967 transfer_size = 0;
2968 }
2969 /*
2970 * Ignore any non-present pages at the end of the
2971 * UPL.
2972 */
2973 for (page_index = upl->size / vm_page_size; page_index > 0;)
2974 if (UPL_PAGE_PRESENT(pl, --page_index))
2975 break;
2976 num_of_pages = page_index + 1;
2977
2978 base_index = (upl->offset % cl_size) / PAGE_SIZE;
2979
2980 for (page_index = 0; page_index < num_of_pages; ) {
2981 /*
2982 * skip over non-dirty pages
2983 */
2984 for ( ; page_index < num_of_pages; page_index++) {
2985 if (UPL_DIRTY_PAGE(pl, page_index)
2986 || UPL_PRECIOUS_PAGE(pl, page_index))
2987 /*
2988 * this is a page we need to write
2989 * go see if we can buddy it up with
2990 * others that are contiguous to it
2991 */
2992 break;
2993 /*
2994 * if the page is not dirty, but present, we
2995 * need to commit it... This is an unusual
2996 * case since we only asked for dirty pages
2997 */
2998 if (UPL_PAGE_PRESENT(pl, page_index)) {
2999 boolean_t empty = FALSE;
3000 upl_commit_range(upl,
3001 page_index * vm_page_size,
3002 vm_page_size,
3003 UPL_COMMIT_NOTIFY_EMPTY,
3004 pl,
3005 page_list_count,
3006 &empty);
3007 if (empty) {
3008 assert(page_index ==
3009 num_of_pages - 1);
3010 upl_deallocate(upl);
3011 }
3012 }
3013 }
3014 if (page_index == num_of_pages)
3015 /*
3016 * no more pages to look at, we're out of here
3017 */
3018 break;
3019
3020 /*
3021 * gather up contiguous dirty pages... we have at
3022 * least 1, otherwise we would have bailed above;
3023 * make sure that each physical segment that we step
3024 * into is contiguous to the one we're currently in
3025 * if it's not, we have to stop and write what we have
3026 */
3027 for (first_dirty = page_index;
3028 page_index < num_of_pages; ) {
3029 if ( !UPL_DIRTY_PAGE(pl, page_index)
3030 && !UPL_PRECIOUS_PAGE(pl, page_index))
3031 break;
3032 page_index++;
3033 /*
3034 * if we just looked at the last page in the UPL
3035 * we don't need to check for physical segment
3036 * continuity
3037 */
3038 if (page_index < num_of_pages) {
3039 int cur_seg;
3040 int nxt_seg;
3041
3042 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3043 nxt_seg = (base_index + page_index)/pages_in_cl;
3044
3045 if (cur_seg != nxt_seg) {
3046 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3047 /*
3048 * if the segment we're about
3049 * to step into is not
3050 * contiguous to the one we're
3051 * currently in, or it's in a
3052 * different paging file....
3053 * we stop here and generate
3054 * the I/O
3055 */
3056 break;
3057 }
3058 }
3059 }
3060 num_dirty = page_index - first_dirty;
3061
3062 if (num_dirty) {
3063 upl_offset = first_dirty * vm_page_size;
3064 transfer_size = num_dirty * vm_page_size;
3065
3066 while (transfer_size) {
3067
3068 if ((seg_size = cl_size -
3069 ((upl->offset + upl_offset) % cl_size))
3070 > transfer_size)
3071 seg_size = transfer_size;
3072
3073 ps_vs_write_complete(vs,
3074 upl->offset + upl_offset,
3075 seg_size, error);
3076
3077 transfer_size -= seg_size;
3078 upl_offset += seg_size;
3079 }
3080 upl_offset = first_dirty * vm_page_size;
3081 transfer_size = num_dirty * vm_page_size;
3082
3083 seg_index = (base_index + first_dirty) / pages_in_cl;
3084 seg_offset = (upl->offset + upl_offset) % cl_size;
3085
3086 error = ps_write_file(psp[seg_index],
3087 upl, upl_offset,
3088 ps_offset[seg_index]
3089 + seg_offset,
3090 transfer_size, flags);
3091 } else {
3092 boolean_t empty = FALSE;
3093 upl_abort_range(upl,
3094 first_dirty * vm_page_size,
3095 num_dirty * vm_page_size,
3096 UPL_ABORT_NOTIFY_EMPTY,
3097 &empty);
3098 if (empty) {
3099 assert(page_index == num_of_pages);
3100 upl_deallocate(upl);
3101 }
3102 }
3103 }
3104
3105 } else {
3106 assert(cnt <= (vm_page_size << vs->vs_clshift));
3107 list_size = cnt;
3108
3109 page_index = 0;
3110 /* The caller provides a mapped_data which is derived */
3111 /* from a temporary object. The targeted pages are */
3112 /* guaranteed to be set at offset 0 in the mapped_data */
3113 /* The actual offset however must still be derived */
3114 /* from the offset in the vs in question */
3115 mobj_base_addr = offset;
3116 mobj_target_addr = mobj_base_addr;
3117
3118 for (transfer_size = list_size; transfer_size != 0;) {
3119 actual_offset = ps_clmap(vs, mobj_target_addr,
3120 &clmap, CL_ALLOC,
3121 transfer_size < cl_size ?
3122 transfer_size : cl_size, 0);
3123 if(actual_offset == (vm_offset_t) -1) {
3124 error = 1;
3125 break;
3126 }
3127 cnt = MIN(transfer_size,
3128 CLMAP_NPGS(clmap) * vm_page_size);
3129 ps = CLMAP_PS(clmap);
3130 /* Assume that the caller has given us contiguous */
3131 /* pages */
3132 if(cnt) {
3133 ps_vs_write_complete(vs, mobj_target_addr,
3134 cnt, error);
3135 error = ps_write_file(ps, internal_upl,
3136 0, actual_offset,
3137 cnt, flags);
3138 if (error)
3139 break;
3140 }
3141 if (error)
3142 break;
3143 actual_offset += cnt;
3144 mobj_target_addr += cnt;
3145 transfer_size -= cnt;
3146 cnt = 0;
3147
3148 if (error)
3149 break;
3150 }
3151 }
3152 if(error)
3153 return KERN_FAILURE;
3154 else
3155 return KERN_SUCCESS;
3156 }
3157
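/*
 * Return the number of bytes of backing store currently allocated to
 * this vstruct, computed by counting the set bits in each cluster's
 * allocation bitmap.
 */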
3158 vm_size_t
3159 ps_vstruct_allocated_size(
3160 vstruct_t vs)
3161 {
3162 int num_pages;
3163 struct vs_map *vsmap;
3164 unsigned int i, j, k;
3165
3166 num_pages = 0;
3167 if (vs->vs_indirect) {
3168 /* loop on indirect maps */
3169 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3170 vsmap = vs->vs_imap[i];
3171 if (vsmap == NULL)
3172 continue;
3173 /* loop on clusters in this indirect map */
3174 for (j = 0; j < CLMAP_ENTRIES; j++) {
3175 if (VSM_ISCLR(vsmap[j]) ||
3176 VSM_ISERR(vsmap[j]))
3177 continue;
3178 /* loop on pages in this cluster */
3179 for (k = 0; k < VSCLSIZE(vs); k++) {
3180 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3181 num_pages++;
3182 }
3183 }
3184 }
3185 } else {
3186 vsmap = vs->vs_dmap;
3187 if (vsmap == NULL)
3188 return 0;
3189 /* loop on clusters in the direct map */
3190 for (j = 0; j < CLMAP_ENTRIES; j++) {
3191 if (VSM_ISCLR(vsmap[j]) ||
3192 VSM_ISERR(vsmap[j]))
3193 continue;
3194 /* loop on pages in this cluster */
3195 for (k = 0; k < VSCLSIZE(vs); k++) {
3196 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3197 num_pages++;
3198 }
3199 }
3200 }
3201
3202 return ptoa_32(num_pages);
3203 }
3204
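/*
 * Record the vstruct-relative offset of each allocated page in the
 * `pages' array (up to `pages_size' entries) and return the total
 * number of allocated pages, which may exceed pages_size.
 */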
3205 size_t
3206 ps_vstruct_allocated_pages(
3207 vstruct_t vs,
3208 default_pager_page_t *pages,
3209 size_t pages_size)
3210 {
3211 unsigned int num_pages;
3212 struct vs_map *vsmap;
3213 vm_offset_t offset;
3214 unsigned int i, j, k;
3215
3216 num_pages = 0;
3217 offset = 0;
3218 if (vs->vs_indirect) {
3219 /* loop on indirect maps */
3220 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3221 vsmap = vs->vs_imap[i];
3222 if (vsmap == NULL) {
3223 offset += (vm_page_size * CLMAP_ENTRIES *
3224 VSCLSIZE(vs));
3225 continue;
3226 }
3227 /* loop on clusters in this indirect map */
3228 for (j = 0; j < CLMAP_ENTRIES; j++) {
3229 if (VSM_ISCLR(vsmap[j]) ||
3230 VSM_ISERR(vsmap[j])) {
3231 offset += vm_page_size * VSCLSIZE(vs);
3232 continue;
3233 }
3234 /* loop on pages in this cluster */
3235 for (k = 0; k < VSCLSIZE(vs); k++) {
3236 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3237 num_pages++;
3238 if (num_pages < pages_size)
3239 pages++->dpp_offset =
3240 offset;
3241 }
3242 offset += vm_page_size;
3243 }
3244 }
3245 }
3246 } else {
3247 vsmap = vs->vs_dmap;
3248 if (vsmap == NULL)
3249 return 0;
3250 /* loop on clusters in the direct map */
3251 for (j = 0; j < CLMAP_ENTRIES; j++) {
3252 if (VSM_ISCLR(vsmap[j]) ||
3253 VSM_ISERR(vsmap[j])) {
3254 offset += vm_page_size * VSCLSIZE(vs);
3255 continue;
3256 }
3257 /* loop on pages in this cluster */
3258 for (k = 0; k < VSCLSIZE(vs); k++) {
3259 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3260 num_pages++;
3261 if (num_pages < pages_size)
3262 pages++->dpp_offset = offset;
3263 }
3264 offset += vm_page_size;
3265 }
3266 }
3267 }
3268
3269 return num_pages;
3270 }
3271
3272
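/*
 * ps_vstruct_transfer_from_segment: move every cluster of `vs' that
 * resides on `segment' to other paging segments via
 * vs_cluster_transfer().  The vstruct locks are dropped and re-taken
 * between clusters so that other readers and writers can make
 * progress; used when a paging segment is being taken off-line.
 */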
3273 kern_return_t
3274 ps_vstruct_transfer_from_segment(
3275 vstruct_t vs,
3276 paging_segment_t segment,
3277 upl_t upl)
3278 {
3279 struct vs_map *vsmap;
3280 // struct vs_map old_vsmap;
3281 // struct vs_map new_vsmap;
3282 unsigned int i, j;
3283
3284 VS_LOCK(vs); /* block all work on this vstruct */
3285 /* can't allow the normal multiple write */
3286 /* semantic because writes may conflict */
3287 vs->vs_xfer_pending = TRUE;
3288 vs_wait_for_sync_writers(vs);
3289 vs_start_write(vs);
3290 vs_wait_for_readers(vs);
3291 /* we will unlock the vs to allow other writes while transferring */
3292 /* and will be guaranteed of the persistence of the vs struct */
3293 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3294 /* vs_async_pending */
3295 /* OK we now have guaranteed no other parties are accessing this */
3296 /* vs. Now that we are also supporting simple lock versions of */
3297 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3298 /* our purpose in holding it before was the multiple write case */
3299 /* we now use the boolean xfer_pending to do that. We can use */
3300 /* a boolean instead of a count because we have guaranteed single */
3301 /* file access to this code in its caller */
3302 VS_UNLOCK(vs);
3303 vs_changed:
3304 if (vs->vs_indirect) {
3305 unsigned int vsmap_size;
3306 int clmap_off;
3307 /* loop on indirect maps */
3308 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3309 vsmap = vs->vs_imap[i];
3310 if (vsmap == NULL)
3311 continue;
3312 /* loop on clusters in this indirect map */
3313 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3314 VSCLSIZE(vs) * i);
3315 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3316 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3317 else
3318 vsmap_size = CLMAP_ENTRIES;
3319 for (j = 0; j < vsmap_size; j++) {
3320 if (VSM_ISCLR(vsmap[j]) ||
3321 VSM_ISERR(vsmap[j]) ||
3322 (VSM_PS(vsmap[j]) != segment))
3323 continue;
3324 if(vs_cluster_transfer(vs,
3325 (vm_page_size * (j << vs->vs_clshift))
3326 + clmap_off,
3327 vm_page_size << vs->vs_clshift,
3328 upl)
3329 != KERN_SUCCESS) {
3330 VS_LOCK(vs);
3331 vs->vs_xfer_pending = FALSE;
3332 VS_UNLOCK(vs);
3333 vs_finish_write(vs);
3334 return KERN_FAILURE;
3335 }
3336 /* allow other readers/writers during transfer*/
3337 VS_LOCK(vs);
3338 vs->vs_xfer_pending = FALSE;
3339 VS_UNLOCK(vs);
3340 vs_finish_write(vs);
3341 VS_LOCK(vs);
3342 vs->vs_xfer_pending = TRUE;
3343 vs_wait_for_sync_writers(vs);
3344 vs_start_write(vs);
3345 vs_wait_for_readers(vs);
3346 VS_UNLOCK(vs);
3347 if (!(vs->vs_indirect)) {
3348 goto vs_changed;
3349 }
3350 }
3351 }
3352 } else {
3353 vsmap = vs->vs_dmap;
3354 if (vsmap == NULL) {
3355 VS_LOCK(vs);
3356 vs->vs_xfer_pending = FALSE;
3357 VS_UNLOCK(vs);
3358 vs_finish_write(vs);
3359 return KERN_SUCCESS;
3360 }
3361 /* loop on clusters in the direct map */
3362 for (j = 0; j < vs->vs_size; j++) {
3363 if (VSM_ISCLR(vsmap[j]) ||
3364 VSM_ISERR(vsmap[j]) ||
3365 (VSM_PS(vsmap[j]) != segment))
3366 continue;
3367 if(vs_cluster_transfer(vs,
3368 vm_page_size * (j << vs->vs_clshift),
3369 vm_page_size << vs->vs_clshift,
3370 upl) != KERN_SUCCESS) {
3371 VS_LOCK(vs);
3372 vs->vs_xfer_pending = FALSE;
3373 VS_UNLOCK(vs);
3374 vs_finish_write(vs);
3375 return KERN_FAILURE;
3376 }
3377 /* allow other readers/writers during transfer*/
3378 VS_LOCK(vs);
3379 vs->vs_xfer_pending = FALSE;
3380 VS_UNLOCK(vs);
3381 vs_finish_write(vs);
3382 VS_LOCK(vs);
3383 vs->vs_xfer_pending = TRUE;
3384 VS_UNLOCK(vs);
3385 vs_wait_for_sync_writers(vs);
3386 vs_start_write(vs);
3387 vs_wait_for_readers(vs);
3388 if (vs->vs_indirect) {
3389 goto vs_changed;
3390 }
3391 }
3392 }
3393
3394 VS_LOCK(vs);
3395 vs->vs_xfer_pending = FALSE;
3396 VS_UNLOCK(vs);
3397 vs_finish_write(vs);
3398 return KERN_SUCCESS;
3399 }
3400
3401
3402
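/*
 * Return the vs_map entry that covers `offset', or NULL if the
 * corresponding indirect block has not been allocated.
 */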
3403 vs_map_t
3404 vs_get_map_entry(
3405 vstruct_t vs,
3406 vm_offset_t offset)
3407 {
3408 struct vs_map *vsmap;
3409 vm_offset_t cluster;
3410
3411 cluster = atop_32(offset) >> vs->vs_clshift;
3412 if (vs->vs_indirect) {
3413 long ind_block = cluster/CLMAP_ENTRIES;
3414
3415 /* Is the indirect block allocated? */
3416 vsmap = vs->vs_imap[ind_block];
3417 if(vsmap == (vs_map_t) NULL)
3418 return vsmap;
3419 } else
3420 vsmap = vs->vs_dmap;
3421 vsmap += cluster%CLMAP_ENTRIES;
3422 return vsmap;
3423 }
3424
3425 kern_return_t
3426 vs_cluster_transfer(
3427 vstruct_t vs,
3428 vm_offset_t offset,
3429 vm_size_t cnt,
3430 upl_t upl)
3431 {
3432 vm_offset_t actual_offset;
3433 paging_segment_t ps;
3434 struct clmap clmap;
3435 kern_return_t error = KERN_SUCCESS;
3436 unsigned int size, size_wanted;
3437 int i;
3438 unsigned int residual = 0;
3439 unsigned int unavail_size;
3440 // default_pager_thread_t *dpt;
3441 // boolean_t dealloc;
3442 struct vs_map *vsmap_ptr = NULL;
3443 struct vs_map read_vsmap;
3444 struct vs_map original_read_vsmap;
3445 struct vs_map write_vsmap;
3446 // upl_t sync_upl;
3447 // vm_offset_t ioaddr;
3448
3449 /* vs_cluster_transfer reads in the pages of a cluster and
3450 * then writes these pages back to new backing store. The
3451 * segment the pages are being read from is assumed to have
3452 * been taken off-line and is no longer considered for new
3453 * space requests.
3454 */
3455
3456 /*
3457 * This loop will be executed once per cluster referenced.
3458 * Typically this means once, since it's unlikely that the
3459 * VM system will ask for anything spanning cluster boundaries.
3460 *
3461 * If there are holes in a cluster (in a paging segment), we stop
3462 * reading at the hole, then loop again, hoping to
3463 * find valid pages later in the cluster. This continues until
3464 * the entire range has been examined, and read, if present. The
3465 * pages are written as they are read. If a failure occurs after
3466 * some pages are written the unmap call at the bottom of the loop
3467 * recovers the backing store and the old backing store remains
3468 * in effect.
3469 */
3470
3471 VSM_CLR(write_vsmap);
3472 VSM_CLR(original_read_vsmap);
3473 /* grab the actual object's pages to sync with I/O */
3474 while (cnt && (error == KERN_SUCCESS)) {
3475 vsmap_ptr = vs_get_map_entry(vs, offset);
3476 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3477
3478 if (actual_offset == (vm_offset_t) -1) {
3479
3480 /*
3481 * Nothing left to write in this cluster; at least
3482 * set the write cluster information for any previous
3483 * write, and clear it for the next cluster, if there is one
3484 */
3485 unsigned int local_size, clmask, clsize;
3486
3487 clsize = vm_page_size << vs->vs_clshift;
3488 clmask = clsize - 1;
3489 local_size = clsize - (offset & clmask);
3490 ASSERT(local_size);
3491 local_size = MIN(local_size, cnt);
3492
3493 /* This cluster has no data in it beyond what may */
3494 /* have been found on a previous iteration through */
3495 /* the loop "write_vsmap" */
3496 *vsmap_ptr = write_vsmap;
3497 VSM_CLR(write_vsmap);
3498 VSM_CLR(original_read_vsmap);
3499
3500 cnt -= local_size;
3501 offset += local_size;
3502 continue;
3503 }
3504
3505 /*
3506 * Count up contiguous available or unavailable
3507 * pages.
3508 */
3509 ps = CLMAP_PS(clmap);
3510 ASSERT(ps);
3511 size = 0;
3512 unavail_size = 0;
3513 for (i = 0;
3514 (size < cnt) && (unavail_size < cnt) &&
3515 (i < CLMAP_NPGS(clmap)); i++) {
3516 if (CLMAP_ISSET(clmap, i)) {
3517 if (unavail_size != 0)
3518 break;
3519 size += vm_page_size;
3520 BS_STAT(ps->ps_bs,
3521 ps->ps_bs->bs_pages_in++);
3522 } else {
3523 if (size != 0)
3524 break;
3525 unavail_size += vm_page_size;
3526 }
3527 }
3528
3529 if (size == 0) {
3530 ASSERT(unavail_size);
3531 cnt -= unavail_size;
3532 offset += unavail_size;
3533 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3534 == 0) {
3535 /* There is no more to transfer in this
3536 cluster
3537 */
3538 *vsmap_ptr = write_vsmap;
3539 VSM_CLR(write_vsmap);
3540 VSM_CLR(original_read_vsmap);
3541 }
3542 continue;
3543 }
3544
3545 if(VSM_ISCLR(original_read_vsmap))
3546 original_read_vsmap = *vsmap_ptr;
3547
3548 if(ps->ps_segtype == PS_PARTITION) {
3549 panic("swap partition not supported\n");
3550 /*NOTREACHED*/
3551 error = KERN_FAILURE;
3552 residual = size;
3553 /*
3554 NEED TO ISSUE WITH SYNC & NO COMMIT
3555 error = ps_read_device(ps, actual_offset, &buffer,
3556 size, &residual, flags);
3557 */
3558 } else {
3559 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3560 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
3561 size, &residual,
3562 (UPL_IOSYNC | UPL_NOCOMMIT));
3563 }
3564
3565 read_vsmap = *vsmap_ptr;
3566
3567
3568 /*
3569 * Adjust counts and put data in new BS. Optimize for the
3570 * common case, i.e. no error and/or partial data.
3571 * If there was an error, then we need to error the entire
3572 * range, even if some data was successfully read.
3573 *
3574 */
3575 if ((error == KERN_SUCCESS) && (residual == 0)) {
3576
3577 /*
3578 * Got everything we asked for, supply the data to
3579 * the new BS. Note that as a side effect of supplying
3580 * the data, the buffer holding the supplied data is
3581 * deallocated from the pager's address space unless
3582 * the write is unsuccessful.
3583 */
3584
3585 /* note buffer will be cleaned up in all cases by */
3586 /* internal_cluster_write or if an error on write */
3587 /* the vm_map_copy_page_discard call */
3588 *vsmap_ptr = write_vsmap;
3589
3590 if(vs_cluster_write(vs, upl, offset,
3591 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3592 error = KERN_FAILURE;
3593 if(!(VSM_ISCLR(*vsmap_ptr))) {
3594 /* unmap the new backing store object */
3595 ps_clunmap(vs, offset, size);
3596 }
3597 /* original vsmap */
3598 *vsmap_ptr = original_read_vsmap;
3599 VSM_CLR(write_vsmap);
3600 } else {
3601 if((offset + size) &
3602 ((vm_page_size << vs->vs_clshift)
3603 - 1)) {
3604 /* There is more to transfer in this
3605 cluster
3606 */
3607 write_vsmap = *vsmap_ptr;
3608 *vsmap_ptr = read_vsmap;
3609 } else {
3610 /* discard the old backing object */
3611 write_vsmap = *vsmap_ptr;
3612 *vsmap_ptr = read_vsmap;
3613 ps_clunmap(vs, offset, size);
3614 *vsmap_ptr = write_vsmap;
3615 VSM_CLR(write_vsmap);
3616 VSM_CLR(original_read_vsmap);
3617 }
3618 }
3619 } else {
3620 size_wanted = size;
3621 if (error == KERN_SUCCESS) {
3622 if (residual == size) {
3623 /*
3624 * If a read operation returns no error
3625 * and no data moved, we turn it into
3626 * an error, assuming we're reading at
3627 * or beyond EOF.
3628 * Fall through and error the entire
3629 * range.
3630 */
3631 error = KERN_FAILURE;
3632 *vsmap_ptr = write_vsmap;
3633 if(!(VSM_ISCLR(*vsmap_ptr))) {
3634 /* unmap the new backing store object */
3635 ps_clunmap(vs, offset, size);
3636 }
3637 *vsmap_ptr = original_read_vsmap;
3638 VSM_CLR(write_vsmap);
3639 continue;
3640 } else {
3641 /*
3642 * Otherwise, we have partial read.
3643 * This is also considered an error
3644 * for the purposes of cluster transfer
3645 */
3646 error = KERN_FAILURE;
3647 *vsmap_ptr = write_vsmap;
3648 if(!(VSM_ISCLR(*vsmap_ptr))) {
3649 /* unmap the new backing store object */
3650 ps_clunmap(vs, offset, size);
3651 }
3652 *vsmap_ptr = original_read_vsmap;
3653 VSM_CLR(write_vsmap);
3654 continue;
3655 }
3656 }
3657
3658 }
3659 cnt -= size;
3660 offset += size;
3661
3662 } /* END while (cnt && (error == 0)) */
3663 if(!VSM_ISCLR(write_vsmap))
3664 *vsmap_ptr = write_vsmap;
3665
3666 return error;
3667 }
3668
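/*
 * default_pager_add_file: register the vnode `vp' as a new file-backed
 * paging segment of `size' records of `record_size' bytes each under
 * the given backing store.  Duplicate vnodes are rejected, the cluster
 * allocation bitmap is initialized, and the new space is advertised
 * via bs_more_space().
 */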
3669 kern_return_t
3670 default_pager_add_file(
3671 MACH_PORT_FACE backing_store,
3672 vnode_ptr_t vp,
3673 int record_size,
3674 vm_size_t size)
3675 {
3676 backing_store_t bs;
3677 paging_segment_t ps;
3678 int i;
3679 unsigned int j;
3680 int error;
3681
3682 if ((bs = backing_store_lookup(backing_store))
3683 == BACKING_STORE_NULL)
3684 return KERN_INVALID_ARGUMENT;
3685
3686 PSL_LOCK();
3687 for (i = 0; i <= paging_segment_max; i++) {
3688 ps = paging_segments[i];
3689 if (ps == PAGING_SEGMENT_NULL)
3690 continue;
3691 if (ps->ps_segtype != PS_FILE)
3692 continue;
3693
3694 /*
3695 * Check for overlap on same device.
3696 */
3697 if (ps->ps_vnode == (struct vnode *)vp) {
3698 PSL_UNLOCK();
3699 BS_UNLOCK(bs);
3700 return KERN_INVALID_ARGUMENT;
3701 }
3702 }
3703 PSL_UNLOCK();
3704
3705 /*
3706 * Set up the paging segment
3707 */
3708 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3709 if (ps == PAGING_SEGMENT_NULL) {
3710 BS_UNLOCK(bs);
3711 return KERN_RESOURCE_SHORTAGE;
3712 }
3713
3714 ps->ps_segtype = PS_FILE;
3715 ps->ps_vnode = (struct vnode *)vp;
3716 ps->ps_offset = 0;
3717 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3718 ps->ps_recnum = size;
3719 ps->ps_pgnum = size >> ps->ps_record_shift;
3720
3721 ps->ps_pgcount = ps->ps_pgnum;
3722 ps->ps_clshift = local_log2(bs->bs_clsize);
3723 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3724 ps->ps_hint = 0;
3725
3726 PS_LOCK_INIT(ps);
3727 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3728 if (!ps->ps_bmap) {
3729 kfree(ps, sizeof *ps);
3730 BS_UNLOCK(bs);
3731 return KERN_RESOURCE_SHORTAGE;
3732 }
3733 for (j = 0; j < ps->ps_ncls; j++) {
3734 clrbit(ps->ps_bmap, j);
3735 }
3736
3737 ps->ps_going_away = FALSE;
3738 ps->ps_bs = bs;
3739
3740 if ((error = ps_enter(ps)) != 0) {
3741 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3742 kfree(ps, sizeof *ps);
3743 BS_UNLOCK(bs);
3744 return KERN_RESOURCE_SHORTAGE;
3745 }
3746
3747 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3748 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3749 PSL_LOCK();
3750 dp_pages_free += ps->ps_pgcount;
3751 PSL_UNLOCK();
3752
3753 BS_UNLOCK(bs);
3754
3755 bs_more_space(ps->ps_clcount);
3756
3757 DP_DEBUG(DEBUG_BS_INTERNAL,
3758 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3759 device, offset, size, record_size,
3760 ps->ps_record_shift, ps->ps_pgnum));
3761
3762 return KERN_SUCCESS;
3763 }
3764
3765
3766
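/*
 * ps_read_file: page `size' bytes into `upl' at `upl_offset' from the
 * paging file, starting at file offset ps_offset + offset.  Partial
 * reads are not reported at this level, so *residualp is 0 on success.
 */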
3767 kern_return_t
3768 ps_read_file(
3769 paging_segment_t ps,
3770 upl_t upl,
3771 upl_offset_t upl_offset,
3772 vm_offset_t offset,
3773 upl_size_t size,
3774 unsigned int *residualp,
3775 int flags)
3776 {
3777 vm_object_offset_t f_offset;
3778 int error = 0;
3779 int result;
3780
3781 assert(dp_encryption_inited);
3782
3783 clustered_reads[atop_32(size)]++;
3784
3785 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3786
3787 /* for transfer case we need to pass uploffset and flags */
3788 error = vnode_pagein(ps->ps_vnode,
3789 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
3790
3791 /* The vnode_pagein semantic is somewhat at odds with the existing */
3792 /* device_read semantic. Partial reads are not experienced at this */
3793 /* level. It is up to the bit map code and cluster read code to */
3794 /* check that requested data locations are actually backed, and the */
3795 /* pagein code to either read all of the requested data or return an */
3796 /* error. */
3797
3798 if (error)
3799 result = KERN_FAILURE;
3800 else {
3801 *residualp = 0;
3802 result = KERN_SUCCESS;
3803 }
3804 return result;
3805 }
3806
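/*
 * ps_write_file: page `size' bytes out of `upl' to the paging file at
 * file offset ps_offset + offset, encrypting the pages first when
 * UPL_PAGING_ENCRYPTED is set.
 */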
3807 kern_return_t
3808 ps_write_file(
3809 paging_segment_t ps,
3810 upl_t upl,
3811 upl_offset_t upl_offset,
3812 vm_offset_t offset,
3813 unsigned int size,
3814 int flags)
3815 {
3816 vm_object_offset_t f_offset;
3817 kern_return_t result;
3818
3819 assert(dp_encryption_inited);
3820
3821 clustered_writes[atop_32(size)]++;
3822 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3823
3824 if (flags & UPL_PAGING_ENCRYPTED) {
3825 /*
3826 * ENCRYPTED SWAP:
3827 * encrypt all the pages that we're going
3828 * to pageout.
3829 */
3830 upl_encrypt(upl, upl_offset, size);
3831 }
3832
3833 if (vnode_pageout(ps->ps_vnode,
3834 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3835 result = KERN_FAILURE;
3836 else
3837 result = KERN_SUCCESS;
3838
3839 return result;
3840 }
3841
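/*
 * default_pager_triggers: install the high/low free-space notification
 * ports and water marks, or turn swap encryption on or off before the
 * first use of swap.  The send right for any replaced or rejected
 * trigger port is released.
 */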
3842 kern_return_t
3843 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
3844 int hi_wat,
3845 int lo_wat,
3846 int flags,
3847 MACH_PORT_FACE trigger_port)
3848 {
3849 MACH_PORT_FACE release;
3850 kern_return_t kr;
3851
3852 PSL_LOCK();
3853 if (flags == SWAP_ENCRYPT_ON) {
3854 /* ENCRYPTED SWAP: turn encryption on */
3855 release = trigger_port;
3856 if (!dp_encryption_inited) {
3857 dp_encryption_inited = TRUE;
3858 dp_encryption = TRUE;
3859 kr = KERN_SUCCESS;
3860 } else {
3861 kr = KERN_FAILURE;
3862 }
3863 } else if (flags == SWAP_ENCRYPT_OFF) {
3864 /* ENCRYPTED SWAP: turn encryption off */
3865 release = trigger_port;
3866 if (!dp_encryption_inited) {
3867 dp_encryption_inited = TRUE;
3868 dp_encryption = FALSE;
3869 kr = KERN_SUCCESS;
3870 } else {
3871 kr = KERN_FAILURE;
3872 }
3873 } else if (flags == HI_WAT_ALERT) {
3874 release = min_pages_trigger_port;
3875 min_pages_trigger_port = trigger_port;
3876 minimum_pages_remaining = hi_wat/vm_page_size;
3877 bs_low = FALSE;
3878 kr = KERN_SUCCESS;
3879 } else if (flags == LO_WAT_ALERT) {
3880 release = max_pages_trigger_port;
3881 max_pages_trigger_port = trigger_port;
3882 maximum_pages_free = lo_wat/vm_page_size;
3883 kr = KERN_SUCCESS;
3884 } else {
3885 release = trigger_port;
3886 kr = KERN_INVALID_ARGUMENT;
3887 }
3888 PSL_UNLOCK();
3889
3890 if (IP_VALID(release))
3891 ipc_port_release_send(release);
3892
3893 return kr;
3894 }
3895
3896 /*
3897 * Monitor the amount of available backing store vs. the amount of
3898 * required backing store, notify a listener (if present) when
3899 * backing store may safely be removed.
3900 *
3901 * We attempt to avoid the situation where backing store is
3902 * discarded en masse, as this can lead to thrashing as the
3903 * backing store is compacted.
3904 */
3905
3906 #define PF_INTERVAL 3 /* time between free level checks */
3907 #define PF_LATENCY 10 /* number of intervals before release */
3908
3909 static int dp_pages_free_low_count = 0;
3910 thread_call_t default_pager_backing_store_monitor_callout;
3911
3912 void
3913 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
3914 __unused thread_call_param_t p2)
3915 {
3916 // unsigned long long average;
3917 ipc_port_t trigger;
3918 uint64_t deadline;
3919
3920 /*
3921 * We determine whether it will be safe to release some
3922 * backing store by watching the free page level. If
3923 * it remains below the maximum_pages_free threshold for
3924 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
3925 * then we deem it safe.
3926 *
3927 * Note that this establishes a maximum rate at which backing
3928 * store will be released, as each notification (currently)
3929 * only results in a single backing store object being
3930 * released.
3931 */
3932 if (dp_pages_free > maximum_pages_free) {
3933 dp_pages_free_low_count++;
3934 } else {
3935 dp_pages_free_low_count = 0;
3936 }
3937
3938 /* decide whether to send notification */
3939 trigger = IP_NULL;
3940 if (max_pages_trigger_port &&
3941 (backing_store_release_trigger_disable == 0) &&
3942 (dp_pages_free_low_count > PF_LATENCY)) {
3943 trigger = max_pages_trigger_port;
3944 max_pages_trigger_port = NULL;
3945 }
3946
3947 /* send notification */
3948 if (trigger != IP_NULL) {
3949 VSL_LOCK();
3950 if(backing_store_release_trigger_disable != 0) {
3951 assert_wait((event_t)
3952 &backing_store_release_trigger_disable,
3953 THREAD_UNINT);
3954 VSL_UNLOCK();
3955 thread_block(THREAD_CONTINUE_NULL);
3956 } else {
3957 VSL_UNLOCK();
3958 }
3959 default_pager_space_alert(trigger, LO_WAT_ALERT);
3960 ipc_port_release_send(trigger);
3961 dp_pages_free_low_count = 0;
3962 }
3963
3964 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
3965 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
3966 }