1 /*
2 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57 /*
58 * Default Pager.
59 * Paging File Management.
60 */
61
62 #include <mach/host_priv.h>
63 #include <mach/memory_object_control.h>
64 #include <mach/memory_object_server.h>
65 #include <mach/upl.h>
66 #include <default_pager/default_pager_internal.h>
67 #include <default_pager/default_pager_alerts.h>
68 #include <default_pager/default_pager_object_server.h>
69
70 #include <ipc/ipc_types.h>
71 #include <ipc/ipc_port.h>
72 #include <ipc/ipc_space.h>
73
74 #include <kern/kern_types.h>
75 #include <kern/host.h>
76 #include <kern/queue.h>
77 #include <kern/counters.h>
78 #include <kern/sched_prim.h>
79
80 #include <vm/vm_kern.h>
81 #include <vm/vm_pageout.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_protos.h>
85
86
87 /* LP64todo - need large internal object support */
88
89 /*
90 * ALLOC_STRIDE... the maximum number of bytes allocated from
91 * a swap file before moving on to the next swap file... if
92 * all swap files reside on a single disk, this value should
93 * be very large (this is the default assumption)... if the
94 * swap files are spread across multiple disks, then this value
95 * should be small (128 * 1024)...
96 *
97 * This should be determined dynamically in the future
98 */
99
100 #define ALLOC_STRIDE (1024 * 1024 * 1024)
101 int physical_transfer_cluster_count = 0;
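/*
 * Worked example (assuming 4 KB pages, i.e. vm_page_shift == 12):
 * ps_select_segment() below moves on to the next equal-priority segment
 * once (ALLOC_STRIDE >> (ps_clshift + vm_page_shift)) clusters have been
 * handed out from the current one.  With the default cluster shift of 2
 * that is (1 << 30) >> (2 + 12) == 65536 clusters, i.e. a full gigabyte
 * is written to one swap file before the allocator rotates.
 */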
102
103 #define VM_SUPER_CLUSTER 0x40000
104 #define VM_SUPER_PAGES 64
105
106 /*
107 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
108 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
109 */
110 #define VSTRUCT_DEF_CLSHIFT 2
111 int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
112 int default_pager_clsize = 0;
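/*
 * Worked example: a cluster shift of VSTRUCT_DEF_CLSHIFT == 2 gives
 * 1 << 2 == 4 pages per cluster, i.e. 16 KB clusters on a 4 KB-page
 * system.  bs_set_default_clsize() below accepts only 1, 2, 4 or 8
 * pages per cluster, i.e. shifts of 0 through 3.
 */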
113
114 /* statistics */
115 unsigned int clustered_writes[VM_SUPER_PAGES+1];
116 unsigned int clustered_reads[VM_SUPER_PAGES+1];
117
118 /*
119 * Globals used for asynchronous paging operations:
120 * vs_async_list: head of list of to-be-completed I/O ops
121 * async_num_queued: number of pages completed, but not yet
122 * processed by async thread.
123 * async_requests_out: number of pages of requests not completed.
124 */
125
126 #if 0
127 struct vs_async *vs_async_list;
128 int async_num_queued;
129 int async_requests_out;
130 #endif
131
132
133 #define VS_ASYNC_REUSE 1
134 struct vs_async *vs_async_free_list;
135
136 mutex_t default_pager_async_lock; /* Protects globals above */
137
138
139 int vs_alloc_async_failed = 0; /* statistics */
140 int vs_alloc_async_count = 0; /* statistics */
141 struct vs_async *vs_alloc_async(void); /* forward */
142 void vs_free_async(struct vs_async *vsa); /* forward */
143
144
145 #define VS_ALLOC_ASYNC() vs_alloc_async()
146 #define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
147
148 #define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
149 #define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
150 #define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, 0)
151 #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
152 /*
153 * Paging Space Hysteresis triggers and the target notification port
154 *
155 */
156
157 unsigned int minimum_pages_remaining = 0;
158 unsigned int maximum_pages_free = 0;
159 ipc_port_t min_pages_trigger_port = NULL;
160 ipc_port_t max_pages_trigger_port = NULL;
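/*
 * Summary of how the low-space trigger is used below (see
 * ps_select_segment() and ps_allocate_cluster()): when an allocation
 * drops dp_pages_free below minimum_pages_remaining, the allocator
 * consumes min_pages_trigger_port and, once the locks are dropped,
 * calls
 *	default_pager_space_alert(trigger, HI_WAT_ALERT);
 *	ipc_port_release_send(trigger);
 * bs_low may also be set to flag the shortage.  The path that registers
 * the trigger port is not shown here.
 */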
161
162 boolean_t bs_low = FALSE;
163 int backing_store_release_trigger_disable = 0;
164
165
166 /* Have we decided if swap needs to be encrypted yet ? */
167 boolean_t dp_encryption_inited = FALSE;
168 /* Should we encrypt swap ? */
169 boolean_t dp_encryption = FALSE;
170
171
172 /*
173 * Object sizes are rounded up to the next power of 2,
174 * unless they are bigger than a given maximum size.
175 */
176 vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
177
178 /*
179 * List of all backing store and segments.
180 */
181 struct backing_store_list_head backing_store_list;
182 paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
183 mutex_t paging_segments_lock;
184 int paging_segment_max = 0;
185 int paging_segment_count = 0;
186 int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
187
188
189 /*
190 * Total pages free in system
191 * This differs from clusters committed/avail, which is a measure of the
192 * over-commitment of paging segments to backing store, an idea which is
193 * likely to be deprecated.
194 */
195 unsigned int dp_pages_free = 0;
196 unsigned int cluster_transfer_minimum = 100;
197
198 /* forward declarations */
199 kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, int); /* forward */
200 kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
201 default_pager_thread_t *get_read_buffer( void );
202 kern_return_t ps_vstruct_transfer_from_segment(
203 vstruct_t vs,
204 paging_segment_t segment,
205 upl_t upl);
206 kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
207 kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
208 kern_return_t vs_cluster_transfer(
209 vstruct_t vs,
210 upl_offset_t offset,
211 upl_size_t cnt,
212 upl_t upl);
213 vs_map_t vs_get_map_entry(
214 vstruct_t vs,
215 vm_offset_t offset);
216
217
218 default_pager_thread_t *
219 get_read_buffer( void )
220 {
221 int i;
222
223 DPT_LOCK(dpt_lock);
224 while(TRUE) {
225 for (i=0; i<default_pager_internal_count; i++) {
226 if(dpt_array[i]->checked_out == FALSE) {
227 dpt_array[i]->checked_out = TRUE;
228 DPT_UNLOCK(dpt_lock);
229 return dpt_array[i];
230 }
231 }
232 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
233 }
234 }
235
236 void
237 bs_initialize(void)
238 {
239 int i;
240
241 /*
242 * List of all backing store.
243 */
244 BSL_LOCK_INIT();
245 queue_init(&backing_store_list.bsl_queue);
246 PSL_LOCK_INIT();
247
248 VS_ASYNC_LOCK_INIT();
249 #if VS_ASYNC_REUSE
250 vs_async_free_list = NULL;
251 #endif /* VS_ASYNC_REUSE */
252
253 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
254 clustered_writes[i] = 0;
255 clustered_reads[i] = 0;
256 }
257
258 }
259
260 /*
261 * When things do not quite work out...
262 */
263 void bs_no_paging_space(boolean_t); /* forward */
264
265 void
266 bs_no_paging_space(
267 boolean_t out_of_memory)
268 {
269
270 if (out_of_memory)
271 dprintf(("*** OUT OF MEMORY ***\n"));
272 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
273 }
274
275 void bs_more_space(int); /* forward */
276 void bs_commit(int); /* forward */
277
278 boolean_t user_warned = FALSE;
279 unsigned int clusters_committed = 0;
280 unsigned int clusters_available = 0;
281 unsigned int clusters_committed_peak = 0;
282
283 void
284 bs_more_space(
285 int nclusters)
286 {
287 BSL_LOCK();
288 /*
289 * Account for new paging space.
290 */
291 clusters_available += nclusters;
292
293 if (clusters_available >= clusters_committed) {
294 if (verbose && user_warned) {
295 printf("%s%s - %d excess clusters now.\n",
296 my_name,
297 "paging space is OK now",
298 clusters_available - clusters_committed);
299 user_warned = FALSE;
300 clusters_committed_peak = 0;
301 }
302 } else {
303 if (verbose && user_warned) {
304 printf("%s%s - still short of %d clusters.\n",
305 my_name,
306 "WARNING: paging space over-committed",
307 clusters_committed - clusters_available);
308 clusters_committed_peak -= nclusters;
309 }
310 }
311 BSL_UNLOCK();
312
313 return;
314 }
315
316 void
317 bs_commit(
318 int nclusters)
319 {
320 BSL_LOCK();
321 clusters_committed += nclusters;
322 if (clusters_committed > clusters_available) {
323 if (verbose && !user_warned) {
324 user_warned = TRUE;
325 printf("%s%s - short of %d clusters.\n",
326 my_name,
327 "WARNING: paging space over-committed",
328 clusters_committed - clusters_available);
329 }
330 if (clusters_committed > clusters_committed_peak) {
331 clusters_committed_peak = clusters_committed;
332 }
333 } else {
334 if (verbose && user_warned) {
335 printf("%s%s - was short of up to %d clusters.\n",
336 my_name,
337 "paging space is OK now",
338 clusters_committed_peak - clusters_available);
339 user_warned = FALSE;
340 clusters_committed_peak = 0;
341 }
342 }
343 BSL_UNLOCK();
344
345 return;
346 }
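/*
 * Worked example of the accounting above: with clusters_available at
 * 100, creating two 64-cluster objects calls bs_commit(64) twice; the
 * second call pushes clusters_committed to 128, records the new
 * 128-cluster peak and, if verbose and the user has not yet been
 * warned, prints "WARNING: paging space over-committed - short of 28
 * clusters."
 */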
347
348 int default_pager_info_verbose = 1;
349
350 void
351 bs_global_info(
352 vm_size_t *totalp,
353 vm_size_t *freep)
354 {
355 vm_size_t pages_total, pages_free;
356 paging_segment_t ps;
357 int i;
358
359 PSL_LOCK();
360 pages_total = pages_free = 0;
361 for (i = 0; i <= paging_segment_max; i++) {
362 ps = paging_segments[i];
363 if (ps == PAGING_SEGMENT_NULL)
364 continue;
365
366 /*
367 * no need to lock: by the time this data
368 * gets back to any remote requestor it
369 * will be obsolete anyway
370 */
371 pages_total += ps->ps_pgnum;
372 pages_free += ps->ps_clcount << ps->ps_clshift;
373 DP_DEBUG(DEBUG_BS_INTERNAL,
374 ("segment #%d: %d total, %d free\n",
375 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
376 }
377 *totalp = pages_total;
378 *freep = pages_free;
379 if (verbose && user_warned && default_pager_info_verbose) {
380 if (clusters_available < clusters_committed) {
381 printf("%s %d clusters committed, %d available.\n",
382 my_name,
383 clusters_committed,
384 clusters_available);
385 }
386 }
387 PSL_UNLOCK();
388 }
389
390 backing_store_t backing_store_alloc(void); /* forward */
391
392 backing_store_t
393 backing_store_alloc(void)
394 {
395 backing_store_t bs;
396
397 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
398 if (bs == BACKING_STORE_NULL)
399 panic("backing_store_alloc: no memory");
400
401 BS_LOCK_INIT(bs);
402 bs->bs_port = MACH_PORT_NULL;
403 bs->bs_priority = 0;
404 bs->bs_clsize = 0;
405 bs->bs_pages_total = 0;
406 bs->bs_pages_in = 0;
407 bs->bs_pages_in_fail = 0;
408 bs->bs_pages_out = 0;
409 bs->bs_pages_out_fail = 0;
410
411 return bs;
412 }
413
414 backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
415
416 /* In both the component space and external versions of this pager, */
417 /* backing_store_lookup will be called from tasks in the application space. */
418 backing_store_t
419 backing_store_lookup(
420 MACH_PORT_FACE port)
421 {
422 backing_store_t bs;
423
424 /*
425 The port is currently backed with a vs structure in the alias field.
426 We could create an ISBS alias and a port_is_bs call, but frankly
427 I see no reason for the test; the bs->bs_port == port check below
428 will work properly on junk entries.
429
430 if ((port == MACH_PORT_NULL) || port_is_vs(port))
431 */
432 if ((port == MACH_PORT_NULL))
433 return BACKING_STORE_NULL;
434
435 BSL_LOCK();
436 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
437 bs_links) {
438 BS_LOCK(bs);
439 if (bs->bs_port == port) {
440 BSL_UNLOCK();
441 /* Success, return it locked. */
442 return bs;
443 }
444 BS_UNLOCK(bs);
445 }
446 BSL_UNLOCK();
447 return BACKING_STORE_NULL;
448 }
449
450 void backing_store_add(backing_store_t); /* forward */
451
452 void
453 backing_store_add(
454 __unused backing_store_t bs)
455 {
456 // MACH_PORT_FACE port = bs->bs_port;
457 // MACH_PORT_FACE pset = default_pager_default_set;
458 kern_return_t kr = KERN_SUCCESS;
459
460 if (kr != KERN_SUCCESS)
461 panic("backing_store_add: add to set");
462
463 }
464
465 /*
466 * Set up default page shift, but only if not already
467 * set and argument is within range.
468 */
469 boolean_t
470 bs_set_default_clsize(unsigned int npages)
471 {
472 switch(npages){
473 case 1:
474 case 2:
475 case 4:
476 case 8:
477 if (default_pager_clsize == 0) /* if not yet set */
478 vstruct_def_clshift = local_log2(npages);
479 return(TRUE);
480 }
481 return(FALSE);
482 }
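/*
 * Hypothetical usage (for illustration only): an early initialization
 * path that wanted 4-page clusters could call
 *
 *	if (!bs_set_default_clsize(4))
 *		printf("%sunsupported cluster size\n", my_name);
 *
 * before the first paging file is added.  Once default_pager_clsize has
 * been fixed by bs_get_global_clsize(), later calls no longer change
 * the shift.
 */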
483
484 int bs_get_global_clsize(int clsize); /* forward */
485
486 int
487 bs_get_global_clsize(
488 int clsize)
489 {
490 int i;
491 memory_object_default_t dmm;
492 kern_return_t kr;
493
494 /*
495 * Only allow setting of cluster size once. If called
496 * with no cluster size (default), we use the compiled-in default
497 * for the duration. The same cluster size is used for all
498 * paging segments.
499 */
500 if (default_pager_clsize == 0) {
501 /*
502 * Keep the cluster size as a bit shift because the arithmetic
503 * is quicker and it's easier to keep at a power of 2.
504 */
505 if (clsize != NO_CLSIZE) {
506 for (i = 0; (1 << i) < clsize; i++);
507 if (i > MAX_CLUSTER_SHIFT)
508 i = MAX_CLUSTER_SHIFT;
509 vstruct_def_clshift = i;
510 }
511 default_pager_clsize = (1 << vstruct_def_clshift);
512
513 /*
514 * Let the user know the new (and definitive) cluster size.
515 */
516 if (verbose)
517 printf("%scluster size = %d page%s\n",
518 my_name, default_pager_clsize,
519 (default_pager_clsize == 1) ? "" : "s");
520
521 /*
522 * Let the kernel know too, in case it hasn't used the
523 * default value provided in main() yet.
524 */
525 dmm = default_pager_object;
526 clsize = default_pager_clsize * vm_page_size; /* in bytes */
527 kr = host_default_memory_manager(host_priv_self(),
528 &dmm,
529 clsize);
530 memory_object_default_deallocate(dmm);
531
532 if (kr != KERN_SUCCESS) {
533 panic("bs_get_global_cl_size:host_default_memory_manager");
534 }
535 if (dmm != default_pager_object) {
536 panic("bs_get_global_cl_size:there is another default pager");
537 }
538 }
539 ASSERT(default_pager_clsize > 0 &&
540 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
541
542 return default_pager_clsize;
543 }
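/*
 * Worked example for the shift computation above: a request for
 * clsize == 6 pages leaves the loop at i == 3 (the first shift with
 * 1 << i >= 6), so the cluster size is rounded up to 8 pages, subject
 * to the MAX_CLUSTER_SHIFT cap.  Passing NO_CLSIZE keeps the
 * compiled-in VSTRUCT_DEF_CLSHIFT.
 */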
544
545 kern_return_t
546 default_pager_backing_store_create(
547 memory_object_default_t pager,
548 int priority,
549 int clsize, /* in bytes */
550 MACH_PORT_FACE *backing_store)
551 {
552 backing_store_t bs;
553 MACH_PORT_FACE port;
554 // kern_return_t kr;
555 struct vstruct_alias *alias_struct;
556
557 if (pager != default_pager_object)
558 return KERN_INVALID_ARGUMENT;
559
560 bs = backing_store_alloc();
561 port = ipc_port_alloc_kernel();
562 ipc_port_make_send(port);
563 assert (port != IP_NULL);
564
565 DP_DEBUG(DEBUG_BS_EXTERNAL,
566 ("priority=%d clsize=%d bs_port=0x%x\n",
567 priority, clsize, (int) backing_store));
568
569 alias_struct = (struct vstruct_alias *)
570 kalloc(sizeof (struct vstruct_alias));
571 if(alias_struct != NULL) {
572 alias_struct->vs = (struct vstruct *)bs;
573 alias_struct->name = &default_pager_ops;
574 port->alias = (int) alias_struct;
575 }
576 else {
577 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
578 kfree(bs, sizeof (struct backing_store));
579 return KERN_RESOURCE_SHORTAGE;
580 }
581
582 bs->bs_port = port;
583 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
584 priority = BS_MAXPRI;
585 else if (priority == BS_NOPRI)
586 priority = BS_MAXPRI;
587 else
588 priority = BS_MINPRI;
589 bs->bs_priority = priority;
590
591 bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
592
593 BSL_LOCK();
594 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
595 bs_links);
596 BSL_UNLOCK();
597
598 backing_store_add(bs);
599
600 *backing_store = port;
601 return KERN_SUCCESS;
602 }
603
604 kern_return_t
605 default_pager_backing_store_info(
606 MACH_PORT_FACE backing_store,
607 backing_store_flavor_t flavour,
608 backing_store_info_t info,
609 mach_msg_type_number_t *size)
610 {
611 backing_store_t bs;
612 backing_store_basic_info_t basic;
613 int i;
614 paging_segment_t ps;
615
616 if (flavour != BACKING_STORE_BASIC_INFO ||
617 *size < BACKING_STORE_BASIC_INFO_COUNT)
618 return KERN_INVALID_ARGUMENT;
619
620 basic = (backing_store_basic_info_t)info;
621 *size = BACKING_STORE_BASIC_INFO_COUNT;
622
623 VSTATS_LOCK(&global_stats.gs_lock);
624 basic->pageout_calls = global_stats.gs_pageout_calls;
625 basic->pagein_calls = global_stats.gs_pagein_calls;
626 basic->pages_in = global_stats.gs_pages_in;
627 basic->pages_out = global_stats.gs_pages_out;
628 basic->pages_unavail = global_stats.gs_pages_unavail;
629 basic->pages_init = global_stats.gs_pages_init;
630 basic->pages_init_writes= global_stats.gs_pages_init_writes;
631 VSTATS_UNLOCK(&global_stats.gs_lock);
632
633 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
634 return KERN_INVALID_ARGUMENT;
635
636 basic->bs_pages_total = bs->bs_pages_total;
637 PSL_LOCK();
638 bs->bs_pages_free = 0;
639 for (i = 0; i <= paging_segment_max; i++) {
640 ps = paging_segments[i];
641 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
642 PS_LOCK(ps);
643 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
644 PS_UNLOCK(ps);
645 }
646 }
647 PSL_UNLOCK();
648 basic->bs_pages_free = bs->bs_pages_free;
649 basic->bs_pages_in = bs->bs_pages_in;
650 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
651 basic->bs_pages_out = bs->bs_pages_out;
652 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
653
654 basic->bs_priority = bs->bs_priority;
655 basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */
656
657 BS_UNLOCK(bs);
658
659 return KERN_SUCCESS;
660 }
661
662 int ps_delete(paging_segment_t); /* forward */
663
664 int
665 ps_delete(
666 paging_segment_t ps)
667 {
668 vstruct_t vs;
669 kern_return_t error = KERN_SUCCESS;
670 int vs_count;
671
672 VSL_LOCK(); /* get the lock on the list of vs's */
673
674 /* The lock relationship and sequence are fairly complicated: */
675 /* this code looks at a live list, locking and unlocking the list */
676 /* as it traverses it. It depends on the locking behavior of */
677 /* default_pager_no_senders. no_senders always locks the vstruct */
678 /* targeted for removal before locking the vstruct list. However */
679 /* it will remove that member of the list without locking its */
680 /* neighbors. We can be sure when we hold a lock on a vstruct */
681 /* it cannot be removed from the list but we must hold the list */
682 /* lock to be sure that its pointers to its neighbors are valid. */
683 /* Also, we can hold off destruction of a vstruct when the list */
684 /* lock and the vs locks are not being held by bumping the */
685 /* vs_async_pending count. */
686
687
688 while(backing_store_release_trigger_disable != 0) {
689 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
690 }
691
692 /* we will choose instead to hold a send right */
693 vs_count = vstruct_list.vsl_count;
694 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
695 if(vs == (vstruct_t)&vstruct_list) {
696 VSL_UNLOCK();
697 return KERN_SUCCESS;
698 }
699 VS_LOCK(vs);
700 vs_async_wait(vs); /* wait for any pending async writes */
701 if ((vs_count != 0) && (vs != NULL))
702 vs->vs_async_pending += 1; /* hold parties calling */
703 /* vs_async_wait */
704 VS_UNLOCK(vs);
705 VSL_UNLOCK();
706 while((vs_count != 0) && (vs != NULL)) {
707 /* We take the count of AMO's before beginning the */
708 /* transfer of the target segment. */
709 /* We are guaranteed that the target segment cannot get */
710 /* more users. We also know that queue entries are */
711 /* made at the back of the list. If some of the entries */
712 /* we would check disappear while we are traversing the */
713 /* list then we will either check new entries which */
714 /* do not have any backing store in the target segment */
715 /* or re-check old entries. This might not be optimal */
716 /* but it will always be correct. The alternative is to */
717 /* take a snapshot of the list. */
718 vstruct_t next_vs;
719
720 if(dp_pages_free < cluster_transfer_minimum)
721 error = KERN_FAILURE;
722 else {
723 vm_object_t transfer_object;
724 unsigned int count;
725 upl_t upl;
726
727 transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
728 count = 0;
729 error = vm_object_upl_request(transfer_object,
730 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
731 &upl, NULL, &count,
732 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_LITE | UPL_SET_INTERNAL);
733
734 if(error == KERN_SUCCESS) {
735 error = ps_vstruct_transfer_from_segment(
736 vs, ps, upl);
737 upl_commit(upl, NULL, 0);
738 upl_deallocate(upl);
739 } else {
740 error = KERN_FAILURE;
741 }
742 vm_object_deallocate(transfer_object);
743 }
744 if(error) {
745 VS_LOCK(vs);
746 vs->vs_async_pending -= 1; /* release vs_async_wait */
747 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
748 vs->vs_waiting_async = FALSE;
749 VS_UNLOCK(vs);
750 thread_wakeup(&vs->vs_async_pending);
751 } else {
752 VS_UNLOCK(vs);
753 }
754 return KERN_FAILURE;
755 }
756
757 VSL_LOCK();
758
759 while(backing_store_release_trigger_disable != 0) {
760 VSL_SLEEP(&backing_store_release_trigger_disable,
761 THREAD_UNINT);
762 }
763
764 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
765 if((next_vs != (vstruct_t)&vstruct_list) &&
766 (vs != next_vs) && (vs_count != 1)) {
767 VS_LOCK(next_vs);
768 vs_async_wait(next_vs); /* wait for any */
769 /* pending async writes */
770 next_vs->vs_async_pending += 1; /* hold parties */
771 /* calling vs_async_wait */
772 VS_UNLOCK(next_vs);
773 }
774 VSL_UNLOCK();
775 VS_LOCK(vs);
776 vs->vs_async_pending -= 1;
777 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
778 vs->vs_waiting_async = FALSE;
779 VS_UNLOCK(vs);
780 thread_wakeup(&vs->vs_async_pending);
781 } else {
782 VS_UNLOCK(vs);
783 }
784 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
785 vs = NULL;
786 else
787 vs = next_vs;
788 vs_count--;
789 }
790 return KERN_SUCCESS;
791 }
792
793
794 kern_return_t
795 default_pager_backing_store_delete(
796 MACH_PORT_FACE backing_store)
797 {
798 backing_store_t bs;
799 int i;
800 paging_segment_t ps;
801 int error;
802 int interim_pages_removed = 0;
803 // kern_return_t kr;
804
805 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
806 return KERN_INVALID_ARGUMENT;
807
808 #if 0
809 /* not implemented */
810 BS_UNLOCK(bs);
811 return KERN_FAILURE;
812 #endif
813
814 restart:
815 PSL_LOCK();
816 error = KERN_SUCCESS;
817 for (i = 0; i <= paging_segment_max; i++) {
818 ps = paging_segments[i];
819 if (ps != PAGING_SEGMENT_NULL &&
820 ps->ps_bs == bs &&
821 ! ps->ps_going_away) {
822 PS_LOCK(ps);
823 /* disable access to this segment */
824 ps->ps_going_away = TRUE;
825 PS_UNLOCK(ps);
826 /*
827 * The "ps" segment is "off-line" now,
828 * we can try and delete it...
829 */
830 if(dp_pages_free < (cluster_transfer_minimum
831 + ps->ps_pgcount)) {
832 error = KERN_FAILURE;
833 PSL_UNLOCK();
834 }
835 else {
836 /* remove all pages associated with the */
837 /* segment from the list of free pages */
838 /* when transfer is through, all target */
839 /* segment pages will appear to be free */
840
841 dp_pages_free -= ps->ps_pgcount;
842 interim_pages_removed += ps->ps_pgcount;
843 PSL_UNLOCK();
844 error = ps_delete(ps);
845 }
846 if (error != KERN_SUCCESS) {
847 /*
848 * We couldn't delete the segment,
849 * probably because there's not enough
850 * virtual memory left.
851 * Re-enable all the segments.
852 */
853 PSL_LOCK();
854 break;
855 }
856 goto restart;
857 }
858 }
859
860 if (error != KERN_SUCCESS) {
861 for (i = 0; i <= paging_segment_max; i++) {
862 ps = paging_segments[i];
863 if (ps != PAGING_SEGMENT_NULL &&
864 ps->ps_bs == bs &&
865 ps->ps_going_away) {
866 PS_LOCK(ps);
867 /* re-enable access to this segment */
868 ps->ps_going_away = FALSE;
869 PS_UNLOCK(ps);
870 }
871 }
872 dp_pages_free += interim_pages_removed;
873 PSL_UNLOCK();
874 BS_UNLOCK(bs);
875 return error;
876 }
877
878 for (i = 0; i <= paging_segment_max; i++) {
879 ps = paging_segments[i];
880 if (ps != PAGING_SEGMENT_NULL &&
881 ps->ps_bs == bs) {
882 if(ps->ps_going_away) {
883 paging_segments[i] = PAGING_SEGMENT_NULL;
884 paging_segment_count--;
885 PS_LOCK(ps);
886 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
887 kfree(ps, sizeof *ps);
888 }
889 }
890 }
891
892 /* Scan the entire ps array separately to make certain we find the */
893 /* proper paging_segment_max */
894 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
895 if(paging_segments[i] != PAGING_SEGMENT_NULL)
896 paging_segment_max = i;
897 }
898
899 PSL_UNLOCK();
900
901 /*
902 * All the segments have been deleted.
903 * We can remove the backing store.
904 */
905
906 /*
907 * Disable lookups of this backing store.
908 */
909 if((void *)bs->bs_port->alias != NULL)
910 kfree((void *) bs->bs_port->alias,
911 sizeof (struct vstruct_alias));
912 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
913 bs->bs_port = MACH_PORT_NULL;
914 BS_UNLOCK(bs);
915
916 /*
917 * Remove backing store from backing_store list.
918 */
919 BSL_LOCK();
920 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
921 bs_links);
922 BSL_UNLOCK();
923
924 /*
925 * Free the backing store structure.
926 */
927 kfree(bs, sizeof *bs);
928
929 return KERN_SUCCESS;
930 }
931
932 int ps_enter(paging_segment_t); /* forward */
933
934 int
935 ps_enter(
936 paging_segment_t ps)
937 {
938 int i;
939
940 PSL_LOCK();
941
942 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
943 if (paging_segments[i] == PAGING_SEGMENT_NULL)
944 break;
945 }
946
947 if (i < MAX_NUM_PAGING_SEGMENTS) {
948 paging_segments[i] = ps;
949 if (i > paging_segment_max)
950 paging_segment_max = i;
951 paging_segment_count++;
952 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
953 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
954 ps_select_array[ps->ps_bs->bs_priority] = 0;
955 i = 0;
956 } else {
957 PSL_UNLOCK();
958 return KERN_RESOURCE_SHORTAGE;
959 }
960
961 PSL_UNLOCK();
962 return i;
963 }
964
965 #ifdef DEVICE_PAGING
966 kern_return_t
967 default_pager_add_segment(
968 MACH_PORT_FACE backing_store,
969 MACH_PORT_FACE device,
970 recnum_t offset,
971 recnum_t count,
972 int record_size)
973 {
974 backing_store_t bs;
975 paging_segment_t ps;
976 int i;
977 int error;
978
979 if ((bs = backing_store_lookup(backing_store))
980 == BACKING_STORE_NULL)
981 return KERN_INVALID_ARGUMENT;
982
983 PSL_LOCK();
984 for (i = 0; i <= paging_segment_max; i++) {
985 ps = paging_segments[i];
986 if (ps == PAGING_SEGMENT_NULL)
987 continue;
988
989 /*
990 * Check for overlap on same device.
991 */
992 if (!(ps->ps_device != device
993 || offset >= ps->ps_offset + ps->ps_recnum
994 || offset + count <= ps->ps_offset)) {
995 PSL_UNLOCK();
996 BS_UNLOCK(bs);
997 return KERN_INVALID_ARGUMENT;
998 }
999 }
1000 PSL_UNLOCK();
1001
1002 /*
1003 * Set up the paging segment
1004 */
1005 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1006 if (ps == PAGING_SEGMENT_NULL) {
1007 BS_UNLOCK(bs);
1008 return KERN_RESOURCE_SHORTAGE;
1009 }
1010
1011 ps->ps_segtype = PS_PARTITION;
1012 ps->ps_device = device;
1013 ps->ps_offset = offset;
1014 ps->ps_record_shift = local_log2(vm_page_size / record_size);
1015 ps->ps_recnum = count;
1016 ps->ps_pgnum = count >> ps->ps_record_shift;
1017
1018 ps->ps_pgcount = ps->ps_pgnum;
1019 ps->ps_clshift = local_log2(bs->bs_clsize);
1020 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1021 ps->ps_hint = 0;
1022
1023 PS_LOCK_INIT(ps);
1024 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1025 if (!ps->ps_bmap) {
1026 kfree(ps, sizeof *ps);
1027 BS_UNLOCK(bs);
1028 return KERN_RESOURCE_SHORTAGE;
1029 }
1030 for (i = 0; i < ps->ps_ncls; i++) {
1031 clrbit(ps->ps_bmap, i);
1032 }
1033
1034 ps->ps_going_away = FALSE;
1035 ps->ps_bs = bs;
1036
1037 if ((error = ps_enter(ps)) != 0) {
1038 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1039 kfree(ps, sizeof *ps);
1040 BS_UNLOCK(bs);
1041 return KERN_RESOURCE_SHORTAGE;
1042 }
1043
1044 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1045 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1046 BS_UNLOCK(bs);
1047
1048 PSL_LOCK();
1049 dp_pages_free += ps->ps_pgcount;
1050 PSL_UNLOCK();
1051
1052 bs_more_space(ps->ps_clcount);
1053
1054 DP_DEBUG(DEBUG_BS_INTERNAL,
1055 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1056 device, offset, count, record_size,
1057 ps->ps_record_shift, ps->ps_pgnum));
1058
1059 return KERN_SUCCESS;
1060 }
1061
1062 boolean_t
1063 bs_add_device(
1064 char *dev_name,
1065 MACH_PORT_FACE master)
1066 {
1067 security_token_t null_security_token = {
1068 { 0, 0 }
1069 };
1070 MACH_PORT_FACE device;
1071 int info[DEV_GET_SIZE_COUNT];
1072 mach_msg_type_number_t info_count;
1073 MACH_PORT_FACE bs = MACH_PORT_NULL;
1074 unsigned int rec_size;
1075 recnum_t count;
1076 int clsize;
1077 MACH_PORT_FACE reply_port;
1078
1079 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1080 null_security_token, dev_name, &device))
1081 return FALSE;
1082
1083 info_count = DEV_GET_SIZE_COUNT;
1084 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1085 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1086 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1087 clsize = bs_get_global_clsize(0);
1088 if (!default_pager_backing_store_create(
1089 default_pager_object,
1090 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1091 (clsize * vm_page_size),
1092 &bs)) {
1093 if (!default_pager_add_segment(bs, device,
1094 0, count, rec_size)) {
1095 return TRUE;
1096 }
1097 ipc_port_release_receive(bs);
1098 }
1099 }
1100
1101 ipc_port_release_send(device);
1102 return FALSE;
1103 }
1104 #endif /* DEVICE_PAGING */
1105
1106 #if VS_ASYNC_REUSE
1107
1108 struct vs_async *
1109 vs_alloc_async(void)
1110 {
1111 struct vs_async *vsa;
1112 MACH_PORT_FACE reply_port;
1113 // kern_return_t kr;
1114
1115 VS_ASYNC_LOCK();
1116 if (vs_async_free_list == NULL) {
1117 VS_ASYNC_UNLOCK();
1118 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1119 if (vsa != NULL) {
1120 /*
1121 * Try allocating a reply port named after the
1122 * address of the vs_async structure.
1123 */
1124 struct vstruct_alias *alias_struct;
1125
1126 reply_port = ipc_port_alloc_kernel();
1127 alias_struct = (struct vstruct_alias *)
1128 kalloc(sizeof (struct vstruct_alias));
1129 if(alias_struct != NULL) {
1130 alias_struct->vs = (struct vstruct *)vsa;
1131 alias_struct->name = &default_pager_ops;
1132 reply_port->alias = (int) alias_struct;
1133 vsa->reply_port = reply_port;
1134 vs_alloc_async_count++;
1135 }
1136 else {
1137 vs_alloc_async_failed++;
1138 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1139 (reply_port));
1140 kfree(vsa, sizeof (struct vs_async));
1141 vsa = NULL;
1142 }
1143 }
1144 } else {
1145 vsa = vs_async_free_list;
1146 vs_async_free_list = vs_async_free_list->vsa_next;
1147 VS_ASYNC_UNLOCK();
1148 }
1149
1150 return vsa;
1151 }
1152
1153 void
1154 vs_free_async(
1155 struct vs_async *vsa)
1156 {
1157 VS_ASYNC_LOCK();
1158 vsa->vsa_next = vs_async_free_list;
1159 vs_async_free_list = vsa;
1160 VS_ASYNC_UNLOCK();
1161 }
1162
1163 #else /* VS_ASYNC_REUSE */
1164
1165 struct vs_async *
1166 vs_alloc_async(void)
1167 {
1168 struct vs_async *vsa;
1169 MACH_PORT_FACE reply_port;
1170 kern_return_t kr;
struct vstruct_alias *alias_struct;
1171
1172 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1173 if (vsa != NULL) {
1174 /*
1175 * Try allocating a reply port named after the
1176 * address of the vs_async structure.
1177 */
1178 reply_port = ipc_port_alloc_kernel();
1179 alias_struct = (struct vstruct_alias *)
1180 kalloc(sizeof (struct vstruct_alias));
1181 if(alias_struct != NULL) {
1182 alias_struct->vs = (struct vstruct *)vsa;
1183 alias_struct->name = &default_pager_ops;
1184 reply_port->alias = (int) alias_struct;
1185 vsa->reply_port = reply_port;
1186 vs_alloc_async_count++;
1187 }
1188 else {
1189 vs_alloc_async_failed++;
1190 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1191 (reply_port));
1192 kfree(vsa, sizeof (struct vs_async));
1193 vsa = NULL;
1194 }
1195 }
1196
1197 return vsa;
1198 }
1199
1200 void
1201 vs_free_async(
1202 struct vs_async *vsa)
1203 {
1204 MACH_PORT_FACE reply_port;
1205 kern_return_t kr;
1206
1207 reply_port = vsa->reply_port;
1208 kfree((void *) reply_port->alias, sizeof (struct vstruct_alias));
1209 kfree(vsa, sizeof (struct vs_async));
1210 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1211 #if 0
1212 VS_ASYNC_LOCK();
1213 vs_alloc_async_count--;
1214 VS_ASYNC_UNLOCK();
1215 #endif
1216 }
1217
1218 #endif /* VS_ASYNC_REUSE */
1219
1220 zone_t vstruct_zone;
1221
1222 vstruct_t
1223 ps_vstruct_create(
1224 vm_size_t size)
1225 {
1226 vstruct_t vs;
1227 unsigned int i;
1228
1229 vs = (vstruct_t) zalloc(vstruct_zone);
1230 if (vs == VSTRUCT_NULL) {
1231 return VSTRUCT_NULL;
1232 }
1233
1234 VS_LOCK_INIT(vs);
1235
1236 /*
1237 * The following fields will be provided later.
1238 */
1239 vs->vs_pager_ops = NULL;
1240 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1241 vs->vs_references = 1;
1242 vs->vs_seqno = 0;
1243
1244 #ifdef MACH_KERNEL
1245 vs->vs_waiting_seqno = FALSE;
1246 vs->vs_waiting_read = FALSE;
1247 vs->vs_waiting_write = FALSE;
1248 vs->vs_waiting_async = FALSE;
1249 #else
1250 mutex_init(&vs->vs_waiting_seqno, 0);
1251 mutex_init(&vs->vs_waiting_read, 0);
1252 mutex_init(&vs->vs_waiting_write, 0);
1253 mutex_init(&vs->vs_waiting_refs, 0);
1254 mutex_init(&vs->vs_waiting_async, 0);
1255 #endif
1256
1257 vs->vs_readers = 0;
1258 vs->vs_writers = 0;
1259
1260 vs->vs_errors = 0;
1261
1262 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1263 vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1264 vs->vs_async_pending = 0;
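/*
 * Worked example for the vs_size computation above (assuming 4 KB
 * pages): a 1 MB object is 256 pages; with a cluster shift of 2 that
 * is ((256 - 1) >> 2) + 1 == 64 clusters, i.e. the page count rounded
 * up to whole clusters.
 */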
1265
1266 /*
1267 * Allocate the cluster map: either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1268 * bytes, depending on the size of the memory object.
1269 */
1270 if (INDIRECT_CLMAP(vs->vs_size)) {
1271 vs->vs_imap = (struct vs_map **)
1272 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1273 vs->vs_indirect = TRUE;
1274 } else {
1275 vs->vs_dmap = (struct vs_map *)
1276 kalloc(CLMAP_SIZE(vs->vs_size));
1277 vs->vs_indirect = FALSE;
1278 }
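/*
 * Sketch of the resulting lookup (see ps_clmap() below): with an
 * indirect map, the entry for a given cluster lives at
 *	vs->vs_imap[cluster / CLMAP_ENTRIES][cluster % CLMAP_ENTRIES]
 * whereas a direct map is simply vs->vs_dmap[cluster].
 */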
1279 vs->vs_xfer_pending = FALSE;
1280 DP_DEBUG(DEBUG_VS_INTERNAL,
1281 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1282
1283 /*
1284 * Check to see that we got the space.
1285 */
1286 if (!vs->vs_dmap) {
1287 kfree(vs, sizeof *vs);
1288 return VSTRUCT_NULL;
1289 }
1290
1291 /*
1292 * Zero the indirect pointers, or clear the direct pointers.
1293 */
1294 if (vs->vs_indirect)
1295 memset(vs->vs_imap, 0,
1296 INDIRECT_CLMAP_SIZE(vs->vs_size));
1297 else
1298 for (i = 0; i < vs->vs_size; i++)
1299 VSM_CLR(vs->vs_dmap[i]);
1300
1301 VS_MAP_LOCK_INIT(vs);
1302
1303 bs_commit(vs->vs_size);
1304
1305 return vs;
1306 }
1307
1308 paging_segment_t ps_select_segment(unsigned int, int *); /* forward */
1309
1310 paging_segment_t
1311 ps_select_segment(
1312 unsigned int shift,
1313 int *psindex)
1314 {
1315 paging_segment_t ps;
1316 int i;
1317 int j;
1318
1319 /*
1320 * Optimize case where there's only one segment.
1321 * paging_segment_max will index the one and only segment.
1322 */
1323
1324 PSL_LOCK();
1325 if (paging_segment_count == 1) {
1326 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
1327 ipc_port_t trigger = IP_NULL;
1328
1329 ps = paging_segments[paging_segment_max];
1330 *psindex = paging_segment_max;
1331 PS_LOCK(ps);
1332 if (ps->ps_going_away) {
1333 /* this segment is being turned off */
1334 lps = PAGING_SEGMENT_NULL;
1335 } else {
1336 ASSERT(ps->ps_clshift >= shift);
1337 if (ps->ps_clcount) {
1338 ps->ps_clcount--;
1339 dp_pages_free -= 1 << ps->ps_clshift;
1340 if(min_pages_trigger_port &&
1341 (dp_pages_free < minimum_pages_remaining)) {
1342 trigger = min_pages_trigger_port;
1343 min_pages_trigger_port = NULL;
1344 bs_low = TRUE;
1345 }
1346 lps = ps;
1347 } else
1348 lps = PAGING_SEGMENT_NULL;
1349 }
1350 PS_UNLOCK(ps);
1351 PSL_UNLOCK();
1352
1353 if (trigger != IP_NULL) {
1354 default_pager_space_alert(trigger, HI_WAT_ALERT);
1355 ipc_port_release_send(trigger);
1356 }
1357 return lps;
1358 }
1359
1360 if (paging_segment_count == 0) {
1361 PSL_UNLOCK();
1362 return PAGING_SEGMENT_NULL;
1363 }
1364
1365 for (i = BS_MAXPRI;
1366 i >= BS_MINPRI; i--) {
1367 int start_index;
1368
1369 if ((ps_select_array[i] == BS_NOPRI) ||
1370 (ps_select_array[i] == BS_FULLPRI))
1371 continue;
1372 start_index = ps_select_array[i];
1373
1374 if(!(paging_segments[start_index])) {
1375 j = start_index+1;
1376 physical_transfer_cluster_count = 0;
1377 }
1378 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1379 (((paging_segments[start_index])->ps_clshift)
1380 + vm_page_shift))) {
1381 physical_transfer_cluster_count = 0;
1382 j = start_index + 1;
1383 } else {
1384 physical_transfer_cluster_count+=1;
1385 j = start_index;
1386 if(start_index == 0)
1387 start_index = paging_segment_max;
1388 else
1389 start_index = start_index - 1;
1390 }
1391
1392 while (1) {
1393 if (j > paging_segment_max)
1394 j = 0;
1395 if ((ps = paging_segments[j]) &&
1396 (ps->ps_bs->bs_priority == i)) {
1397 /*
1398 * Force the ps cluster size to be
1399 * >= that of the vstruct.
1400 */
1401 PS_LOCK(ps);
1402 if (ps->ps_going_away) {
1403 /* this segment is being turned off */
1404 } else if ((ps->ps_clcount) &&
1405 (ps->ps_clshift >= shift)) {
1406 ipc_port_t trigger = IP_NULL;
1407
1408 ps->ps_clcount--;
1409 dp_pages_free -= 1 << ps->ps_clshift;
1410 if(min_pages_trigger_port &&
1411 (dp_pages_free <
1412 minimum_pages_remaining)) {
1413 trigger = min_pages_trigger_port;
1414 min_pages_trigger_port = NULL;
1415 }
1416 PS_UNLOCK(ps);
1417 /*
1418 * found one, quit looking.
1419 */
1420 ps_select_array[i] = j;
1421 PSL_UNLOCK();
1422
1423 if (trigger != IP_NULL) {
1424 default_pager_space_alert(
1425 trigger,
1426 HI_WAT_ALERT);
1427 ipc_port_release_send(trigger);
1428 }
1429 *psindex = j;
1430 return ps;
1431 }
1432 PS_UNLOCK(ps);
1433 }
1434 if (j == start_index) {
1435 /*
1436 * none at this priority -- mark it full
1437 */
1438 ps_select_array[i] = BS_FULLPRI;
1439 break;
1440 }
1441 j++;
1442 }
1443 }
1444 PSL_UNLOCK();
1445 return PAGING_SEGMENT_NULL;
1446 }
1447
1448 vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1449
1450 vm_offset_t
1451 ps_allocate_cluster(
1452 vstruct_t vs,
1453 int *psindex,
1454 paging_segment_t use_ps)
1455 {
1456 unsigned int byte_num;
1457 int bit_num = 0;
1458 paging_segment_t ps;
1459 vm_offset_t cluster;
1460 ipc_port_t trigger = IP_NULL;
1461
1462 /*
1463 * Find best paging segment.
1464 * ps_select_segment will decrement cluster count on ps.
1465 * Must pass cluster shift to find the most appropriate segment.
1466 */
1467 /* NOTE: The addition of paging segment delete capability threatened
1468 * to seriously complicate the treatment of paging segments in this
1469 * module and the ones that call it (notably ps_clmap), because of the
1470 * difficulty in assuring that the paging segment would continue to
1471 * exist between being unlocked and locked. This was
1472 * avoided because all calls to this module originate either from
1473 * dp_memory_object calls, which rely on the vs lock, or from
1474 * the transfer function which is part of the segment delete path.
1475 * The transfer function which is part of paging segment delete is
1476 * protected from multiple callers by the backing store lock.
1477 * The paging segment delete function treats mappings to a paging
1478 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1479 * while data is transferred to the remaining segments. This is in
1480 * line with the view that incomplete or in-transition mappings between
1481 * data, a vstruct, and backing store are protected by the vs lock.
1482 * This and the ordering of the paging segment "going_away" bit setting
1483 * protects us.
1484 */
1485 if (use_ps != PAGING_SEGMENT_NULL) {
1486 ps = use_ps;
1487 PSL_LOCK();
1488 PS_LOCK(ps);
1489
1490 ASSERT(ps->ps_clcount != 0);
1491
1492 ps->ps_clcount--;
1493 dp_pages_free -= 1 << ps->ps_clshift;
1494 if(min_pages_trigger_port &&
1495 (dp_pages_free < minimum_pages_remaining)) {
1496 trigger = min_pages_trigger_port;
1497 min_pages_trigger_port = NULL;
1498 }
1499 PSL_UNLOCK();
1500 PS_UNLOCK(ps);
1501 if (trigger != IP_NULL) {
1502 default_pager_space_alert(trigger, HI_WAT_ALERT);
1503 ipc_port_release_send(trigger);
1504 }
1505
1506 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1507 PAGING_SEGMENT_NULL) {
1508 static uint32_t lastnotify = 0;
1509 uint32_t now, nanoseconds_dummy;
1510
1511 /*
1512 * Emit a notification of the low-paging resource condition
1513 * but don't issue it more than once every five seconds. This
1514 * prevents us from overflowing logs with thousands of
1515 * repetitions of the message.
1516 */
1517 clock_get_system_nanotime(&now, &nanoseconds_dummy);
1518 if (now > lastnotify + 5) {
1519 dprintf(("no space in available paging segments\n"));
1520 lastnotify = now;
1521 }
1522
1523 /* the count may have drifted; reset it to zero */
1524 PSL_LOCK();
1525 dp_pages_free = 0;
1526 if(min_pages_trigger_port) {
1527 trigger = min_pages_trigger_port;
1528 min_pages_trigger_port = NULL;
1529 bs_low = TRUE;
1530 }
1531 PSL_UNLOCK();
1532 if (trigger != IP_NULL) {
1533 default_pager_space_alert(trigger, HI_WAT_ALERT);
1534 ipc_port_release_send(trigger);
1535 }
1536 return (vm_offset_t) -1;
1537 }
1538
1539 /*
1540 * Look for an available cluster. At the end of the loop,
1541 * byte_num is the byte offset and bit_num is the bit offset of the
1542 * first zero bit in the paging segment bitmap.
1543 */
1544 PS_LOCK(ps);
1545 byte_num = ps->ps_hint;
1546 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1547 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1548 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1549 if (isclr((ps->ps_bmap + byte_num), bit_num))
1550 break;
1551 }
1552 ASSERT(bit_num != NBBY);
1553 break;
1554 }
1555 }
1556 ps->ps_hint = byte_num;
1557 cluster = (byte_num*NBBY) + bit_num;
1558
1559 /* Space was reserved, so this must be true */
1560 ASSERT(cluster < ps->ps_ncls);
1561
1562 setbit(ps->ps_bmap, cluster);
1563 PS_UNLOCK(ps);
1564
1565 return cluster;
1566 }
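/*
 * Worked example for the bitmap scan above: NBBY is 8, so if the first
 * byte that is not BYTEMASK is byte 5 and its first clear bit is bit 3,
 * the chosen cluster is 5 * 8 + 3 == 43; setbit() then marks that
 * cluster allocated.
 */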
1567
1568 void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1569
1570 void
1571 ps_deallocate_cluster(
1572 paging_segment_t ps,
1573 vm_offset_t cluster)
1574 {
1575
1576 if (cluster >= (vm_offset_t) ps->ps_ncls)
1577 panic("ps_deallocate_cluster: Invalid cluster number");
1578
1579 /*
1580 * Lock the paging segment, clear the cluster's bit in the bitmap and
1581 * increment the number of free clusters.
1582 */
1583 PSL_LOCK();
1584 PS_LOCK(ps);
1585 clrbit(ps->ps_bmap, cluster);
1586 ++ps->ps_clcount;
1587 dp_pages_free += 1 << ps->ps_clshift;
1588 PSL_UNLOCK();
1589
1590 /*
1591 * Move the hint down to the freed cluster if it is
1592 * less than the current hint.
1593 */
1594 if ((cluster/NBBY) < ps->ps_hint) {
1595 ps->ps_hint = (cluster/NBBY);
1596 }
1597
1598 PS_UNLOCK(ps);
1599
1600 /*
1601 * If we're freeing space on a full priority, reset the array.
1602 */
1603 PSL_LOCK();
1604 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1605 ps_select_array[ps->ps_bs->bs_priority] = 0;
1606 PSL_UNLOCK();
1607
1608 return;
1609 }
1610
1611 void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1612
1613 void
1614 ps_dealloc_vsmap(
1615 struct vs_map *vsmap,
1616 vm_size_t size)
1617 {
1618 unsigned int i;
1619 for (i = 0; i < size; i++)
1620 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1621 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1622 VSM_CLOFF(vsmap[i]));
1623 }
1624
1625 void
1626 ps_vstruct_dealloc(
1627 vstruct_t vs)
1628 {
1629 unsigned int i;
1630 // spl_t s;
1631
1632 VS_MAP_LOCK(vs);
1633
1634 /*
1635 * If this is an indirect structure, then we walk through the valid
1636 * (non-zero) indirect pointers and deallocate the clusters
1637 * associated with each used map entry (via ps_dealloc_vsmap).
1638 * When all of the clusters in an indirect block have been
1639 * freed, we deallocate the block. When all of the indirect
1640 * blocks have been deallocated we deallocate the memory
1641 * holding the indirect pointers.
1642 */
1643 if (vs->vs_indirect) {
1644 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1645 if (vs->vs_imap[i] != NULL) {
1646 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1647 kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1648 }
1649 }
1650 kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1651 } else {
1652 /*
1653 * Direct map. Free used clusters, then memory.
1654 */
1655 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1656 kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1657 }
1658 VS_MAP_UNLOCK(vs);
1659
1660 bs_commit(- vs->vs_size);
1661
1662 zfree(vstruct_zone, vs);
1663 }
1664
1665 int ps_map_extend(vstruct_t, unsigned int); /* forward */
1666
1667 int ps_map_extend(
1668 vstruct_t vs,
1669 unsigned int new_size)
1670 {
1671 struct vs_map **new_imap;
1672 struct vs_map *new_dmap = NULL;
1673 int newdsize;
1674 int i;
1675 void *old_map = NULL;
1676 int old_map_size = 0;
1677
1678 if (vs->vs_size >= new_size) {
1679 /*
1680 * Someone has already done the work.
1681 */
1682 return 0;
1683 }
1684
1685 /*
1686 * If the new size extends into the indirect range, then we have one
1687 * of two cases: we are going from indirect to indirect, or we are
1688 * going from direct to indirect. If we are going from indirect to
1689 * indirect, then it is possible that the new size will fit in the old
1690 * indirect map. If this is the case, then just reset the size of the
1691 * vstruct map and we are done. If the new size will not
1692 * fit into the old indirect map, then we have to allocate a new
1693 * indirect map and copy the old map pointers into this new map.
1694 *
1695 * If we are going from direct to indirect, then we have to allocate a
1696 * new indirect map and copy the old direct pages into the first
1697 * indirect page of the new map.
1698 * NOTE: allocating memory here is dangerous, as we're in the
1699 * pageout path.
1700 */
1701 if (INDIRECT_CLMAP(new_size)) {
1702 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1703
1704 /*
1705 * Get a new indirect map and zero it.
1706 */
1707 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1708 if (vs->vs_indirect &&
1709 (new_map_size == old_map_size)) {
1710 bs_commit(new_size - vs->vs_size);
1711 vs->vs_size = new_size;
1712 return 0;
1713 }
1714
1715 new_imap = (struct vs_map **)kalloc(new_map_size);
1716 if (new_imap == NULL) {
1717 return -1;
1718 }
1719 memset(new_imap, 0, new_map_size);
1720
1721 if (vs->vs_indirect) {
1722 /* Copy old entries into new map */
1723 memcpy(new_imap, vs->vs_imap, old_map_size);
1724 /* Arrange to free the old map */
1725 old_map = (void *) vs->vs_imap;
1726 newdsize = 0;
1727 } else { /* Old map was a direct map */
1728 /* Allocate an indirect page */
1729 if ((new_imap[0] = (struct vs_map *)
1730 kalloc(CLMAP_THRESHOLD)) == NULL) {
1731 kfree(new_imap, new_map_size);
1732 return -1;
1733 }
1734 new_dmap = new_imap[0];
1735 newdsize = CLMAP_ENTRIES;
1736 }
1737 } else {
1738 new_imap = NULL;
1739 newdsize = new_size;
1740 /*
1741 * If the new map is a direct map, then the old map must
1742 * also have been a direct map. All we have to do is
1743 * to allocate a new direct map, copy the old entries
1744 * into it and free the old map.
1745 */
1746 if ((new_dmap = (struct vs_map *)
1747 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1748 return -1;
1749 }
1750 }
1751 if (newdsize) {
1752
1753 /* Free the old map */
1754 old_map = (void *) vs->vs_dmap;
1755 old_map_size = CLMAP_SIZE(vs->vs_size);
1756
1757 /* Copy info from the old map into the new map */
1758 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1759
1760 /* Initialize the rest of the new map */
1761 for (i = vs->vs_size; i < newdsize; i++)
1762 VSM_CLR(new_dmap[i]);
1763 }
1764 if (new_imap) {
1765 vs->vs_imap = new_imap;
1766 vs->vs_indirect = TRUE;
1767 } else
1768 vs->vs_dmap = new_dmap;
1769 bs_commit(new_size - vs->vs_size);
1770 vs->vs_size = new_size;
1771 if (old_map)
1772 kfree(old_map, old_map_size);
1773 return 0;
1774 }
1775
1776 vm_offset_t
1777 ps_clmap(
1778 vstruct_t vs,
1779 vm_offset_t offset,
1780 struct clmap *clmap,
1781 int flag,
1782 vm_size_t size,
1783 int error)
1784 {
1785 vm_offset_t cluster; /* The cluster of offset. */
1786 vm_offset_t newcl; /* The new cluster allocated. */
1787 vm_offset_t newoff;
1788 unsigned int i;
1789 struct vs_map *vsmap;
1790
1791 VS_MAP_LOCK(vs);
1792
1793 ASSERT(vs->vs_dmap);
1794 cluster = atop_32(offset) >> vs->vs_clshift;
1795
1796 /*
1797 * Initialize cluster error value
1798 */
1799 clmap->cl_error = 0;
1800
1801 /*
1802 * If the object has grown, extend the page map.
1803 */
1804 if (cluster >= vs->vs_size) {
1805 if (flag == CL_FIND) {
1806 /* Do not allocate if just doing a lookup */
1807 VS_MAP_UNLOCK(vs);
1808 return (vm_offset_t) -1;
1809 }
1810 if (ps_map_extend(vs, cluster + 1)) {
1811 VS_MAP_UNLOCK(vs);
1812 return (vm_offset_t) -1;
1813 }
1814 }
1815
1816 /*
1817 * Look for the desired cluster. If the map is indirect, then we
1818 * have a two level lookup. First find the indirect block, then
1819 * find the actual cluster. If the indirect block has not yet
1820 * been allocated, then do so. If the cluster has not yet been
1821 * allocated, then do so.
1822 *
1823 * If any of the allocations fail, then return an error.
1824 * Don't allocate if just doing a lookup.
1825 */
1826 if (vs->vs_indirect) {
1827 long ind_block = cluster/CLMAP_ENTRIES;
1828
1829 /* Is the indirect block allocated? */
1830 vsmap = vs->vs_imap[ind_block];
1831 if (vsmap == NULL) {
1832 if (flag == CL_FIND) {
1833 VS_MAP_UNLOCK(vs);
1834 return (vm_offset_t) -1;
1835 }
1836
1837 /* Allocate the indirect block */
1838 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1839 if (vsmap == NULL) {
1840 VS_MAP_UNLOCK(vs);
1841 return (vm_offset_t) -1;
1842 }
1843 /* Initialize the cluster offsets */
1844 for (i = 0; i < CLMAP_ENTRIES; i++)
1845 VSM_CLR(vsmap[i]);
1846 vs->vs_imap[ind_block] = vsmap;
1847 }
1848 } else
1849 vsmap = vs->vs_dmap;
1850
1851 ASSERT(vsmap);
1852 vsmap += cluster%CLMAP_ENTRIES;
1853
1854 /*
1855 * At this point, vsmap points to the struct vs_map desired.
1856 *
1857 * Look in the map for the cluster, if there was an error on a
1858 * previous write, flag it and return. If it is not yet
1859 * allocated, then allocate it, if we're writing; if we're
1860 * doing a lookup and the cluster's not allocated, return error.
1861 */
1862 if (VSM_ISERR(*vsmap)) {
1863 clmap->cl_error = VSM_GETERR(*vsmap);
1864 VS_MAP_UNLOCK(vs);
1865 return (vm_offset_t) -1;
1866 } else if (VSM_ISCLR(*vsmap)) {
1867 int psindex;
1868
1869 if (flag == CL_FIND) {
1870 /*
1871 * If there's an error and the entry is clear, then
1872 * we've run out of swap space. Record the error
1873 * here and return.
1874 */
1875 if (error) {
1876 VSM_SETERR(*vsmap, error);
1877 }
1878 VS_MAP_UNLOCK(vs);
1879 return (vm_offset_t) -1;
1880 } else {
1881 /*
1882 * Attempt to allocate a cluster from the paging segment
1883 */
1884 newcl = ps_allocate_cluster(vs, &psindex,
1885 PAGING_SEGMENT_NULL);
1886 if (newcl == (vm_offset_t) -1) {
1887 VS_MAP_UNLOCK(vs);
1888 return (vm_offset_t) -1;
1889 }
1890 VSM_CLR(*vsmap);
1891 VSM_SETCLOFF(*vsmap, newcl);
1892 VSM_SETPS(*vsmap, psindex);
1893 }
1894 } else
1895 newcl = VSM_CLOFF(*vsmap);
1896
1897 /*
1898 * Fill in pertinent fields of the clmap
1899 */
1900 clmap->cl_ps = VSM_PS(*vsmap);
1901 clmap->cl_numpages = VSCLSIZE(vs);
1902 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1903
1904 /*
1905 * Byte offset in paging segment is byte offset to cluster plus
1906 * byte offset within cluster. It looks ugly, but should be
1907 * relatively quick.
1908 */
1909 ASSERT(trunc_page(offset) == offset);
1910 newcl = ptoa_32(newcl) << vs->vs_clshift;
1911 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
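/*
 * Worked example for the two lines above (assuming 4 KB pages and a
 * cluster shift of 2, i.e. 16 KB clusters): if the map entry places the
 * data in paging-segment cluster 1, newcl == ptoa_32(1) << 2 == 0x4000
 * is that cluster's byte offset in the segment, and for offset 0x6000
 * the in-cluster byte offset is newoff == 0x6000 & 0x3fff == 0x2000.
 */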
1912 if (flag == CL_ALLOC) {
1913 /*
1914 * set bits in the allocation bitmap according to which
1915 * pages were requested. size is in bytes.
1916 */
1917 i = atop_32(newoff);
1918 while ((size > 0) && (i < VSCLSIZE(vs))) {
1919 VSM_SETALLOC(*vsmap, i);
1920 i++;
1921 size -= vm_page_size;
1922 }
1923 }
1924 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1925 if (newoff) {
1926 /*
1927 * Offset is not cluster aligned, so number of pages
1928 * and bitmaps must be adjusted
1929 */
1930 clmap->cl_numpages -= atop_32(newoff);
1931 CLMAP_SHIFT(clmap, vs);
1932 CLMAP_SHIFTALLOC(clmap, vs);
1933 }
1934
1935 /*
1936 *
1937 * The setting of valid bits and handling of write errors
1938 * must be done here, while we hold the lock on the map.
1939 * It logically should be done in ps_vs_write_complete().
1940 * The size and error information has been passed from
1941 * ps_vs_write_complete(). If the size parameter is non-zero,
1942 * then there is work to be done. If error is also non-zero,
1943 * then the error number is recorded in the cluster and the
1944 * entire cluster is in error.
1945 */
1946 if (size && flag == CL_FIND) {
1947 vm_offset_t off = (vm_offset_t) 0;
1948
1949 if (!error) {
1950 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1951 i++) {
1952 VSM_SETPG(*vsmap, i);
1953 size -= vm_page_size;
1954 }
1955 ASSERT(i <= VSCLSIZE(vs));
1956 } else {
1957 BS_STAT(clmap->cl_ps->ps_bs,
1958 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
1959 atop_32(size));
1960 off = VSM_CLOFF(*vsmap);
1961 VSM_SETERR(*vsmap, error);
1962 }
1963 /*
1964 * Deallocate cluster if error, and no valid pages
1965 * already present.
1966 */
1967 if (off != (vm_offset_t) 0)
1968 ps_deallocate_cluster(clmap->cl_ps, off);
1969 VS_MAP_UNLOCK(vs);
1970 return (vm_offset_t) 0;
1971 } else
1972 VS_MAP_UNLOCK(vs);
1973
1974 DP_DEBUG(DEBUG_VS_INTERNAL,
1975 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1976 newcl+newoff, (int) vs, (int) vsmap, flag));
1977 DP_DEBUG(DEBUG_VS_INTERNAL,
1978 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1979 (int) clmap->cl_ps, clmap->cl_numpages,
1980 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1981
1982 return (newcl + newoff);
1983 }
1984
1985 void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1986
1987 void
1988 ps_clunmap(
1989 vstruct_t vs,
1990 vm_offset_t offset,
1991 vm_size_t length)
1992 {
1993 vm_offset_t cluster; /* The cluster number of offset */
1994 struct vs_map *vsmap;
1995
1996 VS_MAP_LOCK(vs);
1997
1998 /*
1999 * Loop through all clusters in this range, freeing paging segment
2000 * clusters and map entries as encountered.
2001 */
2002 while (length > 0) {
2003 vm_offset_t newoff;
2004 unsigned int i;
2005
2006 cluster = atop_32(offset) >> vs->vs_clshift;
2007 if (vs->vs_indirect) /* indirect map */
2008 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2009 else
2010 vsmap = vs->vs_dmap;
2011 if (vsmap == NULL) {
2012 VS_MAP_UNLOCK(vs);
2013 return;
2014 }
2015 vsmap += cluster%CLMAP_ENTRIES;
2016 if (VSM_ISCLR(*vsmap)) {
2017 length -= vm_page_size;
2018 offset += vm_page_size;
2019 continue;
2020 }
2021 /*
2022 * We've got a valid mapping. Clear it and deallocate
2023 * paging segment cluster pages.
2024 * Optimize for entire cluster clearing.
2025 */
2026 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2027 /*
2028 * Not cluster aligned.
2029 */
2030 ASSERT(trunc_page(newoff) == newoff);
2031 i = atop_32(newoff);
2032 } else
2033 i = 0;
2034 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2035 VSM_CLRPG(*vsmap, i);
2036 VSM_CLRALLOC(*vsmap, i);
2037 length -= vm_page_size;
2038 offset += vm_page_size;
2039 i++;
2040 }
2041
2042 /*
2043 * If map entry is empty, clear and deallocate cluster.
2044 */
2045 if (!VSM_ALLOC(*vsmap)) {
2046 ps_deallocate_cluster(VSM_PS(*vsmap),
2047 VSM_CLOFF(*vsmap));
2048 VSM_CLR(*vsmap);
2049 }
2050 }
2051
2052 VS_MAP_UNLOCK(vs);
2053 }
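/*
 * A minimal standalone sketch of the "clear per-page bits, then free the
 * cluster once its allocation bitmap goes empty" pattern used above.  The
 * 8-bit map and page counts are assumptions for illustration; the real
 * vs_map state is manipulated through the VSM_CLRPG/VSM_CLRALLOC/VSM_ALLOC
 * macros, whose layout is not shown here.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint8_t alloc_map = 0xFF;		/* all 8 pages allocated     */

	/* unmap pages 2..7: the cluster still holds pages 0 and 1 */
	for (unsigned i = 2; i < 8; i++)
		alloc_map &= (uint8_t)~(1u << i);
	assert(alloc_map == 0x03);

	/* unmap the rest: a now-empty cluster would be deallocated */
	for (unsigned i = 0; i < 2; i++)
		alloc_map &= (uint8_t)~(1u << i);
	if (alloc_map == 0)
		printf("empty cluster: would call ps_deallocate_cluster()\n");
	return 0;
}
#endif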
2054
2055 void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2056
2057 void
2058 ps_vs_write_complete(
2059 vstruct_t vs,
2060 vm_offset_t offset,
2061 vm_size_t size,
2062 int error)
2063 {
2064 struct clmap clmap;
2065
2066 /*
2067 * Get the struct vsmap for this cluster.
2068 * Use CL_FIND, even though the cluster was just written, because the
2069 * cluster MUST be present, unless there was an error
2070 * in the original ps_clmap (e.g. no space), in which
2071 * case, nothing happens.
2072 *
2073 * Must pass enough information to ps_clmap to allow it
2074 * to set the vs_map structure bitmap under lock.
2075 */
2076 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2077 }
2078
2079 void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2080
2081 void
2082 vs_cl_write_complete(
2083 vstruct_t vs,
2084 __unused paging_segment_t ps,
2085 vm_offset_t offset,
2086 __unused vm_offset_t addr,
2087 vm_size_t size,
2088 boolean_t async,
2089 int error)
2090 {
2091 // kern_return_t kr;
2092
2093 if (error) {
2094 /*
2095 * For internal objects, the error is recorded on a
2096 * per-cluster basis by ps_clmap() which is called
2097 * by ps_vs_write_complete() below.
2098 */
2099 dprintf(("write failed error = 0x%x\n", error));
2100 /* add upl_abort code here */
2101 } else
2102 GSTAT(global_stats.gs_pages_out += atop_32(size));
2103 /*
2104 * Notify the vstruct mapping code, so it can do its accounting.
2105 */
2106 ps_vs_write_complete(vs, offset, size, error);
2107
2108 if (async) {
2109 VS_LOCK(vs);
2110 ASSERT(vs->vs_async_pending > 0);
2111 vs->vs_async_pending -= size;
2112 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2113 vs->vs_waiting_async = FALSE;
2114 VS_UNLOCK(vs);
2115 /* mutex_unlock(&vs->vs_waiting_async); */
2116 thread_wakeup(&vs->vs_async_pending);
2117 } else {
2118 VS_UNLOCK(vs);
2119 }
2120 }
2121 }
2122
2123 #ifdef DEVICE_PAGING
2124 kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2125
2126 kern_return_t
2127 device_write_reply(
2128 MACH_PORT_FACE reply_port,
2129 kern_return_t device_code,
2130 io_buf_len_t bytes_written)
2131 {
2132 struct vs_async *vsa;
2133
2134 vsa = (struct vs_async *)
2135 ((struct vstruct_alias *)(reply_port->alias))->vs;
2136
2137 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2138 device_code = KERN_FAILURE;
2139 }
2140
2141 vsa->vsa_error = device_code;
2142
2143
2144 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2145 if(vsa->vsa_flags & VSA_TRANSFER) {
2146 /* revisit when async disk segments redone */
2147 if(vsa->vsa_error) {
2148 /* need to consider error condition. re-write data or */
2149 /* throw it away here. */
2150 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2151 }
2152 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2153 vsa->vsa_size, vsa->vsa_error);
2154 } else {
2155 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2156 vsa->vsa_addr, vsa->vsa_size, TRUE,
2157 vsa->vsa_error);
2158 }
2159 VS_FREE_ASYNC(vsa);
2160
2161 return KERN_SUCCESS;
2162 }
2163
2164 kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2165 kern_return_t
2166 device_write_reply_inband(
2167 MACH_PORT_FACE reply_port,
2168 kern_return_t return_code,
2169 io_buf_len_t bytes_written)
2170 {
2171 panic("device_write_reply_inband: illegal");
2172 return KERN_SUCCESS;
2173 }
2174
2175 kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2176 kern_return_t
2177 device_read_reply(
2178 MACH_PORT_FACE reply_port,
2179 kern_return_t return_code,
2180 io_buf_ptr_t data,
2181 mach_msg_type_number_t dataCnt)
2182 {
2183 struct vs_async *vsa;
2184 vsa = (struct vs_async *)
2185 ((struct vstruct_alias *)(reply_port->alias))->vs;
2186 vsa->vsa_addr = (vm_offset_t)data;
2187 vsa->vsa_size = (vm_size_t)dataCnt;
2188 vsa->vsa_error = return_code;
2189 thread_wakeup(&vsa->vsa_lock);
2190 return KERN_SUCCESS;
2191 }
2192
2193 kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2194 kern_return_t
2195 device_read_reply_inband(
2196 MACH_PORT_FACE reply_port,
2197 kern_return_t return_code,
2198 io_buf_ptr_inband_t data,
2199 mach_msg_type_number_t dataCnt)
2200 {
2201 panic("device_read_reply_inband: illegal");
2202 return KERN_SUCCESS;
2203 }
2204
2205 kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2206 kern_return_t
2207 device_read_reply_overwrite(
2208 MACH_PORT_FACE reply_port,
2209 kern_return_t return_code,
2210 io_buf_len_t bytes_read)
2211 {
2212 panic("device_read_reply_overwrite: illegal\n");
2213 return KERN_SUCCESS;
2214 }
2215
2216 kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2217 kern_return_t
2218 device_open_reply(
2219 MACH_PORT_FACE reply_port,
2220 kern_return_t return_code,
2221 MACH_PORT_FACE device_port)
2222 {
2223 panic("device_open_reply: illegal\n");
2224 return KERN_SUCCESS;
2225 }
2226
2227 kern_return_t
2228 ps_read_device(
2229 paging_segment_t ps,
2230 vm_offset_t offset,
2231 vm_offset_t *bufferp,
2232 unsigned int size,
2233 unsigned int *residualp,
2234 int flags)
2235 {
2236 kern_return_t kr;
2237 recnum_t dev_offset;
2238 unsigned int bytes_wanted;
2239 unsigned int bytes_read;
2240 unsigned int total_read;
2241 vm_offset_t dev_buffer;
2242 vm_offset_t buf_ptr;
2243 unsigned int records_read;
2244 struct vs_async *vsa;
2245 mutex_t vs_waiting_read_reply;
2246
2247 device_t device;
2248 vm_map_copy_t device_data = NULL;
2249 default_pager_thread_t *dpt = NULL;
2250
2251 device = dev_port_lookup(ps->ps_device);
2252 clustered_reads[atop_32(size)]++;
2253
2254 dev_offset = (ps->ps_offset +
2255 (offset >> (vm_page_shift - ps->ps_record_shift)));
2256 bytes_wanted = size;
2257 total_read = 0;
2258 *bufferp = (vm_offset_t)NULL;
2259
2260 do {
2261 vsa = VS_ALLOC_ASYNC();
2262 if (vsa) {
2263 vsa->vsa_vs = NULL;
2264 vsa->vsa_addr = 0;
2265 vsa->vsa_offset = 0;
2266 vsa->vsa_size = 0;
2267 vsa->vsa_ps = NULL;
2268 }
2269 mutex_init(&vsa->vsa_lock, 0);
2270 ip_lock(vsa->reply_port);
2271 vsa->reply_port->ip_sorights++;
2272 ip_reference(vsa->reply_port);
2273 ip_unlock(vsa->reply_port);
2274 kr = ds_device_read_common(device,
2275 vsa->reply_port,
2276 (mach_msg_type_name_t)
2277 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2278 (dev_mode_t) 0,
2279 dev_offset,
2280 bytes_wanted,
2281 (IO_READ | IO_CALL),
2282 (io_buf_ptr_t *) &dev_buffer,
2283 (mach_msg_type_number_t *) &bytes_read);
2284 if(kr == MIG_NO_REPLY) {
2285 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
2286 thread_block(THREAD_CONTINUE_NULL);
2287
2288 dev_buffer = vsa->vsa_addr;
2289 bytes_read = (unsigned int)vsa->vsa_size;
2290 kr = vsa->vsa_error;
2291 }
2292 VS_FREE_ASYNC(vsa);
2293 if (kr != KERN_SUCCESS || bytes_read == 0) {
2294 break;
2295 }
2296 total_read += bytes_read;
2297
2298 /*
2299 * If we got the entire range, use the returned dev_buffer.
2300 */
2301 if (bytes_read == size) {
2302 *bufferp = (vm_offset_t)dev_buffer;
2303 break;
2304 }
2305
2306 #if 1
2307 dprintf(("read only %d bytes out of %d\n",
2308 bytes_read, bytes_wanted));
2309 #endif
2310 if(dpt == NULL) {
2311 dpt = get_read_buffer();
2312 buf_ptr = dpt->dpt_buffer;
2313 *bufferp = (vm_offset_t)buf_ptr;
2314 }
2315 /*
2316 * Otherwise, copy the data into the provided buffer (*bufferp)
2317 * and append the rest of the range as it comes in.
2318 */
2319 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2320 buf_ptr += bytes_read;
2321 bytes_wanted -= bytes_read;
2322 records_read = (bytes_read >>
2323 (vm_page_shift - ps->ps_record_shift));
2324 dev_offset += records_read;
2325 DP_DEBUG(DEBUG_VS_INTERNAL,
2326 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2327 dev_buffer, bytes_read));
2328 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2329 != KERN_SUCCESS)
2330 Panic("dealloc buf");
2331 } while (bytes_wanted);
2332
2333 *residualp = size - total_read;
2334 if((dev_buffer != *bufferp) && (total_read != 0)) {
2335 vm_offset_t temp_buffer;
2336 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2337 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2338 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2339 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2340 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2341 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2342 (vm_map_copy_t *)&device_data, FALSE))
2343 panic("ps_read_device: cannot copyin locally provided buffer\n");
2344 }
2345 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2346 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2347 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2348 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2349 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2350 (vm_map_copy_t *)&device_data, FALSE))
2351 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2352 }
2353 else {
2354 device_data = NULL;
2355 }
2356 *bufferp = (vm_offset_t)device_data;
2357
2358 if(dpt != NULL) {
2359 /* Free the receive buffer */
2360 dpt->checked_out = 0;
2361 thread_wakeup(&dpt_array);
2362 }
2363 return KERN_SUCCESS;
2364 }
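/*
 * A minimal standalone sketch of the partial-read reassembly loop above:
 * each chunk returned by the device is appended to the caller's buffer
 * and the running totals are advanced until the whole range has arrived.
 * The fake 3-bytes-per-call device and the buffer sizes are assumptions
 * for illustration only.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for a device that returns at most 3 bytes per request. */
static size_t
fake_device_read(const char *src, size_t want, char *dst)
{
	size_t got = want < 3 ? want : 3;
	memcpy(dst, src, got);
	return got;
}

int
main(void)
{
	const char source[] = "0123456789";
	char   buffer[16];
	size_t bytes_wanted = 10, total_read = 0;

	/* same shape as the do/while above: append each partial chunk */
	while (bytes_wanted > 0) {
		size_t got = fake_device_read(source + total_read,
		    bytes_wanted, buffer + total_read);
		if (got == 0)
			break;
		total_read   += got;
		bytes_wanted -= got;
	}
	buffer[total_read] = '\0';
	printf("reassembled \"%s\" (%zu bytes)\n", buffer, total_read);
	assert(total_read == 10);
	return 0;
}
#endif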
2365
2366 kern_return_t
2367 ps_write_device(
2368 paging_segment_t ps,
2369 vm_offset_t offset,
2370 vm_offset_t addr,
2371 unsigned int size,
2372 struct vs_async *vsa)
2373 {
2374 recnum_t dev_offset;
2375 io_buf_len_t bytes_to_write, bytes_written;
2376 recnum_t records_written;
2377 kern_return_t kr;
2378 MACH_PORT_FACE reply_port;
2379
2380
2381
2382 clustered_writes[atop_32(size)]++;
2383
2384 dev_offset = (ps->ps_offset +
2385 (offset >> (vm_page_shift - ps->ps_record_shift)));
2386 bytes_to_write = size;
2387
2388 if (vsa) {
2389 /*
2390 * Asynchronous write.
2391 */
2392 reply_port = vsa->reply_port;
2393 ip_lock(reply_port);
2394 reply_port->ip_sorights++;
2395 ip_reference(reply_port);
2396 ip_unlock(reply_port);
2397 {
2398 device_t device;
2399 device = dev_port_lookup(ps->ps_device);
2400
2401 vsa->vsa_addr = addr;
2402 kr=ds_device_write_common(device,
2403 reply_port,
2404 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2405 (dev_mode_t) 0,
2406 dev_offset,
2407 (io_buf_ptr_t) addr,
2408 size,
2409 (IO_WRITE | IO_CALL),
2410 &bytes_written);
2411 }
2412 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2413 if (verbose)
2414 dprintf(("%s0x%x, addr=0x%x,"
2415 "size=0x%x,offset=0x%x\n",
2416 "device_write_request returned ",
2417 kr, addr, size, offset));
2418 BS_STAT(ps->ps_bs,
2419 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2420 /* do the completion notification to free resources */
2421 device_write_reply(reply_port, kr, 0);
2422 return PAGER_ERROR;
2423 }
2424 } else do {
2425 /*
2426 * Synchronous write.
2427 */
2428 {
2429 device_t device;
2430 device = dev_port_lookup(ps->ps_device);
2431 kr=ds_device_write_common(device,
2432 IP_NULL, 0,
2433 (dev_mode_t) 0,
2434 dev_offset,
2435 (io_buf_ptr_t) addr,
2436 size,
2437 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2438 &bytes_written);
2439 }
2440 if (kr != KERN_SUCCESS) {
2441 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2442 "device_write returned ",
2443 kr, addr, size, offset));
2444 BS_STAT(ps->ps_bs,
2445 ps->ps_bs->bs_pages_out_fail += atop_32(size));
2446 return PAGER_ERROR;
2447 }
2448 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2449 Panic("fragmented write");
2450 records_written = (bytes_written >>
2451 (vm_page_shift - ps->ps_record_shift));
2452 dev_offset += records_written;
2453 #if 1
2454 if (bytes_written != bytes_to_write) {
2455 dprintf(("wrote only %d bytes out of %d\n",
2456 bytes_written, bytes_to_write));
2457 }
2458 #endif
2459 bytes_to_write -= bytes_written;
2460 addr += bytes_written;
2461 } while (bytes_to_write > 0);
2462
2463 return PAGER_SUCCESS;
2464 }
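/*
 * A minimal standalone sketch of the bytes-to-device-records conversion
 * used above, assuming a 4 KB page and 512-byte device records so that
 * ps_record_shift == log2(4096 / 512) == 3.  The concrete sizes are
 * assumptions for illustration; the pager derives them per segment.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdio.h>

int
main(void)
{
	const unsigned vm_page_shift   = 12;	/* 4 KB page (assumed)     */
	const unsigned record_size     = 512;	/* device block (assumed)  */
	const unsigned ps_record_shift = 3;	/* log2(4096 / 512)        */

	unsigned bytes_written   = 8192;
	unsigned records_written = bytes_written >>
	    (vm_page_shift - ps_record_shift);	/* i.e. bytes / 512        */

	printf("%u bytes == %u %u-byte records\n",
	    bytes_written, records_written, record_size);
	assert(records_written == 16);
	return 0;
}
#endif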
2465
2466
2467 #else /* !DEVICE_PAGING */
2468
2469 kern_return_t
2470 ps_read_device(
2471 __unused paging_segment_t ps,
2472 __unused vm_offset_t offset,
2473 __unused vm_offset_t *bufferp,
2474 __unused unsigned int size,
2475 __unused unsigned int *residualp,
2476 __unused int flags)
2477 {
2478 panic("ps_read_device not supported");
2479 return KERN_FAILURE;
2480 }
2481
2482 kern_return_t
2483 ps_write_device(
2484 __unused paging_segment_t ps,
2485 __unused vm_offset_t offset,
2486 __unused vm_offset_t addr,
2487 __unused unsigned int size,
2488 __unused struct vs_async *vsa)
2489 {
2490 panic("ps_write_device not supported");
2491 return KERN_FAILURE;
2492 }
2493
2494 #endif /* DEVICE_PAGING */
2495 void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */
2496
2497 void
2498 pvs_object_data_provided(
2499 __unused vstruct_t vs,
2500 __unused upl_t upl,
2501 __unused upl_offset_t offset,
2502 upl_size_t size)
2503 {
2504
2505 DP_DEBUG(DEBUG_VS_INTERNAL,
2506 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2507 upl, offset, size));
2508
2509 ASSERT(size > 0);
2510 GSTAT(global_stats.gs_pages_in += atop_32(size));
2511
2512
2513 #if USE_PRECIOUS
2514 ps_clunmap(vs, offset, size);
2515 #endif /* USE_PRECIOUS */
2516
2517 }
2518
2519 static memory_object_offset_t last_start;
2520 static vm_size_t last_length;
2521
2522 kern_return_t
2523 pvs_cluster_read(
2524 vstruct_t vs,
2525 vm_offset_t vs_offset,
2526 vm_size_t cnt,
2527 void *fault_info)
2528 {
2529 kern_return_t error = KERN_SUCCESS;
2530 unsigned int size;
2531 unsigned int residual;
2532 unsigned int request_flags;
2533 int seg_index;
2534 int pages_in_cl;
2535 int cl_size;
2536 int cl_mask;
2537 int cl_index;
2538 unsigned int xfer_size;
2539 vm_offset_t orig_vs_offset;
2540 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2541 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2542 struct clmap clmap;
2543 upl_t upl;
2544 unsigned int page_list_count;
2545 memory_object_offset_t start;
2546
2547 pages_in_cl = 1 << vs->vs_clshift;
2548 cl_size = pages_in_cl * vm_page_size;
2549 cl_mask = cl_size - 1;
2550
2551 #if USE_PRECIOUS
2552 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2553 #else
2554 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2555 #endif
2556 cl_index = (vs_offset & cl_mask) / vm_page_size;
2557
2558 if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (vm_offset_t)-1) ||
2559 !CLMAP_ISSET(clmap, cl_index)) {
2560 /*
2561 * the needed page doesn't exist in the backing store...
2562 * we don't want to try to do any I/O, just abort the
2563 * page and let the fault handler provide a zero-fill
2564 */
2565 if (cnt == 0) {
2566 /*
2567 * The caller was just poking at us to see if
2568 * the page has been paged out. No need to
2569 * mess with the page at all.
2570 * Just let the caller know we don't have that page.
2571 */
2572 return KERN_FAILURE;
2573 }
2574
2575 page_list_count = 0;
2576
2577 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2578 PAGE_SIZE, PAGE_SIZE,
2579 &upl, NULL, &page_list_count,
2580 request_flags);
2581
2582 if (clmap.cl_error)
2583 upl_abort(upl, UPL_ABORT_ERROR);
2584 else
2585 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2586 upl_deallocate(upl);
2587
2588 return KERN_SUCCESS;
2589 }
2590
2591 if (cnt == 0) {
2592 /*
2593 * The caller was just poking at us to see if
2594 * the page has been paged out. No need to
2595 * mess with the page at all.
2596 * Just let the caller know we do have that page.
2597 */
2598 return KERN_SUCCESS;
2599 }
2600
2601 assert(dp_encryption_inited);
2602 if (dp_encryption) {
2603 /*
2604 * ENCRYPTED SWAP:
2605 * request that the UPL be prepared for
2606 * decryption.
2607 */
2608 request_flags |= UPL_ENCRYPT;
2609 }
2610 orig_vs_offset = vs_offset;
2611
2612 start = (memory_object_offset_t)vs_offset;
2613 assert(cnt != 0);
2614 cnt = VM_SUPER_CLUSTER;
2615
2616 /*
2617 * determine how big a speculative I/O we should try for...
2618 */
2619 if (memory_object_cluster_size(vs->vs_control, &start, &cnt, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
2620 assert(vs_offset >= (vm_offset_t) start &&
2621 vs_offset < (vm_offset_t) (start + cnt));
2622 vs_offset = (vm_offset_t)start;
2623 } else
2624 cnt = PAGE_SIZE;
2625
2626 last_start = start;
2627 last_length = cnt;
2628
2629 /*
2630 * This loop will be executed multiple times until the entire
2631 * range has been looked at or we issue an I/O... if the request spans cluster
2632 * boundaries, the clusters will be checked for logical continuity;
2633 * if contiguous, the I/O request will span multiple clusters...
2634 * at most only 1 I/O will be issued... it will encompass the original offset
2635 */
2636 while (cnt && error == KERN_SUCCESS) {
2637 int ps_info_valid;
2638
2639 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
2640 size = VM_SUPER_CLUSTER;
2641 size -= vs_offset & cl_mask;
2642 } else if (cnt > VM_SUPER_CLUSTER)
2643 size = VM_SUPER_CLUSTER;
2644 else
2645 size = cnt;
2646
2647 cnt -= size;
2648
2649 ps_info_valid = 0;
2650 seg_index = 0;
2651
2652 while (size > 0 && error == KERN_SUCCESS) {
2653 unsigned int abort_size;
2654 int failed_size;
2655 int beg_pseg;
2656 int beg_indx;
2657 vm_offset_t cur_offset;
2658
2659 if ( !ps_info_valid) {
2660 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2661 psp[seg_index] = CLMAP_PS(clmap);
2662 ps_info_valid = 1;
2663 }
2664 /*
2665 * skip over unallocated physical segments
2666 */
2667 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2668 abort_size = cl_size - (vs_offset & cl_mask);
2669 abort_size = MIN(abort_size, size);
2670
2671 size -= abort_size;
2672 vs_offset += abort_size;
2673
2674 seg_index++;
2675 ps_info_valid = 0;
2676
2677 continue;
2678 }
2679 cl_index = (vs_offset & cl_mask) / vm_page_size;
2680
2681 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2682 /*
2683 * skip over unallocated pages
2684 */
2685 if (CLMAP_ISSET(clmap, cl_index))
2686 break;
2687 abort_size += vm_page_size;
2688 }
2689 if (abort_size) {
2690 size -= abort_size;
2691 vs_offset += abort_size;
2692
2693 if (cl_index == pages_in_cl) {
2694 /*
2695 * if we're at the end of this physical cluster
2696 * then bump to the next one and continue looking
2697 */
2698 seg_index++;
2699 ps_info_valid = 0;
2700
2701 continue;
2702 }
2703 if (size == 0)
2704 break;
2705 }
2706 /*
2707 * remember the starting point of the first allocated page
2708 * for the I/O we're about to issue
2709 */
2710 beg_pseg = seg_index;
2711 beg_indx = cl_index;
2712 cur_offset = vs_offset;
2713
2714 /*
2715 * calculate the size of the I/O that we can do...
2716 * this may span multiple physical segments if
2717 * they are contiguous
2718 */
2719 for (xfer_size = 0; xfer_size < size; ) {
2720
2721 while (cl_index < pages_in_cl && xfer_size < size) {
2722 /*
2723 * accumulate allocated pages within
2724 * a physical segment
2725 */
2726 if (CLMAP_ISSET(clmap, cl_index)) {
2727 xfer_size += vm_page_size;
2728 cur_offset += vm_page_size;
2729 cl_index++;
2730
2731 BS_STAT(psp[seg_index]->ps_bs,
2732 psp[seg_index]->ps_bs->bs_pages_in++);
2733 } else
2734 break;
2735 }
2736 if (cl_index < pages_in_cl || xfer_size >= size) {
2737 /*
2738 * we've hit an unallocated page or
2739 * the end of this request... see if
2740 * it's time to fire the I/O
2741 */
2742 break;
2743 }
2744 /*
2745 * we've hit the end of the current physical
2746 * segment and there's more to do, so try
2747 * moving to the next one
2748 */
2749 seg_index++;
2750
2751 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2752 psp[seg_index] = CLMAP_PS(clmap);
2753 ps_info_valid = 1;
2754
2755 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2756 /*
2757 * if the physical segment we're about
2758 * to step into is not contiguous to
2759 * the one we're currently in, or it's
2760 * in a different paging file, or
2761 * it hasn't been allocated....
2762 * we stop this run and go check
2763 * to see if it's time to fire the I/O
2764 */
2765 break;
2766 }
2767 /*
2768 * start with first page of the next physical
2769 * segment
2770 */
2771 cl_index = 0;
2772 }
2773 if (xfer_size == 0) {
2774 /*
2775 * no I/O to generate for this segment
2776 */
2777 continue;
2778 }
2779 if (cur_offset <= orig_vs_offset) {
2780 /*
2781 * we've hit a hole in our speculative cluster
2782 * before the offset that we're really after...
2783 * don't issue the I/O since it doesn't encompass
2784 * the original offset and we're looking to only
2785 * pull in the speculative pages if they can be
2786 * made part of a single I/O
2787 */
2788 size -= xfer_size;
2789 vs_offset += xfer_size;
2790
2791 continue;
2792 }
2793 /*
2794 * we have a contiguous range of allocated pages
2795 * to read from that encompasses the original offset
2796 */
2797 page_list_count = 0;
2798 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
2799 xfer_size, xfer_size,
2800 &upl, NULL, &page_list_count,
2801 request_flags | UPL_SET_INTERNAL | UPL_NOBLOCK);
2802
2803 error = ps_read_file(psp[beg_pseg],
2804 upl, (upl_offset_t) 0,
2805 ps_offset[beg_pseg] + (beg_indx * vm_page_size),
2806 xfer_size, &residual, 0);
2807
2808 failed_size = 0;
2809
2810 /*
2811 * Adjust counts and send response to VM. Optimize
2812 * for the common case, i.e. no error and/or partial
2813 * data. If there was an error, then we need to error
2814 * the entire range, even if some data was successfully
2815 * read. If there was a partial read we may supply some
2816 * data and may error some as well. In all cases the
2817 * VM must receive some notification for every page
2818 * in the range.
2819 */
2820 if ((error == KERN_SUCCESS) && (residual == 0)) {
2821 /*
2822 * Got everything we asked for, supply the data
2823 * to the VM. Note that as a side effect of
2824 * supplying the data, the buffer holding the
2825 * supplied data is deallocated from the pager's
2826 * address space.
2827 */
2828 pvs_object_data_provided(vs, upl, vs_offset, xfer_size);
2829 } else {
2830 failed_size = xfer_size;
2831
2832 if (error == KERN_SUCCESS) {
2833 if (residual == xfer_size) {
2834 /*
2835 * If a read operation returns no error
2836 * and no data moved, we turn it into
2837 * an error, assuming we're reading at
2838 * or beyond EOF.
2839 * Fall through and error the entire range.
2840 */
2841 error = KERN_FAILURE;
2842 } else {
2843 /*
2844 * Otherwise, we have partial read. If
2845 * the part read is an integral number
2846 * of pages, supply it. Otherwise round
2847 * it up to a page boundary, zero fill
2848 * the unread part, and supply it.
2849 * Fall through and error the remainder
2850 * of the range, if any.
2851 */
2852 int fill;
2853 unsigned int lsize;
2854
2855 fill = residual & ~vm_page_size;
2856 lsize = (xfer_size - residual) + fill;
2857
2858 pvs_object_data_provided(vs, upl, vs_offset, lsize);
2859
2860 if (lsize < xfer_size) {
2861 failed_size = xfer_size - lsize;
2862 error = KERN_FAILURE;
2863 }
2864 }
2865 }
2866 }
2867 if (error != KERN_SUCCESS) {
2868 /*
2869 * There was an error in some part of the range, tell
2870 * the VM. Note that error is explicitly checked again
2871 * since it can be modified above.
2872 */
2873 BS_STAT(psp[beg_pseg]->ps_bs,
2874 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
2875 }
2876 /*
2877 * we've issued a single I/O that encompassed the original offset
2878 * at this point we either met our speculative request length or
2879 * we ran into a 'hole' (i.e. page not present in the cluster, cluster
2880 * not present or not physically contiguous to the previous one), so
2881 * we're done issuing I/O at this point
2882 */
2883 return (error);
2884 }
2885 }
2886 return error;
2887 }
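/*
 * A minimal standalone sketch of the per-iteration size clamping done at
 * the top of the while loop in pvs_cluster_read(): an unaligned start is
 * clamped at the super-cluster boundary, otherwise one full super-cluster
 * (or the remaining count, if smaller) is taken.  The 64 KB super-cluster
 * and 16 KB cluster sizes are assumptions; the real values come from
 * VM_SUPER_CLUSTER and vs_clshift.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdio.h>

static unsigned
clamp_size(unsigned vs_offset, unsigned cnt)
{
	const unsigned super   = 64 * 1024;
	const unsigned cl_mask = 16 * 1024 - 1;
	unsigned size;

	if ((vs_offset & cl_mask) && (cnt > (super - (vs_offset & cl_mask))))
		size = super - (vs_offset & cl_mask);
	else if (cnt > super)
		size = super;
	else
		size = cnt;
	return size;
}

int
main(void)
{
	/* unaligned start, large request: stop at the super-cluster edge */
	assert(clamp_size(0x1000, 256 * 1024) == 64 * 1024 - 0x1000);
	/* aligned start, large request: take one full super-cluster      */
	assert(clamp_size(0, 256 * 1024) == 64 * 1024);
	/* small request: take it whole                                    */
	assert(clamp_size(0, 4096) == 4096);
	printf("clamping behaves as in pvs_cluster_read()\n");
	return 0;
}
#endif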
2888
2889 int vs_do_async_write = 1;
2890
2891 kern_return_t
2892 vs_cluster_write(
2893 vstruct_t vs,
2894 upl_t internal_upl,
2895 upl_offset_t offset,
2896 upl_size_t cnt,
2897 boolean_t dp_internal,
2898 int flags)
2899 {
2900 upl_size_t transfer_size;
2901 int error = 0;
2902 struct clmap clmap;
2903
2904 vm_offset_t actual_offset; /* Offset within paging segment */
2905 paging_segment_t ps;
2906 vm_offset_t mobj_base_addr;
2907 vm_offset_t mobj_target_addr;
2908
2909 upl_t upl;
2910 upl_page_info_t *pl;
2911 int page_index;
2912 int list_size;
2913 int pages_in_cl;
2914 unsigned int cl_size;
2915 int base_index;
2916 unsigned int seg_size;
2917
2918 pages_in_cl = 1 << vs->vs_clshift;
2919 cl_size = pages_in_cl * vm_page_size;
2920
2921 if (!dp_internal) {
2922 unsigned int page_list_count;
2923 int request_flags;
2924 unsigned int super_size;
2925 int first_dirty;
2926 int num_dirty;
2927 int num_of_pages;
2928 int seg_index;
2929 upl_offset_t upl_offset;
2930 vm_offset_t seg_offset;
2931 vm_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2932 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT) + 1];
2933
2934
2935 if (bs_low) {
2936 super_size = cl_size;
2937
2938 request_flags = UPL_NOBLOCK |
2939 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2940 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
2941 } else {
2942 super_size = VM_SUPER_CLUSTER;
2943
2944 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2945 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2946 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
2947 }
2948
2949 if (!dp_encryption_inited) {
2950 /*
2951 * ENCRYPTED SWAP:
2952 * Once we've started using swap, we
2953 * can't change our mind on whether
2954 * it needs to be encrypted or
2955 * not.
2956 */
2957 dp_encryption_inited = TRUE;
2958 }
2959 if (dp_encryption) {
2960 /*
2961 * ENCRYPTED SWAP:
2962 * request that the UPL be prepared for
2963 * encryption.
2964 */
2965 request_flags |= UPL_ENCRYPT;
2966 flags |= UPL_PAGING_ENCRYPTED;
2967 }
2968
2969 page_list_count = 0;
2970 memory_object_super_upl_request(vs->vs_control,
2971 (memory_object_offset_t)offset,
2972 cnt, super_size,
2973 &upl, NULL, &page_list_count,
2974 request_flags | UPL_FOR_PAGEOUT);
2975
2976 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2977
2978 seg_size = cl_size - (upl->offset % cl_size);
2979 upl_offset = upl->offset & ~(cl_size - 1);
2980
2981 for (seg_index = 0, transfer_size = upl->size;
2982 transfer_size > 0; ) {
2983 ps_offset[seg_index] =
2984 ps_clmap(vs,
2985 upl_offset,
2986 &clmap, CL_ALLOC,
2987 cl_size, 0);
2988
2989 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2990 upl_abort(upl, 0);
2991 upl_deallocate(upl);
2992
2993 return KERN_FAILURE;
2994
2995 }
2996 psp[seg_index] = CLMAP_PS(clmap);
2997
2998 if (transfer_size > seg_size) {
2999 transfer_size -= seg_size;
3000 upl_offset += cl_size;
3001 seg_size = cl_size;
3002 seg_index++;
3003 } else
3004 transfer_size = 0;
3005 }
3006 /*
3007 * Ignore any non-present pages at the end of the
3008 * UPL.
3009 */
3010 for (page_index = upl->size / vm_page_size; page_index > 0;)
3011 if (UPL_PAGE_PRESENT(pl, --page_index))
3012 break;
3013 num_of_pages = page_index + 1;
3014
3015 base_index = (upl->offset % cl_size) / PAGE_SIZE;
3016
3017 for (page_index = 0; page_index < num_of_pages; ) {
3018 /*
3019 * skip over non-dirty pages
3020 */
3021 for ( ; page_index < num_of_pages; page_index++) {
3022 if (UPL_DIRTY_PAGE(pl, page_index)
3023 || UPL_PRECIOUS_PAGE(pl, page_index))
3024 /*
3025 * this is a page we need to write
3026 * go see if we can buddy it up with
3027 * others that are contiguous to it
3028 */
3029 break;
3030 /*
3031 * if the page is not dirty, but present, we
3032 * need to commit it... This is an unusual
3033 * case since we only asked for dirty pages
3034 */
3035 if (UPL_PAGE_PRESENT(pl, page_index)) {
3036 boolean_t empty = FALSE;
3037 upl_commit_range(upl,
3038 page_index * vm_page_size,
3039 vm_page_size,
3040 UPL_COMMIT_NOTIFY_EMPTY,
3041 pl,
3042 page_list_count,
3043 &empty);
3044 if (empty) {
3045 assert(page_index ==
3046 num_of_pages - 1);
3047 upl_deallocate(upl);
3048 }
3049 }
3050 }
3051 if (page_index == num_of_pages)
3052 /*
3053 * no more pages to look at, we're out of here
3054 */
3055 break;
3056
3057 /*
3058 * gather up contiguous dirty pages... we have at
3059 * least 1, otherwise we would have bailed above
3060 * make sure that each physical segment that we step
3061 * into is contiguous to the one we're currently in
3062 * if it's not, we have to stop and write what we have
3063 */
3064 for (first_dirty = page_index;
3065 page_index < num_of_pages; ) {
3066 if ( !UPL_DIRTY_PAGE(pl, page_index)
3067 && !UPL_PRECIOUS_PAGE(pl, page_index))
3068 break;
3069 page_index++;
3070 /*
3071 * if we just looked at the last page in the UPL
3072 * we don't need to check for physical segment
3073 * continuity
3074 */
3075 if (page_index < num_of_pages) {
3076 int cur_seg;
3077 int nxt_seg;
3078
3079 cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3080 nxt_seg = (base_index + page_index)/pages_in_cl;
3081
3082 if (cur_seg != nxt_seg) {
3083 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3084 /*
3085 * if the segment we're about
3086 * to step into is not
3087 * contiguous to the one we're
3088 * currently in, or it's in a
3089 * different paging file....
3090 * we stop here and generate
3091 * the I/O
3092 */
3093 break;
3094 }
3095 }
3096 }
3097 num_dirty = page_index - first_dirty;
3098
3099 if (num_dirty) {
3100 upl_offset = first_dirty * vm_page_size;
3101 transfer_size = num_dirty * vm_page_size;
3102
3103 while (transfer_size) {
3104
3105 if ((seg_size = cl_size -
3106 ((upl->offset + upl_offset) % cl_size))
3107 > transfer_size)
3108 seg_size = transfer_size;
3109
3110 ps_vs_write_complete(vs,
3111 upl->offset + upl_offset,
3112 seg_size, error);
3113
3114 transfer_size -= seg_size;
3115 upl_offset += seg_size;
3116 }
3117 upl_offset = first_dirty * vm_page_size;
3118 transfer_size = num_dirty * vm_page_size;
3119
3120 seg_index = (base_index + first_dirty) / pages_in_cl;
3121 seg_offset = (upl->offset + upl_offset) % cl_size;
3122
3123 error = ps_write_file(psp[seg_index],
3124 upl, upl_offset,
3125 ps_offset[seg_index]
3126 + seg_offset,
3127 transfer_size, flags);
3128 } else {
3129 boolean_t empty = FALSE;
3130 upl_abort_range(upl,
3131 first_dirty * vm_page_size,
3132 num_dirty * vm_page_size,
3133 UPL_ABORT_NOTIFY_EMPTY,
3134 &empty);
3135 if (empty) {
3136 assert(page_index == num_of_pages);
3137 upl_deallocate(upl);
3138 }
3139 }
3140 }
3141
3142 } else {
3143 assert(cnt <= (vm_page_size << vs->vs_clshift));
3144 list_size = cnt;
3145
3146 page_index = 0;
3147 /* The caller provides a mapped_data which is derived */
3148 /* from a temporary object. The targeted pages are */
3149 /* guaranteed to be set at offset 0 in the mapped_data */
3150 /* The actual offset however must still be derived */
3151 /* from the offset in the vs in question */
3152 mobj_base_addr = offset;
3153 mobj_target_addr = mobj_base_addr;
3154
3155 for (transfer_size = list_size; transfer_size != 0;) {
3156 actual_offset = ps_clmap(vs, mobj_target_addr,
3157 &clmap, CL_ALLOC,
3158 transfer_size < cl_size ?
3159 transfer_size : cl_size, 0);
3160 if(actual_offset == (vm_offset_t) -1) {
3161 error = 1;
3162 break;
3163 }
3164 cnt = MIN(transfer_size,
3165 CLMAP_NPGS(clmap) * vm_page_size);
3166 ps = CLMAP_PS(clmap);
3167 /* Assume that the caller has given us contiguous */
3168 /* pages */
3169 if(cnt) {
3170 ps_vs_write_complete(vs, mobj_target_addr,
3171 cnt, error);
3172 error = ps_write_file(ps, internal_upl,
3173 0, actual_offset,
3174 cnt, flags);
3175 if (error)
3176 break;
3177 }
3178 if (error)
3179 break;
3180 actual_offset += cnt;
3181 mobj_target_addr += cnt;
3182 transfer_size -= cnt;
3183 cnt = 0;
3184
3185 if (error)
3186 break;
3187 }
3188 }
3189 if(error)
3190 return KERN_FAILURE;
3191 else
3192 return KERN_SUCCESS;
3193 }
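/*
 * A minimal standalone sketch of the dirty-page gathering done in
 * vs_cluster_write(): skip clean pages, then collect the contiguous run
 * of dirty pages that follows and issue one write per run.  The dirty[]
 * array stands in for the UPL_DIRTY_PAGE()/UPL_PRECIOUS_PAGE() checks
 * (an assumption), and the physical-segment continuity test that can
 * also end a run is omitted here.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	bool dirty[10] = { false, false, true, true, true, false,
			   true,  false, false, true };
	int  num_of_pages = 10, page_index = 0;

	while (page_index < num_of_pages) {
		/* skip over non-dirty pages */
		while (page_index < num_of_pages && !dirty[page_index])
			page_index++;
		if (page_index == num_of_pages)
			break;

		/* gather up contiguous dirty pages */
		int first_dirty = page_index;
		while (page_index < num_of_pages && dirty[page_index])
			page_index++;

		printf("would write pages [%d, %d)\n",
		    first_dirty, page_index);
	}
	return 0;
}
#endif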
3194
3195 vm_size_t
3196 ps_vstruct_allocated_size(
3197 vstruct_t vs)
3198 {
3199 int num_pages;
3200 struct vs_map *vsmap;
3201 unsigned int i, j, k;
3202
3203 num_pages = 0;
3204 if (vs->vs_indirect) {
3205 /* loop on indirect maps */
3206 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3207 vsmap = vs->vs_imap[i];
3208 if (vsmap == NULL)
3209 continue;
3210 /* loop on clusters in this indirect map */
3211 for (j = 0; j < CLMAP_ENTRIES; j++) {
3212 if (VSM_ISCLR(vsmap[j]) ||
3213 VSM_ISERR(vsmap[j]))
3214 continue;
3215 /* loop on pages in this cluster */
3216 for (k = 0; k < VSCLSIZE(vs); k++) {
3217 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3218 num_pages++;
3219 }
3220 }
3221 }
3222 } else {
3223 vsmap = vs->vs_dmap;
3224 if (vsmap == NULL)
3225 return 0;
3226 /* loop on clusters in the direct map */
3227 for (j = 0; j < CLMAP_ENTRIES; j++) {
3228 if (VSM_ISCLR(vsmap[j]) ||
3229 VSM_ISERR(vsmap[j]))
3230 continue;
3231 /* loop on pages in this cluster */
3232 for (k = 0; k < VSCLSIZE(vs); k++) {
3233 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3234 num_pages++;
3235 }
3236 }
3237 }
3238
3239 return ptoa_32(num_pages);
3240 }
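/*
 * A minimal standalone sketch of the page counting above: walk each
 * cluster's backing bitmap and count the set bits.  The three 8-page
 * clusters and their bit patterns are assumptions for illustration; the
 * real code reads the bitmaps through VSM_BMAP() and VSCLSIZE().
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* three hypothetical 8-page clusters; bit k set => page k backed */
	const uint8_t  bmap[3]  = { 0xFF, 0x00, 0x0F };
	const unsigned vsclsize = 8;
	unsigned num_pages = 0;

	for (unsigned j = 0; j < 3; j++)
		for (unsigned k = 0; k < vsclsize; k++)
			if (bmap[j] & (1u << k))
				num_pages++;

	printf("%u pages allocated\n", num_pages);	/* 8 + 0 + 4 = 12 */
	assert(num_pages == 12);
	return 0;
}
#endif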
3241
3242 size_t
3243 ps_vstruct_allocated_pages(
3244 vstruct_t vs,
3245 default_pager_page_t *pages,
3246 size_t pages_size)
3247 {
3248 unsigned int num_pages;
3249 struct vs_map *vsmap;
3250 vm_offset_t offset;
3251 unsigned int i, j, k;
3252
3253 num_pages = 0;
3254 offset = 0;
3255 if (vs->vs_indirect) {
3256 /* loop on indirect maps */
3257 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3258 vsmap = vs->vs_imap[i];
3259 if (vsmap == NULL) {
3260 offset += (vm_page_size * CLMAP_ENTRIES *
3261 VSCLSIZE(vs));
3262 continue;
3263 }
3264 /* loop on clusters in this indirect map */
3265 for (j = 0; j < CLMAP_ENTRIES; j++) {
3266 if (VSM_ISCLR(vsmap[j]) ||
3267 VSM_ISERR(vsmap[j])) {
3268 offset += vm_page_size * VSCLSIZE(vs);
3269 continue;
3270 }
3271 /* loop on pages in this cluster */
3272 for (k = 0; k < VSCLSIZE(vs); k++) {
3273 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3274 num_pages++;
3275 if (num_pages < pages_size)
3276 pages++->dpp_offset =
3277 offset;
3278 }
3279 offset += vm_page_size;
3280 }
3281 }
3282 }
3283 } else {
3284 vsmap = vs->vs_dmap;
3285 if (vsmap == NULL)
3286 return 0;
3287 /* loop on clusters in the direct map */
3288 for (j = 0; j < CLMAP_ENTRIES; j++) {
3289 if (VSM_ISCLR(vsmap[j]) ||
3290 VSM_ISERR(vsmap[j])) {
3291 offset += vm_page_size * VSCLSIZE(vs);
3292 continue;
3293 }
3294 /* loop on pages in this cluster */
3295 for (k = 0; k < VSCLSIZE(vs); k++) {
3296 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3297 num_pages++;
3298 if (num_pages < pages_size)
3299 pages++->dpp_offset = offset;
3300 }
3301 offset += vm_page_size;
3302 }
3303 }
3304 }
3305
3306 return num_pages;
3307 }
3308
3309
3310 kern_return_t
3311 ps_vstruct_transfer_from_segment(
3312 vstruct_t vs,
3313 paging_segment_t segment,
3314 upl_t upl)
3315 {
3316 struct vs_map *vsmap;
3317 // struct vs_map old_vsmap;
3318 // struct vs_map new_vsmap;
3319 unsigned int i, j;
3320
3321 VS_LOCK(vs); /* block all work on this vstruct */
3322 /* can't allow the normal multiple write */
3323 /* semantic because writes may conflict */
3324 vs->vs_xfer_pending = TRUE;
3325 vs_wait_for_sync_writers(vs);
3326 vs_start_write(vs);
3327 vs_wait_for_readers(vs);
3328 /* we will unlock the vs to allow other writes while transferring */
3329 /* and will be guaranteed of the persistence of the vs struct */
3330 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3331 /* vs_async_pending */
3332 /* OK we now have guaranteed no other parties are accessing this */
3333 /* vs. Now that we are also supporting simple lock versions of */
3334 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3335 /* our purpose in holding it before was the multiple write case */
3336 /* we now use the boolean xfer_pending to do that. We can use */
3337 /* a boolean instead of a count because we have guaranteed single */
3338 /* file access to this code in its caller */
3339 VS_UNLOCK(vs);
3340 vs_changed:
3341 if (vs->vs_indirect) {
3342 unsigned int vsmap_size;
3343 int clmap_off;
3344 /* loop on indirect maps */
3345 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3346 vsmap = vs->vs_imap[i];
3347 if (vsmap == NULL)
3348 continue;
3349 /* loop on clusters in this indirect map */
3350 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3351 VSCLSIZE(vs) * i);
3352 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3353 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3354 else
3355 vsmap_size = CLMAP_ENTRIES;
3356 for (j = 0; j < vsmap_size; j++) {
3357 if (VSM_ISCLR(vsmap[j]) ||
3358 VSM_ISERR(vsmap[j]) ||
3359 (VSM_PS(vsmap[j]) != segment))
3360 continue;
3361 if(vs_cluster_transfer(vs,
3362 (vm_page_size * (j << vs->vs_clshift))
3363 + clmap_off,
3364 vm_page_size << vs->vs_clshift,
3365 upl)
3366 != KERN_SUCCESS) {
3367 VS_LOCK(vs);
3368 vs->vs_xfer_pending = FALSE;
3369 VS_UNLOCK(vs);
3370 vs_finish_write(vs);
3371 return KERN_FAILURE;
3372 }
3373 /* allow other readers/writers during transfer*/
3374 VS_LOCK(vs);
3375 vs->vs_xfer_pending = FALSE;
3376 VS_UNLOCK(vs);
3377 vs_finish_write(vs);
3378 VS_LOCK(vs);
3379 vs->vs_xfer_pending = TRUE;
3380 vs_wait_for_sync_writers(vs);
3381 vs_start_write(vs);
3382 vs_wait_for_readers(vs);
3383 VS_UNLOCK(vs);
3384 if (!(vs->vs_indirect)) {
3385 goto vs_changed;
3386 }
3387 }
3388 }
3389 } else {
3390 vsmap = vs->vs_dmap;
3391 if (vsmap == NULL) {
3392 VS_LOCK(vs);
3393 vs->vs_xfer_pending = FALSE;
3394 VS_UNLOCK(vs);
3395 vs_finish_write(vs);
3396 return KERN_SUCCESS;
3397 }
3398 /* loop on clusters in the direct map */
3399 for (j = 0; j < vs->vs_size; j++) {
3400 if (VSM_ISCLR(vsmap[j]) ||
3401 VSM_ISERR(vsmap[j]) ||
3402 (VSM_PS(vsmap[j]) != segment))
3403 continue;
3404 if(vs_cluster_transfer(vs,
3405 vm_page_size * (j << vs->vs_clshift),
3406 vm_page_size << vs->vs_clshift,
3407 upl) != KERN_SUCCESS) {
3408 VS_LOCK(vs);
3409 vs->vs_xfer_pending = FALSE;
3410 VS_UNLOCK(vs);
3411 vs_finish_write(vs);
3412 return KERN_FAILURE;
3413 }
3414 /* allow other readers/writers during transfer*/
3415 VS_LOCK(vs);
3416 vs->vs_xfer_pending = FALSE;
3417 VS_UNLOCK(vs);
3418 vs_finish_write(vs);
3419 VS_LOCK(vs);
3420 vs->vs_xfer_pending = TRUE;
3421 VS_UNLOCK(vs);
3422 vs_wait_for_sync_writers(vs);
3423 vs_start_write(vs);
3424 vs_wait_for_readers(vs);
3425 if (vs->vs_indirect) {
3426 goto vs_changed;
3427 }
3428 }
3429 }
3430
3431 VS_LOCK(vs);
3432 vs->vs_xfer_pending = FALSE;
3433 VS_UNLOCK(vs);
3434 vs_finish_write(vs);
3435 return KERN_SUCCESS;
3436 }
3437
3438
3439
3440 vs_map_t
3441 vs_get_map_entry(
3442 vstruct_t vs,
3443 vm_offset_t offset)
3444 {
3445 struct vs_map *vsmap;
3446 vm_offset_t cluster;
3447
3448 cluster = atop_32(offset) >> vs->vs_clshift;
3449 if (vs->vs_indirect) {
3450 long ind_block = cluster/CLMAP_ENTRIES;
3451
3452 /* Is the indirect block allocated? */
3453 vsmap = vs->vs_imap[ind_block];
3454 if(vsmap == (vs_map_t) NULL)
3455 return vsmap;
3456 } else
3457 vsmap = vs->vs_dmap;
3458 vsmap += cluster%CLMAP_ENTRIES;
3459 return vsmap;
3460 }
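/*
 * A minimal standalone sketch of the two-level map lookup above: the byte
 * offset is reduced to a cluster number, which then selects an indirect
 * block (cluster / CLMAP_ENTRIES) and a slot within it (cluster %
 * CLMAP_ENTRIES).  The page size, cluster shift, and the value 128 for
 * CLMAP_ENTRIES are assumptions for illustration only.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdio.h>

int
main(void)
{
	const unsigned vm_page_shift = 12;	/* 4 KB page (assumed)       */
	const unsigned vs_clshift    = 2;	/* 4 pages per cluster       */
	const unsigned clmap_entries = 128;	/* entries per map (assumed) */

	unsigned offset  = 0x2345000;		/* byte offset into object   */
	unsigned cluster = (offset >> vm_page_shift) >> vs_clshift;
	unsigned block   = cluster / clmap_entries;	/* indirect index    */
	unsigned entry   = cluster % clmap_entries;	/* slot within block */

	printf("offset 0x%x -> cluster %u (block %u, entry %u)\n",
	    offset, cluster, block, entry);
	assert(cluster == 0x2345000 / (4096 * 4));
	return 0;
}
#endif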
3461
3462 kern_return_t
3463 vs_cluster_transfer(
3464 vstruct_t vs,
3465 vm_offset_t offset,
3466 vm_size_t cnt,
3467 upl_t upl)
3468 {
3469 vm_offset_t actual_offset;
3470 paging_segment_t ps;
3471 struct clmap clmap;
3472 kern_return_t error = KERN_SUCCESS;
3473 unsigned int size, size_wanted;
3474 int i;
3475 unsigned int residual = 0;
3476 unsigned int unavail_size;
3477 // default_pager_thread_t *dpt;
3478 // boolean_t dealloc;
3479 struct vs_map *vsmap_ptr = NULL;
3480 struct vs_map read_vsmap;
3481 struct vs_map original_read_vsmap;
3482 struct vs_map write_vsmap;
3483 // upl_t sync_upl;
3484 // vm_offset_t ioaddr;
3485
3486 /* vs_cluster_transfer reads in the pages of a cluster and
3487 * then writes these pages back to new backing store. The
3488 * segment the pages are being read from is assumed to have
3489 * been taken off-line and is no longer considered for new
3490 * space requests.
3491 */
3492
3493 /*
3494 * This loop will be executed once per cluster referenced.
3495 * Typically this means once, since it's unlikely that the
3496 * VM system will ask for anything spanning cluster boundaries.
3497 *
3498 * If there are holes in a cluster (in a paging segment), we stop
3499 * reading at the hole, then loop again, hoping to
3500 * find valid pages later in the cluster. This continues until
3501 * the entire range has been examined, and read, if present. The
3502 * pages are written as they are read. If a failure occurs after
3503 * some pages are written the unmap call at the bottom of the loop
3504 * recovers the backing store and the old backing store remains
3505 * in effect.
3506 */
3507
3508 VSM_CLR(write_vsmap);
3509 VSM_CLR(original_read_vsmap);
3510 /* grab the actual object's pages to sync with I/O */
3511 while (cnt && (error == KERN_SUCCESS)) {
3512 vsmap_ptr = vs_get_map_entry(vs, offset);
3513 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3514
3515 if (actual_offset == (vm_offset_t) -1) {
3516
3517 /*
3518 * Nothing left to write in this cluster; at least
3519 * set write cluster information for any previous
3520 * write, and clear it for the next cluster, if there is one
3521 */
3522 unsigned int local_size, clmask, clsize;
3523
3524 clsize = vm_page_size << vs->vs_clshift;
3525 clmask = clsize - 1;
3526 local_size = clsize - (offset & clmask);
3527 ASSERT(local_size);
3528 local_size = MIN(local_size, cnt);
3529
3530 /* This cluster has no data in it beyond what may */
3531 /* have been found on a previous iteration through */
3532 /* the loop "write_vsmap" */
3533 *vsmap_ptr = write_vsmap;
3534 VSM_CLR(write_vsmap);
3535 VSM_CLR(original_read_vsmap);
3536
3537 cnt -= local_size;
3538 offset += local_size;
3539 continue;
3540 }
3541
3542 /*
3543 * Count up contiguous available or unavailable
3544 * pages.
3545 */
3546 ps = CLMAP_PS(clmap);
3547 ASSERT(ps);
3548 size = 0;
3549 unavail_size = 0;
3550 for (i = 0;
3551 (size < cnt) && (unavail_size < cnt) &&
3552 (i < CLMAP_NPGS(clmap)); i++) {
3553 if (CLMAP_ISSET(clmap, i)) {
3554 if (unavail_size != 0)
3555 break;
3556 size += vm_page_size;
3557 BS_STAT(ps->ps_bs,
3558 ps->ps_bs->bs_pages_in++);
3559 } else {
3560 if (size != 0)
3561 break;
3562 unavail_size += vm_page_size;
3563 }
3564 }
3565
3566 if (size == 0) {
3567 ASSERT(unavail_size);
3568 cnt -= unavail_size;
3569 offset += unavail_size;
3570 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3571 == 0) {
3572 /* There is no more to transfer in this
3573 cluster
3574 */
3575 *vsmap_ptr = write_vsmap;
3576 VSM_CLR(write_vsmap);
3577 VSM_CLR(original_read_vsmap);
3578 }
3579 continue;
3580 }
3581
3582 if(VSM_ISCLR(original_read_vsmap))
3583 original_read_vsmap = *vsmap_ptr;
3584
3585 if(ps->ps_segtype == PS_PARTITION) {
3586 panic("swap partition not supported\n");
3587 /*NOTREACHED*/
3588 error = KERN_FAILURE;
3589 residual = size;
3590 /*
3591 NEED TO ISSUE WITH SYNC & NO COMMIT
3592 error = ps_read_device(ps, actual_offset, &buffer,
3593 size, &residual, flags);
3594 */
3595 } else {
3596 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
3597 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
3598 size, &residual,
3599 (UPL_IOSYNC | UPL_NOCOMMIT));
3600 }
3601
3602 read_vsmap = *vsmap_ptr;
3603
3604
3605 /*
3606 * Adjust counts and put data in new BS. Optimize for the
3607 * common case, i.e. no error and/or partial data.
3608 * If there was an error, then we need to error the entire
3609 * range, even if some data was successfully read.
3610 *
3611 */
3612 if ((error == KERN_SUCCESS) && (residual == 0)) {
3613
3614 /*
3615 * Got everything we asked for, supply the data to
3616 * the new BS. Note that as a side effect of supplying
3617 * the data, the buffer holding the supplied data is
3618 * deallocated from the pager's address space unless
3619 * the write is unsuccessful.
3620 */
3621
3622 /* note buffer will be cleaned up in all cases by */
3623 /* internal_cluster_write or if an error on write */
3624 /* the vm_map_copy_page_discard call */
3625 *vsmap_ptr = write_vsmap;
3626
3627 if(vs_cluster_write(vs, upl, offset,
3628 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
3629 error = KERN_FAILURE;
3630 if(!(VSM_ISCLR(*vsmap_ptr))) {
3631 /* unmap the new backing store object */
3632 ps_clunmap(vs, offset, size);
3633 }
3634 /* original vsmap */
3635 *vsmap_ptr = original_read_vsmap;
3636 VSM_CLR(write_vsmap);
3637 } else {
3638 if((offset + size) &
3639 ((vm_page_size << vs->vs_clshift)
3640 - 1)) {
3641 /* There is more to transfer in this
3642 cluster
3643 */
3644 write_vsmap = *vsmap_ptr;
3645 *vsmap_ptr = read_vsmap;
3646 } else {
3647 /* discard the old backing object */
3648 write_vsmap = *vsmap_ptr;
3649 *vsmap_ptr = read_vsmap;
3650 ps_clunmap(vs, offset, size);
3651 *vsmap_ptr = write_vsmap;
3652 VSM_CLR(write_vsmap);
3653 VSM_CLR(original_read_vsmap);
3654 }
3655 }
3656 } else {
3657 size_wanted = size;
3658 if (error == KERN_SUCCESS) {
3659 if (residual == size) {
3660 /*
3661 * If a read operation returns no error
3662 * and no data moved, we turn it into
3663 * an error, assuming we're reading at
3664 * or beyond EOF.
3665 * Fall through and error the entire
3666 * range.
3667 */
3668 error = KERN_FAILURE;
3669 *vsmap_ptr = write_vsmap;
3670 if(!(VSM_ISCLR(*vsmap_ptr))) {
3671 /* unmap the new backing store object */
3672 ps_clunmap(vs, offset, size);
3673 }
3674 *vsmap_ptr = original_read_vsmap;
3675 VSM_CLR(write_vsmap);
3676 continue;
3677 } else {
3678 /*
3679 * Otherwise, we have partial read.
3680 * This is also considered an error
3681 * for the purposes of cluster transfer
3682 */
3683 error = KERN_FAILURE;
3684 *vsmap_ptr = write_vsmap;
3685 if(!(VSM_ISCLR(*vsmap_ptr))) {
3686 /* unmap the new backing store object */
3687 ps_clunmap(vs, offset, size);
3688 }
3689 *vsmap_ptr = original_read_vsmap;
3690 VSM_CLR(write_vsmap);
3691 continue;
3692 }
3693 }
3694
3695 }
3696 cnt -= size;
3697 offset += size;
3698
3699 } /* END while (cnt && (error == 0)) */
3700 if(!VSM_ISCLR(write_vsmap))
3701 *vsmap_ptr = write_vsmap;
3702
3703 return error;
3704 }
3705
3706 kern_return_t
3707 default_pager_add_file(
3708 MACH_PORT_FACE backing_store,
3709 vnode_ptr_t vp,
3710 int record_size,
3711 vm_size_t size)
3712 {
3713 backing_store_t bs;
3714 paging_segment_t ps;
3715 int i;
3716 unsigned int j;
3717 int error;
3718
3719 if ((bs = backing_store_lookup(backing_store))
3720 == BACKING_STORE_NULL)
3721 return KERN_INVALID_ARGUMENT;
3722
3723 PSL_LOCK();
3724 for (i = 0; i <= paging_segment_max; i++) {
3725 ps = paging_segments[i];
3726 if (ps == PAGING_SEGMENT_NULL)
3727 continue;
3728 if (ps->ps_segtype != PS_FILE)
3729 continue;
3730
3731 /*
3732 * Check for overlap on same device.
3733 */
3734 if (ps->ps_vnode == (struct vnode *)vp) {
3735 PSL_UNLOCK();
3736 BS_UNLOCK(bs);
3737 return KERN_INVALID_ARGUMENT;
3738 }
3739 }
3740 PSL_UNLOCK();
3741
3742 /*
3743 * Set up the paging segment
3744 */
3745 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3746 if (ps == PAGING_SEGMENT_NULL) {
3747 BS_UNLOCK(bs);
3748 return KERN_RESOURCE_SHORTAGE;
3749 }
3750
3751 ps->ps_segtype = PS_FILE;
3752 ps->ps_vnode = (struct vnode *)vp;
3753 ps->ps_offset = 0;
3754 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3755 ps->ps_recnum = size;
3756 ps->ps_pgnum = size >> ps->ps_record_shift;
3757
3758 ps->ps_pgcount = ps->ps_pgnum;
3759 ps->ps_clshift = local_log2(bs->bs_clsize);
3760 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3761 ps->ps_hint = 0;
3762
3763 PS_LOCK_INIT(ps);
3764 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3765 if (!ps->ps_bmap) {
3766 kfree(ps, sizeof *ps);
3767 BS_UNLOCK(bs);
3768 return KERN_RESOURCE_SHORTAGE;
3769 }
3770 for (j = 0; j < ps->ps_ncls; j++) {
3771 clrbit(ps->ps_bmap, j);
3772 }
3773
3774 ps->ps_going_away = FALSE;
3775 ps->ps_bs = bs;
3776
3777 if ((error = ps_enter(ps)) != 0) {
3778 kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3779 kfree(ps, sizeof *ps);
3780 BS_UNLOCK(bs);
3781 return KERN_RESOURCE_SHORTAGE;
3782 }
3783
3784 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3785 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3786 PSL_LOCK();
3787 dp_pages_free += ps->ps_pgcount;
3788 PSL_UNLOCK();
3789
3790 BS_UNLOCK(bs);
3791
3792 bs_more_space(ps->ps_clcount);
3793
3794 DP_DEBUG(DEBUG_BS_INTERNAL,
3795 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3796 device, offset, size, record_size,
3797 ps->ps_record_shift, ps->ps_pgnum));
3798
3799 return KERN_SUCCESS;
3800 }
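/*
 * A minimal standalone sketch of the segment sizing done above: the record
 * shift comes from log2(vm_page_size / record_size), the file's record
 * count becomes a page count, and the page count becomes a whole-cluster
 * count.  local_log2_sketch() stands in for local_log2(), and the 4 KB
 * page, 512-byte record, and 4-page cluster sizes are assumptions.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdio.h>

static unsigned
local_log2_sketch(unsigned n)		/* n assumed to be a power of two */
{
	unsigned s = 0;
	while (n > 1) { n >>= 1; s++; }
	return s;
}

int
main(void)
{
	const unsigned vm_page_size = 4096;	/* assumed                  */
	const unsigned record_size  = 512;	/* device block (assumed)   */
	const unsigned bs_clsize    = 4;	/* pages per cluster        */
	unsigned size = 262144;			/* file length, in records  */

	unsigned record_shift = local_log2_sketch(vm_page_size / record_size);
	unsigned pgnum        = size >> record_shift;	/* pages in file    */
	unsigned clshift      = local_log2_sketch(bs_clsize);
	unsigned clcount      = pgnum >> clshift;	/* whole clusters   */

	printf("%u records -> %u pages -> %u clusters -> %u usable pages\n",
	    size, pgnum, clcount, clcount << clshift);
	assert(pgnum == 32768 && clcount == 8192);
	return 0;
}
#endif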
3801
3802
3803
3804 kern_return_t
3805 ps_read_file(
3806 paging_segment_t ps,
3807 upl_t upl,
3808 upl_offset_t upl_offset,
3809 vm_offset_t offset,
3810 upl_size_t size,
3811 unsigned int *residualp,
3812 int flags)
3813 {
3814 vm_object_offset_t f_offset;
3815 int error = 0;
3816 int result;
3817
3818 assert(dp_encryption_inited);
3819
3820 clustered_reads[atop_32(size)]++;
3821
3822 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3823
3824 /*
3825 * for transfer case we need to pass uploffset and flags
3826 */
3827 error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL);
3828
3829 /* The vnode_pagein semantic is somewhat at odds with the existing */
3830 /* device_read semantic. Partial reads are not experienced at this */
3831 /* level. It is up to the bit map code and cluster read code to */
3832 /* check that requested data locations are actually backed, and the */
3833 /* pagein code to either read all of the requested data or return an */
3834 /* error. */
3835
3836 if (error)
3837 result = KERN_FAILURE;
3838 else {
3839 *residualp = 0;
3840 result = KERN_SUCCESS;
3841 }
3842 return result;
3843 }
3844
3845 kern_return_t
3846 ps_write_file(
3847 paging_segment_t ps,
3848 upl_t upl,
3849 upl_offset_t upl_offset,
3850 vm_offset_t offset,
3851 unsigned int size,
3852 int flags)
3853 {
3854 vm_object_offset_t f_offset;
3855 kern_return_t result;
3856
3857 assert(dp_encryption_inited);
3858
3859 clustered_writes[atop_32(size)]++;
3860 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3861
3862 if (flags & UPL_PAGING_ENCRYPTED) {
3863 /*
3864 * ENCRYPTED SWAP:
3865 * encrypt all the pages that we're going
3866 * to pageout.
3867 */
3868 upl_encrypt(upl, upl_offset, size);
3869 }
3870 if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3871 result = KERN_FAILURE;
3872 else
3873 result = KERN_SUCCESS;
3874
3875 return result;
3876 }
3877
3878 kern_return_t
3879 default_pager_triggers( __unused MACH_PORT_FACE default_pager,
3880 int hi_wat,
3881 int lo_wat,
3882 int flags,
3883 MACH_PORT_FACE trigger_port)
3884 {
3885 MACH_PORT_FACE release;
3886 kern_return_t kr;
3887
3888 PSL_LOCK();
3889 if (flags == SWAP_ENCRYPT_ON) {
3890 /* ENCRYPTED SWAP: turn encryption on */
3891 release = trigger_port;
3892 if (!dp_encryption_inited) {
3893 dp_encryption_inited = TRUE;
3894 dp_encryption = TRUE;
3895 kr = KERN_SUCCESS;
3896 } else {
3897 kr = KERN_FAILURE;
3898 }
3899 } else if (flags == SWAP_ENCRYPT_OFF) {
3900 /* ENCRYPTED SWAP: turn encryption off */
3901 release = trigger_port;
3902 if (!dp_encryption_inited) {
3903 dp_encryption_inited = TRUE;
3904 dp_encryption = FALSE;
3905 kr = KERN_SUCCESS;
3906 } else {
3907 kr = KERN_FAILURE;
3908 }
3909 } else if (flags == HI_WAT_ALERT) {
3910 release = min_pages_trigger_port;
3911 min_pages_trigger_port = trigger_port;
3912 minimum_pages_remaining = hi_wat/vm_page_size;
3913 bs_low = FALSE;
3914 kr = KERN_SUCCESS;
3915 } else if (flags == LO_WAT_ALERT) {
3916 release = max_pages_trigger_port;
3917 max_pages_trigger_port = trigger_port;
3918 maximum_pages_free = lo_wat/vm_page_size;
3919 kr = KERN_SUCCESS;
3920 } else {
3921 release = trigger_port;
3922 kr = KERN_INVALID_ARGUMENT;
3923 }
3924 PSL_UNLOCK();
3925
3926 if (IP_VALID(release))
3927 ipc_port_release_send(release);
3928
3929 return kr;
3930 }
3931
3932 /*
3933 * Monitor the amount of available backing store vs. the amount of
3934 * required backing store, notify a listener (if present) when
3935 * backing store may safely be removed.
3936 *
3937 * We attempt to avoid the situation where backing store is
3938 * discarded en masse, as this can lead to thrashing as the
3939 * backing store is compacted.
3940 */
3941
3942 #define PF_INTERVAL 3 /* time between free level checks */
3943 #define PF_LATENCY 10 /* number of intervals before release */
3944
3945 static int dp_pages_free_low_count = 0;
3946 thread_call_t default_pager_backing_store_monitor_callout;
3947
3948 void
3949 default_pager_backing_store_monitor(__unused thread_call_param_t p1,
3950 __unused thread_call_param_t p2)
3951 {
3952 // unsigned long long average;
3953 ipc_port_t trigger;
3954 uint64_t deadline;
3955
3956 /*
3957 * We determine whether it will be safe to release some
3958 * backing store by watching the free page level. If
3959 * it remains above the maximum_pages_free threshold for
3960 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
3961 * then we deem it safe.
3962 *
3963 * Note that this establishes a maximum rate at which backing
3964 * store will be released, as each notification (currently)
3965 * only results in a single backing store object being
3966 * released.
3967 */
3968 if (dp_pages_free > maximum_pages_free) {
3969 dp_pages_free_low_count++;
3970 } else {
3971 dp_pages_free_low_count = 0;
3972 }
3973
3974 /* decide whether to send notification */
3975 trigger = IP_NULL;
3976 if (max_pages_trigger_port &&
3977 (backing_store_release_trigger_disable == 0) &&
3978 (dp_pages_free_low_count > PF_LATENCY)) {
3979 trigger = max_pages_trigger_port;
3980 max_pages_trigger_port = NULL;
3981 }
3982
3983 /* send notification */
3984 if (trigger != IP_NULL) {
3985 VSL_LOCK();
3986 if(backing_store_release_trigger_disable != 0) {
3987 assert_wait((event_t)
3988 &backing_store_release_trigger_disable,
3989 THREAD_UNINT);
3990 VSL_UNLOCK();
3991 thread_block(THREAD_CONTINUE_NULL);
3992 } else {
3993 VSL_UNLOCK();
3994 }
3995 default_pager_space_alert(trigger, LO_WAT_ALERT);
3996 ipc_port_release_send(trigger);
3997 dp_pages_free_low_count = 0;
3998 }
3999
4000 clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
4001 thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
4002 }
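/*
 * A minimal standalone sketch of the hysteresis used by the monitor above:
 * the free level must stay on the "plenty free" side of the threshold for
 * more than PF_LATENCY consecutive checks before a notification would be
 * sent, and any dip resets the count.  The sample values and threshold
 * are made-up numbers for illustration.
 */
#if 0	/* standalone illustration; builds on its own, not with this file */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define PF_LATENCY_SKETCH	10	/* intervals before release, as above */

int
main(void)
{
	unsigned maximum_pages_free = 1000, low_count = 0;
	unsigned samples[15] = { 1500, 1500, 900, 1500, 1500, 1500, 1500,
				 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500 };
	bool notified = false;

	for (unsigned i = 0; i < 15; i++) {
		if (samples[i] > maximum_pages_free)
			low_count++;
		else
			low_count = 0;		/* any dip resets the count */

		if (low_count > PF_LATENCY_SKETCH && !notified) {
			printf("check %u: would send LO_WAT_ALERT\n", i);
			notified = true;
		}
	}
	assert(notified);
	return 0;
}
#endif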