]> git.saurik.com Git - apple/xnu.git/blame - osfmk/default_pager/dp_backing_store.c
xnu-344.34.tar.gz
[apple/xnu.git] / osfmk / default_pager / dp_backing_store.c
CommitLineData
1c79356b
A
1/*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
de355530
A
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
1c79356b 11 *
de355530
A
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
1c79356b
A
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
de355530
A
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
1c79356b
A
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22/*
23 * @OSF_COPYRIGHT@
24 */
25/*
26 * Mach Operating System
27 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
28 * All Rights Reserved.
29 *
30 * Permission to use, copy, modify and distribute this software and its
31 * documentation is hereby granted, provided that both the copyright
32 * notice and this permission notice appear in all copies of the
33 * software, derivative works or modified versions, and any portions
34 * thereof, and that both notices appear in supporting documentation.
35 *
36 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
37 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
38 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
39 *
40 * Carnegie Mellon requests users of this software to return to
41 *
42 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
43 * School of Computer Science
44 * Carnegie Mellon University
45 * Pittsburgh PA 15213-3890
46 *
47 * any improvements or extensions that they make and grant Carnegie Mellon
48 * the rights to redistribute these changes.
49 */
50
51/*
52 * Default Pager.
53 * Paging File Management.
54 */
55
0b4e3aa0 56#include <mach/memory_object_control.h>
1c79356b
A
57#include <mach/memory_object_server.h>
58#include "default_pager_internal.h"
59#include <default_pager/default_pager_alerts.h>
60#include <ipc/ipc_port.h>
61#include <ipc/ipc_space.h>
62#include <kern/queue.h>
63#include <kern/counters.h>
64#include <kern/sched_prim.h>
65#include <vm/vm_kern.h>
66#include <vm/vm_pageout.h>
67/* CDY CDY */
68#include <vm/vm_map.h>
69
0b4e3aa0
A
70/*
71 * ALLOC_STRIDE... the maximum number of bytes allocated from
72 * a swap file before moving on to the next swap file... if
73 * all swap files reside on a single disk, this value should
74 * be very large (this is the default assumption)... if the
75 * swap files are spread across multiple disks, then this value
76 * should be small (128 * 1024)...
77 *
78 * This should be determined dynamically in the future
79 */
1c79356b 80
0b4e3aa0 81#define ALLOC_STRIDE (1024 * 1024 * 1024)
1c79356b
A
82int physical_transfer_cluster_count = 0;
83
9bccf70c
A
84#define VM_SUPER_CLUSTER 0x40000
85#define VM_SUPER_PAGES 64
1c79356b
A
86
87/*
88 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
89 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
90 */
91#define VSTRUCT_DEF_CLSHIFT 2
92int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
93int default_pager_clsize = 0;
94
95/* statistics */
0b4e3aa0
A
96unsigned int clustered_writes[VM_SUPER_PAGES+1];
97unsigned int clustered_reads[VM_SUPER_PAGES+1];
1c79356b
A
98
99/*
100 * Globals used for asynchronous paging operations:
101 * vs_async_list: head of list of to-be-completed I/O ops
102 * async_num_queued: number of pages completed, but not yet
103 * processed by async thread.
104 * async_requests_out: number of pages of requests not completed.
105 */
106
107#if 0
108struct vs_async *vs_async_list;
109int async_num_queued;
110int async_requests_out;
111#endif
112
113
114#define VS_ASYNC_REUSE 1
115struct vs_async *vs_async_free_list;
116
117mutex_t default_pager_async_lock; /* Protects globals above */
118
119
120int vs_alloc_async_failed = 0; /* statistics */
121int vs_alloc_async_count = 0; /* statistics */
122struct vs_async *vs_alloc_async(void); /* forward */
123void vs_free_async(struct vs_async *vsa); /* forward */
124
125
126#define VS_ALLOC_ASYNC() vs_alloc_async()
127#define VS_FREE_ASYNC(vsa) vs_free_async(vsa)
128
129#define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock)
130#define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock)
131#define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, \
132 ETAP_IO_DEV_PAGEH)
133#define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock)
134/*
135 * Paging Space Hysteresis triggers and the target notification port
136 *
137 */
138
139unsigned int minimum_pages_remaining = 0;
140unsigned int maximum_pages_free = 0;
141ipc_port_t min_pages_trigger_port = NULL;
142ipc_port_t max_pages_trigger_port = NULL;
143
144boolean_t bs_low = FALSE;
0b4e3aa0 145int backing_store_release_trigger_disable = 0;
1c79356b
A
146
147
148
149/*
150 * Object sizes are rounded up to the next power of 2,
151 * unless they are bigger than a given maximum size.
152 */
153vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */
154
155/*
156 * List of all backing store and segments.
157 */
158struct backing_store_list_head backing_store_list;
159paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS];
160mutex_t paging_segments_lock;
161int paging_segment_max = 0;
162int paging_segment_count = 0;
163int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
164
165
166/*
167 * Total pages free in system
168 * This differs from clusters committed/avail which is a measure of the
169 * over commitment of paging segments to backing store. An idea which is
170 * likely to be deprecated.
171 */
172unsigned int dp_pages_free = 0;
173unsigned int cluster_transfer_minimum = 100;
174
175kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int); /* forward */
0b4e3aa0
A
176kern_return_t ps_read_file (paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */
177
1c79356b
A
178
179default_pager_thread_t *
180get_read_buffer()
181{
182 int i;
183
184 DPT_LOCK(dpt_lock);
185 while(TRUE) {
186 for (i=0; i<default_pager_internal_count; i++) {
187 if(dpt_array[i]->checked_out == FALSE) {
188 dpt_array[i]->checked_out = TRUE;
189 DPT_UNLOCK(dpt_lock);
190 return dpt_array[i];
191 }
192 }
9bccf70c 193 DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
1c79356b
A
194 }
195}
196
197void
198bs_initialize(void)
199{
200 int i;
201
202 /*
203 * List of all backing store.
204 */
205 BSL_LOCK_INIT();
206 queue_init(&backing_store_list.bsl_queue);
207 PSL_LOCK_INIT();
208
209 VS_ASYNC_LOCK_INIT();
210#if VS_ASYNC_REUSE
211 vs_async_free_list = NULL;
212#endif /* VS_ASYNC_REUSE */
213
0b4e3aa0 214 for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
1c79356b
A
215 clustered_writes[i] = 0;
216 clustered_reads[i] = 0;
217 }
218
219}
220
221/*
222 * When things do not quite work out...
223 */
224void bs_no_paging_space(boolean_t); /* forward */
225
226void
227bs_no_paging_space(
228 boolean_t out_of_memory)
229{
1c79356b
A
230
231 if (out_of_memory)
232 dprintf(("*** OUT OF MEMORY ***\n"));
233 panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
234}
235
236void bs_more_space(int); /* forward */
237void bs_commit(int); /* forward */
238
239boolean_t user_warned = FALSE;
240unsigned int clusters_committed = 0;
241unsigned int clusters_available = 0;
242unsigned int clusters_committed_peak = 0;
243
244void
245bs_more_space(
246 int nclusters)
247{
248 BSL_LOCK();
249 /*
250 * Account for new paging space.
251 */
252 clusters_available += nclusters;
253
254 if (clusters_available >= clusters_committed) {
255 if (verbose && user_warned) {
256 printf("%s%s - %d excess clusters now.\n",
257 my_name,
258 "paging space is OK now",
259 clusters_available - clusters_committed);
260 user_warned = FALSE;
261 clusters_committed_peak = 0;
262 }
263 } else {
264 if (verbose && user_warned) {
265 printf("%s%s - still short of %d clusters.\n",
266 my_name,
267 "WARNING: paging space over-committed",
268 clusters_committed - clusters_available);
269 clusters_committed_peak -= nclusters;
270 }
271 }
272 BSL_UNLOCK();
273
274 return;
275}
276
277void
278bs_commit(
279 int nclusters)
280{
281 BSL_LOCK();
282 clusters_committed += nclusters;
283 if (clusters_committed > clusters_available) {
284 if (verbose && !user_warned) {
285 user_warned = TRUE;
286 printf("%s%s - short of %d clusters.\n",
287 my_name,
288 "WARNING: paging space over-committed",
289 clusters_committed - clusters_available);
290 }
291 if (clusters_committed > clusters_committed_peak) {
292 clusters_committed_peak = clusters_committed;
293 }
294 } else {
295 if (verbose && user_warned) {
296 printf("%s%s - was short of up to %d clusters.\n",
297 my_name,
298 "paging space is OK now",
299 clusters_committed_peak - clusters_available);
300 user_warned = FALSE;
301 clusters_committed_peak = 0;
302 }
303 }
304 BSL_UNLOCK();
305
306 return;
307}
308
309int default_pager_info_verbose = 1;
310
311void
312bs_global_info(
313 vm_size_t *totalp,
314 vm_size_t *freep)
315{
316 vm_size_t pages_total, pages_free;
317 paging_segment_t ps;
318 int i;
1c79356b
A
319
320 PSL_LOCK();
321 pages_total = pages_free = 0;
322 for (i = 0; i <= paging_segment_max; i++) {
323 ps = paging_segments[i];
324 if (ps == PAGING_SEGMENT_NULL)
325 continue;
326
327 /*
328 * no need to lock: by the time this data
329 * gets back to any remote requestor it
330 * will be obsolete anyways
331 */
332 pages_total += ps->ps_pgnum;
333 pages_free += ps->ps_clcount << ps->ps_clshift;
334 DEBUG(DEBUG_BS_INTERNAL,
335 ("segment #%d: %d total, %d free\n",
336 i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
337 }
338 *totalp = pages_total;
339 *freep = pages_free;
340 if (verbose && user_warned && default_pager_info_verbose) {
341 if (clusters_available < clusters_committed) {
342 printf("%s %d clusters committed, %d available.\n",
343 my_name,
344 clusters_committed,
345 clusters_available);
346 }
347 }
348 PSL_UNLOCK();
349}
350
351backing_store_t backing_store_alloc(void); /* forward */
352
353backing_store_t
354backing_store_alloc(void)
355{
356 backing_store_t bs;
1c79356b
A
357
358 bs = (backing_store_t) kalloc(sizeof (struct backing_store));
359 if (bs == BACKING_STORE_NULL)
360 panic("backing_store_alloc: no memory");
361
362 BS_LOCK_INIT(bs);
363 bs->bs_port = MACH_PORT_NULL;
364 bs->bs_priority = 0;
365 bs->bs_clsize = 0;
366 bs->bs_pages_total = 0;
367 bs->bs_pages_in = 0;
368 bs->bs_pages_in_fail = 0;
369 bs->bs_pages_out = 0;
370 bs->bs_pages_out_fail = 0;
371
372 return bs;
373}
374
375backing_store_t backing_store_lookup(MACH_PORT_FACE); /* forward */
376
377/* Even in both the component space and external versions of this pager, */
378/* backing_store_lookup will be called from tasks in the application space */
379backing_store_t
380backing_store_lookup(
381 MACH_PORT_FACE port)
382{
383 backing_store_t bs;
384
385/*
386 port is currently backed with a vs structure in the alias field
387 we could create an ISBS alias and a port_is_bs call but frankly
388 I see no reason for the test, the bs->port == port check below
389 will work properly on junk entries.
390
391 if ((port == MACH_PORT_NULL) || port_is_vs(port))
392*/
393 if ((port == MACH_PORT_NULL))
394 return BACKING_STORE_NULL;
395
396 BSL_LOCK();
397 queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
398 bs_links) {
399 BS_LOCK(bs);
400 if (bs->bs_port == port) {
401 BSL_UNLOCK();
402 /* Success, return it locked. */
403 return bs;
404 }
405 BS_UNLOCK(bs);
406 }
407 BSL_UNLOCK();
408 return BACKING_STORE_NULL;
409}
410
411void backing_store_add(backing_store_t); /* forward */
412
413void
414backing_store_add(
415 backing_store_t bs)
416{
417 MACH_PORT_FACE port = bs->bs_port;
418 MACH_PORT_FACE pset = default_pager_default_set;
419 kern_return_t kr = KERN_SUCCESS;
1c79356b
A
420
421 if (kr != KERN_SUCCESS)
422 panic("backing_store_add: add to set");
423
424}
425
426/*
427 * Set up default page shift, but only if not already
428 * set and argument is within range.
429 */
430boolean_t
431bs_set_default_clsize(unsigned int npages)
432{
433 switch(npages){
434 case 1:
435 case 2:
436 case 4:
437 case 8:
438 if (default_pager_clsize == 0) /* if not yet set */
439 vstruct_def_clshift = local_log2(npages);
440 return(TRUE);
441 }
442 return(FALSE);
443}
444
445int bs_get_global_clsize(int clsize); /* forward */
446
447int
448bs_get_global_clsize(
449 int clsize)
450{
451 int i;
0b4e3aa0 452 memory_object_default_t dmm;
1c79356b 453 kern_return_t kr;
1c79356b
A
454
455 /*
456 * Only allow setting of cluster size once. If called
457 * with no cluster size (default), we use the compiled-in default
458 * for the duration. The same cluster size is used for all
459 * paging segments.
460 */
461 if (default_pager_clsize == 0) {
1c79356b
A
462 /*
463 * Keep cluster size in bit shift because it's quicker
464 * arithmetic, and easier to keep at a power of 2.
465 */
466 if (clsize != NO_CLSIZE) {
467 for (i = 0; (1 << i) < clsize; i++);
468 if (i > MAX_CLUSTER_SHIFT)
469 i = MAX_CLUSTER_SHIFT;
470 vstruct_def_clshift = i;
471 }
472 default_pager_clsize = (1 << vstruct_def_clshift);
473
474 /*
475 * Let the user know the new (and definitive) cluster size.
476 */
477 if (verbose)
478 printf("%scluster size = %d page%s\n",
479 my_name, default_pager_clsize,
480 (default_pager_clsize == 1) ? "" : "s");
0b4e3aa0 481
1c79356b
A
482 /*
483 * Let the kernel know too, in case it hasn't used the
484 * default value provided in main() yet.
485 */
0b4e3aa0 486 dmm = default_pager_object;
1c79356b
A
487 clsize = default_pager_clsize * vm_page_size; /* in bytes */
488 kr = host_default_memory_manager(host_priv_self(),
0b4e3aa0 489 &dmm,
1c79356b 490 clsize);
0b4e3aa0
A
491 memory_object_default_deallocate(dmm);
492
1c79356b
A
493 if (kr != KERN_SUCCESS) {
494 panic("bs_get_global_cl_size:host_default_memory_manager");
495 }
0b4e3aa0 496 if (dmm != default_pager_object) {
1c79356b
A
497 panic("bs_get_global_cl_size:there is another default pager");
498 }
499 }
500 ASSERT(default_pager_clsize > 0 &&
501 (default_pager_clsize & (default_pager_clsize - 1)) == 0);
502
503 return default_pager_clsize;
504}
505
506kern_return_t
507default_pager_backing_store_create(
0b4e3aa0
A
508 memory_object_default_t pager,
509 int priority,
510 int clsize, /* in bytes */
511 MACH_PORT_FACE *backing_store)
1c79356b
A
512{
513 backing_store_t bs;
514 MACH_PORT_FACE port;
515 kern_return_t kr;
516 struct vstruct_alias *alias_struct;
1c79356b 517
0b4e3aa0 518 if (pager != default_pager_object)
1c79356b
A
519 return KERN_INVALID_ARGUMENT;
520
521 bs = backing_store_alloc();
522 port = ipc_port_alloc_kernel();
523 ipc_port_make_send(port);
524 assert (port != IP_NULL);
525
526 DEBUG(DEBUG_BS_EXTERNAL,
527 ("priority=%d clsize=%d bs_port=0x%x\n",
528 priority, clsize, (int) backing_store));
529
530 alias_struct = (struct vstruct_alias *)
531 kalloc(sizeof (struct vstruct_alias));
532 if(alias_struct != NULL) {
533 alias_struct->vs = (struct vstruct *)bs;
534 alias_struct->name = ISVS;
535 port->alias = (int) alias_struct;
536 }
537 else {
538 ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
539 kfree((vm_offset_t)bs, sizeof (struct backing_store));
540 return KERN_RESOURCE_SHORTAGE;
541 }
542
543 bs->bs_port = port;
544 if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
545 priority = BS_MAXPRI;
546 else if (priority == BS_NOPRI)
547 priority = BS_MAXPRI;
548 else
549 priority = BS_MINPRI;
550 bs->bs_priority = priority;
551
de355530 552 bs->bs_clsize = bs_get_global_clsize(atop(clsize));
1c79356b
A
553
554 BSL_LOCK();
555 queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
556 bs_links);
557 BSL_UNLOCK();
558
559 backing_store_add(bs);
560
561 *backing_store = port;
562 return KERN_SUCCESS;
563}
564
565kern_return_t
566default_pager_backing_store_info(
567 MACH_PORT_FACE backing_store,
568 backing_store_flavor_t flavour,
569 backing_store_info_t info,
570 mach_msg_type_number_t *size)
571{
572 backing_store_t bs;
573 backing_store_basic_info_t basic;
574 int i;
575 paging_segment_t ps;
576
577 if (flavour != BACKING_STORE_BASIC_INFO ||
578 *size < BACKING_STORE_BASIC_INFO_COUNT)
579 return KERN_INVALID_ARGUMENT;
580
581 basic = (backing_store_basic_info_t)info;
582 *size = BACKING_STORE_BASIC_INFO_COUNT;
583
584 VSTATS_LOCK(&global_stats.gs_lock);
585 basic->pageout_calls = global_stats.gs_pageout_calls;
586 basic->pagein_calls = global_stats.gs_pagein_calls;
587 basic->pages_in = global_stats.gs_pages_in;
588 basic->pages_out = global_stats.gs_pages_out;
589 basic->pages_unavail = global_stats.gs_pages_unavail;
590 basic->pages_init = global_stats.gs_pages_init;
591 basic->pages_init_writes= global_stats.gs_pages_init_writes;
592 VSTATS_UNLOCK(&global_stats.gs_lock);
593
594 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
595 return KERN_INVALID_ARGUMENT;
596
597 basic->bs_pages_total = bs->bs_pages_total;
598 PSL_LOCK();
599 bs->bs_pages_free = 0;
600 for (i = 0; i <= paging_segment_max; i++) {
601 ps = paging_segments[i];
602 if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
603 PS_LOCK(ps);
604 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
605 PS_UNLOCK(ps);
606 }
607 }
608 PSL_UNLOCK();
609 basic->bs_pages_free = bs->bs_pages_free;
610 basic->bs_pages_in = bs->bs_pages_in;
611 basic->bs_pages_in_fail = bs->bs_pages_in_fail;
612 basic->bs_pages_out = bs->bs_pages_out;
613 basic->bs_pages_out_fail= bs->bs_pages_out_fail;
614
615 basic->bs_priority = bs->bs_priority;
de355530 616 basic->bs_clsize = ptoa(bs->bs_clsize); /* in bytes */
1c79356b
A
617
618 BS_UNLOCK(bs);
619
620 return KERN_SUCCESS;
621}
622
623int ps_delete(paging_segment_t); /* forward */
624
625int
626ps_delete(
627 paging_segment_t ps)
628{
629 vstruct_t vs;
630 kern_return_t error = KERN_SUCCESS;
631 int vs_count;
632
633 VSL_LOCK(); /* get the lock on the list of vs's */
634
635 /* The lock relationship and sequence is farily complicated */
636 /* this code looks at a live list, locking and unlocking the list */
637 /* as it traverses it. It depends on the locking behavior of */
638 /* default_pager_no_senders. no_senders always locks the vstruct */
639 /* targeted for removal before locking the vstruct list. However */
640 /* it will remove that member of the list without locking its */
641 /* neighbors. We can be sure when we hold a lock on a vstruct */
642 /* it cannot be removed from the list but we must hold the list */
643 /* lock to be sure that its pointers to its neighbors are valid. */
644 /* Also, we can hold off destruction of a vstruct when the list */
645 /* lock and the vs locks are not being held by bumping the */
646 /* vs_async_pending count. */
647
0b4e3aa0
A
648
649 while(backing_store_release_trigger_disable != 0) {
9bccf70c 650 VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
0b4e3aa0
A
651 }
652
1c79356b
A
653 /* we will choose instead to hold a send right */
654 vs_count = vstruct_list.vsl_count;
655 vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
656 if(vs == (vstruct_t)&vstruct_list) {
657 VSL_UNLOCK();
658 return KERN_SUCCESS;
659 }
660 VS_LOCK(vs);
661 vs_async_wait(vs); /* wait for any pending async writes */
662 if ((vs_count != 0) && (vs != NULL))
663 vs->vs_async_pending += 1; /* hold parties calling */
664 /* vs_async_wait */
665 VS_UNLOCK(vs);
666 VSL_UNLOCK();
667 while((vs_count != 0) && (vs != NULL)) {
668 /* We take the count of AMO's before beginning the */
669 /* transfer of of the target segment. */
670 /* We are guaranteed that the target segment cannot get */
671 /* more users. We also know that queue entries are */
672 /* made at the back of the list. If some of the entries */
673 /* we would check disappear while we are traversing the */
674 /* list then we will either check new entries which */
675 /* do not have any backing store in the target segment */
676 /* or re-check old entries. This might not be optimal */
677 /* but it will always be correct. The alternative is to */
678 /* take a snapshot of the list. */
679 vstruct_t next_vs;
680
681 if(dp_pages_free < cluster_transfer_minimum)
682 error = KERN_FAILURE;
683 else {
684 vm_object_t transfer_object;
0b4e3aa0 685 int count;
1c79356b
A
686 upl_t upl;
687
688 transfer_object = vm_object_allocate(VM_SUPER_CLUSTER);
0b4e3aa0
A
689 count = 0;
690 error = vm_object_upl_request(transfer_object,
691 (vm_object_offset_t)0, VM_SUPER_CLUSTER,
692 &upl, NULL, &count,
693 UPL_NO_SYNC | UPL_CLEAN_IN_PLACE
694 | UPL_SET_INTERNAL);
1c79356b 695 if(error == KERN_SUCCESS) {
1c79356b
A
696 error = ps_vstruct_transfer_from_segment(
697 vs, ps, upl);
0b4e3aa0
A
698 upl_commit(upl, NULL);
699 upl_deallocate(upl);
1c79356b 700 } else {
1c79356b
A
701 error = KERN_FAILURE;
702 }
9bccf70c 703 vm_object_deallocate(transfer_object);
1c79356b
A
704 }
705 if(error) {
706 VS_LOCK(vs);
707 vs->vs_async_pending -= 1; /* release vs_async_wait */
0b4e3aa0
A
708 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
709 vs->vs_waiting_async = FALSE;
1c79356b 710 VS_UNLOCK(vs);
0b4e3aa0 711 thread_wakeup(&vs->vs_async_pending);
1c79356b
A
712 } else {
713 VS_UNLOCK(vs);
714 }
715 return KERN_FAILURE;
716 }
717
718 VSL_LOCK();
0b4e3aa0
A
719
720 while(backing_store_release_trigger_disable != 0) {
9bccf70c
A
721 VSL_SLEEP(&backing_store_release_trigger_disable,
722 THREAD_UNINT);
0b4e3aa0
A
723 }
724
1c79356b
A
725 next_vs = (vstruct_t) queue_next(&(vs->vs_links));
726 if((next_vs != (vstruct_t)&vstruct_list) &&
727 (vs != next_vs) && (vs_count != 1)) {
728 VS_LOCK(next_vs);
729 vs_async_wait(next_vs); /* wait for any */
730 /* pending async writes */
731 next_vs->vs_async_pending += 1; /* hold parties */
732 /* calling vs_async_wait */
733 VS_UNLOCK(next_vs);
734 }
735 VSL_UNLOCK();
736 VS_LOCK(vs);
737 vs->vs_async_pending -= 1;
0b4e3aa0
A
738 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
739 vs->vs_waiting_async = FALSE;
1c79356b 740 VS_UNLOCK(vs);
0b4e3aa0 741 thread_wakeup(&vs->vs_async_pending);
1c79356b
A
742 } else {
743 VS_UNLOCK(vs);
744 }
745 if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
746 vs = NULL;
747 else
748 vs = next_vs;
749 vs_count--;
750 }
751 return KERN_SUCCESS;
752}
753
754
755kern_return_t
756default_pager_backing_store_delete(
757 MACH_PORT_FACE backing_store)
758{
759 backing_store_t bs;
760 int i;
761 paging_segment_t ps;
762 int error;
763 int interim_pages_removed = 0;
764 kern_return_t kr;
1c79356b
A
765
766 if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
767 return KERN_INVALID_ARGUMENT;
768
769#if 0
770 /* not implemented */
771 BS_UNLOCK(bs);
772 return KERN_FAILURE;
773#endif
774
775 restart:
776 PSL_LOCK();
777 error = KERN_SUCCESS;
778 for (i = 0; i <= paging_segment_max; i++) {
779 ps = paging_segments[i];
780 if (ps != PAGING_SEGMENT_NULL &&
781 ps->ps_bs == bs &&
782 ! ps->ps_going_away) {
783 PS_LOCK(ps);
784 /* disable access to this segment */
785 ps->ps_going_away = TRUE;
786 PS_UNLOCK(ps);
787 /*
788 * The "ps" segment is "off-line" now,
789 * we can try and delete it...
790 */
791 if(dp_pages_free < (cluster_transfer_minimum
792 + ps->ps_pgcount)) {
793 error = KERN_FAILURE;
794 PSL_UNLOCK();
795 }
796 else {
797 /* remove all pages associated with the */
798 /* segment from the list of free pages */
799 /* when transfer is through, all target */
800 /* segment pages will appear to be free */
801
802 dp_pages_free -= ps->ps_pgcount;
803 interim_pages_removed += ps->ps_pgcount;
804 PSL_UNLOCK();
805 error = ps_delete(ps);
806 }
807 if (error != KERN_SUCCESS) {
808 /*
809 * We couldn't delete the segment,
810 * probably because there's not enough
811 * virtual memory left.
812 * Re-enable all the segments.
813 */
814 PSL_LOCK();
815 break;
816 }
817 goto restart;
818 }
819 }
820
821 if (error != KERN_SUCCESS) {
822 for (i = 0; i <= paging_segment_max; i++) {
823 ps = paging_segments[i];
824 if (ps != PAGING_SEGMENT_NULL &&
825 ps->ps_bs == bs &&
826 ps->ps_going_away) {
827 PS_LOCK(ps);
828 /* re-enable access to this segment */
829 ps->ps_going_away = FALSE;
830 PS_UNLOCK(ps);
831 }
832 }
833 dp_pages_free += interim_pages_removed;
834 PSL_UNLOCK();
835 BS_UNLOCK(bs);
836 return error;
837 }
838
839 for (i = 0; i <= paging_segment_max; i++) {
840 ps = paging_segments[i];
841 if (ps != PAGING_SEGMENT_NULL &&
842 ps->ps_bs == bs) {
843 if(ps->ps_going_away) {
844 paging_segments[i] = PAGING_SEGMENT_NULL;
845 paging_segment_count--;
846 PS_LOCK(ps);
847 kfree((vm_offset_t)ps->ps_bmap,
848 RMAPSIZE(ps->ps_ncls));
849 kfree((vm_offset_t)ps, sizeof *ps);
850 }
851 }
852 }
853
854 /* Scan the entire ps array separately to make certain we find the */
855 /* proper paging_segment_max */
856 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
857 if(paging_segments[i] != PAGING_SEGMENT_NULL)
858 paging_segment_max = i;
859 }
860
861 PSL_UNLOCK();
862
863 /*
864 * All the segments have been deleted.
865 * We can remove the backing store.
866 */
867
868 /*
869 * Disable lookups of this backing store.
870 */
871 if((void *)bs->bs_port->alias != NULL)
872 kfree((vm_offset_t) bs->bs_port->alias,
873 sizeof (struct vstruct_alias));
1c79356b
A
874 ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
875 bs->bs_port = MACH_PORT_NULL;
876 BS_UNLOCK(bs);
877
878 /*
879 * Remove backing store from backing_store list.
880 */
881 BSL_LOCK();
882 queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
883 bs_links);
884 BSL_UNLOCK();
885
886 /*
887 * Free the backing store structure.
888 */
889 kfree((vm_offset_t)bs, sizeof *bs);
890
891 return KERN_SUCCESS;
892}
893
894int ps_enter(paging_segment_t); /* forward */
895
896int
897ps_enter(
898 paging_segment_t ps)
899{
900 int i;
901
902 PSL_LOCK();
903
904 for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
905 if (paging_segments[i] == PAGING_SEGMENT_NULL)
906 break;
907 }
908
909 if (i < MAX_NUM_PAGING_SEGMENTS) {
910 paging_segments[i] = ps;
911 if (i > paging_segment_max)
912 paging_segment_max = i;
913 paging_segment_count++;
914 if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
915 (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
916 ps_select_array[ps->ps_bs->bs_priority] = 0;
917 i = 0;
918 } else {
919 PSL_UNLOCK();
920 return KERN_RESOURCE_SHORTAGE;
921 }
922
923 PSL_UNLOCK();
924 return i;
925}
926
927#ifdef DEVICE_PAGING
928kern_return_t
929default_pager_add_segment(
930 MACH_PORT_FACE backing_store,
931 MACH_PORT_FACE device,
932 recnum_t offset,
933 recnum_t count,
934 int record_size)
935{
936 backing_store_t bs;
937 paging_segment_t ps;
938 int i;
939 int error;
1c79356b
A
940
941 if ((bs = backing_store_lookup(backing_store))
942 == BACKING_STORE_NULL)
943 return KERN_INVALID_ARGUMENT;
944
945 PSL_LOCK();
946 for (i = 0; i <= paging_segment_max; i++) {
947 ps = paging_segments[i];
948 if (ps == PAGING_SEGMENT_NULL)
949 continue;
950
951 /*
952 * Check for overlap on same device.
953 */
954 if (!(ps->ps_device != device
955 || offset >= ps->ps_offset + ps->ps_recnum
956 || offset + count <= ps->ps_offset)) {
957 PSL_UNLOCK();
958 BS_UNLOCK(bs);
959 return KERN_INVALID_ARGUMENT;
960 }
961 }
962 PSL_UNLOCK();
963
964 /*
965 * Set up the paging segment
966 */
967 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
968 if (ps == PAGING_SEGMENT_NULL) {
969 BS_UNLOCK(bs);
970 return KERN_RESOURCE_SHORTAGE;
971 }
972
973 ps->ps_segtype = PS_PARTITION;
974 ps->ps_device = device;
975 ps->ps_offset = offset;
976 ps->ps_record_shift = local_log2(vm_page_size / record_size);
977 ps->ps_recnum = count;
978 ps->ps_pgnum = count >> ps->ps_record_shift;
979
980 ps->ps_pgcount = ps->ps_pgnum;
981 ps->ps_clshift = local_log2(bs->bs_clsize);
982 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
983 ps->ps_hint = 0;
984
985 PS_LOCK_INIT(ps);
986 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
987 if (!ps->ps_bmap) {
988 kfree((vm_offset_t)ps, sizeof *ps);
989 BS_UNLOCK(bs);
990 return KERN_RESOURCE_SHORTAGE;
991 }
992 for (i = 0; i < ps->ps_ncls; i++) {
993 clrbit(ps->ps_bmap, i);
994 }
995
996 ps->ps_going_away = FALSE;
997 ps->ps_bs = bs;
998
999 if ((error = ps_enter(ps)) != 0) {
1000 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1001 kfree((vm_offset_t)ps, sizeof *ps);
1002 BS_UNLOCK(bs);
1003 return KERN_RESOURCE_SHORTAGE;
1004 }
1005
1006 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1007 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1008 BS_UNLOCK(bs);
1009
1010 PSL_LOCK();
1011 dp_pages_free += ps->ps_pgcount;
1012 PSL_UNLOCK();
1013
1014 bs_more_space(ps->ps_clcount);
1015
1016 DEBUG(DEBUG_BS_INTERNAL,
1017 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1018 device, offset, count, record_size,
1019 ps->ps_record_shift, ps->ps_pgnum));
1020
1021 return KERN_SUCCESS;
1022}
1023
1024boolean_t
1025bs_add_device(
1026 char *dev_name,
1027 MACH_PORT_FACE master)
1028{
1029 security_token_t null_security_token = {
1030 { 0, 0 }
1031 };
1032 MACH_PORT_FACE device;
1033 int info[DEV_GET_SIZE_COUNT];
1034 mach_msg_type_number_t info_count;
1035 MACH_PORT_FACE bs = MACH_PORT_NULL;
1036 unsigned int rec_size;
1037 recnum_t count;
1038 int clsize;
1039 MACH_PORT_FACE reply_port;
1040
1041 if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1042 null_security_token, dev_name, &device))
1043 return FALSE;
1044
1045 info_count = DEV_GET_SIZE_COUNT;
1046 if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1047 rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1048 count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
1049 clsize = bs_get_global_clsize(0);
1050 if (!default_pager_backing_store_create(
0b4e3aa0 1051 default_pager_object,
1c79356b
A
1052 DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1053 (clsize * vm_page_size),
1054 &bs)) {
1055 if (!default_pager_add_segment(bs, device,
1056 0, count, rec_size)) {
1057 return TRUE;
1058 }
1059 ipc_port_release_receive(bs);
1060 }
1061 }
1062
1063 ipc_port_release_send(device);
1064 return FALSE;
1065}
1066#endif /* DEVICE_PAGING */
1067
1068#if VS_ASYNC_REUSE
1069
1070struct vs_async *
1071vs_alloc_async(void)
1072{
1073 struct vs_async *vsa;
1074 MACH_PORT_FACE reply_port;
1075 kern_return_t kr;
1076
1077 VS_ASYNC_LOCK();
1078 if (vs_async_free_list == NULL) {
1079 VS_ASYNC_UNLOCK();
1080 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1081 if (vsa != NULL) {
1082 /*
1083 * Try allocating a reply port named after the
1084 * address of the vs_async structure.
1085 */
1086 struct vstruct_alias *alias_struct;
1087
1088 reply_port = ipc_port_alloc_kernel();
1089 alias_struct = (struct vstruct_alias *)
1090 kalloc(sizeof (struct vstruct_alias));
1091 if(alias_struct != NULL) {
1092 alias_struct->vs = (struct vstruct *)vsa;
1093 alias_struct->name = ISVS;
1094 reply_port->alias = (int) alias_struct;
1095 vsa->reply_port = reply_port;
1096 vs_alloc_async_count++;
1097 }
1098 else {
1099 vs_alloc_async_failed++;
1100 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1101 (reply_port));
1102 kfree((vm_offset_t)vsa,
1103 sizeof (struct vs_async));
1104 vsa = NULL;
1105 }
1106 }
1107 } else {
1108 vsa = vs_async_free_list;
1109 vs_async_free_list = vs_async_free_list->vsa_next;
1110 VS_ASYNC_UNLOCK();
1111 }
1112
1113 return vsa;
1114}
1115
1116void
1117vs_free_async(
1118 struct vs_async *vsa)
1119{
1120 VS_ASYNC_LOCK();
1121 vsa->vsa_next = vs_async_free_list;
1122 vs_async_free_list = vsa;
1123 VS_ASYNC_UNLOCK();
1124}
1125
1126#else /* VS_ASYNC_REUSE */
1127
1128struct vs_async *
1129vs_alloc_async(void)
1130{
1131 struct vs_async *vsa;
1132 MACH_PORT_FACE reply_port;
1133 kern_return_t kr;
1134
1135 vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1136 if (vsa != NULL) {
1137 /*
1138 * Try allocating a reply port named after the
1139 * address of the vs_async structure.
1140 */
1141 reply_port = ipc_port_alloc_kernel();
1142 alias_struct = (vstruct_alias *)
1143 kalloc(sizeof (struct vstruct_alias));
1144 if(alias_struct != NULL) {
1145 alias_struct->vs = reply_port;
1146 alias_struct->name = ISVS;
1147 reply_port->alias = (int) vsa;
1148 vsa->reply_port = reply_port;
1149 vs_alloc_async_count++;
1150 }
1151 else {
1152 vs_alloc_async_failed++;
1153 ipc_port_dealloc_kernel((MACH_PORT_FACE)
1154 (reply_port));
1155 kfree((vm_offset_t) vsa,
1156 sizeof (struct vs_async));
1157 vsa = NULL;
1158 }
1159 }
1160
1161 return vsa;
1162}
1163
1164void
1165vs_free_async(
1166 struct vs_async *vsa)
1167{
1c79356b
A
1168 MACH_PORT_FACE reply_port;
1169 kern_return_t kr;
1170
1171 reply_port = vsa->reply_port;
1172 kfree((vm_offset_t) reply_port->alias, sizeof (struct vstuct_alias));
1173 kfree((vm_offset_t) vsa, sizeof (struct vs_async));
1c79356b
A
1174 ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1175#if 0
1176 VS_ASYNC_LOCK();
1177 vs_alloc_async_count--;
1178 VS_ASYNC_UNLOCK();
1179#endif
1180}
1181
1182#endif /* VS_ASYNC_REUSE */
1183
0b4e3aa0
A
1184zone_t vstruct_zone;
1185
1c79356b
A
1186vstruct_t
1187ps_vstruct_create(
1188 vm_size_t size)
1189{
1190 vstruct_t vs;
1191 int i;
1c79356b 1192
0b4e3aa0 1193 vs = (vstruct_t) zalloc(vstruct_zone);
1c79356b
A
1194 if (vs == VSTRUCT_NULL) {
1195 return VSTRUCT_NULL;
1196 }
1197
1198 VS_LOCK_INIT(vs);
1199
1200 /*
1201 * The following fields will be provided later.
1202 */
0b4e3aa0
A
1203 vs->vs_mem_obj = NULL;
1204 vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1205 vs->vs_references = 1;
1c79356b 1206 vs->vs_seqno = 0;
1c79356b
A
1207
1208#ifdef MACH_KERNEL
1209 vs->vs_waiting_seqno = FALSE;
1210 vs->vs_waiting_read = FALSE;
1211 vs->vs_waiting_write = FALSE;
1c79356b
A
1212 vs->vs_waiting_async = FALSE;
1213#else
1214 mutex_init(&vs->vs_waiting_seqno, ETAP_DPAGE_VSSEQNO);
1215 mutex_init(&vs->vs_waiting_read, ETAP_DPAGE_VSREAD);
1216 mutex_init(&vs->vs_waiting_write, ETAP_DPAGE_VSWRITE);
1217 mutex_init(&vs->vs_waiting_refs, ETAP_DPAGE_VSREFS);
1218 mutex_init(&vs->vs_waiting_async, ETAP_DPAGE_VSASYNC);
1219#endif
1220
1221 vs->vs_readers = 0;
1222 vs->vs_writers = 0;
1223
1224 vs->vs_errors = 0;
1225
1226 vs->vs_clshift = local_log2(bs_get_global_clsize(0));
de355530 1227 vs->vs_size = ((atop(round_page(size)) - 1) >> vs->vs_clshift) + 1;
1c79356b
A
1228 vs->vs_async_pending = 0;
1229
1230 /*
1231 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1232 * depending on the size of the memory object.
1233 */
1234 if (INDIRECT_CLMAP(vs->vs_size)) {
1235 vs->vs_imap = (struct vs_map **)
1236 kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1237 vs->vs_indirect = TRUE;
1238 } else {
1239 vs->vs_dmap = (struct vs_map *)
1240 kalloc(CLMAP_SIZE(vs->vs_size));
1241 vs->vs_indirect = FALSE;
1242 }
1243 vs->vs_xfer_pending = FALSE;
1244 DEBUG(DEBUG_VS_INTERNAL,
1245 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1246
1247 /*
1248 * Check to see that we got the space.
1249 */
1250 if (!vs->vs_dmap) {
1251 kfree((vm_offset_t)vs, sizeof *vs);
1252 return VSTRUCT_NULL;
1253 }
1254
1255 /*
1256 * Zero the indirect pointers, or clear the direct pointers.
1257 */
1258 if (vs->vs_indirect)
1259 memset(vs->vs_imap, 0,
1260 INDIRECT_CLMAP_SIZE(vs->vs_size));
1261 else
1262 for (i = 0; i < vs->vs_size; i++)
1263 VSM_CLR(vs->vs_dmap[i]);
1264
1265 VS_MAP_LOCK_INIT(vs);
1266
1267 bs_commit(vs->vs_size);
1268
1269 return vs;
1270}
1271
1272paging_segment_t ps_select_segment(int, int *); /* forward */
1273
1274paging_segment_t
1275ps_select_segment(
1276 int shift,
1277 int *psindex)
1278{
1279 paging_segment_t ps;
1280 int i;
1281 int j;
1c79356b
A
1282
1283 /*
1284 * Optimize case where there's only one segment.
1285 * paging_segment_max will index the one and only segment.
1286 */
1287
1288 PSL_LOCK();
1289 if (paging_segment_count == 1) {
1290 paging_segment_t lps; /* used to avoid extra PS_UNLOCK */
0b4e3aa0 1291 ipc_port_t trigger = IP_NULL;
1c79356b
A
1292
1293 ps = paging_segments[paging_segment_max];
1294 *psindex = paging_segment_max;
1295 PS_LOCK(ps);
1296 if (ps->ps_going_away) {
1297 /* this segment is being turned off */
1298 lps = PAGING_SEGMENT_NULL;
1299 } else {
1300 ASSERT(ps->ps_clshift >= shift);
1301 if (ps->ps_clcount) {
1302 ps->ps_clcount--;
1303 dp_pages_free -= 1 << ps->ps_clshift;
1304 if(min_pages_trigger_port &&
1305 (dp_pages_free < minimum_pages_remaining)) {
0b4e3aa0 1306 trigger = min_pages_trigger_port;
1c79356b
A
1307 min_pages_trigger_port = NULL;
1308 bs_low = TRUE;
1309 }
1310 lps = ps;
1311 } else
1312 lps = PAGING_SEGMENT_NULL;
1313 }
1314 PS_UNLOCK(ps);
1315 PSL_UNLOCK();
0b4e3aa0
A
1316
1317 if (trigger != IP_NULL) {
1318 default_pager_space_alert(trigger, HI_WAT_ALERT);
1319 ipc_port_release_send(trigger);
1320 }
1c79356b
A
1321 return lps;
1322 }
1323
1324 if (paging_segment_count == 0) {
1325 PSL_UNLOCK();
1326 return PAGING_SEGMENT_NULL;
1327 }
1328
1329 for (i = BS_MAXPRI;
1330 i >= BS_MINPRI; i--) {
1331 int start_index;
1332
1333 if ((ps_select_array[i] == BS_NOPRI) ||
1334 (ps_select_array[i] == BS_FULLPRI))
1335 continue;
1336 start_index = ps_select_array[i];
1337
1338 if(!(paging_segments[start_index])) {
1339 j = start_index+1;
1340 physical_transfer_cluster_count = 0;
1341 }
0b4e3aa0 1342 else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1c79356b 1343 (((paging_segments[start_index])->ps_clshift)
0b4e3aa0 1344 + vm_page_shift))) {
1c79356b
A
1345 physical_transfer_cluster_count = 0;
1346 j = start_index + 1;
1347 } else {
1348 physical_transfer_cluster_count+=1;
1349 j = start_index;
1350 if(start_index == 0)
1351 start_index = paging_segment_max;
1352 else
1353 start_index = start_index - 1;
1354 }
1355
1356 while (1) {
1357 if (j > paging_segment_max)
1358 j = 0;
1359 if ((ps = paging_segments[j]) &&
1360 (ps->ps_bs->bs_priority == i)) {
1361 /*
1362 * Force the ps cluster size to be
1363 * >= that of the vstruct.
1364 */
1365 PS_LOCK(ps);
1366 if (ps->ps_going_away) {
1367 /* this segment is being turned off */
1368 } else if ((ps->ps_clcount) &&
1369 (ps->ps_clshift >= shift)) {
0b4e3aa0
A
1370 ipc_port_t trigger = IP_NULL;
1371
1c79356b
A
1372 ps->ps_clcount--;
1373 dp_pages_free -= 1 << ps->ps_clshift;
1374 if(min_pages_trigger_port &&
1375 (dp_pages_free <
1376 minimum_pages_remaining)) {
0b4e3aa0 1377 trigger = min_pages_trigger_port;
1c79356b
A
1378 min_pages_trigger_port = NULL;
1379 }
1380 PS_UNLOCK(ps);
1381 /*
1382 * found one, quit looking.
1383 */
1384 ps_select_array[i] = j;
1385 PSL_UNLOCK();
0b4e3aa0
A
1386
1387 if (trigger != IP_NULL) {
1388 default_pager_space_alert(
1389 trigger,
1390 HI_WAT_ALERT);
1391 ipc_port_release_send(trigger);
1392 }
1c79356b
A
1393 *psindex = j;
1394 return ps;
1395 }
1396 PS_UNLOCK(ps);
1397 }
1398 if (j == start_index) {
1399 /*
1400 * none at this priority -- mark it full
1401 */
1402 ps_select_array[i] = BS_FULLPRI;
1403 break;
1404 }
1405 j++;
1406 }
1407 }
1408 PSL_UNLOCK();
1409 return PAGING_SEGMENT_NULL;
1410}
1411
1412vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1413
1414vm_offset_t
1415ps_allocate_cluster(
1416 vstruct_t vs,
1417 int *psindex,
1418 paging_segment_t use_ps)
1419{
1420 int byte_num;
1421 int bit_num = 0;
1422 paging_segment_t ps;
1423 vm_offset_t cluster;
0b4e3aa0 1424 ipc_port_t trigger = IP_NULL;
1c79356b
A
1425
1426 /*
1427 * Find best paging segment.
1428 * ps_select_segment will decrement cluster count on ps.
1429 * Must pass cluster shift to find the most appropriate segment.
1430 */
1431 /* NOTE: The addition of paging segment delete capability threatened
1432 * to seriously complicate the treatment of paging segments in this
1433 * module and the ones that call it (notably ps_clmap), because of the
1434 * difficulty in assuring that the paging segment would continue to
1435 * exist between being unlocked and locked. This was
1436 * avoided because all calls to this module are based in either
1437 * dp_memory_object calls which rely on the vs lock, or by
1438 * the transfer function which is part of the segment delete path.
1439 * The transfer function which is part of paging segment delete is
1440 * protected from multiple callers by the backing store lock.
1441 * The paging segment delete function treats mappings to a paging
1442 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1443 * while data is transferred to the remaining segments. This is in
1444 * line with the view that incomplete or in-transition mappings between
1445 * data, a vstruct, and backing store are protected by the vs lock.
1446 * This and the ordering of the paging segment "going_away" bit setting
1447 * protects us.
1448 */
1449 if (use_ps != PAGING_SEGMENT_NULL) {
1450 ps = use_ps;
1451 PSL_LOCK();
1452 PS_LOCK(ps);
1453 ps->ps_clcount--;
1454 dp_pages_free -= 1 << ps->ps_clshift;
1c79356b
A
1455 if(min_pages_trigger_port &&
1456 (dp_pages_free < minimum_pages_remaining)) {
0b4e3aa0 1457 trigger = min_pages_trigger_port;
1c79356b
A
1458 min_pages_trigger_port = NULL;
1459 }
0b4e3aa0 1460 PSL_UNLOCK();
1c79356b 1461 PS_UNLOCK(ps);
0b4e3aa0
A
1462 if (trigger != IP_NULL) {
1463 default_pager_space_alert(trigger, HI_WAT_ALERT);
1464 ipc_port_release_send(trigger);
1465 }
1466
1c79356b
A
1467 } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1468 PAGING_SEGMENT_NULL) {
1469#if 0
1470 bs_no_paging_space(TRUE);
1471#endif
1472#if 0
1473 if (verbose)
1474#endif
1475 dprintf(("no space in available paging segments; "
1476 "swapon suggested\n"));
1477 /* the count got off maybe, reset to zero */
0b4e3aa0 1478 PSL_LOCK();
1c79356b
A
1479 dp_pages_free = 0;
1480 if(min_pages_trigger_port) {
0b4e3aa0 1481 trigger = min_pages_trigger_port;
1c79356b
A
1482 min_pages_trigger_port = NULL;
1483 bs_low = TRUE;
1484 }
0b4e3aa0
A
1485 PSL_UNLOCK();
1486 if (trigger != IP_NULL) {
1487 default_pager_space_alert(trigger, HI_WAT_ALERT);
1488 ipc_port_release_send(trigger);
1489 }
1c79356b
A
1490 return (vm_offset_t) -1;
1491 }
1492 ASSERT(ps->ps_clcount != 0);
1493
1494 /*
1495 * Look for an available cluster. At the end of the loop,
1496 * byte_num is the byte offset and bit_num is the bit offset of the
1497 * first zero bit in the paging segment bitmap.
1498 */
1499 PS_LOCK(ps);
1500 byte_num = ps->ps_hint;
1501 for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1502 if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1503 for (bit_num = 0; bit_num < NBBY; bit_num++) {
1504 if (isclr((ps->ps_bmap + byte_num), bit_num))
1505 break;
1506 }
1507 ASSERT(bit_num != NBBY);
1508 break;
1509 }
1510 }
1511 ps->ps_hint = byte_num;
1512 cluster = (byte_num*NBBY) + bit_num;
1513
1514 /* Space was reserved, so this must be true */
1515 ASSERT(cluster < ps->ps_ncls);
1516
1517 setbit(ps->ps_bmap, cluster);
1518 PS_UNLOCK(ps);
1519
1520 return cluster;
1521}
1522
1523void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */
1524
1525void
1526ps_deallocate_cluster(
1527 paging_segment_t ps,
1528 vm_offset_t cluster)
1529{
0b4e3aa0 1530 ipc_port_t trigger = IP_NULL;
1c79356b
A
1531
1532 if (cluster >= (vm_offset_t) ps->ps_ncls)
1533 panic("ps_deallocate_cluster: Invalid cluster number");
1534
1535 /*
1536 * Lock the paging segment, clear the cluster's bitmap and increment the
1537 * number of free cluster.
1538 */
1539 PSL_LOCK();
1540 PS_LOCK(ps);
1541 clrbit(ps->ps_bmap, cluster);
1542 ++ps->ps_clcount;
1543 dp_pages_free += 1 << ps->ps_clshift;
0b4e3aa0
A
1544 if(max_pages_trigger_port
1545 && (backing_store_release_trigger_disable == 0)
1546 && (dp_pages_free > maximum_pages_free)) {
1547 trigger = max_pages_trigger_port;
1c79356b
A
1548 max_pages_trigger_port = NULL;
1549 }
0b4e3aa0 1550 PSL_UNLOCK();
1c79356b
A
1551
1552 /*
1553 * Move the hint down to the freed cluster if it is
1554 * less than the current hint.
1555 */
1556 if ((cluster/NBBY) < ps->ps_hint) {
1557 ps->ps_hint = (cluster/NBBY);
1558 }
1559
1560 PS_UNLOCK(ps);
1561
1562 /*
1563 * If we're freeing space on a full priority, reset the array.
1564 */
1565 PSL_LOCK();
1566 if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1567 ps_select_array[ps->ps_bs->bs_priority] = 0;
1568 PSL_UNLOCK();
1569
0b4e3aa0
A
1570 if (trigger != IP_NULL) {
1571 VSL_LOCK();
1572 if(backing_store_release_trigger_disable != 0) {
1573 assert_wait((event_t)
1574 &backing_store_release_trigger_disable,
1575 THREAD_UNINT);
1576 VSL_UNLOCK();
9bccf70c 1577 thread_block(THREAD_CONTINUE_NULL);
0b4e3aa0
A
1578 } else {
1579 VSL_UNLOCK();
1580 }
1581 default_pager_space_alert(trigger, LO_WAT_ALERT);
1582 ipc_port_release_send(trigger);
1583 }
1584
1c79356b
A
1585 return;
1586}
1587
1588void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */
1589
1590void
1591ps_dealloc_vsmap(
1592 struct vs_map *vsmap,
1593 vm_size_t size)
1594{
1595 int i;
1596 for (i = 0; i < size; i++)
1597 if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i]))
1598 ps_deallocate_cluster(VSM_PS(vsmap[i]),
1599 VSM_CLOFF(vsmap[i]));
1600}
1601
1602void
1603ps_vstruct_dealloc(
1604 vstruct_t vs)
1605{
1606 int i;
1607 spl_t s;
1c79356b
A
1608
1609 VS_MAP_LOCK(vs);
1610
1611 /*
1612 * If this is an indirect structure, then we walk through the valid
1613 * (non-zero) indirect pointers and deallocate the clusters
1614 * associated with each used map entry (via ps_dealloc_vsmap).
1615 * When all of the clusters in an indirect block have been
1616 * freed, we deallocate the block. When all of the indirect
1617 * blocks have been deallocated we deallocate the memory
1618 * holding the indirect pointers.
1619 */
1620 if (vs->vs_indirect) {
1621 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1622 if (vs->vs_imap[i] != NULL) {
1623 ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1624 kfree((vm_offset_t)vs->vs_imap[i],
1625 CLMAP_THRESHOLD);
1626 }
1627 }
1628 kfree((vm_offset_t)vs->vs_imap,
1629 INDIRECT_CLMAP_SIZE(vs->vs_size));
1630 } else {
1631 /*
1632 * Direct map. Free used clusters, then memory.
1633 */
1634 ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1635 kfree((vm_offset_t)vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1636 }
1637 VS_MAP_UNLOCK(vs);
1638
1639 bs_commit(- vs->vs_size);
1640
0b4e3aa0 1641 zfree(vstruct_zone, (vm_offset_t)vs);
1c79356b
A
1642}
1643
1644int ps_map_extend(vstruct_t, int); /* forward */
1645
1646int ps_map_extend(
1647 vstruct_t vs,
1648 int new_size)
1649{
1650 struct vs_map **new_imap;
1651 struct vs_map *new_dmap = NULL;
1652 int newdsize;
1653 int i;
1654 void *old_map = NULL;
1655 int old_map_size = 0;
1656
1657 if (vs->vs_size >= new_size) {
1658 /*
1659 * Someone has already done the work.
1660 */
1661 return 0;
1662 }
1663
1664 /*
1665 * If the new size extends into the indirect range, then we have one
1666 * of two cases: we are going from indirect to indirect, or we are
1667 * going from direct to indirect. If we are going from indirect to
1668 * indirect, then it is possible that the new size will fit in the old
1669 * indirect map. If this is the case, then just reset the size of the
1670 * vstruct map and we are done. If the new size will not
1671 * fit into the old indirect map, then we have to allocate a new
1672 * indirect map and copy the old map pointers into this new map.
1673 *
1674 * If we are going from direct to indirect, then we have to allocate a
1675 * new indirect map and copy the old direct pages into the first
1676 * indirect page of the new map.
1677 * NOTE: allocating memory here is dangerous, as we're in the
1678 * pageout path.
1679 */
1680 if (INDIRECT_CLMAP(new_size)) {
1681 int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
1682
1683 /*
1684 * Get a new indirect map and zero it.
1685 */
1686 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
1687 if (vs->vs_indirect &&
1688 (new_map_size == old_map_size)) {
1689 bs_commit(new_size - vs->vs_size);
1690 vs->vs_size = new_size;
1691 return 0;
1692 }
1693
1694 new_imap = (struct vs_map **)kalloc(new_map_size);
1695 if (new_imap == NULL) {
1696 return -1;
1697 }
1698 memset(new_imap, 0, new_map_size);
1699
1700 if (vs->vs_indirect) {
1701 /* Copy old entries into new map */
1702 memcpy(new_imap, vs->vs_imap, old_map_size);
1703 /* Arrange to free the old map */
1704 old_map = (void *) vs->vs_imap;
1705 newdsize = 0;
1706 } else { /* Old map was a direct map */
1707 /* Allocate an indirect page */
1708 if ((new_imap[0] = (struct vs_map *)
1709 kalloc(CLMAP_THRESHOLD)) == NULL) {
1710 kfree((vm_offset_t)new_imap, new_map_size);
1711 return -1;
1712 }
1713 new_dmap = new_imap[0];
1714 newdsize = CLMAP_ENTRIES;
1715 }
1716 } else {
1717 new_imap = NULL;
1718 newdsize = new_size;
1719 /*
1720 * If the new map is a direct map, then the old map must
1721 * also have been a direct map. All we have to do is
1722 * to allocate a new direct map, copy the old entries
1723 * into it and free the old map.
1724 */
1725 if ((new_dmap = (struct vs_map *)
1726 kalloc(CLMAP_SIZE(new_size))) == NULL) {
1727 return -1;
1728 }
1729 }
1730 if (newdsize) {
1731
1732 /* Free the old map */
1733 old_map = (void *) vs->vs_dmap;
1734 old_map_size = CLMAP_SIZE(vs->vs_size);
1735
1736 /* Copy info from the old map into the new map */
1737 memcpy(new_dmap, vs->vs_dmap, old_map_size);
1738
1739 /* Initialize the rest of the new map */
1740 for (i = vs->vs_size; i < newdsize; i++)
1741 VSM_CLR(new_dmap[i]);
1742 }
1743 if (new_imap) {
1744 vs->vs_imap = new_imap;
1745 vs->vs_indirect = TRUE;
1746 } else
1747 vs->vs_dmap = new_dmap;
1748 bs_commit(new_size - vs->vs_size);
1749 vs->vs_size = new_size;
1750 if (old_map)
1751 kfree((vm_offset_t)old_map, old_map_size);
1752 return 0;
1753}
1754
1755vm_offset_t
1756ps_clmap(
1757 vstruct_t vs,
1758 vm_offset_t offset,
1759 struct clmap *clmap,
1760 int flag,
1761 vm_size_t size,
1762 int error)
1763{
1764 vm_offset_t cluster; /* The cluster of offset. */
1765 vm_offset_t newcl; /* The new cluster allocated. */
1766 vm_offset_t newoff;
1767 int i;
1768 struct vs_map *vsmap;
1c79356b
A
1769
1770 VS_MAP_LOCK(vs);
1771
1772 ASSERT(vs->vs_dmap);
de355530 1773 cluster = atop(offset) >> vs->vs_clshift;
1c79356b
A
1774
1775 /*
1776 * Initialize cluster error value
1777 */
1778 clmap->cl_error = 0;
1779
1780 /*
1781 * If the object has grown, extend the page map.
1782 */
1783 if (cluster >= vs->vs_size) {
1784 if (flag == CL_FIND) {
1785 /* Do not allocate if just doing a lookup */
1786 VS_MAP_UNLOCK(vs);
1787 return (vm_offset_t) -1;
1788 }
1789 if (ps_map_extend(vs, cluster + 1)) {
1790 VS_MAP_UNLOCK(vs);
1791 return (vm_offset_t) -1;
1792 }
1793 }
1794
1795 /*
1796 * Look for the desired cluster. If the map is indirect, then we
1797 * have a two level lookup. First find the indirect block, then
1798 * find the actual cluster. If the indirect block has not yet
1799 * been allocated, then do so. If the cluster has not yet been
1800 * allocated, then do so.
1801 *
1802 * If any of the allocations fail, then return an error.
1803 * Don't allocate if just doing a lookup.
1804 */
1805 if (vs->vs_indirect) {
1806 long ind_block = cluster/CLMAP_ENTRIES;
1807
1808 /* Is the indirect block allocated? */
1809 vsmap = vs->vs_imap[ind_block];
1810 if (vsmap == NULL) {
1811 if (flag == CL_FIND) {
1812 VS_MAP_UNLOCK(vs);
1813 return (vm_offset_t) -1;
1814 }
1815
1816 /* Allocate the indirect block */
1817 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
1818 if (vsmap == NULL) {
1819 VS_MAP_UNLOCK(vs);
1820 return (vm_offset_t) -1;
1821 }
1822 /* Initialize the cluster offsets */
1823 for (i = 0; i < CLMAP_ENTRIES; i++)
1824 VSM_CLR(vsmap[i]);
1825 vs->vs_imap[ind_block] = vsmap;
1826 }
1827 } else
1828 vsmap = vs->vs_dmap;
1829
1830 ASSERT(vsmap);
1831 vsmap += cluster%CLMAP_ENTRIES;
1832
1833 /*
1834 * At this point, vsmap points to the struct vs_map desired.
1835 *
1836 * Look in the map for the cluster, if there was an error on a
1837 * previous write, flag it and return. If it is not yet
1838 * allocated, then allocate it, if we're writing; if we're
1839 * doing a lookup and the cluster's not allocated, return error.
1840 */
1841 if (VSM_ISERR(*vsmap)) {
1842 clmap->cl_error = VSM_GETERR(*vsmap);
1843 VS_MAP_UNLOCK(vs);
1844 return (vm_offset_t) -1;
1845 } else if (VSM_ISCLR(*vsmap)) {
1846 int psindex;
1847
1848 if (flag == CL_FIND) {
1849 /*
1850 * If there's an error and the entry is clear, then
1851 * we've run out of swap space. Record the error
1852 * here and return.
1853 */
1854 if (error) {
1855 VSM_SETERR(*vsmap, error);
1856 }
1857 VS_MAP_UNLOCK(vs);
1858 return (vm_offset_t) -1;
1859 } else {
1860 /*
1861 * Attempt to allocate a cluster from the paging segment
1862 */
1863 newcl = ps_allocate_cluster(vs, &psindex,
1864 PAGING_SEGMENT_NULL);
1865 if (newcl == -1) {
1866 VS_MAP_UNLOCK(vs);
1867 return (vm_offset_t) -1;
1868 }
1869 VSM_CLR(*vsmap);
1870 VSM_SETCLOFF(*vsmap, newcl);
1871 VSM_SETPS(*vsmap, psindex);
1872 }
1873 } else
1874 newcl = VSM_CLOFF(*vsmap);
1875
1876 /*
1877 * Fill in pertinent fields of the clmap
1878 */
1879 clmap->cl_ps = VSM_PS(*vsmap);
1880 clmap->cl_numpages = VSCLSIZE(vs);
1881 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
1882
1883 /*
1884 * Byte offset in paging segment is byte offset to cluster plus
1885 * byte offset within cluster. It looks ugly, but should be
1886 * relatively quick.
1887 */
1888 ASSERT(trunc_page(offset) == offset);
de355530 1889 newcl = ptoa(newcl) << vs->vs_clshift;
1c79356b
A
1890 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
1891 if (flag == CL_ALLOC) {
1892 /*
1893 * set bits in the allocation bitmap according to which
1894 * pages were requested. size is in bytes.
1895 */
de355530 1896 i = atop(newoff);
1c79356b
A
1897 while ((size > 0) && (i < VSCLSIZE(vs))) {
1898 VSM_SETALLOC(*vsmap, i);
1899 i++;
1900 size -= vm_page_size;
1901 }
1902 }
1903 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
1904 if (newoff) {
1905 /*
1906 * Offset is not cluster aligned, so number of pages
1907 * and bitmaps must be adjusted
1908 */
de355530 1909 clmap->cl_numpages -= atop(newoff);
1c79356b
A
1910 CLMAP_SHIFT(clmap, vs);
1911 CLMAP_SHIFTALLOC(clmap, vs);
1912 }
1913
1914 /*
1915 *
1916 * The setting of valid bits and handling of write errors
1917 * must be done here, while we hold the lock on the map.
1918 * It logically should be done in ps_vs_write_complete().
1919 * The size and error information has been passed from
1920 * ps_vs_write_complete(). If the size parameter is non-zero,
1921 * then there is work to be done. If error is also non-zero,
1922 * then the error number is recorded in the cluster and the
1923 * entire cluster is in error.
1924 */
1925 if (size && flag == CL_FIND) {
1926 vm_offset_t off = (vm_offset_t) 0;
1927
1928 if (!error) {
1929 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
1930 i++) {
1931 VSM_SETPG(*vsmap, i);
1932 size -= vm_page_size;
1933 }
1934 ASSERT(i <= VSCLSIZE(vs));
1935 } else {
1936 BS_STAT(clmap->cl_ps->ps_bs,
1937 clmap->cl_ps->ps_bs->bs_pages_out_fail +=
de355530 1938 atop(size));
1c79356b
A
1939 off = VSM_CLOFF(*vsmap);
1940 VSM_SETERR(*vsmap, error);
1941 }
1942 /*
1943 * Deallocate cluster if error, and no valid pages
1944 * already present.
1945 */
1946 if (off != (vm_offset_t) 0)
1947 ps_deallocate_cluster(clmap->cl_ps, off);
1948 VS_MAP_UNLOCK(vs);
1949 return (vm_offset_t) 0;
1950 } else
1951 VS_MAP_UNLOCK(vs);
1952
1953 DEBUG(DEBUG_VS_INTERNAL,
1954 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
1955 newcl+newoff, (int) vs, (int) vsmap, flag));
1956 DEBUG(DEBUG_VS_INTERNAL,
1957 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
1958 (int) clmap->cl_ps, clmap->cl_numpages,
1959 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
1960
1961 return (newcl + newoff);
1962}
1963
1964void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */
1965
1966void
1967ps_clunmap(
1968 vstruct_t vs,
1969 vm_offset_t offset,
1970 vm_size_t length)
1971{
1972 vm_offset_t cluster; /* The cluster number of offset */
1973 struct vs_map *vsmap;
1c79356b
A
1974
1975 VS_MAP_LOCK(vs);
1976
1977 /*
1978 * Loop through all clusters in this range, freeing paging segment
1979 * clusters and map entries as encountered.
1980 */
1981 while (length > 0) {
1982 vm_offset_t newoff;
1983 int i;
1984
de355530 1985 cluster = atop(offset) >> vs->vs_clshift;
1c79356b
A
1986 if (vs->vs_indirect) /* indirect map */
1987 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
1988 else
1989 vsmap = vs->vs_dmap;
1990 if (vsmap == NULL) {
1991 VS_MAP_UNLOCK(vs);
1992 return;
1993 }
1994 vsmap += cluster%CLMAP_ENTRIES;
1995 if (VSM_ISCLR(*vsmap)) {
1996 length -= vm_page_size;
1997 offset += vm_page_size;
1998 continue;
1999 }
2000 /*
2001 * We've got a valid mapping. Clear it and deallocate
2002 * paging segment cluster pages.
2003 * Optimize for entire cluster cleraing.
2004 */
2005 if (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) {
2006 /*
2007 * Not cluster aligned.
2008 */
2009 ASSERT(trunc_page(newoff) == newoff);
de355530 2010 i = atop(newoff);
1c79356b
A
2011 } else
2012 i = 0;
2013 while ((i < VSCLSIZE(vs)) && (length > 0)) {
2014 VSM_CLRPG(*vsmap, i);
2015 VSM_CLRALLOC(*vsmap, i);
2016 length -= vm_page_size;
2017 offset += vm_page_size;
2018 i++;
2019 }
2020
2021 /*
2022 * If map entry is empty, clear and deallocate cluster.
2023 */
2024 if (!VSM_ALLOC(*vsmap)) {
2025 ps_deallocate_cluster(VSM_PS(*vsmap),
2026 VSM_CLOFF(*vsmap));
2027 VSM_CLR(*vsmap);
2028 }
2029 }
2030
2031 VS_MAP_UNLOCK(vs);
2032}
2033
2034void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */
2035
2036void
2037ps_vs_write_complete(
2038 vstruct_t vs,
2039 vm_offset_t offset,
2040 vm_size_t size,
2041 int error)
2042{
2043 struct clmap clmap;
2044
2045 /*
2046 * Get the struct vsmap for this cluster.
2047 * Use READ, even though it was written, because the
2048 * cluster MUST be present, unless there was an error
2049 * in the original ps_clmap (e.g. no space), in which
2050 * case, nothing happens.
2051 *
2052 * Must pass enough information to ps_clmap to allow it
2053 * to set the vs_map structure bitmap under lock.
2054 */
2055 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2056}
2057
2058void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */
2059
2060void
2061vs_cl_write_complete(
2062 vstruct_t vs,
2063 paging_segment_t ps,
2064 vm_offset_t offset,
2065 vm_offset_t addr,
2066 vm_size_t size,
2067 boolean_t async,
2068 int error)
2069{
1c79356b
A
2070 kern_return_t kr;
2071
2072 if (error) {
2073 /*
2074 * For internal objects, the error is recorded on a
2075 * per-cluster basis by ps_clmap() which is called
2076 * by ps_vs_write_complete() below.
2077 */
2078 dprintf(("write failed error = 0x%x\n", error));
2079 /* add upl_abort code here */
2080 } else
de355530 2081 GSTAT(global_stats.gs_pages_out += atop(size));
1c79356b
A
2082 /*
2083 * Notify the vstruct mapping code, so it can do its accounting.
2084 */
2085 ps_vs_write_complete(vs, offset, size, error);
2086
2087 if (async) {
2088 VS_LOCK(vs);
2089 ASSERT(vs->vs_async_pending > 0);
2090 vs->vs_async_pending -= size;
0b4e3aa0
A
2091 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2092 vs->vs_waiting_async = FALSE;
1c79356b
A
2093 VS_UNLOCK(vs);
2094 /* mutex_unlock(&vs->vs_waiting_async); */
0b4e3aa0 2095 thread_wakeup(&vs->vs_async_pending);
1c79356b
A
2096 } else {
2097 VS_UNLOCK(vs);
2098 }
2099 }
2100}
2101
2102#ifdef DEVICE_PAGING
2103kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2104
2105kern_return_t
2106device_write_reply(
2107 MACH_PORT_FACE reply_port,
2108 kern_return_t device_code,
2109 io_buf_len_t bytes_written)
2110{
2111 struct vs_async *vsa;
1c79356b
A
2112
2113 vsa = (struct vs_async *)
2114 ((struct vstruct_alias *)(reply_port->alias))->vs;
2115
2116 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2117 device_code = KERN_FAILURE;
2118 }
2119
2120 vsa->vsa_error = device_code;
2121
2122
2123 ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2124 if(vsa->vsa_flags & VSA_TRANSFER) {
2125 /* revisit when async disk segments redone */
2126 if(vsa->vsa_error) {
2127 /* need to consider error condition. re-write data or */
2128 /* throw it away here. */
2129 vm_offset_t ioaddr;
2130 if(vm_map_copyout(kernel_map, &ioaddr,
2131 (vm_map_copy_t)vsa->vsa_addr) != KERN_SUCCESS)
2132 panic("vs_cluster_write: unable to copy source list\n");
2133 vm_deallocate(kernel_map, ioaddr, vsa->vsa_size);
2134 }
2135 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2136 vsa->vsa_size, vsa->vsa_error);
2137 } else {
2138 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2139 vsa->vsa_addr, vsa->vsa_size, TRUE,
2140 vsa->vsa_error);
2141 }
2142 VS_FREE_ASYNC(vsa);
2143
2144 return KERN_SUCCESS;
2145}
2146
2147kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2148kern_return_t
2149device_write_reply_inband(
2150 MACH_PORT_FACE reply_port,
2151 kern_return_t return_code,
2152 io_buf_len_t bytes_written)
2153{
2154 panic("device_write_reply_inband: illegal");
2155 return KERN_SUCCESS;
2156}
2157
2158kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2159kern_return_t
2160device_read_reply(
2161 MACH_PORT_FACE reply_port,
2162 kern_return_t return_code,
2163 io_buf_ptr_t data,
2164 mach_msg_type_number_t dataCnt)
2165{
2166 struct vs_async *vsa;
2167 vsa = (struct vs_async *)
2168 ((struct vstruct_alias *)(reply_port->alias))->vs;
2169 vsa->vsa_addr = (vm_offset_t)data;
2170 vsa->vsa_size = (vm_size_t)dataCnt;
2171 vsa->vsa_error = return_code;
2172 thread_wakeup(&vsa->vsa_lock);
2173 return KERN_SUCCESS;
2174}
2175
2176kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2177kern_return_t
2178device_read_reply_inband(
2179 MACH_PORT_FACE reply_port,
2180 kern_return_t return_code,
2181 io_buf_ptr_inband_t data,
2182 mach_msg_type_number_t dataCnt)
2183{
2184 panic("device_read_reply_inband: illegal");
2185 return KERN_SUCCESS;
2186}
2187
2188kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2189kern_return_t
2190device_read_reply_overwrite(
2191 MACH_PORT_FACE reply_port,
2192 kern_return_t return_code,
2193 io_buf_len_t bytes_read)
2194{
2195 panic("device_read_reply_overwrite: illegal\n");
2196 return KERN_SUCCESS;
2197}
2198
2199kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2200kern_return_t
2201device_open_reply(
2202 MACH_PORT_FACE reply_port,
2203 kern_return_t return_code,
2204 MACH_PORT_FACE device_port)
2205{
2206 panic("device_open_reply: illegal\n");
2207 return KERN_SUCCESS;
2208}
2209
2210kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */
2211
2212kern_return_t
2213ps_read_device(
2214 paging_segment_t ps,
2215 vm_offset_t offset,
2216 vm_offset_t *bufferp,
2217 unsigned int size,
2218 unsigned int *residualp,
2219 int flags)
2220{
2221 kern_return_t kr;
2222 recnum_t dev_offset;
2223 unsigned int bytes_wanted;
2224 unsigned int bytes_read;
2225 unsigned int total_read;
2226 vm_offset_t dev_buffer;
2227 vm_offset_t buf_ptr;
2228 unsigned int records_read;
1c79356b
A
2229 struct vs_async *vsa;
2230 mutex_t vs_waiting_read_reply;
2231
2232 device_t device;
2233 vm_map_copy_t device_data = NULL;
2234 default_pager_thread_t *dpt = NULL;
2235
2236 device = dev_port_lookup(ps->ps_device);
de355530 2237 clustered_reads[atop(size)]++;
1c79356b
A
2238
2239 dev_offset = (ps->ps_offset +
2240 (offset >> (vm_page_shift - ps->ps_record_shift)));
2241 bytes_wanted = size;
2242 total_read = 0;
2243 *bufferp = (vm_offset_t)NULL;
2244
2245 do {
2246 vsa = VS_ALLOC_ASYNC();
2247 if (vsa) {
2248 vsa->vsa_vs = NULL;
2249 vsa->vsa_addr = 0;
2250 vsa->vsa_offset = 0;
2251 vsa->vsa_size = 0;
2252 vsa->vsa_ps = NULL;
2253 }
2254 mutex_init(&vsa->vsa_lock, ETAP_DPAGE_VSSEQNO);
2255 ip_lock(vsa->reply_port);
2256 vsa->reply_port->ip_sorights++;
2257 ip_reference(vsa->reply_port);
2258 ip_unlock(vsa->reply_port);
2259 kr = ds_device_read_common(device,
2260 vsa->reply_port,
2261 (mach_msg_type_name_t)
2262 MACH_MSG_TYPE_MOVE_SEND_ONCE,
2263 (dev_mode_t) 0,
2264 dev_offset,
2265 bytes_wanted,
2266 (IO_READ | IO_CALL),
2267 (io_buf_ptr_t *) &dev_buffer,
2268 (mach_msg_type_number_t *) &bytes_read);
2269 if(kr == MIG_NO_REPLY) {
2270 assert_wait(&vsa->vsa_lock, THREAD_UNINT);
9bccf70c 2271 thread_block(THREAD_CONTINUE_NULL);
1c79356b
A
2272
2273 dev_buffer = vsa->vsa_addr;
2274 bytes_read = (unsigned int)vsa->vsa_size;
2275 kr = vsa->vsa_error;
2276 }
2277 VS_FREE_ASYNC(vsa);
2278 if (kr != KERN_SUCCESS || bytes_read == 0) {
2279 break;
2280 }
2281 total_read += bytes_read;
2282
2283 /*
2284 * If we got the entire range, use the returned dev_buffer.
2285 */
2286 if (bytes_read == size) {
2287 *bufferp = (vm_offset_t)dev_buffer;
2288 break;
2289 }
2290
2291#if 1
2292 dprintf(("read only %d bytes out of %d\n",
2293 bytes_read, bytes_wanted));
2294#endif
2295 if(dpt == NULL) {
2296 dpt = get_read_buffer();
2297 buf_ptr = dpt->dpt_buffer;
2298 *bufferp = (vm_offset_t)buf_ptr;
2299 }
2300 /*
2301 * Otherwise, copy the data into the provided buffer (*bufferp)
2302 * and append the rest of the range as it comes in.
2303 */
2304 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2305 buf_ptr += bytes_read;
2306 bytes_wanted -= bytes_read;
2307 records_read = (bytes_read >>
2308 (vm_page_shift - ps->ps_record_shift));
2309 dev_offset += records_read;
2310 DEBUG(DEBUG_VS_INTERNAL,
2311 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2312 dev_buffer, bytes_read));
2313 if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2314 != KERN_SUCCESS)
2315 Panic("dealloc buf");
2316 } while (bytes_wanted);
2317
2318 *residualp = size - total_read;
2319 if((dev_buffer != *bufferp) && (total_read != 0)) {
2320 vm_offset_t temp_buffer;
2321 vm_allocate(kernel_map, &temp_buffer, total_read, TRUE);
2322 memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2323 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2324 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2325 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2326 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2327 (vm_map_copy_t *)&device_data, FALSE))
2328 panic("ps_read_device: cannot copyin locally provided buffer\n");
2329 }
2330 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2331 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2332 VM_MAP_COPYIN_OPT_SRC_DESTROY |
2333 VM_MAP_COPYIN_OPT_STEAL_PAGES |
2334 VM_MAP_COPYIN_OPT_PMAP_ENTER,
2335 (vm_map_copy_t *)&device_data, FALSE))
2336 panic("ps_read_device: cannot copyin backing store provided buffer\n");
2337 }
2338 else {
2339 device_data = NULL;
2340 }
2341 *bufferp = (vm_offset_t)device_data;
2342
2343 if(dpt != NULL) {
2344 /* Free the receive buffer */
2345 dpt->checked_out = 0;
2346 thread_wakeup(&dpt_array);
2347 }
2348 return KERN_SUCCESS;
2349}
2350
2351kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */
2352
2353kern_return_t
2354ps_write_device(
2355 paging_segment_t ps,
2356 vm_offset_t offset,
2357 vm_offset_t addr,
2358 unsigned int size,
2359 struct vs_async *vsa)
2360{
2361 recnum_t dev_offset;
2362 io_buf_len_t bytes_to_write, bytes_written;
2363 recnum_t records_written;
2364 kern_return_t kr;
2365 MACH_PORT_FACE reply_port;
1c79356b
A
2366
2367
2368
de355530 2369 clustered_writes[atop(size)]++;
1c79356b
A
2370
2371 dev_offset = (ps->ps_offset +
2372 (offset >> (vm_page_shift - ps->ps_record_shift)));
2373 bytes_to_write = size;
2374
2375 if (vsa) {
2376 /*
2377 * Asynchronous write.
2378 */
2379 reply_port = vsa->reply_port;
2380 ip_lock(reply_port);
2381 reply_port->ip_sorights++;
2382 ip_reference(reply_port);
2383 ip_unlock(reply_port);
2384 {
2385 device_t device;
2386 device = dev_port_lookup(ps->ps_device);
2387
2388 vsa->vsa_addr = addr;
2389 kr=ds_device_write_common(device,
2390 reply_port,
2391 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2392 (dev_mode_t) 0,
2393 dev_offset,
2394 (io_buf_ptr_t) addr,
2395 size,
2396 (IO_WRITE | IO_CALL),
2397 &bytes_written);
2398 }
2399 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2400 if (verbose)
2401 dprintf(("%s0x%x, addr=0x%x,"
2402 "size=0x%x,offset=0x%x\n",
2403 "device_write_request returned ",
2404 kr, addr, size, offset));
2405 BS_STAT(ps->ps_bs,
de355530 2406 ps->ps_bs->bs_pages_out_fail += atop(size));
1c79356b
A
2407 /* do the completion notification to free resources */
2408 device_write_reply(reply_port, kr, 0);
2409 return PAGER_ERROR;
2410 }
2411 } else do {
2412 /*
2413 * Synchronous write.
2414 */
2415 {
2416 device_t device;
2417 device = dev_port_lookup(ps->ps_device);
2418 kr=ds_device_write_common(device,
2419 IP_NULL, 0,
2420 (dev_mode_t) 0,
2421 dev_offset,
2422 (io_buf_ptr_t) addr,
2423 size,
2424 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2425 &bytes_written);
2426 }
2427 if (kr != KERN_SUCCESS) {
2428 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2429 "device_write returned ",
2430 kr, addr, size, offset));
2431 BS_STAT(ps->ps_bs,
de355530 2432 ps->ps_bs->bs_pages_out_fail += atop(size));
1c79356b
A
2433 return PAGER_ERROR;
2434 }
2435 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2436 Panic("fragmented write");
2437 records_written = (bytes_written >>
2438 (vm_page_shift - ps->ps_record_shift));
2439 dev_offset += records_written;
2440#if 1
2441 if (bytes_written != bytes_to_write) {
2442 dprintf(("wrote only %d bytes out of %d\n",
2443 bytes_written, bytes_to_write));
2444 }
2445#endif
2446 bytes_to_write -= bytes_written;
2447 addr += bytes_written;
2448 } while (bytes_to_write > 0);
2449
2450 return PAGER_SUCCESS;
2451}
2452
2453
2454#else /* !DEVICE_PAGING */
2455
2456kern_return_t
2457ps_read_device(
2458 paging_segment_t ps,
2459 vm_offset_t offset,
2460 vm_offset_t *bufferp,
2461 unsigned int size,
2462 unsigned int *residualp,
2463 int flags)
2464{
2465 panic("ps_read_device not supported");
2466}
2467
2468ps_write_device(
2469 paging_segment_t ps,
2470 vm_offset_t offset,
2471 vm_offset_t addr,
2472 unsigned int size,
2473 struct vs_async *vsa)
2474{
2475 panic("ps_write_device not supported");
2476}
2477
2478#endif /* DEVICE_PAGING */
2479void pvs_object_data_provided(vstruct_t, upl_t, vm_offset_t, vm_size_t); /* forward */
2480
2481void
2482pvs_object_data_provided(
2483 vstruct_t vs,
2484 upl_t upl,
2485 vm_offset_t offset,
2486 vm_size_t size)
2487{
1c79356b
A
2488
2489 DEBUG(DEBUG_VS_INTERNAL,
2490 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2491 upl, offset, size));
2492
2493 ASSERT(size > 0);
de355530 2494 GSTAT(global_stats.gs_pages_in += atop(size));
1c79356b
A
2495
2496
2497#if USE_PRECIOUS
2498 ps_clunmap(vs, offset, size);
2499#endif /* USE_PRECIOUS */
2500
2501}
2502
2503kern_return_t
2504pvs_cluster_read(
2505 vstruct_t vs,
0b4e3aa0 2506 vm_offset_t vs_offset,
1c79356b
A
2507 vm_size_t cnt)
2508{
1c79356b
A
2509 upl_t upl;
2510 kern_return_t error = KERN_SUCCESS;
0b4e3aa0 2511 int size;
1c79356b
A
2512 unsigned int residual;
2513 unsigned int request_flags;
0b4e3aa0
A
2514 int seg_index;
2515 int pages_in_cl;
2516 int cl_size;
2517 int cl_mask;
2518 int cl_index;
2519 int xfer_size;
2520 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2521 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2522 struct clmap clmap;
2523
2524 pages_in_cl = 1 << vs->vs_clshift;
2525 cl_size = pages_in_cl * vm_page_size;
2526 cl_mask = cl_size - 1;
1c79356b
A
2527
2528 /*
0b4e3aa0
A
2529 * This loop will be executed multiple times until the entire
2530 * request has been satisfied... if the request spans cluster
2531 * boundaries, the clusters will be checked for logical continunity,
2532 * if contiguous the I/O request will span multiple clusters, otherwise
2533 * it will be broken up into the minimal set of I/O's
1c79356b 2534 *
0b4e3aa0
A
2535 * If there are holes in a request (either unallocated pages in a paging
2536 * segment or an unallocated paging segment), we stop
1c79356b
A
2537 * reading at the hole, inform the VM of any data read, inform
2538 * the VM of an unavailable range, then loop again, hoping to
0b4e3aa0 2539 * find valid pages later in the requested range. This continues until
1c79356b
A
2540 * the entire range has been examined, and read, if present.
2541 */
2542
2543#if USE_PRECIOUS
9bccf70c 2544 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT;
1c79356b 2545#else
9bccf70c 2546 request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT;
1c79356b
A
2547#endif
2548 while (cnt && (error == KERN_SUCCESS)) {
0b4e3aa0
A
2549 int ps_info_valid;
2550 int page_list_count;
1c79356b 2551
d12e1678
A
2552 if((vs_offset & cl_mask) &&
2553 (cnt > (VM_SUPER_CLUSTER -
2554 (vs_offset & cl_mask)))) {
2555 size = VM_SUPER_CLUSTER;
2556 size -= vs_offset & cl_mask;
2557 } else if (cnt > VM_SUPER_CLUSTER) {
0b4e3aa0 2558 size = VM_SUPER_CLUSTER;
d12e1678 2559 } else {
0b4e3aa0 2560 size = cnt;
d12e1678 2561 }
0b4e3aa0 2562 cnt -= size;
1c79356b 2563
0b4e3aa0
A
2564 ps_info_valid = 0;
2565 seg_index = 0;
1c79356b 2566
0b4e3aa0
A
2567 while (size > 0 && error == KERN_SUCCESS) {
2568 int abort_size;
2569 int failed_size;
2570 int beg_pseg;
2571 int beg_indx;
2572 vm_offset_t cur_offset;
1c79356b 2573
0b4e3aa0
A
2574
2575 if ( !ps_info_valid) {
2576 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
2577 psp[seg_index] = CLMAP_PS(clmap);
2578 ps_info_valid = 1;
1c79356b 2579 }
0b4e3aa0
A
2580 /*
2581 * skip over unallocated physical segments
2582 */
2583 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2584 abort_size = cl_size - (vs_offset & cl_mask);
2585 abort_size = MIN(abort_size, size);
2586
2587 page_list_count = 0;
2588 memory_object_super_upl_request(
2589 vs->vs_control,
2590 (memory_object_offset_t)vs_offset,
2591 abort_size, abort_size,
2592 &upl, NULL, &page_list_count,
2593 request_flags);
1c79356b 2594
0b4e3aa0
A
2595 if (clmap.cl_error) {
2596 upl_abort(upl, UPL_ABORT_ERROR);
2597 } else {
2598 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2599 }
2600 upl_deallocate(upl);
1c79356b 2601
0b4e3aa0
A
2602 size -= abort_size;
2603 vs_offset += abort_size;
1c79356b 2604
0b4e3aa0
A
2605 seg_index++;
2606 ps_info_valid = 0;
2607 continue;
1c79356b 2608 }
0b4e3aa0
A
2609 cl_index = (vs_offset & cl_mask) / vm_page_size;
2610
2611 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
2612 /*
2613 * skip over unallocated pages
2614 */
2615 if (CLMAP_ISSET(clmap, cl_index))
2616 break;
2617 abort_size += vm_page_size;
2618 }
2619 if (abort_size) {
2620 /*
2621 * Let VM system know about holes in clusters.
2622 */
de355530 2623 GSTAT(global_stats.gs_pages_unavail += atop(abort_size));
0b4e3aa0
A
2624
2625 page_list_count = 0;
2626 memory_object_super_upl_request(
2627 vs->vs_control,
2628 (memory_object_offset_t)vs_offset,
2629 abort_size, abort_size,
2630 &upl, NULL, &page_list_count,
1c79356b 2631 request_flags);
1c79356b 2632
0b4e3aa0
A
2633 upl_abort(upl, UPL_ABORT_UNAVAILABLE);
2634 upl_deallocate(upl);
1c79356b 2635
0b4e3aa0
A
2636 size -= abort_size;
2637 vs_offset += abort_size;
2638
2639 if (cl_index == pages_in_cl) {
2640 /*
2641 * if we're at the end of this physical cluster
2642 * then bump to the next one and continue looking
2643 */
2644 seg_index++;
2645 ps_info_valid = 0;
2646 continue;
2647 }
2648 if (size == 0)
2649 break;
2650 }
1c79356b 2651 /*
0b4e3aa0
A
2652 * remember the starting point of the first allocated page
2653 * for the I/O we're about to issue
1c79356b 2654 */
0b4e3aa0
A
2655 beg_pseg = seg_index;
2656 beg_indx = cl_index;
2657 cur_offset = vs_offset;
2658
2659 /*
2660 * calculate the size of the I/O that we can do...
2661 * this may span multiple physical segments if
2662 * they are contiguous
2663 */
2664 for (xfer_size = 0; xfer_size < size; ) {
2665
d12e1678
A
2666 while (cl_index < pages_in_cl
2667 && xfer_size < size) {
0b4e3aa0 2668 /*
d12e1678
A
2669 * accumulate allocated pages within
2670 * a physical segment
1c79356b 2671 */
0b4e3aa0
A
2672 if (CLMAP_ISSET(clmap, cl_index)) {
2673 xfer_size += vm_page_size;
2674 cur_offset += vm_page_size;
2675 cl_index++;
2676
2677 BS_STAT(psp[seg_index]->ps_bs,
2678 psp[seg_index]->ps_bs->bs_pages_in++);
2679 } else
2680 break;
2681 }
d12e1678
A
2682 if (cl_index < pages_in_cl
2683 || xfer_size >= size) {
0b4e3aa0 2684 /*
d12e1678
A
2685 * we've hit an unallocated page or
2686 * the end of this request... go fire
2687 * the I/O
1c79356b 2688 */
0b4e3aa0
A
2689 break;
2690 }
2691 /*
d12e1678
A
2692 * we've hit the end of the current physical
2693 * segment and there's more to do, so try
2694 * moving to the next one
0b4e3aa0
A
2695 */
2696 seg_index++;
2697
d12e1678
A
2698 ps_offset[seg_index] =
2699 ps_clmap(vs,
2700 cur_offset & ~cl_mask,
2701 &clmap, CL_FIND, 0, 0);
2702 psp[seg_index] = CLMAP_PS(clmap);
0b4e3aa0
A
2703 ps_info_valid = 1;
2704
2705 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
2706 /*
d12e1678
A
2707 * if the physical segment we're about
2708 * to step into is not contiguous to
2709 * the one we're currently in, or it's
2710 * in a different paging file, or
0b4e3aa0
A
2711 * it hasn't been allocated....
2712 * we stop here and generate the I/O
2713 */
2714 break;
1c79356b 2715 }
0b4e3aa0 2716 /*
d12e1678
A
2717 * start with first page of the next physical
2718 * segment
0b4e3aa0
A
2719 */
2720 cl_index = 0;
1c79356b 2721 }
0b4e3aa0
A
2722 if (xfer_size) {
2723 /*
2724 * we have a contiguous range of allocated pages
2725 * to read from
2726 */
2727 page_list_count = 0;
2728 memory_object_super_upl_request(vs->vs_control,
d12e1678
A
2729 (memory_object_offset_t)vs_offset,
2730 xfer_size, xfer_size,
2731 &upl, NULL, &page_list_count,
2732 request_flags | UPL_SET_INTERNAL);
0b4e3aa0 2733
d12e1678
A
2734 error = ps_read_file(psp[beg_pseg],
2735 upl, (vm_offset_t) 0,
2736 ps_offset[beg_pseg] +
2737 (beg_indx * vm_page_size),
2738 xfer_size, &residual, 0);
0b4e3aa0
A
2739 } else
2740 continue;
1c79356b 2741
0b4e3aa0
A
2742 failed_size = 0;
2743
2744 /*
d12e1678
A
2745 * Adjust counts and send response to VM. Optimize
2746 * for the common case, i.e. no error and/or partial
2747 * data. If there was an error, then we need to error
2748 * the entire range, even if some data was successfully
2749 * read. If there was a partial read we may supply some
0b4e3aa0
A
2750 * data and may error some as well. In all cases the
2751 * VM must receive some notification for every page in the
2752 * range.
2753 */
2754 if ((error == KERN_SUCCESS) && (residual == 0)) {
2755 /*
d12e1678
A
2756 * Got everything we asked for, supply the data
2757 * to the VM. Note that as a side effect of
2758 * supplying * the data, the buffer holding the
2759 * supplied data is * deallocated from the pager's
2760 * address space.
0b4e3aa0 2761 */
d12e1678
A
2762 pvs_object_data_provided(
2763 vs, upl, vs_offset, xfer_size);
0b4e3aa0
A
2764 } else {
2765 failed_size = xfer_size;
2766
2767 if (error == KERN_SUCCESS) {
2768 if (residual == xfer_size) {
d12e1678
A
2769 /*
2770 * If a read operation returns no error
2771 * and no data moved, we turn it into
2772 * an error, assuming we're reading at
2773 * or beyong EOF.
2774 * Fall through and error the entire
2775 * range.
2776 */
0b4e3aa0
A
2777 error = KERN_FAILURE;
2778 } else {
d12e1678
A
2779 /*
2780 * Otherwise, we have partial read. If
2781 * the part read is a integral number
2782 * of pages supply it. Otherwise round
2783 * it up to a page boundary, zero fill
2784 * the unread part, and supply it.
2785 * Fall through and error the remainder
2786 * of the range, if any.
2787 */
0b4e3aa0
A
2788 int fill, lsize;
2789
d12e1678
A
2790 fill = residual
2791 & ~vm_page_size;
2792 lsize = (xfer_size - residual)
2793 + fill;
2794 pvs_object_data_provided(
2795 vs, upl,
2796 vs_offset, lsize);
0b4e3aa0
A
2797
2798 if (lsize < xfer_size) {
d12e1678
A
2799 failed_size =
2800 xfer_size - lsize;
0b4e3aa0
A
2801 error = KERN_FAILURE;
2802 }
2803 }
2804 }
2805 }
1c79356b
A
2806 /*
2807 * If there was an error in any part of the range, tell
d12e1678
A
2808 * the VM. Note that error is explicitly checked again
2809 * since it can be modified above.
1c79356b
A
2810 */
2811 if (error != KERN_SUCCESS) {
0b4e3aa0 2812 BS_STAT(psp[beg_pseg]->ps_bs,
d12e1678
A
2813 psp[beg_pseg]->ps_bs->bs_pages_in_fail
2814 += atop(failed_size));
1c79356b 2815 }
0b4e3aa0
A
2816 size -= xfer_size;
2817 vs_offset += xfer_size;
1c79356b 2818 }
1c79356b
A
2819
2820 } /* END while (cnt && (error == 0)) */
2821 return error;
2822}
2823
2824int vs_do_async_write = 1;
2825
2826kern_return_t
2827vs_cluster_write(
2828 vstruct_t vs,
2829 upl_t internal_upl,
2830 vm_offset_t offset,
2831 vm_size_t cnt,
2832 boolean_t dp_internal,
2833 int flags)
2834{
1c79356b
A
2835 vm_offset_t size;
2836 vm_offset_t transfer_size;
1c79356b
A
2837 int error = 0;
2838 struct clmap clmap;
0b4e3aa0
A
2839
2840 vm_offset_t actual_offset; /* Offset within paging segment */
1c79356b 2841 paging_segment_t ps;
0b4e3aa0
A
2842 vm_offset_t subx_size;
2843 vm_offset_t mobj_base_addr;
2844 vm_offset_t mobj_target_addr;
2845 int mobj_size;
2846
1c79356b
A
2847 struct vs_async *vsa;
2848 vm_map_copy_t copy;
1c79356b
A
2849
2850 upl_t upl;
0b4e3aa0 2851 upl_page_info_t *pl;
1c79356b
A
2852 int page_index;
2853 int list_size;
2854 int cl_size;
1c79356b 2855
1c79356b 2856 if (!dp_internal) {
0b4e3aa0 2857 int page_list_count;
1c79356b
A
2858 int request_flags;
2859 int super_size;
0b4e3aa0
A
2860 int first_dirty;
2861 int num_dirty;
2862 int num_of_pages;
2863 int seg_index;
2864 int pages_in_cl;
2865 int must_abort;
1c79356b 2866 vm_offset_t upl_offset;
0b4e3aa0
A
2867 vm_offset_t seg_offset;
2868 vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2869 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT];
2870
1c79356b 2871
0b4e3aa0
A
2872 pages_in_cl = 1 << vs->vs_clshift;
2873 cl_size = pages_in_cl * vm_page_size;
1c79356b
A
2874
2875 if (bs_low) {
2876 super_size = cl_size;
0b4e3aa0 2877
1c79356b
A
2878 request_flags = UPL_NOBLOCK |
2879 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2880 UPL_NO_SYNC | UPL_SET_INTERNAL;
2881 } else {
2882 super_size = VM_SUPER_CLUSTER;
0b4e3aa0 2883
1c79356b
A
2884 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
2885 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
2886 UPL_NO_SYNC | UPL_SET_INTERNAL;
2887 }
2888
0b4e3aa0
A
2889 page_list_count = 0;
2890 memory_object_super_upl_request(vs->vs_control,
2891 (memory_object_offset_t)offset,
2892 cnt, super_size,
2893 &upl, NULL, &page_list_count,
de355530 2894 request_flags | UPL_PAGEOUT);
1c79356b 2895
0b4e3aa0 2896 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1c79356b 2897
d12e1678
A
2898 for (seg_index = 0, transfer_size = upl->size;
2899 transfer_size > 0; ) {
1c79356b 2900
d12e1678
A
2901 ps_offset[seg_index] =
2902 ps_clmap(vs, upl->offset + (seg_index * cl_size),
2903 &clmap, CL_ALLOC,
2904 transfer_size < cl_size ?
2905 transfer_size : cl_size, 0);
1c79356b 2906
0b4e3aa0
A
2907 if (ps_offset[seg_index] == (vm_offset_t) -1) {
2908 upl_abort(upl, 0);
2909 upl_deallocate(upl);
2910
2911 return KERN_FAILURE;
1c79356b 2912
0b4e3aa0
A
2913 }
2914 psp[seg_index] = CLMAP_PS(clmap);
1c79356b 2915
0b4e3aa0
A
2916 if (transfer_size > cl_size) {
2917 transfer_size -= cl_size;
2918 seg_index++;
2919 } else
2920 transfer_size = 0;
2921 }
d12e1678
A
2922 for (page_index = 0,
2923 num_of_pages = upl->size / vm_page_size;
2924 page_index < num_of_pages; ) {
0b4e3aa0
A
2925 /*
2926 * skip over non-dirty pages
2927 */
2928 for ( ; page_index < num_of_pages; page_index++) {
d12e1678
A
2929 if (UPL_DIRTY_PAGE(pl, page_index)
2930 || UPL_PRECIOUS_PAGE(pl, page_index))
0b4e3aa0
A
2931 /*
2932 * this is a page we need to write
d12e1678
A
2933 * go see if we can buddy it up with
2934 * others that are contiguous to it
0b4e3aa0
A
2935 */
2936 break;
2937 /*
d12e1678
A
2938 * if the page is not-dirty, but present we
2939 * need to commit it... This is an unusual
2940 * case since we only asked for dirty pages
0b4e3aa0
A
2941 */
2942 if (UPL_PAGE_PRESENT(pl, page_index)) {
2943 boolean_t empty = FALSE;
2944 upl_commit_range(upl,
2945 page_index * vm_page_size,
2946 vm_page_size,
2947 UPL_COMMIT_NOTIFY_EMPTY,
2948 pl,
d52fe63f 2949 page_list_count,
0b4e3aa0
A
2950 &empty);
2951 if (empty)
2952 upl_deallocate(upl);
1c79356b 2953 }
1c79356b 2954 }
0b4e3aa0
A
2955 if (page_index == num_of_pages)
2956 /*
2957 * no more pages to look at, we're out of here
2958 */
2959 break;
1c79356b 2960
0b4e3aa0 2961 /*
d12e1678
A
2962 * gather up contiguous dirty pages... we have at
2963 * least 1 otherwise we would have bailed above
0b4e3aa0
A
2964 * make sure that each physical segment that we step
2965 * into is contiguous to the one we're currently in
2966 * if it's not, we have to stop and write what we have
2967 */
d12e1678
A
2968 for (first_dirty = page_index;
2969 page_index < num_of_pages; ) {
2970 if ( !UPL_DIRTY_PAGE(pl, page_index)
2971 && !UPL_PRECIOUS_PAGE(pl, page_index))
0b4e3aa0
A
2972 break;
2973 page_index++;
2974 /*
2975 * if we just looked at the last page in the UPL
2976 * we don't need to check for physical segment
2977 * continuity
2978 */
2979 if (page_index < num_of_pages) {
2980 int cur_seg;
2981 int nxt_seg;
2982
d12e1678
A
2983 cur_seg =
2984 (page_index - 1) / pages_in_cl;
0b4e3aa0
A
2985 nxt_seg = page_index / pages_in_cl;
2986
2987 if (cur_seg != nxt_seg) {
2988 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
d12e1678
A
2989 /*
2990 * if the segment we're about
2991 * to step into is not
2992 * contiguous to the one we're
2993 * currently in, or it's in a
2994 * different paging file....
2995 * we stop here and generate
2996 * the I/O
2997 */
0b4e3aa0 2998 break;
1c79356b 2999 }
1c79356b 3000 }
0b4e3aa0
A
3001 }
3002 num_dirty = page_index - first_dirty;
3003 must_abort = 1;
1c79356b 3004
0b4e3aa0
A
3005 if (num_dirty) {
3006 upl_offset = first_dirty * vm_page_size;
3007 seg_index = first_dirty / pages_in_cl;
3008 seg_offset = upl_offset - (seg_index * cl_size);
3009 transfer_size = num_dirty * vm_page_size;
3010
0b4e3aa0 3011
d12e1678
A
3012 while (transfer_size) {
3013 int seg_size;
1c79356b 3014
d12e1678
A
3015 if ((seg_size = cl_size -
3016 (upl_offset % cl_size))
3017 > transfer_size)
3018 seg_size = transfer_size;
0b4e3aa0 3019
d12e1678
A
3020 ps_vs_write_complete(vs,
3021 upl->offset + upl_offset,
3022 seg_size, error);
0b4e3aa0 3023
d12e1678
A
3024 transfer_size -= seg_size;
3025 upl_offset += seg_size;
0b4e3aa0 3026 }
d12e1678
A
3027 upl_offset = first_dirty * vm_page_size;
3028 transfer_size = num_dirty * vm_page_size;
3029 error = ps_write_file(psp[seg_index],
3030 upl, upl_offset,
3031 ps_offset[seg_index]
3032 + seg_offset,
3033 transfer_size, flags);
9bccf70c 3034 must_abort = 0;
0b4e3aa0
A
3035 }
3036 if (must_abort) {
3037 boolean_t empty = FALSE;
3038 upl_abort_range(upl,
3039 first_dirty * vm_page_size,
3040 num_dirty * vm_page_size,
3041 UPL_ABORT_NOTIFY_EMPTY,
3042 &empty);
3043 if (empty)
3044 upl_deallocate(upl);
1c79356b 3045 }
1c79356b 3046 }
0b4e3aa0 3047
1c79356b
A
3048 } else {
3049 assert(cnt <= (vm_page_size << vs->vs_clshift));
3050 list_size = cnt;
3051
3052 page_index = 0;
3053 /* The caller provides a mapped_data which is derived */
3054 /* from a temporary object. The targeted pages are */
3055 /* guaranteed to be set at offset 0 in the mapped_data */
3056 /* The actual offset however must still be derived */
3057 /* from the offset in the vs in question */
3058 mobj_base_addr = offset;
3059 mobj_target_addr = mobj_base_addr;
3060
3061 for (transfer_size = list_size; transfer_size != 0;) {
3062 actual_offset = ps_clmap(vs, mobj_target_addr,
3063 &clmap, CL_ALLOC,
3064 transfer_size < cl_size ?
3065 transfer_size : cl_size, 0);
3066 if(actual_offset == (vm_offset_t) -1) {
3067 error = 1;
3068 break;
3069 }
3070 cnt = MIN(transfer_size,
3071 CLMAP_NPGS(clmap) * vm_page_size);
3072 ps = CLMAP_PS(clmap);
3073 /* Assume that the caller has given us contiguous */
3074 /* pages */
3075 if(cnt) {
d12e1678
A
3076 ps_vs_write_complete(vs, mobj_target_addr,
3077 cnt, error);
1c79356b
A
3078 error = ps_write_file(ps, internal_upl,
3079 0, actual_offset,
3080 cnt, flags);
3081 if (error)
3082 break;
d12e1678 3083 }
1c79356b
A
3084 if (error)
3085 break;
3086 actual_offset += cnt;
3087 mobj_target_addr += cnt;
3088 transfer_size -= cnt;
3089 cnt = 0;
3090
3091 if (error)
3092 break;
3093 }
3094 }
3095 if(error)
3096 return KERN_FAILURE;
3097 else
3098 return KERN_SUCCESS;
3099}
3100
3101vm_size_t
3102ps_vstruct_allocated_size(
3103 vstruct_t vs)
3104{
3105 int num_pages;
3106 struct vs_map *vsmap;
3107 int i, j, k;
3108
3109 num_pages = 0;
3110 if (vs->vs_indirect) {
3111 /* loop on indirect maps */
3112 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3113 vsmap = vs->vs_imap[i];
3114 if (vsmap == NULL)
3115 continue;
3116 /* loop on clusters in this indirect map */
3117 for (j = 0; j < CLMAP_ENTRIES; j++) {
3118 if (VSM_ISCLR(vsmap[j]) ||
3119 VSM_ISERR(vsmap[j]))
3120 continue;
3121 /* loop on pages in this cluster */
3122 for (k = 0; k < VSCLSIZE(vs); k++) {
3123 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3124 num_pages++;
3125 }
3126 }
3127 }
3128 } else {
3129 vsmap = vs->vs_dmap;
3130 if (vsmap == NULL)
3131 return 0;
3132 /* loop on clusters in the direct map */
3133 for (j = 0; j < CLMAP_ENTRIES; j++) {
3134 if (VSM_ISCLR(vsmap[j]) ||
3135 VSM_ISERR(vsmap[j]))
3136 continue;
3137 /* loop on pages in this cluster */
3138 for (k = 0; k < VSCLSIZE(vs); k++) {
3139 if ((VSM_BMAP(vsmap[j])) & (1 << k))
3140 num_pages++;
3141 }
3142 }
3143 }
3144
de355530 3145 return ptoa(num_pages);
1c79356b
A
3146}
3147
3148size_t
3149ps_vstruct_allocated_pages(
3150 vstruct_t vs,
3151 default_pager_page_t *pages,
3152 size_t pages_size)
3153{
3154 int num_pages;
3155 struct vs_map *vsmap;
3156 vm_offset_t offset;
3157 int i, j, k;
3158
3159 num_pages = 0;
3160 offset = 0;
3161 if (vs->vs_indirect) {
3162 /* loop on indirect maps */
3163 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3164 vsmap = vs->vs_imap[i];
3165 if (vsmap == NULL) {
3166 offset += (vm_page_size * CLMAP_ENTRIES *
3167 VSCLSIZE(vs));
3168 continue;
3169 }
3170 /* loop on clusters in this indirect map */
3171 for (j = 0; j < CLMAP_ENTRIES; j++) {
3172 if (VSM_ISCLR(vsmap[j]) ||
3173 VSM_ISERR(vsmap[j])) {
3174 offset += vm_page_size * VSCLSIZE(vs);
3175 continue;
3176 }
3177 /* loop on pages in this cluster */
3178 for (k = 0; k < VSCLSIZE(vs); k++) {
3179 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3180 num_pages++;
3181 if (num_pages < pages_size)
3182 pages++->dpp_offset =
3183 offset;
3184 }
3185 offset += vm_page_size;
3186 }
3187 }
3188 }
3189 } else {
3190 vsmap = vs->vs_dmap;
3191 if (vsmap == NULL)
3192 return 0;
3193 /* loop on clusters in the direct map */
3194 for (j = 0; j < CLMAP_ENTRIES; j++) {
3195 if (VSM_ISCLR(vsmap[j]) ||
3196 VSM_ISERR(vsmap[j])) {
3197 offset += vm_page_size * VSCLSIZE(vs);
3198 continue;
3199 }
3200 /* loop on pages in this cluster */
3201 for (k = 0; k < VSCLSIZE(vs); k++) {
3202 if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3203 num_pages++;
3204 if (num_pages < pages_size)
3205 pages++->dpp_offset = offset;
3206 }
3207 offset += vm_page_size;
3208 }
3209 }
3210 }
3211
3212 return num_pages;
3213}
3214
3215
3216kern_return_t
3217ps_vstruct_transfer_from_segment(
3218 vstruct_t vs,
3219 paging_segment_t segment,
1c79356b 3220 upl_t upl)
1c79356b
A
3221{
3222 struct vs_map *vsmap;
3223 struct vs_map old_vsmap;
3224 struct vs_map new_vsmap;
3225 int i, j, k;
3226
3227 VS_LOCK(vs); /* block all work on this vstruct */
3228 /* can't allow the normal multiple write */
3229 /* semantic because writes may conflict */
3230 vs->vs_xfer_pending = TRUE;
3231 vs_wait_for_sync_writers(vs);
3232 vs_start_write(vs);
3233 vs_wait_for_readers(vs);
3234 /* we will unlock the vs to allow other writes while transferring */
3235 /* and will be guaranteed of the persistance of the vs struct */
3236 /* because the caller of ps_vstruct_transfer_from_segment bumped */
3237 /* vs_async_pending */
3238 /* OK we now have guaranteed no other parties are accessing this */
3239 /* vs. Now that we are also supporting simple lock versions of */
3240 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */
3241 /* our purpose in holding it before was the multiple write case */
3242 /* we now use the boolean xfer_pending to do that. We can use */
3243 /* a boolean instead of a count because we have guaranteed single */
3244 /* file access to this code in its caller */
3245 VS_UNLOCK(vs);
3246vs_changed:
3247 if (vs->vs_indirect) {
3248 int vsmap_size;
3249 int clmap_off;
3250 /* loop on indirect maps */
3251 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3252 vsmap = vs->vs_imap[i];
3253 if (vsmap == NULL)
3254 continue;
3255 /* loop on clusters in this indirect map */
3256 clmap_off = (vm_page_size * CLMAP_ENTRIES *
3257 VSCLSIZE(vs) * i);
3258 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3259 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3260 else
3261 vsmap_size = CLMAP_ENTRIES;
3262 for (j = 0; j < vsmap_size; j++) {
3263 if (VSM_ISCLR(vsmap[j]) ||
3264 VSM_ISERR(vsmap[j]) ||
3265 (VSM_PS(vsmap[j]) != segment))
3266 continue;
3267 if(vs_cluster_transfer(vs,
3268 (vm_page_size * (j << vs->vs_clshift))
3269 + clmap_off,
3270 vm_page_size << vs->vs_clshift,
1c79356b 3271 upl)
1c79356b
A
3272 != KERN_SUCCESS) {
3273 VS_LOCK(vs);
3274 vs->vs_xfer_pending = FALSE;
3275 VS_UNLOCK(vs);
3276 vs_finish_write(vs);
3277 return KERN_FAILURE;
3278 }
3279 /* allow other readers/writers during transfer*/
3280 VS_LOCK(vs);
3281 vs->vs_xfer_pending = FALSE;
3282 VS_UNLOCK(vs);
3283 vs_finish_write(vs);
3284 VS_LOCK(vs);
3285 vs->vs_xfer_pending = TRUE;
1c79356b
A
3286 vs_wait_for_sync_writers(vs);
3287 vs_start_write(vs);
3288 vs_wait_for_readers(vs);
0b4e3aa0 3289 VS_UNLOCK(vs);
1c79356b
A
3290 if (!(vs->vs_indirect)) {
3291 goto vs_changed;
3292 }
3293 }
3294 }
3295 } else {
3296 vsmap = vs->vs_dmap;
3297 if (vsmap == NULL) {
3298 VS_LOCK(vs);
3299 vs->vs_xfer_pending = FALSE;
3300 VS_UNLOCK(vs);
3301 vs_finish_write(vs);
3302 return KERN_SUCCESS;
3303 }
3304 /* loop on clusters in the direct map */
3305 for (j = 0; j < vs->vs_size; j++) {
3306 if (VSM_ISCLR(vsmap[j]) ||
3307 VSM_ISERR(vsmap[j]) ||
3308 (VSM_PS(vsmap[j]) != segment))
3309 continue;
3310 if(vs_cluster_transfer(vs,
3311 vm_page_size * (j << vs->vs_clshift),
3312 vm_page_size << vs->vs_clshift,
1c79356b 3313 upl) != KERN_SUCCESS) {
1c79356b
A
3314 VS_LOCK(vs);
3315 vs->vs_xfer_pending = FALSE;
3316 VS_UNLOCK(vs);
3317 vs_finish_write(vs);
3318 return KERN_FAILURE;
3319 }
3320 /* allow other readers/writers during transfer*/
3321 VS_LOCK(vs);
3322 vs->vs_xfer_pending = FALSE;
3323 VS_UNLOCK(vs);
3324 vs_finish_write(vs);
3325 VS_LOCK(vs);
3326 vs->vs_xfer_pending = TRUE;
3327 VS_UNLOCK(vs);
3328 vs_wait_for_sync_writers(vs);
3329 vs_start_write(vs);
3330 vs_wait_for_readers(vs);
3331 if (vs->vs_indirect) {
3332 goto vs_changed;
3333 }
3334 }
3335 }
3336
3337 VS_LOCK(vs);
3338 vs->vs_xfer_pending = FALSE;
3339 VS_UNLOCK(vs);
3340 vs_finish_write(vs);
3341 return KERN_SUCCESS;
3342}
3343
3344
3345
3346vs_map_t
3347vs_get_map_entry(
3348 vstruct_t vs,
3349 vm_offset_t offset)
3350{
3351 struct vs_map *vsmap;
3352 vm_offset_t cluster;
3353
de355530 3354 cluster = atop(offset) >> vs->vs_clshift;
1c79356b
A
3355 if (vs->vs_indirect) {
3356 long ind_block = cluster/CLMAP_ENTRIES;
3357
3358 /* Is the indirect block allocated? */
3359 vsmap = vs->vs_imap[ind_block];
3360 if(vsmap == (vs_map_t) NULL)
3361 return vsmap;
3362 } else
3363 vsmap = vs->vs_dmap;
3364 vsmap += cluster%CLMAP_ENTRIES;
3365 return vsmap;
3366}
3367
3368kern_return_t
3369vs_cluster_transfer(
3370 vstruct_t vs,
3371 vm_offset_t offset,
3372 vm_size_t cnt,
1c79356b 3373 upl_t upl)
1c79356b
A
3374{
3375 vm_offset_t actual_offset;
3376 paging_segment_t ps;
3377 struct clmap clmap;
3378 kern_return_t error = KERN_SUCCESS;
3379 int size, size_wanted, i;
3380 unsigned int residual;
3381 int unavail_size;
3382 default_pager_thread_t *dpt;
3383 boolean_t dealloc;
3384 struct vs_map *vsmap_ptr;
3385 struct vs_map read_vsmap;
3386 struct vs_map original_read_vsmap;
3387 struct vs_map write_vsmap;
3388 upl_t sync_upl;
1c79356b
A
3389 vm_offset_t ioaddr;
3390
1c79356b
A
3391 /* vs_cluster_transfer reads in the pages of a cluster and
3392 * then writes these pages back to new backing store. The
3393 * segment the pages are being read from is assumed to have
3394 * been taken off-line and is no longer considered for new
3395 * space requests.
3396 */
3397
3398 /*
3399 * This loop will be executed once per cluster referenced.
3400 * Typically this means once, since it's unlikely that the
3401 * VM system will ask for anything spanning cluster boundaries.
3402 *
3403 * If there are holes in a cluster (in a paging segment), we stop
3404 * reading at the hole, then loop again, hoping to
3405 * find valid pages later in the cluster. This continues until
3406 * the entire range has been examined, and read, if present. The
3407 * pages are written as they are read. If a failure occurs after
3408 * some pages are written the unmap call at the bottom of the loop
3409 * recovers the backing store and the old backing store remains
3410 * in effect.
3411 */
3412
1c79356b
A
3413 VSM_CLR(write_vsmap);
3414 VSM_CLR(original_read_vsmap);
3415 /* grab the actual object's pages to sync with I/O */
3416 while (cnt && (error == KERN_SUCCESS)) {
3417 vsmap_ptr = vs_get_map_entry(vs, offset);
3418 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
3419
3420 if (actual_offset == (vm_offset_t) -1) {
3421
3422 /*
3423 * Nothing left to write in this cluster at least
3424 * set write cluster information for any previous
3425 * write, clear for next cluster, if there is one
3426 */
3427 unsigned int local_size, clmask, clsize;
3428
3429 clsize = vm_page_size << vs->vs_clshift;
3430 clmask = clsize - 1;
3431 local_size = clsize - (offset & clmask);
3432 ASSERT(local_size);
3433 local_size = MIN(local_size, cnt);
3434
3435 /* This cluster has no data in it beyond what may */
3436 /* have been found on a previous iteration through */
3437 /* the loop "write_vsmap" */
3438 *vsmap_ptr = write_vsmap;
3439 VSM_CLR(write_vsmap);
3440 VSM_CLR(original_read_vsmap);
3441
3442 cnt -= local_size;
3443 offset += local_size;
3444 continue;
3445 }
3446
3447 /*
3448 * Count up contiguous available or unavailable
3449 * pages.
3450 */
3451 ps = CLMAP_PS(clmap);
3452 ASSERT(ps);
3453 size = 0;
3454 unavail_size = 0;
3455 for (i = 0;
3456 (size < cnt) && (unavail_size < cnt) &&
3457 (i < CLMAP_NPGS(clmap)); i++) {
3458 if (CLMAP_ISSET(clmap, i)) {
3459 if (unavail_size != 0)
3460 break;
3461 size += vm_page_size;
3462 BS_STAT(ps->ps_bs,
3463 ps->ps_bs->bs_pages_in++);
3464 } else {
3465 if (size != 0)
3466 break;
3467 unavail_size += vm_page_size;
3468 }
3469 }
3470
3471 if (size == 0) {
3472 ASSERT(unavail_size);
3473 cnt -= unavail_size;
3474 offset += unavail_size;
3475 if((offset & ((vm_page_size << vs->vs_clshift) - 1))
3476 == 0) {
3477 /* There is no more to transfer in this
3478 cluster
3479 */
3480 *vsmap_ptr = write_vsmap;
3481 VSM_CLR(write_vsmap);
3482 VSM_CLR(original_read_vsmap);
3483 }
3484 continue;
3485 }
3486
3487 if(VSM_ISCLR(original_read_vsmap))
3488 original_read_vsmap = *vsmap_ptr;
3489
3490 if(ps->ps_segtype == PS_PARTITION) {
3491/*
9bccf70c 3492 NEED TO ISSUE WITH SYNC & NO COMMIT
1c79356b
A
3493 error = ps_read_device(ps, actual_offset, &buffer,
3494 size, &residual, flags);
3495*/
3496 } else {
9bccf70c 3497 /* NEED TO ISSUE WITH SYNC & NO COMMIT */
0b4e3aa0 3498 error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset,
1c79356b 3499 size, &residual,
9bccf70c 3500 (UPL_IOSYNC | UPL_NOCOMMIT));
1c79356b
A
3501 }
3502
3503 read_vsmap = *vsmap_ptr;
3504
3505
3506 /*
3507 * Adjust counts and put data in new BS. Optimize for the
3508 * common case, i.e. no error and/or partial data.
3509 * If there was an error, then we need to error the entire
3510 * range, even if some data was successfully read.
3511 *
3512 */
3513 if ((error == KERN_SUCCESS) && (residual == 0)) {
0b4e3aa0
A
3514 int page_list_count = 0;
3515
1c79356b
A
3516 /*
3517 * Got everything we asked for, supply the data to
3518 * the new BS. Note that as a side effect of supplying
3519 * the data, the buffer holding the supplied data is
3520 * deallocated from the pager's address space unless
3521 * the write is unsuccessful.
3522 */
3523
3524 /* note buffer will be cleaned up in all cases by */
3525 /* internal_cluster_write or if an error on write */
3526 /* the vm_map_copy_page_discard call */
3527 *vsmap_ptr = write_vsmap;
3528
1c79356b
A
3529 if(vs_cluster_write(vs, upl, offset,
3530 size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
1c79356b
A
3531 error = KERN_FAILURE;
3532 if(!(VSM_ISCLR(*vsmap_ptr))) {
3533 /* unmap the new backing store object */
3534 ps_clunmap(vs, offset, size);
3535 }
3536 /* original vsmap */
3537 *vsmap_ptr = original_read_vsmap;
3538 VSM_CLR(write_vsmap);
3539 } else {
3540 if((offset + size) &
3541 ((vm_page_size << vs->vs_clshift)
3542 - 1)) {
3543 /* There is more to transfer in this
3544 cluster
3545 */
3546 write_vsmap = *vsmap_ptr;
3547 *vsmap_ptr = read_vsmap;
3548 } else {
3549 /* discard the old backing object */
3550 write_vsmap = *vsmap_ptr;
3551 *vsmap_ptr = read_vsmap;
3552 ps_clunmap(vs, offset, size);
3553 *vsmap_ptr = write_vsmap;
3554 VSM_CLR(write_vsmap);
3555 VSM_CLR(original_read_vsmap);
3556 }
3557 }
3558 } else {
3559 size_wanted = size;
3560 if (error == KERN_SUCCESS) {
3561 if (residual == size) {
3562 /*
3563 * If a read operation returns no error
3564 * and no data moved, we turn it into
3565 * an error, assuming we're reading at
3566 * or beyond EOF.
3567 * Fall through and error the entire
3568 * range.
3569 */
3570 error = KERN_FAILURE;
3571 *vsmap_ptr = write_vsmap;
3572 if(!(VSM_ISCLR(*vsmap_ptr))) {
3573 /* unmap the new backing store object */
3574 ps_clunmap(vs, offset, size);
3575 }
3576 *vsmap_ptr = original_read_vsmap;
3577 VSM_CLR(write_vsmap);
3578 continue;
3579 } else {
3580 /*
3581 * Otherwise, we have partial read.
3582 * This is also considered an error
3583 * for the purposes of cluster transfer
3584 */
3585 error = KERN_FAILURE;
3586 *vsmap_ptr = write_vsmap;
3587 if(!(VSM_ISCLR(*vsmap_ptr))) {
3588 /* unmap the new backing store object */
3589 ps_clunmap(vs, offset, size);
3590 }
3591 *vsmap_ptr = original_read_vsmap;
3592 VSM_CLR(write_vsmap);
3593 continue;
3594 }
3595 }
3596
3597 }
3598 cnt -= size;
3599 offset += size;
3600
3601 } /* END while (cnt && (error == 0)) */
3602 if(!VSM_ISCLR(write_vsmap))
3603 *vsmap_ptr = write_vsmap;
3604
1c79356b
A
3605 return error;
3606}
3607
3608kern_return_t
3609default_pager_add_file(MACH_PORT_FACE backing_store,
3610 int *vp,
3611 int record_size,
3612 long size)
3613{
3614 backing_store_t bs;
3615 paging_segment_t ps;
3616 int i;
3617 int error;
1c79356b
A
3618
3619 if ((bs = backing_store_lookup(backing_store))
3620 == BACKING_STORE_NULL)
3621 return KERN_INVALID_ARGUMENT;
3622
3623 PSL_LOCK();
3624 for (i = 0; i <= paging_segment_max; i++) {
3625 ps = paging_segments[i];
3626 if (ps == PAGING_SEGMENT_NULL)
3627 continue;
3628 if (ps->ps_segtype != PS_FILE)
3629 continue;
3630
3631 /*
3632 * Check for overlap on same device.
3633 */
3634 if (ps->ps_vnode == (struct vnode *)vp) {
3635 PSL_UNLOCK();
3636 BS_UNLOCK(bs);
3637 return KERN_INVALID_ARGUMENT;
3638 }
3639 }
3640 PSL_UNLOCK();
3641
3642 /*
3643 * Set up the paging segment
3644 */
3645 ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
3646 if (ps == PAGING_SEGMENT_NULL) {
3647 BS_UNLOCK(bs);
3648 return KERN_RESOURCE_SHORTAGE;
3649 }
3650
3651 ps->ps_segtype = PS_FILE;
3652 ps->ps_vnode = (struct vnode *)vp;
3653 ps->ps_offset = 0;
3654 ps->ps_record_shift = local_log2(vm_page_size / record_size);
3655 ps->ps_recnum = size;
3656 ps->ps_pgnum = size >> ps->ps_record_shift;
3657
3658 ps->ps_pgcount = ps->ps_pgnum;
3659 ps->ps_clshift = local_log2(bs->bs_clsize);
3660 ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
3661 ps->ps_hint = 0;
3662
3663 PS_LOCK_INIT(ps);
3664 ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
3665 if (!ps->ps_bmap) {
3666 kfree((vm_offset_t)ps, sizeof *ps);
3667 BS_UNLOCK(bs);
3668 return KERN_RESOURCE_SHORTAGE;
3669 }
3670 for (i = 0; i < ps->ps_ncls; i++) {
3671 clrbit(ps->ps_bmap, i);
3672 }
3673
3674 ps->ps_going_away = FALSE;
3675 ps->ps_bs = bs;
3676
3677 if ((error = ps_enter(ps)) != 0) {
3678 kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
3679 kfree((vm_offset_t)ps, sizeof *ps);
3680 BS_UNLOCK(bs);
3681 return KERN_RESOURCE_SHORTAGE;
3682 }
3683
3684 bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
3685 bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
3686 PSL_LOCK();
3687 dp_pages_free += ps->ps_pgcount;
3688 PSL_UNLOCK();
3689
3690 BS_UNLOCK(bs);
3691
3692 bs_more_space(ps->ps_clcount);
3693
3694 DEBUG(DEBUG_BS_INTERNAL,
3695 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
3696 device, offset, size, record_size,
3697 ps->ps_record_shift, ps->ps_pgnum));
3698
3699 return KERN_SUCCESS;
3700}
3701
3702
3703
1c79356b
A
3704kern_return_t
3705ps_read_file(
3706 paging_segment_t ps,
3707 upl_t upl,
0b4e3aa0 3708 vm_offset_t upl_offset,
1c79356b
A
3709 vm_offset_t offset,
3710 unsigned int size,
3711 unsigned int *residualp,
3712 int flags)
3713{
3714 vm_object_offset_t f_offset;
3715 int error = 0;
3716 int result;
1c79356b
A
3717
3718
de355530 3719 clustered_reads[atop(size)]++;
1c79356b
A
3720
3721 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3722
3723 /* for transfer case we need to pass uploffset and flags */
3724 error = vnode_pagein(ps->ps_vnode,
0b4e3aa0 3725 upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL);
1c79356b
A
3726
3727 /* The vnode_pagein semantic is somewhat at odds with the existing */
3728 /* device_read semantic. Partial reads are not experienced at this */
3729 /* level. It is up to the bit map code and cluster read code to */
3730 /* check that requested data locations are actually backed, and the */
3731 /* pagein code to either read all of the requested data or return an */
3732 /* error. */
3733
3734 if (error)
3735 result = KERN_FAILURE;
3736 else {
3737 *residualp = 0;
3738 result = KERN_SUCCESS;
3739 }
3740 return result;
1c79356b
A
3741}
3742
3743kern_return_t
3744ps_write_file(
3745 paging_segment_t ps,
3746 upl_t upl,
3747 vm_offset_t upl_offset,
3748 vm_offset_t offset,
3749 unsigned int size,
3750 int flags)
3751{
3752 vm_object_offset_t f_offset;
3753 kern_return_t result;
1c79356b
A
3754
3755 int error = 0;
3756
de355530 3757 clustered_writes[atop(size)]++;
1c79356b
A
3758 f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
3759
3760 if (vnode_pageout(ps->ps_vnode,
3761 upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL))
3762 result = KERN_FAILURE;
3763 else
3764 result = KERN_SUCCESS;
3765
3766 return result;
3767}
3768
3769kern_return_t
3770default_pager_triggers(MACH_PORT_FACE default_pager,
3771 int hi_wat,
3772 int lo_wat,
3773 int flags,
3774 MACH_PORT_FACE trigger_port)
3775{
0b4e3aa0
A
3776 MACH_PORT_FACE release;
3777 kern_return_t kr;
1c79356b 3778
0b4e3aa0
A
3779 PSL_LOCK();
3780 if (flags == HI_WAT_ALERT) {
3781 release = min_pages_trigger_port;
1c79356b
A
3782 min_pages_trigger_port = trigger_port;
3783 minimum_pages_remaining = hi_wat/vm_page_size;
3784 bs_low = FALSE;
0b4e3aa0
A
3785 kr = KERN_SUCCESS;
3786 } else if (flags == LO_WAT_ALERT) {
3787 release = max_pages_trigger_port;
1c79356b
A
3788 max_pages_trigger_port = trigger_port;
3789 maximum_pages_free = lo_wat/vm_page_size;
0b4e3aa0
A
3790 kr = KERN_SUCCESS;
3791 } else {
3792 release = trigger_port;
3793 kr = KERN_INVALID_ARGUMENT;
1c79356b 3794 }
0b4e3aa0
A
3795 PSL_UNLOCK();
3796
3797 if (IP_VALID(release))
3798 ipc_port_release_send(release);
3799
3800 return kr;
1c79356b 3801}