]> git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/i386/i386_vm_init.c
xnu-7195.101.1.tar.gz
[apple/xnu.git] / osfmk / i386 / i386_vm_init.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989, 1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57
58#include <mach/i386/vm_param.h>
59
60#include <string.h>
61#include <mach/vm_param.h>
62#include <mach/vm_prot.h>
63#include <mach/machine.h>
64#include <mach/time_value.h>
65#include <kern/spl.h>
66#include <kern/assert.h>
67#include <kern/debug.h>
68#include <kern/misc_protos.h>
69#include <kern/cpu_data.h>
70#include <kern/processor.h>
71#include <vm/vm_page.h>
72#include <vm/pmap.h>
73#include <vm/vm_kern.h>
74#include <i386/pmap.h>
75#include <i386/misc_protos.h>
76#include <i386/cpuid.h>
77#include <mach/thread_status.h>
78#include <pexpert/i386/efi.h>
79#include <pexpert/pexpert.h>
80#include <i386/i386_lowmem.h>
81#include <i386/misc_protos.h>
82#include <x86_64/lowglobals.h>
83#include <i386/pal_routines.h>
84
85#include <mach-o/loader.h>
86#include <libkern/kernel_mach_header.h>
87
88#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
89
90vm_size_t mem_size = 0;
91pmap_paddr_t first_avail = 0;/* first after page tables */
92
93uint64_t max_mem; /* Size of physical memory minus carveouts (bytes), adjusted by maxmem */
94uint64_t max_mem_actual; /* Actual size of physical memory (bytes) adjusted by
95 * the maxmem boot-arg */
96uint64_t mem_actual;
97uint64_t sane_size = 0; /* Memory size for defaults calculations */
98
99/*
100 * KASLR parameters
101 */
102ppnum_t vm_kernel_base_page;
103vm_offset_t vm_kernel_base;
104vm_offset_t vm_kernel_top;
105vm_offset_t vm_kernel_stext;
106vm_offset_t vm_kernel_etext;
107vm_offset_t vm_kernel_slide;
108vm_offset_t vm_kernel_slid_base;
109vm_offset_t vm_kernel_slid_top;
110vm_offset_t vm_hib_base;
111vm_offset_t vm_kext_base = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
112vm_offset_t vm_kext_top = VM_MIN_KERNEL_ADDRESS;
113
114vm_offset_t vm_prelink_stext;
115vm_offset_t vm_prelink_etext;
116vm_offset_t vm_prelink_sinfo;
117vm_offset_t vm_prelink_einfo;
118vm_offset_t vm_slinkedit;
119vm_offset_t vm_elinkedit;
120
121vm_offset_t vm_kernel_builtinkmod_text;
122vm_offset_t vm_kernel_builtinkmod_text_end;
123
124#define MAXLORESERVE (32 * 1024 * 1024)
125
126ppnum_t max_ppnum = 0;
127
128/*
129 * pmap_high_used* are the highest range of physical memory used for kernel
130 * internals (page tables, vm_pages) via pmap_steal_memory() that don't
131 * need to be encrypted in hibernation images. There can be one gap in
132 * the middle of this due to fragmentation when using a mix of small
133 * and large pages. In that case, the fragment lives between the high
134 * and middle ranges.
135 */
136ppnum_t pmap_high_used_top = 0;
137ppnum_t pmap_high_used_bottom = 0;
138ppnum_t pmap_middle_used_top = 0;
139ppnum_t pmap_middle_used_bottom = 0;
140
141enum {PMAP_MAX_RESERVED_RANGES = 32};
142uint32_t pmap_reserved_pages_allocated = 0;
143uint32_t pmap_reserved_range_indices[PMAP_MAX_RESERVED_RANGES];
144uint32_t pmap_last_reserved_range_index = 0;
145uint32_t pmap_reserved_ranges = 0;
146
147extern unsigned int bsd_mbuf_cluster_reserve(boolean_t *);
148
149pmap_paddr_t avail_start, avail_end;
150vm_offset_t virtual_avail, virtual_end;
151static pmap_paddr_t avail_remaining;
152vm_offset_t static_memory_end = 0;
153
154vm_offset_t sHIB, eHIB, stext, etext, sdata, edata, end, sconst, econst;
155
156/*
157 * _mh_execute_header is the mach_header for the currently executing kernel
158 */
159vm_offset_t segTEXTB; unsigned long segSizeTEXT;
160vm_offset_t segDATAB; unsigned long segSizeDATA;
161vm_offset_t segLINKB; unsigned long segSizeLINK;
162vm_offset_t segPRELINKTEXTB; unsigned long segSizePRELINKTEXT;
163vm_offset_t segPRELINKINFOB; unsigned long segSizePRELINKINFO;
164vm_offset_t segHIBB; unsigned long segSizeHIB;
165unsigned long segSizeConst;
166
167static kernel_segment_command_t *segTEXT, *segDATA;
168static kernel_section_t *cursectTEXT, *lastsectTEXT;
169static kernel_segment_command_t *segCONST;
170
171extern uint64_t firmware_Conventional_bytes;
172extern uint64_t firmware_RuntimeServices_bytes;
173extern uint64_t firmware_ACPIReclaim_bytes;
174extern uint64_t firmware_ACPINVS_bytes;
175extern uint64_t firmware_PalCode_bytes;
176extern uint64_t firmware_Reserved_bytes;
177extern uint64_t firmware_Unusable_bytes;
178extern uint64_t firmware_other_bytes;
179uint64_t firmware_MMIO_bytes;
180
181/*
182 * Linker magic to establish the highest address in the kernel.
183 */
184extern void *last_kernel_symbol;
185
186#define LG_PPNUM_PAGES (I386_LPGBYTES >> PAGE_SHIFT)
187#define LG_PPNUM_MASK (I386_LPGMASK >> PAGE_SHIFT)
188
189/* set so no region large page fragment pages exist */
190#define RESET_FRAG(r) (((r)->alloc_frag_up = 1), ((r)->alloc_frag_down = 0))
191
192boolean_t memmap = FALSE;
193#if DEBUG || DEVELOPMENT
194static void
195kprint_memmap(vm_offset_t maddr, unsigned int msize, unsigned int mcount)
196{
197 unsigned int i;
198 unsigned int j;
199 pmap_memory_region_t *p = pmap_memory_regions;
200 EfiMemoryRange *mptr;
201 addr64_t region_start, region_end;
202 addr64_t efi_start, efi_end;
203
204 for (j = 0; j < pmap_memory_region_count; j++, p++) {
205 kprintf("pmap region %d type %d base 0x%llx alloc_up 0x%llx alloc_down 0x%llx"
206 " alloc_frag_up 0x%llx alloc_frag_down 0x%llx top 0x%llx\n",
207 j, p->type,
208 (addr64_t) p->base << I386_PGSHIFT,
209 (addr64_t) p->alloc_up << I386_PGSHIFT,
210 (addr64_t) p->alloc_down << I386_PGSHIFT,
211 (addr64_t) p->alloc_frag_up << I386_PGSHIFT,
212 (addr64_t) p->alloc_frag_down << I386_PGSHIFT,
213 (addr64_t) p->end << I386_PGSHIFT);
214 region_start = (addr64_t) p->base << I386_PGSHIFT;
215 region_end = ((addr64_t) p->end << I386_PGSHIFT) - 1;
216 mptr = (EfiMemoryRange *) maddr;
217 for (i = 0;
218 i < mcount;
219 i++, mptr = (EfiMemoryRange *)(((vm_offset_t)mptr) + msize)) {
220 if (mptr->Type != kEfiLoaderCode &&
221 mptr->Type != kEfiLoaderData &&
222 mptr->Type != kEfiBootServicesCode &&
223 mptr->Type != kEfiBootServicesData &&
224 mptr->Type != kEfiConventionalMemory) {
225 efi_start = (addr64_t)mptr->PhysicalStart;
226 efi_end = efi_start + ((vm_offset_t)mptr->NumberOfPages << I386_PGSHIFT) - 1;
227 if ((efi_start >= region_start && efi_start <= region_end) ||
228 (efi_end >= region_start && efi_end <= region_end)) {
229 kprintf(" *** Overlapping region with EFI runtime region %d\n", i);
230 }
231 }
232 }
233 }
234}
235#define DPRINTF(x...) do { if (memmap) kprintf(x); } while (0)
236
237#else
238
239static void
240kprint_memmap(vm_offset_t maddr, unsigned int msize, unsigned int mcount)
241{
242#pragma unused(maddr, msize, mcount)
243}
244
245#define DPRINTF(x...)
246#endif /* DEBUG */
247
248/*
249 * Basic VM initialization.
250 */
251void
252i386_vm_init(uint64_t maxmem,
253 boolean_t IA32e,
254 boot_args *args)
255{
256 pmap_memory_region_t *pmptr;
257 pmap_memory_region_t *prev_pmptr;
258 EfiMemoryRange *mptr;
259 unsigned int mcount;
260 unsigned int msize;
261 vm_offset_t maddr;
262 ppnum_t fap;
263 unsigned int i;
264 ppnum_t maxpg = 0;
265 uint32_t pmap_type;
266 uint32_t maxloreserve;
267 uint32_t maxdmaaddr;
268 uint32_t mbuf_reserve = 0;
269 boolean_t mbuf_override = FALSE;
270 boolean_t coalescing_permitted;
271 vm_kernel_base_page = i386_btop(args->kaddr);
272 vm_offset_t base_address;
273 vm_offset_t static_base_address;
274
275 PE_parse_boot_argn("memmap", &memmap, sizeof(memmap));
276
277 /*
278 * Establish the KASLR parameters.
279 */
280 static_base_address = ml_static_ptovirt(KERNEL_BASE_OFFSET);
281 base_address = ml_static_ptovirt(args->kaddr);
282 vm_kernel_slide = base_address - static_base_address;
283 if (args->kslide) {
284 kprintf("KASLR slide: 0x%016lx dynamic\n", vm_kernel_slide);
285 if (vm_kernel_slide != ((vm_offset_t)args->kslide)) {
286 panic("Kernel base inconsistent with slide - rebased?");
287 }
288 } else {
289 /* No slide relative to on-disk symbols */
290 kprintf("KASLR slide: 0x%016lx static and ignored\n",
291 vm_kernel_slide);
292 vm_kernel_slide = 0;
293 }
294
295 /*
296 * Zero out local relocations to avoid confusing kxld.
297 * TODO: might be better to move this code to OSKext::initialize
298 */
299 if (_mh_execute_header.flags & MH_PIE) {
300 struct load_command *loadcmd;
301 uint32_t cmd;
302
303 loadcmd = (struct load_command *)((uintptr_t)&_mh_execute_header +
304 sizeof(_mh_execute_header));
305
306 for (cmd = 0; cmd < _mh_execute_header.ncmds; cmd++) {
307 if (loadcmd->cmd == LC_DYSYMTAB) {
308 struct dysymtab_command *dysymtab;
309
310 dysymtab = (struct dysymtab_command *)loadcmd;
311 dysymtab->nlocrel = 0;
312 dysymtab->locreloff = 0;
313 kprintf("Hiding local relocations\n");
314 break;
315 }
316 loadcmd = (struct load_command *)((uintptr_t)loadcmd + loadcmd->cmdsize);
317 }
318 }
319
320 /*
321 * Now retrieve addresses for end, edata, and etext
322 * from MACH-O headers.
323 */
324 segTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
325 "__TEXT", &segSizeTEXT);
326 segDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
327 "__DATA", &segSizeDATA);
328 segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
329 "__LINKEDIT", &segSizeLINK);
330 segHIBB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
331 "__HIB", &segSizeHIB);
332 segPRELINKTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
333 "__PRELINK_TEXT", &segSizePRELINKTEXT);
334 segPRELINKINFOB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
335 "__PRELINK_INFO", &segSizePRELINKINFO);
336 segTEXT = getsegbynamefromheader(&_mh_execute_header,
337 "__TEXT");
338 segDATA = getsegbynamefromheader(&_mh_execute_header,
339 "__DATA");
340 segCONST = getsegbynamefromheader(&_mh_execute_header,
341 "__DATA_CONST");
342 cursectTEXT = lastsectTEXT = firstsect(segTEXT);
343 /* Discover the last TEXT section within the TEXT segment */
344 while ((cursectTEXT = nextsect(segTEXT, cursectTEXT)) != NULL) {
345 lastsectTEXT = cursectTEXT;
346 }
347
348 sHIB = segHIBB;
349 eHIB = segHIBB + segSizeHIB;
350 vm_hib_base = sHIB;
351 /* Zero-padded from ehib to stext if text is 2M-aligned */
352 stext = segTEXTB;
353 lowGlo.lgStext = stext;
354 etext = (vm_offset_t) round_page_64(lastsectTEXT->addr + lastsectTEXT->size);
355 /* Zero-padded from etext to sdata if text is 2M-aligned */
356 sdata = segDATAB;
357 edata = segDATAB + segSizeDATA;
358
359 sconst = segCONST->vmaddr;
360 segSizeConst = segCONST->vmsize;
361 econst = sconst + segSizeConst;
362
363 kc_format_t kc_format = KCFormatUnknown;
364
365 /* XXX: FIXME_IN_dyld: For new-style kernel caches, the ending address of __DATA_CONST may not be page-aligned */
366 if (PE_get_primary_kc_format(&kc_format) && kc_format == KCFormatFileset) {
367 /* Round up the end */
368 econst = P2ROUNDUP(econst, PAGE_SIZE);
369 edata = P2ROUNDUP(edata, PAGE_SIZE);
370 } else {
371 assert(((sconst | econst) & PAGE_MASK) == 0);
372 assert(((sdata | edata) & PAGE_MASK) == 0);
373 }
374
375 DPRINTF("segTEXTB = %p\n", (void *) segTEXTB);
376 DPRINTF("segDATAB = %p\n", (void *) segDATAB);
377 DPRINTF("segLINKB = %p\n", (void *) segLINKB);
378 DPRINTF("segHIBB = %p\n", (void *) segHIBB);
379 DPRINTF("segPRELINKTEXTB = %p\n", (void *) segPRELINKTEXTB);
380 DPRINTF("segPRELINKINFOB = %p\n", (void *) segPRELINKINFOB);
381 DPRINTF("sHIB = %p\n", (void *) sHIB);
382 DPRINTF("eHIB = %p\n", (void *) eHIB);
383 DPRINTF("stext = %p\n", (void *) stext);
384 DPRINTF("etext = %p\n", (void *) etext);
385 DPRINTF("sdata = %p\n", (void *) sdata);
386 DPRINTF("edata = %p\n", (void *) edata);
387 DPRINTF("sconst = %p\n", (void *) sconst);
388 DPRINTF("econst = %p\n", (void *) econst);
389 DPRINTF("kernel_top = %p\n", (void *) &last_kernel_symbol);
390
391 vm_kernel_base = sHIB;
392 vm_kernel_top = (vm_offset_t) &last_kernel_symbol;
393 vm_kernel_stext = stext;
394 vm_kernel_etext = etext;
395 vm_prelink_stext = segPRELINKTEXTB;
396 vm_prelink_etext = segPRELINKTEXTB + segSizePRELINKTEXT;
397 vm_prelink_sinfo = segPRELINKINFOB;
398 vm_prelink_einfo = segPRELINKINFOB + segSizePRELINKINFO;
399 vm_slinkedit = segLINKB;
400 vm_elinkedit = segLINKB + segSizeLINK;
401
402 /*
403 * In the fileset world, we want to be able to (un)slide addresses from
404 * the kernel or any of the kexts (e.g., for kernel logging metadata
405 * passed between the kernel and logd in userspace). VM_KERNEL_UNSLIDE
406 * (via VM_KERNEL_IS_SLID) should apply to the addresses in the range
407 * from the first basement address to the last boot kc address.
408 *
409 * ^
410 * :
411 * |
412 * vm_kernel_slid_top - ---------------------------------------------
413 * |
414 * :
415 * : Boot kc (kexts in the boot kc here)
416 * : - - - - - - - - - - - - - - - - - - - - - - -
417 * :
418 * :
419 * | Boot kc (kernel here)
420 * - ---------------------------------------------
421 * |
422 * :
423 * | Basement (kexts in pageable and aux kcs here)
424 * vm_kernel_slid_base - ---------------------------------------------
425 * 0
426 */
427
428 vm_kernel_slid_base = vm_kext_base + vm_kernel_slide;
429 vm_kernel_slid_top = (kc_format == KCFormatFileset) ?
430 vm_slinkedit : vm_prelink_einfo;
431
432 vm_page_kernelcache_count = (unsigned int) (atop_64(vm_kernel_top - vm_kernel_base));
433
434 vm_set_page_size();
435
436 /*
437 * Compute the memory size.
438 */
439
440 avail_remaining = 0;
441 avail_end = 0;
442 pmptr = pmap_memory_regions;
443 prev_pmptr = 0;
444 pmap_memory_region_count = pmap_memory_region_current = 0;
445 fap = (ppnum_t) i386_btop(first_avail);
446
447 maddr = ml_static_ptovirt((vm_offset_t)args->MemoryMap);
448 mptr = (EfiMemoryRange *)maddr;
449 if (args->MemoryMapDescriptorSize == 0) {
450 panic("Invalid memory map descriptor size");
451 }
452 msize = args->MemoryMapDescriptorSize;
453 mcount = args->MemoryMapSize / msize;
454
455#define FOURGIG 0x0000000100000000ULL
456#define ONEGIG 0x0000000040000000ULL
457
458 for (i = 0; i < mcount; i++, mptr = (EfiMemoryRange *)(((vm_offset_t)mptr) + msize)) {
459 ppnum_t base, top;
460 uint64_t region_bytes = 0;
461
462 if (pmap_memory_region_count >= PMAP_MEMORY_REGIONS_SIZE) {
463 kprintf("WARNING: truncating memory region count at %d\n", pmap_memory_region_count);
464 break;
465 }
466 base = (ppnum_t) (mptr->PhysicalStart >> I386_PGSHIFT);
467 top = (ppnum_t) (((mptr->PhysicalStart) >> I386_PGSHIFT) + mptr->NumberOfPages - 1);
468
469 if (base == 0) {
470 /*
471 * Avoid having to deal with the edge case of the
472 * very first possible physical page and the roll-over
473 * to -1; just ignore that page.
474 */
475 kprintf("WARNING: ignoring first page in [0x%llx:0x%llx]\n", (uint64_t) base, (uint64_t) top);
476 base++;
477 }
478 if (top + 1 == 0) {
479 /*
480 * Avoid having to deal with the edge case of the
481 * very last possible physical page and the roll-over
482 * to 0; just ignore that page.
483 */
484 kprintf("WARNING: ignoring last page in [0x%llx:0x%llx]\n", (uint64_t) base, (uint64_t) top);
485 top--;
486 }
487 if (top < base) {
488 /*
489 * That was the only page in that region, so
490 * ignore the whole region.
491 */
492 continue;
493 }
494
495#if MR_RSV_TEST
496 static uint32_t nmr = 0;
497 if ((base > 0x20000) && (nmr++ < 4)) {
498 mptr->Attribute |= EFI_MEMORY_KERN_RESERVED;
499 }
500#endif
501 region_bytes = (uint64_t)(mptr->NumberOfPages << I386_PGSHIFT);
502 pmap_type = mptr->Type;
503
504 switch (mptr->Type) {
505 case kEfiLoaderCode:
506 case kEfiLoaderData:
507 case kEfiBootServicesCode:
508 case kEfiBootServicesData:
509 case kEfiConventionalMemory:
510 /*
511 * Consolidate usable memory types into one.
512 */
513 pmap_type = kEfiConventionalMemory;
514 sane_size += region_bytes;
515 firmware_Conventional_bytes += region_bytes;
516 break;
517 /*
518 * sane_size should reflect the total amount of physical
519 * RAM in the system, not just the amount that is
520 * available for the OS to use.
521 * We now get this value from SMBIOS tables
522 * rather than reverse engineering the memory map.
523 * But the legacy computation of "sane_size" is kept
524 * for diagnostic information.
525 */
526
527 case kEfiRuntimeServicesCode:
528 case kEfiRuntimeServicesData:
529 firmware_RuntimeServices_bytes += region_bytes;
530 sane_size += region_bytes;
531 break;
532 case kEfiACPIReclaimMemory:
533 firmware_ACPIReclaim_bytes += region_bytes;
534 sane_size += region_bytes;
535 break;
536 case kEfiACPIMemoryNVS:
537 firmware_ACPINVS_bytes += region_bytes;
538 sane_size += region_bytes;
539 break;
540 case kEfiPalCode:
541 firmware_PalCode_bytes += region_bytes;
542 sane_size += region_bytes;
543 break;
544
545 case kEfiReservedMemoryType:
546 firmware_Reserved_bytes += region_bytes;
547 break;
548 case kEfiUnusableMemory:
549 firmware_Unusable_bytes += region_bytes;
550 break;
551 case kEfiMemoryMappedIO:
552 case kEfiMemoryMappedIOPortSpace:
553 firmware_MMIO_bytes += region_bytes;
554 break;
555 default:
556 firmware_other_bytes += region_bytes;
557 break;
558 }
559
560 DPRINTF("EFI region %d: type %u/%d, base 0x%x, top 0x%x %s\n",
561 i, mptr->Type, pmap_type, base, top,
562 (mptr->Attribute & EFI_MEMORY_KERN_RESERVED)? "RESERVED" :
563 (mptr->Attribute & EFI_MEMORY_RUNTIME)? "RUNTIME" : "");
564
565 if (maxpg) {
566 if (base >= maxpg) {
567 break;
568 }
569 top = (top > maxpg) ? maxpg : top;
570 }
571
572 /*
573 * handle each region
574 */
575 if ((mptr->Attribute & EFI_MEMORY_RUNTIME) == EFI_MEMORY_RUNTIME ||
576 pmap_type != kEfiConventionalMemory) {
577 prev_pmptr = 0;
578 continue;
579 } else {
580 /*
581 * Usable memory region
582 */
583 if (top < I386_LOWMEM_RESERVED ||
584 !pal_is_usable_memory(base, top)) {
585 prev_pmptr = 0;
586 continue;
587 }
588 /*
589 * A range may be marked with with the
590 * EFI_MEMORY_KERN_RESERVED attribute
591 * on some systems, to indicate that the range
592 * must not be made available to devices.
593 */
594
595 if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
596 if (++pmap_reserved_ranges > PMAP_MAX_RESERVED_RANGES) {
597 panic("Too many reserved ranges %u\n", pmap_reserved_ranges);
598 }
599 }
600
601 if (top < fap) {
602 /*
603 * entire range below first_avail
604 * salvage some low memory pages
605 * we use some very low memory at startup
606 * mark as already allocated here
607 */
608 if (base >= I386_LOWMEM_RESERVED) {
609 pmptr->base = base;
610 } else {
611 pmptr->base = I386_LOWMEM_RESERVED;
612 }
613
614 pmptr->end = top;
615
616
617 if ((mptr->Attribute & EFI_MEMORY_KERN_RESERVED) &&
618 (top < vm_kernel_base_page)) {
619 pmptr->alloc_up = pmptr->base;
620 pmptr->alloc_down = pmptr->end;
621 RESET_FRAG(pmptr);
622 pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
623 } else {
624 /*
625 * mark as already mapped
626 */
627 pmptr->alloc_up = top + 1;
628 pmptr->alloc_down = top;
629 RESET_FRAG(pmptr);
630 }
631 pmptr->type = pmap_type;
632 pmptr->attribute = mptr->Attribute;
633 } else if ((base < fap) && (top > fap)) {
634 /*
635 * spans first_avail
636 * put mem below first avail in table but
637 * mark already allocated
638 */
639 pmptr->base = base;
640 pmptr->end = (fap - 1);
641 pmptr->alloc_up = pmptr->end + 1;
642 pmptr->alloc_down = pmptr->end;
643 RESET_FRAG(pmptr);
644 pmptr->type = pmap_type;
645 pmptr->attribute = mptr->Attribute;
646 /*
647 * we bump these here inline so the accounting
648 * below works correctly
649 */
650 pmptr++;
651 pmap_memory_region_count++;
652
653 pmptr->alloc_up = pmptr->base = fap;
654 pmptr->type = pmap_type;
655 pmptr->attribute = mptr->Attribute;
656 pmptr->alloc_down = pmptr->end = top;
657 RESET_FRAG(pmptr);
658
659 if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
660 pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
661 }
662 } else {
663 /*
664 * entire range useable
665 */
666 pmptr->alloc_up = pmptr->base = base;
667 pmptr->type = pmap_type;
668 pmptr->attribute = mptr->Attribute;
669 pmptr->alloc_down = pmptr->end = top;
670 RESET_FRAG(pmptr);
671 if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
672 pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
673 }
674 }
675
676 if (i386_ptob(pmptr->end) > avail_end) {
677 avail_end = i386_ptob(pmptr->end);
678 }
679
680 avail_remaining += (pmptr->end - pmptr->base);
681 coalescing_permitted = (prev_pmptr && (pmptr->attribute == prev_pmptr->attribute) && ((pmptr->attribute & EFI_MEMORY_KERN_RESERVED) == 0));
682 /*
683 * Consolidate contiguous memory regions, if possible
684 */
685 if (prev_pmptr &&
686 (pmptr->type == prev_pmptr->type) &&
687 (coalescing_permitted) &&
688 (pmptr->base == pmptr->alloc_up) &&
689 (prev_pmptr->end == prev_pmptr->alloc_down) &&
690 (pmptr->base == (prev_pmptr->end + 1))) {
691 prev_pmptr->end = pmptr->end;
692 prev_pmptr->alloc_down = pmptr->alloc_down;
693 RESET_FRAG(pmptr);
694 } else {
695 pmap_memory_region_count++;
696 prev_pmptr = pmptr;
697 pmptr++;
698 }
699 }
700 }
701
702 if (memmap) {
703 kprint_memmap(maddr, msize, mcount);
704 }
705
706 avail_start = first_avail;
707 mem_actual = args->PhysicalMemorySize;
708
709 /*
710 * For user visible memory size, round up to 128 Mb
711 * - accounting for the various stolen memory not reported by EFI.
712 * This is maintained for historical, comparison purposes but
713 * we now use the memory size reported by EFI/Booter.
714 */
715 sane_size = (sane_size + 128 * MB - 1) & ~((uint64_t)(128 * MB - 1));
716 if (sane_size != mem_actual) {
717 printf("mem_actual: 0x%llx\n legacy sane_size: 0x%llx\n",
718 mem_actual, sane_size);
719 }
720 sane_size = mem_actual;
721
722 /*
723 * We cap at KERNEL_MAXMEM bytes (currently 1536GB).
724 * Unless overriden by the maxmem= boot-arg
725 * -- which is a non-zero maxmem argument to this function.
726 */
727 if (maxmem == 0 && sane_size > KERNEL_MAXMEM) {
728 maxmem = KERNEL_MAXMEM;
729 printf("Physical memory %lld bytes capped at %dGB\n",
730 sane_size, (uint32_t) (KERNEL_MAXMEM / GB));
731 }
732
733 /*
734 * if user set maxmem, reduce memory sizes
735 */
736 if ((maxmem > (uint64_t)first_avail) && (maxmem < sane_size)) {
737 ppnum_t discarded_pages = (ppnum_t)((sane_size - maxmem) >> I386_PGSHIFT);
738 ppnum_t highest_pn = 0;
739 ppnum_t cur_end = 0;
740 uint64_t pages_to_use;
741 unsigned cur_region = 0;
742
743 sane_size = maxmem;
744
745 if (avail_remaining > discarded_pages) {
746 avail_remaining -= discarded_pages;
747 } else {
748 avail_remaining = 0;
749 }
750
751 pages_to_use = avail_remaining;
752
753 while (cur_region < pmap_memory_region_count && pages_to_use) {
754 for (cur_end = pmap_memory_regions[cur_region].base;
755 cur_end < pmap_memory_regions[cur_region].end && pages_to_use;
756 cur_end++) {
757 if (cur_end > highest_pn) {
758 highest_pn = cur_end;
759 }
760 pages_to_use--;
761 }
762 if (pages_to_use == 0) {
763 pmap_memory_regions[cur_region].end = cur_end;
764 pmap_memory_regions[cur_region].alloc_down = cur_end;
765 RESET_FRAG(&pmap_memory_regions[cur_region]);
766 }
767
768 cur_region++;
769 }
770 pmap_memory_region_count = cur_region;
771
772 avail_end = i386_ptob(highest_pn + 1);
773 }
774
775 /*
776 * mem_size is only a 32 bit container... follow the PPC route
777 * and pin it to a 2 Gbyte maximum
778 */
779 if (sane_size > (FOURGIG >> 1)) {
780 mem_size = (vm_size_t)(FOURGIG >> 1);
781 } else {
782 mem_size = (vm_size_t)sane_size;
783 }
784 max_mem = sane_size;
785 max_mem_actual = sane_size;
786
787 kprintf("Physical memory %llu MB\n", sane_size / MB);
788
789 max_valid_low_ppnum = (2 * GB) / PAGE_SIZE;
790
791 if (!PE_parse_boot_argn("max_valid_dma_addr", &maxdmaaddr, sizeof(maxdmaaddr))) {
792 max_valid_dma_address = (uint64_t)4 * (uint64_t)GB;
793 } else {
794 max_valid_dma_address = ((uint64_t) maxdmaaddr) * MB;
795
796 if ((max_valid_dma_address / PAGE_SIZE) < max_valid_low_ppnum) {
797 max_valid_low_ppnum = (ppnum_t)(max_valid_dma_address / PAGE_SIZE);
798 }
799 }
800 if (avail_end >= max_valid_dma_address) {
801 if (!PE_parse_boot_argn("maxloreserve", &maxloreserve, sizeof(maxloreserve))) {
802 if (sane_size >= (ONEGIG * 15)) {
803 maxloreserve = (MAXLORESERVE / PAGE_SIZE) * 4;
804 } else if (sane_size >= (ONEGIG * 7)) {
805 maxloreserve = (MAXLORESERVE / PAGE_SIZE) * 2;
806 } else {
807 maxloreserve = MAXLORESERVE / PAGE_SIZE;
808 }
809
810#if SOCKETS
811 mbuf_reserve = bsd_mbuf_cluster_reserve(&mbuf_override) / PAGE_SIZE;
812#endif
813 } else {
814 maxloreserve = (maxloreserve * (1024 * 1024)) / PAGE_SIZE;
815 }
816
817 if (maxloreserve) {
818 vm_lopage_free_limit = maxloreserve;
819
820 if (mbuf_override == TRUE) {
821 vm_lopage_free_limit += mbuf_reserve;
822 vm_lopage_lowater = 0;
823 } else {
824 vm_lopage_lowater = vm_lopage_free_limit / 16;
825 }
826
827 vm_lopage_refill = TRUE;
828 vm_lopage_needed = TRUE;
829 }
830 }
831
832 /*
833 * Initialize kernel physical map.
834 * Kernel virtual address starts at VM_KERNEL_MIN_ADDRESS.
835 */
836 kprintf("avail_remaining = 0x%lx\n", (unsigned long)avail_remaining);
837 pmap_bootstrap(0, IA32e);
838}
839
840
841unsigned int
842pmap_free_pages(void)
843{
844 return (unsigned int)avail_remaining;
845}
846
847boolean_t pmap_next_page_reserved(ppnum_t *);
848
849/*
850 * Pick a page from a "kernel private" reserved range; works around
851 * errata on some hardware. EFI marks pages which can't be used for
852 * certain kinds of I/O-ish activities as reserved. We reserve them for
853 * kernel internal usage and prevent them from ever going on regular
854 * free list.
855 */
856boolean_t
857pmap_next_page_reserved(
858 ppnum_t *pn)
859{
860 uint32_t n;
861 pmap_memory_region_t *region;
862 uint32_t reserved_index;
863
864 if (pmap_reserved_ranges) {
865 for (n = 0; n < pmap_last_reserved_range_index; n++) {
866 reserved_index = pmap_reserved_range_indices[n];
867 region = &pmap_memory_regions[reserved_index];
868 if (region->alloc_up <= region->alloc_down) {
869 *pn = region->alloc_up++;
870 } else if (region->alloc_frag_up <= region->alloc_frag_down) {
871 *pn = region->alloc_frag_up++;
872 } else {
873 continue;
874 }
875 avail_remaining--;
876
877 if (*pn > max_ppnum) {
878 max_ppnum = *pn;
879 }
880
881 pmap_reserved_pages_allocated++;
882#if DEBUG
883 if (region->alloc_up > region->alloc_down) {
884 kprintf("Exhausted reserved range index: %u, base: 0x%x end: 0x%x, type: 0x%x, attribute: 0x%llx\n", reserved_index, region->base, region->end, region->type, region->attribute);
885 }
886#endif
887 return TRUE;
888 }
889 }
890 return FALSE;
891}
892
893/*
894 * Return the highest large page available. Fails once there are no more large pages.
895 */
896kern_return_t
897pmap_next_page_large(
898 ppnum_t *pn)
899{
900 int r;
901 pmap_memory_region_t *region;
902 ppnum_t frag_start;
903 ppnum_t lgpg;
904
905 if (avail_remaining < LG_PPNUM_PAGES) {
906 return KERN_FAILURE;
907 }
908
909 for (r = pmap_memory_region_count - 1; r >= 0; r--) {
910 region = &pmap_memory_regions[r];
911
912 /*
913 * First check if there is enough memory.
914 */
915 if (region->alloc_down < region->alloc_up ||
916 (region->alloc_down - region->alloc_up + 1) < LG_PPNUM_PAGES) {
917 continue;
918 }
919
920 /*
921 * Find the starting large page, creating a fragment if needed.
922 */
923 if ((region->alloc_down & LG_PPNUM_MASK) == LG_PPNUM_MASK) {
924 lgpg = (region->alloc_down & ~LG_PPNUM_MASK);
925 } else {
926 /* Can only have 1 fragment per region at a time */
927 if (region->alloc_frag_up <= region->alloc_frag_down) {
928 continue;
929 }
930
931 /* Check for enough room below any fragment. */
932 frag_start = (region->alloc_down & ~LG_PPNUM_MASK);
933 if (frag_start < region->alloc_up ||
934 frag_start - region->alloc_up < LG_PPNUM_PAGES) {
935 continue;
936 }
937
938 lgpg = frag_start - LG_PPNUM_PAGES;
939 region->alloc_frag_up = frag_start;
940 region->alloc_frag_down = region->alloc_down;
941 }
942
943 *pn = lgpg;
944 region->alloc_down = lgpg - 1;
945
946
947 avail_remaining -= LG_PPNUM_PAGES;
948 if (*pn + LG_PPNUM_MASK > max_ppnum) {
949 max_ppnum = *pn + LG_PPNUM_MASK;
950 }
951
952 return KERN_SUCCESS;
953 }
954 return KERN_FAILURE;
955}
956
957boolean_t
958pmap_next_page_hi(
959 ppnum_t *pn,
960 boolean_t might_free)
961{
962 pmap_memory_region_t *region;
963 int n;
964
965 if (!might_free && pmap_next_page_reserved(pn)) {
966 return TRUE;
967 }
968
969 if (avail_remaining) {
970 for (n = pmap_memory_region_count - 1; n >= 0; n--) {
971 region = &pmap_memory_regions[n];
972 if (region->alloc_frag_up <= region->alloc_frag_down) {
973 *pn = region->alloc_frag_down--;
974 } else if (region->alloc_down >= region->alloc_up) {
975 *pn = region->alloc_down--;
976 } else {
977 continue;
978 }
979
980 avail_remaining--;
981
982 if (*pn > max_ppnum) {
983 max_ppnum = *pn;
984 }
985
986 return TRUE;
987 }
988 }
989 return FALSE;
990}
991
992/*
993 * Record which high pages have been allocated so far,
994 * so that pmap_init() can mark them PMAP_NOENCRYPT, which
995 * makes hibernation faster.
996 *
997 * Because of the code in pmap_next_page_large(), we could
998 * theoretically have fragments in several regions.
999 * In practice that just doesn't happen. The last pmap region
1000 * is normally the largest and will satisfy all pmap_next_hi/large()
1001 * allocations. Since this information is used as an optimization
1002 * and it's ok to be conservative, we'll just record the information
1003 * for the final region.
1004 */
1005void
1006pmap_hi_pages_done(void)
1007{
1008 pmap_memory_region_t *r;
1009
1010 r = &pmap_memory_regions[pmap_memory_region_count - 1];
1011 pmap_high_used_top = r->end;
1012 if (r->alloc_frag_up <= r->alloc_frag_down) {
1013 pmap_high_used_bottom = r->alloc_frag_down + 1;
1014 pmap_middle_used_top = r->alloc_frag_up - 1;
1015 if (r->alloc_up <= r->alloc_down) {
1016 pmap_middle_used_bottom = r->alloc_down + 1;
1017 } else {
1018 pmap_high_used_bottom = r->base;
1019 }
1020 } else {
1021 if (r->alloc_up <= r->alloc_down) {
1022 pmap_high_used_bottom = r->alloc_down + 1;
1023 } else {
1024 pmap_high_used_bottom = r->base;
1025 }
1026 }
1027#if DEBUG || DEVELOPMENT
1028 kprintf("pmap_high_used_top 0x%x\n", pmap_high_used_top);
1029 kprintf("pmap_high_used_bottom 0x%x\n", pmap_high_used_bottom);
1030 kprintf("pmap_middle_used_top 0x%x\n", pmap_middle_used_top);
1031 kprintf("pmap_middle_used_bottom 0x%x\n", pmap_middle_used_bottom);
1032#endif
1033}
1034
1035/*
1036 * Return the next available page from lowest memory for general use.
1037 */
1038boolean_t
1039pmap_next_page(
1040 ppnum_t *pn)
1041{
1042 pmap_memory_region_t *region;
1043
1044 if (avail_remaining) {
1045 while (pmap_memory_region_current < pmap_memory_region_count) {
1046 region = &pmap_memory_regions[pmap_memory_region_current];
1047 if (region->alloc_up <= region->alloc_down) {
1048 *pn = region->alloc_up++;
1049 } else if (region->alloc_frag_up <= region->alloc_frag_down) {
1050 *pn = region->alloc_frag_up++;
1051 } else {
1052 pmap_memory_region_current++;
1053 continue;
1054 }
1055 avail_remaining--;
1056
1057 if (*pn > max_ppnum) {
1058 max_ppnum = *pn;
1059 }
1060
1061 return TRUE;
1062 }
1063 }
1064 return FALSE;
1065}
1066
1067
1068boolean_t
1069pmap_valid_page(
1070 ppnum_t pn)
1071{
1072 unsigned int i;
1073 pmap_memory_region_t *pmptr = pmap_memory_regions;
1074
1075 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
1076 if ((pn >= pmptr->base) && (pn <= pmptr->end)) {
1077 return TRUE;
1078 }
1079 }
1080 return FALSE;
1081}