2 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* Copyright (c) 1998 Apple Computer, Inc. All rights reserved.
30 * File: bsd/kern/kern_symfile.c
35 #include <mach/vm_param.h>
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/signalvar.h>
40 #include <sys/resourcevar.h>
41 #include <sys/namei.h>
42 #include <sys/vnode_internal.h>
43 #include <sys/proc_internal.h>
44 #include <sys/kauth.h>
45 #include <sys/timeb.h>
46 #include <sys/times.h>
48 #include <sys/file_internal.h>
50 #include <sys/kernel.h>
54 #include <sys/content_protection.h>
55 #include <sys/fsctl.h>
57 #include <mach-o/loader.h>
58 #include <mach-o/nlist.h>
60 #include <kern/kalloc.h>
61 #include <vm/vm_kern.h>
62 #include <pexpert/pexpert.h>
63 #include <IOKit/IOPolledInterface.h>
65 #define HIBERNATE_MIN_PHYSICAL_LBA_512 (34)
66 #define HIBERNATE_MIN_PHYSICAL_LBA_4096 (6)
67 #define HIBERNATE_MIN_FILE_SIZE (1024*1024)
69 /* This function is called from kern_sysctl in the current process context;
70 * it is exported with the System6.0.exports, but this appears to be a legacy
71 * export, as there are no internal consumers.
74 get_kernel_symfile(__unused proc_t p
, __unused
char const **symfile
);
76 get_kernel_symfile(__unused proc_t p
, __unused
char const **symfile
)
81 struct kern_direct_file_io_ref_t
{
95 file_ioctl(void * p1
, void * p2
, u_long theIoctl
, caddr_t result
)
97 dev_t device
= *(dev_t
*) p1
;
99 return (*bdevsw
[major(device
)].d_ioctl
)
100 (device
, theIoctl
, result
, S_IFBLK
, p2
);
104 device_ioctl(void * p1
, __unused
void * p2
, u_long theIoctl
, caddr_t result
)
106 return VNOP_IOCTL(p1
, theIoctl
, result
, 0, p2
);
110 kern_ioctl_file_extents(struct kern_direct_file_io_ref_t
* ref
, u_long theIoctl
, off_t offset
, off_t end
)
113 int (*do_ioctl
)(void * p1
, void * p2
, u_long theIoctl
, caddr_t result
);
122 bzero(&extent
, sizeof(dk_extent_t
));
123 bzero(&unmap
, sizeof(dk_unmap_t
));
124 bzero(&pin
, sizeof(pin
));
125 if (ref
->vp
->v_type
== VREG
) {
128 do_ioctl
= &file_ioctl
;
133 do_ioctl
= &device_ioctl
;
136 if (_DKIOCCSPINEXTENT
== theIoctl
) {
137 /* Tell CS the image size, so it knows whether to place the subsequent pins SSD/HDD */
138 pin
.cp_extent
.length
= end
;
139 pin
.cp_flags
= _DKIOCCSHIBERNATEIMGSIZE
;
140 (void) do_ioctl(p1
, p2
, _DKIOCCSPINEXTENT
, (caddr_t
)&pin
);
141 } else if (_DKIOCCSUNPINEXTENT
== theIoctl
) {
142 /* Tell CS hibernation is done, so it can stop blocking overlapping writes */
143 pin
.cp_flags
= _DKIOCCSPINDISCARDBLACKLIST
;
144 (void) do_ioctl(p1
, p2
, _DKIOCCSUNPINEXTENT
, (caddr_t
)&pin
);
147 for (; offset
< end
; offset
+= filechunk
) {
148 if (ref
->vp
->v_type
== VREG
) {
150 filechunk
= 1 * 1024 * 1024 * 1024;
151 if (filechunk
> (size_t)(end
- offset
)) {
152 filechunk
= (size_t)(end
- offset
);
154 error
= VNOP_BLOCKMAP(ref
->vp
, offset
, filechunk
, &blkno
,
155 &filechunk
, NULL
, VNODE_WRITE
| VNODE_BLOCKMAP_NO_TRACK
, NULL
);
162 fileblk
= blkno
* ref
->blksize
;
163 } else if ((ref
->vp
->v_type
== VBLK
) || (ref
->vp
->v_type
== VCHR
)) {
165 filechunk
= (unsigned long)((ref
->filelength
> ULONG_MAX
) ? ULONG_MAX
: ref
->filelength
);
168 if (DKIOCUNMAP
== theIoctl
) {
169 extent
.offset
= fileblk
;
170 extent
.length
= filechunk
;
171 unmap
.extents
= &extent
;
172 unmap
.extentsCount
= 1;
173 error
= do_ioctl(p1
, p2
, theIoctl
, (caddr_t
)&unmap
);
174 // printf("DKIOCUNMAP(%d) 0x%qx, 0x%qx\n", error, extent.offset, extent.length);
175 } else if (_DKIOCCSPINEXTENT
== theIoctl
) {
176 pin
.cp_extent
.offset
= fileblk
;
177 pin
.cp_extent
.length
= filechunk
;
178 pin
.cp_flags
= _DKIOCCSPINFORHIBERNATION
;
179 error
= do_ioctl(p1
, p2
, theIoctl
, (caddr_t
)&pin
);
180 if (error
&& (ENOTTY
!= error
)) {
181 printf("_DKIOCCSPINEXTENT(%d) 0x%qx, 0x%qx\n", error
, pin
.cp_extent
.offset
, pin
.cp_extent
.length
);
183 } else if (_DKIOCCSUNPINEXTENT
== theIoctl
) {
184 pin
.cp_extent
.offset
= fileblk
;
185 pin
.cp_extent
.length
= filechunk
;
186 pin
.cp_flags
= _DKIOCCSPINFORHIBERNATION
;
187 error
= do_ioctl(p1
, p2
, theIoctl
, (caddr_t
)&pin
);
188 if (error
&& (ENOTTY
!= error
)) {
189 printf("_DKIOCCSUNPINEXTENT(%d) 0x%qx, 0x%qx\n", error
, pin
.cp_extent
.offset
, pin
.cp_extent
.length
);
202 extern uint32_t freespace_mb(vnode_t vp
);
204 struct kern_direct_file_io_ref_t
*
205 kern_open_file_for_direct_io(const char * name
,
207 kern_get_file_extents_callback_t callback
,
211 off_t write_file_offset
,
212 void * write_file_addr
,
213 size_t write_file_len
,
214 dev_t
* partition_device_result
,
215 dev_t
* image_device_result
,
216 uint64_t * partitionbase_result
,
217 uint64_t * maxiocount_result
,
220 struct kern_direct_file_io_ref_t
* ref
;
223 struct vnode_attr va
;
224 dk_apfs_wbc_range_t wbc_range
;
229 uint64_t physoffset
, minoffset
;
235 off_t maxiocount
, count
, segcount
, wbctotal
;
236 boolean_t locked
= FALSE
;
243 int (*do_ioctl
)(void * p1
, void * p2
, u_long theIoctl
, caddr_t result
);
249 ref
= (struct kern_direct_file_io_ref_t
*) kalloc(sizeof(struct kern_direct_file_io_ref_t
));
255 bzero(ref
, sizeof(*ref
));
257 ref
->ctx
= vfs_context_kernel();
259 fmode
= (kIOPolledFileCreate
& iflags
) ? (O_CREAT
| FWRITE
) : FWRITE
;
260 cmode
= S_IRUSR
| S_IWUSR
;
262 NDINIT(&nd
, LOOKUP
, OP_OPEN
, ndflags
, UIO_SYSSPACE
, CAST_USER_ADDR_T(name
), ref
->ctx
);
264 VATTR_SET(&va
, va_mode
, cmode
);
265 VATTR_SET(&va
, va_dataprotect_flags
, VA_DP_RAWENCRYPTED
);
266 VATTR_SET(&va
, va_dataprotect_class
, PROTECTION_CLASS_D
);
267 if ((error
= vn_open_auth(&nd
, &fmode
, &va
))) {
268 kprintf("vn_open_auth(fmode: %d, cmode: %d) failed with error: %d\n", fmode
, cmode
, error
);
273 if (ref
->vp
->v_type
== VREG
) {
274 vnode_lock_spin(ref
->vp
);
275 SET(ref
->vp
->v_flag
, VSWAP
);
276 vnode_unlock(ref
->vp
);
279 if (write_file_addr
&& write_file_len
) {
280 if ((error
= kern_write_file(ref
, write_file_offset
, write_file_addr
, write_file_len
, IO_SKIP_ENCRYPTION
))) {
281 kprintf("kern_write_file() failed with error: %d\n", error
);
287 VATTR_WANTED(&va
, va_rdev
);
288 VATTR_WANTED(&va
, va_fsid
);
289 VATTR_WANTED(&va
, va_devid
);
290 VATTR_WANTED(&va
, va_data_size
);
291 VATTR_WANTED(&va
, va_data_alloc
);
292 VATTR_WANTED(&va
, va_nlink
);
294 if (vnode_getattr(ref
->vp
, &va
, ref
->ctx
)) {
299 mpFree
= freespace_mb(ref
->vp
);
301 kprintf("kern_direct_file(%s): vp size %qd, alloc %qd, mp free %qd, keep free %qd\n",
302 name
, va
.va_data_size
, va
.va_data_alloc
, mpFree
, fs_free_size
);
304 if (ref
->vp
->v_type
== VREG
) {
305 /* Don't dump files with links. */
306 if (va
.va_nlink
!= 1) {
310 device
= (VATTR_IS_SUPPORTED(&va
, va_devid
)) ? va
.va_devid
: va
.va_fsid
;
311 ref
->filelength
= va
.va_data_size
;
315 do_ioctl
= &file_ioctl
;
317 if (kIOPolledFileHibernate
& iflags
) {
318 error
= do_ioctl(p1
, p2
, DKIOCAPFSGETWBCRANGE
, (caddr_t
) &wbc_range
);
319 ref
->wbcranged
= (error
== 0);
321 if (ref
->wbcranged
) {
323 assert(wbc_range
.count
<= (sizeof(wbc_range
.extents
) / sizeof(wbc_range
.extents
[0])));
324 for (idx
= 0; idx
< wbc_range
.count
; idx
++) {
325 wbctotal
+= wbc_range
.extents
[idx
].length
;
327 kprintf("kern_direct_file(%s): wbc %qd\n", name
, wbctotal
);
329 target
= wbc_range
.dev
;
335 if (wbctotal
>= set_file_size
) {
336 set_file_size
= HIBERNATE_MIN_FILE_SIZE
;
338 set_file_size
-= wbctotal
;
339 if (set_file_size
< HIBERNATE_MIN_FILE_SIZE
) {
340 set_file_size
= HIBERNATE_MIN_FILE_SIZE
;
345 mpFree
+= va
.va_data_alloc
;
346 if ((mpFree
< set_file_size
) || ((mpFree
- set_file_size
) < fs_free_size
)) {
351 error
= vnode_setsize(ref
->vp
, set_file_size
, IO_NOZEROFILL
| IO_NOAUTH
, ref
->ctx
);
355 ref
->filelength
= set_file_size
;
357 } else if ((ref
->vp
->v_type
== VBLK
) || (ref
->vp
->v_type
== VCHR
)) {
363 do_ioctl
= &device_ioctl
;
365 /* Don't dump to non-regular files. */
369 ref
->device
= device
;
372 dk_corestorage_info_t cs_info
;
373 memset(&cs_info
, 0, sizeof(dk_corestorage_info_t
));
374 error
= do_ioctl(p1
, p2
, DKIOCCORESTORAGE
, (caddr_t
)&cs_info
);
375 ref
->cf
= (error
== 0) && (cs_info
.flags
& DK_CORESTORAGE_ENABLE_HOTFILES
);
379 error
= do_ioctl(p1
, p2
, DKIOCGETBLOCKSIZE
, (caddr_t
) &ref
->blksize
);
384 if (ref
->blksize
== 4096) {
385 minoffset
= HIBERNATE_MIN_PHYSICAL_LBA_4096
* ref
->blksize
;
387 minoffset
= HIBERNATE_MIN_PHYSICAL_LBA_512
* ref
->blksize
;
390 if (ref
->vp
->v_type
!= VREG
) {
391 error
= do_ioctl(p1
, p2
, DKIOCGETBLOCKCOUNT
, (caddr_t
) &fileblk
);
395 ref
->filelength
= fileblk
* ref
->blksize
;
398 // pin logical extents, CS version
400 error
= kern_ioctl_file_extents(ref
, _DKIOCCSPINEXTENT
, 0, ref
->filelength
);
401 if (error
&& (ENOTTY
!= error
)) {
404 ref
->pinned
= (error
== 0);
406 // pin logical extents, apfs version
408 error
= VNOP_IOCTL(ref
->vp
, FSCTL_FREEZE_EXTENTS
, NULL
, 0, ref
->ctx
);
409 if (error
&& (ENOTTY
!= error
)) {
412 ref
->frozen
= (error
== 0);
414 // generate the block list
416 error
= do_ioctl(p1
, p2
, DKIOCLOCKPHYSICALEXTENTS
, NULL
);
423 for (; f_offset
< ref
->filelength
; f_offset
+= filechunk
) {
424 if (ref
->vp
->v_type
== VREG
) {
425 filechunk
= 1 * 1024 * 1024 * 1024;
428 error
= VNOP_BLOCKMAP(ref
->vp
, f_offset
, filechunk
, &blkno
,
429 &filechunk
, NULL
, VNODE_WRITE
| VNODE_BLOCKMAP_NO_TRACK
, NULL
);
436 fileblk
= blkno
* ref
->blksize
;
437 } else if ((ref
->vp
->v_type
== VBLK
) || (ref
->vp
->v_type
== VCHR
)) {
439 filechunk
= f_offset
? 0 : (unsigned long)ref
->filelength
;
443 while (physoffset
< filechunk
) {
444 dk_physical_extent_t getphysreq
;
445 bzero(&getphysreq
, sizeof(getphysreq
));
447 getphysreq
.offset
= fileblk
+ physoffset
;
448 getphysreq
.length
= (filechunk
- physoffset
);
449 error
= do_ioctl(p1
, p2
, DKIOCGETPHYSICALEXTENT
, (caddr_t
) &getphysreq
);
454 target
= getphysreq
.dev
;
455 } else if (target
!= getphysreq
.dev
) {
460 assert(getphysreq
.offset
>= minoffset
);
464 for (rev
= 4096; rev
<= getphysreq
.length
; rev
+= 4096) {
465 callback(callback_ref
, getphysreq
.offset
+ getphysreq
.length
- rev
, 4096);
468 callback(callback_ref
, getphysreq
.offset
, getphysreq
.length
);
470 physoffset
+= getphysreq
.length
;
473 if (ref
->wbcranged
) {
475 for (idx
= 0; idx
< wbc_range
.count
; idx
++) {
476 assert(wbc_range
.extents
[idx
].offset
>= minoffset
);
477 callback(callback_ref
, wbc_range
.extents
[idx
].offset
, wbc_range
.extents
[idx
].length
);
480 callback(callback_ref
, 0ULL, 0ULL);
482 if (ref
->vp
->v_type
== VREG
) {
487 do_ioctl
= &file_ioctl
;
490 // get partition base
492 if (partitionbase_result
) {
493 error
= do_ioctl(p1
, p2
, DKIOCGETBASE
, (caddr_t
) partitionbase_result
);
499 // get block size & constraints
501 error
= do_ioctl(p1
, p2
, DKIOCGETBLOCKSIZE
, (caddr_t
) &blksize
);
506 maxiocount
= 1 * 1024 * 1024 * 1024;
508 error
= do_ioctl(p1
, p2
, DKIOCGETMAXBLOCKCOUNTREAD
, (caddr_t
) &count
);
513 if (count
&& (count
< maxiocount
)) {
517 error
= do_ioctl(p1
, p2
, DKIOCGETMAXBLOCKCOUNTWRITE
, (caddr_t
) &count
);
522 if (count
&& (count
< maxiocount
)) {
526 error
= do_ioctl(p1
, p2
, DKIOCGETMAXBYTECOUNTREAD
, (caddr_t
) &count
);
530 if (count
&& (count
< maxiocount
)) {
534 error
= do_ioctl(p1
, p2
, DKIOCGETMAXBYTECOUNTWRITE
, (caddr_t
) &count
);
538 if (count
&& (count
< maxiocount
)) {
542 error
= do_ioctl(p1
, p2
, DKIOCGETMAXSEGMENTBYTECOUNTREAD
, (caddr_t
) &count
);
544 error
= do_ioctl(p1
, p2
, DKIOCGETMAXSEGMENTCOUNTREAD
, (caddr_t
) &segcount
);
547 count
= segcount
= 0;
550 if (count
&& (count
< maxiocount
)) {
554 error
= do_ioctl(p1
, p2
, DKIOCGETMAXSEGMENTBYTECOUNTWRITE
, (caddr_t
) &count
);
556 error
= do_ioctl(p1
, p2
, DKIOCGETMAXSEGMENTCOUNTWRITE
, (caddr_t
) &segcount
);
559 count
= segcount
= 0;
562 if (count
&& (count
< maxiocount
)) {
566 kprintf("max io 0x%qx bytes\n", maxiocount
);
567 if (maxiocount_result
) {
568 *maxiocount_result
= maxiocount
;
571 error
= do_ioctl(p1
, p2
, DKIOCISSOLIDSTATE
, (caddr_t
)&isssd
);
572 if (!error
&& isssd
) {
573 flags
|= kIOPolledFileSSD
;
576 if (partition_device_result
) {
577 *partition_device_result
= device
;
579 if (image_device_result
) {
580 *image_device_result
= target
;
586 if ((ref
->vp
->v_type
== VBLK
) || (ref
->vp
->v_type
== VCHR
)) {
587 vnode_close(ref
->vp
, FWRITE
, ref
->ctx
);
593 printf("kern_open_file_for_direct_io(%p, %d)\n", ref
, error
);
596 if (error
&& locked
) {
598 (void) do_ioctl(p1
, p2
, DKIOCUNLOCKPHYSICALEXTENTS
, NULL
);
603 (void) kern_ioctl_file_extents(ref
, _DKIOCCSUNPINEXTENT
, 0, (ref
->pinned
&& ref
->cf
) ? ref
->filelength
: 0);
606 (void) VNOP_IOCTL(ref
->vp
, FSCTL_THAW_EXTENTS
, NULL
, 0, ref
->ctx
);
608 if (ref
->wbcranged
) {
609 (void) do_ioctl(p1
, p2
, DKIOCAPFSRELEASEWBCRANGE
, (caddr_t
) NULL
);
611 vnode_close(ref
->vp
, FWRITE
, ref
->ctx
);
615 kfree(ref
, sizeof(struct kern_direct_file_io_ref_t
));
623 kern_write_file(struct kern_direct_file_io_ref_t
* ref
, off_t offset
, void * addr
, size_t len
, int ioflag
)
625 assert(len
<= INT32_MAX
);
626 return vn_rdwr(UIO_WRITE
, ref
->vp
,
627 addr
, (int)len
, offset
,
628 UIO_SYSSPACE
, ioflag
| IO_SYNC
| IO_NODELOCKED
| IO_UNIT
,
629 vfs_context_ucred(ref
->ctx
), (int *) 0,
630 vfs_context_proc(ref
->ctx
));
634 kern_read_file(struct kern_direct_file_io_ref_t
* ref
, off_t offset
, void * addr
, size_t len
, int ioflag
)
636 assert(len
<= INT32_MAX
);
637 return vn_rdwr(UIO_READ
, ref
->vp
,
638 addr
, (int)len
, offset
,
639 UIO_SYSSPACE
, ioflag
| IO_SYNC
| IO_NODELOCKED
| IO_UNIT
,
640 vfs_context_ucred(ref
->ctx
), (int *) 0,
641 vfs_context_proc(ref
->ctx
));
646 kern_file_mount(struct kern_direct_file_io_ref_t
* ref
)
648 return ref
->vp
->v_mount
;
652 kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t
* ref
,
653 off_t write_offset
, void * addr
, size_t write_length
,
654 off_t discard_offset
, off_t discard_end
)
657 printf("kern_close_file_for_direct_io(%p)\n", ref
);
664 int (*do_ioctl
)(void * p1
, void * p2
, u_long theIoctl
, caddr_t result
);
668 discard_offset
= ((discard_offset
+ ref
->blksize
- 1) & ~(((off_t
) ref
->blksize
) - 1));
669 discard_end
= ((discard_end
) & ~(((off_t
) ref
->blksize
) - 1));
671 if (ref
->vp
->v_type
== VREG
) {
674 do_ioctl
= &file_ioctl
;
679 do_ioctl
= &device_ioctl
;
681 (void) do_ioctl(p1
, p2
, DKIOCUNLOCKPHYSICALEXTENTS
, NULL
);
683 //XXX If unmapping extents then don't also need to unpin; except ...
684 //XXX if file unaligned (HFS 4k / Fusion 128k) then pin is superset and
685 //XXX unmap is subset, so save extra walk over file extents (and the risk
686 //XXX that CF drain starts) vs leaving partial units pinned to SSD
687 //XXX (until whatever was sharing also unmaps). Err on cleaning up fully.
688 boolean_t will_unmap
= (!ref
->pinned
|| ref
->cf
) && (discard_end
> discard_offset
);
689 boolean_t will_unpin
= (ref
->pinned
&& ref
->cf
/* && !will_unmap */);
691 (void) kern_ioctl_file_extents(ref
, _DKIOCCSUNPINEXTENT
, 0, (will_unpin
) ? ref
->filelength
: 0);
694 (void) kern_ioctl_file_extents(ref
, DKIOCUNMAP
, discard_offset
, (ref
->cf
) ? ref
->filelength
: discard_end
);
698 (void) VNOP_IOCTL(ref
->vp
, FSCTL_THAW_EXTENTS
, NULL
, 0, ref
->ctx
);
700 if (ref
->wbcranged
) {
701 (void) do_ioctl(p1
, p2
, DKIOCAPFSRELEASEWBCRANGE
, (caddr_t
) NULL
);
704 if (addr
&& write_length
) {
705 (void) kern_write_file(ref
, write_offset
, addr
, write_length
, IO_SKIP_ENCRYPTION
);
708 error
= vnode_close(ref
->vp
, FWRITE
, ref
->ctx
);
711 kprintf("vnode_close(%d)\n", error
);
717 kfree(ref
, sizeof(struct kern_direct_file_io_ref_t
));