/*
 * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/fsctl.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/buf_internal.h>

#include <kern/assert.h>
#include <kern/kalloc.h>

#include <sys/kauth.h>
#include <IOKit/IOBSD.h>

#include <vfs/vfs_disk_conditioner.h>

#define DISK_CONDITIONER_SET_ENTITLEMENT "com.apple.private.dmc.set"

// number of total blocks for a mount
#define BLK_MAX(mp) ((mp->mnt_vfsstat.f_blocks * mp->mnt_vfsstat.f_bsize) / (mp->mnt_devblocksize))

// approx. time to spin up an idle HDD
#define DISK_SPINUP_SEC (8)

// idle period after which the disk is assumed to have spun down
#define DISK_IDLE_SEC (10 * 60)
struct saved_mount_fields {
	uint32_t mnt_maxreadcnt;    /* Max. byte count for read */
	uint32_t mnt_maxwritecnt;   /* Max. byte count for write */
	uint32_t mnt_segreadcnt;    /* Max. segment count for read */
	uint32_t mnt_segwritecnt;   /* Max. segment count for write */
	uint32_t mnt_ioqueue_depth; /* the maximum number of commands a device can accept */
	uint32_t mnt_ioscale;       /* scale the various throttles/limits imposed on the amount of I/O in flight */
};
struct _disk_conditioner_info_t {
	disk_conditioner_info dcinfo; // all the original data from fsctl
	struct saved_mount_fields mnt_fields; // fields to restore in mount_t when conditioner is disabled

	daddr64_t last_blkno; // approx. last transferred block, for simulating seek times
	struct timeval last_io_timestamp; // the last time an I/O completed
};
void disk_conditioner_delay(buf_t, int, int, uint64_t);
void disk_conditioner_unmount(mount_t mp);

extern void throttle_info_mount_reset_period(mount_t, int isssd);
static double
weighted_scale_factor(double scale)
{
	// 0 to 1, increasing quickly from 0. This weights smaller blkdiffs higher to add a type of minimum latency.
	// I would like to use log(10) / 2.0 + 1, but am using a different approximation since no math library is available.
	double x_m1 = scale - 1;
	return x_m1 * x_m1 * x_m1 + 1;
}
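// For intuition: the cubic (scale - 1)^3 + 1 rises steeply from the origin
// and flattens toward 1, so even a short seek pays a sizable fraction of
// the full access time:
//   weighted_scale_factor(0.0) == 0.0
//   weighted_scale_factor(0.1) == 0.271
//   weighted_scale_factor(0.5) == 0.875
//   weighted_scale_factor(1.0) == 1.0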
void
disk_conditioner_delay(buf_t bp, int extents, int total_size, uint64_t already_elapsed_usec)
{
	mount_t mp;
	uint64_t delay_usec;
	daddr64_t blkdiff;
	daddr64_t last_blkno;
	double access_time_scale;
	struct _disk_conditioner_info_t *internal_info = NULL;
	disk_conditioner_info *info = NULL;
	struct timeval elapsed;
	struct timeval start;
	vnode_t vp;

	vp = buf_vnode(bp);
	if (!vp) {
		return;
	}

	mp = vp->v_mount;
	if (!mp) {
		return;
	}

	internal_info = mp->mnt_disk_conditioner_info;
	if (!internal_info || !internal_info->dcinfo.enabled) {
		return;
	}
	info = &(internal_info->dcinfo);

	if (!info->is_ssd) {
		// calculate approximate seek time based on difference in block number
		last_blkno = internal_info->last_blkno;
		blkdiff = bp->b_blkno > last_blkno ? bp->b_blkno - last_blkno : last_blkno - bp->b_blkno;
		internal_info->last_blkno = bp->b_blkno + bp->b_bcount;
	} else {
		// SSDs have no seek penalty; use the maximum distance so the scale factor is constant
		blkdiff = BLK_MAX(mp);
	}

	// scale access time by (distance in blocks from previous I/O / maximum blocks)
	access_time_scale = weighted_scale_factor((double)blkdiff / (double)BLK_MAX(mp));
	if (__builtin_isnan(access_time_scale)) {
		return;
	}

	// most cases should pass in extents==1 for optimal delay calculation, otherwise just multiply delay by extents
	double temp = (((double)extents * (double)info->access_time_usec) * access_time_scale);
	if (temp <= 0) {
		delay_usec = 0;
	} else if (temp >= (double)(18446744073709549568ULL)) { /* highest 64-bit unsigned integer representable as a double */
		delay_usec = UINT64_MAX;
	} else {
		delay_usec = (uint64_t)temp;
	}

	// add a transfer-time delay derived from the configured throughput cap
	if (info->read_throughput_mbps && (bp->b_flags & B_READ)) {
		delay_usec += (uint64_t)(total_size / ((double)(info->read_throughput_mbps * 1024 * 1024 / 8) / USEC_PER_SEC));
	} else if (info->write_throughput_mbps && !(bp->b_flags & B_READ)) {
		delay_usec += (uint64_t)(total_size / ((double)(info->write_throughput_mbps * 1024 * 1024 / 8) / USEC_PER_SEC));
	}

	// try simulating disk spinup based on time since last I/O
	if (!info->is_ssd) {
		microuptime(&elapsed);
		timevalsub(&elapsed, &internal_info->last_io_timestamp);
		// avoid this delay right after boot (assuming last_io_timestamp is 0 and disk is already spinning)
		if (elapsed.tv_sec > DISK_IDLE_SEC && internal_info->last_io_timestamp.tv_sec != 0) {
			delay_usec += DISK_SPINUP_SEC * USEC_PER_SEC;
		}
	}

	// the I/O has already taken some real time; only sleep for the remainder
	if (delay_usec <= already_elapsed_usec) {
		microuptime(&internal_info->last_io_timestamp);
		return;
	}

	delay_usec -= already_elapsed_usec;

	while (delay_usec) {
		microuptime(&start);
		assert(delay_usec <= INT_MAX);
		delay((int)delay_usec);
		microuptime(&elapsed);
		timevalsub(&elapsed, &start);
		// subtract the time actually slept; stop once the full delay has elapsed
		if (elapsed.tv_sec * USEC_PER_SEC < delay_usec) {
			delay_usec -= elapsed.tv_sec * USEC_PER_SEC;
		} else {
			break;
		}
		if ((uint64_t)elapsed.tv_usec < delay_usec) {
			delay_usec -= elapsed.tv_usec;
		} else {
			break;
		}
	}

	microuptime(&internal_info->last_io_timestamp);
}
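/*
 * Worked example of the throughput term above: with
 * read_throughput_mbps == 100, the simulated device moves
 * 100 * 1024 * 1024 / 8 == 13,107,200 bytes/sec, i.e. ~13.1 bytes/usec,
 * so a 1 MiB (1,048,576-byte) read adds 1,048,576 / 13.1072 == 80,000 usec
 * (80 ms) of delay on top of the seek and spin-up terms.
 */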
int
disk_conditioner_get_info(mount_t mp, disk_conditioner_info *uinfo)
{
	struct _disk_conditioner_info_t *info;

	if (!mp) {
		return EINVAL;
	}

	info = mp->mnt_disk_conditioner_info;

	if (info) {
		memcpy(uinfo, &(info->dcinfo), sizeof(disk_conditioner_info));
	}

	return 0;
}
static void
disk_conditioner_restore_mount_fields(mount_t mp, struct saved_mount_fields *mnt_fields)
{
	mp->mnt_maxreadcnt = mnt_fields->mnt_maxreadcnt;
	mp->mnt_maxwritecnt = mnt_fields->mnt_maxwritecnt;
	mp->mnt_segreadcnt = mnt_fields->mnt_segreadcnt;
	mp->mnt_segwritecnt = mnt_fields->mnt_segwritecnt;
	mp->mnt_ioqueue_depth = mnt_fields->mnt_ioqueue_depth;
	mp->mnt_ioscale = mnt_fields->mnt_ioscale;
}
int
disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo)
{
	struct _disk_conditioner_info_t *internal_info;
	disk_conditioner_info *info;
	struct saved_mount_fields *mnt_fields;

	if (!kauth_cred_issuser(kauth_cred_get()) || !IOTaskHasEntitlement(current_task(), DISK_CONDITIONER_SET_ENTITLEMENT)) {
		return EPERM;
	}

	if (!mp) {
		return EINVAL;
	}

	internal_info = mp->mnt_disk_conditioner_info;
	if (!internal_info) {
		internal_info = kalloc(sizeof(struct _disk_conditioner_info_t));
		bzero(internal_info, sizeof(struct _disk_conditioner_info_t));
		mp->mnt_disk_conditioner_info = internal_info;
		mnt_fields = &(internal_info->mnt_fields);

		/* save mount_t fields for restoration later */
		mnt_fields->mnt_maxreadcnt = mp->mnt_maxreadcnt;
		mnt_fields->mnt_maxwritecnt = mp->mnt_maxwritecnt;
		mnt_fields->mnt_segreadcnt = mp->mnt_segreadcnt;
		mnt_fields->mnt_segwritecnt = mp->mnt_segwritecnt;
		mnt_fields->mnt_ioqueue_depth = mp->mnt_ioqueue_depth;
		mnt_fields->mnt_ioscale = mp->mnt_ioscale;
	}

	info = &(internal_info->dcinfo);
	mnt_fields = &(internal_info->mnt_fields);

	if (!uinfo->enabled && info->enabled) {
		/* disk conditioner is being disabled when already enabled */
		disk_conditioner_restore_mount_fields(mp, mnt_fields);
	}

	memcpy(info, uinfo, sizeof(disk_conditioner_info));

	/* scale back based on hardware advertised limits */
	if (uinfo->ioqueue_depth == 0 || uinfo->ioqueue_depth > mnt_fields->mnt_ioqueue_depth) {
		info->ioqueue_depth = mnt_fields->mnt_ioqueue_depth;
	}
	if (uinfo->maxreadcnt == 0 || uinfo->maxreadcnt > mnt_fields->mnt_maxreadcnt) {
		info->maxreadcnt = mnt_fields->mnt_maxreadcnt;
	}
	if (uinfo->maxwritecnt == 0 || uinfo->maxwritecnt > mnt_fields->mnt_maxwritecnt) {
		info->maxwritecnt = mnt_fields->mnt_maxwritecnt;
	}
	if (uinfo->segreadcnt == 0 || uinfo->segreadcnt > mnt_fields->mnt_segreadcnt) {
		info->segreadcnt = mnt_fields->mnt_segreadcnt;
	}
	if (uinfo->segwritecnt == 0 || uinfo->segwritecnt > mnt_fields->mnt_segwritecnt) {
		info->segwritecnt = mnt_fields->mnt_segwritecnt;
	}

	if (uinfo->enabled) {
		mp->mnt_maxreadcnt = info->maxreadcnt;
		mp->mnt_maxwritecnt = info->maxwritecnt;
		mp->mnt_segreadcnt = info->segreadcnt;
		mp->mnt_segwritecnt = info->segwritecnt;
		mp->mnt_ioqueue_depth = info->ioqueue_depth;
		mp->mnt_ioscale = MNT_IOSCALE(info->ioqueue_depth);
	}

	microuptime(&internal_info->last_io_timestamp);

	// make sure throttling picks up the new periods
	throttle_info_mount_reset_period(mp, info->is_ssd);

	return 0;
}
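/*
 * Sketch of a userspace caller (illustrative, not part of this file): the
 * conditioner is driven through fsctl(2) with a disk_conditioner_info
 * payload. The request selector name below is an assumption; the real
 * definition lives in <sys/fsctl.h>. The caller must be superuser and hold
 * the com.apple.private.dmc.set entitlement, per the checks above.
 *
 *	disk_conditioner_info dcinfo = { 0 };
 *	dcinfo.enabled = 1;
 *	dcinfo.is_ssd = 0;                  // emulate a spinning disk
 *	dcinfo.access_time_usec = 10000;    // ~10 ms full-stroke seek
 *	dcinfo.read_throughput_mbps = 100;  // cap simulated read bandwidth
 *	dcinfo.write_throughput_mbps = 100; // cap simulated write bandwidth
 *	fsctl("/Volumes/SlowDisk", DISK_CONDITIONER_IOC_SET, &dcinfo, 0);
 */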
void
disk_conditioner_unmount(mount_t mp)
{
	struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info;

	if (!internal_info) {
		return;
	}

	if (internal_info->dcinfo.enabled) {
		disk_conditioner_restore_mount_fields(mp, &(internal_info->mnt_fields));
	}

	mp->mnt_disk_conditioner_info = NULL;
	kfree(internal_info, sizeof(struct _disk_conditioner_info_t));
}
boolean_t
disk_conditioner_mount_is_ssd(mount_t mp)
{
	struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info;

	if (!internal_info || !internal_info->dcinfo.enabled) {
		// conditioner not active; fall back to the device's advertised SSD flag
		if (mp->mnt_kern_flag & MNTK_SSD) {
			return TRUE;
		}
		return FALSE;
	}

	return internal_info->dcinfo.is_ssd;
}