/*
 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/file_internal.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/malloc.h>
#include <sys/disk.h>
#include <sys/uio_internal.h>
#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>
#include <kern/assert.h>
#include <kern/task.h>

#include <sys/kdebug.h>

/* XXX the following prototypes should be in a header file somewhere */
extern dev_t	chrtoblk(dev_t dev);
extern int	iskmemdev(dev_t dev);
extern int	bpfkqfilter(dev_t dev, struct knote *kn);
extern int	ptsd_kqfilter(dev_t dev, struct knote *kn);

struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char	devopn[] = "devopn";
char	devio[] = "devio";
char	devwait[] = "devwait";
char	devin[] = "devin";
char	devout[] = "devout";
char	devioc[] = "devioc";
char	devcls[] = "devcls";

#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ &vnop_default_desc, (VOPFUNC)vn_default_error },
	{ &vnop_lookup_desc, (VOPFUNC)spec_lookup },		/* lookup */
	{ &vnop_create_desc, (VOPFUNC)err_create },		/* create */
	{ &vnop_mknod_desc, (VOPFUNC)err_mknod },		/* mknod */
	{ &vnop_open_desc, (VOPFUNC)spec_open },		/* open */
	{ &vnop_close_desc, (VOPFUNC)spec_close },		/* close */
	{ &vnop_access_desc, (VOPFUNC)spec_access },		/* access */
	{ &vnop_getattr_desc, (VOPFUNC)spec_getattr },		/* getattr */
	{ &vnop_setattr_desc, (VOPFUNC)spec_setattr },		/* setattr */
	{ &vnop_read_desc, (VOPFUNC)spec_read },		/* read */
	{ &vnop_write_desc, (VOPFUNC)spec_write },		/* write */
	{ &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },		/* ioctl */
	{ &vnop_select_desc, (VOPFUNC)spec_select },		/* select */
	{ &vnop_revoke_desc, (VOPFUNC)nop_revoke },		/* revoke */
	{ &vnop_mmap_desc, (VOPFUNC)err_mmap },			/* mmap */
	{ &vnop_fsync_desc, (VOPFUNC)spec_fsync },		/* fsync */
	{ &vnop_remove_desc, (VOPFUNC)err_remove },		/* remove */
	{ &vnop_link_desc, (VOPFUNC)err_link },			/* link */
	{ &vnop_rename_desc, (VOPFUNC)err_rename },		/* rename */
	{ &vnop_mkdir_desc, (VOPFUNC)err_mkdir },		/* mkdir */
	{ &vnop_rmdir_desc, (VOPFUNC)err_rmdir },		/* rmdir */
	{ &vnop_symlink_desc, (VOPFUNC)err_symlink },		/* symlink */
	{ &vnop_readdir_desc, (VOPFUNC)err_readdir },		/* readdir */
	{ &vnop_readlink_desc, (VOPFUNC)err_readlink },		/* readlink */
	{ &vnop_inactive_desc, (VOPFUNC)nop_inactive },		/* inactive */
	{ &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },		/* reclaim */
	{ &vnop_strategy_desc, (VOPFUNC)spec_strategy },	/* strategy */
	{ &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },	/* pathconf */
	{ &vnop_advlock_desc, (VOPFUNC)err_advlock },		/* advlock */
	{ &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },		/* bwrite */
	{ &vnop_pagein_desc, (VOPFUNC)err_pagein },		/* Pagein */
	{ &vnop_pageout_desc, (VOPFUNC)err_pageout },		/* Pageout */
	{ &vnop_copyfile_desc, (VOPFUNC)err_copyfile },		/* Copyfile */
	{ &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },	/* blktooff */
	{ &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },	/* offtoblk */
	{ &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },	/* blockmap */
	{ (struct vnodeop_desc*)NULL, (int(*)())NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
	{ &spec_vnodeop_p, spec_vnodeop_entries };


static void set_blocksize(vnode_t, dev_t);

/*
 * Trivial lookup routine that always fails.
 */
int
spec_lookup(struct vnop_lookup_args *ap)
{

	*ap->a_vpp = NULL;
	return (ENOTDIR);
}

static void
set_blocksize(struct vnode *vp, dev_t dev)
{
	int (*size)(dev_t);
	int rsize;

	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
		rsize = (*size)(dev);
		if (rsize <= 0)		/* did size fail? */
			vp->v_specsize = DEV_BSIZE;
		else
			vp->v_specsize = rsize;
	}
	else
		vp->v_specsize = DEV_BSIZE;
}

void
set_fsblocksize(struct vnode *vp)
{

	if (vp->v_type == VBLK) {
		dev_t dev = (dev_t)vp->v_rdev;
		int maj = major(dev);

		if ((u_int)maj >= (u_int)nblkdev)
			return;

		vnode_lock(vp);
		set_blocksize(vp, dev);
		vnode_unlock(vp);
	}

}

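/*
 * For illustration (hypothetical driver, not part of this file): a block
 * driver can advertise its block size through the optional d_psize entry
 * point consulted by set_blocksize() above, e.g.
 *
 *	int mydisk_psize(dev_t dev)
 *	{
 *		return 4096;	// media uses 4K blocks
 *	}
 *
 * Returning a value <= 0, or leaving d_psize NULL, makes set_blocksize()
 * fall back to DEV_BSIZE.
 */
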
/*
 * Open a special file.
 */
int
spec_open(struct vnop_open_args *ap)
{
	struct proc *p = vfs_context_proc(ap->a_context);
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	struct vnode *vp = ap->a_vp;
	dev_t bdev, dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;

	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
		return (ENXIO);

	switch (vp->v_type) {

	case VCHR:
		if ((u_int)maj >= (u_int)nchrdev)
			return (ENXIO);
		if (cred != FSCRED && (ap->a_mode & FWRITE)) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, VCHR))
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens
			 * for writing of /dev/mem, /dev/kmem, or character
			 * devices whose corresponding block devices are
			 * currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
					return (error);
				if (iskmemdev(dev))
					return (EPERM);
			}
		}
		if (cdevsw[maj].d_type == D_TTY) {
			vnode_lock(vp);
			vp->v_flag |= VISTTY;
			vnode_unlock(vp);
		}

		devsw_lock(dev, S_IFCHR);
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);

		if (error == 0) {
			vp->v_specinfo->si_opencount++;
		}

		devsw_unlock(dev, S_IFCHR);
		return (error);

	case VBLK:
		if ((u_int)maj >= (u_int)nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && cred != FSCRED &&
		    (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ( (error = vfs_mountedon(vp)) )
			return (error);

		devsw_lock(dev, S_IFBLK);
		error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
		if (!error) {
			vp->v_specinfo->si_opencount++;
		}
		devsw_unlock(dev, S_IFBLK);

		if (!error) {
			u_int64_t blkcnt;
			u_int32_t blksize;
			int setsize = 0;
			u_int32_t size512 = 512;


			if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
				/* Switch to 512 byte sectors (temporarily) */

				if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
					/* Get the number of 512 byte physical blocks. */
					if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
						setsize = 1;
					}
				}
				/* If it doesn't set back, we can't recover */
				if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
					error = ENXIO;
			}


			vnode_lock(vp);
			set_blocksize(vp, dev);

			/*
			 * Cache the size in bytes of the block device for later
			 * use by spec_write().
			 */
			if (setsize)
				vp->v_specdevsize = blkcnt * (u_int64_t)size512;
			else
				vp->v_specdevsize = (u_int64_t)0;	/* Default: Can't get */

			vnode_unlock(vp);

		}
		return (error);
	default:
		panic("spec_open type");
	}
	return (0);
}
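
/*
 * Worked example of the probe above (hypothetical device): if the disk
 * reports DKIOCGETBLOCKSIZE = 4096 and, after the temporary switch to
 * 512-byte sectors, DKIOCGETBLOCKCOUNT = 2048, then spec_open() caches
 * v_specdevsize = 2048 * 512 = 1048576 bytes; spec_write() later consults
 * this to avoid treating a transfer past the end of the device as a whole
 * block.
 */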

/*
 * Vnode op for read
 */
int
spec_read(struct vnop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn, nextbn;
	long bsize, bscale;
	int devBlockSize = 0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_read proc");
#endif
	if (uio_resid(uio) == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_read)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);

		dev = vp->v_rdev;

		devBlockSize = vp->v_specsize;

		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		bsize = bscale * devBlockSize;

		do {
			on = uio->uio_offset % bsize;

			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));

			if (vp->v_speclastr + bscale == bn) {
				nextbn = bn + bscale;
				error = buf_breadn(vp, bn, (int)bsize, &nextbn,
						   (int *)&bsize, 1, NOCRED, &bp);
			} else
				error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

			vnode_lock(vp);
			vp->v_speclastr = bn;
			vnode_unlock(vp);

			n = bsize - buf_resid(bp);
			if ((on > n) || error) {
				if (!error)
					error = EINVAL;
				buf_brelse(bp);
				return (error);
			}
			n = min((unsigned)(n - on), uio_resid(uio));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (n + on == bsize)
				buf_markaged(bp);
			buf_brelse(bp);
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */

	return (0);
}
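
/*
 * Worked example of the block math above (assuming PAGE_SIZE = 4096 and a
 * 512-byte device block size): bscale = 8 and bsize = 4096, so a read at
 * uio_offset 6144 computes on = 6144 % 4096 = 2048 and
 * bn = (6144 / 512) & ~7 = 8, i.e. the page-aligned run of device blocks
 * that contains the requested offset.
 */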

/*
 * Vnode op for write
 */
int
spec_write(struct vnop_write_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn;
	int bsize, blkmask, bscale;
	int io_sync;
	int devBlockSize = 0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_write proc");
#endif

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio_resid(uio) == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);

		io_sync = (ap->a_ioflag & IO_SYNC);

		dev = (vp->v_rdev);

		devBlockSize = vp->v_specsize;
		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		blkmask = bscale - 1;
		bsize = bscale * devBlockSize;


		do {
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
			on = uio->uio_offset % bsize;

			n = min((unsigned)(bsize - on), uio_resid(uio));

			/*
			 * Use buf_getblk() as an optimization IFF:
			 *
			 * 1)	We are reading exactly a block on a block
			 *	aligned boundary
			 * 2)	We know the size of the device from spec_open
			 * 3)	The read doesn't span the end of the device
			 *
			 * Otherwise, we fall back on buf_bread().
			 */
			if (n == bsize &&
			    vp->v_specdevsize != (u_int64_t)0 &&
			    (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
				/* reduce the size of the read to what is there */
				n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
			}

			if (n == bsize)
				bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
			else
				error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

			/* Translate downstream error for upstream, if needed */
			if (!error)
				error = (int)buf_error(bp);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			n = min(n, bsize - buf_resid(bp));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			buf_markaged(bp);

			if (io_sync)
				error = buf_bwrite(bp);
			else {
				if ((n + on) == bsize)
					error = buf_bawrite(bp);
				else
					error = buf_bdwrite(bp);
			}
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */

	return (0);
}
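
/*
 * Note on the completion choice above: IO_SYNC forces a synchronous
 * buf_bwrite(); otherwise a fully filled block is started asynchronously
 * with buf_bawrite(), while a partially filled block is delayed with
 * buf_bdwrite() on the expectation that the rest of the block may be
 * written shortly.
 */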

/*
 * Device ioctl operation.
 */
int
spec_ioctl(struct vnop_ioctl_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev = ap->a_vp->v_rdev;
	int retval = 0;

	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
			      (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);

	switch (ap->a_vp->v_type) {

	case VCHR:
		retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
						       ap->a_fflag, p);
		break;

	case VBLK:
		retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
						       ap->a_fflag, p);
		break;

	default:
		panic("spec_ioctl");
		/* NOTREACHED */
	}
	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
			      (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);

	return (retval);
}

int
spec_select(struct vnop_select_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev;

	switch (ap->a_vp->v_type) {

	default:
		return (1);		/* XXX */

	case VCHR:
		dev = ap->a_vp->v_rdev;
		return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
	}
}

static int filt_specattach(struct knote *kn);

int
spec_kqfilter(vnode_t vp, struct knote *kn)
{
	dev_t dev;
	int err = EINVAL;

	/*
	 * For a few special kinds of devices, we can attach knotes.
	 * Each filter function must check whether the dev type matches it.
	 */
	dev = vnode_specrdev(vp);

	if (vnode_istty(vp)) {
		/* We can hook into TTYs... */
		err = filt_specattach(kn);
	} else {
		/* Try a bpf device, as defined in bsd/net/bpf.c */
		err = bpfkqfilter(dev, kn);
	}

	return err;
}

/*
 * Synch buffers associated with a block device
 */
int
spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
{
	if (vp->v_type == VCHR)
		return (0);
	/*
	 * Flush all dirty buffers associated with a block device.
	 */
	buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");

	return (0);
}

int
spec_fsync(struct vnop_fsync_args *ap)
{
	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}

/*
 * Just call the device strategy routine
 */
extern int hard_throttle_on_root;
void IOSleep(int);

// the low priority process may wait for at most LOWPRI_MAX_DELAY milliseconds
#define LOWPRI_INITIAL_WINDOW_MSECS	100
#define LOWPRI_WINDOW_MSECS_INC		50
#define LOWPRI_MAX_WINDOW_MSECS		200
#define LOWPRI_MAX_WAITING_MSECS	200

#if CONFIG_EMBEDDED
#define LOWPRI_SLEEP_INTERVAL		5
#else
#define LOWPRI_SLEEP_INTERVAL		2
#endif

struct _throttle_io_info_t {
	struct timeval	last_normal_IO_timestamp;
	struct timeval	last_IO_timestamp;
	SInt32		numthreads_throttling;
	SInt32		refcnt;
	SInt32		alloc;
};

struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
int	lowpri_IO_initial_window_msecs	= LOWPRI_INITIAL_WINDOW_MSECS;
int	lowpri_IO_window_msecs_inc	= LOWPRI_WINDOW_MSECS_INC;
int	lowpri_max_window_msecs		= LOWPRI_MAX_WINDOW_MSECS;
int	lowpri_max_waiting_msecs	= LOWPRI_MAX_WAITING_MSECS;

#if 0
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)		\
	do {								\
		if ((debug_info)->alloc)				\
			printf("%s: "format, __FUNCTION__, ## args);	\
	} while (0)

#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif

SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");

/*
 * throttled I/O helper function
 * convert the index of the lowest set bit to a device index
 */
int
num_trailing_0(uint64_t n)
{
	/*
	 * Since in most cases the number of trailing 0s is very small,
	 * we simply count sequentially from the lowest bit.
	 */
	if (n == 0)
		return sizeof(n) * 8;
	int count = 0;
	while (!ISSET(n, 1)) {
		n >>= 1;
		++count;
	}
	return count;
}
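
/*
 * Example: a throttle_mask of 0x8 (only bit 3 set) yields
 * num_trailing_0(0x8) == 3 and so selects _throttle_io_info[3];
 * num_trailing_0(0) returns 64, i.e. "no bit set".
 */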

/*
 * Release the reference and, if the item was allocated and this is the last
 * reference, free it.
 *
 * This routine always returns the old value.
 */
static int
throttle_info_rel(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSDecrementAtomic(&info->refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
		info, (int)(oldValue -1), info );

	/* The reference count just went negative, very bad */
	if (oldValue == 0)
		panic("throttle info ref cnt went negative!");

	/*
	 * Once the reference count is zero, no one else should be able to
	 * take a reference.
	 */
	if ((info->refcnt == 0) && (info->alloc)) {
		DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info );
		FREE(info, M_TEMP);
	}
	return oldValue;
}

/*
 * Just take a reference on the throttle info structure.
 *
 * This routine always returns the old value.
 */
static SInt32
throttle_info_ref(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSIncrementAtomic(&info->refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
		info, (int)(oldValue -1), info );
	/* Allocated items should never have a reference of zero */
	if (info->alloc && (oldValue == 0))
		panic("Taking a reference without calling create throttle info!\n");

	return oldValue;
}

/*
 * KPI routine
 *
 * Create and take a reference on a throttle info structure and return a
 * pointer for the file system to use when calling throttle_info_update.
 * The calling file system must have a matching release for every create.
 */
void *
throttle_info_create(void)
{
	struct _throttle_io_info_t *info;

	MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
	/* Should never happen but just in case */
	if (info == NULL)
		return NULL;
	/* Mark that this one was allocated and needs to be freed */
	DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
	info->alloc = TRUE;
	/* Take a reference */
	OSIncrementAtomic(&info->refcnt);
	return info;
}

/*
 * KPI routine
 *
 * Release the throttle info pointer if all the references are gone. Should
 * be called to release the reference taken by throttle_info_create.
 */
void
throttle_info_release(void *throttle_info)
{
	DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
		(struct _throttle_io_info_t *)throttle_info,
		(struct _throttle_io_info_t *)throttle_info);
	if (throttle_info) /* Just to be careful */
		throttle_info_rel(throttle_info);
}

/*
 * KPI routine
 *
 * File systems that create an info structure need to call this routine in
 * their mount routine (used by the cluster code). File systems that call
 * this in their mount routines must call throttle_info_mount_rel in their
 * unmount routines.
 */
void
throttle_info_mount_ref(mount_t mp, void *throttle_info)
{
	if ((throttle_info == NULL) || (mp == NULL))
		return;
	throttle_info_ref(throttle_info);
	/* We already have a reference; release it before adding the new one */
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = throttle_info;
}

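/*
 * Typical life cycle of the KPI above, as a sketch (hypothetical file
 * system; error handling omitted):
 *
 *	void *ti = throttle_info_create();	// in the mount path
 *	throttle_info_mount_ref(mp, ti);
 *	...
 *	throttle_info_update(ti, 0);		// before each I/O
 *	...
 *	throttle_info_mount_rel(mp);		// in the unmount path
 *	throttle_info_release(ti);		// matches the create
 */
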
/*
 * Private KPI routine
 *
 * Return a handle for accessing throttle_info given a throttle_mask. The
 * handle must be released by throttle_info_rel_by_mask.
 */
int
throttle_info_ref_by_mask(uint64_t throttle_mask,
			  throttle_info_handle_t *throttle_info_handle)
{
	int dev_index;
	struct _throttle_io_info_t *info;

	if (throttle_info_handle == NULL)
		return EINVAL;

	dev_index = num_trailing_0(throttle_mask);
	info = &_throttle_io_info[dev_index];
	throttle_info_ref(info);
	*(struct _throttle_io_info_t**)throttle_info_handle = info;
	return 0;
}

/*
 * Private KPI routine
 *
 * Release the handle obtained by throttle_info_ref_by_mask.
 */
void
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
{
	/* for now the handle is just a pointer to _throttle_io_info_t */
	throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
}

/*
 * KPI routine
 *
 * File systems that call throttle_info_mount_ref must call this routine in
 * their unmount routine.
 */
void
throttle_info_mount_rel(mount_t mp)
{
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = NULL;
}

void
throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	*tv = info->last_IO_timestamp;
}

void
update_last_io_time(mount_t mp)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	microuptime(&info->last_IO_timestamp);
}


#if CONFIG_EMBEDDED

int throttle_get_io_policy(struct uthread **ut)
{
	int policy = IOPOL_DEFAULT;
	proc_t p = current_proc();

	*ut = get_bsdthread_info(current_thread());

	if (p != NULL)
		policy = p->p_iopol_disk;

	if (*ut != NULL) {
		// the I/O policy of the thread overrides that of the process
		// unless the I/O policy of the thread is default
		if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
			policy = (*ut)->uu_iopol_disk;
	}
	return policy;
}
#else

int throttle_get_io_policy(__unused struct uthread **ut)
{
	*ut = get_bsdthread_info(current_thread());

	return (proc_get_task_selfdiskacc());
}
#endif


static int
throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void *throttle_info)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct timeval elapsed;
	int elapsed_msecs;
	int policy;
	struct uthread *ut;

	policy = throttle_get_io_policy(&ut);

	if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE)
		return (0);

	microuptime(&elapsed);
	timevalsub(&elapsed, &info->last_normal_IO_timestamp);
	elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;

	if (lowpri_window_msecs == -1)	// use the max waiting time
		lowpri_window_msecs = lowpri_max_waiting_msecs;

	return elapsed_msecs < lowpri_window_msecs;
}

/*
 * If we have a mount point and it has a throttle info pointer then
 * use it to do the check, otherwise use the device unit number to find
 * the correct throttle info array element.
 */
int
throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp)
{
	void *info;

	/* Should we just return zero if there is no mount point? */
	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;
	return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info);
}

uint32_t
throttle_lowpri_io(int sleep_amount)
{
	int sleep_cnt = 0;
	int numthreads_throttling;
	int max_try_num;
	struct uthread *ut;
	struct _throttle_io_info_t *info;
	int max_waiting_msecs;

	ut = get_bsdthread_info(current_thread());

	if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL))
		goto done;

	info = ut->uu_throttle_info;

	if (sleep_amount != 0) {
#if CONFIG_EMBEDDED
		max_waiting_msecs = lowpri_max_waiting_msecs;
#else
		if (ut->uu_throttle_isssd == TRUE)
			max_waiting_msecs = lowpri_max_waiting_msecs / 100;
		else
			max_waiting_msecs = lowpri_max_waiting_msecs;
#endif
		if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL)
			max_waiting_msecs = LOWPRI_SLEEP_INTERVAL;

		numthreads_throttling = info->numthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1;
		max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling);

		for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) {
			if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) {
				if (sleep_cnt == 0) {
					KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
							      ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0);
				}
				IOSleep(LOWPRI_SLEEP_INTERVAL);
				DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info );
			} else {
				break;
			}
		}
		if (sleep_cnt) {
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
					      ut->uu_lowpri_window, sleep_cnt, 0, 0, 0);
		}
	}
	SInt32 oldValue;
	oldValue = OSDecrementAtomic(&info->numthreads_throttling);

	if (oldValue <= 0) {
		panic("%s: numthreads negative", __func__);
	}
done:
	ut->uu_lowpri_window = 0;
	if (ut->uu_throttle_info)
		throttle_info_rel(ut->uu_throttle_info);
	ut->uu_throttle_info = NULL;
	ut->uu_throttle_bc = FALSE;

	return (sleep_cnt * LOWPRI_SLEEP_INTERVAL);
}

/*
 * KPI routine
 *
 * Set a kernel thread's I/O policy. policy can be:
 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
 *
 * Explanations of these policies are in the man page of setiopolicy_np.
 */
void throttle_set_thread_io_policy(int policy)
{
#if !CONFIG_EMBEDDED
	proc_apply_thread_selfdiskacc(policy);
#else /* !CONFIG_EMBEDDED */
	struct uthread *ut;
	ut = get_bsdthread_info(current_thread());
	ut->uu_iopol_disk = policy;
#endif /* !CONFIG_EMBEDDED */
}


static
void throttle_info_reset_window(struct uthread *ut)
{
	struct _throttle_io_info_t *info;

	info = ut->uu_throttle_info;

	OSDecrementAtomic(&info->numthreads_throttling);
	throttle_info_rel(info);
	ut->uu_throttle_info = NULL;
	ut->uu_lowpri_window = 0;
}

static
void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle)
{
	SInt32 oldValue;

	ut->uu_throttle_info = info;
	throttle_info_ref(info);
	DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );

	oldValue = OSIncrementAtomic(&info->numthreads_throttling);
	if (oldValue < 0) {
		panic("%s: numthreads negative", __func__);
	}
	ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
	ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
	ut->uu_throttle_isssd = isssd;
	ut->uu_throttle_bc = BC_throttle;
}
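
/*
 * With the default tunables (100 ms initial window, 50 ms increment), the
 * window computed above grows with contention: the first thread throttled
 * against a device starts with a 100 ms window, the second with 150 ms,
 * the third with 200 ms, and so on; throttle_info_update_internal() later
 * caps the window at lowpri_max_window_msecs per throttled thread.
 */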

static
void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct uthread *ut;
	int policy;
	int is_throttleable_io = 0;
	int is_passive_io = 0;

	if (!lowpri_IO_initial_window_msecs || (info == NULL))
		return;
	policy = throttle_get_io_policy(&ut);

	switch (policy) {
	case IOPOL_DEFAULT:
	case IOPOL_NORMAL:
		break;
	case IOPOL_THROTTLE:
		is_throttleable_io = 1;
		break;
	case IOPOL_PASSIVE:
		is_passive_io = 1;
		break;
	default:
		printf("unknown I/O policy %d", policy);
		break;
	}

	if (!is_throttleable_io && ISSET(flags, B_PASSIVE))
		is_passive_io |= 1;

	if (!is_throttleable_io) {
		if (!is_passive_io) {
			microuptime(&info->last_normal_IO_timestamp);
		}
	} else if (ut) {
		/*
		 * I'd really like to do the IOSleep here, but
		 * we may be holding all kinds of filesystem related locks
		 * and the pages for this I/O marked 'busy'...
		 * we don't want to cause a normal task to block on
		 * one of these locks while we're throttling a task marked
		 * for low priority I/O... we'll mark the uthread and
		 * do the delay just before we return from the system
		 * call that triggered this I/O or from vnode_pagein
		 */
		if (ut->uu_lowpri_window == 0)
			throttle_info_set_initial_window(ut, info, isssd, FALSE);
		else {
			/* The thread sends I/Os to different devices within the same system call */
			if (ut->uu_throttle_info != info) {
				struct _throttle_io_info_t *old_info = ut->uu_throttle_info;

				// keep track of the numthreads in the right device
				OSDecrementAtomic(&old_info->numthreads_throttling);
				OSIncrementAtomic(&info->numthreads_throttling);

				DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info );
				DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info );
				/* This thread no longer needs a reference on that throttle info */
				throttle_info_rel(ut->uu_throttle_info);
				ut->uu_throttle_info = info;
				/* Need to take a reference on this throttle info */
				throttle_info_ref(ut->uu_throttle_info);
			}
			int numthreads = MAX(1, info->numthreads_throttling);
			ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
			if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
				ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;

			if (isssd == FALSE) {
				/*
				 * we're here because we've actually issued I/Os to different devices...
				 * if at least one of them was a non SSD, then throttle the thread
				 * using the policy for non SSDs
				 */
				ut->uu_throttle_isssd = FALSE;
			}
		}
	}
}

/*
 * KPI routine
 *
 * This is usually called before every I/O, used for throttled I/O
 * bookkeeping. This routine has low overhead and does not sleep.
 */
void throttle_info_update(void *throttle_info, int flags)
{
	throttle_info_update_internal(throttle_info, flags, FALSE);
}

/*
 * KPI routine
 *
 * This is usually called before every I/O, used for throttled I/O
 * bookkeeping. This routine has low overhead and does not sleep.
 */
void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
	void *throttle_info = throttle_info_handle;
	/*
	 * For now we only use the lowest bit of the throttle mask, so the
	 * handle is the same as the throttle_info. Later, if we store a
	 * set of throttle infos in the handle, we will want to loop through
	 * them and call throttle_info_update in a loop.
	 */
	throttle_info_update(throttle_info, flags);
}

extern int ignore_is_ssd;

int
spec_strategy(struct vnop_strategy_args *ap)
{
	buf_t	bp;
	int	bflags;
	int	policy;
	dev_t	bdev;
	uthread_t ut;
	mount_t	mp;
	int	strategy_ret;
	struct _throttle_io_info_t *throttle_info;
	boolean_t isssd = FALSE;

	bp = ap->a_bp;
	bdev = buf_device(bp);
	mp = buf_vnode(bp)->v_mount;

	policy = throttle_get_io_policy(&ut);

	if (policy == IOPOL_THROTTLE) {
		bp->b_flags |= B_THROTTLED_IO;
		bp->b_flags &= ~B_PASSIVE;
	} else if (policy == IOPOL_PASSIVE)
		bp->b_flags |= B_PASSIVE;

	bflags = bp->b_flags;

	if (kdebug_enable) {
		int code = 0;

		if (bflags & B_READ)
			code |= DKIO_READ;
		if (bflags & B_ASYNC)
			code |= DKIO_ASYNC;

		if (bflags & B_META)
			code |= DKIO_META;
		else if (bflags & B_PAGEIO)
			code |= DKIO_PAGING;

		if (bflags & B_THROTTLED_IO)
			code |= DKIO_THROTTLE;
		else if (bflags & B_PASSIVE)
			code |= DKIO_PASSIVE;

		KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
				      bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
	}
	if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
	    mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
		hard_throttle_on_root = 1;

	if (mp != NULL) {
		if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
			isssd = TRUE;
		throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
	} else
		throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

	throttle_info_update_internal(throttle_info, bflags, isssd);

	if ((bflags & B_READ) == 0) {
		microuptime(&throttle_info->last_IO_timestamp);
		if (mp) {
			INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
		}
	} else if (mp) {
		INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
	}
	/*
	 * The BootCache may give us special information about
	 * the IO, so it returns special values that we check
	 * for here.
	 *
	 * IO_SATISFIED_BY_CACHE
	 * The read has been satisfied by the boot cache. Don't
	 * throttle the thread unnecessarily.
	 *
	 * IO_SHOULD_BE_THROTTLED
	 * The boot cache is playing back a playlist and this IO
	 * cut through. Throttle it so we're not cutting through
	 * the boot cache too often.
	 *
	 * Note that typical strategy routines are defined with
	 * a void return so we'll get garbage here. In the
	 * unlikely case the garbage matches our special return
	 * value, it's not a big deal since we're only adjusting
	 * the throttling delay.
	 */
#define IO_SATISFIED_BY_CACHE	((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED	((int)0xcafebeef)
	typedef	int strategy_fcn_ret_t(struct buf *bp);

	strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);

	if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) {
		/*
		 * If this was a throttled IO satisfied by the boot cache,
		 * don't delay the thread.
		 */
		throttle_info_reset_window(ut);

	} else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) {
		/*
		 * If the boot cache indicates this IO should be throttled,
		 * delay the thread.
		 */
		throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE);
	}
	return (0);
}
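
/*
 * Decoding the kdebug trace point emitted above: the code is a bitwise OR
 * of DKIO_* flags, so, for example, an asynchronous throttled page-in read
 * is traced with code DKIO_READ | DKIO_ASYNC | DKIO_PAGING | DKIO_THROTTLE.
 */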

/*
 * Block mapping is not supported for special files; this op always fails.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
	return (ENOTSUP);
}


/*
 * Device close routine
 */
int
spec_close(struct vnop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int error = 0;
	int flags = ap->a_fflag;
	struct proc *p = vfs_context_proc(ap->a_context);
	struct session *sessp;
	int do_rele = 0;

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.  We
		 * cannot easily tell that a character device is a
		 * controlling terminal, unless it is the closing
		 * process' controlling terminal.  In that case, if
		 * the reference count is 1 (this is the very last
		 * close), we clear the session's tty references and
		 * drop the vnode reference held on its behalf.
		 */
		sessp = proc_session(p);
		if (sessp != SESSION_NULL) {
			if ((vcount(vp) == 1) &&
			    (vp == sessp->s_ttyvp)) {

				session_lock(sessp);
				if (vp == sessp->s_ttyvp) {
					sessp->s_ttyvp = NULL;
					sessp->s_ttyvid = 0;
					sessp->s_ttyp = TTY_NULL;
					sessp->s_ttypgrpid = NO_PID;
					do_rele = 1;
				}
				session_unlock(sessp);

				if (do_rele) {
					vnode_rele(vp);
				}
			}
			session_rele(sessp);
		}

		devsw_lock(dev, S_IFCHR);

		vp->v_specinfo->si_opencount--;

		if (vp->v_specinfo->si_opencount < 0) {
			panic("Negative open count?");
		}
		/*
		 * close on last reference or on vnode revoke call
		 */
		if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) {
			devsw_unlock(dev, S_IFCHR);
			return (0);
		}

		error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);

		devsw_unlock(dev, S_IFCHR);
		break;

	case VBLK:
		/*
		 * If there is more than one outstanding open, don't
		 * send the close to the device.
		 */
		devsw_lock(dev, S_IFBLK);
		if (vcount(vp) > 1) {
			vp->v_specinfo->si_opencount--;
			devsw_unlock(dev, S_IFBLK);
			return (0);
		}
		devsw_unlock(dev, S_IFBLK);

		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
			return (error);

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error)
			return (error);

		devsw_lock(dev, S_IFBLK);

		vp->v_specinfo->si_opencount--;

		if (vp->v_specinfo->si_opencount < 0) {
			panic("Negative open count?");
		}

		if (vcount(vp) > 0) {
			devsw_unlock(dev, S_IFBLK);
			return (0);
		}

		error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);

		devsw_unlock(dev, S_IFBLK);
		break;

	default:
		panic("spec_close: not special");
		return (EBADF);
	}

	return error;
}

/*
 * Return POSIX pathconf information applicable to special devices.
 */
int
spec_pathconf(struct vnop_pathconf_args *ap)
{

	switch (ap->a_name) {
	case _PC_LINK_MAX:
		*ap->a_retval = LINK_MAX;
		return (0);
	case _PC_MAX_CANON:
		*ap->a_retval = MAX_CANON;
		return (0);
	case _PC_MAX_INPUT:
		*ap->a_retval = MAX_INPUT;
		return (0);
	case _PC_PIPE_BUF:
		*ap->a_retval = PIPE_BUF;
		return (0);
	case _PC_CHOWN_RESTRICTED:
		*ap->a_retval = 200112;		/* _POSIX_CHOWN_RESTRICTED */
		return (0);
	case _PC_VDISABLE:
		*ap->a_retval = _POSIX_VDISABLE;
		return (0);
	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}
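
/*
 * For example, a pathconf(2)/fpathconf(2) call naming _PC_PIPE_BUF on a
 * device node is answered out of the table above with PIPE_BUF; names the
 * switch does not list fail with EINVAL.
 */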

/*
 * Special device failed operation
 */
int
spec_ebadf(__unused void *dummy)
{

	return (EBADF);
}

1475 | ||
1c79356b A |
1476 | /* Blktooff derives file offset from logical block number */ |
1477 | int | |
2d21ac55 | 1478 | spec_blktooff(struct vnop_blktooff_args *ap) |
1c79356b | 1479 | { |
2d21ac55 | 1480 | struct vnode *vp = ap->a_vp; |
1c79356b A |
1481 | |
1482 | switch (vp->v_type) { | |
1483 | case VCHR: | |
1484 | *ap->a_offset = (off_t)-1; /* failure */ | |
91447636 | 1485 | return (ENOTSUP); |
1c79356b A |
1486 | |
1487 | case VBLK: | |
1488 | printf("spec_blktooff: not implemented for VBLK\n"); | |
1489 | *ap->a_offset = (off_t)-1; /* failure */ | |
91447636 | 1490 | return (ENOTSUP); |
1c79356b A |
1491 | |
1492 | default: | |
1493 | panic("spec_blktooff type"); | |
1494 | } | |
1495 | /* NOTREACHED */ | |
91447636 A |
1496 | |
1497 | return (0); | |
1c79356b A |
1498 | } |
1499 | ||
1500 | /* Offtoblk derives logical block number from file offset */ | |
1501 | int | |
2d21ac55 | 1502 | spec_offtoblk(struct vnop_offtoblk_args *ap) |
1c79356b | 1503 | { |
2d21ac55 | 1504 | struct vnode *vp = ap->a_vp; |
1c79356b A |
1505 | |
1506 | switch (vp->v_type) { | |
1507 | case VCHR: | |
91447636 A |
1508 | *ap->a_lblkno = (daddr64_t)-1; /* failure */ |
1509 | return (ENOTSUP); | |
1c79356b A |
1510 | |
1511 | case VBLK: | |
1512 | printf("spec_offtoblk: not implemented for VBLK\n"); | |
91447636 A |
1513 | *ap->a_lblkno = (daddr64_t)-1; /* failure */ |
1514 | return (ENOTSUP); | |
1c79356b A |
1515 | |
1516 | default: | |
1517 | panic("spec_offtoblk type"); | |
1518 | } | |
1519 | /* NOTREACHED */ | |
91447636 A |
1520 | |
1521 | return (0); | |
1c79356b | 1522 | } |

static void filt_specdetach(struct knote *kn);
static int filt_spec(struct knote *kn, long hint);
static unsigned filt_specpeek(struct knote *kn);

struct filterops spec_filtops = {
	.f_isfd		= 1,
	.f_attach	= filt_specattach,
	.f_detach	= filt_specdetach,
	.f_event	= filt_spec,
	.f_peek		= filt_specpeek
};

static int
filter_to_seltype(int16_t filter)
{
	switch (filter) {
	case EVFILT_READ:
		return FREAD;
	case EVFILT_WRITE:
		return FWRITE;
	default:
		panic("filt_to_seltype(): invalid filter %d\n", filter);
		return 0;
	}
}

static int
filt_specattach(struct knote *kn)
{
	vnode_t vp;
	dev_t dev;

	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;	/* Already have iocount, and vnode is alive */

	assert(vnode_ischr(vp));

	dev = vnode_specrdev(vp);

	if (major(dev) > nchrdev) {
		return ENXIO;
	}

	if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
		return EINVAL;
	}

	/* Resulting wql is safe to unlink even if it has never been linked */
	kn->kn_hook = wait_queue_link_allocate();
	if (kn->kn_hook == NULL) {
		return EAGAIN;
	}

	kn->kn_fop = &spec_filtops;
	kn->kn_hookid = vnode_vid(vp);

	knote_markstayqueued(kn);

	return 0;
}

static void
filt_specdetach(struct knote *kn)
{
	kern_return_t ret;

	/*
	 * Given wait queue link and wait queue set, unlink.  This is subtle.
	 * If the device has been revoked from under us, selclearthread() will
	 * have removed our link from the kqueue's wait queue set, which
	 * wait_queue_set_unlink_one() will detect and handle.
	 */
	ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
	if (ret != KERN_SUCCESS) {
		panic("filt_specdetach(): failed to unlink wait queue link.");
	}

	(void)wait_queue_link_free(kn->kn_hook);
	kn->kn_hook = NULL;
	kn->kn_status &= ~KN_STAYQUEUED;
}

static int
filt_spec(struct knote *kn, long hint)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int selres;
	int error;
	int use_offset;
	dev_t dev;
	uint64_t flags;

	assert(kn->kn_hook != NULL);

	if (hint != 0) {
		panic("filt_spec(): nonzero hint?");
	}

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		return 1;
	}

	dev = vnode_specrdev(vp);
	flags = cdevsw_flags[major(dev)];
	use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
	assert((flags & CDEVSW_SELECT_KQUEUE) != 0);

	/* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;

	if (use_offset) {
		if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
			kn->kn_data = 0;
		} else {
			kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
		}
	} else {
		kn->kn_data = selres;
	}

	vnode_put(vp);

	return (kn->kn_data != 0);
}

static unsigned
filt_specpeek(struct knote *kn)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int error, selres;

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		return 1;	/* Just like VNOP_SELECT() on recycled vnode */
	}

	/*
	 * Why pass the link here?  Because we may not have registered in the past...
	 */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;

	vnode_put(vp);
	return selres;
}
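
/*
 * Userland view of the machinery above (hypothetical sketch; error
 * handling omitted): a kevent() registration on a character device is
 * routed through spec_kqfilter() and, for a tty, filt_specattach():
 *
 *	int fd = open("/dev/ptmx", O_RDWR);
 *	int kq = kqueue();
 *	struct kevent ev;
 *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	// arms the knote
 */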