]> git.saurik.com Git - apple/xnu.git/blob - bsd/dev/vn/vn.c
xnu-792.21.3.tar.gz
[apple/xnu.git] / bsd / dev / vn / vn.c
1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (c) 1988 University of Utah.
31 * Copyright (c) 1990, 1993
32 * The Regents of the University of California. All rights reserved.
33 *
34 * This code is derived from software contributed to Berkeley by
35 * the Systems Programming Group of the University of Utah Computer
36 * Science Department.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * from: Utah Hdr: vn.c 1.13 94/04/02
67 *
68 * from: @(#)vn.c 8.6 (Berkeley) 4/1/94
69 * $FreeBSD: src/sys/dev/vn/vn.c,v 1.105.2.4 2001/11/18 07:11:00 dillon Exp $
70 */
71
72 /*
73 * Vnode disk driver.
74 *
75 * Block/character interface to a vnode. Allows one to treat a file
76 * as a disk (e.g. build a filesystem in it, mount it, etc.).
77 *
78 * NOTE 1: This uses the vnop_blockmap/vnop_strategy interface to the vnode
79 * instead of a simple VOP_RDWR. We do this to avoid distorting the
80 * local buffer cache.
81 *
82 * NOTE 2: There is a security issue involved with this driver.
83 * Once mounted all access to the contents of the "mapped" file via
84 * the special file is controlled by the permissions on the special
85 * file, the protection of the mapped file is ignored (effectively,
86 * by using root credentials in all transactions).
87 *
88 * NOTE 3: Doesn't interact with leases, should it?
89 */
90
91 #include "vndevice.h"
92
93 #if NVNDEVICE > 0
94
95 #include <sys/param.h>
96 #include <sys/systm.h>
97 #include <sys/kernel.h>
98 #include <sys/mount.h>
99 #include <sys/namei.h>
100 #include <sys/proc.h>
101 #include <sys/kauth.h>
102 #include <sys/buf.h>
103 #include <sys/malloc.h>
104 #include <sys/vnode_internal.h>
105 #include <sys/fcntl.h>
106 #include <sys/conf.h>
107 #include <sys/disk.h>
108 #include <sys/stat.h>
109 #include <sys/conf.h>
110 #include <sys/uio_internal.h>
111
112 #include <sys/vnioctl.h>
113
114 #include <sys/vm.h>
115
116 #include <vm/vm_pager.h>
117 #include <mach/memory_object_types.h>
118
119 #include <miscfs/devfs/devfs.h>
120
121
122 #include "shadow.h"
123
124 static ioctl_fcn_t vnioctl_chr;
125 static ioctl_fcn_t vnioctl_blk;
126 static open_close_fcn_t vnopen;
127 static open_close_fcn_t vnclose;
128 static psize_fcn_t vnsize;
129 static strategy_fcn_t vnstrategy;
130 static read_write_fcn_t vnread;
131 static read_write_fcn_t vnwrite;
132
133 static int vndevice_bdev_major;
134 static int vndevice_cdev_major;
135
136 /*
137 * cdevsw
138 * D_DISK we want to look like a disk
139 * D_CANFREE We support B_FREEBUF
140 */
141
/*
 * Block-device switch for the vn driver.  D_DISK makes the device
 * look like a disk to the rest of the system.
 */
static struct bdevsw vn_bdevsw = {
	/* open */	vnopen,
	/* close */	vnclose,
	/* strategy */	vnstrategy,
	/* ioctl */	vnioctl_blk,
	/* dump */	eno_dump,	/* no crash-dump support */
	/* psize */	vnsize,
	/* flags */	D_DISK,
};
151
/*
 * Character-device switch for the vn driver.  Only open/close,
 * read/write and ioctl are implemented; everything else is an
 * eno_* stub.
 */
static struct cdevsw vn_cdevsw = {
	/* open */	vnopen,
	/* close */	vnclose,
	/* read */	vnread,
	/* write */	vnwrite,
	/* ioctl */	vnioctl_chr,
	/* stop */	eno_stop,
	/* reset */	eno_reset,
	/* ttys */	0,
	/* select */	eno_select,
	/* mmap */	eno_mmap,
	/* strategy */	eno_strat,
	/* getc */	eno_getc,
	/* putc */	eno_putc,
	/* flags */	D_DISK,
};
168
/*
 * Per-unit driver state; one entry per minor number, statically
 * allocated in vn_table below.
 */
struct vn_softc {
	u_int64_t	sc_fsize;	/* file size in bytes */
	u_int64_t	sc_size;	/* size of vn, sc_secsize scale */
	int		sc_flags;	/* flags */
	u_long		sc_secsize;	/* sector size */
	struct vnode	*sc_vp;		/* vnode if not NULL */
	uint32_t	sc_vid;		/* vnode_vid(sc_vp) at attach; checked via vnode_getwithvid() */
	int		sc_open_flags;	/* FREAD/FWRITE flags the backing file was opened with */
	struct vnode	*sc_shadow_vp;	/* shadow vnode if not NULL */
	uint32_t	sc_shadow_vid;	/* vnode_vid(sc_shadow_vp) at shadow attach */
	shadow_map_t	*sc_shadow_map;	/* shadow map if not NULL */
	kauth_cred_t	sc_cred;	/* credentials */
	u_int32_t	sc_options;	/* options */
	void *		sc_bdev;	/* devfs node returned for the block device */
	void *		sc_cdev;	/* devfs node returned for the character device */
} vn_table[NVNDEVICE];
185
/* unit number used when attaching a root image via vndevice_root_image() */
#define ROOT_IMAGE_UNIT	0

/* sc_flags */
#define VNF_INITED	0x01	/* unit has a backing file attached */
#define VNF_READONLY	0x02	/* backing file opened without FWRITE; reject writes */

/* global options, OR'ed with each unit's sc_options (VNIOCGSET/VNIOCGCLEAR) */
static u_int32_t	vn_options;

/* NOTE: IFOPT expands to a bare `if` -- callers must supply the body/braces */
#define IFOPT(vn,opt) if (((vn)->sc_options|vn_options) & (opt))
#define TESTOPT(vn,opt) (((vn)->sc_options|vn_options) & (opt))
196
197 static int setcred(struct vnode * vp, struct proc * p,
198 kauth_cred_t cred);
199 static void vnclear (struct vn_softc *vn, struct proc * p);
200 static void vn_ioctl_to_64(struct vn_ioctl *from, struct user_vn_ioctl *to);
201 void vndevice_init(void);
202 int vndevice_root_image(char * path, char devname[], dev_t * dev_p);
203
204 static int
205 vniocattach_file(struct vn_softc *vn,
206 struct user_vn_ioctl *vniop,
207 dev_t dev,
208 int in_kernel,
209 struct proc *p);
210 static int
211 vniocattach_shadow(struct vn_softc * vn,
212 struct user_vn_ioctl *vniop,
213 dev_t dev,
214 int in_kernel,
215 struct proc *p);
/*
 * Map a vn device number to its unit number (index into vn_table);
 * the unit is simply the device's minor number.
 */
static __inline__ int
vnunit(dev_t dev)
{
	return minor(dev);
}
221
222 static int
223 vnclose(__unused dev_t dev, __unused int flags,
224 __unused int devtype, __unused struct proc *p)
225 {
226 return (0);
227 }
228
229 static int
230 vnopen(dev_t dev, int flags, __unused int devtype, __unused struct proc *p)
231 {
232 struct vn_softc *vn;
233 int unit;
234
235 unit = vnunit(dev);
236 if (vnunit(dev) >= NVNDEVICE) {
237 return (ENXIO);
238 }
239 vn = vn_table + unit;
240 if ((flags & FWRITE) && (vn->sc_flags & VNF_READONLY))
241 return (EACCES);
242
243 return(0);
244 }
245
246 static int
247 file_io(struct vnode * vp, struct vfs_context * context_p,
248 enum uio_rw op, char * base, off_t offset, user_ssize_t count,
249 user_ssize_t * resid)
250 {
251 uio_t auio;
252 int error;
253 char uio_buf[UIO_SIZEOF(1)];
254
255 auio = uio_createwithbuffer(1, offset, UIO_SYSSPACE, op,
256 &uio_buf[0], sizeof(uio_buf));
257 uio_addiov(auio, CAST_USER_ADDR_T(base), count);
258 if (op == UIO_READ)
259 error = VNOP_READ(vp, auio, IO_SYNC, context_p);
260 else
261 error = VNOP_WRITE(vp, auio, IO_SYNC, context_p);
262
263 if (resid != NULL) {
264 *resid = uio_resid(auio);
265 }
266 return (error);
267 }
268
/*
 * Number of blocksize-sized blocks needed to cover o bytes,
 * i.e. o / blocksize rounded up (for non-negative o).
 */
static __inline__ off_t
block_round(off_t o, int blocksize)
{
	off_t nblocks = (o + blocksize - 1) / blocksize;
	return nblocks;
}
274
/*
 * Block number containing byte offset o, i.e. o / blocksize
 * rounded down (for non-negative o).
 */
static __inline__ off_t
block_truncate(off_t o, int blocksize)
{
	off_t blkno = o / blocksize;
	return blkno;
}
280
/*
 * Byte offset of o within its containing block (o mod blocksize).
 */
static __inline__ int
block_remainder(off_t o, int blocksize)
{
	int rem = o % blocksize;
	return rem;
}
286
287 static int
288 vnread_shadow(struct vn_softc * vn, struct uio *uio, int ioflag,
289 struct vfs_context * context_p)
290 {
291 u_long blocksize = vn->sc_secsize;
292 int error = 0;
293 off_t offset;
294 user_ssize_t resid;
295 off_t orig_offset;
296 user_ssize_t orig_resid;
297
298 orig_resid = resid = uio_resid(uio);
299 orig_offset = offset = uio_offset(uio);
300
301 while (resid > 0) {
302 u_long remainder;
303 u_long this_block_number;
304 u_long this_block_count;
305 off_t this_offset;
306 user_ssize_t this_resid;
307 struct vnode * vp;
308
309 /* figure out which blocks to read */
310 remainder = block_remainder(offset, blocksize);
311 if (shadow_map_read(vn->sc_shadow_map,
312 block_truncate(offset, blocksize),
313 block_round(resid + remainder, blocksize),
314 &this_block_number, &this_block_count)) {
315 vp = vn->sc_shadow_vp;
316 }
317 else {
318 vp = vn->sc_vp;
319 }
320
321 /* read the blocks (or parts thereof) */
322 this_offset = (off_t)this_block_number * blocksize + remainder;
323 uio_setoffset(uio, this_offset);
324 this_resid = this_block_count * blocksize - remainder;
325 if (this_resid > resid) {
326 this_resid = resid;
327 }
328 uio_setresid(uio, this_resid);
329 error = VNOP_READ(vp, uio, ioflag, context_p);
330 if (error) {
331 break;
332 }
333
334 /* figure out how much we actually read */
335 this_resid -= uio_resid(uio);
336 if (this_resid == 0) {
337 printf("vn device: vnread_shadow zero length read\n");
338 break;
339 }
340 resid -= this_resid;
341 offset += this_resid;
342 }
343 uio_setresid(uio, resid);
344 uio_setoffset(uio, offset);
345 return (error);
346 }
347
348 static int
349 vncopy_block_to_shadow(struct vn_softc * vn, struct vfs_context * context_p,
350 u_long file_block, u_long shadow_block)
351 {
352 int error;
353 char * tmpbuf;
354
355 tmpbuf = _MALLOC(vn->sc_secsize, M_TEMP, M_WAITOK);
356 if (tmpbuf == NULL) {
357 return (ENOMEM);
358 }
359 /* read one block from file at file_block offset */
360 error = file_io(vn->sc_vp, context_p, UIO_READ,
361 tmpbuf, (off_t)file_block * vn->sc_secsize,
362 vn->sc_secsize, NULL);
363 if (error) {
364 goto done;
365 }
366 /* write one block to shadow file at shadow_block offset */
367 error = file_io(vn->sc_shadow_vp, context_p, UIO_WRITE,
368 tmpbuf, (off_t)shadow_block * vn->sc_secsize,
369 vn->sc_secsize, NULL);
370 done:
371 FREE(tmpbuf, M_TEMP);
372 return (error);
373 }
374
/* partial-block markers used by vnwrite_shadow() */
enum {
	FLAGS_FIRST_BLOCK_PARTIAL = 0x1,	/* write does not start on a block boundary */
	FLAGS_LAST_BLOCK_PARTIAL = 0x2		/* write does not end on a block boundary */
};
379
380 static int
381 vnwrite_shadow(struct vn_softc * vn, struct uio *uio, int ioflag,
382 struct vfs_context * context_p)
383 {
384 u_long blocksize = vn->sc_secsize;
385 int error = 0;
386 user_ssize_t resid;
387 off_t offset;
388
389 resid = uio_resid(uio);
390 offset = uio_offset(uio);
391
392 while (resid > 0) {
393 int flags = 0;
394 u_long offset_block_number;
395 u_long remainder;
396 u_long resid_block_count;
397 u_long shadow_block_count;
398 u_long shadow_block_number;
399 user_ssize_t this_resid;
400
401 /* figure out which blocks to write */
402 offset_block_number = block_truncate(offset, blocksize);
403 remainder = block_remainder(offset, blocksize);
404 resid_block_count = block_round(resid + remainder, blocksize);
405 /* figure out if the first or last blocks are partial writes */
406 if (remainder > 0
407 && !shadow_map_is_written(vn->sc_shadow_map,
408 offset_block_number)) {
409 /* the first block is a partial write */
410 flags |= FLAGS_FIRST_BLOCK_PARTIAL;
411 }
412 if (resid_block_count > 1
413 && !shadow_map_is_written(vn->sc_shadow_map,
414 offset_block_number
415 + resid_block_count - 1)
416 && block_remainder(offset + resid, blocksize) > 0) {
417 /* the last block is a partial write */
418 flags |= FLAGS_LAST_BLOCK_PARTIAL;
419 }
420 if (shadow_map_write(vn->sc_shadow_map,
421 offset_block_number, resid_block_count,
422 &shadow_block_number,
423 &shadow_block_count)) {
424 /* shadow file is growing */
425 #if 0
426 /* truncate the file to its new length before write */
427 off_t size;
428 size = (off_t)shadow_map_shadow_size(vn->sc_shadow_map)
429 * vn->sc_secsize;
430 vnode_setsize(vn->sc_shadow_vp, size, IO_SYNC,
431 context_p);
432 #endif 0
433 }
434 /* write the blocks (or parts thereof) */
435 uio_setoffset(uio, (off_t)
436 shadow_block_number * blocksize + remainder);
437 this_resid = (off_t)shadow_block_count * blocksize - remainder;
438 if (this_resid >= resid) {
439 this_resid = resid;
440 if ((flags & FLAGS_LAST_BLOCK_PARTIAL) != 0) {
441 /* copy the last block to the shadow */
442 u_long d;
443 u_long s;
444
445 s = offset_block_number
446 + resid_block_count - 1;
447 d = shadow_block_number
448 + shadow_block_count - 1;
449 error = vncopy_block_to_shadow(vn, context_p,
450 s, d);
451 if (error) {
452 printf("vnwrite_shadow: failed to copy"
453 " block %d to shadow block %d\n",
454 s, d);
455 break;
456 }
457 }
458 }
459 uio_setresid(uio, this_resid);
460 if ((flags & FLAGS_FIRST_BLOCK_PARTIAL) != 0) {
461 /* copy the first block to the shadow */
462 error = vncopy_block_to_shadow(vn, context_p,
463 offset_block_number,
464 shadow_block_number);
465 if (error) {
466 printf("vnwrite_shadow: failed to"
467 " copy block %d to shadow block %d\n",
468 offset_block_number,
469 shadow_block_number);
470 break;
471 }
472 }
473 error = VNOP_WRITE(vn->sc_shadow_vp, uio, ioflag, context_p);
474 if (error) {
475 break;
476 }
477 /* figure out how much we actually wrote */
478 this_resid -= uio_resid(uio);
479 if (this_resid == 0) {
480 printf("vn device: vnwrite_shadow zero length write\n");
481 break;
482 }
483 resid -= this_resid;
484 offset += this_resid;
485 }
486 uio_setresid(uio, resid);
487 uio_setoffset(uio, offset);
488 return (error);
489 }
490
491 static int
492 vnread(dev_t dev, struct uio *uio, int ioflag)
493 {
494 struct vfs_context context;
495 int error = 0;
496 boolean_t funnel_state;
497 off_t offset;
498 struct proc * p;
499 user_ssize_t resid;
500 struct vn_softc * vn;
501 int unit;
502
503 unit = vnunit(dev);
504 if (vnunit(dev) >= NVNDEVICE) {
505 return (ENXIO);
506 }
507 p = current_proc();
508 funnel_state = thread_funnel_set(kernel_flock, TRUE);
509 vn = vn_table + unit;
510 if ((vn->sc_flags & VNF_INITED) == 0) {
511 error = ENXIO;
512 goto done;
513 }
514 error = vnode_getwithvid(vn->sc_vp, vn->sc_vid);
515 if (error != 0) {
516 /* the vnode is no longer available, abort */
517 error = ENXIO;
518 vnclear(vn, p);
519 goto done;
520 }
521
522 resid = uio_resid(uio);
523 offset = uio_offset(uio);
524
525 /*
526 * If out of bounds return an error. If at the EOF point,
527 * simply read less.
528 */
529 if (offset >= (off_t)vn->sc_fsize) {
530 if (offset > (off_t)vn->sc_fsize) {
531 error = EINVAL;
532 }
533 goto done;
534 }
535 /*
536 * If the request crosses EOF, truncate the request.
537 */
538 if ((offset + resid) > (off_t)vn->sc_fsize) {
539 resid = vn->sc_fsize - offset;
540 uio_setresid(uio, resid);
541 }
542
543 context.vc_proc = p;
544 context.vc_ucred = vn->sc_cred;
545 if (vn->sc_shadow_vp != NULL) {
546 error = vnode_getwithvid(vn->sc_shadow_vp,
547 vn->sc_shadow_vid);
548 if (error != 0) {
549 /* the vnode is no longer available, abort */
550 error = ENXIO;
551 vnode_put(vn->sc_vp);
552 vnclear(vn, p);
553 goto done;
554 }
555 error = vnread_shadow(vn, uio, ioflag, &context);
556 vnode_put(vn->sc_shadow_vp);
557 } else {
558 error = VNOP_READ(vn->sc_vp, uio, ioflag, &context);
559 }
560 vnode_put(vn->sc_vp);
561 done:
562 (void) thread_funnel_set(kernel_flock, funnel_state);
563 return (error);
564 }
565
566 static int
567 vnwrite(dev_t dev, struct uio *uio, int ioflag)
568 {
569 struct vfs_context context;
570 int error;
571 boolean_t funnel_state;
572 off_t offset;
573 struct proc * p;
574 user_ssize_t resid;
575 struct vn_softc * vn;
576 int unit;
577
578 unit = vnunit(dev);
579 if (vnunit(dev) >= NVNDEVICE) {
580 return (ENXIO);
581 }
582 p = current_proc();
583 funnel_state = thread_funnel_set(kernel_flock, TRUE);
584 vn = vn_table + unit;
585 if ((vn->sc_flags & VNF_INITED) == 0) {
586 error = ENXIO;
587 goto done;
588 }
589 if (vn->sc_flags & VNF_READONLY) {
590 error = EROFS;
591 goto done;
592 }
593 error = vnode_getwithvid(vn->sc_vp, vn->sc_vid);
594 if (error != 0) {
595 /* the vnode is no longer available, abort */
596 error = ENXIO;
597 vnclear(vn, p);
598 goto done;
599 }
600 resid = uio_resid(uio);
601 offset = uio_offset(uio);
602
603 /*
604 * If out of bounds return an error. If at the EOF point,
605 * simply write less.
606 */
607 if (offset >= (off_t)vn->sc_fsize) {
608 if (offset > (off_t)vn->sc_fsize) {
609 error = EINVAL;
610 }
611 goto done;
612 }
613 /*
614 * If the request crosses EOF, truncate the request.
615 */
616 if ((offset + resid) > (off_t)vn->sc_fsize) {
617 resid = (off_t)vn->sc_fsize - offset;
618 uio_setresid(uio, resid);
619 }
620
621 context.vc_proc = p;
622 context.vc_ucred = vn->sc_cred;
623
624 if (vn->sc_shadow_vp != NULL) {
625 error = vnode_getwithvid(vn->sc_shadow_vp,
626 vn->sc_shadow_vid);
627 if (error != 0) {
628 /* the vnode is no longer available, abort */
629 error = ENXIO;
630 vnode_put(vn->sc_vp);
631 vnclear(vn, p);
632 goto done;
633 }
634 error = vnwrite_shadow(vn, uio, ioflag, &context);
635 vnode_put(vn->sc_shadow_vp);
636 } else {
637 error = VNOP_WRITE(vn->sc_vp, uio, ioflag, &context);
638 }
639 vnode_put(vn->sc_vp);
640 done:
641 (void) thread_funnel_set(kernel_flock, funnel_state);
642 return (error);
643 }
644
/*
 * shadow_read:
 *	Service a read buf for a shadowed unit.  Walks the request in
 *	sc_secsize-block units (offset/resid below are block counts,
 *	not bytes): shadow_map_read() splits the request into runs
 *	that come either from the shadow file or from the backing
 *	file, and each run is transferred into the mapped buffer with
 *	file_io().  The buf's residual is updated to reflect whatever
 *	was not transferred.
 */
static int
shadow_read(struct vn_softc * vn, struct buf * bp, char * base, struct proc * p)
{
	u_long		blocksize = vn->sc_secsize;
	struct vfs_context context;
	int 		error = 0;
	u_long		offset;		/* in blocks */
	boolean_t	read_shadow;
	u_long		resid;		/* in blocks */
	u_long		start = 0;	/* byte offset into the mapped buffer */

	context.vc_proc = p;
	context.vc_ucred = vn->sc_cred;
	offset = buf_blkno(bp);
	resid =	buf_resid(bp) / blocksize;
	while (resid > 0) {
		user_ssize_t	temp_resid;
		u_long		this_offset;
		u_long		this_resid;
		struct vnode *	vp;

		/* next run: either in the shadow file or the backing file */
		read_shadow = shadow_map_read(vn->sc_shadow_map,
					      offset, resid,
					      &this_offset, &this_resid);
		if (read_shadow) {
			vp = vn->sc_shadow_vp;
		}
		else {
			vp = vn->sc_vp;
		}
		error = file_io(vp, &context, UIO_READ, base + start,
				(off_t)this_offset * blocksize,
				(user_ssize_t)this_resid * blocksize,
				&temp_resid);
		if (error) {
			break;
		}
		/* convert bytes not transferred back to whole blocks */
		this_resid -= (temp_resid / blocksize);
		if (this_resid == 0) {
			/* no forward progress: bail rather than spin */
			printf("vn device: shadow_read zero length read\n");
			break;
		}
		resid -= this_resid;
		offset += this_resid;
		start += this_resid * blocksize;
	}
	buf_setresid(bp, resid * blocksize);
	return (error);
}
694
/*
 * shadow_write:
 *	Service a write buf for a shadowed unit.  All writes go to the
 *	shadow file: shadow_map_write() translates each run of original
 *	block numbers to shadow block numbers (growing the shadow map
 *	as needed), and file_io() performs the transfer.  offset/resid
 *	are in sc_secsize-block units, not bytes.  The buf's residual
 *	is updated to reflect whatever was not transferred.
 */
static int
shadow_write(struct vn_softc * vn, struct buf * bp, char * base,
	     struct proc * p)
{
	u_long		blocksize = vn->sc_secsize;
	struct vfs_context context;
	int 		error = 0;
	u_long		offset;		/* in blocks */
	boolean_t	shadow_grew;
	u_long		resid;		/* in blocks */
	u_long		start = 0;	/* byte offset into the mapped buffer */

	context.vc_proc = p;
	context.vc_ucred = vn->sc_cred;
	offset = buf_blkno(bp);
	resid =	buf_resid(bp) / blocksize;
	while (resid > 0) {
		user_ssize_t	temp_resid;
		u_long		this_offset;
		u_long		this_resid;

		shadow_grew = shadow_map_write(vn->sc_shadow_map,
					       offset, resid,
					       &this_offset, &this_resid);
		if (shadow_grew) {
#if 0
			off_t	size;
			/* truncate the file to its new length before write */
			size = (off_t)shadow_map_shadow_size(vn->sc_shadow_map)
				* blocksize;
			vnode_setsize(vn->sc_shadow_vp, size, IO_SYNC,
				      &context);
#endif
		}
		error = file_io(vn->sc_shadow_vp, &context, UIO_WRITE,
				base + start,
				(off_t)this_offset * blocksize,
				(user_ssize_t)this_resid * blocksize,
				&temp_resid);
		if (error) {
			break;
		}
		/* convert bytes not transferred back to whole blocks */
		this_resid -= (temp_resid / blocksize);
		if (this_resid == 0) {
			/* no forward progress: bail rather than spin */
			printf("vn device: shadow_write zero length write\n");
			break;
		}
		resid -= this_resid;
		offset += this_resid;
		start += this_resid * blocksize;
	}
	buf_setresid(bp, resid * blocksize);
	return (error);
}
749
750 static int
751 vn_readwrite_io(struct vn_softc * vn, struct buf * bp, struct proc * p)
752 {
753 int error = 0;
754 char * iov_base;
755 caddr_t vaddr;
756
757
758 if (buf_map(bp, &vaddr))
759 panic("vn device: buf_map failed");
760 iov_base = (char *)vaddr;
761
762 if (vn->sc_shadow_vp == NULL) {
763 struct vfs_context context;
764 user_ssize_t temp_resid;
765
766 context.vc_proc = p;
767 context.vc_ucred = vn->sc_cred;
768
769 error = file_io(vn->sc_vp, &context,
770 buf_flags(bp) & B_READ ? UIO_READ : UIO_WRITE,
771 iov_base,
772 (off_t)buf_blkno(bp) * vn->sc_secsize,
773 buf_resid(bp), &temp_resid);
774 buf_setresid(bp, temp_resid);
775 }
776 else {
777 if (buf_flags(bp) & B_READ)
778 error = shadow_read(vn, bp, iov_base, p);
779 else
780 error = shadow_write(vn, bp, iov_base, p);
781 }
782 buf_unmap(bp);
783
784 return (error);
785 }
786
/*
 * vnstrategy:
 *	Block-device strategy routine.  Validates that the transfer is
 *	a whole multiple of the sector size and lies within the unit's
 *	size (truncating a transfer that crosses EOF), takes iocounts
 *	on the backing (and, if present, shadow) vnodes, performs the
 *	transfer via vn_readwrite_io(), and completes the buf with
 *	buf_biodone() on every path.  Runs under the kernel funnel.
 */
static void
vnstrategy(struct buf *bp)
{
	struct vn_softc *vn;
	int error = 0;
	long sz;	/* in sc_secsize chunks */
	daddr64_t blk_num;
	boolean_t   		funnel_state;
	struct proc * 		p = current_proc();
	struct vnode *		shadow_vp = NULL;
	struct vnode *		vp = NULL;

	funnel_state = thread_funnel_set(kernel_flock, TRUE);
	vn = vn_table + vnunit(buf_device(bp));
	if ((vn->sc_flags & VNF_INITED) == 0) {
		error = ENXIO;
		goto done;
	}

	buf_setresid(bp, buf_count(bp));
	/*
	 * Check for required alignment.  Transfers must be a valid
	 * multiple of the sector size.
	 */
	blk_num = buf_blkno(bp);
	if (buf_count(bp) % vn->sc_secsize != 0) {
		error = EINVAL;
		goto done;
	}
	sz = howmany(buf_count(bp), vn->sc_secsize);

	/*
	 * If out of bounds return an error.  If at the EOF point,
	 * simply read or write less.
	 */
	if (blk_num >= 0 && (u_int64_t)blk_num >= vn->sc_size) {
		if (blk_num > 0 && (u_int64_t)blk_num > vn->sc_size) {
			error = EINVAL;
		}
		goto done;
	}
	/*
	 * If the request crosses EOF, truncate the request.
	 */
	if ((blk_num + sz) > 0 && ((u_int64_t)(blk_num + sz)) > vn->sc_size) {
		buf_setcount(bp, (vn->sc_size - blk_num) * vn->sc_secsize);
		buf_setresid(bp, buf_count(bp));
	}
	vp = vn->sc_vp;
	if (vp == NULL) {
		error = ENXIO;
		goto done;
	}
	/* take iocounts; a vid mismatch means the vnode was recycled */
	error = vnode_getwithvid(vp, vn->sc_vid);
	if (error != 0) {
		/* the vnode is no longer available, abort */
		error = ENXIO;
		vnclear(vn, p);
		goto done;
	}
	shadow_vp = vn->sc_shadow_vp;
	if (shadow_vp != NULL) {
		error = vnode_getwithvid(shadow_vp,
					 vn->sc_shadow_vid);
		if (error != 0) {
			/* the vnode is no longer available, abort */
			error = ENXIO;
			vnode_put(vn->sc_vp);
			vnclear(vn, p);
			goto done;
		}
	}
	error = vn_readwrite_io(vn, bp, p);
	vnode_put(vp);
	if (shadow_vp != NULL) {
		vnode_put(shadow_vp);
	}

 done:
	(void) thread_funnel_set(kernel_flock, funnel_state);
	if (error) {
	        buf_seterror(bp, error);
	}
	/* always complete the buf, success or failure */
	buf_biodone(bp);
	return;
}
873
/* ARGSUSED */
/*
 * vnioctl:
 *	Common ioctl handler for both the block and character nodes
 *	(is_char distinguishes them; attach/detach and block-size
 *	changes are only allowed on the block device).  Requires
 *	superuser.  Handles the DKIOC* disk-information ioctls and the
 *	VNIOC* attach/detach/shadow/option ioctls, converting 32-bit
 *	vn_ioctl structures to the user_vn_ioctl (LP64) layout that the
 *	attach paths expect.  Runs under the kernel funnel.
 */
static	int
vnioctl(dev_t dev, u_long cmd, caddr_t data,
	__unused int flag, struct proc *p,
	int is_char)
{
	struct vn_softc *vn;
	struct user_vn_ioctl *viop;
	int error;
	u_int32_t *f;
	u_int64_t * o;
	int unit;
	struct vfsioattr ioattr;
	struct user_vn_ioctl user_vnio;
	boolean_t   		funnel_state;

	unit = vnunit(dev);
	if (vnunit(dev) >= NVNDEVICE) {
		return (ENXIO);
	}

	funnel_state = thread_funnel_set(kernel_flock, TRUE);
	vn = vn_table + unit;
	error = proc_suser(p);
	if (error) {
		goto done;
	}

	/* the ioctl argument is interpreted per-command as one of these */
	viop = (struct user_vn_ioctl *)data;
	f = (u_int32_t *)data;
	o = (u_int64_t *)data;
	/* these commands are only valid once a backing file is attached */
	switch (cmd) {
	case VNIOCDETACH:
	case VNIOCDETACH64:
	case DKIOCGETBLOCKSIZE:
	case DKIOCSETBLOCKSIZE:
	case DKIOCGETMAXBLOCKCOUNTREAD:
	case DKIOCGETMAXBLOCKCOUNTWRITE:
	case DKIOCGETMAXSEGMENTCOUNTREAD:
	case DKIOCGETMAXSEGMENTCOUNTWRITE:
	case DKIOCGETMAXSEGMENTBYTECOUNTREAD:
	case DKIOCGETMAXSEGMENTBYTECOUNTWRITE:
	case DKIOCGETBLOCKCOUNT:
	case DKIOCGETBLOCKCOUNT32:
		if ((vn->sc_flags & VNF_INITED) == 0) {
			error = ENXIO;
			goto done;
		}
		break;
	default:
		break;
	}

	/* transfer limits are taken from the backing file's mount point */
	if (vn->sc_vp != NULL)
		vfs_ioattr(vnode_mount(vn->sc_vp), &ioattr);
	else
		bzero(&ioattr, sizeof(ioattr));

	switch (cmd) {
	case DKIOCISVIRTUAL:
		*f = 1;
		break;
	case DKIOCGETMAXBLOCKCOUNTREAD:
		*o = ioattr.io_maxreadcnt / vn->sc_secsize;
		break;
	case DKIOCGETMAXBLOCKCOUNTWRITE:
		*o = ioattr.io_maxwritecnt / vn->sc_secsize;
		break;
	case DKIOCGETMAXBYTECOUNTREAD:
		*o = ioattr.io_maxreadcnt;
		break;
	case DKIOCGETMAXBYTECOUNTWRITE:
		*o = ioattr.io_maxwritecnt;
		break;
	case DKIOCGETMAXSEGMENTCOUNTREAD:
		*o = ioattr.io_segreadcnt;
		break;
	case DKIOCGETMAXSEGMENTCOUNTWRITE:
		*o = ioattr.io_segwritecnt;
		break;
	case DKIOCGETMAXSEGMENTBYTECOUNTREAD:
		*o = ioattr.io_maxsegreadsize;
		break;
	case DKIOCGETMAXSEGMENTBYTECOUNTWRITE:
		*o = ioattr.io_maxsegwritesize;
		break;
	case DKIOCGETBLOCKSIZE:
	        *f = vn->sc_secsize;
		break;
	case DKIOCSETBLOCKSIZE:
		if (is_char) {
			/* can only set block size on block device */
			error = ENODEV;
			break;
		}
		if (*f < DEV_BSIZE) {
			error = EINVAL;
			break;
		}
		if (vn->sc_shadow_vp != NULL) {
			if (*f == (unsigned)vn->sc_secsize) {
				break;
			}
			/* can't change the block size if already shadowing */
			error = EBUSY;
			break;
		}
		vn->sc_secsize = *f;
		/* recompute the size in terms of the new blocksize */
		vn->sc_size = vn->sc_fsize / vn->sc_secsize;
		break;
	case DKIOCISWRITABLE:
		*f = 1;
		break;
	case DKIOCGETBLOCKCOUNT32:
		*f = vn->sc_size;
		break;
	case DKIOCGETBLOCKCOUNT:
		*o = vn->sc_size;
		break;
	case VNIOCSHADOW:
	case VNIOCSHADOW64:
		if (vn->sc_shadow_vp != NULL) {
			error = EBUSY;
			break;
		}
		if (vn->sc_vp == NULL) {
			/* must be attached before we can shadow */
			error = EINVAL;
			break;
		}
		if (!proc_is64bit(p)) {
			/* downstream code expects LP64 version of vn_ioctl structure */
			vn_ioctl_to_64((struct vn_ioctl *)viop, &user_vnio);
			viop = &user_vnio;
		}
		if (viop->vn_file == USER_ADDR_NULL) {
			error = EINVAL;
			break;
		}
		error = vniocattach_shadow(vn, viop, dev, 0, p);
		break;

	case VNIOCATTACH:
	case VNIOCATTACH64:
		if (is_char) {
			/* attach only on block device */
			error = ENODEV;
			break;
		}
		if (vn->sc_flags & VNF_INITED) {
			error = EBUSY;
			break;
		}
		if (!proc_is64bit(p)) {
			/* downstream code expects LP64 version of vn_ioctl structure */
			vn_ioctl_to_64((struct vn_ioctl *)viop, &user_vnio);
			viop = &user_vnio;
		}
		if (viop->vn_file == USER_ADDR_NULL) {
			error = EINVAL;
			break;
		}
		error = vniocattach_file(vn, viop, dev, 0, p);
		break;

	case VNIOCDETACH:
	case VNIOCDETACH64:
		if (is_char) {
			/* detach only on block device */
			error = ENODEV;
			break;
		}
		/* Note: spec_open won't open a mounted block device */

		/*
		 * XXX handle i/o in progress.  Return EBUSY, or wait, or
		 * flush the i/o.
		 * XXX handle multiple opens of the device.  Return EBUSY,
		 * or revoke the fd's.
		 * How are these problems handled for removable and failing
		 * hardware devices? (Hint: They are not)
		 */
		vnclear(vn, p);
		break;

	case VNIOCGSET:
		vn_options |= *f;
		*f = vn_options;
		break;

	case VNIOCGCLEAR:
		vn_options &= ~(*f);
		*f = vn_options;
		break;

	case VNIOCUSET:
		vn->sc_options |= *f;
		*f = vn->sc_options;
		break;

	case VNIOCUCLEAR:
		vn->sc_options &= ~(*f);
		*f = vn->sc_options;
		break;

	default:
		error = ENOTTY;
		break;
	}
 done:
	(void) thread_funnel_set(kernel_flock, funnel_state);
	return(error);
}
1088
1089 static int
1090 vnioctl_chr(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
1091 {
1092 return (vnioctl(dev, cmd, data, flag, p, TRUE));
1093 }
1094
1095 static int
1096 vnioctl_blk(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
1097 {
1098 return (vnioctl(dev, cmd, data, flag, p, FALSE));
1099 }
1100
1101 /*
1102 * vniocattach_file:
1103 *
1104 * Attach a file to a VN partition. Return the size in the vn_size
1105 * field.
1106 */
1107
/*
 * vniocattach_file:
 *	Attach a backing file to a vn unit.  Opens the file read/write
 *	(falling back to read-only on EACCES/EPERM/EROFS, which marks
 *	the unit VNF_READONLY), requires it to be a regular file,
 *	captures the caller's credentials for later i/o, records size
 *	and vnode identity in the softc, and creates the /dev/rvn%d
 *	character node.  in_kernel selects kernel- vs user-space
 *	interpretation of vniop->vn_file.
 */
static int
vniocattach_file(struct vn_softc *vn,
		 struct user_vn_ioctl *vniop,
		 dev_t dev,
		 int in_kernel,
		 struct proc *p)
{
	dev_t	cdev;
	struct vfs_context 	context;
	kauth_cred_t		cred;
	struct nameidata 	nd;
	off_t			file_size;
	int			error, flags;

	context.vc_proc = p;
	context.vc_ucred = proc_ucred(p);

	flags = FREAD|FWRITE;
	if (in_kernel) {
		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE32, vniop->vn_file, &context);
	}
	else {
		NDINIT(&nd, LOOKUP, FOLLOW,
		       (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
		       vniop->vn_file, &context);
	}
	/* vn_open gives both long- and short-term references */
	error = vn_open(&nd, flags, 0);
	if (error) {
		if (error != EACCES && error != EPERM && error != EROFS)
			return (error);
		/* retry the open read-only */
		flags &= ~FWRITE;
		if (in_kernel) {
			NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE32,
			       vniop->vn_file, &context);
		}
		else {
			NDINIT(&nd, LOOKUP, FOLLOW,
			       (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
			       vniop->vn_file, &context);
		}
		error = vn_open(&nd, flags, 0);
		if (error)
			return (error);
	}
	/* only a regular file may back a vn unit */
	if (nd.ni_vp->v_type != VREG) {
		error = EINVAL;
	}
	else {
		error = vnode_size(nd.ni_vp, &file_size, &context);
	}
	if (error != 0) {
		(void) vn_close(nd.ni_vp, flags, proc_ucred(p), p);
		vnode_put(nd.ni_vp);
		return (error);
	}
	cred = kauth_cred_proc_ref(p);
	/* bypass the buffer cache for i/o on the backing file */
	nd.ni_vp->v_flag |= VNOCACHE_DATA;
	error = setcred(nd.ni_vp, p, cred);
	if (error) {
		(void)vn_close(nd.ni_vp, flags, proc_ucred(p), p);
		vnode_put(nd.ni_vp);
		kauth_cred_rele(cred);
		return(error);
	}
	vn->sc_secsize = DEV_BSIZE;
	vn->sc_fsize = file_size;
	vn->sc_size = file_size / vn->sc_secsize;
	vn->sc_vp = nd.ni_vp;
	vn->sc_vid = vnode_vid(nd.ni_vp);
	vn->sc_open_flags = flags;
	vn->sc_cred = cred;
	cdev = makedev(vndevice_cdev_major, minor(dev));
	vn->sc_cdev = devfs_make_node(cdev, DEVFS_CHAR,
				      UID_ROOT, GID_OPERATOR,
				      0600, "rvn%d",
				      minor(dev));
	vn->sc_flags |= VNF_INITED;
	if (flags == FREAD)
		vn->sc_flags |= VNF_READONLY;
	/* lose the short-term reference */
	vnode_put(nd.ni_vp);
	return(0);
}
1192
/*
 * vniocattach_shadow:
 *
 * Attach a shadow (copy-on-write overlay) file to an already-attached
 * vn device.  The shadow must be opened read/write; on success the
 * device becomes writable even if the base file is read-only.
 * Returns 0 on success or an errno.
 */
static int
vniocattach_shadow(struct vn_softc *vn, struct user_vn_ioctl *vniop,
		   __unused int dev, int in_kernel, struct proc *p)
{
	struct vfs_context 	context;
	struct nameidata 	nd;
	int 			error, flags;
	shadow_map_t *		map;
	off_t			file_size;

	context.vc_proc = p;
	context.vc_ucred = proc_ucred(p);

	flags = FREAD|FWRITE;
	if (in_kernel) {
		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE32, vniop->vn_file, &context);
	}
	else {
		NDINIT(&nd, LOOKUP, FOLLOW,
		       (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
		       vniop->vn_file, &context);
	}
	/* vn_open gives both long- and short-term references */
	error = vn_open(&nd, flags, 0);
	if (error) {
		/* shadow MUST be writable!  No read-only retry here. */
		return (error);
	}
	/* shadow must be a regular file with a determinable size */
	if (nd.ni_vp->v_type != VREG
	    || (error = vnode_size(nd.ni_vp, &file_size, &context))) {
		(void)vn_close(nd.ni_vp, flags, proc_ucred(p), p);
		vnode_put(nd.ni_vp);
		return (error ? error : EINVAL);
	}
	map = shadow_map_create(vn->sc_fsize, file_size,
				0, vn->sc_secsize);
	if (map == NULL) {
		(void)vn_close(nd.ni_vp, flags, proc_ucred(p), p);
		vnode_put(nd.ni_vp);
		vn->sc_shadow_vp = NULL;
		return (ENOMEM);
	}
	vn->sc_shadow_vp = nd.ni_vp;
	vn->sc_shadow_vid = vnode_vid(nd.ni_vp);
	/* bypass the buffer cache on the shadow as well */
	vn->sc_shadow_vp->v_flag |= VNOCACHE_DATA;
	vn->sc_shadow_map = map;
	vn->sc_flags &= ~VNF_READONLY;	/* we're now read/write */

	/* lose the short-term reference */
	vnode_put(nd.ni_vp);
	return(0);
}
1245
1246 int
1247 vndevice_root_image(char * path, char devname[], dev_t * dev_p)
1248 {
1249 int error = 0;
1250 struct vn_softc * vn;
1251 struct user_vn_ioctl vnio;
1252
1253 vnio.vn_file = CAST_USER_ADDR_T(path);
1254 vnio.vn_size = 0;
1255
1256 vn = vn_table + ROOT_IMAGE_UNIT;
1257 *dev_p = makedev(vndevice_bdev_major,
1258 ROOT_IMAGE_UNIT);
1259 sprintf(devname, "vn%d", ROOT_IMAGE_UNIT);
1260 error = vniocattach_file(vn, &vnio, *dev_p, 1, current_proc());
1261 return (error);
1262 }
1263
1264 /*
1265 * Duplicate the current processes' credentials. Since we are called only
1266 * as the result of a SET ioctl and only root can do that, any future access
1267 * to this "disk" is essentially as root. Note that credentials may change
1268 * if some other uid can write directly to the mapped file (NFS).
1269 */
1270 static int
1271 setcred(struct vnode * vp, struct proc * p, kauth_cred_t cred)
1272 {
1273 char *tmpbuf;
1274 int error = 0;
1275 struct vfs_context context;
1276
1277 /*
1278 * Horrible kludge to establish credentials for NFS XXX.
1279 */
1280 context.vc_proc = p;
1281 context.vc_ucred = cred;
1282 tmpbuf = _MALLOC(DEV_BSIZE, M_TEMP, M_WAITOK);
1283 error = file_io(vp, &context, UIO_READ, tmpbuf, 0, DEV_BSIZE, NULL);
1284 FREE(tmpbuf, M_TEMP);
1285 return (error);
1286 }
1287
1288 void
1289 vnclear(struct vn_softc *vn, struct proc * p)
1290 {
1291 if (vn->sc_vp != NULL) {
1292 /* release long-term reference */
1293 (void)vn_close(vn->sc_vp, vn->sc_open_flags, vn->sc_cred, p);
1294 vn->sc_vp = NULL;
1295 }
1296 if (vn->sc_shadow_vp != NULL) {
1297 /* release long-term reference */
1298 (void)vn_close(vn->sc_shadow_vp, FREAD | FWRITE,
1299 vn->sc_cred, p);
1300 vn->sc_shadow_vp = NULL;
1301 }
1302 if (vn->sc_shadow_map != NULL) {
1303 shadow_map_free(vn->sc_shadow_map);
1304 vn->sc_shadow_map = NULL;
1305 }
1306 vn->sc_flags &= ~(VNF_INITED | VNF_READONLY);
1307 if (vn->sc_cred) {
1308 kauth_cred_rele(vn->sc_cred);
1309 vn->sc_cred = NULL;
1310 }
1311 vn->sc_size = 0;
1312 vn->sc_fsize = 0;
1313 if (vn->sc_cdev) {
1314 devfs_remove(vn->sc_cdev);
1315 vn->sc_cdev = NULL;
1316 }
1317 }
1318
1319 static int
1320 vnsize(dev_t dev)
1321 {
1322 int secsize;
1323 struct vn_softc *vn;
1324 int unit;
1325 boolean_t funnel_state;
1326
1327 unit = vnunit(dev);
1328 if (vnunit(dev) >= NVNDEVICE) {
1329 return (-1);
1330 }
1331
1332 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1333 vn = vn_table + unit;
1334 if ((vn->sc_flags & VNF_INITED) == 0)
1335 secsize = -1;
1336 else
1337 secsize = vn->sc_secsize;
1338 (void) thread_funnel_set(kernel_flock, funnel_state);
1339 return (secsize);
1340 }
1341
#define CDEV_MAJOR	-1	/* -1: let cdevsw_add_with_bdev() pick a free slot */
#define BDEV_MAJOR	-1	/* -1: let bdevsw_add() pick a free slot */
/* guard flag checked by vndevice_init() to make initialization one-shot */
static int vndevice_inited = 0;
1345
1346 void
1347 vndevice_init(void)
1348 {
1349 int i;
1350
1351 if (vndevice_inited)
1352 return;
1353 vndevice_bdev_major = bdevsw_add(BDEV_MAJOR, &vn_bdevsw);
1354
1355 if (vndevice_bdev_major < 0) {
1356 printf("vndevice_init: bdevsw_add() returned %d\n",
1357 vndevice_bdev_major);
1358 return;
1359 }
1360 vndevice_cdev_major = cdevsw_add_with_bdev(CDEV_MAJOR, &vn_cdevsw,
1361 vndevice_bdev_major);
1362 if (vndevice_cdev_major < 0) {
1363 printf("vndevice_init: cdevsw_add() returned %d\n",
1364 vndevice_cdev_major);
1365 return;
1366 }
1367 for (i = 0; i < NVNDEVICE; i++) {
1368 dev_t dev = makedev(vndevice_bdev_major, i);
1369 vn_table[i].sc_bdev = devfs_make_node(dev, DEVFS_BLOCK,
1370 UID_ROOT, GID_OPERATOR,
1371 0600, "vn%d",
1372 i);
1373 if (vn_table[i].sc_bdev == NULL)
1374 printf("vninit: devfs_make_node failed!\n");
1375 }
1376 }
1377
1378 static void
1379 vn_ioctl_to_64(struct vn_ioctl *from, struct user_vn_ioctl *to)
1380 {
1381 to->vn_file = CAST_USER_ADDR_T(from->vn_file);
1382 to->vn_size = from->vn_size;
1383 to->vn_control = from->vn_control;
1384 }
1385
1386 #endif /* NVNDEVICE */