/* bsd/vfs/vfs_syscalls.c — apple/xnu (xnu-7195.101.1) */
1 /*
2 * Copyright (c) 1995-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/fsctl.h>
101 #include <sys/ubc_internal.h>
102 #include <sys/disk.h>
103 #include <sys/content_protection.h>
104 #include <sys/clonefile.h>
105 #include <sys/snapshot.h>
106 #include <sys/priv.h>
107 #include <sys/fsgetpath.h>
108 #include <machine/cons.h>
109 #include <machine/limits.h>
110 #include <miscfs/specfs/specdev.h>
111
112 #include <vfs/vfs_disk_conditioner.h>
113
114 #include <security/audit/audit.h>
115 #include <bsm/audit_kevents.h>
116
117 #include <mach/mach_types.h>
118 #include <kern/kern_types.h>
119 #include <kern/kalloc.h>
120 #include <kern/task.h>
121
122 #include <vm/vm_pageout.h>
123 #include <vm/vm_protos.h>
124
125 #include <libkern/OSAtomic.h>
126 #include <os/atomic_private.h>
127 #include <pexpert/pexpert.h>
128 #include <IOKit/IOBSD.h>
129
130 // deps for MIG call
131 #include <kern/host.h>
132 #include <kern/ipc_misc.h>
133 #include <mach/host_priv.h>
134 #include <mach/vfs_nspace.h>
135 #include <os/log.h>
136
137 #include <nfs/nfs_conf.h>
138
139 #if ROUTEFS
140 #include <miscfs/routefs/routefs.h>
141 #endif /* ROUTEFS */
142
143 #if CONFIG_MACF
144 #include <security/mac.h>
145 #include <security/mac_framework.h>
146 #endif
147
148 #if CONFIG_FSE
149 #define GET_PATH(x) \
150 ((x) = get_pathbuff())
151 #define RELEASE_PATH(x) \
152 release_pathbuff(x)
153 #else
154 #define GET_PATH(x) \
155 ((x) = zalloc(ZV_NAMEI))
156 #define RELEASE_PATH(x) \
157 zfree(ZV_NAMEI, x)
158 #endif /* CONFIG_FSE */
159
160 #ifndef HFS_GET_BOOT_INFO
161 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
162 #endif
163
164 #ifndef HFS_SET_BOOT_INFO
165 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
166 #endif
167
168 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
169 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
170 #endif
171
172 /*
173 * If you need accounting for KM_FD_VN_DATA consider using
174 * ZONE_VIEW_DEFINE to define a zone view.
175 */
176 #define KM_FD_VN_DATA KHEAP_DEFAULT
177
178 extern void disk_conditioner_unmount(mount_t mp);
179
180 /* struct for checkdirs iteration */
181 struct cdirargs {
182 vnode_t olddp;
183 vnode_t newdp;
184 };
185 /* callback for checkdirs iteration */
186 static int checkdirs_callback(proc_t p, void * arg);
187
188 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
189 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
190 void enablequotas(struct mount *mp, vfs_context_t ctx);
191 static int getfsstat_callback(mount_t mp, void * arg);
192 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
193 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
194 static int sync_callback(mount_t, void *);
195 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
196 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
197 boolean_t partial_copy);
198 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
199 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
200 struct componentname *cnp, user_addr_t fsmountargs,
201 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
202 vfs_context_t ctx);
203 void vfs_notify_mount(vnode_t pdvp);
204
205 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
206
207 struct fd_vn_data * fg_vn_data_alloc(void);
208
209 /*
210 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
211 * Concurrent lookups (or lookups by ids) on hard links can cause the
212 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
213 * does) to return ENOENT as the path cannot be returned from the name cache
214 * alone. We have no option but to retry and hope to get one namei->reverse path
215 * generation done without an intervening lookup, lookup by id on the hard link
216 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
217 * which currently are the MAC hooks for rename, unlink and rmdir.
218 */
219 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
220
221 /* Max retry limit for rename due to vnode recycling. */
222 #define MAX_RENAME_ERECYCLE_RETRIES 1024
223
224 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
225 int unlink_flags);
226
227 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *);
228
229 #ifdef CONFIG_IMGSRC_ACCESS
230 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
231 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
232 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
233 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
234 static void mount_end_update(mount_t mp);
235 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
236 #endif /* CONFIG_IMGSRC_ACCESS */
237
238 #if CONFIG_LOCKERBOOT
239 int mount_locker_protoboot(const char *fsname, const char *mntpoint,
240 const char *pbdevpath);
241 #endif
242
243 //snapshot functions
244 #if CONFIG_MNT_ROOTSNAP
245 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
246 #else
247 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
248 #endif
249
250 __private_extern__
251 int sync_internal(void);
252
253 __private_extern__
254 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
255
256 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
257 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
258
259 /* vars for sync mutex */
260 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
261 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
262
263 extern lck_rw_t rootvnode_rw_lock;
264
265 /*
266 * incremented each time a mount or unmount operation occurs
267 * used to invalidate the cached value of the rootvp in the
268 * mount structure utilized by cache_lookup_path
269 */
270 uint32_t mount_generation = 0;
271
272 /* counts number of mount and unmount operations */
273 unsigned int vfs_nummntops = 0;
274
275 /* system-wide, per-boot unique mount ID */
276 static _Atomic uint64_t mount_unique_id = 1;
277
278 extern const struct fileops vnops;
279 #if CONFIG_APPLEDOUBLE
280 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
281 #endif /* CONFIG_APPLEDOUBLE */
282
283 /*
284 * Virtual File System System Calls
285 */
286
287 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
288 /*
289 * Private in-kernel mounting spi (NFS only, not exported)
290 */
291 __private_extern__
292 boolean_t
293 vfs_iskernelmount(mount_t mp)
294 {
295 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
296 }
297
/*
 * kernel_mount:
 *	In-kernel mount entry point (NFS/devfs/routefs only, not exported).
 *
 * Parameters:
 *	fstype		filesystem type name passed through to mount_common()
 *	pvp		parent of the covered vnode, or NULLVP to look it up
 *	vp		vnode to cover, or NULLVP to look it up from 'path'
 *	path		kernel-space path of the mount point
 *	data		filesystem-specific mount arguments (kernel address,
 *			cast to user_addr_t for mount_common())
 *	datalen		unused
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	KERNEL_MOUNT_* internal flags
 *	ctx		caller's VFS context
 *
 * Returns:	0 on success, errno otherwise.
 *
 * Note: when vp is supplied by the caller, no namei() is performed and
 * the caller retains responsibility for the iocounts on vp/pvp; when it
 * is looked up here, the references taken by namei() are dropped before
 * returning.
 */
__private_extern__
int
kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	/* Path lives in kernel space, hence UIO_SYSSPACE. */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log lookup failures only for snapshot / by-role mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the covered vnode; fake up just enough of
		 * the componentname (path buffer + length) for mount_common().
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* labelstr is NULL: kernel mounts never carry a MAC label string. */
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, TRUE, ctx);

	if (did_namei) {
		/* Drop the iocounts taken by namei() above. */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
#endif /* CONFIG_NFS_CLIENT || DEVFS || ROUTEFS */
344
345 /*
346 * Mount a file system.
347 */
348 /* ARGSUSED */
349 int
350 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
351 {
352 struct __mac_mount_args muap;
353
354 muap.type = uap->type;
355 muap.path = uap->path;
356 muap.flags = uap->flags;
357 muap.data = uap->data;
358 muap.mac_p = USER_ADDR_NULL;
359 return __mac_mount(p, &muap, retval);
360 }
361
362 int
363 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
364 {
365 struct componentname cn;
366 vfs_context_t ctx = vfs_context_current();
367 size_t dummy = 0;
368 int error;
369 int flags = uap->flags;
370 char fstypename[MFSNAMELEN];
371 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
372 vnode_t pvp;
373 vnode_t vp;
374
375 AUDIT_ARG(fd, uap->fd);
376 AUDIT_ARG(fflags, flags);
377 /* fstypename will get audited by mount_common */
378
379 /* Sanity check the flags */
380 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
381 return ENOTSUP;
382 }
383
384 if (flags & MNT_UNION) {
385 return EPERM;
386 }
387
388 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
389 if (error) {
390 return error;
391 }
392
393 if ((error = file_vnode(uap->fd, &vp)) != 0) {
394 return error;
395 }
396
397 if ((error = vnode_getwithref(vp)) != 0) {
398 file_drop(uap->fd);
399 return error;
400 }
401
402 pvp = vnode_getparent(vp);
403 if (pvp == NULL) {
404 vnode_put(vp);
405 file_drop(uap->fd);
406 return EINVAL;
407 }
408
409 memset(&cn, 0, sizeof(struct componentname));
410 cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
411 cn.cn_pnlen = MAXPATHLEN;
412
413 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
414 zfree(ZV_NAMEI, cn.cn_pnbuf);
415 vnode_put(pvp);
416 vnode_put(vp);
417 file_drop(uap->fd);
418 return error;
419 }
420
421 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
422
423 zfree(ZV_NAMEI, cn.cn_pnbuf);
424 vnode_put(pvp);
425 vnode_put(vp);
426 file_drop(uap->fd);
427
428 return error;
429 }
430
/*
 * vfs_notify_mount:
 *	Announce a completed mount: signal a VQ_MOUNT vfs event (global,
 *	no specific mount) and post NOTE_WRITE on the parent directory of
 *	the covered vnode so kqueue watchers of that directory wake up.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
437
438 /*
439 * __mac_mount:
440 * Mount a file system taking into account MAC label behavior.
441 * See mount(2) man page for more information
442 *
443 * Parameters: p Process requesting the mount
444 * uap User argument descriptor (see below)
445 * retval (ignored)
446 *
447 * Indirect: uap->type Filesystem type
448 * uap->path Path to mount
449 * uap->data Mount arguments
450 * uap->mac_p MAC info
451 * uap->flags Mount flags
452 *
453 *
454 * Returns: 0 Success
455 * !0 Not success
456 */
457 boolean_t root_fs_upgrade_try = FALSE;
458
459 int
460 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
461 {
462 vnode_t pvp = NULL;
463 vnode_t vp = NULL;
464 int need_nameidone = 0;
465 vfs_context_t ctx = vfs_context_current();
466 char fstypename[MFSNAMELEN];
467 struct nameidata nd;
468 size_t dummy = 0;
469 char *labelstr = NULL;
470 size_t labelsz = 0;
471 int flags = uap->flags;
472 int error;
473 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
474 boolean_t is_64bit = IS_64BIT_PROCESS(p);
475 #else
476 #pragma unused(p)
477 #endif
478 /*
479 * Get the fs type name from user space
480 */
481 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
482 if (error) {
483 return error;
484 }
485
486 /*
487 * Get the vnode to be covered
488 */
489 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
490 UIO_USERSPACE, uap->path, ctx);
491 error = namei(&nd);
492 if (error) {
493 goto out;
494 }
495 need_nameidone = 1;
496 vp = nd.ni_vp;
497 pvp = nd.ni_dvp;
498
499 #ifdef CONFIG_IMGSRC_ACCESS
500 /* Mounting image source cannot be batched with other operations */
501 if (flags == MNT_IMGSRC_BY_INDEX) {
502 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
503 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
504 goto out;
505 }
506 #endif /* CONFIG_IMGSRC_ACCESS */
507
508 #if CONFIG_MACF
509 /*
510 * Get the label string (if any) from user space
511 */
512 if (uap->mac_p != USER_ADDR_NULL) {
513 struct user_mac mac;
514 size_t ulen = 0;
515
516 if (is_64bit) {
517 struct user64_mac mac64;
518 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
519 mac.m_buflen = (user_size_t)mac64.m_buflen;
520 mac.m_string = (user_addr_t)mac64.m_string;
521 } else {
522 struct user32_mac mac32;
523 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
524 mac.m_buflen = mac32.m_buflen;
525 mac.m_string = mac32.m_string;
526 }
527 if (error) {
528 goto out;
529 }
530 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
531 (mac.m_buflen < 2)) {
532 error = EINVAL;
533 goto out;
534 }
535 labelsz = mac.m_buflen;
536 labelstr = kheap_alloc(KHEAP_TEMP, labelsz, Z_WAITOK);
537 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
538 if (error) {
539 goto out;
540 }
541 AUDIT_ARG(mac_string, labelstr);
542 }
543 #endif /* CONFIG_MACF */
544
545 AUDIT_ARG(fflags, flags);
546
547 #if SECURE_KERNEL
548 if (flags & MNT_UNION) {
549 /* No union mounts on release kernels */
550 error = EPERM;
551 goto out;
552 }
553 #endif
554
555 if ((vp->v_flag & VROOT) &&
556 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
557 if (!(flags & MNT_UNION)) {
558 flags |= MNT_UPDATE;
559 } else {
560 /*
561 * For a union mount on '/', treat it as fresh
562 * mount instead of update.
563 * Otherwise, union mouting on '/' used to panic the
564 * system before, since mnt_vnodecovered was found to
565 * be NULL for '/' which is required for unionlookup
566 * after it gets ENOENT on union mount.
567 */
568 flags = (flags & ~(MNT_UPDATE));
569 }
570
571 #if SECURE_KERNEL
572 if ((flags & MNT_RDONLY) == 0) {
573 /* Release kernels are not allowed to mount "/" as rw */
574 error = EPERM;
575 goto out;
576 }
577 #endif
578 /*
579 * See 7392553 for more details on why this check exists.
580 * Suffice to say: If this check is ON and something tries
581 * to mount the rootFS RW, we'll turn off the codesign
582 * bitmap optimization.
583 */
584 #if CHECK_CS_VALIDATION_BITMAP
585 if ((flags & MNT_RDONLY) == 0) {
586 root_fs_upgrade_try = TRUE;
587 }
588 #endif
589 }
590
591 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
592 labelstr, FALSE, ctx);
593
594 out:
595
596 #if CONFIG_MACF
597 kheap_free(KHEAP_DEFAULT, labelstr, labelsz);
598 #endif /* CONFIG_MACF */
599
600 if (vp) {
601 vnode_put(vp);
602 }
603 if (pvp) {
604 vnode_put(pvp);
605 }
606 if (need_nameidone) {
607 nameidone(&nd);
608 }
609
610 return error;
611 }
612
613 /*
614 * common mount implementation (final stage of mounting)
615 *
616 * Arguments:
617 * fstypename file system type (ie it's vfs name)
618 * pvp parent of covered vnode
619 * vp covered vnode
620 * cnp component name (ie path) of covered vnode
621 * flags generic mount flags
622 * fsmountargs file system specific data
623 * labelstr optional MAC label
624 * kernelmount TRUE for mounts initiated from inside the kernel
625 * ctx caller's context
626 */
627 static int
628 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
629 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
630 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
631 {
632 #if !CONFIG_MACF
633 #pragma unused(labelstr)
634 #endif
635 struct vnode *devvp = NULLVP;
636 struct vnode *device_vnode = NULLVP;
637 #if CONFIG_MACF
638 struct vnode *rvp;
639 #endif
640 struct mount *mp;
641 struct vfstable *vfsp = (struct vfstable *)0;
642 struct proc *p = vfs_context_proc(ctx);
643 int error, flag = 0;
644 bool flag_set = false;
645 user_addr_t devpath = USER_ADDR_NULL;
646 int ronly = 0;
647 int mntalloc = 0;
648 boolean_t vfsp_ref = FALSE;
649 boolean_t is_rwlock_locked = FALSE;
650 boolean_t did_rele = FALSE;
651 boolean_t have_usecount = FALSE;
652 boolean_t did_set_lmount = FALSE;
653
654 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
655 /* Check for mutually-exclusive flag bits */
656 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
657 int bitcount = 0;
658 while (checkflags != 0) {
659 checkflags &= (checkflags - 1);
660 bitcount++;
661 }
662
663 if (bitcount > 1) {
664 //not allowed to request multiple mount-by-role flags
665 error = EINVAL;
666 goto out1;
667 }
668 #endif
669
670 /*
671 * Process an update for an existing mount
672 */
673 if (flags & MNT_UPDATE) {
674 if ((vp->v_flag & VROOT) == 0) {
675 error = EINVAL;
676 goto out1;
677 }
678 mp = vp->v_mount;
679
680 /* if unmount or mount in progress, return error */
681 mount_lock_spin(mp);
682 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
683 mount_unlock(mp);
684 error = EBUSY;
685 goto out1;
686 }
687 mp->mnt_lflag |= MNT_LMOUNT;
688 did_set_lmount = TRUE;
689 mount_unlock(mp);
690 lck_rw_lock_exclusive(&mp->mnt_rwlock);
691 is_rwlock_locked = TRUE;
692 /*
693 * We only allow the filesystem to be reloaded if it
694 * is currently mounted read-only.
695 */
696 if ((flags & MNT_RELOAD) &&
697 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
698 error = ENOTSUP;
699 goto out1;
700 }
701
702 /*
703 * If content protection is enabled, update mounts are not
704 * allowed to turn it off.
705 */
706 if ((mp->mnt_flag & MNT_CPROTECT) &&
707 ((flags & MNT_CPROTECT) == 0)) {
708 error = EINVAL;
709 goto out1;
710 }
711
712 /*
713 * can't turn off MNT_REMOVABLE either but it may be an unexpected
714 * failure to return an error for this so we'll just silently
715 * add it if it is not passed in.
716 */
717 if ((mp->mnt_flag & MNT_REMOVABLE) &&
718 ((flags & MNT_REMOVABLE) == 0)) {
719 flags |= MNT_REMOVABLE;
720 }
721
722 /* Can't downgrade the backer of the root FS */
723 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
724 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
725 error = ENOTSUP;
726 goto out1;
727 }
728
729 /*
730 * Only root, or the user that did the original mount is
731 * permitted to update it.
732 */
733 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
734 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
735 goto out1;
736 }
737 #if CONFIG_MACF
738 error = mac_mount_check_remount(ctx, mp);
739 if (error != 0) {
740 goto out1;
741 }
742 #endif
743 /*
744 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
745 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
746 */
747 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
748 flags |= MNT_NOSUID | MNT_NODEV;
749 if (mp->mnt_flag & MNT_NOEXEC) {
750 flags |= MNT_NOEXEC;
751 }
752 }
753 flag = mp->mnt_flag;
754 flag_set = true;
755
756
757
758 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
759
760 vfsp = mp->mnt_vtable;
761 goto update;
762 } // MNT_UPDATE
763
764 /*
765 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
766 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
767 */
768 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
769 flags |= MNT_NOSUID | MNT_NODEV;
770 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
771 flags |= MNT_NOEXEC;
772 }
773 }
774
775 /* XXXAUDIT: Should we capture the type on the error path as well? */
776 AUDIT_ARG(text, fstypename);
777 mount_list_lock();
778 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
779 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
780 vfsp->vfc_refcount++;
781 vfsp_ref = TRUE;
782 break;
783 }
784 }
785 mount_list_unlock();
786 if (vfsp == NULL) {
787 error = ENODEV;
788 goto out1;
789 }
790
791 /*
792 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
793 * except in ROSV configs and for the initial BaseSystem root.
794 */
795 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
796 ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
797 ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
798 error = EINVAL; /* unsupported request */
799 goto out1;
800 }
801
802 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
803 if (error != 0) {
804 goto out1;
805 }
806
807 /*
808 * Allocate and initialize the filesystem (mount_t)
809 */
810 mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
811 mntalloc = 1;
812
813 /* Initialize the default IO constraints */
814 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
815 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
816 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
817 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
818 mp->mnt_devblocksize = DEV_BSIZE;
819 mp->mnt_alignmentmask = PAGE_MASK;
820 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
821 mp->mnt_ioscale = 1;
822 mp->mnt_ioflags = 0;
823 mp->mnt_realrootvp = NULLVP;
824 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
825
826 mp->mnt_lflag |= MNT_LMOUNT;
827 did_set_lmount = TRUE;
828
829 TAILQ_INIT(&mp->mnt_vnodelist);
830 TAILQ_INIT(&mp->mnt_workerqueue);
831 TAILQ_INIT(&mp->mnt_newvnodes);
832 mount_lock_init(mp);
833 lck_rw_lock_exclusive(&mp->mnt_rwlock);
834 is_rwlock_locked = TRUE;
835 mp->mnt_op = vfsp->vfc_vfsops;
836 mp->mnt_vtable = vfsp;
837 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
838 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
839 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
840 do {
841 int pathlen = MAXPATHLEN;
842
843 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
844 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
845 }
846 } while (0);
847 mp->mnt_vnodecovered = vp;
848 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
849 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
850 mp->mnt_devbsdunit = 0;
851 mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
852
853 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
854 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
855
856 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
857 if (kernelmount) {
858 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
859 }
860 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
861 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
862 }
#endif /* CONFIG_NFS_CLIENT || DEVFS || ROUTEFS */
864
865 if (KERNEL_MOUNT_DEVFS & internal_flags) {
866 // kernel mounted devfs
867 mp->mnt_kern_flag |= MNTK_SYSTEM;
868 }
869
870 update:
871
872 /*
873 * Set the mount level flags.
874 */
875 if (flags & MNT_RDONLY) {
876 mp->mnt_flag |= MNT_RDONLY;
877 } else if (mp->mnt_flag & MNT_RDONLY) {
878 // disallow read/write upgrades of file systems that
879 // had the TYPENAME_OVERRIDE feature set.
880 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
881 error = EPERM;
882 goto out1;
883 }
884 mp->mnt_kern_flag |= MNTK_WANTRDWR;
885 }
886 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
887 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
888 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
889 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
890 MNT_QUARANTINE | MNT_CPROTECT);
891
892 #if SECURE_KERNEL
893 #if !CONFIG_MNT_SUID
894 /*
895 * On release builds of iOS based platforms, always enforce NOSUID on
896 * all mounts. We do this here because we can catch update mounts as well as
897 * non-update mounts in this case.
898 */
899 mp->mnt_flag |= (MNT_NOSUID);
900 #endif
901 #endif
902
903 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
904 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
905 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
906 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
907 MNT_QUARANTINE | MNT_CPROTECT);
908
909 #if CONFIG_MACF
910 if (flags & MNT_MULTILABEL) {
911 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
912 error = EINVAL;
913 goto out1;
914 }
915 mp->mnt_flag |= MNT_MULTILABEL;
916 }
917 #endif
918 /*
919 * Process device path for local file systems if requested.
920 *
921 * Snapshot and mount-by-role mounts do not use this path; they are
922 * passing other opaque data in the device path field.
923 *
924 * Basesystemroot mounts pass a device path to be resolved here,
925 * but it's just a char * already inside the kernel, which
926 * kernel_mount() shoved into a user_addr_t to call us. So for such
927 * mounts we must skip copyin (both of the address and of the string
928 * (in NDINIT).
929 */
930 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
931 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
932 boolean_t do_copyin_devpath = true;
933 #if CONFIG_BASESYSTEMROOT
934 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
936 // We have been passed fsmountargs, which is typed as a user_addr_t,
937 // but is actually a char ** pointing to a (kernelspace) string.
938 // We manually unpack it with a series of casts and dereferences
939 // that reverses what was done just above us on the stack in
940 // imageboot_pivot_image().
941 // After retrieving the path to the dev node (which we will NDINIT
942 // in a moment), we pass NULL fsmountargs on to the filesystem.
943 _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
944 char **devnamepp = (char **)fsmountargs;
945 char *devnamep = *devnamepp;
946 devpath = CAST_USER_ADDR_T(devnamep);
947 do_copyin_devpath = false;
948 fsmountargs = USER_ADDR_NULL;
949
950 //Now that we have a mp, denote that this mount is for the basesystem.
951 mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
952 }
953 #endif // CONFIG_BASESYSTEMROOT
954
955 if (do_copyin_devpath) {
956 if (vfs_context_is64bit(ctx)) {
957 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
958 goto out1;
959 }
960 fsmountargs += sizeof(devpath);
961 } else {
962 user32_addr_t tmp;
963 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
964 goto out1;
965 }
966 /* munge into LP64 addr */
967 devpath = CAST_USER_ADDR_T(tmp);
968 fsmountargs += sizeof(tmp);
969 }
970 }
971
972 /* Lookup device and authorize access to it */
973 if ((devpath)) {
974 struct nameidata nd;
975
976 enum uio_seg seg = UIO_USERSPACE;
977 #if CONFIG_BASESYSTEMROOT
978 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
979 seg = UIO_SYSSPACE;
980 }
981 #endif // CONFIG_BASESYSTEMROOT
982
983 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
984 if ((error = namei(&nd))) {
985 goto out1;
986 }
987
988 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
989 devvp = nd.ni_vp;
990
991 nameidone(&nd);
992
993 if (devvp->v_type != VBLK) {
994 error = ENOTBLK;
995 goto out2;
996 }
997 if (major(devvp->v_rdev) >= nblkdev) {
998 error = ENXIO;
999 goto out2;
1000 }
1001 /*
1002 * If mount by non-root, then verify that user has necessary
1003 * permissions on the device.
1004 */
1005 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1006 mode_t accessmode = KAUTH_VNODE_READ_DATA;
1007
1008 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1009 accessmode |= KAUTH_VNODE_WRITE_DATA;
1010 }
1011 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1012 goto out2;
1013 }
1014 }
1015 }
1016 /* On first mount, preflight and open device */
1017 if (devpath && ((flags & MNT_UPDATE) == 0)) {
1018 if ((error = vnode_ref(devvp))) {
1019 goto out2;
1020 }
1021 /*
1022 * Disallow multiple mounts of the same device.
1023 * Disallow mounting of a device that is currently in use
1024 * (except for root, which might share swap device for miniroot).
1025 * Flush out any old buffers remaining from a previous use.
1026 */
1027 if ((error = vfs_mountedon(devvp))) {
1028 goto out3;
1029 }
1030
1031 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1032 error = EBUSY;
1033 goto out3;
1034 }
1035 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1036 error = ENOTBLK;
1037 goto out3;
1038 }
1039 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1040 goto out3;
1041 }
1042
1043 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1044 #if CONFIG_MACF
1045 error = mac_vnode_check_open(ctx,
1046 devvp,
1047 ronly ? FREAD : FREAD | FWRITE);
1048 if (error) {
1049 goto out3;
1050 }
1051 #endif /* MAC */
1052 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1053 goto out3;
1054 }
1055
1056 mp->mnt_devvp = devvp;
1057 device_vnode = devvp;
1058 } else if ((mp->mnt_flag & MNT_RDONLY) &&
1059 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1060 (device_vnode = mp->mnt_devvp)) {
1061 dev_t dev;
1062 int maj;
1063 /*
1064 * If upgrade to read-write by non-root, then verify
1065 * that user has necessary permissions on the device.
1066 */
1067 vnode_getalways(device_vnode);
1068
1069 if (suser(vfs_context_ucred(ctx), NULL) &&
1070 (error = vnode_authorize(device_vnode, NULL,
1071 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1072 ctx)) != 0) {
1073 vnode_put(device_vnode);
1074 goto out2;
1075 }
1076
1077 /* Tell the device that we're upgrading */
1078 dev = (dev_t)device_vnode->v_rdev;
1079 maj = major(dev);
1080
1081 if ((u_int)maj >= (u_int)nblkdev) {
1082 panic("Volume mounted on a device with invalid major number.");
1083 }
1084
1085 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1086 vnode_put(device_vnode);
1087 device_vnode = NULLVP;
1088 if (error != 0) {
1089 goto out2;
1090 }
1091 }
1092 } // localargs && !(snapshot | data | vm)
1093
1094 #if CONFIG_MACF
1095 if ((flags & MNT_UPDATE) == 0) {
1096 mac_mount_label_init(mp);
1097 mac_mount_label_associate(ctx, mp);
1098 }
1099 if (labelstr) {
1100 if ((flags & MNT_UPDATE) != 0) {
1101 error = mac_mount_check_label_update(ctx, mp);
1102 if (error != 0) {
1103 goto out3;
1104 }
1105 }
1106 }
1107 #endif
1108 /*
1109 * Mount the filesystem. We already asserted that internal_flags
1110 * cannot have more than one mount-by-role bit set.
1111 */
1112 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1113 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1114 (caddr_t)fsmountargs, 0, ctx);
1115 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1116 #if CONFIG_ROSV_STARTUP
1117 struct mount *origin_mp = (struct mount*)fsmountargs;
1118 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1119 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1120 if (error) {
1121 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1122 } else {
1123 /* Mark volume associated with system volume */
1124 mp->mnt_kern_flag |= MNTK_SYSTEM;
1125
1126 /* Attempt to acquire the mnt_devvp and set it up */
1127 struct vnode *mp_devvp = NULL;
1128 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1129 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1130 0, &mp_devvp, vfs_context_kernel());
1131 if (!lerr) {
1132 mp->mnt_devvp = mp_devvp;
1133 //vnode_lookup took an iocount, need to drop it.
1134 vnode_put(mp_devvp);
1135 // now set `device_vnode` to the devvp that was acquired.
1136 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1137 // note that though the iocount above was dropped, the mount acquires
1138 // an implicit reference against the device.
1139 device_vnode = mp_devvp;
1140 }
1141 }
1142 }
1143 #else
1144 error = EINVAL;
1145 #endif
1146 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1147 #if CONFIG_MOUNT_VM
1148 struct mount *origin_mp = (struct mount*)fsmountargs;
1149 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1150 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1151 if (error) {
1152 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1153 } else {
1154 /* Mark volume associated with system volume and a swap mount */
1155 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1156 /* Attempt to acquire the mnt_devvp and set it up */
1157 struct vnode *mp_devvp = NULL;
1158 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1159 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1160 0, &mp_devvp, vfs_context_kernel());
1161 if (!lerr) {
1162 mp->mnt_devvp = mp_devvp;
1163 //vnode_lookup took an iocount, need to drop it.
1164 vnode_put(mp_devvp);
1165
1166 // now set `device_vnode` to the devvp that was acquired.
1167 // note that though the iocount above was dropped, the mount acquires
1168 // an implicit reference against the device.
1169 device_vnode = mp_devvp;
1170 }
1171 }
1172 }
1173 #else
1174 error = EINVAL;
1175 #endif
1176 } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1177 #if CONFIG_MOUNT_PREBOOTRECOVERY
1178 struct mount *origin_mp = (struct mount*)fsmountargs;
1179 uint32_t mount_role = 0;
1180 if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1181 mount_role = VFS_PREBOOT_ROLE;
1182 } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1183 mount_role = VFS_RECOVERY_ROLE;
1184 }
1185
1186 if (mount_role != 0) {
1187 fs_role_mount_args_t frma = {origin_mp, mount_role};
1188 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1189 if (error) {
1190 printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1191 } else {
1192 // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1193 /* Mark volume associated with system volume */
1194 //mp->mnt_kern_flag |= MNTK_SYSTEM;
1195 /* Attempt to acquire the mnt_devvp and set it up */
1196 struct vnode *mp_devvp = NULL;
1197 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1198 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1199 0, &mp_devvp, vfs_context_kernel());
1200 if (!lerr) {
1201 mp->mnt_devvp = mp_devvp;
1202 //vnode_lookup took an iocount, need to drop it.
1203 vnode_put(mp_devvp);
1204
1205 // now set `device_vnode` to the devvp that was acquired.
1206 // note that though the iocount above was dropped, the mount acquires
1207 // an implicit reference against the device.
1208 device_vnode = mp_devvp;
1209 }
1210 }
1211 }
1212 } else {
1213 printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1214 error = EINVAL;
1215 }
1216 #else
1217 error = EINVAL;
1218 #endif
1219 } else {
1220 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1221 }
1222
1223 if (flags & MNT_UPDATE) {
1224 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1225 mp->mnt_flag &= ~MNT_RDONLY;
1226 }
1227 mp->mnt_flag &= ~
1228 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1229 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1230 if (error) {
1231 mp->mnt_flag = flag; /* restore flag value */
1232 }
1233 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1234 lck_rw_done(&mp->mnt_rwlock);
1235 is_rwlock_locked = FALSE;
1236 if (!error) {
1237 enablequotas(mp, ctx);
1238 }
1239 goto exit;
1240 }
1241
1242 /*
1243 * Put the new filesystem on the mount list after root.
1244 */
1245 if (error == 0) {
1246 struct vfs_attr vfsattr;
1247 #if CONFIG_MACF
1248 error = mac_mount_check_mount_late(ctx, mp);
1249 if (error != 0) {
1250 goto out4;
1251 }
1252
1253 if (vfs_flags(mp) & MNT_MULTILABEL) {
1254 error = VFS_ROOT(mp, &rvp, ctx);
1255 if (error) {
1256 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1257 goto out4;
1258 }
1259 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1260 /*
1261 * drop reference provided by VFS_ROOT
1262 */
1263 vnode_put(rvp);
1264
1265 if (error) {
1266 goto out4;
1267 }
1268 }
1269 #endif /* MAC */
1270
1271 vnode_lock_spin(vp);
1272 CLR(vp->v_flag, VMOUNT);
1273 vp->v_mountedhere = mp;
1274 vnode_unlock(vp);
1275
1276 /*
1277 * taking the name_cache_lock exclusively will
1278 * insure that everyone is out of the fast path who
1279 * might be trying to use a now stale copy of
1280 * vp->v_mountedhere->mnt_realrootvp
1281 * bumping mount_generation causes the cached values
1282 * to be invalidated
1283 */
1284 name_cache_lock();
1285 mount_generation++;
1286 name_cache_unlock();
1287
1288 error = vnode_ref(vp);
1289 if (error != 0) {
1290 goto out4;
1291 }
1292
1293 have_usecount = TRUE;
1294
1295 error = checkdirs(vp, ctx);
1296 if (error != 0) {
1297 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1298 goto out4;
1299 }
1300 /*
1301 * there is no cleanup code here so I have made it void
1302 * we need to revisit this
1303 */
1304 (void)VFS_START(mp, 0, ctx);
1305
1306 if (mount_list_add(mp) != 0) {
1307 /*
1308 * The system is shutting down trying to umount
1309 * everything, so fail with a plausible errno.
1310 */
1311 error = EBUSY;
1312 goto out4;
1313 }
1314 lck_rw_done(&mp->mnt_rwlock);
1315 is_rwlock_locked = FALSE;
1316
1317 /* Check if this mounted file system supports EAs or named streams. */
1318 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1319 VFSATTR_INIT(&vfsattr);
1320 VFSATTR_WANTED(&vfsattr, f_capabilities);
1321 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1322 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1323 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1324 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1325 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1326 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1327 }
1328 #if NAMEDSTREAMS
1329 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1330 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1331 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1332 }
1333 #endif
1334 /* Check if this file system supports path from id lookups. */
1335 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1336 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1337 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1338 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1339 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1340 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1341 }
1342
1343 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1344 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1345 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1346 }
1347 }
1348 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1349 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1350 }
1351 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1352 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1353 }
1354 /* increment the operations count */
1355 OSAddAtomic(1, &vfs_nummntops);
1356 enablequotas(mp, ctx);
1357
1358 if (device_vnode) {
1359 device_vnode->v_specflags |= SI_MOUNTEDON;
1360
1361 /*
1362 * cache the IO attributes for the underlying physical media...
1363 * an error return indicates the underlying driver doesn't
1364 * support all the queries necessary... however, reasonable
1365 * defaults will have been set, so no reason to bail or care
1366 */
1367 vfs_init_io_attributes(device_vnode, mp);
1368 }
1369
1370 /* Now that mount is setup, notify the listeners */
1371 vfs_notify_mount(pvp);
1372 IOBSDMountChange(mp, kIOMountChangeMount);
1373 } else {
1374 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1375 if (mp->mnt_vnodelist.tqh_first != NULL) {
1376 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1377 mp->mnt_vtable->vfc_name, error);
1378 }
1379
1380 vnode_lock_spin(vp);
1381 CLR(vp->v_flag, VMOUNT);
1382 vnode_unlock(vp);
1383 mount_list_lock();
1384 mp->mnt_vtable->vfc_refcount--;
1385 mount_list_unlock();
1386
1387 if (device_vnode) {
1388 vnode_rele(device_vnode);
1389 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1390 }
1391 lck_rw_done(&mp->mnt_rwlock);
1392 is_rwlock_locked = FALSE;
1393
1394 /*
1395 * if we get here, we have a mount structure that needs to be freed,
1396 * but since the coveredvp hasn't yet been updated to point at it,
1397 * no need to worry about other threads holding a crossref on this mp
1398 * so it's ok to just free it
1399 */
1400 mount_lock_destroy(mp);
1401 #if CONFIG_MACF
1402 mac_mount_label_destroy(mp);
1403 #endif
1404 zfree(mount_zone, mp);
1405 did_set_lmount = false;
1406 }
1407 exit:
1408 /*
1409 * drop I/O count on the device vp if there was one
1410 */
1411 if (devpath && devvp) {
1412 vnode_put(devvp);
1413 }
1414
1415 if (did_set_lmount) {
1416 mount_lock_spin(mp);
1417 mp->mnt_lflag &= ~MNT_LMOUNT;
1418 mount_unlock(mp);
1419 }
1420
1421 return error;
1422
1423 /* Error condition exits */
1424 out4:
1425 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1426
1427 /*
1428 * If the mount has been placed on the covered vp,
1429 * it may have been discovered by now, so we have
1430 * to treat this just like an unmount
1431 */
1432 mount_lock_spin(mp);
1433 mp->mnt_lflag |= MNT_LDEAD;
1434 mount_unlock(mp);
1435
1436 if (device_vnode != NULLVP) {
1437 vnode_rele(device_vnode);
1438 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1439 ctx);
1440 did_rele = TRUE;
1441 }
1442
1443 vnode_lock_spin(vp);
1444
1445 mp->mnt_crossref++;
1446 vp->v_mountedhere = (mount_t) 0;
1447
1448 vnode_unlock(vp);
1449
1450 if (have_usecount) {
1451 vnode_rele(vp);
1452 }
1453 out3:
1454 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1455 vnode_rele(devvp);
1456 }
1457 out2:
1458 if (devpath && devvp) {
1459 vnode_put(devvp);
1460 }
1461 out1:
1462 /* Release mnt_rwlock only when it was taken */
1463 if (is_rwlock_locked == TRUE) {
1464 if (flag_set) {
1465 mp->mnt_flag = flag; /* restore mnt_flag value */
1466 }
1467 lck_rw_done(&mp->mnt_rwlock);
1468 }
1469
1470 if (did_set_lmount) {
1471 mount_lock_spin(mp);
1472 mp->mnt_lflag &= ~MNT_LMOUNT;
1473 mount_unlock(mp);
1474 }
1475
1476 if (mntalloc) {
1477 if (mp->mnt_crossref) {
1478 mount_dropcrossref(mp, vp, 0);
1479 } else {
1480 mount_lock_destroy(mp);
1481 #if CONFIG_MACF
1482 mac_mount_label_destroy(mp);
1483 #endif
1484 zfree(mount_zone, mp);
1485 }
1486 }
1487 if (vfsp_ref) {
1488 mount_list_lock();
1489 vfsp->vfc_refcount--;
1490 mount_list_unlock();
1491 }
1492
1493 return error;
1494 }
1495
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT on the covered vnode to reserve it for this mount.
 *
 * Parameters:
 *	vp		candidate covered vnode (must be a directory)
 *	ctx		caller's VFS context (supplies credentials)
 *	cnp		pathname component, passed to the MAC mount check
 *	fsname		filesystem name, passed to the MAC mount check
 *	skip_auth	when TRUE, skip the directory-ownership check
 *			(kernel-initiated mounts)
 *
 * Returns: 0 with VMOUNT set on vp; EPERM, ENOTDIR, EBUSY, or an errno
 * from fsync / buffer invalidation / MAC on failure (VMOUNT not set).
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push dirty data for the soon-to-be-covered vnode to disk. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* Toss stale buffers so nothing refers to vp while it is covered. */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories can serve as mount points. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/*
	 * A competing mount has already covered this vnode: both the
	 * in-progress marker (VMOUNT) and the covering mount
	 * (v_mountedhere) are set.
	 */
	if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		goto out;
	}
#endif

	/* Reserve the vnode for this mount attempt. */
	vnode_lock_spin(vp);
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

out:
	return error;
}
1557
1558 #if CONFIG_IMGSRC_ACCESS
1559
1560 #define DEBUG_IMGSRC 0
1561
1562 #if DEBUG_IMGSRC
1563 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1564 #else
1565 #define IMGSRC_DEBUG(args...) do { } while(0)
1566 #endif
1567
1568 static int
1569 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1570 {
1571 struct nameidata nd;
1572 vnode_t vp, realdevvp;
1573 mode_t accessmode;
1574 int error;
1575 enum uio_seg uio = UIO_USERSPACE;
1576
1577 if (ctx == vfs_context_kernel()) {
1578 uio = UIO_SYSSPACE;
1579 }
1580
1581 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1582 if ((error = namei(&nd))) {
1583 IMGSRC_DEBUG("namei() failed with %d\n", error);
1584 return error;
1585 }
1586
1587 vp = nd.ni_vp;
1588
1589 if (!vnode_isblk(vp)) {
1590 IMGSRC_DEBUG("Not block device.\n");
1591 error = ENOTBLK;
1592 goto out;
1593 }
1594
1595 realdevvp = mp->mnt_devvp;
1596 if (realdevvp == NULLVP) {
1597 IMGSRC_DEBUG("No device backs the mount.\n");
1598 error = ENXIO;
1599 goto out;
1600 }
1601
1602 error = vnode_getwithref(realdevvp);
1603 if (error != 0) {
1604 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1605 goto out;
1606 }
1607
1608 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1609 IMGSRC_DEBUG("Wrong dev_t.\n");
1610 error = ENXIO;
1611 goto out1;
1612 }
1613
1614 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1615
1616 /*
1617 * If mount by non-root, then verify that user has necessary
1618 * permissions on the device.
1619 */
1620 if (!vfs_context_issuser(ctx)) {
1621 accessmode = KAUTH_VNODE_READ_DATA;
1622 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1623 accessmode |= KAUTH_VNODE_WRITE_DATA;
1624 }
1625 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1626 IMGSRC_DEBUG("Access denied.\n");
1627 goto out1;
1628 }
1629 }
1630
1631 *devvpp = vp;
1632
1633 out1:
1634 vnode_put(realdevvp);
1635
1636 out:
1637 nameidone(&nd);
1638
1639 if (error) {
1640 vnode_put(vp);
1641 }
1642
1643 return error;
1644 }
1645
/*
 * Clear VMOUNT, set v_mountedhere and mnt_vnodecovered, ref the vnode,
 * and call checkdirs().
 *
 * On success vp holds a usecount on behalf of the mount and covers mp.
 * On failure mnt_vnodecovered is reset to NULLVP; the checkdirs()
 * failure path also drops the usecount taken here.
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Publish the covering mount and drop the in-progress marker. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Persistent usecount on the covered vnode for the mount's lifetime. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1695
/*
 * Undo place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode, clear its v_mountedhere, and detach mp from it.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
1706
/*
 * Prepare a mount for update: fail if an unmount or another mount is in
 * progress, take the mount rwlock exclusively, then authorize the
 * update (MNT_RELOAD only on read-only mounts; only root or the user
 * who originally mounted; MAC remount check).
 *
 * On success the rwlock is held exclusively and the caller must release
 * it via mount_end_update().  On failure the rwlock is dropped here.
 */
static int
mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
{
	int error;

	/* unmount in progress return error */
	mount_lock_spin(mp);
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		mount_unlock(mp);
		return EBUSY;
	}
	mount_unlock(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/*
	 * We only allow the filesystem to be reloaded if it
	 * is currently mounted read-only.
	 */
	if ((flags & MNT_RELOAD) &&
	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
		error = ENOTSUP;
		goto out;
	}

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to update it.
	 */
	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
	    (!vfs_context_issuser(ctx))) {
		error = EPERM;
		goto out;
	}
#if CONFIG_MACF
	error = mac_mount_check_remount(ctx, mp);
	if (error != 0) {
		goto out;
	}
#endif

out:
	if (error) {
		lck_rw_done(&mp->mnt_rwlock);
	}

	return error;
}
1754
/*
 * Release the exclusive mount rwlock taken by mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1760
1761 static int
1762 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1763 {
1764 vnode_t vp;
1765
1766 if (height >= MAX_IMAGEBOOT_NESTING) {
1767 return EINVAL;
1768 }
1769
1770 vp = imgsrc_rootvnodes[height];
1771 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1772 *rvpp = vp;
1773 return 0;
1774 } else {
1775 return ENOENT;
1776 }
1777 }
1778
1779 static int
1780 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
1781 struct componentname *cnp, const char *fsname, vfs_context_t ctx,
1782 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1783 {
1784 int error;
1785 mount_t mp;
1786 boolean_t placed = FALSE;
1787 struct vfstable *vfsp;
1788 user_addr_t devpath;
1789 char *old_mntonname;
1790 vnode_t rvp;
1791 vnode_t devvp;
1792 uint32_t height;
1793 uint32_t flags;
1794
1795 /* If we didn't imageboot, nothing to move */
1796 if (imgsrc_rootvnodes[0] == NULLVP) {
1797 return EINVAL;
1798 }
1799
1800 /* Only root can do this */
1801 if (!vfs_context_issuser(ctx)) {
1802 return EPERM;
1803 }
1804
1805 IMGSRC_DEBUG("looking for root vnode.\n");
1806
1807 /*
1808 * Get root vnode of filesystem we're moving.
1809 */
1810 if (by_index) {
1811 if (is64bit) {
1812 struct user64_mnt_imgsrc_args mia64;
1813 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1814 if (error != 0) {
1815 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1816 return error;
1817 }
1818
1819 height = mia64.mi_height;
1820 flags = mia64.mi_flags;
1821 devpath = (user_addr_t)mia64.mi_devpath;
1822 } else {
1823 struct user32_mnt_imgsrc_args mia32;
1824 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1825 if (error != 0) {
1826 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1827 return error;
1828 }
1829
1830 height = mia32.mi_height;
1831 flags = mia32.mi_flags;
1832 devpath = mia32.mi_devpath;
1833 }
1834 } else {
1835 /*
1836 * For binary compatibility--assumes one level of nesting.
1837 */
1838 if (is64bit) {
1839 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1840 return error;
1841 }
1842 } else {
1843 user32_addr_t tmp;
1844 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1845 return error;
1846 }
1847
1848 /* munge into LP64 addr */
1849 devpath = CAST_USER_ADDR_T(tmp);
1850 }
1851
1852 height = 0;
1853 flags = 0;
1854 }
1855
1856 if (flags != 0) {
1857 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1858 return EINVAL;
1859 }
1860
1861 error = get_imgsrc_rootvnode(height, &rvp);
1862 if (error != 0) {
1863 IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
1864 return error;
1865 }
1866
1867 IMGSRC_DEBUG("got old root vnode\n");
1868
1869 old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
1870
1871 /* Can only move once */
1872 mp = vnode_mount(rvp);
1873 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1874 IMGSRC_DEBUG("Already moved.\n");
1875 error = EBUSY;
1876 goto out0;
1877 }
1878
1879 IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
1880 IMGSRC_DEBUG("Starting updated.\n");
1881
1882 /* Get exclusive rwlock on mount, authorize update on mp */
1883 error = mount_begin_update(mp, ctx, 0);
1884 if (error != 0) {
1885 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1886 goto out0;
1887 }
1888
1889 /*
1890 * It can only be moved once. Flag is set under the rwlock,
1891 * so we're now safe to proceed.
1892 */
1893 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1894 IMGSRC_DEBUG("Already moved [2]\n");
1895 goto out1;
1896 }
1897
1898 IMGSRC_DEBUG("Preparing coveredvp.\n");
1899
1900 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1901 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1902 if (error != 0) {
1903 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1904 goto out1;
1905 }
1906
1907 IMGSRC_DEBUG("Covered vp OK.\n");
1908
1909 /* Sanity check the name caller has provided */
1910 vfsp = mp->mnt_vtable;
1911 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1912 IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
1913 vfsp->vfc_name, fsname);
1914 error = EINVAL;
1915 goto out2;
1916 }
1917
1918 /* Check the device vnode and update mount-from name, for local filesystems */
1919 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1920 IMGSRC_DEBUG("Local, doing device validation.\n");
1921
1922 if (devpath != USER_ADDR_NULL) {
1923 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1924 if (error) {
1925 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1926 goto out2;
1927 }
1928
1929 vnode_put(devvp);
1930 }
1931 }
1932
1933 /*
1934 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1935 * and increment the name cache's mount generation
1936 */
1937
1938 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1939 error = place_mount_and_checkdirs(mp, vp, ctx);
1940 if (error != 0) {
1941 goto out2;
1942 }
1943
1944 placed = TRUE;
1945
1946 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1947 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1948
1949 /* Forbid future moves */
1950 mount_lock(mp);
1951 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1952 mount_unlock(mp);
1953
1954 /* Finally, add to mount list, completely ready to go */
1955 if (mount_list_add(mp) != 0) {
1956 /*
1957 * The system is shutting down trying to umount
1958 * everything, so fail with a plausible errno.
1959 */
1960 error = EBUSY;
1961 goto out3;
1962 }
1963
1964 mount_end_update(mp);
1965 vnode_put(rvp);
1966 zfree(ZV_NAMEI, old_mntonname);
1967
1968 vfs_notify_mount(pvp);
1969
1970 return 0;
1971 out3:
1972 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1973
1974 mount_lock(mp);
1975 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1976 mount_unlock(mp);
1977
1978 out2:
1979 /*
1980 * Placing the mp on the vnode clears VMOUNT,
1981 * so cleanup is different after that point
1982 */
1983 if (placed) {
1984 /* Rele the vp, clear VMOUNT and v_mountedhere */
1985 undo_place_on_covered_vp(mp, vp);
1986 } else {
1987 vnode_lock_spin(vp);
1988 CLR(vp->v_flag, VMOUNT);
1989 vnode_unlock(vp);
1990 }
1991 out1:
1992 mount_end_update(mp);
1993
1994 out0:
1995 vnode_put(rvp);
1996 zfree(ZV_NAMEI, old_mntonname);
1997 return error;
1998 }
1999
2000 #if CONFIG_LOCKERBOOT
/*
 * Kernel-internal entry point for locker boot: look up `mntpoint` and
 * relocate the imageboot source filesystem `fsname` onto it, using
 * `pbdevpath` as the backing protoboot device path.
 *
 * The imgsrc args live on the kernel stack and are passed through
 * CAST_USER_ADDR_T under the kernel vfs context, so the copyin() in
 * relocate_imageboot_source() reads kernel memory (uio seg is switched
 * to UIO_SYSSPACE downstream for the device path lookup).
 */
__private_extern__
int
mount_locker_protoboot(const char *fsname, const char *mntpoint,
    const char *pbdevpath)
{
	int error = -1;
	struct nameidata nd;
	boolean_t cleanup_nd = FALSE;
	vfs_context_t ctx = vfs_context_kernel();
	boolean_t is64 = TRUE;
	boolean_t by_index = TRUE;
	struct user64_mnt_imgsrc_args mia64 = {
		.mi_height = 0,
		.mi_flags = 0,
		.mi_devpath = CAST_USER_ADDR_T(pbdevpath),
	};
	user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64);

	/* WANTPARENT: relocate_imageboot_source needs both dvp and vp. */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx);
	error = namei(&nd);
	if (error) {
		IMGSRC_DEBUG("namei: %d\n", error);
		goto out;
	}

	cleanup_nd = TRUE;
	error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp,
	    &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index);

out:
	if (cleanup_nd) {
		/* Preserve the relocation result across the vnode_put() calls. */
		int stashed = error;

		error = vnode_put(nd.ni_vp);
		if (error) {
			panic("vnode_put() returned non-zero: %d", error);
		}

		if (nd.ni_dvp) {
			error = vnode_put(nd.ni_dvp);
			if (error) {
				panic("vnode_put() returned non-zero: %d", error);
			}
		}
		nameidone(&nd);

		error = stashed;
	}
	return error;
}
2052 #endif /* CONFIG_LOCKERBOOT */
2053 #endif /* CONFIG_IMGSRC_ACCESS */
2054
/*
 * Enable disk quotas on a freshly (re)mounted HFS filesystem.
 *
 * For each quota type, the presence of an option file
 * (QUOTAOPSNAME.<type>) in the mount's root directory triggers a
 * Q_QUOTAON quotactl with the corresponding quota data file
 * (QUOTAFILENAME.<type>).  All errors are deliberately ignored:
 * quota setup must not interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2088
2089
/*
 * Per-process callback for checkdirs(): if this process's current or
 * root directory is the just-covered vnode (olddp), retarget it to the
 * root of the newly mounted filesystem (newdp), moving usecounts
 * accordingly.  Always returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs * cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp;
	/* new_* start as "unused"; NULLed out when actually installed. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	/* old_* record which old vnodes were displaced and need a rele. */
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp == NULL ||
	    (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp)) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp != NULL) {
		if (fdp->fd_cdir == olddp) {
			old_cvp = olddp;
			fdp->fd_cdir = newdp;
			new_cvp = NULL;		/* this ref was consumed */
		}
		if (fdp->fd_rdir == olddp) {
			old_rvp = olddp;
			fdp->fd_rdir = newdp;
			new_rvp = NULL;		/* this ref was consumed */
		}
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2174
2175
2176
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point, and update
 * the global rootvnode if the covered vnode was the system root.
 *
 * Returns 0 on success (or when no process could be using olddp),
 * or the error from VFS_ROOT() if the new mount's root vnode cannot
 * be obtained.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Usecount of 1 means only our caller references olddp: no work. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root vnode of the file system just mounted over olddp. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If olddp was the system root, swing rootvnode over to newdp. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);	/* take the long-term ref before publishing */
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);	/* drop the ref held by the old rootvnode */
	}

	vnode_put(newdp);	/* drop the iocount from VFS_ROOT() */
	return 0;
}
2219
2220 /*
2221 * Unmount a file system.
2222 *
2223 * Note: unmount takes a path to the vnode mounted on as argument,
2224 * not special file (as before).
2225 */
2226 /* ARGSUSED */
2227 int
2228 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2229 {
2230 vnode_t vp;
2231 struct mount *mp;
2232 int error;
2233 struct nameidata nd;
2234 vfs_context_t ctx = vfs_context_current();
2235
2236 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2237 UIO_USERSPACE, uap->path, ctx);
2238 error = namei(&nd);
2239 if (error) {
2240 return error;
2241 }
2242 vp = nd.ni_vp;
2243 mp = vp->v_mount;
2244 nameidone(&nd);
2245
2246 #if CONFIG_MACF
2247 error = mac_mount_check_umount(ctx, mp);
2248 if (error != 0) {
2249 vnode_put(vp);
2250 return error;
2251 }
2252 #endif
2253 /*
2254 * Must be the root of the filesystem
2255 */
2256 if ((vp->v_flag & VROOT) == 0) {
2257 vnode_put(vp);
2258 return EINVAL;
2259 }
2260 mount_ref(mp, 0);
2261 vnode_put(vp);
2262 /* safedounmount consumes the mount ref */
2263 return safedounmount(mp, uap->flags, ctx);
2264 }
2265
2266 int
2267 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2268 {
2269 mount_t mp;
2270
2271 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2272 if (mp == (mount_t)0) {
2273 return ENOENT;
2274 }
2275 mount_ref(mp, 0);
2276 mount_iterdrop(mp);
2277 /* safedounmount consumes the mount ref */
2278 return safedounmount(mp, flags, ctx);
2279 }
2280
/* Entitlement that lets non-root processes unmount volumes mounted by others. */
#define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
	"com.apple.private.vfs.role-account-unmount"

/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Performs policy checks (responsiveness, ownership/entitlement, root and
 * system volumes, imageboot backing store) before handing off to
 * dounmount().  On any early failure the mount ref is dropped here;
 * on success dounmount() takes over ownership of the ref.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOTaskHasEntitlement(current_task(), ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		/* Log only for associated system volumes, not the root itself. */
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* All checks passed: dounmount() consumes the mount ref. */
	return dounmount(mp, flags, 1, ctx);

out:
	mount_drop(mp, 0);
	return error;
}
2351
/*
 * Do the actual file system unmount.
 *
 * Parameters:
 *	mp	mount to unmount
 *	flags	MNT_FORCE / MNT_NOBLOCK / MNT_LNOSUB etc.
 *	withref	non-zero if the caller passed in a mount ref to consume
 *	ctx	calling context
 *
 * Sequence: mark the mount as unmounting (MNTK_UNMOUNT/MNT_LUNMOUNT),
 * optionally force-unmount submounts, flush vnodes, call VFS_UNMOUNT(),
 * then tear down the covered-vnode linkage and the mount structure.
 * On failure the in-progress flags are cleared and the error returned.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep the caller from hanging on unresponsive remote file systems. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			/* A non-forced unmount must successfully sync first. */
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	/* Flush all vnodes on this mount (forcibly when FORCECLOSE is set). */
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Let iterators back in and clear the in-progress state. */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Drop/reacquire the rwlock around removing mp from the mount list. */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp); /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Common exit: mount lock is held here on both success and failure. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* Notify watchers of the parent directory. */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mounts have no covered vnode: free mp directly. */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2639
/*
 * Unmount any mounts in this filesystem.
 *
 * Collects the fsids of every mount nested (directly or transitively)
 * under 'mp' into a temporary array, then unmounts them in reverse
 * (deepest-first) order.  Errors from the individual unmounts are
 * deliberately ignored: failed submounts are left dangling.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;	/* m indexes the last fsid recorded */
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT because we hold mount_list_lock; bail if it fails. */
	fsids = kheap_alloc(KHEAP_TEMP, fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* Swap the iteration hold for a mount ref, consumed by dounmount(). */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kheap_free(KHEAP_TEMP, fsids, fsids_sz);
}
2700
/*
 * Drop one cross reference on 'mp' taken against covered vnode 'dp'.
 *
 * When the last crossref goes away and 'dp' no longer points at this
 * mount, the mount structure itself is destroyed and freed.  If
 * 'need_put' is set, the caller's iocount on 'dp' is released here
 * (while the vnode lock is still held).
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		/* Unlock before tearing down the mount: mp is unreachable now. */
		vnode_unlock(dp);

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_unlock(dp);
}
2729
2730
2731 /*
2732 * Sync each mounted filesystem.
2733 */
#if DIAGNOSTIC
int syncprt = 0;        /* when set, dump buffer stats after a sync (see sync()) */
#endif

int print_vmpage_stat = 0;      /* when set, log dirty-page counts after a sync */
2739
2740 /*
2741 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2742 * mounted read-write with the passed waitfor value.
2743 *
2744 * Parameters: mp mount-point descriptor per mounted file-system instance.
2745 * arg user argument (please see below)
2746 *
2747 * User argument is a pointer to 32 bit unsigned integer which describes the
2748 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
2749 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2750 * waitfor value.
2751 *
2752 * Returns: VFS_RETURNED
2753 */
2754 static int
2755 sync_callback(mount_t mp, void *arg)
2756 {
2757 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2758 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2759 unsigned waitfor = MNT_NOWAIT;
2760
2761 if (arg) {
2762 waitfor = *(uint32_t*)arg;
2763 }
2764
2765 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
2766 if (waitfor != MNT_WAIT &&
2767 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2768 waitfor != MNT_NOWAIT &&
2769 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2770 waitfor != MNT_DWAIT &&
2771 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2772 panic("Passed inappropriate waitfor %u to "
2773 "sync_callback()", waitfor);
2774 }
2775
2776 mp->mnt_flag &= ~MNT_ASYNC;
2777 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2778 if (asyncflag) {
2779 mp->mnt_flag |= MNT_ASYNC;
2780 }
2781 }
2782
2783 return VFS_RETURNED;
2784 }
2785
2786 /* ARGSUSED */
2787 int
2788 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2789 {
2790 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2791
2792 if (print_vmpage_stat) {
2793 vm_countdirtypages();
2794 }
2795
2796 #if DIAGNOSTIC
2797 if (syncprt) {
2798 vfs_bufstats();
2799 }
2800 #endif /* DIAGNOSTIC */
2801 return 0;
2802 }
2803
/*
 * Media-selection argument for sync_internal_callback(): lets the sync
 * thread flush reliable (local, non-virtual-device) media separately
 * from unreliable media.
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
2809
2810 static int
2811 sync_internal_callback(mount_t mp, void *arg)
2812 {
2813 if (arg) {
2814 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2815 (mp->mnt_flag & MNT_LOCAL);
2816 sync_type_t sync_type = *((sync_type_t *)arg);
2817
2818 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2819 return VFS_RETURNED;
2820 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2821 return VFS_RETURNED;
2822 }
2823 }
2824
2825 (void)sync_callback(mp, NULL);
2826
2827 return VFS_RETURNED;
2828 }
2829
/* State shared between sync_internal() and sync_thread(); protected by sync_mtx_lck. */
int sync_thread_state = 0;
/* Upper bound (seconds) on how long sync_internal() waits for the sync thread. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN         0x0001  /* work is pending for sync_thread */
#define SYNC_THREAD_RUNNING     0x0002  /* a sync_thread instance is active */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identifies the running sync thread */
#endif /* CONFIG_PHYS_WRITE_ACCT */
2839
/*
 * Kernel thread started by sync_internal(): repeatedly syncs all mounts
 * (reliable media first, then unreliable) as long as SYNC_THREAD_RUN
 * keeps being re-posted, then wakes any waiters and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		/* Drop the lock while doing the (possibly slow) sync passes. */
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2883
/* Last time a sync timeout was logged; rate-limits the message in sync_internal(). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2885
/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 *
 * Posts SYNC_THREAD_RUN and, if no sync thread is active, starts one;
 * then waits (bounded by sync_timeout_seconds) for it to finish the
 * pass.  Always returns 0 — a timeout is only logged, not reported.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		/* Claim RUNNING before dropping the lock inside thread start. */
		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP releases sync_mtx_lck when msleep returns. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Log at most once every two minutes. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
2932
2933 /*
2934 * Change filesystem quotas.
2935 */
2936 #if QUOTA
/*
 * quotactl: manipulate file system quotas.
 *
 * Resolves uap->path to a mount, copies in any subcommand-specific input
 * (quota file path for Q_QUOTAON, dqblk for Q_SETQUOTA/Q_SETUSE), calls
 * VFS_QUOTACTL(), then copies out results for Q_GETQUOTA/Q_QUOTASTAT.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Hold the mount with a ref; the vnode itself is no longer needed. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit user dqblk must be munged to the kernel layout. */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy out results / free temporary buffers per subcommand. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

	mount_drop(mp, 0);
	return error;
}
3035 #else
/* Quota support compiled out: the syscall always fails with EOPNOTSUPP. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
3041 #endif /* QUOTA */
3042
3043 /*
3044 * Get filesystem statistics.
3045 *
3046 * Returns: 0 Success
3047 * namei:???
3048 * vfs_update_vfsstat:???
3049 * munge_statfs:EFAULT
3050 */
3051 /* ARGSUSED */
3052 int
3053 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3054 {
3055 struct mount *mp;
3056 struct vfsstatfs *sp;
3057 int error;
3058 struct nameidata nd;
3059 vfs_context_t ctx = vfs_context_current();
3060 vnode_t vp;
3061
3062 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3063 UIO_USERSPACE, uap->path, ctx);
3064 error = namei(&nd);
3065 if (error != 0) {
3066 return error;
3067 }
3068 vp = nd.ni_vp;
3069 mp = vp->v_mount;
3070 sp = &mp->mnt_vfsstat;
3071 nameidone(&nd);
3072
3073 #if CONFIG_MACF
3074 error = mac_mount_check_stat(ctx, mp);
3075 if (error != 0) {
3076 vnode_put(vp);
3077 return error;
3078 }
3079 #endif
3080
3081 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3082 if (error != 0) {
3083 vnode_put(vp);
3084 return error;
3085 }
3086
3087 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
3088 vnode_put(vp);
3089 return error;
3090 }
3091
3092 /*
3093 * Get filesystem statistics.
3094 */
3095 /* ARGSUSED */
/*
 * fstatfs: get filesystem statistics for the file system containing the
 * vnode referenced by file descriptor uap->fd.  Copies a (32/64-bit
 * munged) struct statfs to uap->buf.
 */
int
fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	/* Refresh cached statistics before copying them out. */
	sp = &mp->mnt_vfsstat;
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);

out:
	/* Release the fd hold and the vnode iocount taken above. */
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3144
3145 void
3146 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3147 {
3148 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3149
3150 bzero(sfs, sizeof(*sfs));
3151
3152 sfs->f_bsize = vsfs->f_bsize;
3153 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3154 sfs->f_blocks = vsfs->f_blocks;
3155 sfs->f_bfree = vsfs->f_bfree;
3156 sfs->f_bavail = vsfs->f_bavail;
3157 sfs->f_files = vsfs->f_files;
3158 sfs->f_ffree = vsfs->f_ffree;
3159 sfs->f_fsid = vsfs->f_fsid;
3160 sfs->f_owner = vsfs->f_owner;
3161 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3162 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3163 sfs->f_fssubtype = vsfs->f_fssubtype;
3164 sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3165 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3166 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3167 } else {
3168 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3169 }
3170 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3171 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3172 }
3173
3174 /*
3175 * Get file system statistics in 64-bit mode
3176 */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	/*
	 * nameidata and statfs64 are both large; overlay them in a single
	 * temporary allocation.  Safe because the nameidata is finished
	 * (nameidone) before the statfs64 part is written.
	 */
	union {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	__nameidata_statfs64 = kheap_alloc(KHEAP_TEMP, sizeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* Refresh cached statistics before copying them out. */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a seperate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kheap_free(KHEAP_TEMP, __nameidata_statfs64, sizeof(*__nameidata_statfs64));

	return error;
}
3234
3235 /*
3236 * Get file system statistics in 64-bit mode
3237 */
/*
 * fstatfs64: 64-bit statfs for the file system containing the vnode
 * referenced by file descriptor uap->fd.
 */
int
fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
{
	struct vnode *vp;
	struct mount *mp;
	struct statfs64 sfs;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	/* Refresh cached statistics before copying them out. */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	vfs_get_statfs64(mp, &sfs);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a seperate data volume mountpoint */
		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(&sfs, uap->buf, sizeof(sfs));

out:
	/* Release the fd hold and the vnode iocount taken above. */
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3291
/* Accumulator passed to getfsstat_callback() during vfs_iterate(). */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer cursor for statfs records */
	user_addr_t *mp;        /* optional per-mount MAC label destinations */
	int count;              /* number of mounts seen so far */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int error;              /* first error encountered, if any */
};
3300
3301
/*
 * Per-mount callback for __mac_getfsstat(): copy one mount's statistics
 * (and optionally its MAC label) out to the user buffer tracked in
 * 'arg', advancing the cursor.  Returns VFS_RETURNED_DONE to stop the
 * iteration on a hard error, VFS_RETURNED otherwise.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy out while there is buffer space; count is updated regardless. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			/* Skip dead/failing mounts but keep iterating. */
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3355
3356 /*
3357 * Get statistics on all filesystems.
3358 */
3359 int
3360 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3361 {
3362 struct __mac_getfsstat_args muap;
3363
3364 muap.buf = uap->buf;
3365 muap.bufsize = uap->bufsize;
3366 muap.mac = USER_ADDR_NULL;
3367 muap.macsize = 0;
3368 muap.flags = uap->flags;
3369
3370 return __mac_getfsstat(p, &muap, retval);
3371 }
3372
/*
 * __mac_getfsstat: Get MAC-related file system statistics
 *
 * Parameters:    p                        (ignored)
 *                uap                      User argument descriptor (see below)
 *                retval                   Count of file system statistics (N stats)
 *
 * Indirect:      uap->bufsize             Buffer size
 *                uap->macsize             MAC info size
 *                uap->buf                 Buffer where information will be returned
 *                uap->mac                 MAC info
 *                uap->flags               File system flags
 *
 *
 * Returns:       0                        Success
 *                !0                       Not success
 *
 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject user-supplied sizes too large to reason about safely. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Buffer capacity in entries depends on the caller's statfs layout. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC-label pointer array must have one slot per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kheap_alloc(KHEAP_TEMP, macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kheap_free(KHEAP_TEMP, mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kheap_alloc(KHEAP_TEMP, count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kheap_free(KHEAP_TEMP, mp0, macsize);
			return ENOMEM;
		}

		/* Widen 32-bit user pointers; 64-bit ones are already user_addr_t sized. */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kheap_free(KHEAP_TEMP, mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Walk every mount, including ones in the middle of unmounting. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kheap_free(KHEAP_TEMP, mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* If the buffer filled up, report capacity; otherwise the true count. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3484
3485 static int
3486 getfsstat64_callback(mount_t mp, void * arg)
3487 {
3488 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3489 struct vfsstatfs *sp;
3490 struct statfs64 sfs;
3491 int error;
3492
3493 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3494 #if CONFIG_MACF
3495 error = mac_mount_check_stat(vfs_context_current(), mp);
3496 if (error != 0) {
3497 fstp->error = error;
3498 return VFS_RETURNED_DONE;
3499 }
3500 #endif
3501 sp = &mp->mnt_vfsstat;
3502 /*
3503 * If MNT_NOWAIT is specified, do not refresh the fsstat
3504 * cache. MNT_WAIT overrides MNT_NOWAIT.
3505 *
3506 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3507 * getfsstat, since the constants are out of the same
3508 * namespace.
3509 */
3510 if ((mp->mnt_lflag & MNT_LDEAD) ||
3511 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3512 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3513 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3514 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3515 return VFS_RETURNED;
3516 }
3517
3518 vfs_get_statfs64(mp, &sfs);
3519 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3520 if (error) {
3521 fstp->error = error;
3522 return VFS_RETURNED_DONE;
3523 }
3524 fstp->sfsp += sizeof(sfs);
3525 }
3526 fstp->count++;
3527 return VFS_RETURNED;
3528 }
3529
3530 /*
3531 * Get statistics on all file systems in 64 bit mode.
3532 */
3533 int
3534 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3535 {
3536 user_addr_t sfsp;
3537 int count, maxcount;
3538 struct getfsstat_struct fst;
3539
3540 maxcount = uap->bufsize / sizeof(struct statfs64);
3541
3542 sfsp = uap->buf;
3543 count = 0;
3544
3545 fst.sfsp = sfsp;
3546 fst.flags = uap->flags;
3547 fst.count = 0;
3548 fst.error = 0;
3549 fst.maxcount = maxcount;
3550
3551 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3552
3553 if (fst.error) {
3554 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3555 return fst.error;
3556 }
3557
3558 if (fst.sfsp && fst.count > fst.maxcount) {
3559 *retval = fst.maxcount;
3560 } else {
3561 *retval = fst.count;
3562 }
3563
3564 return 0;
3565 }
3566
3567 /*
3568 * gets the associated vnode with the file descriptor passed.
3569 * as input
3570 *
3571 * INPUT
3572 * ctx - vfs context of caller
3573 * fd - file descriptor for which vnode is required.
3574 * vpp - Pointer to pointer to vnode to be returned.
3575 *
3576 * The vnode is returned with an iocount so any vnode obtained
3577 * by this call needs a vnode_put
3578 *
3579 */
3580 int
3581 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3582 {
3583 int error;
3584 vnode_t vp;
3585 struct fileproc *fp;
3586 proc_t p = vfs_context_proc(ctx);
3587
3588 *vpp = NULLVP;
3589
3590 error = fp_getfvp(p, fd, &fp, &vp);
3591 if (error) {
3592 return error;
3593 }
3594
3595 error = vnode_getwithref(vp);
3596 if (error) {
3597 (void)fp_drop(p, fd, fp, 0);
3598 return error;
3599 }
3600
3601 (void)fp_drop(p, fd, fp, 0);
3602 *vpp = vp;
3603 return error;
3604 }
3605
/*
 * Wrapper function around namei to start lookup from a directory
 * specified by a file descriptor ni_dirfd.
 *
 * In addition to all the errors returned by namei, this call can
 * return ENOTDIR if the file descriptor does not refer to a directory.
 * and EBADF if the file descriptor is not valid.
 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only anchor the lookup at dirfd when a real fd was given, this is
	 * not a continued lookup, and the caller hasn't already supplied a
	 * starting directory vnode.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		/* Absolute paths ignore dirfd (per POSIX *at() semantics). */
		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Hand namei the starting directory via USEDVP; the
			 * flag is cleared again so the nameidata can be
			 * reused, and the iocount from vnode_getfromfd is
			 * dropped only after namei is done with dvp_at.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	return namei(ndp);
}
3657
/*
 * Change current working directory to a given file descriptor.
 *
 * Shared by fchdir(2) (per_thread == 0, changes fd_cdir) and
 * __pthread_fchdir(2) (per_thread != 0, changes the calling thread's
 * uu_cdir).  The new directory ends up holding a usecount (vnode_ref);
 * the previous one is released with vnode_rele.
 */
/* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the target directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If something is mounted on the directory, descend to the root of
	 * the covering filesystem (repeatedly, for stacked mounts).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Convert our transient iocount into a long-lived usecount. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap fd_cdir under the same locks namei uses to read it. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
3775
3776 int
3777 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3778 {
3779 return common_fchdir(p, uap, 0);
3780 }
3781
3782 int
3783 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3784 {
3785 return common_fchdir(p, (void *)uap, 1);
3786 }
3787
3788
/*
 * Change current working directory (".").
 *
 * Looks up the directory described by ndp, takes a usecount on it, and
 * installs it as either the process cwd (fd_cdir) or the calling thread's
 * private cwd (uu_cdir), releasing the previous one.
 *
 * Returns: 0 Success
 * change_dir:ENOTDIR
 * change_dir:???
 * vnode_ref:ENOENT No such file or directory
 */
/* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	vnode_t tvp;

	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Keep the directory alive long-term via a usecount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Swap fd_cdir under the same locks namei uses to read it. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
3844
3845
3846 /*
3847 * Change current working directory (".").
3848 *
3849 * Returns: 0 Success
3850 * chdir_internal:ENOTDIR
3851 * chdir_internal:ENOENT No such file or directory
3852 * chdir_internal:???
3853 */
3854 /* ARGSUSED */
3855 static int
3856 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3857 {
3858 struct nameidata nd;
3859 vfs_context_t ctx = vfs_context_current();
3860
3861 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3862 UIO_USERSPACE, uap->path, ctx);
3863
3864 return chdir_internal(p, ctx, &nd, per_thread);
3865 }
3866
3867
3868 /*
3869 * chdir
3870 *
3871 * Change current working directory (".") for the entire process
3872 *
3873 * Parameters: p Process requesting the call
3874 * uap User argument descriptor (see below)
3875 * retval (ignored)
3876 *
3877 * Indirect parameters: uap->path Directory path
3878 *
3879 * Returns: 0 Success
3880 * common_chdir: ENOTDIR
3881 * common_chdir: ENOENT No such file or directory
3882 * common_chdir: ???
3883 *
3884 */
3885 int
3886 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3887 {
3888 return common_chdir(p, (void *)uap, 0);
3889 }
3890
3891 /*
3892 * __pthread_chdir
3893 *
3894 * Change current working directory (".") for a single thread
3895 *
3896 * Parameters: p Process requesting the call
3897 * uap User argument descriptor (see below)
3898 * retval (ignored)
3899 *
3900 * Indirect parameters: uap->path Directory path
3901 *
3902 * Returns: 0 Success
3903 * common_chdir: ENOTDIR
3904 * common_chdir: ENOENT No such file or directory
3905 * common_chdir: ???
3906 *
3907 */
3908 int
3909 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3910 {
3911 return common_chdir(p, (void *)uap, 1);
3912 }
3913
3914
/*
 * Change notion of root (``/'') directory.
 *
 * Requires superuser (and passes MAC policy checks).  The new root holds
 * a usecount; the previous fd_rdir, if any, is released.
 */
/* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir validates VDIR, MAC chdir policy, and search access. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the lookup iocount for a long-lived usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdp->fd_flags |= FD_CHROOT;
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
3976
#define PATHSTATICBUFLEN 256
#define PIVOT_ROOT_ENTITLEMENT \
	"com.apple.private.vfs.pivot-root"

#if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root(2): atomically switch the system root filesystem.
 *
 * Restricted to launchd (pid 1) holding PIVOT_ROOT_ENTITLEMENT, and the
 * incoming filesystem must pass kernel root authentication.  Both paths
 * are copied in via a small stack buffer first, falling back to a
 * MAXPATHLEN heap buffer only when the path is longer.
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((p->p_pid != 1) || !IOTaskHasEntitlement(current_task(), PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Short paths use the stack buffer; ENAMETOOLONG retries on the heap. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
#else
/* pivot_root is unsupported off macOS: report ENOSYS via nosys(). */
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	return nosys(p, NULL, retval);
}
#endif /* XNU_TARGET_OS_OSX */
4080
4081 /*
4082 * Common routine for chroot and chdir.
4083 *
4084 * Returns: 0 Success
4085 * ENOTDIR Not a directory
4086 * namei:??? [anything namei can return]
4087 * vnode_authorize:??? [anything vnode_authorize can return]
4088 */
4089 static int
4090 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4091 {
4092 vnode_t vp;
4093 int error;
4094
4095 if ((error = namei(ndp))) {
4096 return error;
4097 }
4098 nameidone(ndp);
4099 vp = ndp->ni_vp;
4100
4101 if (vp->v_type != VDIR) {
4102 vnode_put(vp);
4103 return ENOTDIR;
4104 }
4105
4106 #if CONFIG_MACF
4107 error = mac_vnode_check_chdir(ctx, vp);
4108 if (error) {
4109 vnode_put(vp);
4110 return error;
4111 }
4112 #endif
4113
4114 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4115 if (error) {
4116 vnode_put(vp);
4117 return error;
4118 }
4119
4120 return error;
4121 }
4122
/*
 * Allocate (zero-filled) the per-fd vnode data used for directories and
 * initialize its mutex.  Freed with fg_vn_data_free().
 * (Previous comment incorrectly said "Free".)
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	/* NOTE(review): result is not NULL-checked before lck_mtx_init; presumably
	 * Z_WAITOK allocations of this size cannot fail — confirm. */
	fvdata = kheap_alloc(KM_FD_VN_DATA, sizeof(struct fd_vn_data),
	    Z_WAITOK | Z_ZERO);
	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
	return fvdata;
}
4137
4138 /*
4139 * Free the vnode data (for directories) associated with the file glob.
4140 */
4141 void
4142 fg_vn_data_free(void *fgvndata)
4143 {
4144 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4145
4146 kheap_free(KHEAP_DATA_BUFFERS, fvdata->fv_buf, fvdata->fv_bufallocsiz);
4147 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4148 kheap_free(KM_FD_VN_DATA, fvdata, sizeof(struct fd_vn_data));
4149 }
4150
/*
 * Check permissions, allocate an open file structure,
 * and call the device open routine if any.
 *
 * Returns:	0			Success
 *	EINVAL
 *	EINTR
 *	falloc:ENFILE
 *	falloc:EMFILE
 *	falloc:ENOMEM
 *	vn_open_auth:???
 *	dupfdopen:???
 *	VNOP_ADVLOCK:???
 *	vnode_setsize:???
 *
 * XXX Need to implement uid, gid
 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
    int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct vfs_context context;

	oflags = uflags;

	/* O_RDWR|O_WRONLY together (== O_ACCMODE) is malformed. */
	if ((oflags & O_ACCMODE) == O_ACCMODE) {
		return EINVAL;
	}

	/* Convert open(2) flags to in-kernel fflags; strip crypto bits for now. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor slot up front. */
	if ((error = falloc_withalloc(p,
	    &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
		return error;
	}
	/* Encode the reserved fd for the fdopen() back-channel below. */
	uu->uu_dupfd = -indx - 1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {        /* XXX from fdopen */
			/* /dev/fd open: duplicate the existing descriptor instead. */
			if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
				fp_drop(p, indx, NULL, 0);
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp->fp_glob->fg_data = (caddr_t)vp;

	/* Apply any open-time advisory lock (O_EXLOCK/O_SHLOCK). */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember we hold the lock so 'bad:' (and close) can undo it. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

	/* Drop the lookup iocount; the fileglob keeps the vnode referenced. */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	proc_fdlock(p);
	if (flags & O_CLOEXEC) {
		*fdflags(p, indx) |= UF_EXCLOSE;
	}
	if (flags & O_CLOFORK) {
		*fdflags(p, indx) |= UF_FORKCLOSE;
	}
	/* Publish the descriptor: it becomes visible to the process here. */
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/* Heuristically opt this file's page cache in/out of secluded memory. */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
#if __arm64__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
#elif __arm__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
#else
/* not implemented... */
#endif
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) ||
			    !strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Undo: release the advisory lock (if taken), close, and free the fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4385
/*
 * While most of the *at syscall handlers can call nameiat() which
 * is a wrapper around namei, the use of namei and initialisation
 * of nameidata are far removed and in different functions - namei
 * gets called in vn_open_auth for open1. So we'll just do here what
 * nameiat() does.
 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
    int dirfd)
{
	/* Anchor at dirfd only for a real fd and no caller-provided start dir. */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		/* Absolute paths ignore dirfd (per POSIX *at() semantics). */
		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Pass the start directory to the lookup via USEDVP;
			 * the iocount from vnode_getfromfd is held across
			 * open1() and dropped afterwards.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
			    retval);
			vnode_put(dvp_at);
			return error;
		}
	}

	return open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval);
}
4436
/*
 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
 *
 * Parameters:    p                       Process requesting the open
 *                uap                     User argument descriptor (see below)
 *                retval                  Pointer to an area to receive the
 *                                        return value from the system call
 *
 * Indirect:      uap->path               Path to open (same as 'open')
 *                uap->flags              Flags to open (same as 'open'
 *                uap->uid                UID to set, if creating
 *                uap->gid                GID to set, if creating
 *                uap->mode               File mode, if creating (same as 'open')
 *                uap->xsecurity          ACL to set, if creating
 *
 * Returns:       0                       Success
 *                !0                      errno value
 *
 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX: We should enumerate the possible errno values here, and where
 * in the code they originated.
 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller's ACL, if one was supplied. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the process umask and strip the sticky bit from create mode. */
	cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    fileproc_alloc_init, NULL, retval);
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4502
4503 /*
4504 * Go through the data-protected atomically controlled open (2)
4505 *
4506 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4507 */
4508 int
4509 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4510 {
4511 int flags = uap->flags;
4512 int class = uap->class;
4513 int dpflags = uap->dpflags;
4514
4515 /*
4516 * Follow the same path as normal open(2)
4517 * Look up the item if it exists, and acquire the vnode.
4518 */
4519 struct filedesc *fdp = p->p_fd;
4520 struct vnode_attr va;
4521 struct nameidata nd;
4522 int cmode;
4523 int error;
4524
4525 VATTR_INIT(&va);
4526 /* Mask off all but regular access permissions */
4527 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4528 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4529
4530 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4531 uap->path, vfs_context_current());
4532
4533 /*
4534 * Initialize the extra fields in vnode_attr to pass down our
4535 * extra fields.
4536 * 1. target cprotect class.
4537 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4538 */
4539 if (flags & O_CREAT) {
4540 /* lower level kernel code validates that the class is valid before applying it. */
4541 if (class != PROTECTION_CLASS_DEFAULT) {
4542 /*
4543 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4544 * file behave the same as open (2)
4545 */
4546 VATTR_SET(&va, va_dataprotect_class, class);
4547 }
4548 }
4549
4550 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4551 if (flags & (O_RDWR | O_WRONLY)) {
4552 /* Not allowed to write raw encrypted bytes */
4553 return EINVAL;
4554 }
4555 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4556 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4557 }
4558 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4559 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4560 }
4561 }
4562
4563 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4564 fileproc_alloc_init, NULL, retval);
4565
4566 return error;
4567 }
4568
4569 static int
4570 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4571 int fd, enum uio_seg segflg, int *retval)
4572 {
4573 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
4574 struct {
4575 struct vnode_attr va;
4576 struct nameidata nd;
4577 } *__open_data;
4578 struct vnode_attr *vap;
4579 struct nameidata *ndp;
4580 int cmode;
4581 int error;
4582
4583 __open_data = kheap_alloc(KHEAP_TEMP, sizeof(*__open_data), Z_WAITOK);
4584 vap = &__open_data->va;
4585 ndp = &__open_data->nd;
4586
4587 VATTR_INIT(vap);
4588 /* Mask off all but regular access permissions */
4589 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4590 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
4591
4592 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4593 segflg, path, ctx);
4594
4595 error = open1at(ctx, ndp, flags, vap, fileproc_alloc_init, NULL,
4596 retval, fd);
4597
4598 kheap_free(KHEAP_TEMP, __open_data, sizeof(*__open_data));
4599
4600 return error;
4601 }
4602
4603 int
4604 open(proc_t p, struct open_args *uap, int32_t *retval)
4605 {
4606 __pthread_testcancel(1);
4607 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4608 }
4609
4610 int
4611 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4612 int32_t *retval)
4613 {
4614 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4615 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4616 }
4617
4618 int
4619 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4620 int32_t *retval)
4621 {
4622 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4623 uap->mode, uap->fd, UIO_USERSPACE, retval);
4624 }
4625
4626 int
4627 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4628 {
4629 __pthread_testcancel(1);
4630 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4631 }
4632
4633 /*
4634 * openbyid_np: open a file given a file system id and a file system object id
4635 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
4636 * file systems that don't support object ids it is a node id (uint64_t).
4637 *
4638 * Parameters: p Process requesting the open
4639 * uap User argument descriptor (see below)
4640 * retval Pointer to an area to receive the
 * return value from the system call
4642 *
4643 * Indirect: uap->path Path to open (same as 'open')
4644 *
4645 * uap->fsid id of target file system
4646 * uap->objid id of target file system object
4647 * uap->flags Flags to open (same as 'open')
4648 *
4649 * Returns: 0 Success
4650 * !0 errno value
4651 *
4652 *
 * XXX: We should enumerate the possible errno values here, and where
4654 * in the code they originated.
4655 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
    fsid_t fsid;
    uint64_t objid;
    int error;
    char *buf = NULL;           /* heap path buffer; owned by this function */
    int buflen = MAXPATHLEN;
    int pathlen = 0;
    vfs_context_t ctx = vfs_context_current();

    /* Open-by-id is a privileged operation. */
    if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
        return error;
    }

    if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
        return error;
    }

    /* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
    if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
        return error;
    }

    AUDIT_ARG(value32, fsid.val[0]);
    AUDIT_ARG(value64, objid);

    /*
     * Resolve a path from (fsid, objid).  If the buffer is too small the
     * VFS returns ENOSPC; free it and retry with one more MAXPATHLEN.
     * NOTE(review): the retry loop has no upper bound on buflen — relies
     * on the filesystem eventually fitting or failing differently.
     */
    do {
        buf = kheap_alloc(KHEAP_TEMP, buflen + 1, Z_WAITOK);
        if (buf == NULL) {
            return ENOMEM;
        }

        error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
            buf, FSOPT_ISREALFSID, &pathlen);

        if (error) {
            kheap_free(KHEAP_TEMP, buf, buflen + 1);
            buf = NULL;
        }
    } while (error == ENOSPC && (buflen += MAXPATHLEN));

    if (error) {
        return error;
    }

    /* Ensure NUL termination; the extra byte was reserved at allocation. */
    buf[pathlen] = 0;

    /* The resolved path is a kernel buffer, hence UIO_SYSSPACE. */
    error = openat_internal(
        ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

    kheap_free(KHEAP_TEMP, buf, buflen + 1);

    return error;
}
4712
4713
4714 /*
4715 * Create a special file.
4716 */
4717 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4718
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
    struct vnode_attr va;
    vfs_context_t ctx = vfs_context_current();
    int error;
    struct nameidata nd;
    vnode_t vp, dvp;

    VATTR_INIT(&va);
    VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
    VATTR_SET(&va, va_rdev, uap->dev);

    /* If it's a mknod() of a FIFO, call mkfifo1() instead */
    if ((uap->mode & S_IFMT) == S_IFIFO) {
        return mkfifo1(ctx, uap->path, &va);
    }

    AUDIT_ARG(mode, (mode_t)uap->mode);
    AUDIT_ARG(value32, uap->dev);

    /* Creating device nodes requires super-user credentials. */
    if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
        return error;
    }
    NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
        UIO_USERSPACE, uap->path, ctx);
    error = namei(&nd);
    if (error) {
        return error;
    }
    dvp = nd.ni_dvp;
    vp = nd.ni_vp;

    /* Target must not already exist. */
    if (vp != NULL) {
        error = EEXIST;
        goto out;
    }

    /* Only character and block special files are valid here (FIFO handled above). */
    switch (uap->mode & S_IFMT) {
    case S_IFCHR:
        VATTR_SET(&va, va_type, VCHR);
        break;
    case S_IFBLK:
        VATTR_SET(&va, va_type, VBLK);
        break;
    default:
        error = EINVAL;
        goto out;
    }

#if CONFIG_MACF
    error = mac_vnode_check_create(ctx,
        nd.ni_dvp, &nd.ni_cnd, &va);
    if (error) {
        goto out;
    }
#endif

    if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
        goto out;
    }

    if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
        goto out;
    }

    if (vp) {
        int update_flags = 0;

        // Make sure the name & parent pointers are hooked up
        if (vp->v_name == NULL) {
            update_flags |= VNODE_UPDATE_NAME;
        }
        if (vp->v_parent == NULLVP) {
            update_flags |= VNODE_UPDATE_PARENT;
        }

        if (update_flags) {
            vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
        }

#if CONFIG_FSE
        add_fsevent(FSE_CREATE_FILE, ctx,
            FSE_ARG_VNODE, vp,
            FSE_ARG_DONE);
#endif
    }

out:
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);

    if (vp) {
        vnode_put(vp);
    }
    vnode_put(dvp);

    return error;
}
4821
4822 /*
4823 * Create a named pipe.
4824 *
4825 * Returns: 0 Success
4826 * EEXIST
4827 * namei:???
4828 * vnode_authorize:???
4829 * vn_create:???
4830 */
/*
 * Common FIFO-creation path shared by mkfifo(2), mkfifo_extended() and
 * mknod() of S_IFIFO.  'vap' supplies the attributes to create with;
 * 'upath' is always interpreted as a user-space address (UIO_USERSPACE).
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
    vnode_t vp, dvp;
    int error;
    struct nameidata nd;

    NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
        UIO_USERSPACE, upath, ctx);
    error = namei(&nd);
    if (error) {
        return error;
    }
    dvp = nd.ni_dvp;
    vp = nd.ni_vp;

    /* check that this is a new file and authorize addition */
    if (vp != NULL) {
        error = EEXIST;
        goto out;
    }
    VATTR_SET(vap, va_type, VFIFO);

    if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
        goto out;
    }

    error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);

    if (vp) {
        vnode_put(vp);
    }
    vnode_put(dvp);

    return error;
}
4873
4874
4875 /*
4876 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4877 *
4878 * Parameters: p Process requesting the open
4879 * uap User argument descriptor (see below)
4880 * retval (Ignored)
4881 *
4882 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4883 * uap->uid UID to set
4884 * uap->gid GID to set
4885 * uap->mode File mode to set (same as 'mkfifo')
4886 * uap->xsecurity ACL to set, if creating
4887 *
4888 * Returns: 0 Success
4889 * !0 errno value
4890 *
4891 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4892 *
 * XXX: We should enumerate the possible errno values here, and where
4894 * in the code they originated.
4895 */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
    int ciferror;
    kauth_filesec_t xsecdst;    /* copied-in ACL; freed before return */
    struct vnode_attr va;

    AUDIT_ARG(owner, uap->uid, uap->gid);

    /* Copy in the caller-supplied filesec (ACL), if any. */
    xsecdst = KAUTH_FILESEC_NONE;
    if (uap->xsecurity != USER_ADDR_NULL) {
        if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
            return ciferror;
        }
    }

    /* Build attributes: umask-masked mode, plus optional uid/gid/ACL. */
    VATTR_INIT(&va);
    VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
    if (uap->uid != KAUTH_UID_NONE) {
        VATTR_SET(&va, va_uid, uap->uid);
    }
    if (uap->gid != KAUTH_GID_NONE) {
        VATTR_SET(&va, va_gid, uap->gid);
    }
    if (xsecdst != KAUTH_FILESEC_NONE) {
        VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
    }

    ciferror = mkfifo1(vfs_context_current(), uap->path, &va);

    if (xsecdst != KAUTH_FILESEC_NONE) {
        kauth_filesec_free(xsecdst);
    }
    return ciferror;
}
4931
4932 /* ARGSUSED */
4933 int
4934 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4935 {
4936 struct vnode_attr va;
4937
4938 VATTR_INIT(&va);
4939 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4940
4941 return mkfifo1(vfs_context_current(), uap->path, &va);
4942 }
4943
4944 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4945 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4946 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4947
/*
 * Build "<path of dvp>/<leafname>" into 'path' (capacity _len), falling back
 * to ancestor directories or the mount point if the exact path cannot be
 * obtained.  *truncated_path is set when the result is incomplete.
 * Returns the length of the string INCLUDING the terminating NUL.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
    int ret, len = _len;

    *truncated_path = 0;

    if (firmlink) {
        ret = vn_getpath(dvp, path, &len);
    } else {
        ret = vn_getpath_no_firmlink(dvp, path, &len);
    }
    if (ret == 0 && len < (MAXPATHLEN - 1)) {
        if (leafname) {
            /* len counts the NUL here, so path[len - 1] is the NUL slot. */
            path[len - 1] = '/';
            len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
            if (len > MAXPATHLEN) {
                char *ptr;

                // the string got truncated!
                *truncated_path = 1;
                ptr = strrchr(path, '/');
                if (ptr) {
                    *ptr = '\0'; // chop off the string at the last directory component
                }
                len = (int)strlen(path) + 1;
            }
        }
    } else if (ret == 0) {
        /* Got a path, but no room left to append the leaf name. */
        *truncated_path = 1;
    } else if (ret != 0) {
        struct vnode *mydvp = dvp;

        if (ret != ENOSPC) {
            printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
                dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
        }
        *truncated_path = 1;

        /* Walk up the parent chain until some ancestor's path fits. */
        do {
            if (mydvp->v_parent != NULL) {
                mydvp = mydvp->v_parent;
            } else if (mydvp->v_mount) {
                strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
                break;
            } else {
                // no parent and no mount point?  only thing is to punt and say "/" changed
                strlcpy(path, "/", _len);
                len = 2;
                mydvp = NULL;
            }

            if (mydvp == NULL) {
                break;
            }

            len = _len;
            if (firmlink) {
                ret = vn_getpath(mydvp, path, &len);
            } else {
                ret = vn_getpath_no_firmlink(mydvp, path, &len);
            }
        } while (ret == ENOSPC);
    }

    return len;
}
5015
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
    /* Firmlink-following variant. */
    const int follow_firmlinks = 1;

    return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5021
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
    /* Variant that does not traverse firmlinks. */
    const int follow_firmlinks = 0;

    return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5027
5028 /*
5029 * Make a hard file link.
5030 *
5031 * Returns: 0 Success
5032 * EPERM
5033 * EEXIST
5034 * EXDEV
5035 * namei:???
5036 * vnode_authorize:???
5037 * VNOP_LINK:???
5038 */
5039 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
    vnode_t vp, pvp, dvp, lvp;
    struct nameidata nd;
    int follow;
    int error;
#if CONFIG_FSE
    fse_info finfo;
#endif
    int need_event, has_listeners, need_kpath2;
    char *target_path = NULL;
    int truncated = 0;

    vp = dvp = lvp = NULLVP;

    /* look up the object we are linking to */
    follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
    NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
        segflg, path, ctx);

    error = nameiat(&nd, fd1);
    if (error) {
        return error;
    }
    vp = nd.ni_vp;

    /* First lookup is done; nd is reused below for the link name. */
    nameidone(&nd);

    /*
     * Normally, linking to directories is not supported.
     * However, some file systems may have limited support.
     */
    if (vp->v_type == VDIR) {
        if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
            error = EPERM;   /* POSIX */
            goto out;
        }

        /* Linking to a directory requires ownership. */
        if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
            struct vnode_attr dva;

            VATTR_INIT(&dva);
            VATTR_WANTED(&dva, va_uid);
            if (vnode_getattr(vp, &dva, ctx) != 0 ||
                !VATTR_IS_SUPPORTED(&dva, va_uid) ||
                (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
                error = EACCES;
                goto out;
            }
        }
    }

    /* lookup the target node (reinitialize nd fields for a CREATE lookup) */
#if CONFIG_TRIGGERS
    nd.ni_op = OP_LINK;
#endif
    nd.ni_cnd.cn_nameiop = CREATE;
    nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
    nd.ni_dirp = link;
    error = nameiat(&nd, fd2);
    if (error != 0) {
        goto out;
    }
    dvp = nd.ni_dvp;
    lvp = nd.ni_vp;

#if CONFIG_MACF
    if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
        goto out2;
    }
#endif

    /* or to anything that kauth doesn't want us to (eg. immutable items) */
    if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
        goto out2;
    }

    /* target node must not exist */
    if (lvp != NULLVP) {
        error = EEXIST;
        goto out2;
    }
    /* cannot link across mountpoints */
    if (vnode_mount(vp) != vnode_mount(dvp)) {
        error = EXDEV;
        goto out2;
    }

    /* authorize creation of the target node */
    if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
        goto out2;
    }

    /* and finally make the link */
    error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
    if (error) {
        goto out2;
    }

#if CONFIG_MACF
    (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
    need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
    need_event = 0;
#endif
    has_listeners = kauth_authorize_fileop_has_listeners();

    need_kpath2 = 0;
#if CONFIG_AUDIT
    if (AUDIT_RECORD_EXISTS()) {
        need_kpath2 = 1;
    }
#endif

    /* Post-link notifications: audit path, kauth listeners, fsevents. */
    if (need_event || has_listeners || need_kpath2) {
        char *link_to_path = NULL;
        int len, link_name_len;

        /* build the path to the new link file */
        GET_PATH(target_path);

        len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);

        AUDIT_ARG(kpath, target_path, ARG_KPATH2);

        if (has_listeners) {
            /* build the path to file we are linking to */
            GET_PATH(link_to_path);

            link_name_len = MAXPATHLEN;
            if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
                /*
                 * Call out to allow 3rd party notification of rename.
                 * Ignore result of kauth_authorize_fileop call.
                 */
                kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
                    (uintptr_t)link_to_path,
                    (uintptr_t)target_path);
            }
            if (link_to_path != NULL) {
                RELEASE_PATH(link_to_path);
            }
        }
#if CONFIG_FSE
        if (need_event) {
            /* construct fsevent */
            if (get_fse_info(vp, &finfo, ctx) == 0) {
                if (truncated) {
                    finfo.mode |= FSE_TRUNCATED_PATH;
                }

                // build the path to the destination of the link
                add_fsevent(FSE_CREATE_FILE, ctx,
                    FSE_ARG_STRING, len, target_path,
                    FSE_ARG_FINFO, &finfo,
                    FSE_ARG_DONE);
            }

            pvp = vp->v_parent;
            // need an iocount on pvp in this case
            if (pvp && pvp != dvp) {
                error = vnode_get(pvp);
                if (error) {
                    /* best effort: skip the parent event, don't fail the link */
                    pvp = NULLVP;
                    error = 0;
                }
            }
            if (pvp) {
                add_fsevent(FSE_STAT_CHANGED, ctx,
                    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
            }
            if (pvp && pvp != dvp) {
                vnode_put(pvp);
            }
        }
#endif
    }
out2:
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);
    if (target_path != NULL) {
        RELEASE_PATH(target_path);
    }
out:
    if (lvp) {
        vnode_put(lvp);
    }
    if (dvp) {
        vnode_put(dvp);
    }
    vnode_put(vp);
    return error;
}
5242
5243 int
5244 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5245 {
5246 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5247 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5248 }
5249
5250 int
5251 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5252 {
5253 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5254 return EINVAL;
5255 }
5256
5257 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5258 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5259 }
5260
5261 /*
5262 * Make a symbolic link.
5263 *
5264 * We could add support for ACLs here too...
5265 */
5266 /* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
    struct vnode_attr va;
    char *path;             /* link *contents*; heap copy iff user-space */
    int error;
    struct nameidata nd;
    vnode_t vp, dvp;
    size_t dummy = 0;
    proc_t p;

    error = 0;
    /* Copy the link contents into a kernel buffer if they came from user space. */
    if (UIO_SEG_IS_USER_SPACE(segflg)) {
        path = zalloc(ZV_NAMEI);
        error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
    } else {
        path = (char *)path_data;
    }
    if (error) {
        goto out;
    }
    AUDIT_ARG(text, path);  /* This is the link string */

    NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
        segflg, link, ctx);

    error = nameiat(&nd, fd);
    if (error) {
        goto out;
    }
    dvp = nd.ni_dvp;
    vp = nd.ni_vp;

    p = vfs_context_proc(ctx);
    VATTR_INIT(&va);
    VATTR_SET(&va, va_type, VLNK);
    VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);

#if CONFIG_MACF
    error = mac_vnode_check_create(ctx,
        dvp, &nd.ni_cnd, &va);
#endif
    if (error != 0) {
        goto skipit;
    }

    /* link name must not already exist */
    if (vp != NULL) {
        error = EEXIST;
        goto skipit;
    }

    /* authorize */
    if (error == 0) {
        error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
    }
    /* get default ownership, etc. */
    if (error == 0) {
        error = vnode_authattr_new(dvp, &va, 0, ctx);
    }
    if (error == 0) {
        error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
    }

    /* do fallback attribute handling */
    if (error == 0 && vp) {
        error = vnode_setattr_fallback(vp, &va, ctx);
    }

#if CONFIG_MACF
    if (error == 0 && vp) {
        error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
    }
#endif

    if (error == 0) {
        int update_flags = 0;

        /*
         * Check if a new vnode was created, else try to get one
         * (VNOP_SYMLINK is not required to hand back the new vnode).
         */
        if (vp == NULL) {
            nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
            nd.ni_op = OP_LOOKUP;
#endif
            nd.ni_cnd.cn_flags = 0;
            error = nameiat(&nd, fd);
            vp = nd.ni_vp;

            if (vp == NULL) {
                goto skipit;
            }
        }

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
        /* call out to allow 3rd party notification of rename.
         * Ignore result of kauth_authorize_fileop call.
         */
        if (kauth_authorize_fileop_has_listeners() &&
            namei(&nd) == 0) {
            char *new_link_path = NULL;
            int len;

            /* build the path to the new link file */
            new_link_path = get_pathbuff();
            len = MAXPATHLEN;
            vn_getpath(dvp, new_link_path, &len);
            if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
                new_link_path[len - 1] = '/';
                strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
            }

            kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
                (uintptr_t)path, (uintptr_t)new_link_path);
            if (new_link_path != NULL) {
                release_pathbuff(new_link_path);
            }
        }
#endif
        // Make sure the name & parent pointers are hooked up
        if (vp->v_name == NULL) {
            update_flags |= VNODE_UPDATE_NAME;
        }
        if (vp->v_parent == NULLVP) {
            update_flags |= VNODE_UPDATE_PARENT;
        }

        if (update_flags) {
            vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
        }

#if CONFIG_FSE
        add_fsevent(FSE_CREATE_FILE, ctx,
            FSE_ARG_VNODE, vp,
            FSE_ARG_DONE);
#endif
    }

skipit:
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);

    if (vp) {
        vnode_put(vp);
    }
    vnode_put(dvp);
out:
    /* Free the kernel copy of the link contents, if we made one. */
    if (path && (path != (char *)path_data)) {
        zfree(ZV_NAMEI, path);
    }

    return error;
}
5422
5423 int
5424 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5425 {
5426 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5427 uap->link, UIO_USERSPACE);
5428 }
5429
5430 int
5431 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5432 __unused int32_t *retval)
5433 {
5434 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5435 uap->path2, UIO_USERSPACE);
5436 }
5437
5438 /*
5439 * Delete a whiteout from the filesystem.
5440 * No longer supported.
5441 */
5442 int
5443 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5444 {
5445 return ENOTSUP;
5446 }
5447
5448 /*
5449 * Delete a name from the filesystem.
5450 */
5451 /* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
    struct nameidata nd;
    vnode_t vp, dvp;
    int error;
    struct componentname *cnp;
    char *path = NULL;              /* pathbuff for listeners/audit */
    char *no_firmlink_path = NULL;  /* pathbuff for fsevents */
    int len_path = 0;
    int len_no_firmlink_path = 0;
#if CONFIG_FSE
    fse_info finfo;
    struct vnode_attr va;
#endif
    int flags;
    int need_event;
    int has_listeners;
    int truncated_path;
    int truncated_no_firmlink_path;
    int batched;                    /* nonzero => compound remove VNOP */
    struct vnode_attr *vap;
    int do_retry;
    int retry_count = 0;
    int cn_flags;

    cn_flags = LOCKPARENT;
    if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
        cn_flags |= AUDITVNPATH1;
    }
    /* If a starting dvp is passed, it trumps any fd passed. */
    if (start_dvp) {
        cn_flags |= USEDVP;
    }

#if NAMEDRSRCFORK
    /* unlink or delete is allowed on rsrc forks and named streams */
    cn_flags |= CN_ALLOWRSRCFORK;
#endif

retry:
    /* Per-attempt state is reset here; we may loop on racy ENOENT. */
    do_retry = 0;
    flags = 0;
    need_event = 0;
    has_listeners = 0;
    truncated_path = 0;
    truncated_no_firmlink_path = 0;
    vap = NULL;

    NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

    nd.ni_dvp = start_dvp;
    nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
    cnp = &nd.ni_cnd;

continue_lookup:
    error = nameiat(&nd, fd);
    if (error) {
        return error;
    }

    dvp = nd.ni_dvp;
    vp = nd.ni_vp;


    /* With Carbon delete semantics, busy files cannot be deleted */
    if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
        flags |= VNODE_REMOVE_NODELETEBUSY;
    }

    /* Skip any potential upcalls if told to. */
    if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
        flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
    }

    if (vp) {
        batched = vnode_compound_remove_available(vp);
        /*
         * The root of a mounted filesystem cannot be deleted.
         */
        if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
            error = EBUSY;
            goto out;
        }

#if DEVELOPMENT || DEBUG
        /*
         * XXX VSWAP: Check for entitlements or special flag here
         * so we can restrict access appropriately.
         */
#else /* DEVELOPMENT || DEBUG */

        if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
            error = EPERM;
            goto out;
        }
#endif /* DEVELOPMENT || DEBUG */

        if (!batched) {
            error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
            if (error) {
                if (error == ENOENT) {
                    /* racy hardlink lookups may hit the name cache; redrive */
                    if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                        do_retry = 1;
                        retry_count++;
                    }
                }
                goto out;
            }
        }
    } else {
        /* no vp: lookup deferred into the compound remove VNOP */
        batched = 1;

        if (!vnode_compound_remove_available(dvp)) {
            panic("No vp, but no compound remove?");
        }
    }

#if CONFIG_FSE
    need_event = need_fsevent(FSE_DELETE, dvp);
    if (need_event) {
        if (!batched) {
            if ((vp->v_flag & VISHARDLINK) == 0) {
                /* XXX need to get these data in batched VNOP */
                get_fse_info(vp, &finfo, ctx);
            }
        } else {
            error = vfs_get_notify_attributes(&va);
            if (error) {
                goto out;
            }

            vap = &va;
        }
    }
#endif
    has_listeners = kauth_authorize_fileop_has_listeners();
    if (need_event || has_listeners) {
        if (path == NULL) {
            GET_PATH(path);
        }
        len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
        if (no_firmlink_path == NULL) {
            GET_PATH(no_firmlink_path);
        }
        len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
    }

#if NAMEDRSRCFORK
    if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
        error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
    } else
#endif
    {
        error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
        vp = nd.ni_vp;
        if (error == EKEEPLOOKING) {
            /* filesystem wants the lookup continued (compound VNOP) */
            if (!batched) {
                panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
            }

            if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
                panic("EKEEPLOOKING, but continue flag not set?");
            }

            if (vnode_isdir(vp)) {
                error = EISDIR;
                goto out;
            }
            goto continue_lookup;
        } else if (error == ENOENT && batched) {
            if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                /*
                 * For compound VNOPs, the authorization callback may
                 * return ENOENT in case of racing hardlink lookups
                 * hitting the name cache, redrive the lookup.
                 */
                do_retry = 1;
                retry_count += 1;
                goto out;
            }
        }
    }

    /*
     * Call out to allow 3rd party notification of delete.
     * Ignore result of kauth_authorize_fileop call.
     */
    if (!error) {
        if (has_listeners) {
            kauth_authorize_fileop(vfs_context_ucred(ctx),
                KAUTH_FILEOP_DELETE,
                (uintptr_t)vp,
                (uintptr_t)path);
        }

        if (vp->v_flag & VISHARDLINK) {
            //
            // if a hardlink gets deleted we want to blow away the
            // v_parent link because the path that got us to this
            // instance of the link is no longer valid. this will
            // force the next call to get the path to ask the file
            // system instead of just following the v_parent link.
            //
            vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
        }

#if CONFIG_FSE
        if (need_event) {
            if (vp->v_flag & VISHARDLINK) {
                get_fse_info(vp, &finfo, ctx);
            } else if (vap) {
                vnode_get_fse_info_from_vap(vp, &finfo, vap);
            }
            if (truncated_path) {
                finfo.mode |= FSE_TRUNCATED_PATH;
            }
            add_fsevent(FSE_DELETE, ctx,
                FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
                FSE_ARG_FINFO, &finfo,
                FSE_ARG_DONE);
        }
#endif
    }

out:
    if (path != NULL) {
        RELEASE_PATH(path);
        path = NULL;
    }

    if (no_firmlink_path != NULL) {
        RELEASE_PATH(no_firmlink_path);
        no_firmlink_path = NULL;
    }
#if NAMEDRSRCFORK
    /* recycle the deleted rsrc fork vnode to force a reclaim, which
     * will cause its shadow file to go away if necessary.
     */
    if (vp && (vnode_isnamedstream(vp)) &&
        (vp->v_parent != NULLVP) &&
        vnode_isshadow(vp)) {
        vnode_recycle(vp);
    }
#endif
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);
    vnode_put(dvp);
    if (vp) {
        vnode_put(vp);
    }

    if (do_retry) {
        goto retry;
    }

    return error;
}
5714
5715 int
5716 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5717 enum uio_seg segflg, int unlink_flags)
5718 {
5719 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5720 unlink_flags);
5721 }
5722
5723 /*
5724 * Delete a name from the filesystem using Carbon semantics.
5725 */
5726 int
5727 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5728 {
5729 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5730 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5731 }
5732
5733 /*
5734 * Delete a name from the filesystem using POSIX semantics.
5735 */
5736 int
5737 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5738 {
5739 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5740 uap->path, UIO_USERSPACE, 0);
5741 }
5742
5743 int
5744 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5745 {
5746 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5747 return EINVAL;
5748 }
5749
5750 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5751 int unlink_flags = 0;
5752
5753 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5754 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5755 }
5756 return rmdirat_internal(vfs_context_current(), uap->fd,
5757 uap->path, UIO_USERSPACE, unlink_flags);
5758 } else {
5759 return unlinkat_internal(vfs_context_current(), uap->fd,
5760 NULLVP, uap->path, UIO_USERSPACE, 0);
5761 }
5762 }
5763
5764 /*
5765 * Reposition read/write file offset.
5766 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
    struct fileproc *fp;
    vnode_t vp;
    struct vfs_context *ctx;
    off_t offset = uap->offset, file_size;
    int error;

    if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
        /* non-vnode file types (e.g. sockets) map to ESPIPE */
        if (error == ENOTSUP) {
            return ESPIPE;
        }
        return error;
    }
    /* FIFOs are not seekable. */
    if (vnode_isfifo(vp)) {
        file_drop(uap->fd);
        return ESPIPE;
    }


    ctx = vfs_context_current();
#if CONFIG_MACF
    /* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly. */
    if (uap->whence == L_INCR && uap->offset == 0) {
        error = mac_file_check_get_offset(vfs_context_ucred(ctx),
            fp->fp_glob);
    } else {
        error = mac_file_check_change_offset(vfs_context_ucred(ctx),
            fp->fp_glob);
    }
    if (error) {
        file_drop(uap->fd);
        return error;
    }
#endif
    if ((error = vnode_getwithref(vp))) {
        file_drop(uap->fd);
        return error;
    }

    switch (uap->whence) {
    case L_INCR:
        offset += fp->fp_glob->fg_offset;
        break;
    case L_XTND:
        if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
            break;
        }
        offset += file_size;
        break;
    case L_SET:
        break;
    case SEEK_HOLE:
        error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
        break;
    case SEEK_DATA:
        error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
        break;
    default:
        error = EINVAL;
    }
    if (error == 0) {
        if (uap->offset > 0 && offset < 0) {
            /* Incremented/relative move past max size */
            error = EOVERFLOW;
        } else {
            /*
             * Allow negative offsets on character devices, per
             * POSIX 1003.1-2001.  Most likely for writing disk
             * labels.
             */
            if (offset < 0 && vp->v_type != VCHR) {
                /* Decremented/relative move before start */
                error = EINVAL;
            } else {
                /* Success */
                fp->fp_glob->fg_offset = offset;
                *retval = fp->fp_glob->fg_offset;
            }
        }
    }

    /*
     * An lseek can affect whether data is "available to read."  Use
     * hint of NOTE_NONE so no EVFILT_VNODE events fire
     */
    post_event_if_success(vp, error, NOTE_NONE);
    (void)vnode_put(vp);
    file_drop(uap->fd);
    return error;
}
5858
5859
5860 /*
5861 * Check access permissions.
5862 *
5863 * Returns: 0 Success
5864 * vnode_authorize:???
5865 */
5866 static int
5867 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5868 {
5869 kauth_action_t action;
5870 int error;
5871
5872 /*
5873 * If just the regular access bits, convert them to something
5874 * that vnode_authorize will understand.
5875 */
5876 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5877 action = 0;
5878 if (uflags & R_OK) {
5879 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5880 }
5881 if (uflags & W_OK) {
5882 if (vnode_isdir(vp)) {
5883 action |= KAUTH_VNODE_ADD_FILE |
5884 KAUTH_VNODE_ADD_SUBDIRECTORY;
5885 /* might want delete rights here too */
5886 } else {
5887 action |= KAUTH_VNODE_WRITE_DATA;
5888 }
5889 }
5890 if (uflags & X_OK) {
5891 if (vnode_isdir(vp)) {
5892 action |= KAUTH_VNODE_SEARCH;
5893 } else {
5894 action |= KAUTH_VNODE_EXECUTE;
5895 }
5896 }
5897 } else {
5898 /* take advantage of definition of uflags */
5899 action = uflags >> 8;
5900 }
5901
5902 #if CONFIG_MACF
5903 error = mac_vnode_check_access(ctx, vp, uflags);
5904 if (error) {
5905 return error;
5906 }
5907 #endif /* MAC */
5908
5909 /* action == 0 means only check for existence */
5910 if (action != 0) {
5911 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5912 } else {
5913 error = 0;
5914 }
5915
5916 return error;
5917 }
5918
5919
5920
5921 /*
5922 * access_extended: Check access permissions in bulk.
5923 *
5924 * Description: uap->entries Pointer to an array of accessx
5925 * descriptor structs, plus one or
5926 * more NULL terminated strings (see
5927 * "Notes" section below).
5928 * uap->size Size of the area pointed to by
5929 * uap->entries.
5930 * uap->results Pointer to the results array.
5931 *
5932 * Returns: 0 Success
5933 * ENOMEM Insufficient memory
5934 * EINVAL Invalid arguments
5935 * namei:EFAULT Bad address
5936 * namei:ENAMETOOLONG Filename too long
5937 * namei:ENOENT No such file or directory
5938 * namei:ELOOP Too many levels of symbolic links
5939 * namei:EBADF Bad file descriptor
5940 * namei:ENOTDIR Not a directory
5941 * namei:???
5942 * access1:
5943 *
5944 * Implicit returns:
5945 * uap->results Array contents modified
5946 *
5947 * Notes: The uap->entries are structured as an arbitrary length array
5948 * of accessx descriptors, followed by one or more NULL terminated
5949 * strings
5950 *
5951 * struct accessx_descriptor[0]
5952 * ...
5953 * struct accessx_descriptor[n]
5954 * char name_data[0];
5955 *
5956 * We determine the entry count by walking the buffer containing
5957 * the uap->entries argument descriptor. For each descriptor we
5958 * see, the valid values for the offset ad_name_offset will be
5959 * in the byte range:
5960 *
5961 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5962 * to
5963 * [ uap->entries + uap->size - 2 ]
5964 *
5965 * since we must have at least one string, and the string must
5966 * be at least one character plus the NULL terminator in length.
5967 *
5968 * XXX: Need to support the check-as uid argument
5969 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL until the real credential is taken; checked at `out'. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 *   descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the stack buffer and skip the heap allocation. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kheap_alloc(KHEAP_DATA_BUFFERS, uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid nami() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	/* One errno_t slot per descriptor; freed with the same size below. */
	result = kheap_alloc(KHEAP_DATA_BUFFERS, desc_actual * sizeof(errno_t),
	    Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  These four errors are reported
		 * per-entry; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kheap_free(KHEAP_DATA_BUFFERS, input, uap->size);
	}
	if (result) {
		kheap_free(KHEAP_DATA_BUFFERS, result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6212
6213
6214 /*
6215 * Returns: 0 Success
6216 * namei:EFAULT Bad address
6217 * namei:ENAMETOOLONG Filename too long
6218 * namei:ENOENT No such file or directory
6219 * namei:ELOOP Too many levels of symbolic links
6220 * namei:EBADF Bad file descriptor
6221 * namei:ENOTDIR Not a directory
6222 * namei:???
6223 * access1:
6224 */
/*
 * Common implementation for access(2) and faccessat(2): resolve `path'
 * relative to `fd' and run the access1() permission check on the result.
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* Takes a credential reference; dropped at `out'. */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* Borrowed from the caller's context; not unref'ed below. */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* The parent was only taken (WANTPARENT) for the deletion test. */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6303
6304 int
6305 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6306 {
6307 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6308 uap->path, uap->flags, 0, UIO_USERSPACE);
6309 }
6310
6311 int
6312 faccessat(__unused proc_t p, struct faccessat_args *uap,
6313 __unused int32_t *retval)
6314 {
6315 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
6316 return EINVAL;
6317 }
6318
6319 return faccessat_internal(vfs_context_current(), uap->fd,
6320 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6321 }
6322
6323 /*
6324 * Returns: 0 Success
6325 * EFAULT
6326 * copyout:EFAULT
6327 * namei:???
6328 * vn_stat:???
6329 */
6330 static int
6331 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6332 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6333 enum uio_seg segflg, int fd, int flag)
6334 {
6335 struct nameidata nd;
6336 int follow;
6337 union {
6338 struct stat sb;
6339 struct stat64 sb64;
6340 } source = {};
6341 union {
6342 struct user64_stat user64_sb;
6343 struct user32_stat user32_sb;
6344 struct user64_stat64 user64_sb64;
6345 struct user32_stat64 user32_sb64;
6346 } dest = {};
6347 caddr_t sbp;
6348 int error, my_size;
6349 kauth_filesec_t fsec;
6350 size_t xsecurity_bufsize;
6351 void * statptr;
6352 struct fileproc *fp = NULL;
6353 int needsrealdev = 0;
6354
6355 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6356 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6357 segflg, path, ctx);
6358
6359 #if NAMEDRSRCFORK
6360 int is_namedstream = 0;
6361 /* stat calls are allowed for resource forks. */
6362 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6363 #endif
6364
6365 if (flag & AT_FDONLY) {
6366 vnode_t fvp;
6367
6368 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6369 if (error) {
6370 return error;
6371 }
6372 if ((error = vnode_getwithref(fvp))) {
6373 file_drop(fd);
6374 return error;
6375 }
6376 nd.ni_vp = fvp;
6377 } else {
6378 error = nameiat(&nd, fd);
6379 if (error) {
6380 return error;
6381 }
6382 }
6383 fsec = KAUTH_FILESEC_NONE;
6384
6385 statptr = (void *)&source;
6386
6387 #if NAMEDRSRCFORK
6388 /* Grab reference on the shadow stream file vnode to
6389 * force an inactive on release which will mark it
6390 * for recycle.
6391 */
6392 if (vnode_isnamedstream(nd.ni_vp) &&
6393 (nd.ni_vp->v_parent != NULLVP) &&
6394 vnode_isshadow(nd.ni_vp)) {
6395 is_namedstream = 1;
6396 vnode_ref(nd.ni_vp);
6397 }
6398 #endif
6399
6400 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6401 if (fp && (xsecurity == USER_ADDR_NULL)) {
6402 /*
6403 * If the caller has the file open, and is not
6404 * requesting extended security information, we are
6405 * going to let them get the basic stat information.
6406 */
6407 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6408 fp->fp_glob->fg_cred);
6409 } else {
6410 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6411 isstat64, needsrealdev, ctx);
6412 }
6413
6414 #if NAMEDRSRCFORK
6415 if (is_namedstream) {
6416 vnode_rele(nd.ni_vp);
6417 }
6418 #endif
6419 vnode_put(nd.ni_vp);
6420 nameidone(&nd);
6421 if (fp) {
6422 file_drop(fd);
6423 fp = NULL;
6424 }
6425
6426 if (error) {
6427 return error;
6428 }
6429 /* Zap spare fields */
6430 if (isstat64 != 0) {
6431 source.sb64.st_lspare = 0;
6432 source.sb64.st_qspare[0] = 0LL;
6433 source.sb64.st_qspare[1] = 0LL;
6434 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6435 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6436 my_size = sizeof(dest.user64_sb64);
6437 sbp = (caddr_t)&dest.user64_sb64;
6438 } else {
6439 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6440 my_size = sizeof(dest.user32_sb64);
6441 sbp = (caddr_t)&dest.user32_sb64;
6442 }
6443 /*
6444 * Check if we raced (post lookup) against the last unlink of a file.
6445 */
6446 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6447 source.sb64.st_nlink = 1;
6448 }
6449 } else {
6450 source.sb.st_lspare = 0;
6451 source.sb.st_qspare[0] = 0LL;
6452 source.sb.st_qspare[1] = 0LL;
6453 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6454 munge_user64_stat(&source.sb, &dest.user64_sb);
6455 my_size = sizeof(dest.user64_sb);
6456 sbp = (caddr_t)&dest.user64_sb;
6457 } else {
6458 munge_user32_stat(&source.sb, &dest.user32_sb);
6459 my_size = sizeof(dest.user32_sb);
6460 sbp = (caddr_t)&dest.user32_sb;
6461 }
6462
6463 /*
6464 * Check if we raced (post lookup) against the last unlink of a file.
6465 */
6466 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6467 source.sb.st_nlink = 1;
6468 }
6469 }
6470 if ((error = copyout(sbp, ub, my_size)) != 0) {
6471 goto out;
6472 }
6473
6474 /* caller wants extended security information? */
6475 if (xsecurity != USER_ADDR_NULL) {
6476 /* did we get any? */
6477 if (fsec == KAUTH_FILESEC_NONE) {
6478 if (susize(xsecurity_size, 0) != 0) {
6479 error = EFAULT;
6480 goto out;
6481 }
6482 } else {
6483 /* find the user buffer size */
6484 xsecurity_bufsize = fusize(xsecurity_size);
6485
6486 /* copy out the actual data size */
6487 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6488 error = EFAULT;
6489 goto out;
6490 }
6491
6492 /* if the caller supplied enough room, copy out to it */
6493 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6494 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6495 }
6496 }
6497 }
6498 out:
6499 if (fsec != KAUTH_FILESEC_NONE) {
6500 kauth_filesec_free(fsec);
6501 }
6502 return error;
6503 }
6504
6505 /*
6506 * stat_extended: Get file status; with extended security (ACL).
6507 *
6508 * Parameters: p (ignored)
6509 * uap User argument descriptor (see below)
6510 * retval (ignored)
6511 *
6512 * Indirect: uap->path Path of file to get status from
6513 * uap->ub User buffer (holds file status info)
6514 * uap->xsecurity ACL to get (extended security)
6515 * uap->xsecurity_size Size of ACL
6516 *
6517 * Returns: 0 Success
6518 * !0 errno value
6519 *
6520 */
6521 int
6522 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6523 __unused int32_t *retval)
6524 {
6525 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6526 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6527 0);
6528 }
6529
6530 /*
6531 * Returns: 0 Success
6532 * fstatat_internal:??? [see fstatat_internal() in this file]
6533 */
6534 int
6535 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6536 {
6537 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6538 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6539 }
6540
6541 int
6542 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6543 {
6544 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6545 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6546 }
6547
6548 /*
6549 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6550 *
6551 * Parameters: p (ignored)
6552 * uap User argument descriptor (see below)
6553 * retval (ignored)
6554 *
6555 * Indirect: uap->path Path of file to get status from
6556 * uap->ub User buffer (holds file status info)
6557 * uap->xsecurity ACL to get (extended security)
6558 * uap->xsecurity_size Size of ACL
6559 *
6560 * Returns: 0 Success
6561 * !0 errno value
6562 *
6563 */
6564 int
6565 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6566 {
6567 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6568 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6569 0);
6570 }
6571
6572 /*
6573 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6574 *
6575 * Parameters: p (ignored)
6576 * uap User argument descriptor (see below)
6577 * retval (ignored)
6578 *
6579 * Indirect: uap->path Path of file to get status from
6580 * uap->ub User buffer (holds file status info)
6581 * uap->xsecurity ACL to get (extended security)
6582 * uap->xsecurity_size Size of ACL
6583 *
6584 * Returns: 0 Success
6585 * !0 errno value
6586 *
6587 */
6588 int
6589 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6590 {
6591 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6592 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6593 AT_SYMLINK_NOFOLLOW);
6594 }
6595
6596 /*
6597 * Get file status; this version does not follow links.
6598 */
6599 int
6600 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6601 {
6602 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6603 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6604 }
6605
6606 int
6607 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6608 {
6609 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6610 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6611 }
6612
6613 /*
6614 * lstat64_extended: Get file status; can handle large inode numbers; does not
6615 * follow links; with extended security (ACL).
6616 *
6617 * Parameters: p (ignored)
6618 * uap User argument descriptor (see below)
6619 * retval (ignored)
6620 *
6621 * Indirect: uap->path Path of file to get status from
6622 * uap->ub User buffer (holds file status info)
6623 * uap->xsecurity ACL to get (extended security)
6624 * uap->xsecurity_size Size of ACL
6625 *
6626 * Returns: 0 Success
6627 * !0 errno value
6628 *
6629 */
6630 int
6631 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6632 {
6633 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6634 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6635 AT_SYMLINK_NOFOLLOW);
6636 }
6637
6638 int
6639 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6640 {
6641 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6642 return EINVAL;
6643 }
6644
6645 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6646 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6647 }
6648
6649 int
6650 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6651 __unused int32_t *retval)
6652 {
6653 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6654 return EINVAL;
6655 }
6656
6657 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6658 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6659 }
6660
6661 /*
6662 * Get configurable pathname variables.
6663 *
6664 * Returns: 0 Success
6665 * namei:???
6666 * vn_pathconf:???
6667 *
6668 * Notes: Global implementation constants are intended to be
6669 * implemented in this function directly; all other constants
6670 * are per-FS implementation, and therefore must be handled in
6671 * each respective FS, instead.
6672 *
6673 * XXX We implement some things globally right now that should actually be
6674 * XXX per-FS; we will need to deal with this at some point.
6675 */
6676 /* ARGSUSED */
6677 int
6678 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6679 {
6680 int error;
6681 struct nameidata nd;
6682 vfs_context_t ctx = vfs_context_current();
6683
6684 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6685 UIO_USERSPACE, uap->path, ctx);
6686 error = namei(&nd);
6687 if (error) {
6688 return error;
6689 }
6690
6691 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6692
6693 vnode_put(nd.ni_vp);
6694 nameidone(&nd);
6695 return error;
6696 }
6697
6698 /*
6699 * Return target name of a symbolic link.
6700 */
6701 /* ARGSUSED */
/*
 * Common implementation for readlink(2)/readlinkat(2): resolve `path'
 * (without following the final symlink), authorize a read, and copy the
 * link target into `buf'.  On return *retval holds the byte count
 * actually produced, even when an error is also returned.
 */
static int
readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
    enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
    int *retval)
{
	vnode_t vp;
	uio_t auio;
	int error;
	struct nameidata nd;
	char uio_buf[UIO_SIZEOF(1)];

	/* *retval is an int; reject buffers whose size cannot fit. */
	if (bufsize > INT32_MAX) {
		return EINVAL;
	}

	/* NOFOLLOW: we want the symlink itself, not its target. */
	NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
	    seg, path, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Single-iovec uio backed by the on-stack buffer. */
	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, buf, bufsize);
	if (vp->v_type != VLNK) {
		error = EINVAL;
	} else {
#if CONFIG_MACF
		error = mac_vnode_check_readlink(ctx, vp);
#endif
		/* MAC check (if built), then kauth, then the FS read. */
		if (error == 0) {
			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
			    ctx);
		}
		if (error == 0) {
			error = VNOP_READLINK(vp, auio, ctx);
		}
	}
	vnode_put(vp);

	/* Bytes consumed from the caller's buffer = link length copied. */
	*retval = (int)(bufsize - uio_resid(auio));
	return error;
}
6750
6751 int
6752 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6753 {
6754 enum uio_seg procseg;
6755
6756 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6757 return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6758 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6759 uap->count, procseg, retval);
6760 }
6761
6762 int
6763 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6764 {
6765 enum uio_seg procseg;
6766
6767 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6768 return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6769 procseg, uap->buf, uap->bufsize, procseg, retval);
6770 }
6771
/*
 * Change file flags, the deep inner layer.
 *
 * Runs the MAC check (if built), authorizes the attribute change while
 * disregarding immutability (so immutable flags can be cleared), then
 * applies the change via the caller-supplied setattr callback and fires
 * the MAC notification on success.  Does not take or drop any vnode
 * references; the caller owns the iocount on `vp'.
 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
6813
6814 /*
6815 * Change file flags.
6816 *
6817 * NOTE: this will vnode_put() `vp'
6818 */
6819 static int
6820 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6821 {
6822 struct vnode_attr va;
6823 int error;
6824
6825 VATTR_INIT(&va);
6826 VATTR_SET(&va, va_flags, flags);
6827
6828 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6829 vnode_put(vp);
6830
6831 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6832 error = ENOTSUP;
6833 }
6834
6835 return error;
6836 }
6837
6838 /*
6839 * Change flags of a file given a path name.
6840 */
6841 /* ARGSUSED */
6842 int
6843 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6844 {
6845 vnode_t vp;
6846 vfs_context_t ctx = vfs_context_current();
6847 int error;
6848 struct nameidata nd;
6849
6850 AUDIT_ARG(fflags, uap->flags);
6851 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6852 UIO_USERSPACE, uap->path, ctx);
6853 error = namei(&nd);
6854 if (error) {
6855 return error;
6856 }
6857 vp = nd.ni_vp;
6858 nameidone(&nd);
6859
6860 /* we don't vnode_put() here because chflags1 does internally */
6861 error = chflags1(vp, uap->flags, ctx);
6862
6863 return error;
6864 }
6865
6866 /*
6867 * Change flags of a file given a file descriptor.
6868 */
6869 /* ARGSUSED */
6870 int
6871 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6872 {
6873 vnode_t vp;
6874 int error;
6875
6876 AUDIT_ARG(fd, uap->fd);
6877 AUDIT_ARG(fflags, uap->flags);
6878 if ((error = file_vnode(uap->fd, &vp))) {
6879 return error;
6880 }
6881
6882 if ((error = vnode_getwithref(vp))) {
6883 file_drop(uap->fd);
6884 return error;
6885 }
6886
6887 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6888
6889 /* we don't vnode_put() here because chflags1 does internally */
6890 error = chflags1(vp, uap->flags, vfs_context_current());
6891
6892 file_drop(uap->fd);
6893 return error;
6894 }
6895
/*
 * Change security information on a filesystem object.
 *
 * Returns:	0			Success
 *		EPERM			Operation not permitted
 *		vnode_authattr:???	[anything vnode_authattr can return]
 *		vnode_authorize:???	[anything vnode_authorize can return]
 *		vnode_setattr:???	[anything vnode_setattr can return]
 *
 * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
 *		translated to EPERM before being returned.
 *
 *		The MAC checks run first; only after the attribute change
 *		succeeds do the matching MAC notifications fire.
 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* One MAC check per attribute class being changed: mode/owner/ACL. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* POSIX: chmod/chown failures report EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Successful change: notify MAC policies of what actually happened. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
6975
6976
6977 /*
6978 * Change mode of a file given a path name.
6979 *
6980 * Returns: 0 Success
6981 * namei:??? [anything namei can return]
6982 * chmod_vnode:??? [anything chmod_vnode can return]
6983 */
6984 static int
6985 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6986 int fd, int flag, enum uio_seg segflg)
6987 {
6988 struct nameidata nd;
6989 int follow, error;
6990
/* AT_SYMLINK_NOFOLLOW selects lchmod-like behavior on the final component. */
6991 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6992 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6993 segflg, path, ctx);
/* nameiat() resolves relative to fd (or cwd for AT_FDCWD) and returns an iocounted ni_vp. */
6994 if ((error = nameiat(&nd, fd))) {
6995 return error;
6996 }
6997 error = chmod_vnode(ctx, nd.ni_vp, vap);
6998 vnode_put(nd.ni_vp);
6999 nameidone(&nd);
7000 return error;
7001 }
7002
7003 /*
7004 * chmod_extended: Change the mode of a file given a path name; with extended
7005 * argument list (including extended security (ACL)).
7006 *
7007 * Parameters: p Process requesting the open
7008 * uap User argument descriptor (see below)
7009 * retval (ignored)
7010 *
7011 * Indirect: uap->path Path to object (same as 'chmod')
7012 * uap->uid UID to set
7013 * uap->gid GID to set
7014 * uap->mode File mode to set (same as 'chmod')
7015 * uap->xsecurity ACL to set (or delete)
7016 *
7017 * Returns: 0 Success
7018 * !0 errno value
7019 *
7020 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7021 *
7022 * XXX: We should enumerate the possible errno values here, and where
7023 * in the code they originated.
7024 */
7025 int
7026 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7027 {
7028 int error;
7029 struct vnode_attr va;
7030 kauth_filesec_t xsecdst;
7031
7032 AUDIT_ARG(owner, uap->uid, uap->gid);
7033
/* Only attributes the caller actually supplied are marked active in va. */
7034 VATTR_INIT(&va);
7035 if (uap->mode != -1) {
7036 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7037 }
7038 if (uap->uid != KAUTH_UID_NONE) {
7039 VATTR_SET(&va, va_uid, uap->uid);
7040 }
7041 if (uap->gid != KAUTH_GID_NONE) {
7042 VATTR_SET(&va, va_gid, uap->gid);
7043 }
7044
/*
 * xsecurity sentinel handling: (void *)1 means "remove the ACL"
 * (va_acl set active with a NULL value); NULL means "leave ACL alone";
 * anything else is a user pointer to a filesec to copy in.
 */
7045 xsecdst = NULL;
7046 switch (uap->xsecurity) {
7047 /* explicit remove request */
7048 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7049 VATTR_SET(&va, va_acl, NULL);
7050 break;
7051 /* not being set */
7052 case USER_ADDR_NULL:
7053 break;
7054 default:
7055 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
7056 return error;
7057 }
/* va_acl points into xsecdst; xsecdst must stay alive until after chmodat(). */
7058 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7059 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
7060 }
7061
7062 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7063 UIO_USERSPACE);
7064
7065 if (xsecdst != NULL) {
7066 kauth_filesec_free(xsecdst);
7067 }
7068 return error;
7069 }
7070
7071 /*
7072 * Returns: 0 Success
7073 * chmodat:??? [anything chmodat can return]
7074 */
7075 static int
7076 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7077 int flag, enum uio_seg segflg)
7078 {
7079 struct vnode_attr va;
7080
/* Wrap a plain mode into a vnode_attr; non-permission bits are masked off. */
7081 VATTR_INIT(&va);
7082 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7083
7084 return chmodat(ctx, path, &va, fd, flag, segflg);
7085 }
7086
7087 int
7088 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7089 {
/* chmod(2): fchmodat with cwd-relative lookup, symlinks followed. */
7090 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7091 AT_FDCWD, 0, UIO_USERSPACE);
7092 }
7093
7094 int
7095 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7096 {
/* AT_SYMLINK_NOFOLLOW is the only flag accepted; anything else is EINVAL. */
7097 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7098 return EINVAL;
7099 }
7100
7101 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7102 uap->fd, uap->flag, UIO_USERSPACE);
7103 }
7104
7105 /*
7106 * Change mode of a file given a file descriptor.
7107 */
7108 static int
7109 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7110 {
7111 vnode_t vp;
7112 int error;
7113
7114 AUDIT_ARG(fd, fd);
7115
/* Map fd -> vnode; file_vnode() takes a file reference released via file_drop(). */
7116 if ((error = file_vnode(fd, &vp)) != 0) {
7117 return error;
7118 }
/* Take an iocount on the vnode for the duration of the attribute change. */
7119 if ((error = vnode_getwithref(vp)) != 0) {
7120 file_drop(fd);
7121 return error;
7122 }
7123 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7124
7125 error = chmod_vnode(vfs_context_current(), vp, vap);
7126 (void)vnode_put(vp);
7127 file_drop(fd);
7128
7129 return error;
7130 }
7131
7132 /*
7133 * fchmod_extended: Change mode of a file given a file descriptor; with
7134 * extended argument list (including extended security (ACL)).
7135 *
7136 * Parameters: p Process requesting to change file mode
7137 * uap User argument descriptor (see below)
7138 * retval (ignored)
7139 *
7140 * Indirect: uap->mode File mode to set (same as 'chmod')
7141 * uap->uid UID to set
7142 * uap->gid GID to set
7143 * uap->xsecurity ACL to set (or delete)
7144 * uap->fd File descriptor of file to change mode
7145 *
7146 * Returns: 0 Success
7147 * !0 errno value
7148 *
7149 */
7150 int
7151 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7152 {
7153 int error;
7154 struct vnode_attr va;
7155 kauth_filesec_t xsecdst;
7156
7157 AUDIT_ARG(owner, uap->uid, uap->gid);
7158
7159 VATTR_INIT(&va);
7160 if (uap->mode != -1) {
7161 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7162 }
7163 if (uap->uid != KAUTH_UID_NONE) {
7164 VATTR_SET(&va, va_uid, uap->uid);
7165 }
7166 if (uap->gid != KAUTH_GID_NONE) {
7167 VATTR_SET(&va, va_gid, uap->gid);
7168 }
7169
/*
 * xsecurity sentinels here differ from chmod_extended(): both NULL and
 * (void *)1 request ACL removal, while -1 means "not being set".
 */
7170 xsecdst = NULL;
7171 switch (uap->xsecurity) {
7172 case USER_ADDR_NULL:
7173 VATTR_SET(&va, va_acl, NULL);
7174 break;
7175 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7176 VATTR_SET(&va, va_acl, NULL);
7177 break;
7178 /* not being set */
7179 case CAST_USER_ADDR_T(-1):
7180 break;
7181 default:
7182 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
7183 return error;
7184 }
/* va_acl points into xsecdst; freed below only in the default case. */
7185 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7186 }
7187
7188 error = fchmod1(p, uap->fd, &va);
7189
7190
7191 switch (uap->xsecurity) {
7192 case USER_ADDR_NULL:
7193 case CAST_USER_ADDR_T(-1):
7194 break;
7195 default:
7196 if (xsecdst != NULL) {
7197 kauth_filesec_free(xsecdst);
7198 }
7199 }
7200 return error;
7201 }
7202
7203 int
7204 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7205 {
7206 struct vnode_attr va;
7207
/* fchmod(2): only the permission bits are settable through this path. */
7208 VATTR_INIT(&va);
7209 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7210
7211 return fchmod1(p, uap->fd, &va);
7212 }
7213
7214
7215 /*
7216 * Set ownership given a path name.
7217 */
7218 /* ARGSUSED */
7219 static int
7220 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
7221 gid_t gid, int flag, enum uio_seg segflg)
7222 {
7223 vnode_t vp;
7224 struct vnode_attr va;
7225 int error;
7226 struct nameidata nd;
7227 int follow;
7228 kauth_action_t action;
7229
7230 AUDIT_ARG(owner, uid, gid);
7231
7232 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7233 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
7234 path, ctx);
7235 error = nameiat(&nd, fd);
7236 if (error) {
7237 return error;
7238 }
/* ni_vp comes back iocounted; released at the end via vnode_put(). */
7239 vp = nd.ni_vp;
7240
7241 nameidone(&nd);
7242
/* VNOVAL means "don't change" for each of uid/gid. */
7243 VATTR_INIT(&va);
7244 if (uid != (uid_t)VNOVAL) {
7245 VATTR_SET(&va, va_uid, uid);
7246 }
7247 if (gid != (gid_t)VNOVAL) {
7248 VATTR_SET(&va, va_gid, gid);
7249 }
7250
7251 #if CONFIG_MACF
7252 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
7253 if (error) {
7254 goto out;
7255 }
7256 #endif
7257
7258 /* preflight and authorize attribute changes */
7259 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7260 goto out;
7261 }
7262 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7263 goto out;
7264 }
7265 error = vnode_setattr(vp, &va, ctx);
7266
7267 #if CONFIG_MACF
7268 if (error == 0) {
7269 mac_vnode_notify_setowner(ctx, vp, uid, gid);
7270 }
7271 #endif
7272
7273 out:
7274 /*
7275 * EACCES is only allowed from namei(); permissions failure should
7276 * return EPERM, so we need to translate the error code.
7277 */
7278 if (error == EACCES) {
7279 error = EPERM;
7280 }
7281
7282 vnode_put(vp);
7283 return error;
7284 }
7285
7286 int
7287 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7288 {
/* chown(2): cwd-relative, symlinks followed. */
7289 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7290 uap->uid, uap->gid, 0, UIO_USERSPACE);
7291 }
7292
7293 int
7294 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7295 {
/* lchown(2): same as chown but operates on the symlink itself. */
7296 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7297 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7298 }
7299
7300 int
7301 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7302 {
/* AT_SYMLINK_NOFOLLOW is the only flag accepted; anything else is EINVAL. */
7303 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7304 return EINVAL;
7305 }
7306
7307 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7308 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7309 }
7310
7311 /*
7312 * Set ownership given a file descriptor.
7313 */
7314 /* ARGSUSED */
7315 int
7316 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
7317 {
7318 struct vnode_attr va;
7319 vfs_context_t ctx = vfs_context_current();
7320 vnode_t vp;
7321 int error;
7322 kauth_action_t action;
7323
7324 AUDIT_ARG(owner, uap->uid, uap->gid);
7325 AUDIT_ARG(fd, uap->fd);
7326
/* fd -> vnode plus iocount; both references dropped on every exit path. */
7327 if ((error = file_vnode(uap->fd, &vp))) {
7328 return error;
7329 }
7330
7331 if ((error = vnode_getwithref(vp))) {
7332 file_drop(uap->fd);
7333 return error;
7334 }
7335 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7336
/* VNOVAL means "don't change" for each of uid/gid. */
7337 VATTR_INIT(&va);
7338 if (uap->uid != VNOVAL) {
7339 VATTR_SET(&va, va_uid, uap->uid);
7340 }
7341 if (uap->gid != VNOVAL) {
7342 VATTR_SET(&va, va_gid, uap->gid);
7343 }
7344
7345 #if NAMEDSTREAMS
7346 /* chown calls are not allowed for resource forks. */
7347 if (vp->v_flag & VISNAMEDSTREAM) {
7348 error = EPERM;
7349 goto out;
7350 }
7351 #endif
7352
7353 #if CONFIG_MACF
7354 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
7355 if (error) {
7356 goto out;
7357 }
7358 #endif
7359
7360 /* preflight and authorize attribute changes */
7361 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7362 goto out;
7363 }
7364 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7365 if (error == EACCES) {
/* POSIX: permission failure on ownership change reports EPERM. */
7366 error = EPERM;
7367 }
7368 goto out;
7369 }
7370 error = vnode_setattr(vp, &va, ctx);
7371
7372 #if CONFIG_MACF
7373 if (error == 0) {
7374 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
7375 }
7376 #endif
7377
7378 out:
7379 (void)vnode_put(vp);
7380 file_drop(uap->fd);
7381 return error;
7382 }
7383
7384 static int
7385 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7386 {
7387 int error;
7388
/*
 * Fill tsp[0] (access) and tsp[1] (modify) from the user-supplied
 * timeval pair at usrtvp, or from the current time if usrtvp is NULL.
 * The copyin layout depends on the calling process's word size.
 */
7389 if (usrtvp == USER_ADDR_NULL) {
7390 struct timeval old_tv;
7391 /* XXX Y2038 bug because of microtime argument */
7392 microtime(&old_tv);
7393 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7394 tsp[1] = tsp[0];
7395 } else {
7396 if (IS_64BIT_PROCESS(current_proc())) {
7397 struct user64_timeval tv[2];
7398 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7399 if (error) {
7400 return error;
7401 }
7402 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
7403 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
7404 } else {
7405 struct user32_timeval tv[2];
7406 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7407 if (error) {
7408 return error;
7409 }
7410 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7411 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7412 }
7413 }
7414 return 0;
7415 }
7416
7417 static int
7418 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
7419 int nullflag)
7420 {
7421 int error;
7422 struct vnode_attr va;
7423 kauth_action_t action;
7424
/*
 * Apply access/modify times (ts[0]/ts[1]) to an iocounted vnode.
 * nullflag is set when the caller passed no times (utimes(path, NULL)),
 * which relaxes the auth requirement via VA_UTIMES_NULL.
 */
7425 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7426
7427 VATTR_INIT(&va);
7428 VATTR_SET(&va, va_access_time, ts[0]);
7429 VATTR_SET(&va, va_modify_time, ts[1]);
7430 if (nullflag) {
7431 va.va_vaflags |= VA_UTIMES_NULL;
7432 }
7433
7434 #if NAMEDSTREAMS
7435 /* utimes calls are not allowed for resource forks. */
7436 if (vp->v_flag & VISNAMEDSTREAM) {
7437 error = EPERM;
7438 goto out;
7439 }
7440 #endif
7441
7442 #if CONFIG_MACF
7443 error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
7444 if (error) {
7445 goto out;
7446 }
7447 #endif
7448 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
/* Explicit times: permission failures become EPERM, per POSIX. */
7449 if (!nullflag && error == EACCES) {
7450 error = EPERM;
7451 }
7452 goto out;
7453 }
7454
7455 /* since we may not need to auth anything, check here */
7456 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7457 if (!nullflag && error == EACCES) {
7458 error = EPERM;
7459 }
7460 goto out;
7461 }
7462 error = vnode_setattr(vp, &va, ctx);
7463
7464 #if CONFIG_MACF
7465 if (error == 0) {
7466 mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
7467 }
7468 #endif
7469
7470 out:
7471 return error;
7472 }
7473
7474 /*
7475 * Set the access and modification times of a file.
7476 */
7477 /* ARGSUSED */
7478 int
7479 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
7480 {
7481 struct timespec ts[2];
7482 user_addr_t usrtvp;
7483 int error;
7484 struct nameidata nd;
7485 vfs_context_t ctx = vfs_context_current();
7486
7487 /*
7488 * AUDIT: Needed to change the order of operations to do the
7489 * name lookup first because auditing wants the path.
7490 */
7491 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
7492 UIO_USERSPACE, uap->path, ctx);
7493 error = namei(&nd);
7494 if (error) {
7495 return error;
7496 }
/* nameidone() here; the iocount on ni_vp is still held until the final vnode_put(). */
7497 nameidone(&nd);
7498
7499 /*
7500 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
7501 * the current time instead.
7502 */
7503 usrtvp = uap->tptr;
7504 if ((error = getutimes(usrtvp, ts)) != 0) {
7505 goto out;
7506 }
7507
/* Last argument flags the "touch to now" (NULL tptr) case for setutimes(). */
7508 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
7509
7510 out:
7511 vnode_put(nd.ni_vp);
7512 return error;
7513 }
7514
7515 /*
7516 * Set the access and modification times of a file.
7517 */
7518 /* ARGSUSED */
7519 int
7520 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
7521 {
7522 struct timespec ts[2];
7523 vnode_t vp;
7524 user_addr_t usrtvp;
7525 int error;
7526
7527 AUDIT_ARG(fd, uap->fd);
/* Copy in (or synthesize) the times before touching the fd at all. */
7528 usrtvp = uap->tptr;
7529 if ((error = getutimes(usrtvp, ts)) != 0) {
7530 return error;
7531 }
7532 if ((error = file_vnode(uap->fd, &vp)) != 0) {
7533 return error;
7534 }
7535 if ((error = vnode_getwithref(vp))) {
7536 file_drop(uap->fd);
7537 return error;
7538 }
7539
7540 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
7541 vnode_put(vp);
7542 file_drop(uap->fd);
7543 return error;
7544 }
7545
7546 /*
7547 * Truncate a file given its path name.
7548 */
7549 /* ARGSUSED */
7550 int
7551 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
7552 {
7553 vnode_t vp;
7554 struct vnode_attr va;
7555 vfs_context_t ctx = vfs_context_current();
7556 int error;
7557 struct nameidata nd;
7558 kauth_action_t action;
7559 rlim_t fsize_limit;
7560
7561 if (uap->length < 0) {
7562 return EINVAL;
7563 }
7564
/* Enforce RLIMIT_FSIZE: signal SIGXFSZ and fail with EFBIG, per POSIX. */
7565 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE, TRUE);
7566 if ((rlim_t)uap->length > fsize_limit) {
7567 psignal(p, SIGXFSZ);
7568 return EFBIG;
7569 }
7570
7571 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
7572 UIO_USERSPACE, uap->path, ctx);
7573 if ((error = namei(&nd))) {
7574 return error;
7575 }
7576 vp = nd.ni_vp;
7577
7578 nameidone(&nd);
7579
/* Truncation is expressed as a data-size attribute change. */
7580 VATTR_INIT(&va);
7581 VATTR_SET(&va, va_data_size, uap->length);
7582
7583 #if CONFIG_MACF
/* NOCRED: path-based truncate has no file credential, unlike ftruncate. */
7584 error = mac_vnode_check_truncate(ctx, NOCRED, vp);
7585 if (error) {
7586 goto out;
7587 }
7588 #endif
7589
7590 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7591 goto out;
7592 }
7593 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7594 goto out;
7595 }
7596 error = vnode_setattr(vp, &va, ctx);
7597
7598 #if CONFIG_MACF
7599 if (error == 0) {
7600 mac_vnode_notify_truncate(ctx, NOCRED, vp);
7601 }
7602 #endif
7603
7604 out:
7605 vnode_put(vp);
7606 return error;
7607 }
7608
7609 /*
7610 * Truncate a file given a file descriptor.
7611 */
7612 /* ARGSUSED */
7613 int
7614 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
7615 {
7616 vfs_context_t ctx = vfs_context_current();
7617 struct vnode_attr va;
7618 vnode_t vp;
7619 struct fileproc *fp;
7620 int error;
7621 int fd = uap->fd;
7622 rlim_t fsize_limit;
7623
7624 AUDIT_ARG(fd, uap->fd);
7625 if (uap->length < 0) {
7626 return EINVAL;
7627 }
7628
/* Enforce RLIMIT_FSIZE: signal SIGXFSZ and fail with EFBIG, per POSIX. */
7629 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE, TRUE);
7630 if ((rlim_t)uap->length > fsize_limit) {
7631 psignal(p, SIGXFSZ);
7632 return EFBIG;
7633 }
7634
7635 if ((error = fp_lookup(p, fd, &fp, 0))) {
7636 return error;
7637 }
7638
/* ftruncate also works on POSIX shared memory objects; vnodes continue below. */
7639 switch (FILEGLOB_DTYPE(fp->fp_glob)) {
7640 case DTYPE_PSXSHM:
7641 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
7642 goto out;
7643 case DTYPE_VNODE:
7644 break;
7645 default:
7646 error = EINVAL;
7647 goto out;
7648 }
7649
7650 vp = (vnode_t)fp->fp_glob->fg_data;
7651
/* The descriptor itself must be open for writing; no path-permission re-check. */
7652 if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
7653 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7654 error = EINVAL;
7655 goto out;
7656 }
7657
7658 if ((error = vnode_getwithref(vp)) != 0) {
7659 goto out;
7660 }
7661
7662 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7663
7664 #if CONFIG_MACF
/* Unlike truncate(), the check is made against the open file's credential. */
7665 error = mac_vnode_check_truncate(ctx,
7666 fp->fp_glob->fg_cred, vp);
7667 if (error) {
7668 (void)vnode_put(vp);
7669 goto out;
7670 }
7671 #endif
7672 VATTR_INIT(&va);
7673 VATTR_SET(&va, va_data_size, uap->length);
7674 error = vnode_setattr(vp, &va, ctx);
7675
7676 #if CONFIG_MACF
7677 if (error == 0) {
7678 mac_vnode_notify_truncate(ctx, fp->fp_glob->fg_cred, vp);
7679 }
7680 #endif
7681
7682 (void)vnode_put(vp);
7683 out:
7684 file_drop(fd);
7685 return error;
7686 }
7687
7688
7689 /*
7690 * Sync an open file with synchronized I/O _file_ integrity completion
7691 */
7692 /* ARGSUSED */
7693 int
7694 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
7695 {
/* Cancellation point (MNT_WAIT = full file-integrity sync). */
7696 __pthread_testcancel(1);
7697 return fsync_common(p, uap, MNT_WAIT);
7698 }
7699
7700
7701 /*
7702 * Sync an open file with synchronized I/O _file_ integrity completion
7703 *
7704 * Notes: This is a legacy support function that does not test for
7705 * thread cancellation points.
7706 */
7707 /* ARGSUSED */
7708 int
7709 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
7710 {
/* Same as fsync() but without the thread-cancellation test. */
7711 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
7712 }
7713
7714
7715 /*
7716 * Sync an open file with synchronized I/O _data_ integrity completion
7717 */
7718 /* ARGSUSED */
7719 int
7720 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
7721 {
/* Cancellation point (MNT_DWAIT = data-integrity-only sync). */
7722 __pthread_testcancel(1);
7723 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
7724 }
7725
7726
7727 /*
7728 * fsync_common
7729 *
7730 * Common fsync code to support both synchronized I/O file integrity completion
7731 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7732 *
7733 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7734 * will only guarantee that the file data contents are retrievable. If
7735 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
7736 * includes additional metadata unnecessary for retrieving the file data
7737 * contents, such as atime, mtime, ctime, etc., also be committed to stable
7738 * storage.
7739 *
7740 * Parameters: p The process
7741 * uap->fd The descriptor to synchronize
7742 * flags The data integrity flags
7743 *
7744 * Returns: int Success
7745 * fp_getfvp:EBADF Bad file descriptor
7746 * fp_getfvp:ENOTSUP fd does not refer to a vnode
7747 * VNOP_FSYNC:??? unspecified
7748 *
7749 * Notes: We use struct fsync_args because it is a short name, and all
7750 * caller argument structures are otherwise identical.
7751 */
7752 static int
7753 fsync_common(proc_t p, struct fsync_args *uap, int flags)
7754 {
7755 vnode_t vp;
7756 struct fileproc *fp;
7757 vfs_context_t ctx = vfs_context_current();
7758 int error;
7759
7760 AUDIT_ARG(fd, uap->fd);
7761
/* fd -> (fileproc, vnode); the file reference is held across the sync. */
7762 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
7763 return error;
7764 }
7765 if ((error = vnode_getwithref(vp))) {
7766 file_drop(uap->fd);
7767 return error;
7768 }
7769
7770 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7771
7772 error = VNOP_FSYNC(vp, flags, ctx);
7773
7774 #if NAMEDRSRCFORK
7775 /* Sync resource fork shadow file if necessary. */
7776 if ((error == 0) &&
7777 (vp->v_flag & VISNAMEDSTREAM) &&
7778 (vp->v_parent != NULLVP) &&
7779 vnode_isshadow(vp) &&
7780 (fp->fp_glob->fg_flag & FWASWRITTEN)) {
7781 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
7782 }
7783 #endif
7784
7785 (void)vnode_put(vp);
7786 file_drop(uap->fd);
7787 return error;
7788 }
7789
7790 /*
7791 * Duplicate files. Source must be a file, target must be a file or
7792 * must not exist.
7793 *
7794 * XXX Copyfile authorisation checking is woefully inadequate, and will not
7795 * perform inheritance correctly.
7796 */
7797 /* ARGSUSED */
7798 int
7799 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
7800 {
7801 vnode_t tvp, fvp, tdvp, sdvp;
7802 struct nameidata fromnd, tond;
7803 int error;
7804 vfs_context_t ctx = vfs_context_current();
7805 #if CONFIG_MACF
7806 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
7807 struct vnode_attr va;
7808 #endif
7809
7810 /* Check that the flags are valid. */
7811
7812 if (uap->flags & ~CPF_MASK) {
7813 return EINVAL;
7814 }
7815
/* Resolve the source; fvp is iocounted until out1. */
7816 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
7817 UIO_USERSPACE, uap->from, ctx);
7818 if ((error = namei(&fromnd))) {
7819 return error;
7820 }
7821 fvp = fromnd.ni_vp;
7822
/*
 * CREATE lookup for the destination: tdvp (parent) always comes back
 * iocounted; tvp is non-NULL only if the target already exists.
 * SAVESTART keeps ni_startdir for the put at "out".
 */
7823 NDINIT(&tond, CREATE, OP_LINK,
7824 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7825 UIO_USERSPACE, uap->to, ctx);
7826 if ((error = namei(&tond))) {
7827 goto out1;
7828 }
7829 tdvp = tond.ni_dvp;
7830 tvp = tond.ni_vp;
7831
/* An existing target is only acceptable with CPF_OVERWRITE. */
7832 if (tvp != NULL) {
7833 if (!(uap->flags & CPF_OVERWRITE)) {
7834 error = EEXIST;
7835 goto out;
7836 }
7837 }
7838
7839 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
7840 error = EISDIR;
7841 goto out;
7842 }
7843
7844 /* This calls existing MAC hooks for open */
7845 if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
7846 NULL))) {
7847 goto out;
7848 }
7849
7850 if (tvp) {
7851 /*
7852 * See unlinkat_internal for an explanation of the potential
7853 * ENOENT from the MAC hook but the gist is that the MAC hook
7854 * can fail because vn_getpath isn't able to return the full
7855 * path. We choose to ignore this failure.
7856 */
7857 error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
7858 if (error && error != ENOENT) {
7859 goto out;
7860 }
7861 error = 0;
7862 }
7863
7864 #if CONFIG_MACF
7865 VATTR_INIT(&va);
7866 VATTR_SET(&va, va_type, fvp->v_type);
7867 /* Mask off all but regular access permissions */
7868 VATTR_SET(&va, va_mode,
7869 ((((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
7870 error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
7871 if (error) {
7872 goto out;
7873 }
7874 #endif /* CONFIG_MACF */
7875
7876 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
7877 goto out;
7878 }
7879
/* Copying a directory onto itself (source == target's parent) is invalid. */
7880 if (fvp == tdvp) {
7881 error = EINVAL;
7882 }
7883 /*
7884 * If source is the same as the destination (that is the
7885 * same inode number) then there is nothing to do.
7886 * (fixed to have POSIX semantics - CSM 3/2/98)
7887 */
7888 if (fvp == tvp) {
/* -1 is an internal "silent success" sentinel, translated to 0 below. */
7889 error = -1;
7890 }
7891 if (!error) {
7892 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
7893 }
7894 out:
7895 sdvp = tond.ni_startdir;
7896 /*
7897 * nameidone has to happen before we vnode_put(tdvp)
7898 * since it may need to release the fs_nodelock on the tdvp
7899 */
7900 nameidone(&tond);
7901
7902 if (tvp) {
7903 vnode_put(tvp);
7904 }
7905 vnode_put(tdvp);
7906 vnode_put(sdvp);
7907 out1:
7908 vnode_put(fvp);
7909
7910 nameidone(&fromnd);
7911
7912 if (error == -1) {
7913 return 0;
7914 }
7915 return error;
7916 }
7917
7918 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7919
7920 /*
7921 * Helper function for doing clones. The caller is expected to provide an
7922 * iocounted source vnode and release it.
7923 */
7924 static int
7925 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
7926 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
7927 {
7928 vnode_t tvp, tdvp;
7929 struct nameidata tond;
7930 int error;
7931 int follow;
7932 boolean_t free_src_acl;
7933 boolean_t attr_cleanup;
7934 enum vtype v_type;
7935 kauth_action_t action;
7936 struct componentname *cnp;
7937 uint32_t defaulted;
7938 struct vnode_attr va;
7939 struct vnode_attr nva;
7940 uint32_t vnop_flags;
7941
/* Only regular files, symlinks, and (non-root, non-mountpoint) directories clone. */
7942 v_type = vnode_vtype(fvp);
7943 switch (v_type) {
7944 case VLNK:
7945 /* FALLTHRU */
7946 case VREG:
7947 action = KAUTH_VNODE_ADD_FILE;
7948 break;
7949 case VDIR:
7950 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
7951 fvp->v_mountedhere) {
7952 return EINVAL;
7953 }
7954 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
7955 break;
7956 default:
7957 return EINVAL;
7958 }
7959
7960 AUDIT_ARG(fd2, dst_dirfd);
7961 AUDIT_ARG(value32, flags);
7962
/* CREATE lookup for the destination relative to dst_dirfd. */
7963 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7964 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
7965 UIO_USERSPACE, dst, ctx);
7966 if ((error = nameiat(&tond, dst_dirfd))) {
7967 return error;
7968 }
7969 cnp = &tond.ni_cnd;
7970 tdvp = tond.ni_dvp;
7971 tvp = tond.ni_vp;
7972
7973 free_src_acl = FALSE;
7974 attr_cleanup = FALSE;
7975
/* Unlike copyfile(), the clone target must not already exist. */
7976 if (tvp != NULL) {
7977 error = EEXIST;
7978 goto out;
7979 }
7980
/* Clones cannot span filesystems. */
7981 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
7982 error = EXDEV;
7983 goto out;
7984 }
7985
7986 #if CONFIG_MACF
7987 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
7988 goto out;
7989 }
7990 #endif
7991 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
7992 goto out;
7993 }
7994
/* fclonefileat() already authorised data-read via the open fd; skip re-checking it. */
7995 action = KAUTH_VNODE_GENERIC_READ_BITS;
7996 if (data_read_authorised) {
7997 action &= ~KAUTH_VNODE_READ_DATA;
7998 }
7999 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
8000 goto out;
8001 }
8002
8003 /*
8004 * certain attributes may need to be changed from the source, we ask for
8005 * those here with the exception of source file's ACL. The clone file
8006 * will inherit the target directory's ACL.
8007 */
8008 VATTR_INIT(&va);
8009 VATTR_WANTED(&va, va_uid);
8010 VATTR_WANTED(&va, va_gid);
8011 VATTR_WANTED(&va, va_mode);
8012 VATTR_WANTED(&va, va_flags);
8013
8014 if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
8015 goto out;
8016 }
8017
8018 VATTR_INIT(&nva);
8019 VATTR_SET(&nva, va_type, v_type);
/* If the getattr returned an ACL anyway, we own it and must free it at "out". */
8020 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
8021 VATTR_SET(&nva, va_acl, va.va_acl);
8022 free_src_acl = TRUE;
8023 }
8024
8025 /* Handle ACL inheritance, initialize vap. */
8026 if (v_type == VLNK) {
8027 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
8028 } else {
8029 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
8030 if (error) {
8031 goto out;
8032 }
/* vn_attribute_prepare succeeded: must be paired with vn_attribute_cleanup. */
8033 attr_cleanup = TRUE;
8034 }
8035
8036 vnop_flags = VNODE_CLONEFILE_DEFAULT;
8037 /*
8038 * We've got initial values for all security parameters,
8039 * If we are superuser, then we can change owners to be the
8040 * same as the source. Both superuser and the owner have default
8041 * WRITE_SECURITY privileges so all other fields can be taken
8042 * from source as well.
8043 */
8044 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8045 if (VATTR_IS_SUPPORTED(&va, va_uid)) {
8046 VATTR_SET(&nva, va_uid, va.va_uid);
8047 }
8048 if (VATTR_IS_SUPPORTED(&va, va_gid)) {
8049 VATTR_SET(&nva, va_gid, va.va_gid);
8050 }
8051 } else {
8052 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8053 }
8054
8055 if (VATTR_IS_SUPPORTED(&va, va_mode)) {
8056 VATTR_SET(&nva, va_mode, va.va_mode);
8057 }
8058 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
/* DATAVAULT/RESTRICTED flags come from the destination, not the source. */
8059 VATTR_SET(&nva, va_flags,
8060 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8061 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8062 }
8063
8064 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
8065
/* Post-create fixups: unsupported attrs, MAC label, identity, fsevents. */
8066 if (!error && tvp) {
8067 int update_flags = 0;
8068 #if CONFIG_FSE
8069 int fsevent;
8070 #endif /* CONFIG_FSE */
8071
8072 /*
8073 * If some of the requested attributes weren't handled by the
8074 * VNOP, use our fallback code.
8075 */
8076 if (!VATTR_ALL_SUPPORTED(&nva)) {
8077 (void)vnode_setattr_fallback(tvp, &nva, ctx);
8078 }
8079
8080 #if CONFIG_MACF
8081 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
8082 VNODE_LABEL_CREATE, ctx);
8083 #endif
8084
8085 // Make sure the name & parent pointers are hooked up
8086 if (tvp->v_name == NULL) {
8087 update_flags |= VNODE_UPDATE_NAME;
8088 }
8089 if (tvp->v_parent == NULLVP) {
8090 update_flags |= VNODE_UPDATE_PARENT;
8091 }
8092
8093 if (update_flags) {
8094 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
8095 cnp->cn_namelen, cnp->cn_hash, update_flags);
8096 }
8097
8098 #if CONFIG_FSE
8099 switch (vnode_vtype(tvp)) {
8100 case VLNK:
8101 /* FALLTHRU */
8102 case VREG:
8103 fsevent = FSE_CREATE_FILE;
8104 break;
8105 case VDIR:
8106 fsevent = FSE_CREATE_DIR;
8107 break;
8108 default:
8109 goto out;
8110 }
8111
8112 if (need_fsevent(fsevent, tvp)) {
8113 /*
8114 * The following is a sequence of three explicit events.
8115 * A pair of FSE_CLONE events representing the source and destination
8116 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8117 * fseventsd may coalesce the destination clone and create events
8118 * into a single event resulting in the following sequence for a client
8119 * FSE_CLONE (src)
8120 * FSE_CLONE | FSE_CREATE (dst)
8121 */
8122 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8123 FSE_ARG_DONE);
8124 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
8125 FSE_ARG_DONE);
8126 }
8127 #endif /* CONFIG_FSE */
8128 }
8129
8130 out:
8131 if (attr_cleanup) {
8132 vn_attribute_cleanup(&nva, defaulted);
8133 }
8134 if (free_src_acl && va.va_acl) {
8135 kauth_acl_free(va.va_acl);
8136 }
8137 nameidone(&tond);
/* tvp may have been created by VNOP_CLONEFILE; drop it if present. */
8138 if (tvp) {
8139 vnode_put(tvp);
8140 }
8141 vnode_put(tdvp);
8142 return error;
8143 }
8144
8145 /*
8146 * clone files or directories, target must not exist.
8147 */
8148 /* ARGSUSED */
8149 int
8150 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8151 __unused int32_t *retval)
8152 {
8153 vnode_t fvp;
8154 struct nameidata fromnd;
8155 int follow;
8156 int error;
8157 vfs_context_t ctx = vfs_context_current();
8158
8159 /* Check that the flags are valid. */
8160 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8161 return EINVAL;
8162 }
8163
8164 AUDIT_ARG(fd, uap->src_dirfd);
8165
/* Resolve the source path relative to src_dirfd; fvp comes back iocounted. */
8166 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8167 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8168 UIO_USERSPACE, uap->src, ctx);
8169 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8170 return error;
8171 }
8172
8173 fvp = fromnd.ni_vp;
8174 nameidone(&fromnd);
8175
/* FALSE: data-read on the source has not been authorised yet (path-based entry). */
8176 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8177 uap->flags, ctx);
8178
8179 vnode_put(fvp);
8180 return error;
8181 }
8182
8183 int
8184 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8185 __unused int32_t *retval)
8186 {
8187 vnode_t fvp;
8188 struct fileproc *fp;
8189 int error;
8190 vfs_context_t ctx = vfs_context_current();
8191
8192 /* Check that the flags are valid. */
8193 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8194 return EINVAL;
8195 }
8196
8197 AUDIT_ARG(fd, uap->src_fd);
8198 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8199 if (error) {
8200 return error;
8201 }
8202
/* The source descriptor must be open for reading. */
8203 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8204 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8205 error = EBADF;
8206 goto out;
8207 }
8208
8209 if ((error = vnode_getwithref(fvp))) {
8210 goto out;
8211 }
8212
8213 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8214
/* TRUE: read-data already authorised at open time, so skip that kauth bit. */
8215 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8216 uap->flags, ctx);
8217
8218 vnode_put(fvp);
8219 out:
8220 file_drop(uap->src_fd);
8221 return error;
8222 }
8223
8224 static int
8225 rename_submounts_callback(mount_t mp, void *arg)
8226 {
8227 int error = 0;
8228 mount_t pmp = (mount_t)arg;
8229 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8230
8231 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8232 return 0;
8233 }
8234
8235 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8236 return 0;
8237 }
8238
8239 if ((error = vfs_busy(mp, LK_NOWAIT))) {
8240 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8241 return -1;
8242 }
8243
8244 int pathlen = MAXPATHLEN;
8245 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8246 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8247 }
8248
8249 vfs_unbusy(mp);
8250
8251 return error;
8252 }
8253
8254 /*
8255 * Rename files. Source and destination must either both be directories,
8256 * or both not be directories. If target is a directory, it must be empty.
8257 */
8258 /* ARGSUSED */
8259 static int
8260 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8261 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
8262 {
8263 if (flags & ~VFS_RENAME_FLAGS_MASK) {
8264 return EINVAL;
8265 }
8266
8267 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
8268 return EINVAL;
8269 }
8270
8271 vnode_t tvp, tdvp;
8272 vnode_t fvp, fdvp;
8273 vnode_t mnt_fvp;
8274 struct nameidata *fromnd, *tond;
8275 int error;
8276 int do_retry;
8277 int retry_count;
8278 int mntrename;
8279 int need_event;
8280 int need_kpath2;
8281 int has_listeners;
8282 const char *oname = NULL;
8283 char *from_name = NULL, *to_name = NULL;
8284 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8285 int from_len = 0, to_len = 0;
8286 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8287 int holding_mntlock;
8288 int vn_authorize_skipped;
8289 mount_t locked_mp = NULL;
8290 vnode_t oparent = NULLVP;
8291 #if CONFIG_FSE
8292 fse_info from_finfo, to_finfo;
8293 #endif
8294 int from_truncated = 0, to_truncated = 0;
8295 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8296 int batched = 0;
8297 struct vnode_attr *fvap, *tvap;
8298 int continuing = 0;
8299 /* carving out a chunk for structs that are too big to be on stack. */
8300 struct {
8301 struct nameidata from_node, to_node;
8302 struct vnode_attr fv_attr, tv_attr;
8303 } * __rename_data;
8304 __rename_data = kheap_alloc(KHEAP_TEMP, sizeof(*__rename_data), Z_WAITOK);
8305 fromnd = &__rename_data->from_node;
8306 tond = &__rename_data->to_node;
8307
8308 holding_mntlock = 0;
8309 do_retry = 0;
8310 retry_count = 0;
8311 retry:
8312 fvp = tvp = NULL;
8313 fdvp = tdvp = NULL;
8314 fvap = tvap = NULL;
8315 mnt_fvp = NULLVP;
8316 mntrename = FALSE;
8317 vn_authorize_skipped = FALSE;
8318
8319 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8320 segflg, from, ctx);
8321 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
8322
8323 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8324 segflg, to, ctx);
8325 tond->ni_flag = NAMEI_COMPOUNDRENAME;
8326
8327 continue_lookup:
8328 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8329 if ((error = nameiat(fromnd, fromfd))) {
8330 goto out1;
8331 }
8332 fdvp = fromnd->ni_dvp;
8333 fvp = fromnd->ni_vp;
8334
8335 if (fvp && fvp->v_type == VDIR) {
8336 tond->ni_cnd.cn_flags |= WILLBEDIR;
8337 }
8338 }
8339
8340 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8341 if ((error = nameiat(tond, tofd))) {
8342 /*
8343 * Translate error code for rename("dir1", "dir2/.").
8344 */
8345 if (error == EISDIR && fvp->v_type == VDIR) {
8346 error = EINVAL;
8347 }
8348 goto out1;
8349 }
8350 tdvp = tond->ni_dvp;
8351 tvp = tond->ni_vp;
8352 }
8353
8354 #if DEVELOPMENT || DEBUG
8355 /*
8356 * XXX VSWAP: Check for entitlements or special flag here
8357 * so we can restrict access appropriately.
8358 */
8359 #else /* DEVELOPMENT || DEBUG */
8360
8361 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8362 error = EPERM;
8363 goto out1;
8364 }
8365
8366 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8367 error = EPERM;
8368 goto out1;
8369 }
8370 #endif /* DEVELOPMENT || DEBUG */
8371
8372 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8373 error = ENOENT;
8374 goto out1;
8375 }
8376
8377 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8378 int32_t pval = 0;
8379 int err = 0;
8380
8381 /*
8382 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
8383 * has the same name as target iff the following conditions are met:
8384 * 1. the target file system is case insensitive
8385 * 2. source and target directories are the same
8386 * 3. source and target files are the same
8387 * 4. name only differs in case (determined by underlying filesystem)
8388 */
8389 if (fvp != tvp || fdvp != tdvp) {
8390 error = EEXIST;
8391 goto out1;
8392 }
8393
8394 /*
8395 * Assume that the target file system is case sensitive if
8396 * _PC_CASE_SENSITIVE selector isn't supported.
8397 */
8398 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
8399 if (err != 0 || pval != 0) {
8400 error = EEXIST;
8401 goto out1;
8402 }
8403 }
8404
8405 batched = vnode_compound_rename_available(fdvp);
8406
8407 #if CONFIG_FSE
8408 need_event = need_fsevent(FSE_RENAME, fdvp);
8409 if (need_event) {
8410 if (fvp) {
8411 get_fse_info(fvp, &from_finfo, ctx);
8412 } else {
8413 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8414 if (error) {
8415 goto out1;
8416 }
8417
8418 fvap = &__rename_data->fv_attr;
8419 }
8420
8421 if (tvp) {
8422 get_fse_info(tvp, &to_finfo, ctx);
8423 } else if (batched) {
8424 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8425 if (error) {
8426 goto out1;
8427 }
8428
8429 tvap = &__rename_data->tv_attr;
8430 }
8431 }
8432 #else
8433 need_event = 0;
8434 #endif /* CONFIG_FSE */
8435
8436 has_listeners = kauth_authorize_fileop_has_listeners();
8437
8438 need_kpath2 = 0;
8439 #if CONFIG_AUDIT
8440 if (AUDIT_RECORD_EXISTS()) {
8441 need_kpath2 = 1;
8442 }
8443 #endif
8444
8445 if (need_event || has_listeners) {
8446 if (from_name == NULL) {
8447 GET_PATH(from_name);
8448 }
8449
8450 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8451
8452 if (from_name_no_firmlink == NULL) {
8453 GET_PATH(from_name_no_firmlink);
8454 }
8455
8456 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8457 }
8458
8459 if (need_event || need_kpath2 || has_listeners) {
8460 if (to_name == NULL) {
8461 GET_PATH(to_name);
8462 }
8463
8464 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8465
8466 if (to_name_no_firmlink == NULL) {
8467 GET_PATH(to_name_no_firmlink);
8468 }
8469
8470 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8471 if (to_name && need_kpath2) {
8472 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8473 }
8474 }
8475 if (!fvp) {
8476 /*
8477 * Claim: this check will never reject a valid rename.
8478 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8479 * Suppose fdvp and tdvp are not on the same mount.
8480 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8481 * then you can't move it to within another dir on the same mountpoint.
8482 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8483 *
8484 * If this check passes, then we are safe to pass these vnodes to the same FS.
8485 */
8486 if (fdvp->v_mount != tdvp->v_mount) {
8487 error = EXDEV;
8488 goto out1;
8489 }
8490 goto skipped_lookup;
8491 }
8492
8493 /*
8494 * If the source and destination are the same (i.e. they're
8495 * links to the same vnode) and the target file system is
8496 * case sensitive, then there is nothing to do.
8497 *
8498 * XXX Come back to this.
8499 */
8500 if (fvp == tvp) {
8501 int pathconf_val;
8502
8503 /*
8504 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8505 * then assume that this file system is case sensitive.
8506 */
8507 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8508 pathconf_val != 0) {
8509 vn_authorize_skipped = TRUE;
8510 goto out1;
8511 }
8512 }
8513
8514 /*
8515 * Allow the renaming of mount points.
8516 * - target must not exist
8517 * - target must reside in the same directory as source
8518 * - union mounts cannot be renamed
8519 * - the root fs, and tightly-linked system volumes, cannot be renamed
8520 *
8521 * XXX Handle this in VFS after a continued lookup (if we missed
8522 * in the cache to start off)
8523 *
8524 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8525 * we'll skip past here. The file system is responsible for
8526 * checking that @tvp is not a descendent of @fvp and vice versa
8527 * so it should always return EINVAL if either @tvp or @fvp is the
8528 * root of a volume.
8529 */
8530 if ((fvp->v_flag & VROOT) &&
8531 (fvp->v_type == VDIR) &&
8532 (tvp == NULL) &&
8533 (fvp->v_mountedhere == NULL) &&
8534 (fdvp == tdvp) &&
8535 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8536 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8537 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8538 vnode_t coveredvp;
8539
8540 /* switch fvp to the covered vnode */
8541 coveredvp = fvp->v_mount->mnt_vnodecovered;
8542 if ((vnode_getwithref(coveredvp))) {
8543 error = ENOENT;
8544 goto out1;
8545 }
8546 /*
8547 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
8548 * later.
8549 */
8550 mnt_fvp = fvp;
8551
8552 fvp = coveredvp;
8553 mntrename = TRUE;
8554 }
8555 /*
8556 * Check for cross-device rename.
8557 */
8558 if ((fvp->v_mount != tdvp->v_mount) ||
8559 (tvp && (fvp->v_mount != tvp->v_mount))) {
8560 error = EXDEV;
8561 goto out1;
8562 }
8563
8564 /*
8565 * If source is the same as the destination (that is the
8566 * same inode number) then there is nothing to do...
8567 * EXCEPT if the underlying file system supports case
8568 * insensitivity and is case preserving. In this case
8569 * the file system needs to handle the special case of
8570 * getting the same vnode as target (fvp) and source (tvp).
8571 *
8572 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8573 * and _PC_CASE_PRESERVING can have this exception, and they need to
8574 * handle the special case of getting the same vnode as target and
8575 * source. NOTE: Then the target is unlocked going into vnop_rename,
8576 * so not to cause locking problems. There is a single reference on tvp.
8577 *
8578 * NOTE - that fvp == tvp also occurs if they are hard linked and
8579 * that correct behaviour then is just to return success without doing
8580 * anything.
8581 *
8582 * XXX filesystem should take care of this itself, perhaps...
8583 */
8584 if (fvp == tvp && fdvp == tdvp) {
8585 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8586 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8587 fromnd->ni_cnd.cn_namelen)) {
8588 vn_authorize_skipped = TRUE;
8589 goto out1;
8590 }
8591 }
8592
8593 if (holding_mntlock && fvp->v_mount != locked_mp) {
8594 /*
8595 * we're holding a reference and lock
8596 * on locked_mp, but it no longer matches
8597 * what we want to do... so drop our hold
8598 */
8599 mount_unlock_renames(locked_mp);
8600 mount_drop(locked_mp, 0);
8601 holding_mntlock = 0;
8602 }
8603 if (tdvp != fdvp && fvp->v_type == VDIR) {
8604 /*
8605 * serialize renames that re-shape
8606 * the tree... if holding_mntlock is
8607 * set, then we're ready to go...
8608 * otherwise we
8609 * first need to drop the iocounts
8610 * we picked up, second take the
8611 * lock to serialize the access,
8612 * then finally start the lookup
8613 * process over with the lock held
8614 */
8615 if (!holding_mntlock) {
8616 /*
8617 * need to grab a reference on
8618 * the mount point before we
8619 * drop all the iocounts... once
8620 * the iocounts are gone, the mount
8621 * could follow
8622 */
8623 locked_mp = fvp->v_mount;
8624 mount_ref(locked_mp, 0);
8625
8626 /*
8627 * nameidone has to happen before we vnode_put(tvp)
8628 * since it may need to release the fs_nodelock on the tvp
8629 */
8630 nameidone(tond);
8631
8632 if (tvp) {
8633 vnode_put(tvp);
8634 }
8635 vnode_put(tdvp);
8636
8637 /*
8638 * nameidone has to happen before we vnode_put(fdvp)
8639 * since it may need to release the fs_nodelock on the fvp
8640 */
8641 nameidone(fromnd);
8642
8643 vnode_put(fvp);
8644 vnode_put(fdvp);
8645
8646 if (mnt_fvp != NULLVP) {
8647 vnode_put(mnt_fvp);
8648 }
8649
8650 mount_lock_renames(locked_mp);
8651 holding_mntlock = 1;
8652
8653 goto retry;
8654 }
8655 } else {
8656 /*
8657 * when we dropped the iocounts to take
8658 * the lock, we allowed the identity of
8659 * the various vnodes to change... if they did,
8660 * we may no longer be dealing with a rename
8661 * that reshapes the tree... once we're holding
8662 * the iocounts, the vnodes can't change type
8663 * so we're free to drop the lock at this point
8664 * and continue on
8665 */
8666 if (holding_mntlock) {
8667 mount_unlock_renames(locked_mp);
8668 mount_drop(locked_mp, 0);
8669 holding_mntlock = 0;
8670 }
8671 }
8672
8673 if (!batched) {
8674 error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
8675 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8676 flags, NULL);
8677 if (error) {
8678 if (error == ENOENT) {
8679 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8680 /*
8681 * We encountered a race where after doing the namei,
8682 * tvp stops being valid. If so, simply re-drive the rename
8683 * call from the top.
8684 */
8685 do_retry = 1;
8686 retry_count += 1;
8687 }
8688 }
8689 goto out1;
8690 }
8691 }
8692
8693 /* Release the 'mnt_fvp' now that it is no longer needed. */
8694 if (mnt_fvp != NULLVP) {
8695 vnode_put(mnt_fvp);
8696 mnt_fvp = NULLVP;
8697 }
8698
8699 // save these off so we can later verify that fvp is the same
8700 oname = fvp->v_name;
8701 oparent = fvp->v_parent;
8702
8703 skipped_lookup:
8704 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8705 tdvp, &tvp, &tond->ni_cnd, tvap,
8706 flags, ctx);
8707
8708 if (holding_mntlock) {
8709 /*
8710 * we can drop our serialization
8711 * lock now
8712 */
8713 mount_unlock_renames(locked_mp);
8714 mount_drop(locked_mp, 0);
8715 holding_mntlock = 0;
8716 }
8717 if (error) {
8718 if (error == EDATALESS) {
8719 /*
8720 * If we've been here before, something has gone
8721 * horribly wrong and we should just get out lest
8722 * we spiral around the drain forever.
8723 */
8724 if (flags & VFS_RENAME_DATALESS) {
8725 error = EIO;
8726 goto out1;
8727 }
8728
8729 /*
8730 * The object we're renaming is dataless (or has a
8731 * dataless descendent) and requires materialization
8732 * before the rename occurs. But we're holding the
8733 * mount point's rename lock, so it's not safe to
8734 * make the upcall.
8735 *
8736 * In this case, we release the lock, perform the
8737 * materialization, and start the whole thing over.
8738 */
8739 error = vnode_materialize_dataless_file(fvp,
8740 NAMESPACE_HANDLER_RENAME_OP);
8741
8742 if (error == 0) {
8743 /*
8744 * The next time around we need to tell the
8745 * file system that the materializtaion has
8746 * been performed.
8747 */
8748 flags |= VFS_RENAME_DATALESS;
8749 do_retry = 1;
8750 }
8751 goto out1;
8752 }
8753 if (error == EKEEPLOOKING) {
8754 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8755 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8756 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8757 }
8758 }
8759
8760 fromnd->ni_vp = fvp;
8761 tond->ni_vp = tvp;
8762
8763 goto continue_lookup;
8764 }
8765
8766 /*
8767 * We may encounter a race in the VNOP where the destination didn't
8768 * exist when we did the namei, but it does by the time we go and
8769 * try to create the entry. In this case, we should re-drive this rename
8770 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8771 * but other filesystems susceptible to this race could return it, too.
8772 */
8773 if (error == ERECYCLE) {
8774 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
8775 do_retry = 1;
8776 retry_count += 1;
8777 } else {
8778 printf("rename retry limit due to ERECYCLE reached\n");
8779 error = ENOENT;
8780 }
8781 }
8782
8783 /*
8784 * For compound VNOPs, the authorization callback may return
8785 * ENOENT in case of racing hardlink lookups hitting the name
8786 * cache, redrive the lookup.
8787 */
8788 if (batched && error == ENOENT) {
8789 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8790 do_retry = 1;
8791 retry_count += 1;
8792 }
8793 }
8794
8795 goto out1;
8796 }
8797
8798 /* call out to allow 3rd party notification of rename.
8799 * Ignore result of kauth_authorize_fileop call.
8800 */
8801 kauth_authorize_fileop(vfs_context_ucred(ctx),
8802 KAUTH_FILEOP_RENAME,
8803 (uintptr_t)from_name, (uintptr_t)to_name);
8804 if (flags & VFS_RENAME_SWAP) {
8805 kauth_authorize_fileop(vfs_context_ucred(ctx),
8806 KAUTH_FILEOP_RENAME,
8807 (uintptr_t)to_name, (uintptr_t)from_name);
8808 }
8809
8810 #if CONFIG_FSE
8811 if (from_name != NULL && to_name != NULL) {
8812 if (from_truncated || to_truncated) {
8813 // set it here since only the from_finfo gets reported up to user space
8814 from_finfo.mode |= FSE_TRUNCATED_PATH;
8815 }
8816
8817 if (tvap && tvp) {
8818 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8819 }
8820 if (fvap) {
8821 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8822 }
8823
8824 if (tvp) {
8825 add_fsevent(FSE_RENAME, ctx,
8826 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8827 FSE_ARG_FINFO, &from_finfo,
8828 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8829 FSE_ARG_FINFO, &to_finfo,
8830 FSE_ARG_DONE);
8831 if (flags & VFS_RENAME_SWAP) {
8832 /*
8833 * Strictly speaking, swap is the equivalent of
8834 * *three* renames. FSEvents clients should only take
8835 * the events as a hint, so we only bother reporting
8836 * two.
8837 */
8838 add_fsevent(FSE_RENAME, ctx,
8839 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8840 FSE_ARG_FINFO, &to_finfo,
8841 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8842 FSE_ARG_FINFO, &from_finfo,
8843 FSE_ARG_DONE);
8844 }
8845 } else {
8846 add_fsevent(FSE_RENAME, ctx,
8847 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8848 FSE_ARG_FINFO, &from_finfo,
8849 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8850 FSE_ARG_DONE);
8851 }
8852 }
8853 #endif /* CONFIG_FSE */
8854
8855 /*
8856 * update filesystem's mount point data
8857 */
8858 if (mntrename) {
8859 char *cp, *pathend, *mpname;
8860 char * tobuf;
8861 struct mount *mp;
8862 int maxlen;
8863 size_t len = 0;
8864
8865 mp = fvp->v_mountedhere;
8866
8867 if (vfs_busy(mp, LK_NOWAIT)) {
8868 error = EBUSY;
8869 goto out1;
8870 }
8871 tobuf = zalloc(ZV_NAMEI);
8872
8873 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8874 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8875 } else {
8876 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8877 }
8878 if (!error) {
8879 /* find current mount point prefix */
8880 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8881 for (cp = pathend; *cp != '\0'; ++cp) {
8882 if (*cp == '/') {
8883 pathend = cp + 1;
8884 }
8885 }
8886 /* find last component of target name */
8887 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8888 if (*cp == '/') {
8889 mpname = cp + 1;
8890 }
8891 }
8892
8893 /* Update f_mntonname of sub mounts */
8894 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8895
8896 /* append name to prefix */
8897 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
8898 bzero(pathend, maxlen);
8899
8900 strlcpy(pathend, mpname, maxlen);
8901 }
8902 zfree(ZV_NAMEI, tobuf);
8903
8904 vfs_unbusy(mp);
8905
8906 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8907 }
8908 /*
8909 * fix up name & parent pointers. note that we first
8910 * check that fvp has the same name/parent pointers it
8911 * had before the rename call... this is a 'weak' check
8912 * at best...
8913 *
8914 * XXX oparent and oname may not be set in the compound vnop case
8915 */
8916 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8917 int update_flags;
8918
8919 update_flags = VNODE_UPDATE_NAME;
8920
8921 if (fdvp != tdvp) {
8922 update_flags |= VNODE_UPDATE_PARENT;
8923 }
8924
8925 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8926 }
8927 out1:
8928 /*
8929 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
8930 * skipped earlier as no actual rename was performed.
8931 */
8932 if (vn_authorize_skipped && error == 0) {
8933 error = vn_authorize_renamex_with_paths(fdvp, fvp,
8934 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8935 flags, NULL);
8936 if (error && error == ENOENT) {
8937 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8938 do_retry = 1;
8939 retry_count += 1;
8940 }
8941 }
8942 }
8943 if (to_name != NULL) {
8944 RELEASE_PATH(to_name);
8945 to_name = NULL;
8946 }
8947 if (to_name_no_firmlink != NULL) {
8948 RELEASE_PATH(to_name_no_firmlink);
8949 to_name_no_firmlink = NULL;
8950 }
8951 if (from_name != NULL) {
8952 RELEASE_PATH(from_name);
8953 from_name = NULL;
8954 }
8955 if (from_name_no_firmlink != NULL) {
8956 RELEASE_PATH(from_name_no_firmlink);
8957 from_name_no_firmlink = NULL;
8958 }
8959 if (holding_mntlock) {
8960 mount_unlock_renames(locked_mp);
8961 mount_drop(locked_mp, 0);
8962 holding_mntlock = 0;
8963 }
8964 if (tdvp) {
8965 /*
8966 * nameidone has to happen before we vnode_put(tdvp)
8967 * since it may need to release the fs_nodelock on the tdvp
8968 */
8969 nameidone(tond);
8970
8971 if (tvp) {
8972 vnode_put(tvp);
8973 }
8974 vnode_put(tdvp);
8975 }
8976 if (fdvp) {
8977 /*
8978 * nameidone has to happen before we vnode_put(fdvp)
8979 * since it may need to release the fs_nodelock on the fdvp
8980 */
8981 nameidone(fromnd);
8982
8983 if (fvp) {
8984 vnode_put(fvp);
8985 }
8986 vnode_put(fdvp);
8987 }
8988 if (mnt_fvp != NULLVP) {
8989 vnode_put(mnt_fvp);
8990 }
8991 /*
8992 * If things changed after we did the namei, then we will re-drive
8993 * this rename call from the top.
8994 */
8995 if (do_retry) {
8996 do_retry = 0;
8997 goto retry;
8998 }
8999
9000 kheap_free(KHEAP_TEMP, __rename_data, sizeof(*__rename_data));
9001 return error;
9002 }
9003
9004 int
9005 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9006 {
9007 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9008 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9009 }
9010
9011 int
9012 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9013 {
9014 return renameat_internal(
9015 vfs_context_current(),
9016 uap->fromfd, uap->from,
9017 uap->tofd, uap->to,
9018 UIO_USERSPACE, uap->flags);
9019 }
9020
9021 int
9022 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9023 {
9024 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9025 uap->tofd, uap->to, UIO_USERSPACE, 0);
9026 }
9027
9028 /*
9029 * Make a directory file.
9030 *
9031 * Returns: 0 Success
9032 * EEXIST
9033 * namei:???
9034 * vnode_authorize:???
9035 * vn_create:???
9036 */
9037 /* ARGSUSED */
9038 static int
9039 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
9040 enum uio_seg segflg)
9041 {
9042 vnode_t vp, dvp;
9043 int error;
9044 int update_flags = 0;
9045 int batched;
9046 struct nameidata nd;
9047
9048 AUDIT_ARG(mode, vap->va_mode);
9049 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
9050 path, ctx);
9051 nd.ni_cnd.cn_flags |= WILLBEDIR;
9052 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
9053
9054 continue_lookup:
9055 error = nameiat(&nd, fd);
9056 if (error) {
9057 return error;
9058 }
9059 dvp = nd.ni_dvp;
9060 vp = nd.ni_vp;
9061
9062 if (vp != NULL) {
9063 error = EEXIST;
9064 goto out;
9065 }
9066
9067 batched = vnode_compound_mkdir_available(dvp);
9068
9069 VATTR_SET(vap, va_type, VDIR);
9070
9071 /*
9072 * XXX
9073 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
9074 * only get EXISTS or EISDIR for existing path components, and not that it could see
9075 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
9076 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
9077 */
9078 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
9079 if (error == EACCES || error == EPERM) {
9080 int error2;
9081
9082 nameidone(&nd);
9083 vnode_put(dvp);
9084 dvp = NULLVP;
9085
9086 /*
9087 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
9088 * rather than EACCESS if the target exists.
9089 */
9090 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
9091 path, ctx);
9092 error2 = nameiat(&nd, fd);
9093 if (error2) {
9094 goto out;
9095 } else {
9096 vp = nd.ni_vp;
9097 error = EEXIST;
9098 goto out;
9099 }
9100 }
9101
9102 goto out;
9103 }
9104
9105 /*
9106 * make the directory
9107 */
9108 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
9109 if (error == EKEEPLOOKING) {
9110 nd.ni_vp = vp;
9111 goto continue_lookup;
9112 }
9113
9114 goto out;
9115 }
9116
9117 // Make sure the name & parent pointers are hooked up
9118 if (vp->v_name == NULL) {
9119 update_flags |= VNODE_UPDATE_NAME;
9120 }
9121 if (vp->v_parent == NULLVP) {
9122 update_flags |= VNODE_UPDATE_PARENT;
9123 }
9124
9125 if (update_flags) {
9126 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
9127 }
9128
9129 #if CONFIG_FSE
9130 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
9131 #endif
9132
9133 out:
9134 /*
9135 * nameidone has to happen before we vnode_put(dvp)
9136 * since it may need to release the fs_nodelock on the dvp
9137 */
9138 nameidone(&nd);
9139
9140 if (vp) {
9141 vnode_put(vp);
9142 }
9143 if (dvp) {
9144 vnode_put(dvp);
9145 }
9146
9147 return error;
9148 }
9149
9150 /*
9151 * mkdir_extended: Create a directory; with extended security (ACL).
9152 *
9153 * Parameters: p Process requesting to create the directory
9154 * uap User argument descriptor (see below)
9155 * retval (ignored)
9156 *
9157 * Indirect: uap->path Path of directory to create
9158 * uap->mode Access permissions to set
9159 * uap->xsecurity ACL to set
9160 *
9161 * Returns: 0 Success
9162 * !0 Not success
9163 *
9164 */
9165 int
9166 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9167 {
9168 int ciferror;
9169 kauth_filesec_t xsecdst;
9170 struct vnode_attr va;
9171
9172 AUDIT_ARG(owner, uap->uid, uap->gid);
9173
9174 xsecdst = NULL;
9175 if ((uap->xsecurity != USER_ADDR_NULL) &&
9176 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9177 return ciferror;
9178 }
9179
9180 VATTR_INIT(&va);
9181 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
9182 if (xsecdst != NULL) {
9183 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9184 }
9185
9186 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9187 UIO_USERSPACE);
9188 if (xsecdst != NULL) {
9189 kauth_filesec_free(xsecdst);
9190 }
9191 return ciferror;
9192 }
9193
9194 int
9195 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9196 {
9197 struct vnode_attr va;
9198
9199 VATTR_INIT(&va);
9200 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
9201
9202 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9203 UIO_USERSPACE);
9204 }
9205
9206 int
9207 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9208 {
9209 struct vnode_attr va;
9210
9211 VATTR_INIT(&va);
9212 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
9213
9214 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9215 UIO_USERSPACE);
9216 }
9217
9218 static int
9219 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
9220 enum uio_seg segflg, int unlink_flags)
9221 {
9222 vnode_t vp, dvp;
9223 int error;
9224 struct nameidata nd;
9225 char *path = NULL;
9226 char *no_firmlink_path = NULL;
9227 int len_path = 0;
9228 int len_no_firmlink_path = 0;
9229 int has_listeners = 0;
9230 int need_event = 0;
9231 int truncated_path = 0;
9232 int truncated_no_firmlink_path = 0;
9233 #if CONFIG_FSE
9234 struct vnode_attr va;
9235 #endif /* CONFIG_FSE */
9236 struct vnode_attr *vap = NULL;
9237 int restart_count = 0;
9238 int batched;
9239
9240 int restart_flag;
9241
9242 /*
9243 * This loop exists to restart rmdir in the unlikely case that two
9244 * processes are simultaneously trying to remove the same directory
9245 * containing orphaned appleDouble files.
9246 */
9247 do {
9248 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
9249 segflg, dirpath, ctx);
9250 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
9251 continue_lookup:
9252 restart_flag = 0;
9253 vap = NULL;
9254
9255 error = nameiat(&nd, fd);
9256 if (error) {
9257 return error;
9258 }
9259
9260 dvp = nd.ni_dvp;
9261 vp = nd.ni_vp;
9262
9263 if (vp) {
9264 batched = vnode_compound_rmdir_available(vp);
9265
9266 if (vp->v_flag & VROOT) {
9267 /*
9268 * The root of a mounted filesystem cannot be deleted.
9269 */
9270 error = EBUSY;
9271 goto out;
9272 }
9273
9274 #if DEVELOPMENT || DEBUG
9275 /*
9276 * XXX VSWAP: Check for entitlements or special flag here
9277 * so we can restrict access appropriately.
9278 */
9279 #else /* DEVELOPMENT || DEBUG */
9280
9281 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
9282 error = EPERM;
9283 goto out;
9284 }
9285 #endif /* DEVELOPMENT || DEBUG */
9286
9287 /*
9288 * Removed a check here; we used to abort if vp's vid
9289 * was not the same as what we'd seen the last time around.
9290 * I do not think that check was valid, because if we retry
9291 * and all dirents are gone, the directory could legitimately
9292 * be recycled but still be present in a situation where we would
9293 * have had permission to delete. Therefore, we won't make
9294 * an effort to preserve that check now that we may not have a
9295 * vp here.
9296 */
9297
9298 if (!batched) {
9299 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
9300 if (error) {
9301 if (error == ENOENT) {
9302 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9303 restart_flag = 1;
9304 restart_count += 1;
9305 }
9306 }
9307 goto out;
9308 }
9309 }
9310 } else {
9311 batched = 1;
9312
9313 if (!vnode_compound_rmdir_available(dvp)) {
9314 panic("No error, but no compound rmdir?");
9315 }
9316 }
9317
9318 #if CONFIG_FSE
9319 fse_info finfo = {0};
9320
9321 need_event = need_fsevent(FSE_DELETE, dvp);
9322 if (need_event) {
9323 if (!batched) {
9324 get_fse_info(vp, &finfo, ctx);
9325 } else {
9326 error = vfs_get_notify_attributes(&va);
9327 if (error) {
9328 goto out;
9329 }
9330
9331 vap = &va;
9332 }
9333 }
9334 #endif
9335 has_listeners = kauth_authorize_fileop_has_listeners();
9336 if (need_event || has_listeners) {
9337 if (path == NULL) {
9338 GET_PATH(path);
9339 }
9340
9341 len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
9342
9343 if (no_firmlink_path == NULL) {
9344 GET_PATH(no_firmlink_path);
9345 }
9346
9347 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
9348 #if CONFIG_FSE
9349 if (truncated_no_firmlink_path) {
9350 finfo.mode |= FSE_TRUNCATED_PATH;
9351 }
9352 #endif
9353 }
9354
9355 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
9356 nd.ni_vp = vp;
9357 if (vp == NULLVP) {
9358 /* Couldn't find a vnode */
9359 goto out;
9360 }
9361
9362 if (error == EKEEPLOOKING) {
9363 goto continue_lookup;
9364 } else if (batched && error == ENOENT) {
9365 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9366 /*
9367 * For compound VNOPs, the authorization callback
9368 * may return ENOENT in case of racing hard link lookups
9369 * redrive the lookup.
9370 */
9371 restart_flag = 1;
9372 restart_count += 1;
9373 goto out;
9374 }
9375 }
9376
9377 /*
9378 * XXX There's no provision for passing flags
9379 * to VNOP_RMDIR(). So, if vn_rmdir() fails
9380 * because it's not empty, then we try again
9381 * with VNOP_REMOVE(), passing in a special
9382 * flag that clever file systems will know
9383 * how to handle.
9384 */
9385 if (error == ENOTEMPTY &&
9386 (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
9387 /*
9388 * If this fails, we want to keep the original
9389 * error.
9390 */
9391 if (vn_remove(dvp, &vp, &nd,
9392 VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
9393 error = 0;
9394 }
9395 }
9396
9397 #if CONFIG_APPLEDOUBLE
9398 /*
9399 * Special case to remove orphaned AppleDouble
9400 * files. I don't like putting this in the kernel,
9401 * but carbon does not like putting this in carbon either,
9402 * so here we are.
9403 */
9404 if (error == ENOTEMPTY) {
9405 int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
9406 if (ad_error == EBUSY) {
9407 error = ad_error;
9408 goto out;
9409 }
9410
9411
9412 /*
9413 * Assuming everything went well, we will try the RMDIR again
9414 */
9415 if (!ad_error) {
9416 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
9417 }
9418 }
9419 #endif /* CONFIG_APPLEDOUBLE */
9420 /*
9421 * Call out to allow 3rd party notification of delete.
9422 * Ignore result of kauth_authorize_fileop call.
9423 */
9424 if (!error) {
9425 if (has_listeners) {
9426 kauth_authorize_fileop(vfs_context_ucred(ctx),
9427 KAUTH_FILEOP_DELETE,
9428 (uintptr_t)vp,
9429 (uintptr_t)path);
9430 }
9431
9432 if (vp->v_flag & VISHARDLINK) {
9433 // see the comment in unlink1() about why we update
9434 // the parent of a hard link when it is removed
9435 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
9436 }
9437
9438 #if CONFIG_FSE
9439 if (need_event) {
9440 if (vap) {
9441 vnode_get_fse_info_from_vap(vp, &finfo, vap);
9442 }
9443 add_fsevent(FSE_DELETE, ctx,
9444 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
9445 FSE_ARG_FINFO, &finfo,
9446 FSE_ARG_DONE);
9447 }
9448 #endif
9449 }
9450
9451 out:
9452 if (path != NULL) {
9453 RELEASE_PATH(path);
9454 path = NULL;
9455 }
9456
9457 if (no_firmlink_path != NULL) {
9458 RELEASE_PATH(no_firmlink_path);
9459 no_firmlink_path = NULL;
9460 }
9461
9462 /*
9463 * nameidone has to happen before we vnode_put(dvp)
9464 * since it may need to release the fs_nodelock on the dvp
9465 */
9466 nameidone(&nd);
9467 vnode_put(dvp);
9468
9469 if (vp) {
9470 vnode_put(vp);
9471 }
9472
9473 if (restart_flag == 0) {
9474 wakeup_one((caddr_t)vp);
9475 return error;
9476 }
9477 tsleep(vp, PVFS, "rm AD", 1);
9478 } while (restart_flag != 0);
9479
9480 return error;
9481 }
9482
9483 /*
9484 * Remove a directory file.
9485 */
9486 /* ARGSUSED */
9487 int
9488 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9489 {
9490 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9491 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9492 }
9493
9494 /* Get direntry length padded to 8 byte alignment */
9495 #define DIRENT64_LEN(namlen) \
9496 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
9497
9498 /* Get dirent length padded to 4 byte alignment */
9499 #define DIRENT_LEN(namelen) \
9500 ((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)
9501
9502 /* Get the end of this dirent */
9503 #define DIRENT_END(dep) \
9504 (((char *)(dep)) + (dep)->d_reclen - 1)
9505
9506 errno_t
9507 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9508 int *numdirent, vfs_context_t ctxp)
9509 {
9510 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9511 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9512 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9513 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9514 } else {
9515 size_t bufsize;
9516 void * bufptr;
9517 uio_t auio;
9518 struct direntry *entry64;
9519 struct dirent *dep;
9520 size_t bytesread;
9521 int error;
9522
9523 /*
9524 * We're here because the underlying file system does not
9525 * support direnties or we mounted denying support so we must
9526 * fall back to dirents and convert them to direntries.
9527 *
9528 * Our kernel buffer needs to be smaller since re-packing will
9529 * expand each dirent. The worse case (when the name length
9530 * is 3 or less) corresponds to a struct direntry size of 32
9531 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9532 * (4-byte aligned). So having a buffer that is 3/8 the size
9533 * will prevent us from reading more than we can pack.
9534 *
9535 * Since this buffer is wired memory, we will limit the
9536 * buffer size to a maximum of 32K. We would really like to
9537 * use 32K in the MIN(), but we use magic number 87371 to
9538 * prevent uio_resid() * 3 / 8 from overflowing.
9539 */
9540 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9541 bufptr = kheap_alloc(KHEAP_DATA_BUFFERS, bufsize, Z_WAITOK);
9542 if (bufptr == NULL) {
9543 return ENOMEM;
9544 }
9545
9546 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9547 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9548 auio->uio_offset = uio->uio_offset;
9549
9550 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9551
9552 dep = (struct dirent *)bufptr;
9553 bytesread = bufsize - uio_resid(auio);
9554
9555 entry64 = kheap_alloc(KHEAP_TEMP, sizeof(struct direntry), Z_WAITOK);
9556 /*
9557 * Convert all the entries and copy them out to user's buffer.
9558 */
9559 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9560 /* First check that the dirent struct up to d_name is within the buffer */
9561 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
9562 /* Check that the length of the entire dirent is within the buffer */
9563 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9564 /* Check that the actual length including the name doesn't exceed d_reclen */
9565 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9566 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9567 vp->v_mount->mnt_vfsstat.f_mntonname,
9568 vp->v_name ? vp->v_name : "<unknown>");
9569 error = EIO;
9570 break;
9571 }
9572
9573 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9574
9575 bzero(entry64, enbufsize);
9576 /* Convert a dirent to a dirent64. */
9577 entry64->d_ino = dep->d_ino;
9578 entry64->d_seekoff = 0;
9579 entry64->d_reclen = (uint16_t)enbufsize;
9580 entry64->d_namlen = dep->d_namlen;
9581 entry64->d_type = dep->d_type;
9582 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9583
9584 /* Move to next entry. */
9585 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9586
9587 /* Copy entry64 to user's buffer. */
9588 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9589 }
9590
9591 /* Update the real offset using the offset we got from VNOP_READDIR. */
9592 if (error == 0) {
9593 uio->uio_offset = auio->uio_offset;
9594 }
9595 uio_free(auio);
9596 kheap_free(KHEAP_DATA_BUFFERS, bufptr, bufsize);
9597 kheap_free(KHEAP_TEMP, entry64, sizeof(struct direntry));
9598 return error;
9599 }
9600 }
9601
9602 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9603
9604 /*
9605 * Read a block of directory entries in a file system independent format.
9606 */
9607 static int
9608 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
9609 off_t *offset, int *eofflag, int flags)
9610 {
9611 vnode_t vp;
9612 struct vfs_context context = *vfs_context_current(); /* local copy */
9613 struct fileproc *fp;
9614 uio_t auio;
9615 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9616 off_t loff;
9617 int error, numdirent;
9618 char uio_buf[UIO_SIZEOF(1)];
9619
9620 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
9621 if (error) {
9622 return error;
9623 }
9624 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
9625 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
9626 error = EBADF;
9627 goto out;
9628 }
9629
9630 if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
9631 bufsize = GETDIRENTRIES_MAXBUFSIZE;
9632 }
9633
9634 #if CONFIG_MACF
9635 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
9636 if (error) {
9637 goto out;
9638 }
9639 #endif
9640 if ((error = vnode_getwithref(vp))) {
9641 goto out;
9642 }
9643 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
9644
9645 unionread:
9646 if (vp->v_type != VDIR) {
9647 (void)vnode_put(vp);
9648 error = EINVAL;
9649 goto out;
9650 }
9651
9652 #if CONFIG_MACF
9653 error = mac_vnode_check_readdir(&context, vp);
9654 if (error != 0) {
9655 (void)vnode_put(vp);
9656 goto out;
9657 }
9658 #endif /* MAC */
9659
9660 loff = fp->fp_glob->fg_offset;
9661 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9662 uio_addiov(auio, bufp, bufsize);
9663
9664 if (flags & VNODE_READDIR_EXTENDED) {
9665 error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
9666 fp->fp_glob->fg_offset = uio_offset(auio);
9667 } else {
9668 error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
9669 fp->fp_glob->fg_offset = uio_offset(auio);
9670 }
9671 if (error) {
9672 (void)vnode_put(vp);
9673 goto out;
9674 }
9675
9676 if ((user_ssize_t)bufsize == uio_resid(auio)) {
9677 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
9678 struct vnode *tvp = vp;
9679 if (lookup_traverse_union(tvp, &vp, &context) == 0) {
9680 vnode_ref(vp);
9681 fp->fp_glob->fg_data = (caddr_t) vp;
9682 fp->fp_glob->fg_offset = 0;
9683 vnode_rele(tvp);
9684 vnode_put(tvp);
9685 goto unionread;
9686 }
9687 vp = tvp;
9688 }
9689 }
9690
9691 vnode_put(vp);
9692 if (offset) {
9693 *offset = loff;
9694 }
9695
9696 *bytesread = bufsize - uio_resid(auio);
9697 out:
9698 file_drop(fd);
9699 return error;
9700 }
9701
9702
9703 int
9704 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9705 {
9706 off_t offset;
9707 ssize_t bytesread;
9708 int error, eofflag;
9709
9710 AUDIT_ARG(fd, uap->fd);
9711 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9712 &bytesread, &offset, &eofflag, 0);
9713
9714 if (error == 0) {
9715 if (proc_is64bit(p)) {
9716 user64_long_t base = (user64_long_t)offset;
9717 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9718 } else {
9719 user32_long_t base = (user32_long_t)offset;
9720 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9721 }
9722 *retval = (int)bytesread;
9723 }
9724 return error;
9725 }
9726
9727 int
9728 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9729 {
9730 off_t offset;
9731 ssize_t bytesread;
9732 int error, eofflag;
9733 user_size_t bufsize;
9734
9735 AUDIT_ARG(fd, uap->fd);
9736
9737 /*
9738 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9739 * then the kernel carves out the last 4 bytes to return extended
9740 * information to userspace (namely whether we reached EOF with this call).
9741 */
9742 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9743 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9744 } else {
9745 bufsize = uap->bufsize;
9746 }
9747
9748 error = getdirentries_common(uap->fd, uap->buf, bufsize,
9749 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9750
9751 if (error == 0) {
9752 *retval = bytesread;
9753 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9754
9755 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9756 getdirentries64_flags_t flags = 0;
9757 if (eofflag) {
9758 flags |= GETDIRENTRIES64_EOF;
9759 }
9760 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9761 sizeof(flags));
9762 }
9763 }
9764 return error;
9765 }
9766
9767
9768 /*
9769 * Set the mode mask for creation of filesystem nodes.
9770 * XXX implement xsecurity
9771 */
9772 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9773 static int
9774 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9775 {
9776 struct filedesc *fdp;
9777
9778 AUDIT_ARG(mask, newmask);
9779 proc_fdlock(p);
9780 fdp = p->p_fd;
9781 *retval = fdp->fd_cmask;
9782 fdp->fd_cmask = newmask & ALLPERMS;
9783 proc_fdunlock(p);
9784 return 0;
9785 }
9786
9787 /*
9788 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9789 *
9790 * Parameters: p Process requesting to set the umask
9791 * uap User argument descriptor (see below)
9792 * retval umask of the process (parameter p)
9793 *
9794 * Indirect: uap->newmask umask to set
9795 * uap->xsecurity ACL to set
9796 *
9797 * Returns: 0 Success
9798 * !0 Not success
9799 *
9800 */
9801 int
9802 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9803 {
9804 int ciferror;
9805 kauth_filesec_t xsecdst;
9806
9807 xsecdst = KAUTH_FILESEC_NONE;
9808 if (uap->xsecurity != USER_ADDR_NULL) {
9809 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
9810 return ciferror;
9811 }
9812 } else {
9813 xsecdst = KAUTH_FILESEC_NONE;
9814 }
9815
9816 ciferror = umask1(p, uap->newmask, xsecdst, retval);
9817
9818 if (xsecdst != KAUTH_FILESEC_NONE) {
9819 kauth_filesec_free(xsecdst);
9820 }
9821 return ciferror;
9822 }
9823
9824 int
9825 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9826 {
9827 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9828 }
9829
9830 /*
9831 * Void all references to file by ripping underlying filesystem
9832 * away from vnode.
9833 */
9834 /* ARGSUSED */
9835 int
9836 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
9837 {
9838 vnode_t vp;
9839 struct vnode_attr va;
9840 vfs_context_t ctx = vfs_context_current();
9841 int error;
9842 struct nameidata nd;
9843
9844 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
9845 uap->path, ctx);
9846 error = namei(&nd);
9847 if (error) {
9848 return error;
9849 }
9850 vp = nd.ni_vp;
9851
9852 nameidone(&nd);
9853
9854 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
9855 error = ENOTSUP;
9856 goto out;
9857 }
9858
9859 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
9860 error = EBUSY;
9861 goto out;
9862 }
9863
9864 #if CONFIG_MACF
9865 error = mac_vnode_check_revoke(ctx, vp);
9866 if (error) {
9867 goto out;
9868 }
9869 #endif
9870
9871 VATTR_INIT(&va);
9872 VATTR_WANTED(&va, va_uid);
9873 if ((error = vnode_getattr(vp, &va, ctx))) {
9874 goto out;
9875 }
9876 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
9877 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
9878 goto out;
9879 }
9880 if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
9881 VNOP_REVOKE(vp, REVOKEALL, ctx);
9882 }
9883 out:
9884 vnode_put(vp);
9885 return error;
9886 }
9887
9888
9889 /*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
9891 * The following system calls are designed to support features
9892 * which are specific to the HFS & HFS Plus volume formats
9893 */
9894
9895
9896 /*
9897 * Obtain attribute information on objects in a directory while enumerating
9898 * the directory.
9899 */
9900 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	/* savecount preserves the caller's requested count across union retries */
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	char uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	savecount = count;
	/* Takes a reference on fp; dropped via file_drop() at 'out'. */
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	/* Re-checked each union-layer hop: vp may have been replaced. */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
				/* Repoint the fileglob at the lower directory. */
				vnode_ref_ext(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0);
				fp->fp_glob->fg_data = (caddr_t) vp;
				fp->fp_glob->fg_offset = 0; // reset index for new dir
				count = savecount;
				vnode_rele_internal(tvp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	(void)vnode_put(vp);

	/* NOTE(review): error is 0 here after the check above; kept as-is. */
	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	file_drop(fd);
	return error; /* error returned earlier; a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10045
10046 /*
10047 * Exchange data between two files
10048 */
10049
10050 /* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first path; holds an iocount on fvp until 'out'. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second path; holds an iocount on svp until 'out'. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Capture paths and fsevent info up front, only if someone will
	 * consume them (fsevent watchers or fileop listeners).
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The data swapped, so swap the cached identities (name and
		 * parent) too, under the name cache lock.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10201
10202 /*
10203 * Return (in MB) the amount of freespace on the given vnode's volume.
10204 */
10205 uint32_t freespace_mb(vnode_t vp);
10206
10207 uint32_t
10208 freespace_mb(vnode_t vp)
10209 {
10210 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10211 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10212 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10213 }
10214
10215 #if CONFIG_SEARCHFS
10216
10217 /* ARGSUSED */
10218
10219 int
10220 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10221 {
10222 vnode_t vp, tvp;
10223 int i, error = 0;
10224 int fserror = 0;
10225 struct nameidata nd;
10226 struct user64_fssearchblock searchblock;
10227 struct searchstate *state;
10228 struct attrlist *returnattrs;
10229 struct timeval timelimit;
10230 void *searchparams1, *searchparams2;
10231 uio_t auio = NULL;
10232 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10233 uint32_t nummatches;
10234 size_t mallocsize;
10235 uint32_t nameiflags;
10236 vfs_context_t ctx = vfs_context_current();
10237 char uio_buf[UIO_SIZEOF(1)];
10238
10239 /* Start by copying in fsearchblock parameter list */
10240 if (IS_64BIT_PROCESS(p)) {
10241 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10242 timelimit.tv_sec = searchblock.timelimit.tv_sec;
10243 timelimit.tv_usec = searchblock.timelimit.tv_usec;
10244 } else {
10245 struct user32_fssearchblock tmp_searchblock;
10246
10247 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10248 // munge into 64-bit version
10249 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10250 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10251 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10252 searchblock.maxmatches = tmp_searchblock.maxmatches;
10253 /*
10254 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10255 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10256 */
10257 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10258 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10259 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10260 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10261 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10262 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10263 searchblock.searchattrs = tmp_searchblock.searchattrs;
10264 }
10265 if (error) {
10266 return error;
10267 }
10268
10269 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10270 */
10271 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10272 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10273 return EINVAL;
10274 }
10275
10276 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10277 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
10278 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10279 /* block. */
10280 /* */
10281 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
10282 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
10283 /* assumes the size is still 556 bytes it will continue to work */
10284
10285 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10286 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10287
10288 searchparams1 = kheap_alloc(KHEAP_DATA_BUFFERS, mallocsize, Z_WAITOK);
10289
10290 /* Now set up the various pointers to the correct place in our newly allocated memory */
10291
10292 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
10293 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
10294 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
10295
10296 /* Now copy in the stuff given our local variables. */
10297
10298 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
10299 goto freeandexit;
10300 }
10301
10302 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
10303 goto freeandexit;
10304 }
10305
10306 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
10307 goto freeandexit;
10308 }
10309
10310 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
10311 goto freeandexit;
10312 }
10313
10314 /*
10315 * When searching a union mount, need to set the
10316 * start flag at the first call on each layer to
10317 * reset state for the new volume.
10318 */
10319 if (uap->options & SRCHFS_START) {
10320 state->ss_union_layer = 0;
10321 } else {
10322 uap->options |= state->ss_union_flags;
10323 }
10324 state->ss_union_flags = 0;
10325
10326 /*
10327 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
10328 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
10329 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
10330 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
10331 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
10332 */
10333
10334 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
10335 attrreference_t* string_ref;
10336 u_int32_t* start_length;
10337 user64_size_t param_length;
10338
10339 /* validate searchparams1 */
10340 param_length = searchblock.sizeofsearchparams1;
10341 /* skip the word that specifies length of the buffer */
10342 start_length = (u_int32_t*) searchparams1;
10343 start_length = start_length + 1;
10344 string_ref = (attrreference_t*) start_length;
10345
10346 /* ensure no negative offsets or too big offsets */
10347 if (string_ref->attr_dataoffset < 0) {
10348 error = EINVAL;
10349 goto freeandexit;
10350 }
10351 if (string_ref->attr_length > MAXPATHLEN) {
10352 error = EINVAL;
10353 goto freeandexit;
10354 }
10355
10356 /* Check for pointer overflow in the string ref */
10357 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
10358 error = EINVAL;
10359 goto freeandexit;
10360 }
10361
10362 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
10363 error = EINVAL;
10364 goto freeandexit;
10365 }
10366 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
10367 error = EINVAL;
10368 goto freeandexit;
10369 }
10370 }
10371
10372 /* set up the uio structure which will contain the users return buffer */
10373 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10374 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
10375
10376 nameiflags = 0;
10377 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10378 nameiflags |= FOLLOW;
10379 }
10380 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
10381 UIO_USERSPACE, uap->path, ctx);
10382
10383 error = namei(&nd);
10384 if (error) {
10385 goto freeandexit;
10386 }
10387 vp = nd.ni_vp;
10388 nameidone(&nd);
10389
10390 /*
10391 * Switch to the root vnode for the volume
10392 */
10393 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
10394 vnode_put(vp);
10395 if (error) {
10396 goto freeandexit;
10397 }
10398 vp = tvp;
10399
10400 /*
10401 * If it's a union mount, the path lookup takes
10402 * us to the top layer. But we may need to descend
10403 * to a lower layer. For non-union mounts the layer
10404 * is always zero.
10405 */
10406 for (i = 0; i < (int) state->ss_union_layer; i++) {
10407 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
10408 break;
10409 }
10410 tvp = vp;
10411 vp = vp->v_mount->mnt_vnodecovered;
10412 if (vp == NULL) {
10413 vnode_put(tvp);
10414 error = ENOENT;
10415 goto freeandexit;
10416 }
10417 error = vnode_getwithref(vp);
10418 vnode_put(tvp);
10419 if (error) {
10420 goto freeandexit;
10421 }
10422 }
10423
10424 #if CONFIG_MACF
10425 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
10426 if (error) {
10427 vnode_put(vp);
10428 goto freeandexit;
10429 }
10430 #endif
10431
10432
10433 /*
10434 * If searchblock.maxmatches == 0, then skip the search. This has happened
10435 * before and sometimes the underlying code doesnt deal with it well.
10436 */
10437 if (searchblock.maxmatches == 0) {
10438 nummatches = 0;
10439 goto saveandexit;
10440 }
10441
10442 /*
10443 * Allright, we have everything we need, so lets make that call.
10444 *
10445 * We keep special track of the return value from the file system:
10446 * EAGAIN is an acceptable error condition that shouldn't keep us
10447 * from copying out any results...
10448 */
10449
10450 fserror = VNOP_SEARCHFS(vp,
10451 searchparams1,
10452 searchparams2,
10453 &searchblock.searchattrs,
10454 (uint32_t)searchblock.maxmatches,
10455 &timelimit,
10456 returnattrs,
10457 &nummatches,
10458 (uint32_t)uap->scriptcode,
10459 (uint32_t)uap->options,
10460 auio,
10461 (struct searchstate *) &state->ss_fsstate,
10462 ctx);
10463
10464 /*
10465 * If it's a union mount we need to be called again
10466 * to search the mounted-on filesystem.
10467 */
10468 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
10469 state->ss_union_flags = SRCHFS_START;
10470 state->ss_union_layer++; // search next layer down
10471 fserror = EAGAIN;
10472 }
10473
10474 saveandexit:
10475
10476 vnode_put(vp);
10477
10478 /* Now copy out the stuff that needs copying out. That means the number of matches, the
10479 * search state. Everything was already put into he return buffer by the vop call. */
10480
10481 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
10482 goto freeandexit;
10483 }
10484
10485 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
10486 goto freeandexit;
10487 }
10488
10489 error = fserror;
10490
10491 freeandexit:
10492
10493 kheap_free(KHEAP_DATA_BUFFERS, searchparams1, mallocsize);
10494
10495 return error;
10496 } /* end of searchfs system call */
10497
10498 #else /* CONFIG_SEARCHFS */
10499
/*
 * searchfs(2) stub used when the kernel is built without CONFIG_SEARCHFS:
 * the system call always fails with ENOTSUP.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
10505
10506 #endif /* CONFIG_SEARCHFS */
10507
10508
10509 #if CONFIG_DATALESS_FILES
10510
10511 /*
10512 * === Namespace Resolver Up-call Mechanism ===
10513 *
10514 * When I/O is performed to a dataless file or directory (read, write,
10515 * lookup-in, etc.), the file system performs an upcall to the namespace
10516 * resolver (filecoordinationd) to materialize the object.
10517 *
10518 * We need multiple up-calls to be in flight at once, and we need these
10519 * up-calls to be interruptible, thus the following implementation:
10520 *
10521 * => The nspace_resolver_request represents the in-kernel request state.
10522 * It contains a request ID, storage space for the errno code returned
10523 * by filecoordinationd, and flags.
10524 *
10525 * => The request ID is simply a global monotonically incrementing 32-bit
10526 * number. Outstanding requests are stored in a hash table, and the
10527 * hash function is extremely simple.
10528 *
10529 * => When an upcall is to be made to filecoordinationd, a request structure
10530 * is allocated on the stack (it is small, and needs to live only during
10531 * the duration of the call to resolve_nspace_item_ext()). It is
10532 * initialized and inserted into the table. Some backpressure from
 * filecoordinationd is applied by limiting the number of entries that
10534 * can be inserted into the table (and thus limiting the number of
10535 * outstanding requests issued to filecoordinationd); waiting for an
10536 * available slot is interruptible.
10537 *
10538 * => Once the request has been inserted into the table, the up-call is made
10539 * to filecoordinationd via a MiG-generated stub. The up-call returns
10540 * immediately and filecoordinationd processes the request asynchronously.
10541 *
 * => The caller now waits for the request to complete. This is achieved by
10543 * sleeping on the address of the request structure and waiting for
10544 * filecoordinationd to mark the request structure as complete. This
10545 * is an interruptible sleep call; if interrupted, the request structure
10546 * is removed from the table and EINTR is returned to the caller. If
10547 * this occurs, an advisory up-call is made to filecoordinationd with
10548 * the request ID to indicate that the request can be aborted or
10549 * de-prioritized at the discretion of filecoordinationd.
10550 *
10551 * => When filecoordinationd has completed the request, it signals completion
10552 * by writing to the vfs.nspace.complete sysctl node. Only a process
10553 * decorated as a namespace resolver can write to this sysctl node. The
10554 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10555 * The request ID is looked up in the table, and if the request is found,
10556 * the error code is stored in the request structure and a wakeup()
10557 * issued on the address of the request structure. If the request is not
10558 * found, we simply drop the completion notification, assuming that the
10559 * caller was interrupted.
10560 *
10561 * => When the waiting thread wakes up, it extracts the error code from the
10562 * request structure, removes the request from the table, and returns the
10563 * error code to the calling function. Fini!
10564 */
10565
/*
 * In-kernel state for one outstanding up-call to filecoordinationd.
 * Lives on the requesting thread's stack for the duration of
 * resolve_nspace_item_ext() and is linked into the request hash table.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink;  /* hash-bucket linkage */
	vnode_t         r_vp;              /* vnode being materialized (requester holds a ref) */
	uint32_t        r_req_id;          /* ID used to match the resolver's completion */
	int             r_resolver_error;  /* errno reported back by filecoordinationd */
	int             r_flags;           /* RRF_* flags below */
};

/* r_flags: set once filecoordinationd has completed the request */
#define RRF_COMPLETE    0x0001
10575
/*
 * Hand out the next resolver request ID from a global, monotonically
 * incrementing 32-bit counter.  IDs are only ever compared for equality,
 * so wrap-around is harmless.  NOTE(review): OSAddAtomic returns the
 * pre-increment value, so the first ID issued is 0 — confirm that is
 * intended (it is benign for equality-only use).
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
10583
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (capped at MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping for a free slot in the table. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Mutex protecting the request table and the counters above. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Extremely simple hash: low bits of the request ID select the bucket. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
10604
10605 static struct nspace_resolver_request *
10606 nspace_resolver_req_lookup(uint32_t req_id)
10607 {
10608 struct nspace_resolver_requesthead *bucket;
10609 struct nspace_resolver_request *req;
10610
10611 bucket = NSPACE_RESOLVER_HASH(req_id);
10612 LIST_FOREACH(req, bucket, r_hashlink) {
10613 if (req->r_req_id == req_id) {
10614 return req;
10615 }
10616 }
10617
10618 return NULL;
10619 }
10620
/*
 * Insert a request into the table, applying backpressure on
 * filecoordinationd: if NSPACE_RESOLVER_MAX_OUTSTANDING requests are
 * already outstanding, sleep (interruptibly) until a slot frees up.
 * Returns 0 on success, or the msleep() error (e.g. EINTR) if the wait
 * was interrupted.
 *
 * NOTE(review): callers invoke this with NSPACE_REQ_LOCK() held; msleep
 * drops and re-acquires that mutex while sleeping.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/* Ask nspace_resolver_req_remove() for a wakeup when a slot frees. */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
10647
10648 static void
10649 nspace_resolver_req_remove(struct nspace_resolver_request *req)
10650 {
10651 struct nspace_resolver_requesthead *bucket;
10652
10653 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10654 #if DIAGNOSTIC
10655 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
10656 #endif /* DIAGNOSTIC */
10657 LIST_REMOVE(req, r_hashlink);
10658 nspace_resolver_request_count--;
10659
10660 if (nspace_resolver_request_wait_slot) {
10661 nspace_resolver_request_wait_slot = false;
10662 wakeup(&nspace_resolver_request_count);
10663 }
10664 }
10665
10666 static void
10667 nspace_resolver_req_cancel(uint32_t req_id)
10668 {
10669 kern_return_t kr;
10670 mach_port_t mp;
10671
10672 // Failures here aren't fatal -- the cancellation message
10673 // sent to the resolver is merely advisory.
10674
10675 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10676 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10677 return;
10678 }
10679
10680 kr = send_nspace_resolve_cancel(mp, req_id);
10681 if (kr != KERN_SUCCESS) {
10682 os_log_error(OS_LOG_DEFAULT,
10683 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10684 }
10685
10686 ipc_port_release_send(mp);
10687 }
10688
/*
 * Wait (interruptibly) for filecoordinationd to mark the request
 * complete.  If the sleep is interrupted, the request is failed with
 * EINTR (or ETIMEDOUT for other errors) and an advisory cancellation is
 * sent to the resolver.  The request is always removed from the table
 * before returning.  Returns the request's resolver errno.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: record our own error for the caller. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
		/* On ERESTART, just re-check the completion flag and keep waiting. */
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	if (send_cancel_message) {
		/* Advisory only; see nspace_resolver_req_cancel(). */
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10718
/*
 * Record the resolver's errno in the request, mark it complete, and wake
 * the thread sleeping on it in nspace_resolver_req_wait().
 *
 * NOTE(review): callers invoke this with NSPACE_REQ_LOCK() held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
10728
/*
 * Handle a completion notification from filecoordinationd (delivered by
 * a write to the vfs.nspace.complete sysctl).  The request is looked up
 * by ID; if it is no longer in the table the notification is silently
 * dropped — the requester was most likely interrupted.  When the
 * resolver reported success and supplied an orig_gencount, the vnode's
 * current recursive gencount is re-checked and the request is failed
 * with EBUSY if the directory changed during materialization.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		// NOTE(review): the mount rename lock appears to be taken so
		// the vnode cannot be renamed while the gencount is validated
		// below — confirm intent.
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
10796
/* The process currently registered as the namespace resolver (or NULL). */
static struct proc *nspace_resolver_proc;

/*
 * Report (via *is_resolver) whether process p is the currently
 * registered namespace resolver.  Always returns 0.
 */
static int
nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
{
	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) ? 1 : 0;
	return 0;
}
10806
/*
 * Register (is_resolver != 0) or unregister the calling process as the
 * namespace resolver.  Requires super-user credentials plus the
 * PRIV_VFS_DATALESS_RESOLVER privilege.  Only one resolver may be
 * registered at a time; a second registration fails with EBUSY.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0. This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	error = priv_check_cred(vfs_context_ucred(ctx),
	    PRIV_VFS_DATALESS_RESOLVER, 0);
	if (error) {
		return error;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
10851
10852 static int
10853 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10854 {
10855 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10856 (p->p_vfs_iopolicy &
10857 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10858 *is_prevented = 1;
10859 } else {
10860 *is_prevented = 0;
10861 }
10862 return 0;
10863 }
10864
10865 static int
10866 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
10867 {
10868 if (p->p_lflag & P_LNSPACE_RESOLVER) {
10869 return is_prevented ? 0 : EBUSY;
10870 }
10871
10872 if (is_prevented) {
10873 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
10874 } else {
10875 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
10876 }
10877 return 0;
10878 }
10879
10880 static int
10881 nspace_materialization_get_thread_state(int *is_prevented)
10882 {
10883 uthread_t ut = get_bsdthread_info(current_thread());
10884
10885 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10886 return 0;
10887 }
10888
10889 static int
10890 nspace_materialization_set_thread_state(int is_prevented)
10891 {
10892 uthread_t ut = get_bsdthread_info(current_thread());
10893
10894 if (is_prevented) {
10895 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10896 } else {
10897 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10898 }
10899 return 0;
10900 }
10901
10902 /* the vfs.nspace branch */
10903 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10904
10905 static int
10906 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10907 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10908 {
10909 struct proc *p = req->p;
10910 int new_value, old_value, changed = 0;
10911 int error;
10912
10913 error = nspace_resolver_get_proc_state(p, &old_value);
10914 if (error) {
10915 return error;
10916 }
10917
10918 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10919 &changed);
10920 if (error == 0 && changed) {
10921 error = nspace_resolver_set_proc_state(p, new_value);
10922 }
10923 return error;
10924 }
10925
10926 /* decorate this process as the dataless file resolver */
10927 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
10928 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10929 0, 0, sysctl_nspace_resolver, "I", "");
10930
10931 static int
10932 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
10933 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10934 {
10935 struct proc *p = req->p;
10936 int new_value, old_value, changed = 0;
10937 int error;
10938
10939 error = nspace_materialization_get_proc_state(p, &old_value);
10940 if (error) {
10941 return error;
10942 }
10943
10944 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10945 &changed);
10946 if (error == 0 && changed) {
10947 error = nspace_materialization_set_proc_state(p, new_value);
10948 }
10949 return error;
10950 }
10951
10952 /* decorate this process as not wanting to materialize dataless files */
10953 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
10954 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10955 0, 0, sysctl_nspace_prevent_materialization, "I", "");
10956
10957 static int
10958 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
10959 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10960 {
10961 int new_value, old_value, changed = 0;
10962 int error;
10963
10964 error = nspace_materialization_get_thread_state(&old_value);
10965 if (error) {
10966 return error;
10967 }
10968
10969 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10970 &changed);
10971 if (error == 0 && changed) {
10972 error = nspace_materialization_set_thread_state(new_value);
10973 }
10974 return error;
10975 }
10976
10977 /* decorate this thread as not wanting to materialize dataless files */
10978 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
10979 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10980 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
10981
/*
 * Handler for vfs.nspace.complete: filecoordinationd reports request
 * completion here.  Only the registered resolver may write.  The payload
 * is a { req_id, errno } pair of uint32_t's, optionally followed by a
 * uint64_t gencount (read by a second sysctl_io_opaque() call that
 * consumes the next bytes of the same request; its failure is ignored
 * because the gencount is optional).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	/* Only the decorated resolver process may signal completions. */
	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11031
11032 #endif /* CONFIG_DATALESS_FILES */
11033
/*
 * Marker for parameters that are referenced only when
 * CONFIG_DATALESS_FILES is enabled; expands to __unused on kernels
 * built without dataless-file support.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused    /* nothing */
#else
#define __no_dataless_unused    __unused
#endif
11039
/*
 * Decide whether the given vfs context may materialize dataless files.
 *
 * Returns:
 *   0           -- materialization may proceed
 *   EDEADLK     -- materialization is prevented (kernel context, thread
 *                  or process decorated no-materialize, or the default)
 *   EJUSTRETURN -- the caller is a dataless manipulator; proceed as if
 *                  the object were not dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11096
/*
 * Allocate the hash table of outstanding namespace-resolver requests.
 * No-op on kernels built without CONFIG_DATALESS_FILES.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11106
/*
 * Tear down resolver registration for process p.  If p is the currently
 * registered resolver, every outstanding request is failed with
 * ETIMEDOUT (waking its requester) and the registration is cleared.
 * Called on resolver exit and from nspace_resolver_set_proc_state()
 * when the resolver unregisters itself.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Fail every request in every hash bucket. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11132
/*
 * Materialize the dataless object at vp for namespace operation `op'.
 * Convenience wrapper around resolve_nspace_item_ext() with no extra
 * argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11138
#define DATALESS_RESOLVER_ENTITLEMENT     \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"

/*
 * Return TRUE if the vfs context is associated with a process entitled
 * for dataless manipulation.
 *
 * XXX Arguably belongs in vfs_subr.c, but is here because of the
 * complication around CONFIG_DATALESS_FILES.
 */
boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
{
#if CONFIG_DATALESS_FILES
	/* Entitlements are checked against the current task, so the
	 * context must belong to the calling thread. */
	assert(ctx->vc_thread == current_thread());
	task_t const task = current_task();
	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
#else
	return false;
#endif /* CONFIG_DATALESS_FILES */
}
11163
/*
 * Materialize the dataless object at vp by issuing an up-call to
 * filecoordinationd and (interruptibly) waiting for completion.  `op'
 * identifies the namespace operation; `arg' is currently unused.
 *
 * Returns 0 on success; EFTYPE for unsupported vnode types; ENOTSUP for
 * snapshot events; EDEADLK/EJUSTRETURN when materialization is
 * prevented for this context; ETIMEDOUT when the resolver cannot be
 * reached; or the errno reported by the resolver.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process/thread is decorated as no-materialization");
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223;   /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) { // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    current_proc()->p_pid, (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
11282
/*
 * Snapshot-event hook; currently a no-op that reports success.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
11289
#if 0
/*
 * Currently compiled out: build a "/.vol/<fsid>/<fileid>" style path for
 * vp into `path' (capacity *len).  On success returns 0; if the vnode's
 * attributes cannot be fetched, writes a placeholder path and returns -1.
 * In both cases *len is set to the formatted length plus one.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
11312
11313 static unsigned long
11314 fsctl_bogus_command_compat(unsigned long cmd)
11315 {
11316 switch (cmd) {
11317 case IOCBASECMD(FSIOC_SYNC_VOLUME):
11318 return FSIOC_SYNC_VOLUME;
11319 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
11320 return FSIOC_ROUTEFS_SETROUTEID;
11321 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
11322 return FSIOC_SET_PACKAGE_EXTS;
11323 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
11324 return FSIOC_SET_FSTYPENAME_OVERRIDE;
11325 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
11326 return DISK_CONDITIONER_IOC_GET;
11327 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
11328 return DISK_CONDITIONER_IOC_SET;
11329 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
11330 return FSIOC_FIOSEEKHOLE;
11331 case IOCBASECMD(FSIOC_FIOSEEKDATA):
11332 return FSIOC_FIOSEEKDATA;
11333 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
11334 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
11335 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
11336 return SPOTLIGHT_IOC_GET_LAST_MTIME;
11337 }
11338
11339 return cmd;
11340 }
11341
/*
 * chflags0() callback: ask the filesystem to compare-and-swap the BSD
 * flags via the FSIOC_CAS_BSDFLAGS ioctl.  `arg' is the caller's
 * struct fsioc_cas_bsdflags.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
11347
/*
 * FSIOC_SYNC_VOLUME handler: sync the volume containing vp.  `data'
 * points at the user-supplied FSCTL_SYNC_* request word.  The caller's
 * iocount on vp is always consumed here; *arg_vp is cleared so the
 * caller knows not to release it again.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC is a set of flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests `arg' (now holding MNT_* wait flags)
	 * against FSCTL_SYNC_FULLSYNC rather than testing the original
	 * user-supplied word at *(uint32_t *)data — verify this is the
	 * intended condition.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
11410
11411 #if ROUTEFS
11412 static int __attribute__((noinline))
11413 handle_routes(user_addr_t udata)
11414 {
11415 char routepath[MAXPATHLEN];
11416 size_t len = 0;
11417 int error;
11418
11419 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11420 return error;
11421 }
11422 bzero(routepath, MAXPATHLEN);
11423 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
11424 if (error) {
11425 return error;
11426 }
11427 error = routefs_kernel_mount(routepath);
11428 return error;
11429 }
11430 #endif
11431
11432 static int __attribute__((noinline))
11433 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
11434 {
11435 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
11436 struct vnode_attr va;
11437 int error;
11438
11439 VATTR_INIT(&va);
11440 VATTR_SET(&va, va_flags, cas->new_flags);
11441
11442 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
11443 return error;
11444 }
11445
11446 static int __attribute__((noinline))
11447 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
11448 {
11449 struct mount *mp = NULL;
11450 errno_t rootauth = 0;
11451
11452 mp = vp->v_mount;
11453
11454 /*
11455 * query the underlying FS and see if it reports something
11456 * sane for this vnode. If volume is authenticated via
11457 * chunklist, leave that for the caller to determine.
11458 */
11459 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
11460
11461 return rootauth;
11462 }
11463
/*
 * Make a filesystem-specific control call:
 *
 * Common worker for fsctl(2) and ffsctl(2).  Marshals the ioctl argument
 * (copyin for IOC_IN, zeroed buffer for IOC_OUT, raw udata for IOC_VOID),
 * dispatches generic FSIOC_* commands in-kernel, and passes everything
 * else to the filesystem via VNOP_IOCTL.  On success of an IOC_OUT
 * command, the result buffer is copied back out to `udata`.
 *
 * *arg_vp may be set to NULL on return (see handle_sync_volume) to tell
 * the caller its iocount has already been dropped.
 */
/* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* Device special files take the ioctl(2) path, not fsctl. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Remap legacy/mis-encoded command values to their modern form. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Large arguments go to the heap; small ones use the stack buffer. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kheap_alloc(KHEAP_TEMP, size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kheap_free(KHEAP_TEMP, memp, size);
				}
				return error;
			}
		} else {
			/* Zero-length IOC_IN: pass the user pointer itself by value. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* May drop the iocount and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		/* NOTE(review): with ROUTEFS disabled this silently returns 0. */
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* Root only: overrides the fstype name reported by statfs. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				int i;
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data string which has no
				 * NULL termination in its first MFSTYPENAMELEN bytes.
				 * This is bogus, let's avoid strlcpy-ing the read data and
				 * return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				/* Special-case: read-only "mtmfs" gains extended security. */
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* Empty string clears the override (and mtmfs side effects). */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
unlock:
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* EBUSY unless the caller's fd is the only open of this file. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				/* Ignore usecounts held by associated stream vnodes. */
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
			/* fcntl(2)-only commands: reject and skip the copyout. */
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kheap_free(KHEAP_TEMP, memp, size);
	}

	return error;
}
11705
11706 /* ARGSUSED */
11707 int
11708 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
11709 {
11710 int error;
11711 struct nameidata nd;
11712 uint32_t nameiflags;
11713 vnode_t vp = NULL;
11714 vfs_context_t ctx = vfs_context_current();
11715
11716 AUDIT_ARG(cmd, (int)uap->cmd);
11717 AUDIT_ARG(value32, uap->options);
11718 /* Get the vnode for the file we are getting info on: */
11719 nameiflags = 0;
11720 //
11721 // if we come through fsctl() then the file is by definition not open.
11722 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
11723 // lest the caller mistakenly thinks the only open is their own (but in
11724 // reality it's someone elses).
11725 //
11726 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
11727 return EINVAL;
11728 }
11729 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11730 nameiflags |= FOLLOW;
11731 }
11732 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
11733 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
11734 }
11735 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
11736 UIO_USERSPACE, uap->path, ctx);
11737 if ((error = namei(&nd))) {
11738 goto done;
11739 }
11740 vp = nd.ni_vp;
11741 nameidone(&nd);
11742
11743 #if CONFIG_MACF
11744 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
11745 if (error) {
11746 goto done;
11747 }
11748 #endif
11749
11750 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11751
11752 done:
11753 if (vp) {
11754 vnode_put(vp);
11755 }
11756 return error;
11757 }
11758 /* ARGSUSED */
11759 int
11760 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
11761 {
11762 int error;
11763 vnode_t vp = NULL;
11764 vfs_context_t ctx = vfs_context_current();
11765 int fd = -1;
11766
11767 AUDIT_ARG(fd, uap->fd);
11768 AUDIT_ARG(cmd, (int)uap->cmd);
11769 AUDIT_ARG(value32, uap->options);
11770
11771 /* Get the vnode for the file we are getting info on: */
11772 if ((error = file_vnode(uap->fd, &vp))) {
11773 return error;
11774 }
11775 fd = uap->fd;
11776 if ((error = vnode_getwithref(vp))) {
11777 file_drop(fd);
11778 return error;
11779 }
11780
11781 #if CONFIG_MACF
11782 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
11783 file_drop(fd);
11784 vnode_put(vp);
11785 return error;
11786 }
11787 #endif
11788
11789 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11790
11791 file_drop(fd);
11792
11793 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
11794 if (vp) {
11795 vnode_put(vp);
11796 }
11797
11798 return error;
11799 }
11800 /* end of fsctl system call */
11801
/* Entitlement that grants access to the KAUTH_FILESEC_XATTR attribute. */
#define FILESEC_ACCESS_ENTITLEMENT \
	"com.apple.private.vfs.filesec-access"
11804
11805 static int
11806 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
11807 {
11808 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
11809 /*
11810 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
11811 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
11812 */
11813 if ((!setting && vfs_context_issuser(ctx)) ||
11814 IOTaskHasEntitlement(current_task(),
11815 FILESEC_ACCESS_ENTITLEMENT)) {
11816 return 0;
11817 }
11818 }
11819
11820 return EPERM;
11821 }
11822
/*
 * Retrieve the data of an extended attribute.
 *
 * getxattr(2), path-based.  With a destination buffer, *retval is the
 * number of bytes copied; without one (or with the legacy size == -1
 * probe, see below), *retval is the attribute's total size.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];

	/* These options are for in-kernel callers only. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected xattrs require root or the filesec entitlement to read. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binaray compatibilty in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implemtation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying fileystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp the kernel-wired allocation the FS will make (see above). */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With auio == NULL this is a pure size query. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
11908
11909 /*
11910 * Retrieve the data of an extended attribute.
11911 */
11912 int
11913 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
11914 {
11915 vnode_t vp;
11916 char attrname[XATTR_MAXNAMELEN + 1];
11917 vfs_context_t ctx = vfs_context_current();
11918 uio_t auio = NULL;
11919 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11920 size_t attrsize = 0;
11921 size_t namelen;
11922 int error;
11923 char uio_buf[UIO_SIZEOF(1)];
11924
11925 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11926 return EINVAL;
11927 }
11928
11929 if ((error = file_vnode(uap->fd, &vp))) {
11930 return error;
11931 }
11932 if ((error = vnode_getwithref(vp))) {
11933 file_drop(uap->fd);
11934 return error;
11935 }
11936 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11937 if (error != 0) {
11938 goto out;
11939 }
11940 if (xattr_protected(attrname) &&
11941 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
11942 goto out;
11943 }
11944 if (uap->value && uap->size > 0) {
11945 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
11946 &uio_buf[0], sizeof(uio_buf));
11947 uio_addiov(auio, uap->value, uap->size);
11948 }
11949
11950 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
11951 out:
11952 (void)vnode_put(vp);
11953 file_drop(uap->fd);
11954
11955 if (auio) {
11956 *retval = uap->size - uio_resid(auio);
11957 } else {
11958 *retval = (user_ssize_t)attrsize;
11959 }
11960 return error;
11961 }
11962
11963 /*
11964 * Set the data of an extended attribute.
11965 */
11966 int
11967 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
11968 {
11969 vnode_t vp;
11970 struct nameidata nd;
11971 char attrname[XATTR_MAXNAMELEN + 1];
11972 vfs_context_t ctx = vfs_context_current();
11973 uio_t auio = NULL;
11974 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11975 size_t namelen;
11976 u_int32_t nameiflags;
11977 int error;
11978 char uio_buf[UIO_SIZEOF(1)];
11979
11980 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11981 return EINVAL;
11982 }
11983
11984 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11985 if (error != 0) {
11986 if (error == EPERM) {
11987 /* if the string won't fit in attrname, copyinstr emits EPERM */
11988 return ENAMETOOLONG;
11989 }
11990 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11991 return error;
11992 }
11993 if (xattr_protected(attrname) &&
11994 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
11995 return error;
11996 }
11997 if (uap->size != 0 && uap->value == 0) {
11998 return EINVAL;
11999 }
12000 if (uap->size > INT_MAX) {
12001 return E2BIG;
12002 }
12003
12004 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12005 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
12006 if ((error = namei(&nd))) {
12007 return error;
12008 }
12009 vp = nd.ni_vp;
12010 nameidone(&nd);
12011
12012 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12013 &uio_buf[0], sizeof(uio_buf));
12014 uio_addiov(auio, uap->value, uap->size);
12015
12016 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
12017 #if CONFIG_FSE
12018 if (error == 0) {
12019 add_fsevent(FSE_XATTR_MODIFIED, ctx,
12020 FSE_ARG_VNODE, vp,
12021 FSE_ARG_DONE);
12022 }
12023 #endif
12024 vnode_put(vp);
12025 *retval = 0;
12026 return error;
12027 }
12028
12029 /*
12030 * Set the data of an extended attribute.
12031 */
12032 int
12033 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
12034 {
12035 vnode_t vp;
12036 char attrname[XATTR_MAXNAMELEN + 1];
12037 vfs_context_t ctx = vfs_context_current();
12038 uio_t auio = NULL;
12039 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12040 size_t namelen;
12041 int error;
12042 char uio_buf[UIO_SIZEOF(1)];
12043
12044 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12045 return EINVAL;
12046 }
12047
12048 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12049 if (error != 0) {
12050 if (error == EPERM) {
12051 /* if the string won't fit in attrname, copyinstr emits EPERM */
12052 return ENAMETOOLONG;
12053 }
12054 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
12055 return error;
12056 }
12057 if (xattr_protected(attrname) &&
12058 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
12059 return error;
12060 }
12061 if (uap->size != 0 && uap->value == 0) {
12062 return EINVAL;
12063 }
12064 if (uap->size > INT_MAX) {
12065 return E2BIG;
12066 }
12067 if ((error = file_vnode(uap->fd, &vp))) {
12068 return error;
12069 }
12070 if ((error = vnode_getwithref(vp))) {
12071 file_drop(uap->fd);
12072 return error;
12073 }
12074 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12075 &uio_buf[0], sizeof(uio_buf));
12076 uio_addiov(auio, uap->value, uap->size);
12077
12078 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
12079 #if CONFIG_FSE
12080 if (error == 0) {
12081 add_fsevent(FSE_XATTR_MODIFIED, ctx,
12082 FSE_ARG_VNODE, vp,
12083 FSE_ARG_DONE);
12084 }
12085 #endif
12086 vnode_put(vp);
12087 file_drop(uap->fd);
12088 *retval = 0;
12089 return error;
12090 }
12091
12092 /*
12093 * Remove an extended attribute.
12094 * XXX Code duplication here.
12095 */
12096 int
12097 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
12098 {
12099 vnode_t vp;
12100 struct nameidata nd;
12101 char attrname[XATTR_MAXNAMELEN + 1];
12102 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12103 vfs_context_t ctx = vfs_context_current();
12104 size_t namelen;
12105 u_int32_t nameiflags;
12106 int error;
12107
12108 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12109 return EINVAL;
12110 }
12111
12112 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12113 if (error != 0) {
12114 return error;
12115 }
12116 if (xattr_protected(attrname)) {
12117 return EPERM;
12118 }
12119 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12120 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
12121 if ((error = namei(&nd))) {
12122 return error;
12123 }
12124 vp = nd.ni_vp;
12125 nameidone(&nd);
12126
12127 error = vn_removexattr(vp, attrname, uap->options, ctx);
12128 #if CONFIG_FSE
12129 if (error == 0) {
12130 add_fsevent(FSE_XATTR_REMOVED, ctx,
12131 FSE_ARG_VNODE, vp,
12132 FSE_ARG_DONE);
12133 }
12134 #endif
12135 vnode_put(vp);
12136 *retval = 0;
12137 return error;
12138 }
12139
12140 /*
12141 * Remove an extended attribute.
12142 * XXX Code duplication here.
12143 */
12144 int
12145 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
12146 {
12147 vnode_t vp;
12148 char attrname[XATTR_MAXNAMELEN + 1];
12149 size_t namelen;
12150 int error;
12151 #if CONFIG_FSE
12152 vfs_context_t ctx = vfs_context_current();
12153 #endif
12154
12155 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12156 return EINVAL;
12157 }
12158
12159 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12160 if (error != 0) {
12161 return error;
12162 }
12163 if (xattr_protected(attrname)) {
12164 return EPERM;
12165 }
12166 if ((error = file_vnode(uap->fd, &vp))) {
12167 return error;
12168 }
12169 if ((error = vnode_getwithref(vp))) {
12170 file_drop(uap->fd);
12171 return error;
12172 }
12173
12174 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
12175 #if CONFIG_FSE
12176 if (error == 0) {
12177 add_fsevent(FSE_XATTR_REMOVED, ctx,
12178 FSE_ARG_VNODE, vp,
12179 FSE_ARG_DONE);
12180 }
12181 #endif
12182 vnode_put(vp);
12183 file_drop(uap->fd);
12184 *retval = 0;
12185 return error;
12186 }
12187
12188 /*
12189 * Retrieve the list of extended attribute names.
12190 * XXX Code duplication here.
12191 */
12192 int
12193 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
12194 {
12195 vnode_t vp;
12196 struct nameidata nd;
12197 vfs_context_t ctx = vfs_context_current();
12198 uio_t auio = NULL;
12199 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12200 size_t attrsize = 0;
12201 u_int32_t nameiflags;
12202 int error;
12203 char uio_buf[UIO_SIZEOF(1)];
12204
12205 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12206 return EINVAL;
12207 }
12208
12209 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12210 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
12211 if ((error = namei(&nd))) {
12212 return error;
12213 }
12214 vp = nd.ni_vp;
12215 nameidone(&nd);
12216 if (uap->namebuf != 0 && uap->bufsize > 0) {
12217 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
12218 &uio_buf[0], sizeof(uio_buf));
12219 uio_addiov(auio, uap->namebuf, uap->bufsize);
12220 }
12221
12222 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
12223
12224 vnode_put(vp);
12225 if (auio) {
12226 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12227 } else {
12228 *retval = (user_ssize_t)attrsize;
12229 }
12230 return error;
12231 }
12232
12233 /*
12234 * Retrieve the list of extended attribute names.
12235 * XXX Code duplication here.
12236 */
12237 int
12238 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
12239 {
12240 vnode_t vp;
12241 uio_t auio = NULL;
12242 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12243 size_t attrsize = 0;
12244 int error;
12245 char uio_buf[UIO_SIZEOF(1)];
12246
12247 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12248 return EINVAL;
12249 }
12250
12251 if ((error = file_vnode(uap->fd, &vp))) {
12252 return error;
12253 }
12254 if ((error = vnode_getwithref(vp))) {
12255 file_drop(uap->fd);
12256 return error;
12257 }
12258 if (uap->namebuf != 0 && uap->bufsize > 0) {
12259 auio = uio_createwithbuffer(1, 0, spacetype,
12260 UIO_READ, &uio_buf[0], sizeof(uio_buf));
12261 uio_addiov(auio, uap->namebuf, uap->bufsize);
12262 }
12263
12264 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
12265
12266 vnode_put(vp);
12267 file_drop(uap->fd);
12268 if (auio) {
12269 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12270 } else {
12271 *retval = (user_ssize_t)attrsize;
12272 }
12273 return error;
12274 }
12275
/*
 * Resolve a (volfs_id, objid) pair to an absolute path.
 *
 * Busies the mount identified by `volfs_id`, vgets the vnode for `objid`
 * (falling back through union-mount covered volumes on ENOENT), and
 * builds the path into `buf` (at most `bufsize` <= PAGE_SIZE bytes).
 * On success, *pathlen holds the path length including the NUL.
 * build_path() races are retried a bounded number of times.
 */
static int
fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
    vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
{
	int error;
	struct mount *mp = NULL;
	vnode_t vp;
	int length;
	int bpflags;
	/* maximum number of times to retry build_path */
	unsigned int retries = 0x10;

	if (bufsize > PAGE_SIZE) {
		return EINVAL;
	}

	if (buf == NULL) {
		return ENOMEM;
	}

retry:
	/* Take a busy reference on the mount; dropped via vfs_unbusy below. */
	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
		/* NOTE(review): the assignment to `error` here is a dead store. */
		error = ENOTSUP; /* unexpected failure */
		return ENOTSUP;
	}

unionget:
	if (objid == 2) {
		/* objid 2 conventionally means "the volume root" here. */
		struct vfs_attr vfsattr;
		int use_vfs_root = TRUE;

		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		/*
		 * Volume-group filesystems resolve id 2 through VFS_VGET
		 * (unless the caller asked for the real fsid semantics).
		 */
		if (!(options & FSOPT_ISREALFSID) &&
		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
				use_vfs_root = FALSE;
			}
		}

		if (use_vfs_root) {
			error = VFS_ROOT(mp, &vp, ctx);
		} else {
			error = VFS_VGET(mp, objid, &vp, ctx);
		}
	} else {
		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
	}

	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
		/*
		 * If the fileid isn't found and we're in a union
		 * mount volume, then see if the fileid is in the
		 * mounted-on volume.
		 */
		struct mount *tmp = mp;
		mp = vnode_mount(tmp->mnt_vnodecovered);
		vfs_unbusy(tmp);
		/* Busy the covered mount before retrying; give up if contended. */
		if (vfs_busy(mp, LK_NOWAIT) == 0) {
			goto unionget;
		}
	} else {
		vfs_unbusy(mp);
	}

	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_fsgetpath(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* Obtain the absolute path to this vnode. */
	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
	if (options & FSOPT_NOFIRMLINKPATH) {
		bpflags |= BUILDPATH_NO_FIRMLINK;
	}
	bpflags |= BUILDPATH_CHECK_MOVED;
	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
	vnode_put(vp);

	if (error) {
		/* there was a race building the path, try a few more times */
		if (error == EAGAIN) {
			--retries;
			if (retries > 0) {
				goto retry;
			}

			error = ENOENT;
		}
		goto out;
	}

	AUDIT_ARG(text, buf);

	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
		/*
		 * Emit the (possibly truncated-from-the-left) path to ktrace.
		 * NOTE(review): `vp` has already been vnode_put() above; it is
		 * passed to kdebug_vfs_lookup purely as an identifier — confirm
		 * the tracing KPI never dereferences it.
		 */
		unsigned long path_words[NUMPARMS];
		size_t path_len = sizeof(path_words);

		if ((size_t)length < path_len) {
			memcpy((char *)path_words, buf, length);
			memset((char *)path_words + length, 0, path_len - length);

			path_len = length;
		} else {
			memcpy((char *)path_words, buf + (length - path_len), path_len);
		}

		kdebug_vfs_lookup(path_words, (int)path_len, vp,
		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
	}

	*pathlen = length; /* may be superseded by error */

out:
	return error;
}
12401
12402 /*
12403 * Obtain the full pathname of a file system object by id.
12404 */
12405 static int
12406 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
12407 uint32_t options, user_ssize_t *retval)
12408 {
12409 vfs_context_t ctx = vfs_context_current();
12410 fsid_t fsid;
12411 char *realpath;
12412 int length;
12413 int error;
12414
12415 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
12416 return EINVAL;
12417 }
12418
12419 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
12420 return error;
12421 }
12422 AUDIT_ARG(value32, fsid.val[0]);
12423 AUDIT_ARG(value64, objid);
12424 /* Restrict output buffer size for now. */
12425
12426 if (bufsize > PAGE_SIZE || bufsize <= 0) {
12427 return EINVAL;
12428 }
12429 realpath = kheap_alloc(KHEAP_TEMP, bufsize, Z_WAITOK | Z_ZERO);
12430 if (realpath == NULL) {
12431 return ENOMEM;
12432 }
12433
12434 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
12435 options, &length);
12436
12437 if (error) {
12438 goto out;
12439 }
12440
12441 error = copyout((caddr_t)realpath, buf, length);
12442
12443 *retval = (user_ssize_t)length; /* may be superseded by error */
12444 out:
12445 kheap_free(KHEAP_TEMP, realpath, bufsize);
12446 return error;
12447 }
12448
/*
 * fsgetpath(2): resolve (fsid, objid) to an absolute path with no options.
 */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
12455
/*
 * fsgetpath_ext(2): like fsgetpath(2) but accepts FSOPT_* options
 * (FSOPT_NOFIRMLINKPATH, FSOPT_ISREALFSID).
 */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
12462
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Marshals *sfsp into either the 64-bit or 32-bit user-space statfs
 * layout and copies it out to bufp.  When partial_copy is set, the
 * trailing f_reserved3/f_reserved4 fields are omitted from the copyout
 * (legacy statfs() callers have a shorter structure).  On return,
 * *sizep (if non-NULL) is always set to the full native structure size,
 * regardless of how much was actually copied out.
 *
 * Returns: 0 Success
 * EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		/* Zero first so struct padding never leaks kernel memory. */
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		/* 64-bit user longs: all counts fit without scaling. */
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* Some mounts masquerade as a different fs type (e.g. for compatibility). */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* Legacy callers don't have the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		/* Zero first so struct padding never leaks kernel memory. */
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
			/* Shift the count down, saturating at INT_MAX if it still won't fit. */
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			/* Inflate the blocksize by the same shift so total size stays honest. */
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* Some mounts masquerade as a different fs type (e.g. for compatibility). */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* Legacy callers don't have the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* Always report the full native size, even on a partial copy. */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
12591
/*
 * copy stat structure into user_stat structure.
 *
 * Field-by-field marshalling of the kernel's struct stat into the 64-bit
 * user-space layout.  No narrowing occurs; all fields copy at full width.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/* Zero the target first so padding never leaks kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timestamp member names depend on POSIX namespace visibility. */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12631
/*
 * Copy the kernel's struct stat into the 32-bit user-space stat layout.
 * Time fields are explicitly narrowed to the 32-bit user types.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero the target first so padding never leaks kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timestamps narrow to 32-bit user types (member names depend on
	 * POSIX namespace visibility). */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12668
/*
 * copy stat64 structure into user_stat64 structure.
 *
 * Like munge_user64_stat(), plus the st_birthtime fields that only the
 * stat64 variants carry.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero the target first so padding never leaks kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timestamp member names depend on POSIX namespace visibility. */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12712
/*
 * Copy the kernel's struct stat64 into the 32-bit user-space stat64
 * layout.  Time fields (including st_birthtime) are explicitly narrowed
 * to the 32-bit user types.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero the target first so padding never leaks kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timestamps narrow to 32-bit user types (member names depend on
	 * POSIX namespace visibility). */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12753
12754 /*
12755 * Purge buffer cache for simulating cold starts
12756 */
12757 static int
12758 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
12759 {
12760 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
12761
12762 return VNODE_RETURNED;
12763 }
12764
12765 static int
12766 vfs_purge_callback(mount_t mp, __unused void * arg)
12767 {
12768 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
12769
12770 return VFS_RETURNED;
12771 }
12772
12773 int
12774 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
12775 {
12776 if (!kauth_cred_issuser(kauth_cred_get())) {
12777 return EPERM;
12778 }
12779
12780 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
12781
12782 return 0;
12783 }
12784
12785 /*
12786 * gets the vnode associated with the (unnamed) snapshot directory
12787 * for a Filesystem. The snapshot directory vnode is returned with
12788 * an iocount on it.
12789 */
12790 int
12791 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
12792 {
12793 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
12794 }
12795
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 *
 * dirfd names the root of the target filesystem; name is the user-space
 * snapshot name; op/pathop parameterize the namei() lookup so the same
 * routine serves create, delete, and lookup callers.
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Translate dirfd into a vnode; returns it with an iocount held. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* dirfd must refer to the filesystem root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Grab the (unnamed) snapshot directory, with an iocount. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; stopping before name_len means one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC hooks exist for create/delete only; other ops pass through. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any error, drop both iocounts so callers need no cleanup. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
12909
/*
 * create a filesystem snapshot (for supporting filesystems)
 *
 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
 * We get to the (unnamed) snapshot directory vnode and create the vnode
 * for the snapshot in it.
 *
 * Restrictions:
 *
 *    a) Passed in name for snapshot cannot have slashes.
 *    b) name can't be "." or ".."
 *
 * Since this requires superuser privileges, vnode_authorize calls are not
 * made.
 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kheap_alloc(KHEAP_TEMP, sizeof(*ndp), Z_WAITOK);

	/* On success: iocounts held on rvp and snapdvp, ndp needs nameidone(). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Lookup found a vnode: a snapshot by this name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kheap_alloc(KHEAP_TEMP, sizeof(*vap), Z_WAITOK);

		/* The snapshot appears as a regular file with mode 0. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Superuser-only path: skip authorization and ACL inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kheap_free(KHEAP_TEMP, vap, sizeof(*vap));
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp));

	return error;
}
12971
/*
 * Delete a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * delete the snapshot.
 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kheap_alloc(KHEAP_TEMP, sizeof(*ndp), Z_WAITOK);

	/* DELETE lookup: on success ndp->ni_vp is the snapshot vnode. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Skip namespace events: snapshots aren't regular directory entries. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp));

	return error;
}
13006
/*
 * Revert a filesystem to a snapshot
 *
 * Marks the filesystem to revert to the given snapshot on next mount.
 *
 * Tries the mount-level VFSIOC_REVERT_SNAPSHOT ioctl first; if that
 * fails for any reason, falls back to a VNOP_IOCTL directly on the
 * snapshot vnode (APFSIOC_REVERT_TO_SNAPSHOT).
 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Package the name as a componentname, as the FS ioctl expects. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Re-resolve the snapshot vnode for the direct-ioctl fallback. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
13095
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kheap_alloc(KHEAP_TEMP, sizeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Resolve the existing snapshot (DELETE lookup: we're unlinking "old"). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is suppossed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; stopping before name_len means one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to "new" is effectively creating a snapshot by that name. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name in the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kheap_free(KHEAP_TEMP, __rename_data, sizeof(*__rename_data));
	return error;
}
13198
/*
 * Mount a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * mount the snapshot.
 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kheap_alloc(KHEAP_TEMP,
	    sizeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/* Resolve the snapshot vnode to be mounted. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* Bail if the source filesystem has gone away underneath us. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/* Hand the source mount and snapshot name to mount_common(). */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kheap_free(KHEAP_TEMP, __snapshot_mount_data,
	    sizeof(*__snapshot_mount_data));
	return error;
}
13282
/*
 * Root from a snapshot of the filesystem
 *
 * Marks the filesystem to root from the given snapshot on next boot.
 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Package the name as a componentname, as the FS ioctl expects. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
13343
/*
 * FS snapshot operations dispatcher
 *
 * fs_snapshot() syscall entry point.  Checks the PRIV_VFS_SNAPSHOT
 * privilege, enforces write authorization on the backing device for all
 * ops except mount, then dispatches on uap->op to the per-op helpers.
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot ops require the VFS snapshot privilege. */
	error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
	if (error) {
		return error;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/* Non-superusers must be able to write the backing device. */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0)) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation helper. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}