/*
 * bsd/vfs/vfs_syscalls.c — Apple XNU source (xnu-6153.141.1).
 */
1 /*
2 * Copyright (c) 1995-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/fsctl.h>
101 #include <sys/ubc_internal.h>
102 #include <sys/disk.h>
103 #include <sys/content_protection.h>
104 #include <sys/clonefile.h>
105 #include <sys/snapshot.h>
106 #include <sys/priv.h>
107 #include <sys/fsgetpath.h>
108 #include <machine/cons.h>
109 #include <machine/limits.h>
110 #include <miscfs/specfs/specdev.h>
111
112 #include <vfs/vfs_disk_conditioner.h>
113
114 #include <security/audit/audit.h>
115 #include <bsm/audit_kevents.h>
116
117 #include <mach/mach_types.h>
118 #include <kern/kern_types.h>
119 #include <kern/kalloc.h>
120 #include <kern/task.h>
121
122 #include <vm/vm_pageout.h>
123 #include <vm/vm_protos.h>
124
125 #include <libkern/OSAtomic.h>
126 #include <pexpert/pexpert.h>
127 #include <IOKit/IOBSD.h>
128
129 // deps for MIG call
130 #include <kern/host.h>
131 #include <kern/ipc_misc.h>
132 #include <mach/host_priv.h>
133 #include <mach/vfs_nspace.h>
134 #include <os/log.h>
135
136 #include <nfs/nfs_conf.h>
137
138 #if ROUTEFS
139 #include <miscfs/routefs/routefs.h>
140 #endif /* ROUTEFS */
141
142 #if CONFIG_MACF
143 #include <security/mac.h>
144 #include <security/mac_framework.h>
145 #endif
146
147 #if CONFIG_FSE
148 #define GET_PATH(x) \
149 (x) = get_pathbuff();
150 #define RELEASE_PATH(x) \
151 release_pathbuff(x);
152 #else
153 #define GET_PATH(x) \
154 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
155 #define RELEASE_PATH(x) \
156 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
157 #endif /* CONFIG_FSE */
158
159 #ifndef HFS_GET_BOOT_INFO
160 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
161 #endif
162
163 #ifndef HFS_SET_BOOT_INFO
164 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
165 #endif
166
167 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
168 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
169 #endif
170
171 extern void disk_conditioner_unmount(mount_t mp);
172
173 /* struct for checkdirs iteration */
174 struct cdirargs {
175 vnode_t olddp;
176 vnode_t newdp;
177 };
178 /* callback for checkdirs iteration */
179 static int checkdirs_callback(proc_t p, void * arg);
180
181 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
182 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
183 void enablequotas(struct mount *mp, vfs_context_t ctx);
184 static int getfsstat_callback(mount_t mp, void * arg);
185 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
186 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
187 static int sync_callback(mount_t, void *);
188 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
189 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
190 boolean_t partial_copy);
191 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
192 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
193 struct componentname *cnp, user_addr_t fsmountargs,
194 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
195 vfs_context_t ctx);
196 void vfs_notify_mount(vnode_t pdvp);
197
198 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
199
200 struct fd_vn_data * fg_vn_data_alloc(void);
201
202 /*
203 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
204 * Concurrent lookups (or lookups by ids) on hard links can cause the
205 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
206 * does) to return ENOENT as the path cannot be returned from the name cache
207 * alone. We have no option but to retry and hope to get one namei->reverse path
208 * generation done without an intervening lookup, lookup by id on the hard link
209 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
210 * which currently are the MAC hooks for rename, unlink and rmdir.
211 */
212 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
213
214 /* Max retry limit for rename due to vnode recycling. */
215 #define MAX_RENAME_ERECYCLE_RETRIES 1024
216
217 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
218 int unlink_flags);
219
220 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *);
221
222 #ifdef CONFIG_IMGSRC_ACCESS
223 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
224 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
225 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
226 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
227 static void mount_end_update(mount_t mp);
228 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
229 #endif /* CONFIG_IMGSRC_ACCESS */
230
231 #if CONFIG_LOCKERBOOT
232 int mount_locker_protoboot(const char *fsname, const char *mntpoint,
233 const char *pbdevpath);
234 #endif
235
236 //snapshot functions
237 #if CONFIG_MNT_ROOTSNAP
238 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
239 #else
240 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
241 #endif
242
243 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
244
245 __private_extern__
246 int sync_internal(void);
247
248 __private_extern__
249 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
250
251 extern lck_grp_t *fd_vn_lck_grp;
252 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
253 extern lck_attr_t *fd_vn_lck_attr;
254
255 /*
256 * incremented each time a mount or unmount operation occurs
257 * used to invalidate the cached value of the rootvp in the
258 * mount structure utilized by cache_lookup_path
259 */
260 uint32_t mount_generation = 0;
261
262 /* counts number of mount and unmount operations */
263 unsigned int vfs_nummntops = 0;
264
265 extern const struct fileops vnops;
266 #if CONFIG_APPLEDOUBLE
267 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
268 #endif /* CONFIG_APPLEDOUBLE */
269
270 /*
271 * Virtual File System System Calls
272 */
273
274 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
275 /*
276 * Private in-kernel mounting spi (NFS only, not exported)
277 */
278 __private_extern__
279 boolean_t
280 vfs_iskernelmount(mount_t mp)
281 {
282 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
283 }
284
/*
 * kernel_mount
 *
 * Private in-kernel mount entry point (NFS / devfs / routefs only).
 *
 * If 'vp' is NULLVP, 'path' is looked up to find the vnode to cover and
 * its parent; namei() then holds iocounts on both, which are dropped
 * here after mount_common() returns.  If the caller supplies 'vp' (and
 * 'pvp'), the caller retains ownership of those references and 'path'
 * is only used to fill in the componentname passed down.
 *
 * Returns: 0 on success, errno on failure.
 */
__private_extern__
int
kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	/* path is a kernel pointer, hence UIO_SYSSPACE + CAST_USER_ADDR_T */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log the failure for boot-critical role mounts only */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VMVOL | KERNEL_MOUNT_DATAVOL)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller provided the covered vnode; just hand the path to
		 * mount_common() through the componentname.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
		did_namei = FALSE;
	}

	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, TRUE, ctx);

	/* Release the iocounts namei() took (WANTPARENT => both vp and pvp) */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
#endif /* CONFIG_NFS_CLIENT || DEVFS || ROUTEFS */
331
332 /*
333 * Mount a file system.
334 */
335 /* ARGSUSED */
336 int
337 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
338 {
339 struct __mac_mount_args muap;
340
341 muap.type = uap->type;
342 muap.path = uap->path;
343 muap.flags = uap->flags;
344 muap.data = uap->data;
345 muap.mac_p = USER_ADDR_NULL;
346 return __mac_mount(p, &muap, retval);
347 }
348
349 int
350 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
351 {
352 struct componentname cn;
353 vfs_context_t ctx = vfs_context_current();
354 size_t dummy = 0;
355 int error;
356 int flags = uap->flags;
357 char fstypename[MFSNAMELEN];
358 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
359 vnode_t pvp;
360 vnode_t vp;
361
362 AUDIT_ARG(fd, uap->fd);
363 AUDIT_ARG(fflags, flags);
364 /* fstypename will get audited by mount_common */
365
366 /* Sanity check the flags */
367 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
368 return ENOTSUP;
369 }
370
371 if (flags & MNT_UNION) {
372 return EPERM;
373 }
374
375 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
376 if (error) {
377 return error;
378 }
379
380 if ((error = file_vnode(uap->fd, &vp)) != 0) {
381 return error;
382 }
383
384 if ((error = vnode_getwithref(vp)) != 0) {
385 file_drop(uap->fd);
386 return error;
387 }
388
389 pvp = vnode_getparent(vp);
390 if (pvp == NULL) {
391 vnode_put(vp);
392 file_drop(uap->fd);
393 return EINVAL;
394 }
395
396 memset(&cn, 0, sizeof(struct componentname));
397 MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
398 cn.cn_pnlen = MAXPATHLEN;
399
400 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
401 FREE(cn.cn_pnbuf, M_TEMP);
402 vnode_put(pvp);
403 vnode_put(vp);
404 file_drop(uap->fd);
405 return error;
406 }
407
408 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
409
410 FREE(cn.cn_pnbuf, M_TEMP);
411 vnode_put(pvp);
412 vnode_put(vp);
413 file_drop(uap->fd);
414
415 return error;
416 }
417
/*
 * vfs_notify_mount
 *
 * Announce a completed mount: broadcast a VQ_MOUNT vfs event (no specific
 * mount attached) and post a NOTE_WRITE knote on the parent directory of
 * the covered vnode so watchers of that directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
424
425 /*
426 * __mac_mount:
427 * Mount a file system taking into account MAC label behavior.
428 * See mount(2) man page for more information
429 *
430 * Parameters: p Process requesting the mount
431 * uap User argument descriptor (see below)
432 * retval (ignored)
433 *
434 * Indirect: uap->type Filesystem type
435 * uap->path Path to mount
436 * uap->data Mount arguments
437 * uap->mac_p MAC info
438 * uap->flags Mount flags
439 *
440 *
441 * Returns: 0 Success
442 * !0 Not success
443 */
444 boolean_t root_fs_upgrade_try = FALSE;
445
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered.  WANTPARENT gives us iocounts on both
	 * vp and pvp; both are released at "out:".
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	/*
	 * NOTE(review): exact-equality test — the imgsrc path is taken only
	 * when MNT_IMGSRC_BY_INDEX is the sole flag set, so the by_index
	 * argument below is always TRUE on this path.
	 */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space.  The user_mac
	 * struct layout differs between 32- and 64-bit processes, so copy in
	 * the matching variant and normalize into 'mac'.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = mac64.m_buflen;
			mac.m_string = mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label buffer; < 2 cannot hold even "" + NUL sanely */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if SECURE_KERNEL
	if (flags & MNT_UNION) {
		/* No union mounts on release kernels */
		error = EPERM;
		goto out;
	}
#endif

	/* Mounting over the root of the root filesystem */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif
		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, FALSE, ctx);

out:

#if CONFIG_MACF
	if (labelstr) {
		FREE(labelstr, M_MACTEMP);
	}
#endif /* CONFIG_MACF */

	/* Drop the iocounts/state acquired by namei(), if any */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
599
600 /*
601 * common mount implementation (final stage of mounting)
602 *
603 * Arguments:
604 * fstypename file system type (ie it's vfs name)
605 * pvp parent of covered vnode
606 * vp covered vnode
607 * cnp component name (ie path) of covered vnode
608 * flags generic mount flags
609 * fsmountargs file system specific data
610 * labelstr optional MAC label
611 * kernelmount TRUE for mounts initiated from inside the kernel
612 * ctx caller's context
613 */
614 static int
615 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
616 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
617 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
618 {
619 #if !CONFIG_MACF
620 #pragma unused(labelstr)
621 #endif
622 struct vnode *devvp = NULLVP;
623 struct vnode *device_vnode = NULLVP;
624 #if CONFIG_MACF
625 struct vnode *rvp;
626 #endif
627 struct mount *mp;
628 struct vfstable *vfsp = (struct vfstable *)0;
629 struct proc *p = vfs_context_proc(ctx);
630 int error, flag = 0;
631 user_addr_t devpath = USER_ADDR_NULL;
632 int ronly = 0;
633 int mntalloc = 0;
634 boolean_t vfsp_ref = FALSE;
635 boolean_t is_rwlock_locked = FALSE;
636 boolean_t did_rele = FALSE;
637 boolean_t have_usecount = FALSE;
638
639 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM
640 /* Check for mutually-exclusive flag bits */
641 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL));
642 int bitcount = 0;
643 while (checkflags != 0) {
644 checkflags &= (checkflags - 1);
645 bitcount++;
646 }
647
648 if (bitcount > 1) {
649 //not allowed to request multiple mount-by-role flags
650 error = EINVAL;
651 goto out1;
652 }
653 #endif
654
655 /*
656 * Process an update for an existing mount
657 */
658 if (flags & MNT_UPDATE) {
659 if ((vp->v_flag & VROOT) == 0) {
660 error = EINVAL;
661 goto out1;
662 }
663 mp = vp->v_mount;
664
665 /* unmount in progress return error */
666 mount_lock_spin(mp);
667 if (mp->mnt_lflag & MNT_LUNMOUNT) {
668 mount_unlock(mp);
669 error = EBUSY;
670 goto out1;
671 }
672 mount_unlock(mp);
673 lck_rw_lock_exclusive(&mp->mnt_rwlock);
674 is_rwlock_locked = TRUE;
675 /*
676 * We only allow the filesystem to be reloaded if it
677 * is currently mounted read-only.
678 */
679 if ((flags & MNT_RELOAD) &&
680 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
681 error = ENOTSUP;
682 goto out1;
683 }
684
685 /*
686 * If content protection is enabled, update mounts are not
687 * allowed to turn it off.
688 */
689 if ((mp->mnt_flag & MNT_CPROTECT) &&
690 ((flags & MNT_CPROTECT) == 0)) {
691 error = EINVAL;
692 goto out1;
693 }
694
695 /*
696 * can't turn off MNT_REMOVABLE either but it may be an unexpected
697 * failure to return an error for this so we'll just silently
698 * add it if it is not passed in.
699 */
700 if ((mp->mnt_flag & MNT_REMOVABLE) &&
701 ((flags & MNT_REMOVABLE) == 0)) {
702 flags |= MNT_REMOVABLE;
703 }
704
705 #ifdef CONFIG_IMGSRC_ACCESS
706 /* Can't downgrade the backer of the root FS */
707 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
708 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
709 error = ENOTSUP;
710 goto out1;
711 }
712 #endif /* CONFIG_IMGSRC_ACCESS */
713
714 /*
715 * Only root, or the user that did the original mount is
716 * permitted to update it.
717 */
718 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
719 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
720 goto out1;
721 }
722 #if CONFIG_MACF
723 error = mac_mount_check_remount(ctx, mp);
724 if (error != 0) {
725 goto out1;
726 }
727 #endif
728 /*
729 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
730 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
731 */
732 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
733 flags |= MNT_NOSUID | MNT_NODEV;
734 if (mp->mnt_flag & MNT_NOEXEC) {
735 flags |= MNT_NOEXEC;
736 }
737 }
738 flag = mp->mnt_flag;
739
740
741
742 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
743
744 vfsp = mp->mnt_vtable;
745 goto update;
746 } // MNT_UPDATE
747
748 /*
749 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
750 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
751 */
752 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
753 flags |= MNT_NOSUID | MNT_NODEV;
754 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
755 flags |= MNT_NOEXEC;
756 }
757 }
758
759 /* XXXAUDIT: Should we capture the type on the error path as well? */
760 AUDIT_ARG(text, fstypename);
761 mount_list_lock();
762 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
763 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
764 vfsp->vfc_refcount++;
765 vfsp_ref = TRUE;
766 break;
767 }
768 }
769 mount_list_unlock();
770 if (vfsp == NULL) {
771 error = ENODEV;
772 goto out1;
773 }
774
775 /*
776 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
777 * except in ROSV configs.
778 */
779 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
780 ((internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL)) == 0)) {
781 error = EINVAL; /* unsupported request */
782 goto out1;
783 }
784
785 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
786 if (error != 0) {
787 goto out1;
788 }
789
790 /*
791 * Allocate and initialize the filesystem (mount_t)
792 */
793 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
794 M_MOUNT, M_WAITOK);
795 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
796 mntalloc = 1;
797
798 /* Initialize the default IO constraints */
799 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
800 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
801 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
802 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
803 mp->mnt_devblocksize = DEV_BSIZE;
804 mp->mnt_alignmentmask = PAGE_MASK;
805 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
806 mp->mnt_ioscale = 1;
807 mp->mnt_ioflags = 0;
808 mp->mnt_realrootvp = NULLVP;
809 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
810
811 TAILQ_INIT(&mp->mnt_vnodelist);
812 TAILQ_INIT(&mp->mnt_workerqueue);
813 TAILQ_INIT(&mp->mnt_newvnodes);
814 mount_lock_init(mp);
815 lck_rw_lock_exclusive(&mp->mnt_rwlock);
816 is_rwlock_locked = TRUE;
817 mp->mnt_op = vfsp->vfc_vfsops;
818 mp->mnt_vtable = vfsp;
819 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
820 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
821 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
822 do {
823 int pathlen = MAXPATHLEN;
824
825 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
826 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
827 }
828 } while (0);
829 mp->mnt_vnodecovered = vp;
830 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
831 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
832 mp->mnt_devbsdunit = 0;
833
834 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
835 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
836
837 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
838 if (kernelmount) {
839 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
840 }
841 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
842 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
843 }
844 #endif /* CONFIG_NFS_CLIENT || DEVFS */
845
846 update:
847
848 /*
849 * Set the mount level flags.
850 */
851 if (flags & MNT_RDONLY) {
852 mp->mnt_flag |= MNT_RDONLY;
853 } else if (mp->mnt_flag & MNT_RDONLY) {
854 // disallow read/write upgrades of file systems that
855 // had the TYPENAME_OVERRIDE feature set.
856 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
857 error = EPERM;
858 goto out1;
859 }
860 mp->mnt_kern_flag |= MNTK_WANTRDWR;
861 }
862 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
863 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
864 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
865 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
866 MNT_QUARANTINE | MNT_CPROTECT);
867
868 #if SECURE_KERNEL
869 #if !CONFIG_MNT_SUID
870 /*
871 * On release builds of iOS based platforms, always enforce NOSUID on
872 * all mounts. We do this here because we can catch update mounts as well as
873 * non-update mounts in this case.
874 */
875 mp->mnt_flag |= (MNT_NOSUID);
876 #endif
877 #endif
878
879 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
880 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
881 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
882 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
883 MNT_QUARANTINE | MNT_CPROTECT);
884
885 #if CONFIG_MACF
886 if (flags & MNT_MULTILABEL) {
887 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
888 error = EINVAL;
889 goto out1;
890 }
891 mp->mnt_flag |= MNT_MULTILABEL;
892 }
893 #endif
894 /*
895 * Process device path for local file systems if requested
896 */
897 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
898 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL))) {
899 //snapshot, vm, datavolume mounts are special
900 if (vfs_context_is64bit(ctx)) {
901 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
902 goto out1;
903 }
904 fsmountargs += sizeof(devpath);
905 } else {
906 user32_addr_t tmp;
907 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
908 goto out1;
909 }
910 /* munge into LP64 addr */
911 devpath = CAST_USER_ADDR_T(tmp);
912 fsmountargs += sizeof(tmp);
913 }
914
915 /* Lookup device and authorize access to it */
916 if ((devpath)) {
917 struct nameidata nd;
918
919 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
920 if ((error = namei(&nd))) {
921 goto out1;
922 }
923
924 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
925 devvp = nd.ni_vp;
926
927 nameidone(&nd);
928
929 if (devvp->v_type != VBLK) {
930 error = ENOTBLK;
931 goto out2;
932 }
933 if (major(devvp->v_rdev) >= nblkdev) {
934 error = ENXIO;
935 goto out2;
936 }
937 /*
938 * If mount by non-root, then verify that user has necessary
939 * permissions on the device.
940 */
941 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
942 mode_t accessmode = KAUTH_VNODE_READ_DATA;
943
944 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
945 accessmode |= KAUTH_VNODE_WRITE_DATA;
946 }
947 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
948 goto out2;
949 }
950 }
951 }
952 /* On first mount, preflight and open device */
953 if (devpath && ((flags & MNT_UPDATE) == 0)) {
954 if ((error = vnode_ref(devvp))) {
955 goto out2;
956 }
957 /*
958 * Disallow multiple mounts of the same device.
959 * Disallow mounting of a device that is currently in use
960 * (except for root, which might share swap device for miniroot).
961 * Flush out any old buffers remaining from a previous use.
962 */
963 if ((error = vfs_mountedon(devvp))) {
964 goto out3;
965 }
966
967 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
968 error = EBUSY;
969 goto out3;
970 }
971 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
972 error = ENOTBLK;
973 goto out3;
974 }
975 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
976 goto out3;
977 }
978
979 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
980 #if CONFIG_MACF
981 error = mac_vnode_check_open(ctx,
982 devvp,
983 ronly ? FREAD : FREAD | FWRITE);
984 if (error) {
985 goto out3;
986 }
987 #endif /* MAC */
988 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
989 goto out3;
990 }
991
992 mp->mnt_devvp = devvp;
993 device_vnode = devvp;
994 } else if ((mp->mnt_flag & MNT_RDONLY) &&
995 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
996 (device_vnode = mp->mnt_devvp)) {
997 dev_t dev;
998 int maj;
999 /*
1000 * If upgrade to read-write by non-root, then verify
1001 * that user has necessary permissions on the device.
1002 */
1003 vnode_getalways(device_vnode);
1004
1005 if (suser(vfs_context_ucred(ctx), NULL) &&
1006 (error = vnode_authorize(device_vnode, NULL,
1007 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1008 ctx)) != 0) {
1009 vnode_put(device_vnode);
1010 goto out2;
1011 }
1012
1013 /* Tell the device that we're upgrading */
1014 dev = (dev_t)device_vnode->v_rdev;
1015 maj = major(dev);
1016
1017 if ((u_int)maj >= (u_int)nblkdev) {
1018 panic("Volume mounted on a device with invalid major number.");
1019 }
1020
1021 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1022 vnode_put(device_vnode);
1023 device_vnode = NULLVP;
1024 if (error != 0) {
1025 goto out2;
1026 }
1027 }
1028 } // localargs && !(snapshot | data | vm)
1029
1030 #if CONFIG_MACF
1031 if ((flags & MNT_UPDATE) == 0) {
1032 mac_mount_label_init(mp);
1033 mac_mount_label_associate(ctx, mp);
1034 }
1035 if (labelstr) {
1036 if ((flags & MNT_UPDATE) != 0) {
1037 error = mac_mount_check_label_update(ctx, mp);
1038 if (error != 0) {
1039 goto out3;
1040 }
1041 }
1042 }
1043 #endif
1044 /*
1045 * Mount the filesystem. We already asserted that internal_flags
1046 * cannot have more than one mount-by-role bit set.
1047 */
1048 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1049 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1050 (caddr_t)fsmountargs, 0, ctx);
1051 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1052 #if CONFIG_ROSV_STARTUP
1053 struct mount *origin_mp = (struct mount*)fsmountargs;
1054 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1055 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1056 if (error) {
1057 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1058 } else {
1059 /* Mark volume associated with system volume */
1060 mp->mnt_kern_flag |= MNTK_SYSTEM;
1061
1062 /* Attempt to acquire the mnt_devvp and set it up */
1063 struct vnode *mp_devvp = NULL;
1064 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1065 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1066 0, &mp_devvp, vfs_context_kernel());
1067 if (!lerr) {
1068 mp->mnt_devvp = mp_devvp;
1069 //vnode_lookup took an iocount, need to drop it.
1070 vnode_put(mp_devvp);
1071 // now set `device_vnode` to the devvp that was acquired.
1072 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1073 // note that though the iocount above was dropped, the mount acquires
1074 // an implicit reference against the device.
1075 device_vnode = mp_devvp;
1076 }
1077 }
1078 }
1079 #else
1080 error = EINVAL;
1081 #endif
1082 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1083 #if CONFIG_MOUNT_VM
1084 struct mount *origin_mp = (struct mount*)fsmountargs;
1085 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1086 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1087 if (error) {
1088 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1089 } else {
1090 /* Mark volume associated with system volume and a swap mount */
1091 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1092 /* Attempt to acquire the mnt_devvp and set it up */
1093 struct vnode *mp_devvp = NULL;
1094 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1095 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1096 0, &mp_devvp, vfs_context_kernel());
1097 if (!lerr) {
1098 mp->mnt_devvp = mp_devvp;
1099 //vnode_lookup took an iocount, need to drop it.
1100 vnode_put(mp_devvp);
1101
1102 // now set `device_vnode` to the devvp that was acquired.
1103 // note that though the iocount above was dropped, the mount acquires
1104 // an implicit reference against the device.
1105 device_vnode = mp_devvp;
1106 }
1107 }
1108 }
1109 #else
1110 error = EINVAL;
1111 #endif
1112 } else {
1113 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1114 }
1115
1116 if (flags & MNT_UPDATE) {
1117 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1118 mp->mnt_flag &= ~MNT_RDONLY;
1119 }
1120 mp->mnt_flag &= ~
1121 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1122 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1123 if (error) {
1124 mp->mnt_flag = flag; /* restore flag value */
1125 }
1126 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1127 lck_rw_done(&mp->mnt_rwlock);
1128 is_rwlock_locked = FALSE;
1129 if (!error) {
1130 enablequotas(mp, ctx);
1131 }
1132 goto exit;
1133 }
1134
1135 /*
1136 * Put the new filesystem on the mount list after root.
1137 */
1138 if (error == 0) {
1139 struct vfs_attr vfsattr;
1140 #if CONFIG_MACF
1141 error = mac_mount_check_mount_late(ctx, mp);
1142 if (error != 0) {
1143 goto out3;
1144 }
1145
1146 if (vfs_flags(mp) & MNT_MULTILABEL) {
1147 error = VFS_ROOT(mp, &rvp, ctx);
1148 if (error) {
1149 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1150 goto out3;
1151 }
1152 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1153 /*
1154 * drop reference provided by VFS_ROOT
1155 */
1156 vnode_put(rvp);
1157
1158 if (error) {
1159 goto out3;
1160 }
1161 }
1162 #endif /* MAC */
1163
1164 vnode_lock_spin(vp);
1165 CLR(vp->v_flag, VMOUNT);
1166 vp->v_mountedhere = mp;
1167 vnode_unlock(vp);
1168
1169 /*
1170 * taking the name_cache_lock exclusively will
1171 * insure that everyone is out of the fast path who
1172 * might be trying to use a now stale copy of
1173 * vp->v_mountedhere->mnt_realrootvp
1174 * bumping mount_generation causes the cached values
1175 * to be invalidated
1176 */
1177 name_cache_lock();
1178 mount_generation++;
1179 name_cache_unlock();
1180
1181 error = vnode_ref(vp);
1182 if (error != 0) {
1183 goto out4;
1184 }
1185
1186 have_usecount = TRUE;
1187
1188 error = checkdirs(vp, ctx);
1189 if (error != 0) {
1190 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1191 goto out4;
1192 }
1193 /*
1194 * there is no cleanup code here so I have made it void
1195 * we need to revisit this
1196 */
1197 (void)VFS_START(mp, 0, ctx);
1198
1199 if (mount_list_add(mp) != 0) {
1200 /*
1201 * The system is shutting down trying to umount
1202 * everything, so fail with a plausible errno.
1203 */
1204 error = EBUSY;
1205 goto out4;
1206 }
1207 lck_rw_done(&mp->mnt_rwlock);
1208 is_rwlock_locked = FALSE;
1209
1210 /* Check if this mounted file system supports EAs or named streams. */
1211 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1212 VFSATTR_INIT(&vfsattr);
1213 VFSATTR_WANTED(&vfsattr, f_capabilities);
1214 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1215 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1216 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1217 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1218 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1219 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1220 }
1221 #if NAMEDSTREAMS
1222 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1223 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1224 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1225 }
1226 #endif
1227 /* Check if this file system supports path from id lookups. */
1228 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1229 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1230 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1231 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1232 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1233 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1234 }
1235
1236 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1237 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1238 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1239 }
1240 }
1241 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1242 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1243 }
1244 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1245 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1246 }
1247 /* increment the operations count */
1248 OSAddAtomic(1, &vfs_nummntops);
1249 enablequotas(mp, ctx);
1250
1251 if (device_vnode) {
1252 device_vnode->v_specflags |= SI_MOUNTEDON;
1253
1254 /*
1255 * cache the IO attributes for the underlying physical media...
1256 * an error return indicates the underlying driver doesn't
1257 * support all the queries necessary... however, reasonable
1258 * defaults will have been set, so no reason to bail or care
1259 */
1260 vfs_init_io_attributes(device_vnode, mp);
1261 }
1262
1263 /* Now that mount is setup, notify the listeners */
1264 vfs_notify_mount(pvp);
1265 IOBSDMountChange(mp, kIOMountChangeMount);
1266 } else {
1267 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1268 if (mp->mnt_vnodelist.tqh_first != NULL) {
1269 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1270 mp->mnt_vtable->vfc_name, error);
1271 }
1272
1273 vnode_lock_spin(vp);
1274 CLR(vp->v_flag, VMOUNT);
1275 vnode_unlock(vp);
1276 mount_list_lock();
1277 mp->mnt_vtable->vfc_refcount--;
1278 mount_list_unlock();
1279
1280 if (device_vnode) {
1281 vnode_rele(device_vnode);
1282 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1283 }
1284 lck_rw_done(&mp->mnt_rwlock);
1285 is_rwlock_locked = FALSE;
1286
1287 /*
1288 * if we get here, we have a mount structure that needs to be freed,
1289 * but since the coveredvp hasn't yet been updated to point at it,
1290 * no need to worry about other threads holding a crossref on this mp
1291 * so it's ok to just free it
1292 */
1293 mount_lock_destroy(mp);
1294 #if CONFIG_MACF
1295 mac_mount_label_destroy(mp);
1296 #endif
1297 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1298 }
1299 exit:
1300 /*
1301 * drop I/O count on the device vp if there was one
1302 */
1303 if (devpath && devvp) {
1304 vnode_put(devvp);
1305 }
1306
1307 return error;
1308
1309 /* Error condition exits */
1310 out4:
1311 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1312
1313 /*
1314 * If the mount has been placed on the covered vp,
1315 * it may have been discovered by now, so we have
1316 * to treat this just like an unmount
1317 */
1318 mount_lock_spin(mp);
1319 mp->mnt_lflag |= MNT_LDEAD;
1320 mount_unlock(mp);
1321
1322 if (device_vnode != NULLVP) {
1323 vnode_rele(device_vnode);
1324 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1325 ctx);
1326 did_rele = TRUE;
1327 }
1328
1329 vnode_lock_spin(vp);
1330
1331 mp->mnt_crossref++;
1332 vp->v_mountedhere = (mount_t) 0;
1333
1334 vnode_unlock(vp);
1335
1336 if (have_usecount) {
1337 vnode_rele(vp);
1338 }
1339 out3:
1340 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1341 vnode_rele(devvp);
1342 }
1343 out2:
1344 if (devpath && devvp) {
1345 vnode_put(devvp);
1346 }
1347 out1:
1348 /* Release mnt_rwlock only when it was taken */
1349 if (is_rwlock_locked == TRUE) {
1350 lck_rw_done(&mp->mnt_rwlock);
1351 }
1352
1353 if (mntalloc) {
1354 if (mp->mnt_crossref) {
1355 mount_dropcrossref(mp, vp, 0);
1356 } else {
1357 mount_lock_destroy(mp);
1358 #if CONFIG_MACF
1359 mac_mount_label_destroy(mp);
1360 #endif
1361 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1362 }
1363 }
1364 if (vfsp_ref) {
1365 mount_list_lock();
1366 vfsp->vfc_refcount--;
1367 mount_list_unlock();
1368 }
1369
1370 return error;
1371 }
1372
/*
 * Prepare `vp' to serve as the covered vnode for an upcoming mount:
 * flush its in-core data, check for competing mount attempts,
 * and set VMOUNT so that concurrent mount attempts back off.
 *
 * Returns 0 with VMOUNT set on success, or an errno.  On success the
 * caller is responsible for clearing VMOUNT if the mount does not
 * ultimately complete.
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 * (Any vnode_getattr() failure is deliberately folded into
		 * EPERM below.)
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push any dirty data to disk before the mount covers this vnode */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* ... and toss any buffers still cached against it */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories may be covered by a mount */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/* Another mount attempt is in progress, or a mount is already here */
	if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		goto out;
	}
#endif

	/* Claim the vnode for this mount attempt */
	vnode_lock_spin(vp);
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

out:
	return error;
}
1434
1435 #if CONFIG_IMGSRC_ACCESS
1436
1437 #define DEBUG_IMGSRC 0
1438
1439 #if DEBUG_IMGSRC
1440 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1441 #else
1442 #define IMGSRC_DEBUG(args...) do { } while(0)
1443 #endif
1444
/*
 * For an imageboot relocation: resolve `devpath', verify that it names
 * the same block device that currently backs `mp', check the caller's
 * access rights to it, and rewrite the mount's f_mntfromname to the
 * resolved path.
 *
 * On success, *devvpp holds the device vnode with an iocount that the
 * caller must drop via vnode_put().  On failure the iocount taken by
 * namei() is dropped here.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	mode_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* A kernel-originated path lives in system space */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	/* Take an iocount on the mount's current device vnode */
	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The supplied path must refer to the very same device */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	/* Record the user-visible "mounted from" name */
	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: hand the iocounted device vnode to the caller */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	/* On failure, drop the iocount namei() took on vp */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
1522
/*
 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Attach the mount to the covered vnode; VMOUNT is no longer needed */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the life of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/*
	 * NOTE(review): on failure, v_mountedhere is left pointing at mp
	 * here (only mnt_vnodecovered is cleared); the caller appears to
	 * be responsible for further cleanup — verify against callers.
	 */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1572
1573 static void
1574 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1575 {
1576 vnode_rele(vp);
1577 vnode_lock_spin(vp);
1578 vp->v_mountedhere = (mount_t)NULL;
1579 vnode_unlock(vp);
1580
1581 mp->mnt_vnodecovered = NULLVP;
1582 }
1583
1584 static int
1585 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1586 {
1587 int error;
1588
1589 /* unmount in progress return error */
1590 mount_lock_spin(mp);
1591 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1592 mount_unlock(mp);
1593 return EBUSY;
1594 }
1595 mount_unlock(mp);
1596 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1597
1598 /*
1599 * We only allow the filesystem to be reloaded if it
1600 * is currently mounted read-only.
1601 */
1602 if ((flags & MNT_RELOAD) &&
1603 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1604 error = ENOTSUP;
1605 goto out;
1606 }
1607
1608 /*
1609 * Only root, or the user that did the original mount is
1610 * permitted to update it.
1611 */
1612 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1613 (!vfs_context_issuser(ctx))) {
1614 error = EPERM;
1615 goto out;
1616 }
1617 #if CONFIG_MACF
1618 error = mac_mount_check_remount(ctx, mp);
1619 if (error != 0) {
1620 goto out;
1621 }
1622 #endif
1623
1624 out:
1625 if (error) {
1626 lck_rw_done(&mp->mnt_rwlock);
1627 }
1628
1629 return error;
1630 }
1631
/*
 * Counterpart to a successful mount_begin_update(): release the
 * exclusive mount rwlock.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1637
1638 static int
1639 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1640 {
1641 vnode_t vp;
1642
1643 if (height >= MAX_IMAGEBOOT_NESTING) {
1644 return EINVAL;
1645 }
1646
1647 vp = imgsrc_rootvnodes[height];
1648 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1649 *rvpp = vp;
1650 return 0;
1651 } else {
1652 return ENOENT;
1653 }
1654 }
1655
1656 static int
1657 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
1658 struct componentname *cnp, const char *fsname, vfs_context_t ctx,
1659 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1660 {
1661 int error;
1662 mount_t mp;
1663 boolean_t placed = FALSE;
1664 struct vfstable *vfsp;
1665 user_addr_t devpath;
1666 char *old_mntonname;
1667 vnode_t rvp;
1668 vnode_t devvp;
1669 uint32_t height;
1670 uint32_t flags;
1671
1672 /* If we didn't imageboot, nothing to move */
1673 if (imgsrc_rootvnodes[0] == NULLVP) {
1674 return EINVAL;
1675 }
1676
1677 /* Only root can do this */
1678 if (!vfs_context_issuser(ctx)) {
1679 return EPERM;
1680 }
1681
1682 IMGSRC_DEBUG("looking for root vnode.\n");
1683
1684 /*
1685 * Get root vnode of filesystem we're moving.
1686 */
1687 if (by_index) {
1688 if (is64bit) {
1689 struct user64_mnt_imgsrc_args mia64;
1690 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1691 if (error != 0) {
1692 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1693 return error;
1694 }
1695
1696 height = mia64.mi_height;
1697 flags = mia64.mi_flags;
1698 devpath = mia64.mi_devpath;
1699 } else {
1700 struct user32_mnt_imgsrc_args mia32;
1701 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1702 if (error != 0) {
1703 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1704 return error;
1705 }
1706
1707 height = mia32.mi_height;
1708 flags = mia32.mi_flags;
1709 devpath = mia32.mi_devpath;
1710 }
1711 } else {
1712 /*
1713 * For binary compatibility--assumes one level of nesting.
1714 */
1715 if (is64bit) {
1716 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1717 return error;
1718 }
1719 } else {
1720 user32_addr_t tmp;
1721 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1722 return error;
1723 }
1724
1725 /* munge into LP64 addr */
1726 devpath = CAST_USER_ADDR_T(tmp);
1727 }
1728
1729 height = 0;
1730 flags = 0;
1731 }
1732
1733 if (flags != 0) {
1734 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1735 return EINVAL;
1736 }
1737
1738 error = get_imgsrc_rootvnode(height, &rvp);
1739 if (error != 0) {
1740 IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
1741 return error;
1742 }
1743
1744 IMGSRC_DEBUG("got old root vnode\n");
1745
1746 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1747
1748 /* Can only move once */
1749 mp = vnode_mount(rvp);
1750 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1751 IMGSRC_DEBUG("Already moved.\n");
1752 error = EBUSY;
1753 goto out0;
1754 }
1755
1756 IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
1757 IMGSRC_DEBUG("Starting updated.\n");
1758
1759 /* Get exclusive rwlock on mount, authorize update on mp */
1760 error = mount_begin_update(mp, ctx, 0);
1761 if (error != 0) {
1762 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1763 goto out0;
1764 }
1765
1766 /*
1767 * It can only be moved once. Flag is set under the rwlock,
1768 * so we're now safe to proceed.
1769 */
1770 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1771 IMGSRC_DEBUG("Already moved [2]\n");
1772 goto out1;
1773 }
1774
1775 IMGSRC_DEBUG("Preparing coveredvp.\n");
1776
1777 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1778 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1779 if (error != 0) {
1780 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1781 goto out1;
1782 }
1783
1784 IMGSRC_DEBUG("Covered vp OK.\n");
1785
1786 /* Sanity check the name caller has provided */
1787 vfsp = mp->mnt_vtable;
1788 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1789 IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
1790 vfsp->vfc_name, fsname);
1791 error = EINVAL;
1792 goto out2;
1793 }
1794
1795 /* Check the device vnode and update mount-from name, for local filesystems */
1796 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1797 IMGSRC_DEBUG("Local, doing device validation.\n");
1798
1799 if (devpath != USER_ADDR_NULL) {
1800 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1801 if (error) {
1802 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1803 goto out2;
1804 }
1805
1806 vnode_put(devvp);
1807 }
1808 }
1809
1810 /*
1811 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1812 * and increment the name cache's mount generation
1813 */
1814
1815 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1816 error = place_mount_and_checkdirs(mp, vp, ctx);
1817 if (error != 0) {
1818 goto out2;
1819 }
1820
1821 placed = TRUE;
1822
1823 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1824 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1825
1826 /* Forbid future moves */
1827 mount_lock(mp);
1828 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1829 mount_unlock(mp);
1830
1831 /* Finally, add to mount list, completely ready to go */
1832 if (mount_list_add(mp) != 0) {
1833 /*
1834 * The system is shutting down trying to umount
1835 * everything, so fail with a plausible errno.
1836 */
1837 error = EBUSY;
1838 goto out3;
1839 }
1840
1841 mount_end_update(mp);
1842 vnode_put(rvp);
1843 FREE(old_mntonname, M_TEMP);
1844
1845 vfs_notify_mount(pvp);
1846
1847 return 0;
1848 out3:
1849 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1850
1851 mount_lock(mp);
1852 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1853 mount_unlock(mp);
1854
1855 out2:
1856 /*
1857 * Placing the mp on the vnode clears VMOUNT,
1858 * so cleanup is different after that point
1859 */
1860 if (placed) {
1861 /* Rele the vp, clear VMOUNT and v_mountedhere */
1862 undo_place_on_covered_vp(mp, vp);
1863 } else {
1864 vnode_lock_spin(vp);
1865 CLR(vp->v_flag, VMOUNT);
1866 vnode_unlock(vp);
1867 }
1868 out1:
1869 mount_end_update(mp);
1870
1871 out0:
1872 vnode_put(rvp);
1873 FREE(old_mntonname, M_TEMP);
1874 return error;
1875 }
1876
1877 #if CONFIG_LOCKERBOOT
/*
 * Mount the locker protoboot volume by relocating the imageboot source
 * onto `mntpoint'.  Builds a user64_mnt_imgsrc_args in kernel memory
 * and hands it to relocate_imageboot_source() under the kernel context
 * (NOTE: assumes copyin() in that path reads kernel-resident memory
 * when running under the kernel context — verify).
 */
__private_extern__
int
mount_locker_protoboot(const char *fsname, const char *mntpoint,
    const char *pbdevpath)
{
	int error = -1;
	struct nameidata nd;
	boolean_t cleanup_nd = FALSE;
	vfs_context_t ctx = vfs_context_kernel();
	boolean_t is64 = TRUE;
	boolean_t by_index = TRUE;
	struct user64_mnt_imgsrc_args mia64 = {
		.mi_height = 0,
		.mi_flags = 0,
		.mi_devpath = CAST_USER_ADDR_T(pbdevpath),
	};
	user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64);

	/* Resolve the mount point; WANTPARENT also yields ni_dvp */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx);
	error = namei(&nd);
	if (error) {
		IMGSRC_DEBUG("namei: %d\n", error);
		goto out;
	}

	cleanup_nd = TRUE;
	error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp,
	    &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index);

out:
	if (cleanup_nd) {
		/* Preserve the relocation result across the vnode_put() calls */
		int stashed = error;

		error = vnode_put(nd.ni_vp);
		if (error) {
			panic("vnode_put() returned non-zero: %d", error);
		}

		if (nd.ni_dvp) {
			error = vnode_put(nd.ni_dvp);
			if (error) {
				panic("vnode_put() returned non-zero: %d", error);
			}
		}
		nameidone(&nd);

		error = stashed;
	}
	return error;
}
1929 #endif /* CONFIG_LOCKERBOOT */
1930 #endif /* CONFIG_IMGSRC_ACCESS */
1931
1932 void
1933 enablequotas(struct mount *mp, vfs_context_t ctx)
1934 {
1935 struct nameidata qnd;
1936 int type;
1937 char qfpath[MAXPATHLEN];
1938 const char *qfname = QUOTAFILENAME;
1939 const char *qfopsname = QUOTAOPSNAME;
1940 const char *qfextension[] = INITQFNAMES;
1941
1942 /* XXX Shoulkd be an MNTK_ flag, instead of strncmp()'s */
1943 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
1944 return;
1945 }
1946 /*
1947 * Enable filesystem disk quotas if necessary.
1948 * We ignore errors as this should not interfere with final mount
1949 */
1950 for (type = 0; type < MAXQUOTAS; type++) {
1951 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1952 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1953 CAST_USER_ADDR_T(qfpath), ctx);
1954 if (namei(&qnd) != 0) {
1955 continue; /* option file to trigger quotas is not present */
1956 }
1957 vnode_put(qnd.ni_vp);
1958 nameidone(&qnd);
1959 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1960
1961 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1962 }
1963 return;
1964 }
1965
1966
/*
 * proc_iterate() callback for checkdirs(): if process `p' has its
 * current or root directory set to `olddp' (the vnode just covered by
 * a new mount), redirect it to `newdp' (the root of that new mount),
 * transferring vnode usecounts accordingly.  Always returns
 * PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs * cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp == NULL ||
	    (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp)) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* second ref failed; give back the first */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp != NULL) {
		if (fdp->fd_cdir == olddp) {
			old_cvp = olddp;
			fdp->fd_cdir = newdp;
			new_cvp = NULL; /* this ref is now owned by fd_cdir */
		}
		if (fdp->fd_rdir == olddp) {
			old_rvp = olddp;
			fdp->fd_rdir = newdp;
			new_rvp = NULL; /* this ref is now owned by fd_rdir */
		}
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2051
2052
2053
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Nobody else holds a usecount on the covered vnode: nothing to fix up */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the freshly mounted file system (returns an iocount) */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, repoint rootvnode at the new root */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		tvp = rootvnode;
		rootvnode = newdp;
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2094
2095 /*
2096 * Unmount a file system.
2097 *
2098 * Note: unmount takes a path to the vnode mounted on as argument,
2099 * not special file (as before).
2100 */
2101 /* ARGSUSED */
2102 int
2103 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2104 {
2105 vnode_t vp;
2106 struct mount *mp;
2107 int error;
2108 struct nameidata nd;
2109 vfs_context_t ctx = vfs_context_current();
2110
2111 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2112 UIO_USERSPACE, uap->path, ctx);
2113 error = namei(&nd);
2114 if (error) {
2115 return error;
2116 }
2117 vp = nd.ni_vp;
2118 mp = vp->v_mount;
2119 nameidone(&nd);
2120
2121 #if CONFIG_MACF
2122 error = mac_mount_check_umount(ctx, mp);
2123 if (error != 0) {
2124 vnode_put(vp);
2125 return error;
2126 }
2127 #endif
2128 /*
2129 * Must be the root of the filesystem
2130 */
2131 if ((vp->v_flag & VROOT) == 0) {
2132 vnode_put(vp);
2133 return EINVAL;
2134 }
2135 mount_ref(mp, 0);
2136 vnode_put(vp);
2137 /* safedounmount consumes the mount ref */
2138 return safedounmount(mp, uap->flags, ctx);
2139 }
2140
2141 int
2142 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2143 {
2144 mount_t mp;
2145
2146 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2147 if (mp == (mount_t)0) {
2148 return ENOENT;
2149 }
2150 mount_ref(mp, 0);
2151 mount_iterdrop(mp);
2152 /* safedounmount consumes the mount ref */
2153 return safedounmount(mp, flags, ctx);
2154 }
2155
2156
2157 /*
2158 * The mount struct comes with a mount ref which will be consumed.
2159 * Do the actual file system unmount, prevent some common foot shooting.
2160 */
2161 int
2162 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2163 {
2164 int error;
2165 proc_t p = vfs_context_proc(ctx);
2166
2167 /*
2168 * If the file system is not responding and MNT_NOBLOCK
2169 * is set and not a forced unmount then return EBUSY.
2170 */
2171 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
2172 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2173 error = EBUSY;
2174 goto out;
2175 }
2176
2177 /*
2178 * Skip authorization if the mount is tagged as permissive and
2179 * this is not a forced-unmount attempt.
2180 */
2181 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
2182 /*
2183 * Only root, or the user that did the original mount is
2184 * permitted to unmount this filesystem.
2185 */
2186 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
2187 (error = suser(kauth_cred_get(), &p->p_acflag))) {
2188 goto out;
2189 }
2190 }
2191 /*
2192 * Don't allow unmounting the root file system (or the associated VM or DATA mounts) .
2193 */
2194 if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2195 error = EBUSY; /* the root (or associated volumes) is always busy */
2196 goto out;
2197 }
2198
2199 #ifdef CONFIG_IMGSRC_ACCESS
2200 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2201 error = EBUSY;
2202 goto out;
2203 }
2204 #endif /* CONFIG_IMGSRC_ACCESS */
2205
2206 return dounmount(mp, flags, 1, ctx);
2207
2208 out:
2209 mount_drop(mp, 0);
2210 return error;
2211 }
2212
2213 /*
2214 * Do the actual file system unmount.
2215 */
2216 int
2217 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2218 {
2219 vnode_t coveredvp = (vnode_t)0;
2220 int error;
2221 int needwakeup = 0;
2222 int forcedunmount = 0;
2223 int lflags = 0;
2224 struct vnode *devvp = NULLVP;
2225 #if CONFIG_TRIGGERS
2226 proc_t p = vfs_context_proc(ctx);
2227 int did_vflush = 0;
2228 int pflags_save = 0;
2229 #endif /* CONFIG_TRIGGERS */
2230
2231 #if CONFIG_FSE
2232 if (!(flags & MNT_FORCE)) {
2233 fsevent_unmount(mp, ctx); /* has to come first! */
2234 }
2235 #endif
2236
2237 mount_lock(mp);
2238
2239 /*
2240 * If already an unmount in progress just return EBUSY.
2241 * Even a forced unmount cannot override.
2242 */
2243 if (mp->mnt_lflag & MNT_LUNMOUNT) {
2244 if (withref != 0) {
2245 mount_drop(mp, 1);
2246 }
2247 mount_unlock(mp);
2248 return EBUSY;
2249 }
2250
2251 if (flags & MNT_FORCE) {
2252 forcedunmount = 1;
2253 mp->mnt_lflag |= MNT_LFORCE;
2254 }
2255
2256 #if CONFIG_TRIGGERS
2257 if (flags & MNT_NOBLOCK && p != kernproc) {
2258 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2259 }
2260 #endif
2261
2262 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2263 mp->mnt_lflag |= MNT_LUNMOUNT;
2264 mp->mnt_flag &= ~MNT_ASYNC;
2265 /*
2266 * anyone currently in the fast path that
2267 * trips over the cached rootvp will be
2268 * dumped out and forced into the slow path
2269 * to regenerate a new cached value
2270 */
2271 mp->mnt_realrootvp = NULLVP;
2272 mount_unlock(mp);
2273
2274 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2275 /*
2276 * Force unmount any mounts in this filesystem.
2277 * If any unmounts fail - just leave them dangling.
2278 * Avoids recursion.
2279 */
2280 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2281 }
2282
2283 /*
2284 * taking the name_cache_lock exclusively will
2285 * insure that everyone is out of the fast path who
2286 * might be trying to use a now stale copy of
2287 * vp->v_mountedhere->mnt_realrootvp
2288 * bumping mount_generation causes the cached values
2289 * to be invalidated
2290 */
2291 name_cache_lock();
2292 mount_generation++;
2293 name_cache_unlock();
2294
2295
2296 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2297 if (withref != 0) {
2298 mount_drop(mp, 0);
2299 }
2300 error = 0;
2301 if (forcedunmount == 0) {
2302 ubc_umount(mp); /* release cached vnodes */
2303 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2304 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2305 if (error) {
2306 mount_lock(mp);
2307 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2308 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2309 mp->mnt_lflag &= ~MNT_LFORCE;
2310 goto out;
2311 }
2312 }
2313 }
2314
2315 IOBSDMountChange(mp, kIOMountChangeUnmount);
2316
2317 #if CONFIG_TRIGGERS
2318 vfs_nested_trigger_unmounts(mp, flags, ctx);
2319 did_vflush = 1;
2320 #endif
2321 if (forcedunmount) {
2322 lflags |= FORCECLOSE;
2323 }
2324 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2325 if ((forcedunmount == 0) && error) {
2326 mount_lock(mp);
2327 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2328 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2329 mp->mnt_lflag &= ~MNT_LFORCE;
2330 goto out;
2331 }
2332
2333 /* make sure there are no one in the mount iterations or lookup */
2334 mount_iterdrain(mp);
2335
2336 error = VFS_UNMOUNT(mp, flags, ctx);
2337 if (error) {
2338 mount_iterreset(mp);
2339 mount_lock(mp);
2340 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2341 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2342 mp->mnt_lflag &= ~MNT_LFORCE;
2343 goto out;
2344 }
2345
2346 /* increment the operations count */
2347 if (!error) {
2348 OSAddAtomic(1, &vfs_nummntops);
2349 }
2350
2351 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2352 /* hold an io reference and drop the usecount before close */
2353 devvp = mp->mnt_devvp;
2354 vnode_getalways(devvp);
2355 vnode_rele(devvp);
2356 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2357 ctx);
2358 vnode_clearmountedon(devvp);
2359 vnode_put(devvp);
2360 }
2361 lck_rw_done(&mp->mnt_rwlock);
2362 mount_list_remove(mp);
2363 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2364
2365 /* mark the mount point hook in the vp but not drop the ref yet */
2366 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2367 /*
2368 * The covered vnode needs special handling. Trying to get an
2369 * iocount must not block here as this may lead to deadlocks
2370 * if the Filesystem to which the covered vnode belongs is
2371 * undergoing forced unmounts. Since we hold a usecount, the
2372 * vnode cannot be reused (it can, however, still be terminated)
2373 */
2374 vnode_getalways(coveredvp);
2375 vnode_lock_spin(coveredvp);
2376
2377 mp->mnt_crossref++;
2378 coveredvp->v_mountedhere = (struct mount *)0;
2379 CLR(coveredvp->v_flag, VMOUNT);
2380
2381 vnode_unlock(coveredvp);
2382 vnode_put(coveredvp);
2383 }
2384
2385 mount_list_lock();
2386 mp->mnt_vtable->vfc_refcount--;
2387 mount_list_unlock();
2388
2389 cache_purgevfs(mp); /* remove cache entries for this file sys */
2390 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2391 mount_lock(mp);
2392 mp->mnt_lflag |= MNT_LDEAD;
2393
2394 if (mp->mnt_lflag & MNT_LWAIT) {
2395 /*
2396 * do the wakeup here
2397 * in case we block in mount_refdrain
2398 * which will drop the mount lock
2399 * and allow anyone blocked in vfs_busy
2400 * to wakeup and see the LDEAD state
2401 */
2402 mp->mnt_lflag &= ~MNT_LWAIT;
2403 wakeup((caddr_t)mp);
2404 }
2405 mount_refdrain(mp);
2406
2407 /* free disk_conditioner_info structure for this mount */
2408 disk_conditioner_unmount(mp);
2409
2410 out:
2411 if (mp->mnt_lflag & MNT_LWAIT) {
2412 mp->mnt_lflag &= ~MNT_LWAIT;
2413 needwakeup = 1;
2414 }
2415
2416 #if CONFIG_TRIGGERS
2417 if (flags & MNT_NOBLOCK && p != kernproc) {
2418 // Restore P_NOREMOTEHANG bit to its previous value
2419 if ((pflags_save & P_NOREMOTEHANG) == 0) {
2420 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2421 }
2422 }
2423
2424 /*
2425 * Callback and context are set together under the mount lock, and
2426 * never cleared, so we're safe to examine them here, drop the lock,
2427 * and call out.
2428 */
2429 if (mp->mnt_triggercallback != NULL) {
2430 mount_unlock(mp);
2431 if (error == 0) {
2432 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2433 } else if (did_vflush) {
2434 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2435 }
2436 } else {
2437 mount_unlock(mp);
2438 }
2439 #else
2440 mount_unlock(mp);
2441 #endif /* CONFIG_TRIGGERS */
2442
2443 lck_rw_done(&mp->mnt_rwlock);
2444
2445 if (needwakeup) {
2446 wakeup((caddr_t)mp);
2447 }
2448
2449 if (!error) {
2450 if ((coveredvp != NULLVP)) {
2451 vnode_t pvp = NULLVP;
2452
2453 /*
2454 * The covered vnode needs special handling. Trying to
2455 * get an iocount must not block here as this may lead
2456 * to deadlocks if the Filesystem to which the covered
2457 * vnode belongs is undergoing forced unmounts. Since we
2458 * hold a usecount, the vnode cannot be reused
2459 * (it can, however, still be terminated).
2460 */
2461 vnode_getalways(coveredvp);
2462
2463 mount_dropcrossref(mp, coveredvp, 0);
2464 /*
2465 * We'll _try_ to detect if this really needs to be
2466 * done. The coveredvp can only be in termination (or
2467 * terminated) if the coveredvp's mount point is in a
2468 * forced unmount (or has been) since we still hold the
2469 * ref.
2470 */
2471 if (!vnode_isrecycled(coveredvp)) {
2472 pvp = vnode_getparent(coveredvp);
2473 #if CONFIG_TRIGGERS
2474 if (coveredvp->v_resolve) {
2475 vnode_trigger_rearm(coveredvp, ctx);
2476 }
2477 #endif
2478 }
2479
2480 vnode_rele(coveredvp);
2481 vnode_put(coveredvp);
2482 coveredvp = NULLVP;
2483
2484 if (pvp) {
2485 lock_vnode_and_post(pvp, NOTE_WRITE);
2486 vnode_put(pvp);
2487 }
2488 } else if (mp->mnt_flag & MNT_ROOTFS) {
2489 mount_lock_destroy(mp);
2490 #if CONFIG_MACF
2491 mac_mount_label_destroy(mp);
2492 #endif
2493 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
2494 } else {
2495 panic("dounmount: no coveredvp");
2496 }
2497 }
2498 return error;
2499 }
2500
2501 /*
2502 * Unmount any mounts in this filesystem.
2503 */
2504 void
2505 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2506 {
2507 mount_t smp;
2508 fsid_t *fsids, fsid;
2509 int fsids_sz;
2510 int count = 0, i, m = 0;
2511 vnode_t vp;
2512
2513 mount_list_lock();
2514
2515 // Get an array to hold the submounts fsids.
2516 TAILQ_FOREACH(smp, &mountlist, mnt_list)
2517 count++;
2518 fsids_sz = count * sizeof(fsid_t);
2519 MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2520 if (fsids == NULL) {
2521 mount_list_unlock();
2522 goto out;
2523 }
2524 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
2525
2526 /*
2527 * Fill the array with submount fsids.
2528 * Since mounts are always added to the tail of the mount list, the
2529 * list is always in mount order.
2530 * For each mount check if the mounted-on vnode belongs to a
2531 * mount that's already added to our array of mounts to be unmounted.
2532 */
2533 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2534 vp = smp->mnt_vnodecovered;
2535 if (vp == NULL) {
2536 continue;
2537 }
2538 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
2539 for (i = 0; i <= m; i++) {
2540 if (fsids[i].val[0] == fsid.val[0] &&
2541 fsids[i].val[1] == fsid.val[1]) {
2542 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2543 break;
2544 }
2545 }
2546 }
2547 mount_list_unlock();
2548
2549 // Unmount the submounts in reverse order. Ignore errors.
2550 for (i = m; i > 0; i--) {
2551 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2552 if (smp) {
2553 mount_ref(smp, 0);
2554 mount_iterdrop(smp);
2555 (void) dounmount(smp, flags, 1, ctx);
2556 }
2557 }
2558 out:
2559 if (fsids) {
2560 FREE(fsids, M_TEMP);
2561 }
2562 }
2563
/*
 * Drop one crossref on mp taken against the covered vnode dp.
 *
 * When the count reaches zero and dp no longer points at mp via
 * v_mountedhere (i.e. the unmount has detached them), the mount
 * structure itself is destroyed and freed.  If need_put is set,
 * dp's iocount is released (while still holding its lock) on all paths.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_unlock(dp);

		/* Last reference and fully detached: free the mount. */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_unlock(dp);
}
2592
2593
2594 /*
2595 * Sync each mounted filesystem.
2596 */
#if DIAGNOSTIC
int syncprt = 0;        /* nonzero: sync() also dumps buffer statistics */
#endif

/* Nonzero: sync paths also report dirty VM page counts via vm_countdirtypages(). */
int print_vmpage_stat = 0;
2602
2603 /*
2604 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2605 * mounted read-write with the passed waitfor value.
2606 *
2607 * Parameters: mp mount-point descriptor per mounted file-system instance.
2608 * arg user argument (please see below)
2609 *
2610 * User argument is a pointer to 32 bit unsigned integer which describes the
2611 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
2612 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2613 * waitfor value.
2614 *
2615 * Returns: VFS_RETURNED
2616 */
2617 static int
2618 sync_callback(mount_t mp, void *arg)
2619 {
2620 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2621 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2622 unsigned waitfor = MNT_NOWAIT;
2623
2624 if (arg) {
2625 waitfor = *(uint32_t*)arg;
2626 }
2627
2628 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
2629 if (waitfor != MNT_WAIT &&
2630 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2631 waitfor != MNT_NOWAIT &&
2632 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2633 waitfor != MNT_DWAIT &&
2634 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2635 panic("Passed inappropriate waitfor %u to "
2636 "sync_callback()", waitfor);
2637 }
2638
2639 mp->mnt_flag &= ~MNT_ASYNC;
2640 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2641 if (asyncflag) {
2642 mp->mnt_flag |= MNT_ASYNC;
2643 }
2644 }
2645
2646 return VFS_RETURNED;
2647 }
2648
2649 /* ARGSUSED */
2650 int
2651 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2652 {
2653 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2654
2655 if (print_vmpage_stat) {
2656 vm_countdirtypages();
2657 }
2658
2659 #if DIAGNOSTIC
2660 if (syncprt) {
2661 vfs_bufstats();
2662 }
2663 #endif /* DIAGNOSTIC */
2664 return 0;
2665 }
2666
/*
 * Media-class selector for sync_internal_callback(): lets the sync
 * thread flush "reliable" media (local, non-virtual devices) before
 * everything else.
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
2672
2673 static int
2674 sync_internal_callback(mount_t mp, void *arg)
2675 {
2676 if (arg) {
2677 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2678 (mp->mnt_flag & MNT_LOCAL);
2679 sync_type_t sync_type = *((sync_type_t *)arg);
2680
2681 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2682 return VFS_RETURNED;
2683 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2684 return VFS_RETURNED;
2685 }
2686 }
2687
2688 (void)sync_callback(mp, NULL);
2689
2690 return VFS_RETURNED;
2691 }
2692
int sync_thread_state = 0;      /* SYNC_THREAD_* bits; protected by sync_mtx_lck */
int sync_timeout_seconds = 5;   /* max time sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN 0x0001          /* work has been posted for the sync thread */
#define SYNC_THREAD_RUNNING 0x0002      /* a sync thread currently exists */
2698
/*
 * Body of the kernel thread spawned by sync_internal().  Loops while
 * work keeps being posted (SYNC_THREAD_RUN), syncing reliable media
 * first and then unreliable media, and exits once no new work remains.
 * State transitions are serialized by sync_mtx_lck.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;

	lck_mtx_lock(sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the work request; drop the lock while iterating. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(sync_mtx_lck);

		/* Flush local, non-virtual media first, then the rest. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
	lck_mtx_unlock(sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2736
/* Rate-limits the "sync timed out" console message in sync_internal(). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2738
2739 /*
2740 * An in-kernel sync for power management to call.
2741 * This function always returns within sync_timeout seconds.
2742 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(sync_mtx_lck);
	/* Post work; spawn a sync thread only if one is not already running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(sync_mtx_lck);
			printf("sync_thread failed\n");
			/* Best-effort: report success even if no thread ran. */
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Sleep until the sync thread signals completion or the timeout
	 * expires; PDROP releases sync_mtx_lck while we sleep.
	 */
	error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		/* Print the timeout message at most once every 120 seconds. */
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	/* A timeout is not an error: the sync thread keeps working. */
	return 0;
} /* end of sync_internal call */
2785
2786 /*
2787 * Change filesystem quotas.
2788 */
2789 #if QUOTA
2790 int
2791 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2792 {
2793 struct mount *mp;
2794 int error, quota_cmd, quota_status = 0;
2795 caddr_t datap;
2796 size_t fnamelen;
2797 struct nameidata nd;
2798 vfs_context_t ctx = vfs_context_current();
2799 struct dqblk my_dqblk = {};
2800
2801 AUDIT_ARG(uid, uap->uid);
2802 AUDIT_ARG(cmd, uap->cmd);
2803 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2804 uap->path, ctx);
2805 error = namei(&nd);
2806 if (error) {
2807 return error;
2808 }
2809 mp = nd.ni_vp->v_mount;
2810 mount_ref(mp, 0);
2811 vnode_put(nd.ni_vp);
2812 nameidone(&nd);
2813
2814 /* copyin any data we will need for downstream code */
2815 quota_cmd = uap->cmd >> SUBCMDSHIFT;
2816
2817 switch (quota_cmd) {
2818 case Q_QUOTAON:
2819 /* uap->arg specifies a file from which to take the quotas */
2820 fnamelen = MAXPATHLEN;
2821 datap = kalloc(MAXPATHLEN);
2822 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2823 break;
2824 case Q_GETQUOTA:
2825 /* uap->arg is a pointer to a dqblk structure. */
2826 datap = (caddr_t) &my_dqblk;
2827 break;
2828 case Q_SETQUOTA:
2829 case Q_SETUSE:
2830 /* uap->arg is a pointer to a dqblk structure. */
2831 datap = (caddr_t) &my_dqblk;
2832 if (proc_is64bit(p)) {
2833 struct user_dqblk my_dqblk64;
2834 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
2835 if (error == 0) {
2836 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2837 }
2838 } else {
2839 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
2840 }
2841 break;
2842 case Q_QUOTASTAT:
2843 /* uap->arg is a pointer to an integer */
2844 datap = (caddr_t) &quota_status;
2845 break;
2846 default:
2847 datap = NULL;
2848 break;
2849 } /* switch */
2850
2851 if (error == 0) {
2852 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2853 }
2854
2855 switch (quota_cmd) {
2856 case Q_QUOTAON:
2857 if (datap != NULL) {
2858 kfree(datap, MAXPATHLEN);
2859 }
2860 break;
2861 case Q_GETQUOTA:
2862 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2863 if (error == 0) {
2864 if (proc_is64bit(p)) {
2865 struct user_dqblk my_dqblk64;
2866
2867 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
2868 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2869 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
2870 } else {
2871 error = copyout(datap, uap->arg, sizeof(struct dqblk));
2872 }
2873 }
2874 break;
2875 case Q_QUOTASTAT:
2876 /* uap->arg is a pointer to an integer */
2877 if (error == 0) {
2878 error = copyout(datap, uap->arg, sizeof(quota_status));
2879 }
2880 break;
2881 default:
2882 break;
2883 } /* switch */
2884
2885 mount_drop(mp, 0);
2886 return error;
2887 }
2888 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out (QUOTA not defined). */
	return EOPNOTSUPP;
}
2894 #endif /* QUOTA */
2895
2896 /*
2897 * Get filesystem statistics.
2898 *
2899 * Returns: 0 Success
2900 * namei:???
2901 * vfs_update_vfsstat:???
2902 * munge_statfs:EFAULT
2903 */
2904 /* ARGSUSED */
2905 int
2906 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2907 {
2908 struct mount *mp;
2909 struct vfsstatfs *sp;
2910 int error;
2911 struct nameidata nd;
2912 vfs_context_t ctx = vfs_context_current();
2913 vnode_t vp;
2914
2915 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2916 UIO_USERSPACE, uap->path, ctx);
2917 error = namei(&nd);
2918 if (error != 0) {
2919 return error;
2920 }
2921 vp = nd.ni_vp;
2922 mp = vp->v_mount;
2923 sp = &mp->mnt_vfsstat;
2924 nameidone(&nd);
2925
2926 #if CONFIG_MACF
2927 error = mac_mount_check_stat(ctx, mp);
2928 if (error != 0) {
2929 vnode_put(vp);
2930 return error;
2931 }
2932 #endif
2933
2934 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2935 if (error != 0) {
2936 vnode_put(vp);
2937 return error;
2938 }
2939
2940 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2941 vnode_put(vp);
2942 return error;
2943 }
2944
2945 /*
2946 * Get filesystem statistics.
2947 */
2948 /* ARGSUSED */
2949 int
2950 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2951 {
2952 vnode_t vp;
2953 struct mount *mp;
2954 struct vfsstatfs *sp;
2955 int error;
2956
2957 AUDIT_ARG(fd, uap->fd);
2958
2959 if ((error = file_vnode(uap->fd, &vp))) {
2960 return error;
2961 }
2962
2963 error = vnode_getwithref(vp);
2964 if (error) {
2965 file_drop(uap->fd);
2966 return error;
2967 }
2968
2969 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2970
2971 mp = vp->v_mount;
2972 if (!mp) {
2973 error = EBADF;
2974 goto out;
2975 }
2976
2977 #if CONFIG_MACF
2978 error = mac_mount_check_stat(vfs_context_current(), mp);
2979 if (error != 0) {
2980 goto out;
2981 }
2982 #endif
2983
2984 sp = &mp->mnt_vfsstat;
2985 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2986 goto out;
2987 }
2988
2989 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2990
2991 out:
2992 file_drop(uap->fd);
2993 vnode_put(vp);
2994
2995 return error;
2996 }
2997
/*
 * Fill a zeroed struct statfs64 from mp's cached vfsstatfs.  Does NOT
 * refresh the cache; callers run vfs_update_vfsstat() first if needed.
 */
void
vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
{
	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;

	bzero(sfs, sizeof(*sfs));

	sfs->f_bsize = vsfs->f_bsize;
	sfs->f_iosize = (int32_t)vsfs->f_iosize;
	sfs->f_blocks = vsfs->f_blocks;
	sfs->f_bfree = vsfs->f_bfree;
	sfs->f_bavail = vsfs->f_bavail;
	sfs->f_files = vsfs->f_files;
	sfs->f_ffree = vsfs->f_ffree;
	sfs->f_fsid = vsfs->f_fsid;
	sfs->f_owner = vsfs->f_owner;
	sfs->f_type = mp->mnt_vtable->vfc_typenum;
	/* Expose only user-visible mount flags. */
	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	sfs->f_fssubtype = vsfs->f_fssubtype;
	/* Flag the root data volume (system mount that is neither swap nor "/"). */
	sfs->f_flags_ext = ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS)) ? MNT_EXT_ROOT_DATA_VOL : 0;
	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
	} else {
		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
	}
	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
}
3026
3027 /*
3028 * Get file system statistics in 64-bit mode
3029 */
3030 int
3031 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3032 {
3033 struct mount *mp;
3034 int error;
3035 struct nameidata nd;
3036 struct statfs64 sfs;
3037 vfs_context_t ctxp = vfs_context_current();
3038 vnode_t vp;
3039
3040 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3041 UIO_USERSPACE, uap->path, ctxp);
3042 error = namei(&nd);
3043 if (error != 0) {
3044 return error;
3045 }
3046 vp = nd.ni_vp;
3047 mp = vp->v_mount;
3048 nameidone(&nd);
3049
3050 #if CONFIG_MACF
3051 error = mac_mount_check_stat(ctxp, mp);
3052 if (error != 0) {
3053 vnode_put(vp);
3054 return error;
3055 }
3056 #endif
3057
3058 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3059 if (error != 0) {
3060 vnode_put(vp);
3061 return error;
3062 }
3063
3064 vfs_get_statfs64(mp, &sfs);
3065 if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
3066 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3067 /* This process does not want to see a seperate data volume mountpoint */
3068 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3069 }
3070 error = copyout(&sfs, uap->buf, sizeof(sfs));
3071 vnode_put(vp);
3072
3073 return error;
3074 }
3075
3076 /*
3077 * Get file system statistics in 64-bit mode
3078 */
3079 int
3080 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3081 {
3082 struct vnode *vp;
3083 struct mount *mp;
3084 struct statfs64 sfs;
3085 int error;
3086
3087 AUDIT_ARG(fd, uap->fd);
3088
3089 if ((error = file_vnode(uap->fd, &vp))) {
3090 return error;
3091 }
3092
3093 error = vnode_getwithref(vp);
3094 if (error) {
3095 file_drop(uap->fd);
3096 return error;
3097 }
3098
3099 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3100
3101 mp = vp->v_mount;
3102 if (!mp) {
3103 error = EBADF;
3104 goto out;
3105 }
3106
3107 #if CONFIG_MACF
3108 error = mac_mount_check_stat(vfs_context_current(), mp);
3109 if (error != 0) {
3110 goto out;
3111 }
3112 #endif
3113
3114 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3115 goto out;
3116 }
3117
3118 vfs_get_statfs64(mp, &sfs);
3119 if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
3120 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3121 /* This process does not want to see a seperate data volume mountpoint */
3122 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3123 }
3124 error = copyout(&sfs, uap->buf, sizeof(sfs));
3125
3126 out:
3127 file_drop(uap->fd);
3128 vnode_put(vp);
3129
3130 return error;
3131 }
3132
/*
 * Accumulator shared between the getfsstat*() entry points and their
 * vfs_iterate() callbacks.
 */
struct getfsstat_struct {
	user_addr_t sfsp;       /* next user buffer slot to fill; 0 = count only */
	user_addr_t *mp;        /* optional array of user MAC-label buffers, or NULL */
	int count;              /* mounts visited so far */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT */
	int error;              /* first hard error encountered (sticky) */
};
3141
3142
/*
 * vfs_iterate() callback for __mac_getfsstat(): copies one mount's
 * statistics (and optionally its MAC label) to the user buffer while
 * space remains; always bumps fstp->count so the caller can report the
 * total number of mounts.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 *
		 * Dead or unmounting mounts are skipped (not copied out).
		 * NOTE(review): in the MNT_LDEAD case `error` is printed
		 * below without having been assigned — benign only because
		 * KAUTH_DEBUG is normally compiled out.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance by the size munge_statfs() actually wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3196
3197 /*
3198 * Get statistics on all filesystems.
3199 */
3200 int
3201 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3202 {
3203 struct __mac_getfsstat_args muap;
3204
3205 muap.buf = uap->buf;
3206 muap.bufsize = uap->bufsize;
3207 muap.mac = USER_ADDR_NULL;
3208 muap.macsize = 0;
3209 muap.flags = uap->flags;
3210
3211 return __mac_getfsstat(p, &muap, retval);
3212 }
3213
3214 /*
3215 * __mac_getfsstat: Get MAC-related file system statistics
3216 *
3217 * Parameters: p (ignored)
3218 * uap User argument descriptor (see below)
3219 * retval Count of file system statistics (N stats)
3220 *
3221 * Indirect: uap->bufsize Buffer size
3222 * uap->macsize MAC info size
3223 * uap->buf Buffer where information will be returned
3224 * uap->mac MAC info
3225 * uap->flags File system flags
3226 *
3227 *
3228 * Returns: 0 Success
3229 * !0 Not success
3230 *
3231 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject absurd sizes before doing size_t arithmetic with them. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Entry capacity depends on the caller's statfs ABI (32 vs 64 bit). */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* One label pointer (4 or 8 bytes) is required per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			FREE(mp0, M_MACTEMP);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
		if (mp == NULL) {
			FREE(mp0, M_MACTEMP);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				/* Widen each 32-bit user pointer. */
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		FREE(mp0, M_MACTEMP);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = maxcount;


	/* Visit every mount, including those mid-unmount (callback filters). */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		FREE(mp, M_MACTEMP);
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report at most maxcount entries written; count may exceed capacity. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3325
/*
 * vfs_iterate() callback for getfsstat64(): copies one mount's 64-bit
 * statistics to the user buffer while space remains; always bumps
 * fstp->count so the caller can report the total number of mounts.
 * Note: unlike getfsstat_callback(), this path never reads fstp->mp.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 *
		 * Dead or unmounting mounts are skipped (not copied out).
		 * NOTE(review): `sp` is assigned but the copy below goes
		 * through vfs_get_statfs64(), which reads mnt_vfsstat itself.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
3370
3371 /*
3372 * Get statistics on all file systems in 64 bit mode.
3373 */
3374 int
3375 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3376 {
3377 user_addr_t sfsp;
3378 int count, maxcount;
3379 struct getfsstat_struct fst;
3380
3381 maxcount = uap->bufsize / sizeof(struct statfs64);
3382
3383 sfsp = uap->buf;
3384 count = 0;
3385
3386 fst.sfsp = sfsp;
3387 fst.flags = uap->flags;
3388 fst.count = 0;
3389 fst.error = 0;
3390 fst.maxcount = maxcount;
3391
3392 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3393
3394 if (fst.error) {
3395 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3396 return fst.error;
3397 }
3398
3399 if (fst.sfsp && fst.count > fst.maxcount) {
3400 *retval = fst.maxcount;
3401 } else {
3402 *retval = fst.count;
3403 }
3404
3405 return 0;
3406 }
3407
3408 /*
3409 * gets the associated vnode with the file descriptor passed.
3410 * as input
3411 *
3412 * INPUT
3413 * ctx - vfs context of caller
3414 * fd - file descriptor for which vnode is required.
3415 * vpp - Pointer to pointer to vnode to be returned.
3416 *
3417 * The vnode is returned with an iocount so any vnode obtained
3418 * by this call needs a vnode_put
3419 *
3420 */
3421 int
3422 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3423 {
3424 int error;
3425 vnode_t vp;
3426 struct fileproc *fp;
3427 proc_t p = vfs_context_proc(ctx);
3428
3429 *vpp = NULLVP;
3430
3431 error = fp_getfvp(p, fd, &fp, &vp);
3432 if (error) {
3433 return error;
3434 }
3435
3436 error = vnode_getwithref(vp);
3437 if (error) {
3438 (void)fp_drop(p, fd, fp, 0);
3439 return error;
3440 }
3441
3442 (void)fp_drop(p, fd, fp, 0);
3443 *vpp = vp;
3444 return error;
3445 }
3446
3447 /*
3448 * Wrapper function around namei to start lookup from a directory
3449 * specified by a file descriptor ni_dirfd.
3450 *
3451 * In addition to all the errors returned by namei, this call can
3452 * return ENOTDIR if the file descriptor does not refer to a directory.
3453 * and EBADF if the file descriptor is not valid.
3454 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only consult the directory fd when one was supplied, this is not
	 * a continued lookup, and the caller has not already pinned a
	 * starting directory via USEDVP.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		/* Absolute paths ignore dirfd entirely (POSIX *at semantics). */
		if (c != '/') {
			vnode_t dvp_at;

			/* Returns dvp_at with an iocount that we must put. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at dvp_at instead of the CWD. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	return namei(ndp);
}
3498
3499 /*
3500 * Change current working directory to a given file descriptor.
3501 */
3502 /* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				/* Drop the reference held on the old per-thread CWD. */
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount; released via vnode_put() below or at 'out'. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller must be able to search the new directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/* If a file system is mounted on this directory, descend to its root. */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount reference. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);

	if (per_thread) {
		/* Per-thread CWD: stash the vnode in the uthread. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			return ENOENT;
		}
	} else {
		/* Lock order: dirs lock before fdlock (synchronizes with lookup). */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the reference held by the previous CWD, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}
	file_drop(uap->fd);

	return 0;
out:
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
3614
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir to the directory open on uap->fd. */
	return common_fchdir(p, uap, 0);
}
3620
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread variant; fd == -1 reverts the thread to the
	 * per-process CWD.  The cast relies on the two args structs
	 * being layout-compatible — TODO confirm against sysproto.h.
	 */
	return common_fchdir(p, (void *)uap, 1);
}
3626
3627
3628 /*
3629 * Change current working directory (".").
3630 *
3631 * Returns: 0 Success
3632 * change_dir:ENOTDIR
3633 * change_dir:???
3634 * vnode_ref:ENOENT No such file or directory
3635 */
3636 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	vnode_t tvp;

	/* On success, change_dir leaves ndp->ni_vp with an iocount held. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a long-lived usecount before dropping the iocount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Per-thread CWD: stash the vnode in the uthread. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Process-wide CWD; dirs lock synchronizes with lookup. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the reference held by the previous CWD, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
3683
3684
3685 /*
3686 * Change current working directory (".").
3687 *
3688 * Returns: 0 Success
3689 * chdir_internal:ENOTDIR
3690 * chdir_internal:ENOENT No such file or directory
3691 * chdir_internal:???
3692 */
3693 /* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Lookup follows symlinks and audits the resolved vnode path. */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, &nd, per_thread);
}
3705
3706
3707 /*
3708 * chdir
3709 *
3710 * Change current working directory (".") for the entire process
3711 *
3712 * Parameters: p Process requesting the call
3713 * uap User argument descriptor (see below)
3714 * retval (ignored)
3715 *
3716 * Indirect parameters: uap->path Directory path
3717 *
3718 * Returns: 0 Success
3719 * common_chdir: ENOTDIR
3720 * common_chdir: ENOENT No such file or directory
3721 * common_chdir: ???
3722 *
3723 */
3724 int
3725 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3726 {
3727 return common_chdir(p, (void *)uap, 0);
3728 }
3729
3730 /*
3731 * __pthread_chdir
3732 *
3733 * Change current working directory (".") for a single thread
3734 *
3735 * Parameters: p Process requesting the call
3736 * uap User argument descriptor (see below)
3737 * retval (ignored)
3738 *
3739 * Indirect parameters: uap->path Directory path
3740 *
3741 * Returns: 0 Success
3742 * common_chdir: ENOTDIR
3743 * common_chdir: ENOENT No such file or directory
3744 * common_chdir: ???
3745 *
3746 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread chdir; the cast relies on the args struct being
	 * layout-compatible with struct chdir_args — TODO confirm
	 * against sysproto.h.
	 */
	return common_chdir(p, (void *)uap, 1);
}
3752
3753
3754 /*
3755 * Change notion of root (``/'') directory.
3756 */
3757 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* On success, change_dir returns nd.ni_vp with an iocount held. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount for a long-lived usecount reference. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdp->fd_flags |= FD_CHROOT;
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the reference held on the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
3815
3816 /*
3817 * Common routine for chroot and chdir.
3818 *
3819 * Returns: 0 Success
3820 * ENOTDIR Not a directory
3821 * namei:??? [anything namei can return]
3822 * vnode_authorize:??? [anything vnode_authorize can return]
3823 */
3824 static int
3825 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3826 {
3827 vnode_t vp;
3828 int error;
3829
3830 if ((error = namei(ndp))) {
3831 return error;
3832 }
3833 nameidone(ndp);
3834 vp = ndp->ni_vp;
3835
3836 if (vp->v_type != VDIR) {
3837 vnode_put(vp);
3838 return ENOTDIR;
3839 }
3840
3841 #if CONFIG_MACF
3842 error = mac_vnode_check_chdir(ctx, vp);
3843 if (error) {
3844 vnode_put(vp);
3845 return error;
3846 }
3847 #endif
3848
3849 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3850 if (error) {
3851 vnode_put(vp);
3852 return error;
3853 }
3854
3855 return error;
3856 }
3857
/*
 * Allocate and initialize the per-file-descriptor vnode data
 * (for directories) associated with the file glob.
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
	    M_FD_VN_DATA, M_WAITOK | M_ZERO);
	/* M_ZERO leaves all other fields zeroed; only the mutex needs setup. */
	lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
	return fvdata;
}
3872
3873 /*
3874 * Free the vnode data (for directories) associated with the file glob.
3875 */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/* Release the per-fd buffer, if one was ever allocated. */
	if (fvdata->fv_buf) {
		FREE(fvdata->fv_buf, M_FD_DIRBUF);
	}
	lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
	FREE(fvdata, M_FD_VN_DATA);
}
3887
3888 /*
3889 * Check permissions, allocate an open file structure,
3890 * and call the device open routine if any.
3891 *
3892 * Returns: 0 Success
3893 * EINVAL
3894 * EINTR
3895 * falloc:ENFILE
3896 * falloc:EMFILE
3897 * falloc:ENOMEM
3898 * vn_open_auth:???
3899 * dupfdopen:???
3900 * VNOP_ADVLOCK:???
3901 * vnode_setsize:???
3902 *
3903 * XXX Need to implement uid, gid
3904 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
    int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct flock lf;
	struct vfs_context context;

	oflags = uflags;

	/* O_ACCMODE fully set (O_RDONLY|O_WRONLY|O_RDWR together) is invalid. */
	if ((oflags & O_ACCMODE) == O_ACCMODE) {
		return EINVAL;
	}

	/* Convert open(2) flags to in-kernel fcntl flags. */
	flags = FFLAGS(uflags);
	/* These may only be set internally by vn_open_auth, never by callers. */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor slot and fileproc up front. */
	if ((error = falloc_withalloc(p,
	    &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
		return error;
	}
	/* Encode the reserved fd for the fdopen() dance below. */
	uu->uu_dupfd = -indx - 1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {     /* XXX from fdopen */
			/* /dev/fd open: duplicate an existing descriptor instead. */
			if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
				fp_drop(p, indx, NULL, 0);
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->f_fglob->fg_ops = &vnops;
	fp->f_fglob->fg_data = (caddr_t)vp;

	/* Apply an advisory flock if O_EXLOCK/O_SHLOCK was requested. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->f_fglob->fg_flag |= FHASLOCK;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->f_fglob->fg_vn_data = NULL;
	}

	/*
	 * NOTE(review): vp is still referenced below after this iocount is
	 * dropped; presumably the open file's fg_data reference keeps the
	 * vnode valid here — confirm.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	proc_fdlock(p);
	if (flags & O_CLOEXEC) {
		*fdflags(p, indx) |= UF_EXCLOSE;
	}
	if (flags & O_CLOFORK) {
		*fdflags(p, indx) |= UF_FORKCLOSE;
	}
	/* Publish the fd: it becomes visible to the process from here on. */
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/* Decide whether this file's cached pages may use the secluded pool. */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->f_fglob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
#if __arm64__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
#elif __arm__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
#else
/* not implemented... */
#endif
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) ||
			    !strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Undo the open: unlock (if locked), close, and free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->f_fglob->fg_cred;

	if ((fp->f_fglob->fg_flag & FHASLOCK) &&
	    (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->f_fglob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4122
4123 /*
4124 * While most of the *at syscall handlers can call nameiat() which
4125 * is a wrapper around namei, the use of namei and initialisation
4126 * of nameidata are far removed and in different functions - namei
4127 * gets called in vn_open_auth for open1. So we'll just do here what
4128 * nameiat() does.
4129 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
    int dirfd)
{
	/* Mirror nameiat(): only use dirfd for relative paths without USEDVP. */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Returns dvp_at with an iocount that we must put. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Make the lookup inside open1/vn_open_auth start at dvp_at. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
			    retval);
			vnode_put(dvp_at);
			return error;
		}
	}

	return open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval);
}
4173
4174 /*
4175 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4176 *
4177 * Parameters: p Process requesting the open
4178 * uap User argument descriptor (see below)
4179 * retval Pointer to an area to receive the
4180 * return calue from the system call
4181 *
4182 * Indirect: uap->path Path to open (same as 'open')
4183 * uap->flags Flags to open (same as 'open'
4184 * uap->uid UID to set, if creating
4185 * uap->gid GID to set, if creating
4186 * uap->mode File mode, if creating (same as 'open')
4187 * uap->xsecurity ACL to set, if creating
4188 *
4189 * Returns: 0 Success
4190 * !0 errno value
4191 *
4192 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4193 *
4194 * XXX: We should enummerate the possible errno values here, and where
4195 * in the code they originated.
4196 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if any. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the umask and strip the sticky bit from the creation mode. */
	cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    fileproc_alloc_init, NULL, retval);
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4239
4240 /*
4241 * Go through the data-protected atomically controlled open (2)
4242 *
4243 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4244 */
4245 int
4246 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4247 {
4248 int flags = uap->flags;
4249 int class = uap->class;
4250 int dpflags = uap->dpflags;
4251
4252 /*
4253 * Follow the same path as normal open(2)
4254 * Look up the item if it exists, and acquire the vnode.
4255 */
4256 struct filedesc *fdp = p->p_fd;
4257 struct vnode_attr va;
4258 struct nameidata nd;
4259 int cmode;
4260 int error;
4261
4262 VATTR_INIT(&va);
4263 /* Mask off all but regular access permissions */
4264 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4265 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4266
4267 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4268 uap->path, vfs_context_current());
4269
4270 /*
4271 * Initialize the extra fields in vnode_attr to pass down our
4272 * extra fields.
4273 * 1. target cprotect class.
4274 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4275 */
4276 if (flags & O_CREAT) {
4277 /* lower level kernel code validates that the class is valid before applying it. */
4278 if (class != PROTECTION_CLASS_DEFAULT) {
4279 /*
4280 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4281 * file behave the same as open (2)
4282 */
4283 VATTR_SET(&va, va_dataprotect_class, class);
4284 }
4285 }
4286
4287 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4288 if (flags & (O_RDWR | O_WRONLY)) {
4289 /* Not allowed to write raw encrypted bytes */
4290 return EINVAL;
4291 }
4292 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4293 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4294 }
4295 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4296 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4297 }
4298 }
4299
4300 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4301 fileproc_alloc_init, NULL, retval);
4302
4303 return error;
4304 }
4305
4306 static int
4307 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4308 int fd, enum uio_seg segflg, int *retval)
4309 {
4310 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
4311 struct vnode_attr va;
4312 struct nameidata nd;
4313 int cmode;
4314
4315 VATTR_INIT(&va);
4316 /* Mask off all but regular access permissions */
4317 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4318 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4319
4320 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4321 segflg, path, ctx);
4322
4323 return open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
4324 retval, fd);
4325 }
4326
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a pthread cancellation point; check before the work. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
4333
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* Plain open(2): relative lookups start at the process CWD. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
4341
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* openat(2): relative lookups start at the directory open on uap->fd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, uap->fd, UIO_USERSPACE, retval);
}
4349
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a pthread cancellation point. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
4356
4357 /*
4358 * openbyid_np: open a file given a file system id and a file system object id
4359 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
4360 * file systems that don't support object ids it is a node id (uint64_t).
4361 *
4362 * Parameters: p Process requesting the open
4363 * uap User argument descriptor (see below)
4364 * retval Pointer to an area to receive the
4365 * return calue from the system call
4366 *
4367 * Indirect: uap->path Path to open (same as 'open')
4368 *
4369 * uap->fsid id of target file system
4370 * uap->objid id of target file system object
4371 * uap->flags Flags to open (same as 'open')
4372 *
4373 * Returns: 0 Success
4374 * !0 errno value
4375 *
4376 *
4377 * XXX: We should enummerate the possible errno values here, and where
4378 * in the code they originated.
4379 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Opening by filesystem/object id is a privileged operation. */
	if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
		return error;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*resolve path from fsis, objid*/
	/* Grow the buffer by MAXPATHLEN each time the path does not fit. */
	do {
		MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			FREE(buf, M_TEMP);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	buf[pathlen] = 0;

	/* The resolved path lives in kernel memory: pass UIO_SYSSPACE. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	FREE(buf, M_TEMP);

	return error;
}
4436
4437
4438 /*
4439 * Create a special file.
4440 */
4441 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4442
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;

	VATTR_INIT(&va);
	/* Apply the process umask to the requested mode. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, uap->path, &va);
	}

	AUDIT_ARG(mode, uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* Creating device nodes requires superuser privilege. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the target name already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are handled here. */
	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4545
4546 /*
4547 * Create a named pipe.
4548 *
4549 * Returns: 0 Success
4550 * EEXIST
4551 * namei:???
4552 * vnode_authorize:???
4553 * vn_create:???
4554 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the parent directory; LOCKPARENT returns dvp held. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4597
4598
4599 /*
4600 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4601 *
4602 * Parameters: p Process requesting the open
4603 * uap User argument descriptor (see below)
4604 * retval (Ignored)
4605 *
4606 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4607 * uap->uid UID to set
4608 * uap->gid GID to set
4609 * uap->mode File mode to set (same as 'mkfifo')
4610 * uap->xsecurity ACL to set, if creating
4611 *
4612 * Returns: 0 Success
4613 * !0 errno value
4614 *
4615 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4616 *
4617 * XXX: We should enummerate the possible errno values here, and where
4618 * in the code they originated.
4619 */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if any. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return ciferror;
		}
	}

	VATTR_INIT(&va);
	/* Apply the process umask to the requested mode. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != KAUTH_FILESEC_NONE) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va);

	if (xsecdst != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(xsecdst);
	}
	return ciferror;
}
4655
4656 /* ARGSUSED */
4657 int
4658 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4659 {
4660 struct vnode_attr va;
4661
4662 VATTR_INIT(&va);
4663 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4664
4665 return mkfifo1(vfs_context_current(), uap->path, &va);
4666 }
4667
4668
static char *
my_strrchr(char *p, int ch)
{
	char *last = NULL;

	/*
	 * Single pass over the string, remembering the most recent match.
	 * As with strrchr(3), the NUL terminator itself is a matchable
	 * character (ch == '\0' returns a pointer to the terminator).
	 */
	do {
		if (*p == ch) {
			last = p;
		}
	} while (*p++ != '\0');

	return last;
}
4684
4685 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4686 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4687 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4688
/*
 * Best-effort path construction for fsevents/audit/listener callouts.
 *
 * Builds a path for 'dvp' into 'path' (buffer of '_len' bytes), optionally
 * appending '/' + 'leafname'.  If 'firmlink' is nonzero the firmlink-resolved
 * path is produced (vn_getpath), otherwise the no-firmlink variant.
 *
 * Returns the length of the string placed in 'path' INCLUDING the NUL, and
 * sets *truncated_path when the result does not name the full leaf.  This
 * routine never reports failure; on lookup errors it walks up v_parent (or
 * falls back to the mount point, or "/") so callers always get some path.
 *
 * NOTE(review): the truncation checks below compare against MAXPATHLEN
 * rather than '_len' -- assumes callers pass a MAXPATHLEN-sized buffer;
 * TODO confirm against call sites.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	/* First attempt: resolve dvp itself. */
	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite dvp's NUL with '/', then append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = my_strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* dvp's path alone already (nearly) fills the buffer. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Fallback: climb toward the root until some ancestor's path
		 * fits, then settle for that (or the mount point, or "/").
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
4756
/*
 * Firmlink-resolving wrapper around safe_getpath_new(); returns the
 * length (including NUL) of the path placed in 'path'.
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
4762
/*
 * Non-firmlink-resolving wrapper around safe_getpath_new(); returns the
 * length (including NUL) of the path placed in 'path'.
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
4768
4769 /*
4770 * Make a hard file link.
4771 *
4772 * Returns: 0 Success
4773 * EPERM
4774 * EEXIST
4775 * EXDEV
4776 * namei:???
4777 * vnode_authorize:???
4778 * VNOP_LINK:???
4779 */
/* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	int truncated = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		/* NOTE(review): the "XXX 54841485" printfs below look like
		 * temporary debugging for a radar; confirm before relying on
		 * them. */
		if (error == EPERM) {
			printf("XXX 54841485: nameiat() src EPERM\n");
		}
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			printf("XXX 54841485: VDIR EPERM\n");
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node -- the nameidata is reused for the second
	 * lookup, with the nameiop/flags/dirp fields rewritten below. */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: nameiat() dst EPERM\n");
		}
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: mac_vnode_check_link() EPERM\n");
		}
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: vnode_authorize() LINKTARGET EPERM\n");
		}
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: vnode_authorize() ADD_FILE EPERM\n");
		}
		goto out2;
	}

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		if (error == EPERM) {
			printf("XXX 54841485: VNOP_LINK() EPERM\n");
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* From here on the link exists; failures below only affect
	 * notification (fsevents / kauth listeners / audit). */
#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;

		/* build the path to the new link file */
		GET_PATH(target_path);
		if (target_path == NULL) {
			error = ENOMEM;
			goto out2;
		}

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);
			if (link_to_path == NULL) {
				error = ENOMEM;
				goto out2;
			}

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len, target_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on pvp in this case
			if (pvp && pvp != dvp) {
				error = vnode_get(pvp);
				if (error) {
					/* best-effort notification: drop the
					 * STAT_CHANGED event, not the link */
					pvp = NULLVP;
					error = 0;
				}
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5010
/*
 * link(2): classic hard link -- both names resolved relative to the CWD,
 * following symlinks in the source path.
 */
int
link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
{
	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	    AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
}
5017
5018 int
5019 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5020 {
5021 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5022 return EINVAL;
5023 }
5024
5025 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5026 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5027 }
5028
5029 /*
5030 * Make a symbolic link.
5031 *
5032 * We could add support for ACLs here too...
5033 */
/* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/*
	 * Copy the link target string into a kernel buffer when it lives in
	 * user space; for kernel-space callers use it in place.  The 'out'
	 * path frees the buffer only if it differs from path_data.
	 */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	/* Look up the parent of the symlink name to be created. */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Symlinks are created mode 0777 filtered by the process umask. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}
	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling for attrs VNOP_SYMLINK didn't set */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*
		 * Check if a new vnode was created; else re-lookup to get one
		 * (some filesystems do not return the new vnode from
		 * VNOP_SYMLINK).
		 */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			nd.ni_cnd.cn_flags = 0;
			error = nameiat(&nd, fd);
			vp = nd.ni_vp;

			if (vp == NULL) {
				goto skipit;
			}
		}

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	if (path && (path != (char *)path_data)) {
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
	}

	return error;
}
5190
/* symlink(2): create a symbolic link relative to the CWD. */
int
symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
{
	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
	    uap->link, UIO_USERSPACE);
}
5197
/* symlinkat(2): create a symbolic link relative to a directory fd. */
int
symlinkat(__unused proc_t p, struct symlinkat_args *uap,
    __unused int32_t *retval)
{
	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
	    uap->path2, UIO_USERSPACE);
}
5205
5206 /*
5207 * Delete a whiteout from the filesystem.
5208 * No longer supported.
5209 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout deletion is no longer supported; always fail. */
	return ENOTSUP;
}
5215
5216 /*
5217 * Delete a name from the filesystem.
5218 */
/*
 * Common implementation for unlink(2)/unlinkat(2)/delete(2) and unlink1().
 * If 'start_dvp' is non-NULL the lookup starts there (trumping 'fd');
 * otherwise 'path_arg' is resolved relative to 'fd'.  Supports compound
 * (batched) remove VNOPs, redriving the lookup on EKEEPLOOKING, and retries
 * on racing-ENOENT up to MAX_AUTHORIZE_ENOENT_RETRIES.
 */
/* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
#if CONFIG_FSE
	fse_info finfo;
	struct vnode_attr va;
#endif
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

retry:
	/* Per-attempt state is re-initialized on every retry. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	nd.ni_dvp = start_dvp;
	nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &nd.ni_cnd;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	dvp = nd.ni_dvp;
	vp = nd.ni_vp;


	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* Only the kernel may remove an in-use swap file. */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/* ENOENT here may be a racing hardlink
				 * lookup; retry a bounded number of times. */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* No vnode: only legal when the fs does compound remove. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, &finfo, ctx);
			}
		} else {
			error = vfs_get_notify_attributes(&va);
			if (error) {
				goto out;
			}

			vap = &va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* Build both path flavors before the remove, while the name
		 * still exists. */
		if (path == NULL) {
			GET_PATH(path);
			if (path == NULL) {
				error = ENOMEM;
				goto out;
			}
		}
		len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
			if (no_firmlink_path == NULL) {
				error = ENOMEM;
				goto out;
			}
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
		error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
		vp = nd.ni_vp;
		if (error == EKEEPLOOKING) {
			/* Filesystem asked us to continue the compound
			 * lookup where it left off. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid. this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, &finfo, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, &finfo, vap);
			}
			if (truncated_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, &finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

	return error;
}
5490
/*
 * In-kernel unlink entry point: like unlink(2) but allows an explicit
 * starting directory vnode and caller-chosen VNODE_REMOVE_* flags.
 */
int
unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
    enum uio_seg segflg, int unlink_flags)
{
	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
	    unlink_flags);
}
5498
5499 /*
5500 * Delete a name from the filesystem using Carbon semantics.
5501 */
/* delete(2): Carbon-semantics unlink -- busy files cannot be deleted. */
int
delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
{
	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
	    uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
}
5508
5509 /*
5510 * Delete a name from the filesystem using POSIX semantics.
5511 */
/* unlink(2): POSIX-semantics unlink relative to the CWD. */
int
unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
{
	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
	    uap->path, UIO_USERSPACE, 0);
}
5518
5519 int
5520 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5521 {
5522 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5523 return EINVAL;
5524 }
5525
5526 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5527 int unlink_flags = 0;
5528
5529 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5530 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5531 }
5532 return rmdirat_internal(vfs_context_current(), uap->fd,
5533 uap->path, UIO_USERSPACE, unlink_flags);
5534 } else {
5535 return unlinkat_internal(vfs_context_current(), uap->fd,
5536 NULLVP, uap->path, UIO_USERSPACE, 0);
5537 }
5538 }
5539
5540 /*
5541 * Reposition read/write file offset.
5542 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/* fp_getfvp fails with ENOTSUP for non-vnode fds (e.g. pipes);
	 * map that to ESPIPE per lseek(2) semantics. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check the weaker
	 * "get" permission for that case. */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the absolute target offset according to 'whence'. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->f_fglob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* Filesystem resolves hole/data positions in place. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->f_fglob->fg_offset = offset;
				*retval = fp->f_fglob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read." Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
5634
5635
5636 /*
5637 * Check access permissions.
5638 *
5639 * Returns: 0 Success
5640 * vnode_authorize:???
5641 */
5642 static int
5643 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5644 {
5645 kauth_action_t action;
5646 int error;
5647
5648 /*
5649 * If just the regular access bits, convert them to something
5650 * that vnode_authorize will understand.
5651 */
5652 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5653 action = 0;
5654 if (uflags & R_OK) {
5655 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5656 }
5657 if (uflags & W_OK) {
5658 if (vnode_isdir(vp)) {
5659 action |= KAUTH_VNODE_ADD_FILE |
5660 KAUTH_VNODE_ADD_SUBDIRECTORY;
5661 /* might want delete rights here too */
5662 } else {
5663 action |= KAUTH_VNODE_WRITE_DATA;
5664 }
5665 }
5666 if (uflags & X_OK) {
5667 if (vnode_isdir(vp)) {
5668 action |= KAUTH_VNODE_SEARCH;
5669 } else {
5670 action |= KAUTH_VNODE_EXECUTE;
5671 }
5672 }
5673 } else {
5674 /* take advantage of definition of uflags */
5675 action = uflags >> 8;
5676 }
5677
5678 #if CONFIG_MACF
5679 error = mac_vnode_check_access(ctx, vp, uflags);
5680 if (error) {
5681 return error;
5682 }
5683 #endif /* MAC */
5684
5685 /* action == 0 means only check for existence */
5686 if (action != 0) {
5687 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5688 } else {
5689 error = 0;
5690 }
5691
5692 return error;
5693 }
5694
5695
5696
5697 /*
5698 * access_extended: Check access permissions in bulk.
5699 *
5700 * Description: uap->entries Pointer to an array of accessx
5701 * descriptor structs, plus one or
5702 * more NULL terminated strings (see
5703 * "Notes" section below).
5704 * uap->size Size of the area pointed to by
5705 * uap->entries.
5706 * uap->results Pointer to the results array.
5707 *
5708 * Returns: 0 Success
5709 * ENOMEM Insufficient memory
5710 * EINVAL Invalid arguments
5711 * namei:EFAULT Bad address
5712 * namei:ENAMETOOLONG Filename too long
5713 * namei:ENOENT No such file or directory
5714 * namei:ELOOP Too many levels of symbolic links
5715 * namei:EBADF Bad file descriptor
5716 * namei:ENOTDIR Not a directory
5717 * namei:???
5718 * access1:
5719 *
5720 * Implicit returns:
5721 * uap->results Array contents modified
5722 *
5723 * Notes: The uap->entries are structured as an arbitrary length array
5724 * of accessx descriptors, followed by one or more NULL terminated
5725 * strings
5726 *
5727 * struct accessx_descriptor[0]
5728 * ...
5729 * struct accessx_descriptor[n]
5730 * char name_data[0];
5731 *
5732 * We determine the entry count by walking the buffer containing
5733 * the uap->entries argument descriptor. For each descriptor we
5734 * see, the valid values for the offset ad_name_offset will be
5735 * in the byte range:
5736 *
5737 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5738 * to
5739 * [ uap->entries + uap->size - 2 ]
5740 *
5741 * since we must have at least one string, and the string must
5742 * be at least one character plus the NULL terminator in length.
5743 *
5744 * XXX: Need to support the check-as uid argument
5745 */
5746 int
5747 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5748 {
5749 struct accessx_descriptor *input = NULL;
5750 errno_t *result = NULL;
5751 errno_t error = 0;
5752 int wantdelete = 0;
5753 unsigned int desc_max, desc_actual, i, j;
5754 struct vfs_context context;
5755 struct nameidata nd;
5756 int niopts;
5757 vnode_t vp = NULL;
5758 vnode_t dvp = NULL;
5759 #define ACCESSX_MAX_DESCR_ON_STACK 10
5760 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5761
5762 context.vc_ucred = NULL;
5763
5764 /*
5765 * Validate parameters; if valid, copy the descriptor array and string
5766 * arguments into local memory. Before proceeding, the following
5767 * conditions must have been met:
5768 *
5769 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5770 * o There must be sufficient room in the request for at least one
 * descriptor and a one byte NUL terminated string.
5772 * o The allocation of local storage must not fail.
5773 */
5774 if (uap->size > ACCESSX_MAX_TABLESIZE) {
5775 return ENOMEM;
5776 }
5777 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
5778 return EINVAL;
5779 }
5780 if (uap->size <= sizeof(stack_input)) {
5781 input = stack_input;
5782 } else {
5783 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5784 if (input == NULL) {
5785 error = ENOMEM;
5786 goto out;
5787 }
5788 }
5789 error = copyin(uap->entries, input, uap->size);
5790 if (error) {
5791 goto out;
5792 }
5793
5794 AUDIT_ARG(opaque, input, uap->size);
5795
5796 /*
 * Force NUL termination of the copyin buffer to avoid namei() running
5798 * off the end. If the caller passes us bogus data, they may get a
5799 * bogus result.
5800 */
5801 ((char *)input)[uap->size - 1] = 0;
5802
5803 /*
5804 * Access is defined as checking against the process' real identity,
5805 * even if operations are checking the effective identity. This
5806 * requires that we use a local vfs context.
5807 */
5808 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5809 context.vc_thread = current_thread();
5810
5811 /*
5812 * Find out how many entries we have, so we can allocate the result
5813 * array by walking the list and adjusting the count downward by the
5814 * earliest string offset we see.
5815 */
5816 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5817 desc_actual = desc_max;
5818 for (i = 0; i < desc_actual; i++) {
5819 /*
5820 * Take the offset to the name string for this entry and
5821 * convert to an input array index, which would be one off
5822 * the end of the array if this entry was the lowest-addressed
5823 * name string.
5824 */
5825 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5826
5827 /*
5828 * An offset greater than the max allowable offset is an error.
5829 * It is also an error for any valid entry to point
5830 * to a location prior to the end of the current entry, if
5831 * it's not a reference to the string of the previous entry.
5832 */
5833 if (j > desc_max || (j != 0 && j <= i)) {
5834 error = EINVAL;
5835 goto out;
5836 }
5837
5838 /* Also do not let ad_name_offset point to something beyond the size of the input */
5839 if (input[i].ad_name_offset >= uap->size) {
5840 error = EINVAL;
5841 goto out;
5842 }
5843
5844 /*
5845 * An offset of 0 means use the previous descriptor's offset;
5846 * this is used to chain multiple requests for the same file
5847 * to avoid multiple lookups.
5848 */
5849 if (j == 0) {
5850 /* This is not valid for the first entry */
5851 if (i == 0) {
5852 error = EINVAL;
5853 goto out;
5854 }
5855 continue;
5856 }
5857
5858 /*
5859 * If the offset of the string for this descriptor is before
5860 * what we believe is the current actual last descriptor,
5861 * then we need to adjust our estimate downward; this permits
5862 * the string table following the last descriptor to be out
5863 * of order relative to the descriptor list.
5864 */
5865 if (j < desc_actual) {
5866 desc_actual = j;
5867 }
5868 }
5869
5870 /*
5871 * We limit the actual number of descriptors we are willing to process
5872 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
5873 * requested does not exceed this limit,
5874 */
5875 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5876 error = ENOMEM;
5877 goto out;
5878 }
5879 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
5880 if (result == NULL) {
5881 error = ENOMEM;
5882 goto out;
5883 }
5884
5885 /*
5886 * Do the work by iterating over the descriptor entries we know to
5887 * at least appear to contain valid data.
5888 */
5889 error = 0;
5890 for (i = 0; i < desc_actual; i++) {
5891 /*
5892 * If the ad_name_offset is 0, then we use the previous
5893 * results to make the check; otherwise, we are looking up
5894 * a new file name.
5895 */
5896 if (input[i].ad_name_offset != 0) {
5897 /* discard old vnodes */
5898 if (vp) {
5899 vnode_put(vp);
5900 vp = NULL;
5901 }
5902 if (dvp) {
5903 vnode_put(dvp);
5904 dvp = NULL;
5905 }
5906
5907 /*
5908 * Scan forward in the descriptor list to see if we
5909 * need the parent vnode. We will need it if we are
5910 * deleting, since we must have rights to remove
5911 * entries in the parent directory, as well as the
5912 * rights to delete the object itself.
5913 */
5914 wantdelete = input[i].ad_flags & _DELETE_OK;
5915 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
5916 if (input[j].ad_flags & _DELETE_OK) {
5917 wantdelete = 1;
5918 }
5919 }
5920
5921 niopts = FOLLOW | AUDITVNPATH1;
5922
5923 /* need parent for vnode_authorize for deletion test */
5924 if (wantdelete) {
5925 niopts |= WANTPARENT;
5926 }
5927
5928 /* do the lookup */
5929 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5930 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5931 &context);
5932 error = namei(&nd);
5933 if (!error) {
5934 vp = nd.ni_vp;
5935 if (wantdelete) {
5936 dvp = nd.ni_dvp;
5937 }
5938 }
5939 nameidone(&nd);
5940 }
5941
5942 /*
5943 * Handle lookup errors.
5944 */
5945 switch (error) {
5946 case ENOENT:
5947 case EACCES:
5948 case EPERM:
5949 case ENOTDIR:
5950 result[i] = error;
5951 break;
5952 case 0:
5953 /* run this access check */
5954 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5955 break;
5956 default:
5957 /* fatal lookup error */
5958
5959 goto out;
5960 }
5961 }
5962
5963 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5964
5965 /* copy out results */
5966 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5967
5968 out:
5969 if (input && input != stack_input) {
5970 FREE(input, M_TEMP);
5971 }
5972 if (result) {
5973 FREE(result, M_TEMP);
5974 }
5975 if (vp) {
5976 vnode_put(vp);
5977 }
5978 if (dvp) {
5979 vnode_put(dvp);
5980 }
5981 if (IS_VALID_CRED(context.vc_ucred)) {
5982 kauth_cred_unref(&context.vc_ucred);
5983 }
5984 return error;
5985 }
5986
5987
5988 /*
5989 * Returns: 0 Success
5990 * namei:EFAULT Bad address
5991 * namei:ENAMETOOLONG Filename too long
5992 * namei:ENOENT No such file or directory
5993 * namei:ELOOP Too many levels of symbolic links
5994 * namei:EBADF Bad file descriptor
5995 * namei:ENOTDIR Not a directory
5996 * namei:???
5997 * access1:
5998 */
/*
 * Common implementation of access(2) and faccessat(2): check whether the
 * file named by fd/path may be accessed with the rights in `amode'.
 *
 * Unless AT_EACCESS is set in `flag', the check is made against the
 * caller's real (not effective) identity, per POSIX access() semantics;
 * a local context holding the real credential is built for that case
 * and released on exit.
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* ni_dvp is only held when WANTPARENT was set for the deletion test */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* only the real-identity path took its own credential reference */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6077
6078 int
6079 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6080 {
6081 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6082 uap->path, uap->flags, 0, UIO_USERSPACE);
6083 }
6084
6085 int
6086 faccessat(__unused proc_t p, struct faccessat_args *uap,
6087 __unused int32_t *retval)
6088 {
6089 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
6090 return EINVAL;
6091 }
6092
6093 return faccessat_internal(vfs_context_current(), uap->fd,
6094 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6095 }
6096
6097 /*
6098 * Returns: 0 Success
6099 * EFAULT
6100 * copyout:EFAULT
6101 * namei:???
6102 * vn_stat:???
6103 */
6104 static int
6105 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6106 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6107 enum uio_seg segflg, int fd, int flag)
6108 {
6109 struct nameidata nd;
6110 int follow;
6111 union {
6112 struct stat sb;
6113 struct stat64 sb64;
6114 } source = {};
6115 union {
6116 struct user64_stat user64_sb;
6117 struct user32_stat user32_sb;
6118 struct user64_stat64 user64_sb64;
6119 struct user32_stat64 user32_sb64;
6120 } dest = {};
6121 caddr_t sbp;
6122 int error, my_size;
6123 kauth_filesec_t fsec;
6124 size_t xsecurity_bufsize;
6125 void * statptr;
6126 struct fileproc *fp = NULL;
6127 int needsrealdev = 0;
6128
6129 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6130 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6131 segflg, path, ctx);
6132
6133 #if NAMEDRSRCFORK
6134 int is_namedstream = 0;
6135 /* stat calls are allowed for resource forks. */
6136 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6137 #endif
6138
6139 if (flag & AT_FDONLY) {
6140 vnode_t fvp;
6141
6142 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6143 if (error) {
6144 return error;
6145 }
6146 if ((error = vnode_getwithref(fvp))) {
6147 file_drop(fd);
6148 return error;
6149 }
6150 nd.ni_vp = fvp;
6151 } else {
6152 error = nameiat(&nd, fd);
6153 if (error) {
6154 return error;
6155 }
6156 }
6157 fsec = KAUTH_FILESEC_NONE;
6158
6159 statptr = (void *)&source;
6160
6161 #if NAMEDRSRCFORK
6162 /* Grab reference on the shadow stream file vnode to
6163 * force an inactive on release which will mark it
6164 * for recycle.
6165 */
6166 if (vnode_isnamedstream(nd.ni_vp) &&
6167 (nd.ni_vp->v_parent != NULLVP) &&
6168 vnode_isshadow(nd.ni_vp)) {
6169 is_namedstream = 1;
6170 vnode_ref(nd.ni_vp);
6171 }
6172 #endif
6173
6174 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6175 if (fp && (xsecurity == USER_ADDR_NULL)) {
6176 /*
6177 * If the caller has the file open, and is not
6178 * requesting extended security information, we are
6179 * going to let them get the basic stat information.
6180 */
6181 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6182 fp->f_fglob->fg_cred);
6183 } else {
6184 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6185 isstat64, needsrealdev, ctx);
6186 }
6187
6188 #if NAMEDRSRCFORK
6189 if (is_namedstream) {
6190 vnode_rele(nd.ni_vp);
6191 }
6192 #endif
6193 vnode_put(nd.ni_vp);
6194 nameidone(&nd);
6195 if (fp) {
6196 file_drop(fd);
6197 fp = NULL;
6198 }
6199
6200 if (error) {
6201 return error;
6202 }
6203 /* Zap spare fields */
6204 if (isstat64 != 0) {
6205 source.sb64.st_lspare = 0;
6206 source.sb64.st_qspare[0] = 0LL;
6207 source.sb64.st_qspare[1] = 0LL;
6208 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6209 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6210 my_size = sizeof(dest.user64_sb64);
6211 sbp = (caddr_t)&dest.user64_sb64;
6212 } else {
6213 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6214 my_size = sizeof(dest.user32_sb64);
6215 sbp = (caddr_t)&dest.user32_sb64;
6216 }
6217 /*
6218 * Check if we raced (post lookup) against the last unlink of a file.
6219 */
6220 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6221 source.sb64.st_nlink = 1;
6222 }
6223 } else {
6224 source.sb.st_lspare = 0;
6225 source.sb.st_qspare[0] = 0LL;
6226 source.sb.st_qspare[1] = 0LL;
6227 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6228 munge_user64_stat(&source.sb, &dest.user64_sb);
6229 my_size = sizeof(dest.user64_sb);
6230 sbp = (caddr_t)&dest.user64_sb;
6231 } else {
6232 munge_user32_stat(&source.sb, &dest.user32_sb);
6233 my_size = sizeof(dest.user32_sb);
6234 sbp = (caddr_t)&dest.user32_sb;
6235 }
6236
6237 /*
6238 * Check if we raced (post lookup) against the last unlink of a file.
6239 */
6240 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6241 source.sb.st_nlink = 1;
6242 }
6243 }
6244 if ((error = copyout(sbp, ub, my_size)) != 0) {
6245 goto out;
6246 }
6247
6248 /* caller wants extended security information? */
6249 if (xsecurity != USER_ADDR_NULL) {
6250 /* did we get any? */
6251 if (fsec == KAUTH_FILESEC_NONE) {
6252 if (susize(xsecurity_size, 0) != 0) {
6253 error = EFAULT;
6254 goto out;
6255 }
6256 } else {
6257 /* find the user buffer size */
6258 xsecurity_bufsize = fusize(xsecurity_size);
6259
6260 /* copy out the actual data size */
6261 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6262 error = EFAULT;
6263 goto out;
6264 }
6265
6266 /* if the caller supplied enough room, copy out to it */
6267 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6268 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6269 }
6270 }
6271 }
6272 out:
6273 if (fsec != KAUTH_FILESEC_NONE) {
6274 kauth_filesec_free(fsec);
6275 }
6276 return error;
6277 }
6278
6279 /*
6280 * stat_extended: Get file status; with extended security (ACL).
6281 *
6282 * Parameters: p (ignored)
6283 * uap User argument descriptor (see below)
6284 * retval (ignored)
6285 *
6286 * Indirect: uap->path Path of file to get status from
6287 * uap->ub User buffer (holds file status info)
6288 * uap->xsecurity ACL to get (extended security)
6289 * uap->xsecurity_size Size of ACL
6290 *
6291 * Returns: 0 Success
6292 * !0 errno value
6293 *
6294 */
6295 int
6296 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6297 __unused int32_t *retval)
6298 {
6299 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6300 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6301 0);
6302 }
6303
6304 /*
6305 * Returns: 0 Success
6306 * fstatat_internal:??? [see fstatat_internal() in this file]
6307 */
6308 int
6309 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6310 {
6311 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6312 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6313 }
6314
6315 int
6316 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6317 {
6318 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6319 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6320 }
6321
6322 /*
6323 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6324 *
6325 * Parameters: p (ignored)
6326 * uap User argument descriptor (see below)
6327 * retval (ignored)
6328 *
6329 * Indirect: uap->path Path of file to get status from
6330 * uap->ub User buffer (holds file status info)
6331 * uap->xsecurity ACL to get (extended security)
6332 * uap->xsecurity_size Size of ACL
6333 *
6334 * Returns: 0 Success
6335 * !0 errno value
6336 *
6337 */
6338 int
6339 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6340 {
6341 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6342 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6343 0);
6344 }
6345
6346 /*
6347 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6348 *
6349 * Parameters: p (ignored)
6350 * uap User argument descriptor (see below)
6351 * retval (ignored)
6352 *
6353 * Indirect: uap->path Path of file to get status from
6354 * uap->ub User buffer (holds file status info)
6355 * uap->xsecurity ACL to get (extended security)
6356 * uap->xsecurity_size Size of ACL
6357 *
6358 * Returns: 0 Success
6359 * !0 errno value
6360 *
6361 */
6362 int
6363 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6364 {
6365 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6366 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6367 AT_SYMLINK_NOFOLLOW);
6368 }
6369
6370 /*
6371 * Get file status; this version does not follow links.
6372 */
6373 int
6374 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6375 {
6376 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6377 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6378 }
6379
6380 int
6381 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6382 {
6383 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6384 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6385 }
6386
6387 /*
6388 * lstat64_extended: Get file status; can handle large inode numbers; does not
6389 * follow links; with extended security (ACL).
6390 *
6391 * Parameters: p (ignored)
6392 * uap User argument descriptor (see below)
6393 * retval (ignored)
6394 *
6395 * Indirect: uap->path Path of file to get status from
6396 * uap->ub User buffer (holds file status info)
6397 * uap->xsecurity ACL to get (extended security)
6398 * uap->xsecurity_size Size of ACL
6399 *
6400 * Returns: 0 Success
6401 * !0 errno value
6402 *
6403 */
6404 int
6405 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6406 {
6407 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6408 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6409 AT_SYMLINK_NOFOLLOW);
6410 }
6411
6412 int
6413 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6414 {
6415 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6416 return EINVAL;
6417 }
6418
6419 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6420 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6421 }
6422
6423 int
6424 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6425 __unused int32_t *retval)
6426 {
6427 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6428 return EINVAL;
6429 }
6430
6431 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6432 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6433 }
6434
6435 /*
6436 * Get configurable pathname variables.
6437 *
6438 * Returns: 0 Success
6439 * namei:???
6440 * vn_pathconf:???
6441 *
6442 * Notes: Global implementation constants are intended to be
6443 * implemented in this function directly; all other constants
6444 * are per-FS implementation, and therefore must be handled in
6445 * each respective FS, instead.
6446 *
6447 * XXX We implement some things globally right now that should actually be
6448 * XXX per-FS; we will need to deal with this at some point.
6449 */
6450 /* ARGSUSED */
6451 int
6452 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6453 {
6454 int error;
6455 struct nameidata nd;
6456 vfs_context_t ctx = vfs_context_current();
6457
6458 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6459 UIO_USERSPACE, uap->path, ctx);
6460 error = namei(&nd);
6461 if (error) {
6462 return error;
6463 }
6464
6465 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6466
6467 vnode_put(nd.ni_vp);
6468 nameidone(&nd);
6469 return error;
6470 }
6471
6472 /*
6473 * Return target name of a symbolic link.
6474 */
6475 /* ARGSUSED */
/*
 * Common implementation of readlink(2)/readlinkat(2): read the target
 * of the symbolic link named by fd/path into the caller's buffer.
 *
 * On return *retval holds the number of bytes placed in the buffer
 * (the result is not NUL-terminated); EINVAL is returned if the
 * looked-up vnode is not a symlink.
 */
static int
readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
    enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
    int *retval)
{
	vnode_t vp;
	uio_t auio;
	int error;
	struct nameidata nd;
	char uio_buf[UIO_SIZEOF(1)];

	/* never follow the final symlink — we want the link itself */
	NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
	    seg, path, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, buf, bufsize);
	if (vp->v_type != VLNK) {
		error = EINVAL;
	} else {
#if CONFIG_MACF
		error = mac_vnode_check_readlink(ctx, vp);
#endif
		if (error == 0) {
			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
			    ctx);
		}
		if (error == 0) {
			error = VNOP_READLINK(vp, auio, ctx);
		}
	}
	vnode_put(vp);

	/* bytes transferred = requested size minus what remains in the uio */
	*retval = bufsize - (int)uio_resid(auio);
	return error;
}
6520
6521 int
6522 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6523 {
6524 enum uio_seg procseg;
6525
6526 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6527 return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6528 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6529 uap->count, procseg, retval);
6530 }
6531
6532 int
6533 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6534 {
6535 enum uio_seg procseg;
6536
6537 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6538 return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6539 procseg, uap->buf, uap->bufsize, procseg, retval);
6540 }
6541
6542 /*
6543 * Change file flags, the deep inner layer.
6544 */
6545 static int
6546 chflags0(vnode_t vp, struct vnode_attr *va,
6547 int (*setattr)(vnode_t, void *, vfs_context_t),
6548 void *arg, vfs_context_t ctx)
6549 {
6550 kauth_action_t action = 0;
6551 int error;
6552
6553 #if CONFIG_MACF
6554 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
6555 if (error) {
6556 goto out;
6557 }
6558 #endif
6559
6560 /* request authorisation, disregard immutability */
6561 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
6562 goto out;
6563 }
6564 /*
6565 * Request that the auth layer disregard those file flags it's allowed to when
6566 * authorizing this operation; we need to do this in order to be able to
6567 * clear immutable flags.
6568 */
6569 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
6570 goto out;
6571 }
6572 error = (*setattr)(vp, arg, ctx);
6573
6574 #if CONFIG_MACF
6575 if (error == 0) {
6576 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
6577 }
6578 #endif
6579
6580 out:
6581 return error;
6582 }
6583
6584 /*
6585 * Change file flags.
6586 *
6587 * NOTE: this will vnode_put() `vp'
6588 */
6589 static int
6590 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6591 {
6592 struct vnode_attr va;
6593 int error;
6594
6595 VATTR_INIT(&va);
6596 VATTR_SET(&va, va_flags, flags);
6597
6598 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6599 vnode_put(vp);
6600
6601 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6602 error = ENOTSUP;
6603 }
6604
6605 return error;
6606 }
6607
6608 /*
6609 * Change flags of a file given a path name.
6610 */
6611 /* ARGSUSED */
6612 int
6613 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6614 {
6615 vnode_t vp;
6616 vfs_context_t ctx = vfs_context_current();
6617 int error;
6618 struct nameidata nd;
6619
6620 AUDIT_ARG(fflags, uap->flags);
6621 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6622 UIO_USERSPACE, uap->path, ctx);
6623 error = namei(&nd);
6624 if (error) {
6625 return error;
6626 }
6627 vp = nd.ni_vp;
6628 nameidone(&nd);
6629
6630 /* we don't vnode_put() here because chflags1 does internally */
6631 error = chflags1(vp, uap->flags, ctx);
6632
6633 return error;
6634 }
6635
6636 /*
6637 * Change flags of a file given a file descriptor.
6638 */
6639 /* ARGSUSED */
6640 int
6641 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6642 {
6643 vnode_t vp;
6644 int error;
6645
6646 AUDIT_ARG(fd, uap->fd);
6647 AUDIT_ARG(fflags, uap->flags);
6648 if ((error = file_vnode(uap->fd, &vp))) {
6649 return error;
6650 }
6651
6652 if ((error = vnode_getwithref(vp))) {
6653 file_drop(uap->fd);
6654 return error;
6655 }
6656
6657 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6658
6659 /* we don't vnode_put() here because chflags1 does internally */
6660 error = chflags1(vp, uap->flags, vfs_context_current());
6661
6662 file_drop(uap->fd);
6663 return error;
6664 }
6665
6666 /*
6667 * Change security information on a filesystem object.
6668 *
6669 * Returns: 0 Success
6670 * EPERM Operation not permitted
6671 * vnode_authattr:??? [anything vnode_authattr can return]
6672 * vnode_authorize:??? [anything vnode_authorize can return]
6673 * vnode_setattr:??? [anything vnode_setattr can return]
6674 *
6675 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6676 * translated to EPERM before being returned.
6677 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC policy checks: mode, ownership and ACL are vetted separately */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* per the function contract, auth failures surface as EPERM */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* notify MAC modules of the changes that actually took place */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
6745
6746
6747 /*
6748 * Change mode of a file given a path name.
6749 *
6750 * Returns: 0 Success
6751 * namei:??? [anything namei can return]
6752 * chmod_vnode:??? [anything chmod_vnode can return]
6753 */
6754 static int
6755 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6756 int fd, int flag, enum uio_seg segflg)
6757 {
6758 struct nameidata nd;
6759 int follow, error;
6760
6761 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6762 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6763 segflg, path, ctx);
6764 if ((error = nameiat(&nd, fd))) {
6765 return error;
6766 }
6767 error = chmod_vnode(ctx, nd.ni_vp, vap);
6768 vnode_put(nd.ni_vp);
6769 nameidone(&nd);
6770 return error;
6771 }
6772
6773 /*
6774 * chmod_extended: Change the mode of a file given a path name; with extended
6775 * argument list (including extended security (ACL)).
6776 *
6777 * Parameters: p Process requesting the open
6778 * uap User argument descriptor (see below)
6779 * retval (ignored)
6780 *
6781 * Indirect: uap->path Path to object (same as 'chmod')
6782 * uap->uid UID to set
6783 * uap->gid GID to set
6784 * uap->mode File mode to set (same as 'chmod')
6785 * uap->xsecurity ACL to set (or delete)
6786 *
6787 * Returns: 0 Success
6788 * !0 errno value
6789 *
6790 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6791 *
6792 * XXX: We should enummerate the possible errno values here, and where
6793 * in the code they originated.
6794 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* only set the attributes the caller actually supplied */
	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* explicit remove request */
	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case USER_ADDR_NULL:
		break;
	default:
		/* copy the caller's ACL in from user space */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	/* release the copied-in filesec, if any */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
6840
6841 /*
6842 * Returns: 0 Success
6843 * chmodat:??? [anything chmodat can return]
6844 */
6845 static int
6846 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6847 int flag, enum uio_seg segflg)
6848 {
6849 struct vnode_attr va;
6850
6851 VATTR_INIT(&va);
6852 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6853
6854 return chmodat(ctx, path, &va, fd, flag, segflg);
6855 }
6856
6857 int
6858 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6859 {
6860 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6861 AT_FDCWD, 0, UIO_USERSPACE);
6862 }
6863
6864 int
6865 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6866 {
6867 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
6868 return EINVAL;
6869 }
6870
6871 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6872 uap->fd, uap->flag, UIO_USERSPACE);
6873 }
6874
6875 /*
6876 * Change mode of a file given a file descriptor.
6877 */
6878 static int
6879 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6880 {
6881 vnode_t vp;
6882 int error;
6883
6884 AUDIT_ARG(fd, fd);
6885
6886 if ((error = file_vnode(fd, &vp)) != 0) {
6887 return error;
6888 }
6889 if ((error = vnode_getwithref(vp)) != 0) {
6890 file_drop(fd);
6891 return error;
6892 }
6893 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6894
6895 error = chmod_vnode(vfs_context_current(), vp, vap);
6896 (void)vnode_put(vp);
6897 file_drop(fd);
6898
6899 return error;
6900 }
6901
6902 /*
6903 * fchmod_extended: Change mode of a file given a file descriptor; with
6904 * extended argument list (including extended security (ACL)).
6905 *
6906 * Parameters: p Process requesting to change file mode
6907 * uap User argument descriptor (see below)
6908 * retval (ignored)
6909 *
6910 * Indirect: uap->mode File mode to set (same as 'chmod')
6911 * uap->uid UID to set
6912 * uap->gid GID to set
6913 * uap->xsecurity ACL to set (or delete)
6914 * uap->fd File descriptor of file to change mode
6915 *
6916 * Returns: 0 Success
6917 * !0 errno value
6918 *
6919 */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* only set the attributes the caller actually supplied */
	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* NOTE: unlike chmod_extended(), a NULL pointer here also removes the ACL */
	case USER_ADDR_NULL:
		VATTR_SET(&va, va_acl, NULL);
		break;
	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		/* copy the caller's ACL in from user space */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	error = fchmod1(p, uap->fd, &va);

	/* free the filesec only in the cases that copied one in above */
	switch (uap->xsecurity) {
	case USER_ADDR_NULL:
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		if (xsecdst != NULL) {
			kauth_filesec_free(xsecdst);
		}
	}
	return error;
}
6972
6973 int
6974 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6975 {
6976 struct vnode_attr va;
6977
6978 VATTR_INIT(&va);
6979 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6980
6981 return fchmod1(p, uap->fd, &va);
6982 }
6983
6984
6985 /*
6986 * Set ownership given a path name.
6987 */
6988 /* ARGSUSED */
/*
 * Common implementation of chown(2)/lchown(2)/fchownat(2): set the
 * ownership of the file named by fd/path.  A uid/gid of VNOVAL leaves
 * that id unchanged; AT_SYMLINK_NOFOLLOW in `flag' operates on the
 * symlink itself.  Permission failures surface as EPERM, not EACCES.
 */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* VNOVAL in either id means "leave that id unchanged" */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

	vnode_put(vp);
	return error;
}
7055
7056 int
7057 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7058 {
7059 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7060 uap->uid, uap->gid, 0, UIO_USERSPACE);
7061 }
7062
7063 int
7064 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7065 {
7066 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7067 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7068 }
7069
7070 int
7071 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7072 {
7073 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7074 return EINVAL;
7075 }
7076
7077 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7078 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7079 }
7080
/*
 * Set ownership given a file descriptor.
 */
/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* file_vnode() takes a use count on the fd; dropped via file_drop() */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL means "leave this id unchanged" */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* authorization failure on chown is EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7153
/*
 * Fetch the times for the *utimes() family.
 *
 * Copies a two-element struct timeval array from user address `usrtvp`
 * into tsp[0] (access time) and tsp[1] (modification time), using the
 * 32- or 64-bit user layout as appropriate for the current process.
 * A NULL user pointer means "use the current time for both".
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		if (IS_64BIT_PROCESS(current_proc())) {
			/* 64-bit user process: timevals are user64 layout */
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			/* 32-bit user process: timevals are user32 layout */
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
7186
/*
 * Common backend for utimes()/futimes(): apply ts[0] as the access time
 * and ts[1] as the modification time of vp.  `nullflag` is set when the
 * caller passed a NULL times pointer; it sets VA_UTIMES_NULL and
 * suppresses the EACCES->EPERM translation on authorization failure.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* explicit times report permission failures as EPERM */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
7243
/*
 * Set the access and modification times of a file, by path.
 */
/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	nameidone(&nd);

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

	/* a NULL tptr selects VA_UTIMES_NULL handling in setutimes() */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* namei() gave us an iocount on ni_vp; release it on all paths */
	vnode_put(nd.ni_vp);
	return error;
}
7284
/*
 * Set the access and modification times of a file, by descriptor.
 */
/* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* NULL tptr means "use the current time" (handled by getutimes) */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7315
/*
 * Truncate a file given its path name.
 */
/* ARGSUSED */
int
truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;

	/* negative lengths are rejected up front */
	if (uap->length < 0) {
		return EINVAL;
	}
	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* truncation is expressed as a data-size attribute change */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize the size change */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
	}
#endif

out:
	vnode_put(vp);
	return error;
}
7370
/*
 * Truncate a file given a file descriptor.
 */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error;
	int fd = uap->fd;

	AUDIT_ARG(fd, uap->fd);
	/* negative lengths are rejected up front */
	if (uap->length < 0) {
		return EINVAL;
	}

	if ((error = fp_lookup(p, fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->f_fglob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory objects have their own truncate path */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp->f_fglob->fg_data;

	/* the descriptor must have been opened for writing */
	if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx,
	    fp->f_fglob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	/* truncation is expressed as a data-size attribute change */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
	}
#endif

	(void)vnode_put(vp);
out:
	file_drop(fd);
	return error;
}
7442
7443
7444 /*
7445 * Sync an open file with synchronized I/O _file_ integrity completion
7446 */
7447 /* ARGSUSED */
7448 int
7449 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
7450 {
7451 __pthread_testcancel(1);
7452 return fsync_common(p, uap, MNT_WAIT);
7453 }
7454
7455
7456 /*
7457 * Sync an open file with synchronized I/O _file_ integrity completion
7458 *
7459 * Notes: This is a legacy support function that does not test for
7460 * thread cancellation points.
7461 */
7462 /* ARGSUSED */
7463 int
7464 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
7465 {
7466 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
7467 }
7468
7469
7470 /*
7471 * Sync an open file with synchronized I/O _data_ integrity completion
7472 */
7473 /* ARGSUSED */
7474 int
7475 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
7476 {
7477 __pthread_testcancel(1);
7478 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
7479 }
7480
7481
/*
 * fsync_common
 *
 * Common fsync code to support both synchronized I/O file integrity completion
 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
 *
 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
 * includes additional metadata unnecessary for retrieving the file data
 * contents, such as atime, mtime, ctime, etc., also be committed to stable
 * storage.
 *
 * Parameters:	p			The process
 *		uap->fd			The descriptor to synchronize
 *		flags			The data integrity flags
 *
 * Returns:	int			Success
 *	fp_getfvp:EBADF			Bad file descriptor
 *	fp_getfvp:ENOTSUP		fd does not refer to a vnode
 *	VNOP_FSYNC:???			unspecified
 *
 * Notes:	We use struct fsync_args because it is a short name, and all
 *		caller argument structures are otherwise identical.
 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->f_flags & FP_WRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7544
/*
 * Duplicate files. Source must be a file, target must be a file or
 * must not exist.
 *
 * XXX Copyfile authorisation checking is woefully inadequate, and will not
 * perform inheritance correctly.
 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();
#if CONFIG_MACF
	struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
	struct vnode_attr va;
#endif

	/* Check that the flags are valid. */

	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* an existing target may only be replaced with CPF_OVERWRITE */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* This calls existing MAC hooks for open */
	if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
	    NULL))) {
		goto out;
	}

	if (tvp) {
		/*
		 * See unlinkat_internal for an explanation of the potential
		 * ENOENT from the MAC hook but the gist is that the MAC hook
		 * can fail because vn_getpath isn't able to return the full
		 * path. We choose to ignore this failure.
		 */
		error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
		if (error && error != ENOENT) {
			goto out;
		}
		error = 0;
	}

#if CONFIG_MACF
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, fvp->v_type);
	/* Mask off all but regular access permissions */
	VATTR_SET(&va, va_mode,
	    ((((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
	error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif /* CONFIG_MACF */

	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* copying a file on top of its own parent directory is nonsense */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;
	}
	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* -1 is the internal "same file, nothing to do" marker: report success */
	if (error == -1) {
		return 0;
	}
	return error;
}
7672
7673 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7674
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 *
 * `data_read_authorised` is TRUE when the caller has already verified
 * read access to the source data (e.g. via an open readable fd), in
 * which case the KAUTH_VNODE_READ_DATA check is skipped here.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* only regular files, symlinks and plain directories may be cloned */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* volume roots and mount points may not be cloned */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* the destination must not already exist */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* cloning only works within a single filesystem */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* skip the READ_DATA check when the caller already authorised it */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	VATTR_WANTED(&va, va_acl);

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	/* the returned ACL (if any) must be freed on the way out */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&va)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
7899
/*
 * clone files or directories, target must not exist.
 */
/* ARGSUSED */
int
clonefileat(__unused proc_t p, struct clonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct nameidata fromnd;
	int follow;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_dirfd);

	/* CLONE_NOFOLLOW: clone a trailing symlink itself, not its target */
	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
	    UIO_USERSPACE, uap->src, ctx);
	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
		return error;
	}

	fvp = fromnd.ni_vp;
	nameidone(&fromnd);

	/* FALSE: read access to the source has not been pre-authorised */
	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
	return error;
}
7937
/*
 * Clone the file referenced by an open readable descriptor to a new
 * path relative to dst_dirfd.
 */
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* the source descriptor must have been opened for reading */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* TRUE: data reads were authorised via the FREAD check above */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
7978
/*
 * Mount-iteration callback: for every mount whose recorded mount-on path
 * lies strictly beneath that of the mount passed in `arg`, refresh its
 * f_mntonname from its covered vnode.  NOTE(review): appears to be used
 * from the mount-point rename path so submount paths track the new
 * location -- confirm against the caller in renameat_internal.
 */
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = strlen(pmp->mnt_vfsstat.f_mntonname);

	/* only consider mounts whose path starts with pmp's mount-on path */
	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
		return 0;
	}

	/* require a '/' after the prefix, i.e. a true submount (and not pmp itself) */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	/* don't block; a busy submount aborts the iteration with -1 */
	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	/* rewrite f_mntonname in place from the covered vnode's current path */
	int pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
8008
8009 /*
8010 * Rename files. Source and destination must either both be directories,
8011 * or both not be directories. If target is a directory, it must be empty.
8012 */
8013 /* ARGSUSED */
8014 static int
8015 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8016 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
8017 {
8018 if (flags & ~VFS_RENAME_FLAGS_MASK) {
8019 return EINVAL;
8020 }
8021
8022 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
8023 return EINVAL;
8024 }
8025
8026 vnode_t tvp, tdvp;
8027 vnode_t fvp, fdvp;
8028 struct nameidata *fromnd, *tond;
8029 int error;
8030 int do_retry;
8031 int retry_count;
8032 int mntrename;
8033 int need_event;
8034 int need_kpath2;
8035 int has_listeners;
8036 const char *oname = NULL;
8037 char *from_name = NULL, *to_name = NULL;
8038 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8039 int from_len = 0, to_len = 0;
8040 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8041 int holding_mntlock;
8042 mount_t locked_mp = NULL;
8043 vnode_t oparent = NULLVP;
8044 #if CONFIG_FSE
8045 fse_info from_finfo, to_finfo;
8046 #endif
8047 int from_truncated = 0, to_truncated = 0;
8048 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8049 int batched = 0;
8050 struct vnode_attr *fvap, *tvap;
8051 int continuing = 0;
8052 /* carving out a chunk for structs that are too big to be on stack. */
8053 struct {
8054 struct nameidata from_node, to_node;
8055 struct vnode_attr fv_attr, tv_attr;
8056 } * __rename_data;
8057 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
8058 fromnd = &__rename_data->from_node;
8059 tond = &__rename_data->to_node;
8060
8061 holding_mntlock = 0;
8062 do_retry = 0;
8063 retry_count = 0;
8064 retry:
8065 fvp = tvp = NULL;
8066 fdvp = tdvp = NULL;
8067 fvap = tvap = NULL;
8068 mntrename = FALSE;
8069
8070 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8071 segflg, from, ctx);
8072 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
8073
8074 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8075 segflg, to, ctx);
8076 tond->ni_flag = NAMEI_COMPOUNDRENAME;
8077
8078 continue_lookup:
8079 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8080 if ((error = nameiat(fromnd, fromfd))) {
8081 goto out1;
8082 }
8083 fdvp = fromnd->ni_dvp;
8084 fvp = fromnd->ni_vp;
8085
8086 if (fvp && fvp->v_type == VDIR) {
8087 tond->ni_cnd.cn_flags |= WILLBEDIR;
8088 }
8089 }
8090
8091 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8092 if ((error = nameiat(tond, tofd))) {
8093 /*
8094 * Translate error code for rename("dir1", "dir2/.").
8095 */
8096 if (error == EISDIR && fvp->v_type == VDIR) {
8097 error = EINVAL;
8098 }
8099 goto out1;
8100 }
8101 tdvp = tond->ni_dvp;
8102 tvp = tond->ni_vp;
8103 }
8104
8105 #if DEVELOPMENT || DEBUG
8106 /*
8107 * XXX VSWAP: Check for entitlements or special flag here
8108 * so we can restrict access appropriately.
8109 */
8110 #else /* DEVELOPMENT || DEBUG */
8111
8112 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8113 error = EPERM;
8114 goto out1;
8115 }
8116
8117 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8118 error = EPERM;
8119 goto out1;
8120 }
8121 #endif /* DEVELOPMENT || DEBUG */
8122
8123 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8124 error = ENOENT;
8125 goto out1;
8126 }
8127
8128 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8129 error = EEXIST;
8130 goto out1;
8131 }
8132
8133 batched = vnode_compound_rename_available(fdvp);
8134
8135 #if CONFIG_FSE
8136 need_event = need_fsevent(FSE_RENAME, fdvp);
8137 if (need_event) {
8138 if (fvp) {
8139 get_fse_info(fvp, &from_finfo, ctx);
8140 } else {
8141 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8142 if (error) {
8143 goto out1;
8144 }
8145
8146 fvap = &__rename_data->fv_attr;
8147 }
8148
8149 if (tvp) {
8150 get_fse_info(tvp, &to_finfo, ctx);
8151 } else if (batched) {
8152 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8153 if (error) {
8154 goto out1;
8155 }
8156
8157 tvap = &__rename_data->tv_attr;
8158 }
8159 }
8160 #else
8161 need_event = 0;
8162 #endif /* CONFIG_FSE */
8163
8164 has_listeners = kauth_authorize_fileop_has_listeners();
8165
8166 need_kpath2 = 0;
8167 #if CONFIG_AUDIT
8168 if (AUDIT_RECORD_EXISTS()) {
8169 need_kpath2 = 1;
8170 }
8171 #endif
8172
8173 if (need_event || has_listeners) {
8174 if (from_name == NULL) {
8175 GET_PATH(from_name);
8176 if (from_name == NULL) {
8177 error = ENOMEM;
8178 goto out1;
8179 }
8180 }
8181
8182 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8183
8184 if (from_name_no_firmlink == NULL) {
8185 GET_PATH(from_name_no_firmlink);
8186 if (from_name_no_firmlink == NULL) {
8187 error = ENOMEM;
8188 goto out1;
8189 }
8190 }
8191
8192 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8193 }
8194
8195 if (need_event || need_kpath2 || has_listeners) {
8196 if (to_name == NULL) {
8197 GET_PATH(to_name);
8198 if (to_name == NULL) {
8199 error = ENOMEM;
8200 goto out1;
8201 }
8202 }
8203
8204 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8205
8206 if (to_name_no_firmlink == NULL) {
8207 GET_PATH(to_name_no_firmlink);
8208 if (to_name_no_firmlink == NULL) {
8209 error = ENOMEM;
8210 goto out1;
8211 }
8212 }
8213
8214 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8215 if (to_name && need_kpath2) {
8216 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8217 }
8218 }
8219 if (!fvp) {
8220 /*
8221 * Claim: this check will never reject a valid rename.
8222 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8223 * Suppose fdvp and tdvp are not on the same mount.
8224 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8225 * then you can't move it to within another dir on the same mountpoint.
8226 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8227 *
8228 * If this check passes, then we are safe to pass these vnodes to the same FS.
8229 */
8230 if (fdvp->v_mount != tdvp->v_mount) {
8231 error = EXDEV;
8232 goto out1;
8233 }
8234 goto skipped_lookup;
8235 }
8236
8237 if (!batched) {
8238 error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
8239 if (error) {
8240 if (error == ENOENT) {
8241 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8242 /*
8243 * We encountered a race where after doing the namei, tvp stops
8244 * being valid. If so, simply re-drive the rename call from the
8245 * top.
8246 */
8247 do_retry = 1;
8248 retry_count += 1;
8249 }
8250 }
8251 goto out1;
8252 }
8253 }
8254
8255 /*
8256 * If the source and destination are the same (i.e. they're
8257 * links to the same vnode) and the target file system is
8258 * case sensitive, then there is nothing to do.
8259 *
8260 * XXX Come back to this.
8261 */
8262 if (fvp == tvp) {
8263 int pathconf_val;
8264
8265 /*
8266 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8267 * then assume that this file system is case sensitive.
8268 */
8269 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8270 pathconf_val != 0) {
8271 goto out1;
8272 }
8273 }
8274
8275 /*
8276 * Allow the renaming of mount points.
8277 * - target must not exist
8278 * - target must reside in the same directory as source
8279 * - union mounts cannot be renamed
8280 * - "/" cannot be renamed
8281 *
8282 * XXX Handle this in VFS after a continued lookup (if we missed
8283 * in the cache to start off)
8284 *
8285 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8286 * we'll skip past here. The file system is responsible for
8287 * checking that @tvp is not a descendent of @fvp and vice versa
8288 * so it should always return EINVAL if either @tvp or @fvp is the
8289 * root of a volume.
8290 */
8291 if ((fvp->v_flag & VROOT) &&
8292 (fvp->v_type == VDIR) &&
8293 (tvp == NULL) &&
8294 (fvp->v_mountedhere == NULL) &&
8295 (fdvp == tdvp) &&
8296 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8297 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8298 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8299 vnode_t coveredvp;
8300
8301 /* switch fvp to the covered vnode */
8302 coveredvp = fvp->v_mount->mnt_vnodecovered;
8303 if ((vnode_getwithref(coveredvp))) {
8304 error = ENOENT;
8305 goto out1;
8306 }
8307 vnode_put(fvp);
8308
8309 fvp = coveredvp;
8310 mntrename = TRUE;
8311 }
8312 /*
8313 * Check for cross-device rename.
8314 */
8315 if ((fvp->v_mount != tdvp->v_mount) ||
8316 (tvp && (fvp->v_mount != tvp->v_mount))) {
8317 error = EXDEV;
8318 goto out1;
8319 }
8320
8321 /*
8322 * If source is the same as the destination (that is the
8323 * same inode number) then there is nothing to do...
8324 * EXCEPT if the underlying file system supports case
8325 * insensitivity and is case preserving. In this case
8326 * the file system needs to handle the special case of
8327 * getting the same vnode as target (fvp) and source (tvp).
8328 *
8329 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8330 * and _PC_CASE_PRESERVING can have this exception, and they need to
8331 * handle the special case of getting the same vnode as target and
8332 * source. NOTE: Then the target is unlocked going into vnop_rename,
8333 * so not to cause locking problems. There is a single reference on tvp.
8334 *
8335 * NOTE - that fvp == tvp also occurs if they are hard linked and
8336 * that correct behaviour then is just to return success without doing
8337 * anything.
8338 *
8339 * XXX filesystem should take care of this itself, perhaps...
8340 */
8341 if (fvp == tvp && fdvp == tdvp) {
8342 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8343 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8344 fromnd->ni_cnd.cn_namelen)) {
8345 goto out1;
8346 }
8347 }
8348
8349 if (holding_mntlock && fvp->v_mount != locked_mp) {
8350 /*
8351 * we're holding a reference and lock
8352 * on locked_mp, but it no longer matches
8353 * what we want to do... so drop our hold
8354 */
8355 mount_unlock_renames(locked_mp);
8356 mount_drop(locked_mp, 0);
8357 holding_mntlock = 0;
8358 }
8359 if (tdvp != fdvp && fvp->v_type == VDIR) {
8360 /*
8361 * serialize renames that re-shape
8362 * the tree... if holding_mntlock is
8363 * set, then we're ready to go...
8364 * otherwise we
8365 * first need to drop the iocounts
8366 * we picked up, second take the
8367 * lock to serialize the access,
8368 * then finally start the lookup
8369 * process over with the lock held
8370 */
8371 if (!holding_mntlock) {
8372 /*
8373 * need to grab a reference on
8374 * the mount point before we
8375 * drop all the iocounts... once
8376 * the iocounts are gone, the mount
8377 * could follow
8378 */
8379 locked_mp = fvp->v_mount;
8380 mount_ref(locked_mp, 0);
8381
8382 /*
8383 * nameidone has to happen before we vnode_put(tvp)
8384 * since it may need to release the fs_nodelock on the tvp
8385 */
8386 nameidone(tond);
8387
8388 if (tvp) {
8389 vnode_put(tvp);
8390 }
8391 vnode_put(tdvp);
8392
8393 /*
8394 * nameidone has to happen before we vnode_put(fdvp)
8395 * since it may need to release the fs_nodelock on the fvp
8396 */
8397 nameidone(fromnd);
8398
8399 vnode_put(fvp);
8400 vnode_put(fdvp);
8401
8402 mount_lock_renames(locked_mp);
8403 holding_mntlock = 1;
8404
8405 goto retry;
8406 }
8407 } else {
8408 /*
8409 * when we dropped the iocounts to take
8410 * the lock, we allowed the identity of
8411 * the various vnodes to change... if they did,
8412 * we may no longer be dealing with a rename
8413 * that reshapes the tree... once we're holding
8414 * the iocounts, the vnodes can't change type
8415 * so we're free to drop the lock at this point
8416 * and continue on
8417 */
8418 if (holding_mntlock) {
8419 mount_unlock_renames(locked_mp);
8420 mount_drop(locked_mp, 0);
8421 holding_mntlock = 0;
8422 }
8423 }
8424
8425 // save these off so we can later verify that fvp is the same
8426 oname = fvp->v_name;
8427 oparent = fvp->v_parent;
8428
8429 skipped_lookup:
8430 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8431 tdvp, &tvp, &tond->ni_cnd, tvap,
8432 flags, ctx);
8433
8434 if (holding_mntlock) {
8435 /*
8436 * we can drop our serialization
8437 * lock now
8438 */
8439 mount_unlock_renames(locked_mp);
8440 mount_drop(locked_mp, 0);
8441 holding_mntlock = 0;
8442 }
8443 if (error) {
8444 if (error == EDATALESS) {
8445 /*
8446 * If we've been here before, something has gone
8447 * horribly wrong and we should just get out lest
8448 * we spiral around the drain forever.
8449 */
8450 if (flags & VFS_RENAME_DATALESS) {
8451 error = EIO;
8452 goto out1;
8453 }
8454
8455 /*
8456 * The object we're renaming is dataless (or has a
8457 * dataless descendent) and requires materialization
8458 * before the rename occurs. But we're holding the
8459 * mount point's rename lock, so it's not safe to
8460 * make the upcall.
8461 *
8462 * In this case, we release the lock, perform the
8463 * materialization, and start the whole thing over.
8464 */
8465 error = vnode_materialize_dataless_file(fvp,
8466 NAMESPACE_HANDLER_RENAME_OP);
8467
8468 if (error == 0) {
8469 /*
8470 * The next time around we need to tell the
8471 * file system that the materializtaion has
8472 * been performed.
8473 */
8474 flags |= VFS_RENAME_DATALESS;
8475 do_retry = 1;
8476 }
8477 goto out1;
8478 }
8479 if (error == EKEEPLOOKING) {
8480 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8481 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8482 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8483 }
8484 }
8485
8486 fromnd->ni_vp = fvp;
8487 tond->ni_vp = tvp;
8488
8489 goto continue_lookup;
8490 }
8491
8492 /*
8493 * We may encounter a race in the VNOP where the destination didn't
8494 * exist when we did the namei, but it does by the time we go and
8495 * try to create the entry. In this case, we should re-drive this rename
8496 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8497 * but other filesystems susceptible to this race could return it, too.
8498 */
8499 if (error == ERECYCLE) {
8500 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
8501 do_retry = 1;
8502 retry_count += 1;
8503 } else {
8504 printf("rename retry limit due to ERECYCLE reached\n");
8505 error = ENOENT;
8506 }
8507 }
8508
8509 /*
8510 * For compound VNOPs, the authorization callback may return
8511 * ENOENT in case of racing hardlink lookups hitting the name
8512 * cache, redrive the lookup.
8513 */
8514 if (batched && error == ENOENT) {
8515 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8516 do_retry = 1;
8517 retry_count += 1;
8518 }
8519 }
8520
8521 goto out1;
8522 }
8523
8524 /* call out to allow 3rd party notification of rename.
8525 * Ignore result of kauth_authorize_fileop call.
8526 */
8527 kauth_authorize_fileop(vfs_context_ucred(ctx),
8528 KAUTH_FILEOP_RENAME,
8529 (uintptr_t)from_name, (uintptr_t)to_name);
8530 if (flags & VFS_RENAME_SWAP) {
8531 kauth_authorize_fileop(vfs_context_ucred(ctx),
8532 KAUTH_FILEOP_RENAME,
8533 (uintptr_t)to_name, (uintptr_t)from_name);
8534 }
8535
8536 #if CONFIG_FSE
8537 if (from_name != NULL && to_name != NULL) {
8538 if (from_truncated || to_truncated) {
8539 // set it here since only the from_finfo gets reported up to user space
8540 from_finfo.mode |= FSE_TRUNCATED_PATH;
8541 }
8542
8543 if (tvap && tvp) {
8544 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8545 }
8546 if (fvap) {
8547 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8548 }
8549
8550 if (tvp) {
8551 add_fsevent(FSE_RENAME, ctx,
8552 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8553 FSE_ARG_FINFO, &from_finfo,
8554 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8555 FSE_ARG_FINFO, &to_finfo,
8556 FSE_ARG_DONE);
8557 if (flags & VFS_RENAME_SWAP) {
8558 /*
8559 * Strictly speaking, swap is the equivalent of
8560 * *three* renames. FSEvents clients should only take
8561 * the events as a hint, so we only bother reporting
8562 * two.
8563 */
8564 add_fsevent(FSE_RENAME, ctx,
8565 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8566 FSE_ARG_FINFO, &to_finfo,
8567 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8568 FSE_ARG_FINFO, &from_finfo,
8569 FSE_ARG_DONE);
8570 }
8571 } else {
8572 add_fsevent(FSE_RENAME, ctx,
8573 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8574 FSE_ARG_FINFO, &from_finfo,
8575 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8576 FSE_ARG_DONE);
8577 }
8578 }
8579 #endif /* CONFIG_FSE */
8580
8581 /*
8582 * update filesystem's mount point data
8583 */
8584 if (mntrename) {
8585 char *cp, *pathend, *mpname;
8586 char * tobuf;
8587 struct mount *mp;
8588 int maxlen;
8589 size_t len = 0;
8590
8591 mp = fvp->v_mountedhere;
8592
8593 if (vfs_busy(mp, LK_NOWAIT)) {
8594 error = EBUSY;
8595 goto out1;
8596 }
8597 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
8598
8599 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8600 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8601 } else {
8602 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8603 }
8604 if (!error) {
8605 /* find current mount point prefix */
8606 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8607 for (cp = pathend; *cp != '\0'; ++cp) {
8608 if (*cp == '/') {
8609 pathend = cp + 1;
8610 }
8611 }
8612 /* find last component of target name */
8613 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8614 if (*cp == '/') {
8615 mpname = cp + 1;
8616 }
8617 }
8618
8619 /* Update f_mntonname of sub mounts */
8620 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8621
8622 /* append name to prefix */
8623 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
8624 bzero(pathend, maxlen);
8625
8626 strlcpy(pathend, mpname, maxlen);
8627 }
8628 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
8629
8630 vfs_unbusy(mp);
8631
8632 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8633 }
8634 /*
8635 * fix up name & parent pointers. note that we first
8636 * check that fvp has the same name/parent pointers it
8637 * had before the rename call... this is a 'weak' check
8638 * at best...
8639 *
8640 * XXX oparent and oname may not be set in the compound vnop case
8641 */
8642 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8643 int update_flags;
8644
8645 update_flags = VNODE_UPDATE_NAME;
8646
8647 if (fdvp != tdvp) {
8648 update_flags |= VNODE_UPDATE_PARENT;
8649 }
8650
8651 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8652 }
8653 out1:
8654 if (to_name != NULL) {
8655 RELEASE_PATH(to_name);
8656 to_name = NULL;
8657 }
8658 if (to_name_no_firmlink != NULL) {
8659 RELEASE_PATH(to_name_no_firmlink);
8660 to_name_no_firmlink = NULL;
8661 }
8662 if (from_name != NULL) {
8663 RELEASE_PATH(from_name);
8664 from_name = NULL;
8665 }
8666 if (from_name_no_firmlink != NULL) {
8667 RELEASE_PATH(from_name_no_firmlink);
8668 from_name_no_firmlink = NULL;
8669 }
8670 if (holding_mntlock) {
8671 mount_unlock_renames(locked_mp);
8672 mount_drop(locked_mp, 0);
8673 holding_mntlock = 0;
8674 }
8675 if (tdvp) {
8676 /*
8677 * nameidone has to happen before we vnode_put(tdvp)
8678 * since it may need to release the fs_nodelock on the tdvp
8679 */
8680 nameidone(tond);
8681
8682 if (tvp) {
8683 vnode_put(tvp);
8684 }
8685 vnode_put(tdvp);
8686 }
8687 if (fdvp) {
8688 /*
8689 * nameidone has to happen before we vnode_put(fdvp)
8690 * since it may need to release the fs_nodelock on the fdvp
8691 */
8692 nameidone(fromnd);
8693
8694 if (fvp) {
8695 vnode_put(fvp);
8696 }
8697 vnode_put(fdvp);
8698 }
8699
8700 /*
8701 * If things changed after we did the namei, then we will re-drive
8702 * this rename call from the top.
8703 */
8704 if (do_retry) {
8705 do_retry = 0;
8706 goto retry;
8707 }
8708
8709 FREE(__rename_data, M_TEMP);
8710 return error;
8711 }
8712
8713 int
8714 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
8715 {
8716 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
8717 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
8718 }
8719
8720 int
8721 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
8722 {
8723 return renameat_internal(
8724 vfs_context_current(),
8725 uap->fromfd, uap->from,
8726 uap->tofd, uap->to,
8727 UIO_USERSPACE, uap->flags);
8728 }
8729
8730 int
8731 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
8732 {
8733 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
8734 uap->tofd, uap->to, UIO_USERSPACE, 0);
8735 }
8736
8737 /*
8738 * Make a directory file.
8739 *
8740 * Returns: 0 Success
8741 * EEXIST
8742 * namei:???
8743 * vnode_authorize:???
8744 * vn_create:???
8745 */
8746 /* ARGSUSED */
/*
 * Create a directory at `path`, resolved relative to `fd` (or the CWD when
 * fd == AT_FDCWD), with the attributes supplied in `vap`.  Common backend
 * for mkdir(2), mkdirat(2) and mkdir_extended(2).
 *
 * Prefers the file system's compound-mkdir VNOP when available (lookup and
 * create in a single call, signalled via NAMEI_COMPOUNDMKDIR); otherwise
 * authorizes in VFS and creates via vn_create().
 *
 * Returns: 0 on success, EEXIST if the target already exists, or an errno
 * from nameiat()/vn_authorize_mkdir()/vn_create().
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	/* Hint to the lookup that the last component will be a directory. */
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	/* On success we hold iocounts on dvp and (if it exists) vp. */
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* "batched": the FS will do lookup + mkdir in one compound VNOP. */
	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP needs the lookup continued; re-drive namei. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
8858
8859 /*
8860 * mkdir_extended: Create a directory; with extended security (ACL).
8861 *
8862 * Parameters: p Process requesting to create the directory
8863 * uap User argument descriptor (see below)
8864 * retval (ignored)
8865 *
8866 * Indirect: uap->path Path of directory to create
8867 * uap->mode Access permissions to set
8868 * uap->xsecurity ACL to set
8869 *
8870 * Returns: 0 Success
8871 * !0 Not success
8872 *
8873 */
8874 int
8875 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
8876 {
8877 int ciferror;
8878 kauth_filesec_t xsecdst;
8879 struct vnode_attr va;
8880
8881 AUDIT_ARG(owner, uap->uid, uap->gid);
8882
8883 xsecdst = NULL;
8884 if ((uap->xsecurity != USER_ADDR_NULL) &&
8885 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
8886 return ciferror;
8887 }
8888
8889 VATTR_INIT(&va);
8890 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8891 if (xsecdst != NULL) {
8892 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
8893 }
8894
8895 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8896 UIO_USERSPACE);
8897 if (xsecdst != NULL) {
8898 kauth_filesec_free(xsecdst);
8899 }
8900 return ciferror;
8901 }
8902
8903 int
8904 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8905 {
8906 struct vnode_attr va;
8907
8908 VATTR_INIT(&va);
8909 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8910
8911 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8912 UIO_USERSPACE);
8913 }
8914
8915 int
8916 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8917 {
8918 struct vnode_attr va;
8919
8920 VATTR_INIT(&va);
8921 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8922
8923 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8924 UIO_USERSPACE);
8925 }
8926
/*
 * Remove the directory named by `dirpath`, resolved relative to `fd`
 * (AT_FDCWD for the CWD).  Common backend for rmdir(2) and for
 * unlinkat(2) with AT_REMOVEDIR.
 *
 * Handles compound-rmdir VNOPs, fsevent/kauth listener notification, the
 * VNODE_REMOVE_DATALESS_DIR fallback, and (CONFIG_APPLEDOUBLE) removal of
 * orphaned AppleDouble files followed by a retry.
 *
 * Returns: 0 on success or an errno (EBUSY for a mount root, EPERM for a
 * swap vnode outside the kernel context, ENOTEMPTY, etc.).
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
#if CONFIG_FSE
	struct vnode_attr va;
#endif /* CONFIG_FSE */
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		nd.ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(&nd, fd);
		if (error) {
			return error;
		}

		/* On success we hold iocounts on dvp and (if found) vp. */
		dvp = nd.ni_dvp;
		vp = nd.ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Userspace may not remove a directory backing swap. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup: redrive from the top (bounded). */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the FS must resolve it inside the compound VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo;

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Batched: ask the FS for notify attrs via vap later. */
				error = vfs_get_notify_attributes(&va);
				if (error) {
					goto out;
				}

				vap = &va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		/* Build the path strings only if someone will consume them. */
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
				if (path == NULL) {
					error = ENOMEM;
					goto out;
				}
			}

			len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
				if (no_firmlink_path == NULL) {
					error = ENOMEM;
					goto out;
				}
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

		error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
		/* vn_rmdir may have updated vp (compound case); keep nd in sync. */
		nd.ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR(). So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, &nd,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(&nd);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/* Wake any other thread waiting to retry this directory. */
			wakeup_one((caddr_t)vp);
			return error;
		}
		/* Briefly yield before redriving the lookup (AppleDouble race). */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

	return error;
}
9199
9200 /*
9201 * Remove a directory file.
9202 */
9203 /* ARGSUSED */
9204 int
9205 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9206 {
9207 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9208 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9209 }
9210
/*
 * Get direntry length padded to 8 byte alignment.  struct direntry declares
 * d_name as MAXPATHLEN bytes, so the actual record size for a name of
 * `namlen` characters subtracts the unused tail.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/* Get dirent length padded to 4 byte alignment (legacy struct dirent). */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen). */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
9222
9223 errno_t
9224 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9225 int *numdirent, vfs_context_t ctxp)
9226 {
9227 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9228 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9229 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9230 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9231 } else {
9232 size_t bufsize;
9233 void * bufptr;
9234 uio_t auio;
9235 struct direntry *entry64;
9236 struct dirent *dep;
9237 int bytesread;
9238 int error;
9239
9240 /*
9241 * We're here because the underlying file system does not
9242 * support direnties or we mounted denying support so we must
9243 * fall back to dirents and convert them to direntries.
9244 *
9245 * Our kernel buffer needs to be smaller since re-packing will
9246 * expand each dirent. The worse case (when the name length
9247 * is 3 or less) corresponds to a struct direntry size of 32
9248 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9249 * (4-byte aligned). So having a buffer that is 3/8 the size
9250 * will prevent us from reading more than we can pack.
9251 *
9252 * Since this buffer is wired memory, we will limit the
9253 * buffer size to a maximum of 32K. We would really like to
9254 * use 32K in the MIN(), but we use magic number 87371 to
9255 * prevent uio_resid() * 3 / 8 from overflowing.
9256 */
9257 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9258 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
9259 if (bufptr == NULL) {
9260 return ENOMEM;
9261 }
9262
9263 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9264 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9265 auio->uio_offset = uio->uio_offset;
9266
9267 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9268
9269 dep = (struct dirent *)bufptr;
9270 bytesread = bufsize - uio_resid(auio);
9271
9272 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
9273 M_TEMP, M_WAITOK);
9274 /*
9275 * Convert all the entries and copy them out to user's buffer.
9276 */
9277 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9278 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9279
9280 if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9281 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9282 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9283 vp->v_mount->mnt_vfsstat.f_mntonname,
9284 vp->v_name ? vp->v_name : "<unknown>");
9285 error = EIO;
9286 break;
9287 }
9288
9289 bzero(entry64, enbufsize);
9290 /* Convert a dirent to a dirent64. */
9291 entry64->d_ino = dep->d_ino;
9292 entry64->d_seekoff = 0;
9293 entry64->d_reclen = enbufsize;
9294 entry64->d_namlen = dep->d_namlen;
9295 entry64->d_type = dep->d_type;
9296 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9297
9298 /* Move to next entry. */
9299 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9300
9301 /* Copy entry64 to user's buffer. */
9302 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9303 }
9304
9305 /* Update the real offset using the offset we got from VNOP_READDIR. */
9306 if (error == 0) {
9307 uio->uio_offset = auio->uio_offset;
9308 }
9309 uio_free(auio);
9310 FREE(bufptr, M_TEMP);
9311 FREE(entry64, M_TEMP);
9312 return error;
9313 }
9314 }
9315
/* Upper bound on the user buffer size accepted by getdirentries_common(). */
#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9317
9318 /*
9319 * Read a block of directory entries in a file system independent format.
9320 */
/*
 * Read a block of directory entries in a file system independent format.
 *
 * Shared backend for getdirentries(2) and getdirentries64(2).  Reads from
 * the directory open on `fd` into the user buffer `bufp`/`bufsize`,
 * advancing the file's fg_offset.  When `flags` includes
 * VNODE_READDIR_EXTENDED, entries are delivered as struct direntry via
 * vnode_readdir64(); otherwise as legacy struct dirent via VNOP_READDIR().
 *
 * On success, *bytesread is the number of bytes produced, *offset (if
 * non-NULL) is the file offset before the read, and *eofflag reflects
 * end-of-directory.  Handles union mounts by falling through to the
 * covered directory when the upper layer yields no entries.
 *
 * Returns: 0 on success, EBADF if the fd is not open for reading, EINVAL
 * if the vnode is not a directory, or an errno from the VNOPs/MAC checks.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	char uio_buf[UIO_SIZEOF(1)];

	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

	/* Nothing was produced: possibly fall through a union mount layer. */
	if ((user_ssize_t)bufsize == uio_resid(auio)) {
		if (union_dircheckp) {
			error = union_dircheckp(&vp, fp, &context);
			if (error == -1) {
				/* union_dircheckp swapped vp; re-read from the lower layer. */
				goto unionread;
			}
			if (error) {
				(void)vnode_put(vp);
				goto out;
			}
		}

		if ((vp->v_mount->mnt_flag & MNT_UNION)) {
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, &context) == 0) {
				/* Retarget the open file at the covered vnode. */
				vnode_ref(vp);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0;
				vnode_rele(tvp);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	file_drop(fd);
	return error;
}
9426
9427
9428 int
9429 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9430 {
9431 off_t offset;
9432 ssize_t bytesread;
9433 int error, eofflag;
9434
9435 AUDIT_ARG(fd, uap->fd);
9436 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9437 &bytesread, &offset, &eofflag, 0);
9438
9439 if (error == 0) {
9440 if (proc_is64bit(p)) {
9441 user64_long_t base = (user64_long_t)offset;
9442 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9443 } else {
9444 user32_long_t base = (user32_long_t)offset;
9445 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9446 }
9447 *retval = bytesread;
9448 }
9449 return error;
9450 }
9451
9452 int
9453 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9454 {
9455 off_t offset;
9456 ssize_t bytesread;
9457 int error, eofflag;
9458 user_size_t bufsize;
9459
9460 AUDIT_ARG(fd, uap->fd);
9461
9462 /*
9463 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9464 * then the kernel carves out the last 4 bytes to return extended
9465 * information to userspace (namely whether we reached EOF with this call).
9466 */
9467 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9468 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9469 } else {
9470 bufsize = uap->bufsize;
9471 }
9472
9473 error = getdirentries_common(uap->fd, uap->buf, bufsize,
9474 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9475
9476 if (error == 0) {
9477 *retval = bytesread;
9478 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9479
9480 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9481 getdirentries64_flags_t flags = 0;
9482 if (eofflag) {
9483 flags |= GETDIRENTRIES64_EOF;
9484 }
9485 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9486 sizeof(flags));
9487 }
9488 }
9489 return error;
9490 }
9491
9492
9493 /*
9494 * Set the mode mask for creation of filesystem nodes.
9495 * XXX implement xsecurity
9496 */
9497 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9498 static int
9499 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9500 {
9501 struct filedesc *fdp;
9502
9503 AUDIT_ARG(mask, newmask);
9504 proc_fdlock(p);
9505 fdp = p->p_fd;
9506 *retval = fdp->fd_cmask;
9507 fdp->fd_cmask = newmask & ALLPERMS;
9508 proc_fdunlock(p);
9509 return 0;
9510 }
9511
9512 /*
9513 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9514 *
9515 * Parameters: p Process requesting to set the umask
9516 * uap User argument descriptor (see below)
9517 * retval umask of the process (parameter p)
9518 *
9519 * Indirect: uap->newmask umask to set
9520 * uap->xsecurity ACL to set
9521 *
9522 * Returns: 0 Success
9523 * !0 Not success
9524 *
9525 */
9526 int
9527 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9528 {
9529 int ciferror;
9530 kauth_filesec_t xsecdst;
9531
9532 xsecdst = KAUTH_FILESEC_NONE;
9533 if (uap->xsecurity != USER_ADDR_NULL) {
9534 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
9535 return ciferror;
9536 }
9537 } else {
9538 xsecdst = KAUTH_FILESEC_NONE;
9539 }
9540
9541 ciferror = umask1(p, uap->newmask, xsecdst, retval);
9542
9543 if (xsecdst != KAUTH_FILESEC_NONE) {
9544 kauth_filesec_free(xsecdst);
9545 }
9546 return ciferror;
9547 }
9548
9549 int
9550 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9551 {
9552 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9553 }
9554
9555 /*
9556 * Void all references to file by ripping underlying filesystem
9557 * away from vnode.
9558 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the path (following symlinks) to the target vnode. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) is only supported on character and block devices. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that currently backs a mount. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Only the owner of the node or the superuser may revoke access. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only revoke when someone actually holds the device open/aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
9612
9613
9614 /*
 * HFS/HFS PLUS SPECIFIC SYSTEM CALLS
9616 * The following system calls are designed to support features
9617 * which are specific to the HFS & HFS Plus volume formats
9618 */
9619
9620
9621 /*
9622 * Obtain attribute information on objects in a directory while enumerating
9623 * the directory.
9624 */
/* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	uint32_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	char uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's count so it can be restored per union layer. */
	savecount = count;
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}
	/* The descriptor must have been opened for reading. */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->f_fglob);
	if (error) {
		goto out;
	}
#endif


	/* Take an iocount on the vnode for the duration of the call. */
	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	/* Only directories can be enumerated. */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY. If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
				/* Re-point the open file at the lower directory. */
				vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0; // reset index for new dir
				count = savecount;
				vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	(void)vnode_put(vp);

	/* NOTE(review): error is always 0 at this point (checked above after
	 * VNOP_READDIRATTR); this test looks like defensive dead code. */
	if (error) {
		goto out;
	}
	fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy the updated count, directory state, and start offset out. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag; /* similar to getdirentries */
	error = 0;
out:
	file_drop(fd);
	return error; /* return error earlier, and retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
9770
9771 /*
9772 * Exchange data between two files
9773 */
9774
/* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first path. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second path. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs both read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Paths are needed only if someone is listening: either an fsevents
	 * watcher for FSE_EXCHANGE or a kauth fileop listener.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);
		if (fpath == NULL || spath == NULL) {
			error = ENOMEM;
			goto out;
		}

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The data swapped, so swap the cached identities too:
		 * exchange names and (if different) parents under the
		 * name-cache lock.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
9930
9931 /*
9932 * Return (in MB) the amount of freespace on the given vnode's volume.
9933 */
9934 uint32_t freespace_mb(vnode_t vp);
9935
9936 uint32_t
9937 freespace_mb(vnode_t vp)
9938 {
9939 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
9940 return ((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
9941 vp->v_mount->mnt_vfsstat.f_bsize) >> 20;
9942 }
9943
9944 #if CONFIG_SEARCHFS
9945
9946 /* ARGSUSED */
9947
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	int mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	char uio_buf[UIO_SIZEOF(1)];

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to do into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block. */
	/* */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
	/* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
	/* assumes the size is still 556 bytes it will continue to work */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		/* NOTE(review): only searchparams1 is inspected here although the
		 * comment above mentions both buffers -- confirm whether
		 * searchparams2 also needs this validation. */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		/* Step down to the covered (lower) vnode, swapping iocounts. */
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Alright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (u_long)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (u_long)uap->scriptcode,
	    (u_long)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state. Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	/* A filesystem error (including EAGAIN) is only reported if copyout worked. */
	error = fserror;

freeandexit:

	FREE(searchparams1, M_TEMP);

	return error;
} /* end of searchfs system call */
10226
10227 #else /* CONFIG_SEARCHFS */
10228
/* searchfs(2) stub for kernels built without CONFIG_SEARCHFS. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
10234
10235 #endif /* CONFIG_SEARCHFS */
10236
10237
10238 #if CONFIG_DATALESS_FILES
10239
10240 /*
10241 * === Namespace Resolver Up-call Mechanism ===
10242 *
10243 * When I/O is performed to a dataless file or directory (read, write,
10244 * lookup-in, etc.), the file system performs an upcall to the namespace
10245 * resolver (filecoordinationd) to materialize the object.
10246 *
10247 * We need multiple up-calls to be in flight at once, and we need these
10248 * up-calls to be interruptible, thus the following implementation:
10249 *
10250 * => The nspace_resolver_request represents the in-kernel request state.
10251 * It contains a request ID, storage space for the errno code returned
10252 * by filecoordinationd, and flags.
10253 *
10254 * => The request ID is simply a global monotonically incrementing 32-bit
10255 * number. Outstanding requests are stored in a hash table, and the
10256 * hash function is extremely simple.
10257 *
10258 * => When an upcall is to be made to filecoordinationd, a request structure
10259 * is allocated on the stack (it is small, and needs to live only during
10260 * the duration of the call to resolve_nspace_item_ext()). It is
10261 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
10263 * can be inserted into the table (and thus limiting the number of
10264 * outstanding requests issued to filecoordinationd); waiting for an
10265 * available slot is interruptible.
10266 *
10267 * => Once the request has been inserted into the table, the up-call is made
10268 * to filecoordinationd via a MiG-generated stub. The up-call returns
10269 * immediately and filecoordinationd processes the request asynchronously.
10270 *
 * => The caller now waits for the request to complete.  This is achieved by
10272 * sleeping on the address of the request structure and waiting for
10273 * filecoordinationd to mark the request structure as complete. This
10274 * is an interruptible sleep call; if interrupted, the request structure
10275 * is removed from the table and EINTR is returned to the caller. If
10276 * this occurs, an advisory up-call is made to filecoordinationd with
10277 * the request ID to indicate that the request can be aborted or
10278 * de-prioritized at the discretion of filecoordinationd.
10279 *
10280 * => When filecoordinationd has completed the request, it signals completion
10281 * by writing to the vfs.nspace.complete sysctl node. Only a process
10282 * decorated as a namespace resolver can write to this sysctl node. The
10283 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10284 * The request ID is looked up in the table, and if the request is found,
10285 * the error code is stored in the request structure and a wakeup()
10286 * issued on the address of the request structure. If the request is not
10287 * found, we simply drop the completion notification, assuming that the
10288 * caller was interrupted.
10289 *
10290 * => When the waiting thread wakes up, it extracts the error code from the
10291 * request structure, removes the request from the table, and returns the
10292 * error code to the calling function. Fini!
10293 */
10294
/*
 * In-kernel state for one outstanding up-call to the namespace resolver.
 * See the block comment above for the full request life cycle.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* request-table hash chain */
	uint32_t r_req_id;      /* unique ID (see next_nspace_req_id) */
	int r_resolver_error;   /* errno reported by filecoordinationd */
	int r_flags;            /* RRF_* flags below */
};

#define RRF_COMPLETE 0x0001 /* resolver has completed this request */
10303
/*
 * Return the next namespace-resolver request ID: a global,
 * atomically incremented 32-bit counter (wraps at UINT32_MAX).
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
10311
10312 #define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
10313 #define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */
10314
10315 static LIST_HEAD(nspace_resolver_requesthead,
10316 nspace_resolver_request) * nspace_resolver_request_hashtbl;
10317 static u_long nspace_resolver_request_hashmask;
10318 static u_int nspace_resolver_request_count;
10319 static bool nspace_resolver_request_wait_slot;
10320 static lck_grp_t *nspace_resolver_request_lck_grp;
10321 static lck_mtx_t nspace_resolver_request_hash_mutex;
10322
10323 #define NSPACE_REQ_LOCK() \
10324 lck_mtx_lock(&nspace_resolver_request_hash_mutex)
10325 #define NSPACE_REQ_UNLOCK() \
10326 lck_mtx_unlock(&nspace_resolver_request_hash_mutex)
10327
10328 #define NSPACE_RESOLVER_HASH(req_id) \
10329 (&nspace_resolver_request_hashtbl[(req_id) & \
10330 nspace_resolver_request_hashmask])
10331
10332 static struct nspace_resolver_request *
10333 nspace_resolver_req_lookup(uint32_t req_id)
10334 {
10335 struct nspace_resolver_requesthead *bucket;
10336 struct nspace_resolver_request *req;
10337
10338 bucket = NSPACE_RESOLVER_HASH(req_id);
10339 LIST_FOREACH(req, bucket, r_hashlink) {
10340 if (req->r_req_id == req_id) {
10341 return req;
10342 }
10343 }
10344
10345 return NULL;
10346 }
10347
/*
 * Insert req into the outstanding-request table, waiting (interruptibly)
 * for a free slot if the table already holds
 * NSPACE_RESOLVER_MAX_OUTSTANDING entries.  The table lock is held on
 * entry and is dropped/reacquired by msleep() while waiting.
 *
 * Returns 0 on success or the msleep() error if interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	/*
	 * Backpressure on filecoordinationd: cap the number of in-flight
	 * requests.  PCATCH makes the wait interruptible by signals.
	 */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
10374
/*
 * Remove req from the outstanding-request table and, if a thread is
 * waiting for a free slot in nspace_resolver_req_add(), wake it.
 * Called with the request-table lock held (see nspace_resolver_req_wait).
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* Hand the freed slot to a waiter in nspace_resolver_req_add(). */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}
}
10392
10393 static void
10394 nspace_resolver_req_cancel(uint32_t req_id)
10395 {
10396 kern_return_t kr;
10397 mach_port_t mp;
10398
10399 // Failures here aren't fatal -- the cancellation message
10400 // sent to the resolver is merely advisory.
10401
10402 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10403 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10404 return;
10405 }
10406
10407 kr = send_nspace_resolve_cancel(mp, req_id);
10408 if (kr != KERN_SUCCESS) {
10409 os_log_error(OS_LOG_DEFAULT,
10410 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10411 }
10412
10413 ipc_port_release_send(mp);
10414 }
10415
/*
 * Wait (interruptibly) for req to be marked complete by the resolver.
 *
 * If the sleep is interrupted, the request's error is forced to
 * EINTR (or ETIMEDOUT for other non-restart errors) and an advisory
 * cancel message is sent to filecoordinationd.  The request is always
 * removed from the table before returning.
 *
 * Returns the resolver-reported errno (0 on success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: record why and remember to cancel. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* Advisory only; sent outside the lock. */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10445
10446 static void
10447 nspace_resolver_req_mark_complete(
10448 struct nspace_resolver_request *req,
10449 int resolver_error)
10450 {
10451 req->r_resolver_error = resolver_error;
10452 req->r_flags |= RRF_COMPLETE;
10453 wakeup(req);
10454 }
10455
10456 static void
10457 nspace_resolver_req_completed(uint32_t req_id, int resolver_error)
10458 {
10459 struct nspace_resolver_request *req;
10460
10461 NSPACE_REQ_LOCK();
10462
10463 // If we don't find the request corresponding to our req_id,
10464 // just drop the completion signal on the floor; it's likely
10465 // that the requester interrupted with a signal.
10466
10467 req = nspace_resolver_req_lookup(req_id);
10468 if (req) {
10469 nspace_resolver_req_mark_complete(req, resolver_error);
10470 }
10471
10472 NSPACE_REQ_UNLOCK();
10473 }
10474
10475 static struct proc *nspace_resolver_proc;
10476
10477 static int
10478 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
10479 {
10480 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10481 p == nspace_resolver_proc) ? 1 : 0;
10482 return 0;
10483 }
10484
10485 static int
10486 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
10487 {
10488 vfs_context_t ctx = vfs_context_current();
10489 int error = 0;
10490
10491 //
10492 // The system filecoordinationd runs as uid == 0. This also
10493 // has the nice side-effect of filtering out filecoordinationd
10494 // running in the simulator.
10495 //
10496 if (!vfs_context_issuser(ctx)) {
10497 return EPERM;
10498 }
10499
10500 error = priv_check_cred(vfs_context_ucred(ctx),
10501 PRIV_VFS_DATALESS_RESOLVER, 0);
10502 if (error) {
10503 return error;
10504 }
10505
10506 if (is_resolver) {
10507 NSPACE_REQ_LOCK();
10508
10509 if (nspace_resolver_proc == NULL) {
10510 proc_lock(p);
10511 p->p_lflag |= P_LNSPACE_RESOLVER;
10512 proc_unlock(p);
10513 nspace_resolver_proc = p;
10514 } else {
10515 error = EBUSY;
10516 }
10517
10518 NSPACE_REQ_UNLOCK();
10519 } else {
10520 // This is basically just like the exit case.
10521 // nspace_resolver_exited() will verify that the
10522 // process is the resolver, and will clear the
10523 // global.
10524 nspace_resolver_exited(p);
10525 }
10526
10527 return error;
10528 }
10529
10530 static int
10531 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10532 {
10533 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10534 (p->p_vfs_iopolicy &
10535 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10536 *is_prevented = 1;
10537 } else {
10538 *is_prevented = 0;
10539 }
10540 return 0;
10541 }
10542
/*
 * Set the per-process dataless-materialization policy bit.  The
 * resolver process itself must always remain prevented: asking to
 * prevent it is a no-op success, asking to allow it returns EBUSY.
 * Always returns 0 for ordinary processes.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	/* Atomically flip the iopolicy bit; no proc lock needed. */
	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
10557
10558 static int
10559 nspace_materialization_get_thread_state(int *is_prevented)
10560 {
10561 uthread_t ut = get_bsdthread_info(current_thread());
10562
10563 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10564 return 0;
10565 }
10566
10567 static int
10568 nspace_materialization_set_thread_state(int is_prevented)
10569 {
10570 uthread_t ut = get_bsdthread_info(current_thread());
10571
10572 if (is_prevented) {
10573 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10574 } else {
10575 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10576 }
10577 return 0;
10578 }
10579
/*
 * Decide whether the current context may materialize a dataless file.
 * Returns 0 to allow materialization, EDEADLK to deny it, or
 * EJUSTRETURN when the caller holds the dataless-manipulation
 * entitlement and should proceed as if the object were not dataless.
 * The order of the checks below is significant.
 */
static int
nspace_materialization_is_prevented(void)
{
	proc_t p = current_proc();
	uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
	vfs_context_t ctx = vfs_context_current();

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
10633
10634 /* the vfs.nspace branch */
10635 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10636
10637 static int
10638 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10639 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10640 {
10641 struct proc *p = req->p;
10642 int new_value, old_value, changed = 0;
10643 int error;
10644
10645 error = nspace_resolver_get_proc_state(p, &old_value);
10646 if (error) {
10647 return error;
10648 }
10649
10650 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10651 &changed);
10652 if (error == 0 && changed) {
10653 error = nspace_resolver_set_proc_state(p, new_value);
10654 }
10655 return error;
10656 }
10657
10658 /* decorate this process as the dataless file resolver */
10659 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
10660 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10661 0, 0, sysctl_nspace_resolver, "I", "");
10662
10663 static int
10664 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
10665 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10666 {
10667 struct proc *p = req->p;
10668 int new_value, old_value, changed = 0;
10669 int error;
10670
10671 error = nspace_materialization_get_proc_state(p, &old_value);
10672 if (error) {
10673 return error;
10674 }
10675
10676 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10677 &changed);
10678 if (error == 0 && changed) {
10679 error = nspace_materialization_set_proc_state(p, new_value);
10680 }
10681 return error;
10682 }
10683
10684 /* decorate this process as not wanting to materialize dataless files */
10685 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
10686 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10687 0, 0, sysctl_nspace_prevent_materialization, "I", "");
10688
10689 static int
10690 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
10691 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10692 {
10693 int new_value, old_value, changed = 0;
10694 int error;
10695
10696 error = nspace_materialization_get_thread_state(&old_value);
10697 if (error) {
10698 return error;
10699 }
10700
10701 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10702 &changed);
10703 if (error == 0 && changed) {
10704 error = nspace_materialization_set_thread_state(new_value);
10705 }
10706 return error;
10707 }
10708
10709 /* decorate this thread as not wanting to materialize dataless files */
10710 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
10711 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10712 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
10713
10714 static int
10715 sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
10716 __unused int arg2, struct sysctl_req *req)
10717 {
10718 struct proc *p = req->p;
10719 uint32_t req_status[2] = { 0, 0 };
10720 int error, is_resolver, changed = 0;
10721
10722 error = nspace_resolver_get_proc_state(p, &is_resolver);
10723 if (error) {
10724 return error;
10725 }
10726
10727 if (!is_resolver) {
10728 return EPERM;
10729 }
10730
10731 error = sysctl_io_opaque(req, req_status, sizeof(req_status),
10732 &changed);
10733 if (error) {
10734 return error;
10735 }
10736
10737 /*
10738 * req_status[0] is the req_id
10739 *
10740 * req_status[1] is the errno
10741 */
10742 if (error == 0 && changed) {
10743 nspace_resolver_req_completed(req_status[0],
10744 (int)req_status[1]);
10745 }
10746 return error;
10747 }
10748
10749 /* Resolver reports completed reqs here. */
10750 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
10751 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10752 0, 0, sysctl_nspace_complete, "-", "");
10753
10754 #endif /* CONFIG_DATALESS_FILES */
10755
10756 #if CONFIG_DATALESS_FILES
10757 #define __no_dataless_unused /* nothing */
10758 #else
10759 #define __no_dataless_unused __unused
10760 #endif
10761
/*
 * One-time initialization of the dataless-file resolver machinery:
 * sets up the lock group, the request-hash mutex, and the pending-
 * request hash table.  Compiled to a no-op when CONFIG_DATALESS_FILES
 * is disabled.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_lck_grp =
	    lck_grp_alloc_init("file namespace resolver", NULL);

	lck_mtx_init(&nspace_resolver_request_hash_mutex,
	    nspace_resolver_request_lck_grp, NULL);

	/* M_VNODE is borrowed as the malloc tag here (see XXX). */
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
10777
10778 void
10779 nspace_resolver_exited(struct proc *p __no_dataless_unused)
10780 {
10781 #if CONFIG_DATALESS_FILES
10782 struct nspace_resolver_requesthead *bucket;
10783 struct nspace_resolver_request *req;
10784 u_long idx;
10785
10786 NSPACE_REQ_LOCK();
10787
10788 if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10789 p == nspace_resolver_proc) {
10790 for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
10791 bucket = &nspace_resolver_request_hashtbl[idx];
10792 LIST_FOREACH(req, bucket, r_hashlink) {
10793 nspace_resolver_req_mark_complete(req,
10794 ETIMEDOUT);
10795 }
10796 }
10797 nspace_resolver_proc = NULL;
10798 }
10799
10800 NSPACE_REQ_UNLOCK();
10801 #endif /* CONFIG_DATALESS_FILES */
10802 }
10803
/*
 * Resolve a namespace (dataless file) event on vp; convenience wrapper
 * around resolve_nspace_item_ext() with no extra argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
10809
10810 #define DATALESS_RESOLVER_ENTITLEMENT \
10811 "com.apple.private.vfs.dataless-resolver"
10812 #define DATALESS_MANIPULATION_ENTITLEMENT \
10813 "com.apple.private.vfs.dataless-manipulation"
10814
10815 /*
10816 * Return TRUE if the vfs context is associated with a process entitled
10817 * for dataless manipulation.
10818 *
10819 * XXX Arguably belongs in vfs_subr.c, but is here because of the
10820 * complication around CONFIG_DATALESS_FILES.
10821 */
10822 boolean_t
10823 vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
10824 {
10825 #if CONFIG_DATALESS_FILES
10826 assert(ctx->vc_thread == current_thread());
10827 task_t const task = current_task();
10828 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
10829 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
10830 #else
10831 return false;
10832 #endif /* CONFIG_DATALESS_FILES */
10833 }
10834
/*
 * Resolve (materialize) a dataless namespace item: look up the vnode's
 * path, submit a resolve request to filecoordinationd over its Mach
 * port, and wait interruptibly for the resolver to report completion.
 * Returns 0 on success, EFTYPE for unsupported vnode types, ENOTSUP
 * for snapshot events (and when CONFIG_DATALESS_FILES is off), EDEADLK/
 * EJUSTRETURN per materialization policy, ETIMEDOUT when the resolver
 * cannot be reached, or the resolver's own error code.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	// Policy gate: 0 allows, EDEADLK/EJUSTRETURN deny (see
	// nspace_materialization_is_prevented()).
	error = nspace_materialization_is_prevented();
	if (error) {
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process/thread is decorated as no-materialization");
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
	if (path == NULL) {
		error = ENOMEM;
		goto out_release_port;
	}
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		// Dummy out-parameter required to work around a MIG bug.
		int xxx_rdar44371223; /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		// Register the request so the completion sysctl can find it.
		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    current_proc()->p_pid, (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);
	}

out_release_port:
	if (path != NULL) {
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
10946
/*
 * Snapshot-event hook; currently a no-op that always reports success.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
10953
#if 0
/*
 * Build a volfs-style "/.vol/<fsid>/<fileid>" path for vp into 'path'
 * (capacity and resulting length in *len).  Returns 0 on success, -1
 * if the vnode attributes could not be fetched (in which case a
 * sentinel path is written).  NOTE: compiled out (#if 0); retained for
 * reference only.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
10976
10977 static unsigned long
10978 fsctl_bogus_command_compat(unsigned long cmd)
10979 {
10980 switch (cmd) {
10981 case IOCBASECMD(FSIOC_SYNC_VOLUME):
10982 return FSIOC_SYNC_VOLUME;
10983 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
10984 return FSIOC_ROUTEFS_SETROUTEID;
10985 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
10986 return FSIOC_SET_PACKAGE_EXTS;
10987 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
10988 return FSIOC_SET_FSTYPENAME_OVERRIDE;
10989 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
10990 return DISK_CONDITIONER_IOC_GET;
10991 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
10992 return DISK_CONDITIONER_IOC_SET;
10993 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
10994 return FSIOC_FIOSEEKHOLE;
10995 case IOCBASECMD(FSIOC_FIOSEEKDATA):
10996 return FSIOC_FIOSEEKDATA;
10997 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
10998 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
10999 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
11000 return SPOTLIGHT_IOC_GET_LAST_MTIME;
11001 }
11002
11003 return cmd;
11004 }
11005
/*
 * Setattr callback passed to chflags0() for FSIOC_CAS_BSDFLAGS:
 * forwards the compare-and-swap request (arg points at the
 * fsioc_cas_bsdflags structure) to the filesystem via VNOP_IOCTL.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
11011
11012 /*
11013 * Make a filesystem-specific control call:
11014 */
11015 /* ARGSUSED */
/*
 * Common implementation of the fsctl()/ffsctl() system calls: marshal
 * the user's ioctl-style argument into a kernel buffer, dispatch a set
 * of generic commands inline, and pass anything else down to the
 * filesystem via VNOP_IOCTL.  On FSIOC_SYNC_VOLUME the vnode's iocount
 * may be dropped, in which case *arg_vp is set to NULL so the caller
 * knows not to vnode_put() it again.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not for devices. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Large arguments are heap-allocated; small ones use the stack. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc(size)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	/* Marshal the argument per the ioctl direction bits. */
	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-size IOC_IN: the "argument" is the pointer value itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME: {
		struct vfs_attr vfa;
		mount_t mp = vp->v_mount;
		unsigned arg;


		/* record vid of vp so we can drop it below. */
		uint32_t vvid = vp->v_id;

		/*
		 * Then grab mount_iterref so that we can release the vnode.
		 * Without this, a thread may call vnode_iterate_prepare then
		 * get into a deadlock because we've never released the root vp
		 */
		error = mount_iterref(mp, 0);
		if (error) {
			break;
		}
		vnode_put(vp);

		arg = MNT_NOWAIT;
		if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
			arg = MNT_WAIT;
		}

		/*
		 * If the filesystem supports multiple filesystems in a
		 * partition (e.g. APFS volumes in a container), it knows
		 * that the waitfor argument to VFS_SYNC are flags.
		 */
		VFSATTR_INIT(&vfa);
		VFSATTR_WANTED(&vfa, f_capabilities);
		if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
		    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
		    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
		    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
			arg |= MNT_VOLUME;
		}

		/* issue the sync for this volume */
		(void)sync_callback(mp, &arg);

		/*
		 * Then release the mount_iterref once we're done syncing; it's not
		 * needed for the VNOP_IOCTL below
		 */
		mount_iterdrop(mp);

		/*
		 * NOTE(review): this tests 'arg', which at this point holds
		 * MNT_* sync flags, against the user-facing FSCTL_SYNC_FULLSYNC
		 * bit rather than re-reading the caller's request word from
		 * 'data' -- confirm the bit overlap is intentional.
		 */
		if (arg & FSCTL_SYNC_FULLSYNC) {
			/* re-obtain vnode iocount on the root vp, if possible */
			error = vnode_getwithvid(vp, vvid);
			if (error == 0) {
				error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
				vnode_put(vp);
			}
		}
		/* mark the argument VP as having been released */
		*arg_vp = NULL;
	}
	break;

	case FSIOC_ROUTEFS_SETROUTEID: {
#if ROUTEFS
		char routepath[MAXPATHLEN];
		size_t len = 0;

		/* Root only. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		bzero(routepath, MAXPATHLEN);
		error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
		if (error) {
			break;
		}
		error = routefs_kernel_mount(routepath);
		if (error) {
			break;
		}
#endif
	}
	break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			ext_strings = ((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* Root only. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				/* Non-empty name: install the override. */
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				/* "mtmfs" read-only mounts additionally get extended-security handling. */
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* Empty name: remove any existing override. */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS: {
		struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
		struct vnode_attr va;

		VATTR_INIT(&va);
		VATTR_SET(&va, va_flags, cas->new_flags);

		/* chflags0() performs permission checks, then calls our CAS ioctl helper. */
		error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
	}
	break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* Succeeds only when this fd holds the sole usecount on the vnode. */
		if (vnode_usecount(vp) > 1) {
			error = EBUSY;
		} else {
			error = 0;
		}
	}
	break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree(memp, size);
	}

	return error;
}
11301
11302 /* ARGSUSED */
11303 int
11304 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
11305 {
11306 int error;
11307 struct nameidata nd;
11308 u_long nameiflags;
11309 vnode_t vp = NULL;
11310 vfs_context_t ctx = vfs_context_current();
11311
11312 AUDIT_ARG(cmd, uap->cmd);
11313 AUDIT_ARG(value32, uap->options);
11314 /* Get the vnode for the file we are getting info on: */
11315 nameiflags = 0;
11316 //
11317 // if we come through fsctl() then the file is by definition not open.
11318 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
11319 // lest the caller mistakenly thinks the only open is their own (but in
11320 // reality it's someone elses).
11321 //
11322 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
11323 return EINVAL;
11324 }
11325 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11326 nameiflags |= FOLLOW;
11327 }
11328 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
11329 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
11330 }
11331 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
11332 UIO_USERSPACE, uap->path, ctx);
11333 if ((error = namei(&nd))) {
11334 goto done;
11335 }
11336 vp = nd.ni_vp;
11337 nameidone(&nd);
11338
11339 #if CONFIG_MACF
11340 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
11341 if (error) {
11342 goto done;
11343 }
11344 #endif
11345
11346 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11347
11348 done:
11349 if (vp) {
11350 vnode_put(vp);
11351 }
11352 return error;
11353 }
11354 /* ARGSUSED */
11355 int
11356 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
11357 {
11358 int error;
11359 vnode_t vp = NULL;
11360 vfs_context_t ctx = vfs_context_current();
11361 int fd = -1;
11362
11363 AUDIT_ARG(fd, uap->fd);
11364 AUDIT_ARG(cmd, uap->cmd);
11365 AUDIT_ARG(value32, uap->options);
11366
11367 /* Get the vnode for the file we are getting info on: */
11368 if ((error = file_vnode(uap->fd, &vp))) {
11369 return error;
11370 }
11371 fd = uap->fd;
11372 if ((error = vnode_getwithref(vp))) {
11373 file_drop(fd);
11374 return error;
11375 }
11376
11377 #if CONFIG_MACF
11378 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
11379 file_drop(fd);
11380 vnode_put(vp);
11381 return error;
11382 }
11383 #endif
11384
11385 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11386
11387 file_drop(fd);
11388
11389 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
11390 if (vp) {
11391 vnode_put(vp);
11392 }
11393
11394 return error;
11395 }
11396 /* end of fsctl system call */
11397
11398 /*
11399 * Retrieve the data of an extended attribute.
11400 */
/*
 * getxattr(2): retrieve the data of an extended attribute by path.
 * With a NULL value buffer (or the size == -1 compatibility hack
 * below), only the attribute's size is returned via *retval.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];

	/* These options are kernel-internal; reject them from userspace. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes: only root may read com.apple.system.Security. */
	if (xattr_protected(attrname)) {
		if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
			error = EPERM;
			goto out;
		}
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp to XATTR_MAXSIZE to bound the kernel-wired allocation. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a uio, return bytes transferred; otherwise the attribute size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
11485
/*
 * Retrieve the data of an extended attribute.
 *
 * fd-based companion to getxattr(): operates on an already-open file
 * instead of a path.  On success *retval is the number of bytes copied
 * into uap->value, or (when uap->value is NULL / uap->size is 0) the
 * attribute's total size as reported by vn_getxattr() — the size-probe
 * mode callers use to dimension a buffer.
 */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	char uio_buf[UIO_SIZEOF(1)];    /* stack storage for the single-iovec uio */

	/*
	 * XATTR_NOFOLLOW is meaningless once the file is already open, and
	 * the NOSECURITY/NODEFAULT namespace options are kernel-internal.
	 */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode; released via vnode_put() at "out". */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	if (xattr_protected(attrname)) {
		error = EPERM;
		goto out;
	}
	/*
	 * Only build a uio when the caller supplied a buffer; with a NULL
	 * auio, vn_getxattr() just reports the attribute size via attrsize.
	 */
	if (uap->value && uap->size > 0) {
		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	if (auio) {
		/* Bytes actually transferred into the caller's buffer. */
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
11538
/*
 * Set the data of an extended attribute.
 *
 * Resolves uap->path (following symlinks unless XATTR_NOFOLLOW is set),
 * copies in the attribute name, and writes uap->size bytes from
 * uap->value.  On success a FSE_XATTR_MODIFIED fsevent is posted when
 * file-system events are configured.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];    /* stack storage for the single-iovec uio */

	/* The security/default namespace options are kernel-internal only. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	/* A non-zero size with no source buffer is nonsensical. */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
11600
11601 /*
11602 * Set the data of an extended attribute.
11603 */
11604 int
11605 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
11606 {
11607 vnode_t vp;
11608 char attrname[XATTR_MAXNAMELEN + 1];
11609 uio_t auio = NULL;
11610 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11611 size_t namelen;
11612 int error;
11613 char uio_buf[UIO_SIZEOF(1)];
11614 #if CONFIG_FSE
11615 vfs_context_t ctx = vfs_context_current();
11616 #endif
11617
11618 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11619 return EINVAL;
11620 }
11621
11622 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11623 if (error != 0) {
11624 if (error == EPERM) {
11625 /* if the string won't fit in attrname, copyinstr emits EPERM */
11626 return ENAMETOOLONG;
11627 }
11628 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11629 return error;
11630 }
11631 if (xattr_protected(attrname)) {
11632 return EPERM;
11633 }
11634 if (uap->size != 0 && uap->value == 0) {
11635 return EINVAL;
11636 }
11637 if ((error = file_vnode(uap->fd, &vp))) {
11638 return error;
11639 }
11640 if ((error = vnode_getwithref(vp))) {
11641 file_drop(uap->fd);
11642 return error;
11643 }
11644 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11645 &uio_buf[0], sizeof(uio_buf));
11646 uio_addiov(auio, uap->value, uap->size);
11647
11648 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
11649 #if CONFIG_FSE
11650 if (error == 0) {
11651 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11652 FSE_ARG_VNODE, vp,
11653 FSE_ARG_DONE);
11654 }
11655 #endif
11656 vnode_put(vp);
11657 file_drop(uap->fd);
11658 *retval = 0;
11659 return error;
11660 }
11661
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * Resolves uap->path (following symlinks unless XATTR_NOFOLLOW), copies
 * in the attribute name, and deletes the attribute via vn_removexattr().
 * On success a FSE_XATTR_REMOVED fsevent is posted when file-system
 * events are configured.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* The security/default namespace options are kernel-internal only. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* System-protected attributes may not be removed from user space. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
11709
11710 /*
11711 * Remove an extended attribute.
11712 * XXX Code duplication here.
11713 */
11714 int
11715 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
11716 {
11717 vnode_t vp;
11718 char attrname[XATTR_MAXNAMELEN + 1];
11719 size_t namelen;
11720 int error;
11721 #if CONFIG_FSE
11722 vfs_context_t ctx = vfs_context_current();
11723 #endif
11724
11725 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11726 return EINVAL;
11727 }
11728
11729 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11730 if (error != 0) {
11731 return error;
11732 }
11733 if (xattr_protected(attrname)) {
11734 return EPERM;
11735 }
11736 if ((error = file_vnode(uap->fd, &vp))) {
11737 return error;
11738 }
11739 if ((error = vnode_getwithref(vp))) {
11740 file_drop(uap->fd);
11741 return error;
11742 }
11743
11744 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
11745 #if CONFIG_FSE
11746 if (error == 0) {
11747 add_fsevent(FSE_XATTR_REMOVED, ctx,
11748 FSE_ARG_VNODE, vp,
11749 FSE_ARG_DONE);
11750 }
11751 #endif
11752 vnode_put(vp);
11753 file_drop(uap->fd);
11754 *retval = 0;
11755 return error;
11756 }
11757
/*
 * Retrieve the list of extended attribute names.
 * XXX Code duplication here.
 *
 * Resolves uap->path and fills uap->namebuf with the NUL-separated list
 * of attribute names.  When no buffer is supplied, *retval is instead
 * the total size needed (the size-probe mode).
 */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];    /* stack storage for the single-iovec uio */

	/* The security/default namespace options are kernel-internal only. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);
	/*
	 * Only build a uio when the caller supplied a buffer; with a NULL
	 * auio, vn_listxattr() just reports the required size via attrsize.
	 */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		/* Bytes actually transferred into the caller's buffer. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
11802
11803 /*
11804 * Retrieve the list of extended attribute names.
11805 * XXX Code duplication here.
11806 */
11807 int
11808 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
11809 {
11810 vnode_t vp;
11811 uio_t auio = NULL;
11812 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11813 size_t attrsize = 0;
11814 int error;
11815 char uio_buf[UIO_SIZEOF(1)];
11816
11817 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11818 return EINVAL;
11819 }
11820
11821 if ((error = file_vnode(uap->fd, &vp))) {
11822 return error;
11823 }
11824 if ((error = vnode_getwithref(vp))) {
11825 file_drop(uap->fd);
11826 return error;
11827 }
11828 if (uap->namebuf != 0 && uap->bufsize > 0) {
11829 auio = uio_createwithbuffer(1, 0, spacetype,
11830 UIO_READ, &uio_buf[0], sizeof(uio_buf));
11831 uio_addiov(auio, uap->namebuf, uap->bufsize);
11832 }
11833
11834 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
11835
11836 vnode_put(vp);
11837 file_drop(uap->fd);
11838 if (auio) {
11839 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11840 } else {
11841 *retval = (user_ssize_t)attrsize;
11842 }
11843 return error;
11844 }
11845
11846 static int
11847 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
11848 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
11849 {
11850 int error;
11851 struct mount *mp = NULL;
11852 vnode_t vp;
11853 int length;
11854 int bpflags;
11855 /* maximum number of times to retry build_path */
11856 unsigned int retries = 0x10;
11857
11858 if (bufsize > PAGE_SIZE) {
11859 return EINVAL;
11860 }
11861
11862 if (buf == NULL) {
11863 return ENOMEM;
11864 }
11865
11866 retry:
11867 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11868 error = ENOTSUP; /* unexpected failure */
11869 return ENOTSUP;
11870 }
11871
11872 unionget:
11873 if (objid == 2) {
11874 struct vfs_attr vfsattr;
11875 int use_vfs_root = TRUE;
11876
11877 VFSATTR_INIT(&vfsattr);
11878 VFSATTR_WANTED(&vfsattr, f_capabilities);
11879 if (!(options & FSOPT_ISREALFSID) &&
11880 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
11881 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
11882 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
11883 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
11884 use_vfs_root = FALSE;
11885 }
11886 }
11887
11888 if (use_vfs_root) {
11889 error = VFS_ROOT(mp, &vp, ctx);
11890 } else {
11891 error = VFS_VGET(mp, objid, &vp, ctx);
11892 }
11893 } else {
11894 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11895 }
11896
11897 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11898 /*
11899 * If the fileid isn't found and we're in a union
11900 * mount volume, then see if the fileid is in the
11901 * mounted-on volume.
11902 */
11903 struct mount *tmp = mp;
11904 mp = vnode_mount(tmp->mnt_vnodecovered);
11905 vfs_unbusy(tmp);
11906 if (vfs_busy(mp, LK_NOWAIT) == 0) {
11907 goto unionget;
11908 }
11909 } else {
11910 vfs_unbusy(mp);
11911 }
11912
11913 if (error) {
11914 return error;
11915 }
11916
11917 #if CONFIG_MACF
11918 error = mac_vnode_check_fsgetpath(ctx, vp);
11919 if (error) {
11920 vnode_put(vp);
11921 return error;
11922 }
11923 #endif
11924
11925 /* Obtain the absolute path to this vnode. */
11926 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11927 if (options & FSOPT_NOFIRMLINKPATH) {
11928 bpflags |= BUILDPATH_NO_FIRMLINK;
11929 }
11930 bpflags |= BUILDPATH_CHECK_MOVED;
11931 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11932 vnode_put(vp);
11933
11934 if (error) {
11935 /* there was a race building the path, try a few more times */
11936 if (error == EAGAIN) {
11937 --retries;
11938 if (retries > 0) {
11939 goto retry;
11940 }
11941
11942 error = ENOENT;
11943 }
11944 goto out;
11945 }
11946
11947 AUDIT_ARG(text, buf);
11948
11949 if (kdebug_enable) {
11950 long dbg_parms[NUMPARMS];
11951 int dbg_namelen;
11952
11953 dbg_namelen = (int)sizeof(dbg_parms);
11954
11955 if (length < dbg_namelen) {
11956 memcpy((char *)dbg_parms, buf, length);
11957 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11958
11959 dbg_namelen = length;
11960 } else {
11961 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11962 }
11963
11964 kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
11965 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
11966 }
11967
11968 *pathlen = (user_ssize_t)length; /* may be superseded by error */
11969
11970 out:
11971 return error;
11972 }
11973
/*
 * Obtain the full pathname of a file system object by id.
 *
 * Common implementation behind fsgetpath() and fsgetpath_ext().  Copies
 * the fsid in from user space, resolves the path into a temporary
 * kernel buffer via fsgetpath_internal(), and copies the result back to
 * the caller.  On success *retval is the path length.
 */
static int
fsgetpath_extended(user_addr_t buf, int bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* Only the firmlink and real-fsid options are recognized. */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	if (bufsize > PAGE_SIZE || bufsize <= 0) {
		return EINVAL;
	}
	/* M_ZERO so a short path never exposes stale kernel heap to copyout. */
	MALLOC(realpath, char *, bufsize, M_TEMP, M_WAITOK | M_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	if (realpath) {
		FREE(realpath, M_TEMP);
	}
	return error;
}
12022
12023 int
12024 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
12025 {
12026 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
12027 0, retval);
12028 }
12029
12030 int
12031 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
12032 {
12033 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
12034 uap->options, retval);
12035 }
12036
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Fills a user32_statfs or user64_statfs image from the in-kernel
 * vfsstatfs and copies it out to bufp.  With partial_copy set, the
 * trailing reserved fields are omitted from the copyout (legacy
 * callers' buffers end before them).  *sizep, when non-NULL, receives
 * the full (non-partial) structure size.
 *
 * Returns: 0 Success
 * EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* Some mounts present a synthetic fs type name (e.g. for compatibility). */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
12165
/*
 * copy stat structure into user_stat structure.
 *
 * Field-by-field copy of the kernel struct stat into the 64-bit
 * user-ABI layout.  Field widths/narrowing follow the respective
 * struct definitions (not visible here).
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/* Zero first so padding and unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* The timestamp field names differ with the POSIX namespace setting. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12205
/*
 * Field-by-field copy of the kernel struct stat into the 32-bit
 * user-ABI layout (see munge_user64_stat for the 64-bit variant).
 * Field widths/narrowing follow the respective struct definitions.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero first so padding and unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* The timestamp field names differ with the POSIX namespace setting. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12242
/*
 * copy stat64 structure into user_stat64 structure.
 *
 * Like munge_user64_stat() but for the stat64 ABI, which additionally
 * carries the birth (creation) timestamp.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero first so padding and unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* The timestamp field names differ with the POSIX namespace setting. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12286
/*
 * Like munge_user32_stat() but for the stat64 ABI, which additionally
 * carries the birth (creation) timestamp.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero first so padding and unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* The timestamp field names differ with the POSIX namespace setting. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12327
12328 /*
12329 * Purge buffer cache for simulating cold starts
12330 */
12331 static int
12332 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
12333 {
12334 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
12335
12336 return VNODE_RETURNED;
12337 }
12338
12339 static int
12340 vfs_purge_callback(mount_t mp, __unused void * arg)
12341 {
12342 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
12343
12344 return VFS_RETURNED;
12345 }
12346
12347 int
12348 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
12349 {
12350 if (!kauth_cred_issuser(kauth_cred_get())) {
12351 return EPERM;
12352 }
12353
12354 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
12355
12356 return 0;
12357 }
12358
12359 /*
12360 * gets the vnode associated with the (unnamed) snapshot directory
12361 * for a Filesystem. The snapshot directory vnode is returned with
12362 * an iocount on it.
12363 */
12364 int
12365 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
12366 {
12367 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
12368 }
12369
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 *
 * dirfd must reference a volume root; the filesystem must advertise
 * VOL_CAP_INT_SNAPSHOT.  op is the namei operation (CREATE/DELETE/...)
 * and also selects which MAC snapshot check is applied.
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Pre-NULL the outputs so the error path can unwind uniformly. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshots are only addressed relative to a volume root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/': i stops early iff one is present. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	FREE(name_buf, M_TEMP);
out:
	/* On any error, drop both iocounts and NULL the outputs per the contract above. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
12483
/*
 * create a filesystem snapshot (for supporting filesystems)
 *
 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
 * We get to the (unnamed) snapshot directory vnode and create the vnode
 * for the snapshot in it.
 *
 * Restrictions:
 *
 * a) Passed in name for snapshot cannot have slashes.
 * b) name can't be "." or ".."
 *
 * Since this requires superuser privileges, vnode_authorize calls are not
 * made.
 */
static int
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata namend;

	/* Resolve the volume root, snapshot dir, and (possibly existing) snapshot. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
	    OP_LINK, ctx);
	if (error) {
		return error;
	}

	if (namend.ni_vp) {
		/* Lookup found an existing snapshot: O_EXCL semantics. */
		vnode_put(namend.ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr va;
		vnode_t vp = NULLVP;

		/* Create the snapshot as a plain file with mode 0. */
		VATTR_INIT(&va);
		VATTR_SET(&va, va_type, VREG);
		VATTR_SET(&va, va_mode, 0);

		/* Superuser-only path: skip authorization and inheritance. */
		error = vn_create(snapdvp, &vp, &namend, &va,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}
	}

	nameidone(&namend);
	vnode_put(snapdvp);
	vnode_put(rvp);
	return error;
}
12536
12537 /*
12538 * Delete a Filesystem snapshot
12539 *
12540 * get the vnode for the unnamed snapshot directory and the snapshot and
12541 * delete the snapshot.
12542 */
12543 static int
12544 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
12545 vfs_context_t ctx)
12546 {
12547 vnode_t rvp, snapdvp;
12548 int error;
12549 struct nameidata namend;
12550
12551 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
12552 OP_UNLINK, ctx);
12553 if (error) {
12554 goto out;
12555 }
12556
12557 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
12558 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
12559
12560 vnode_put(namend.ni_vp);
12561 nameidone(&namend);
12562 vnode_put(snapdvp);
12563 vnode_put(rvp);
12564 out:
12565 return error;
12566 }
12567
/*
 * Revert a filesystem to a snapshot
 *
 * Marks the filesystem to revert to the given snapshot on next mount.
 */
static int
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* take an iocount on the filesystem root referenced by dirfd */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* copy the snapshot name in from user space */
	MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		FREE(name_buf, M_TEMP);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		FREE(name_buf, M_TEMP);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		FREE(name_buf, M_TEMP);
		return error;
	}

	/* hand the snapshot name to the filesystem as a componentname */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	/* copyinstr's returned length includes the terminating NUL */
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	FREE(name_buf, M_TEMP);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* re-resolve root, snapshot dir and snapshot for the vnode path */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		/*
		 * Fallback issued directly against the snapshot vnode;
		 * presumably for filesystems (apfs, given the ioctl name)
		 * that implement the VNOP but not the VFS ioctl — TODO confirm.
		 */
		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
12656
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 */
static int
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* resolve the source snapshot (DELETE/OP_UNLINK: it leaves its old name) */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	/* copy the destination name in from user space */
	MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* scan for a '/'; the loop runs to completion iff none is present */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/*
	 * Look up the destination name relative to the snapshot directory
	 * (USEDVP). The name now lives in kernel space, hence UIO_SYSSPACE.
	 */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* same-directory rename: source and target dvp are both snapdvp */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

	/* unwind in reverse acquisition order */
out2:
	nameidone(tond);
out1:
	FREE(newname_buf, M_TEMP);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	FREE(__rename_data, M_TEMP);
	return error;
}
12759
/*
 * Mount a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * mount the snapshot.
 */
static int
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
	    M_TEMP, M_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/* resolve the fs root, the snapshot directory and the named snapshot */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* refuse if the source filesystem has no mount or is being torn down */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* a snapshot may not be mounted over the root of the root filesystem */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Mount via the generic mount path, passing the source mount and the
	 * snapshot name through as filesystem-specific data
	 * (KERNEL_MOUNT_SNAPSHOT). Only MNT_DONTBROWSE is honored from the
	 * caller's flags.
	 */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);

	/* unwind in reverse acquisition order */
out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	FREE(__snapshot_mount_data, M_TEMP);
	return error;
}
12842
/*
 * Root from a snapshot of the filesystem
 *
 * Marks the filesystem to root from the given snapshot on next boot.
 */
static int
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* take an iocount on the filesystem root referenced by dirfd */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* copy the snapshot name in from user space */
	MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		FREE(name_buf, M_TEMP);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		FREE(name_buf, M_TEMP);
		return error;
	}

	/* hand the snapshot name to the filesystem as a componentname */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	/* copyinstr's returned length includes the terminating NUL */
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	FREE(name_buf, M_TEMP);

	return error;
}
12903
/*
 * FS snapshot operations dispatcher
 *
 * Entry point for the fs_snapshot(2) syscall: checks the caller's
 * privilege, performs per-operation authorization, then dispatches to
 * the snapshot_* helper for uap->op.
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* all snapshot operations require the PRIV_VFS_SNAPSHOT privilege */
	error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
	if (error) {
		return error;
	}

	/*
	 * Enforce user authorization for snapshot modification operations
	 */
	if ((uap->op != SNAPSHOT_OP_MOUNT) &&
	    (uap->op != SNAPSHOT_OP_ROOT)) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Non-superuser callers must additionally hold write access
		 * to the filesystem's backing device.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0)) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		/* name1 is the existing snapshot name, name2 the new one */
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name2 is the directory to mount on */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}