]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_syscalls.c
a0a04deb8b43910efb399a0595a1ed2051891b57
[apple/xnu.git] / bsd / vfs / vfs_syscalls.c
1 /*
2 * Copyright (c) 1995-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/fsctl.h>
101 #include <sys/ubc_internal.h>
102 #include <sys/disk.h>
103 #include <sys/content_protection.h>
104 #include <sys/clonefile.h>
105 #include <sys/snapshot.h>
106 #include <sys/priv.h>
107 #include <sys/fsgetpath.h>
108 #include <machine/cons.h>
109 #include <machine/limits.h>
110 #include <miscfs/specfs/specdev.h>
111
112 #include <vfs/vfs_disk_conditioner.h>
113
114 #include <security/audit/audit.h>
115 #include <bsm/audit_kevents.h>
116
117 #include <mach/mach_types.h>
118 #include <kern/kern_types.h>
119 #include <kern/kalloc.h>
120 #include <kern/task.h>
121
122 #include <vm/vm_pageout.h>
123 #include <vm/vm_protos.h>
124
125 #include <libkern/OSAtomic.h>
126 #include <pexpert/pexpert.h>
127 #include <IOKit/IOBSD.h>
128
129 // deps for MIG call
130 #include <kern/host.h>
131 #include <kern/ipc_misc.h>
132 #include <mach/host_priv.h>
133 #include <mach/vfs_nspace.h>
134 #include <os/log.h>
135
136 #include <nfs/nfs_conf.h>
137
138 #if ROUTEFS
139 #include <miscfs/routefs/routefs.h>
140 #endif /* ROUTEFS */
141
142 #if CONFIG_MACF
143 #include <security/mac.h>
144 #include <security/mac_framework.h>
145 #endif
146
147 #if CONFIG_FSE
148 #define GET_PATH(x) \
149 (x) = get_pathbuff();
150 #define RELEASE_PATH(x) \
151 release_pathbuff(x);
152 #else
153 #define GET_PATH(x) \
154 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
155 #define RELEASE_PATH(x) \
156 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
157 #endif /* CONFIG_FSE */
158
159 #ifndef HFS_GET_BOOT_INFO
160 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
161 #endif
162
163 #ifndef HFS_SET_BOOT_INFO
164 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
165 #endif
166
167 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
168 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
169 #endif
170
171 extern void disk_conditioner_unmount(mount_t mp);
172
173 /* struct for checkdirs iteration */
174 struct cdirargs {
175 vnode_t olddp;
176 vnode_t newdp;
177 };
178 /* callback for checkdirs iteration */
179 static int checkdirs_callback(proc_t p, void * arg);
180
181 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
182 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
183 void enablequotas(struct mount *mp, vfs_context_t ctx);
184 static int getfsstat_callback(mount_t mp, void * arg);
185 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
186 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
187 static int sync_callback(mount_t, void *);
188 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
189 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
190 boolean_t partial_copy);
191 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
192 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
193 struct componentname *cnp, user_addr_t fsmountargs,
194 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
195 vfs_context_t ctx);
196 void vfs_notify_mount(vnode_t pdvp);
197
198 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
199
200 struct fd_vn_data * fg_vn_data_alloc(void);
201
202 /*
203 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
204 * Concurrent lookups (or lookups by ids) on hard links can cause the
205 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
206 * does) to return ENOENT as the path cannot be returned from the name cache
207 * alone. We have no option but to retry and hope to get one namei->reverse path
208 * generation done without an intervening lookup, lookup by id on the hard link
209 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
210 * which currently are the MAC hooks for rename, unlink and rmdir.
211 */
212 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
213
214 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
215 int unlink_flags);
216
217 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *);
218
219 #ifdef CONFIG_IMGSRC_ACCESS
220 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
221 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
222 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
223 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
224 static void mount_end_update(mount_t mp);
225 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
226 #endif /* CONFIG_IMGSRC_ACCESS */
227
228 #if CONFIG_LOCKERBOOT
229 int mount_locker_protoboot(const char *fsname, const char *mntpoint,
230 const char *pbdevpath);
231 #endif
232
233 //snapshot functions
234 #if CONFIG_MNT_ROOTSNAP
235 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
236 #else
237 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
238 #endif
239
240 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
241
242 __private_extern__
243 int sync_internal(void);
244
245 __private_extern__
246 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
247
248 extern lck_grp_t *fd_vn_lck_grp;
249 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
250 extern lck_attr_t *fd_vn_lck_attr;
251
252 /*
253 * incremented each time a mount or unmount operation occurs
254 * used to invalidate the cached value of the rootvp in the
255 * mount structure utilized by cache_lookup_path
256 */
257 uint32_t mount_generation = 0;
258
259 /* counts number of mount and unmount operations */
260 unsigned int vfs_nummntops = 0;
261
262 extern const struct fileops vnops;
263 #if CONFIG_APPLEDOUBLE
264 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
265 #endif /* CONFIG_APPLEDOUBLE */
266
267 /*
268 * Virtual File System System Calls
269 */
270
271 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
272 /*
273 * Private in-kernel mounting spi (NFS only, not exported)
274 */
275 __private_extern__
276 boolean_t
277 vfs_iskernelmount(mount_t mp)
278 {
279 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
280 }
281
/*
 * kernel_mount:
 * In-kernel mount entry point (NFS/devfs/routefs and other kernel-initiated
 * mounts; not exported to user space).
 *
 * Parameters:
 *	fstype		file system type (vfs name)
 *	pvp		parent of the vnode to be covered; ignored and looked
 *			up here when vp is NULLVP
 *	vp		vnode to be covered, or NULLVP to resolve it from path
 *	path		mount-on path, in kernel address space
 *	data		file-system-specific mount arguments
 *	datalen		unused
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	KERNEL_MOUNT_* flags controlling in-kernel behavior
 *	ctx		caller's VFS context
 *
 * Returns:	0 on success, an errno value on failure.
 *
 * When vp is supplied by the caller, the caller keeps responsibility for
 * its references; when resolved here, the iocounts taken by namei() are
 * dropped before returning.
 */
__private_extern__
int
kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	/*
	 * Always initialize the nameidata: even when the caller supplies vp,
	 * mount_common() consumes nd.ni_cnd (the component name), which is
	 * filled in manually in the else-branch below.
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Only log for the mount-by-role/snapshot cases callers care about */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VMVOL | KERNEL_MOUNT_DATAVOL)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		char *pnbuf = CAST_DOWN(char *, path);

		/* hand the caller-provided path to mount_common() via the componentname */
		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
		did_namei = FALSE;
	}

	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, TRUE, ctx);

	if (did_namei) {
		/* drop the iocounts taken by namei() and release its path buffer */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
327 #endif /* CONFIG_NFS_CLIENT || DEVFS */
328
329 /*
330 * Mount a file system.
331 */
332 /* ARGSUSED */
333 int
334 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
335 {
336 struct __mac_mount_args muap;
337
338 muap.type = uap->type;
339 muap.path = uap->path;
340 muap.flags = uap->flags;
341 muap.data = uap->data;
342 muap.mac_p = USER_ADDR_NULL;
343 return __mac_mount(p, &muap, retval);
344 }
345
346 int
347 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
348 {
349 struct componentname cn;
350 vfs_context_t ctx = vfs_context_current();
351 size_t dummy = 0;
352 int error;
353 int flags = uap->flags;
354 char fstypename[MFSNAMELEN];
355 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
356 vnode_t pvp;
357 vnode_t vp;
358
359 AUDIT_ARG(fd, uap->fd);
360 AUDIT_ARG(fflags, flags);
361 /* fstypename will get audited by mount_common */
362
363 /* Sanity check the flags */
364 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
365 return ENOTSUP;
366 }
367
368 if (flags & MNT_UNION) {
369 return EPERM;
370 }
371
372 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
373 if (error) {
374 return error;
375 }
376
377 if ((error = file_vnode(uap->fd, &vp)) != 0) {
378 return error;
379 }
380
381 if ((error = vnode_getwithref(vp)) != 0) {
382 file_drop(uap->fd);
383 return error;
384 }
385
386 pvp = vnode_getparent(vp);
387 if (pvp == NULL) {
388 vnode_put(vp);
389 file_drop(uap->fd);
390 return EINVAL;
391 }
392
393 memset(&cn, 0, sizeof(struct componentname));
394 MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
395 cn.cn_pnlen = MAXPATHLEN;
396
397 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
398 FREE(cn.cn_pnbuf, M_TEMP);
399 vnode_put(pvp);
400 vnode_put(vp);
401 file_drop(uap->fd);
402 return error;
403 }
404
405 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
406
407 FREE(cn.cn_pnbuf, M_TEMP);
408 vnode_put(pvp);
409 vnode_put(vp);
410 file_drop(uap->fd);
411
412 return error;
413 }
414
/*
 * Announce a new mount: broadcast a VQ_MOUNT vfs event (no specific mount,
 * hence NULL) and post a NOTE_WRITE knote on the parent directory of the
 * covered vnode so kqueue watchers of that directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
421
422 /*
423 * __mac_mount:
424 * Mount a file system taking into account MAC label behavior.
425 * See mount(2) man page for more information
426 *
427 * Parameters: p Process requesting the mount
428 * uap User argument descriptor (see below)
429 * retval (ignored)
430 *
431 * Indirect: uap->type Filesystem type
432 * uap->path Path to mount
433 * uap->data Mount arguments
434 * uap->mac_p MAC info
435 * uap->flags Mount flags
436 *
437 *
438 * Returns: 0 Success
439 * !0 Not success
440 */
/*
 * Latched TRUE the first time something attempts to (re)mount the root
 * file system read/write; consulted by the CHECK_CS_VALIDATION_BITMAP
 * code to disable the codesign bitmap optimization (see 7392553 below).
 */
boolean_t root_fs_upgrade_try = FALSE;

int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered (and its parent, via WANTPARENT);
	 * both carry iocounts that are dropped at 'out'.
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		/*
		 * Note: exact equality is intentional — MNT_IMGSRC_BY_INDEX must
		 * be the only flag set. The final argument is therefore trivially
		 * TRUE here; it is spelled out to match the callee's contract.
		 */
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space.
	 * The user_mac struct layout differs for 32/64-bit callers, so copy
	 * in the appropriately-sized variant and normalize into 'mac'.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = mac64.m_buflen;
			mac.m_string = mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* buflen must cover at least one char plus the NUL, and be bounded */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		/* labelstr is freed at 'out' on both success and failure paths */
		MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if SECURE_KERNEL
	if (flags & MNT_UNION) {
		/* No union mounts on release kernels */
		error = EPERM;
		goto out;
	}
#endif

	/* Special-case mounts whose target is the root of the root file system */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		if (!(flags & MNT_UNION)) {
			/* a plain mount on '/' is implicitly an update mount */
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif
		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, FALSE, ctx);

out:
	/* Common exit: release label buffer, namei iocounts, and nameidata state */

#if CONFIG_MACF
	if (labelstr) {
		FREE(labelstr, M_MACTEMP);
	}
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
596
597 /*
598 * common mount implementation (final stage of mounting)
599 *
600 * Arguments:
601 * fstypename file system type (ie it's vfs name)
602 * pvp parent of covered vnode
603 * vp covered vnode
604 * cnp component name (ie path) of covered vnode
605 * flags generic mount flags
606 * fsmountargs file system specific data
607 * labelstr optional MAC label
608 * kernelmount TRUE for mounts initiated from inside the kernel
609 * ctx caller's context
610 */
611 static int
612 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
613 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
614 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
615 {
616 #if !CONFIG_MACF
617 #pragma unused(labelstr)
618 #endif
619 struct vnode *devvp = NULLVP;
620 struct vnode *device_vnode = NULLVP;
621 #if CONFIG_MACF
622 struct vnode *rvp;
623 #endif
624 struct mount *mp;
625 struct vfstable *vfsp = (struct vfstable *)0;
626 struct proc *p = vfs_context_proc(ctx);
627 int error, flag = 0;
628 user_addr_t devpath = USER_ADDR_NULL;
629 int ronly = 0;
630 int mntalloc = 0;
631 boolean_t vfsp_ref = FALSE;
632 boolean_t is_rwlock_locked = FALSE;
633 boolean_t did_rele = FALSE;
634 boolean_t have_usecount = FALSE;
635
636 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM
637 /* Check for mutually-exclusive flag bits */
638 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL));
639 int bitcount = 0;
640 while (checkflags != 0) {
641 checkflags &= (checkflags - 1);
642 bitcount++;
643 }
644
645 if (bitcount > 1) {
646 //not allowed to request multiple mount-by-role flags
647 error = EINVAL;
648 goto out1;
649 }
650 #endif
651
652 /*
653 * Process an update for an existing mount
654 */
655 if (flags & MNT_UPDATE) {
656 if ((vp->v_flag & VROOT) == 0) {
657 error = EINVAL;
658 goto out1;
659 }
660 mp = vp->v_mount;
661
662 /* unmount in progress return error */
663 mount_lock_spin(mp);
664 if (mp->mnt_lflag & MNT_LUNMOUNT) {
665 mount_unlock(mp);
666 error = EBUSY;
667 goto out1;
668 }
669 mount_unlock(mp);
670 lck_rw_lock_exclusive(&mp->mnt_rwlock);
671 is_rwlock_locked = TRUE;
672 /*
673 * We only allow the filesystem to be reloaded if it
674 * is currently mounted read-only.
675 */
676 if ((flags & MNT_RELOAD) &&
677 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
678 error = ENOTSUP;
679 goto out1;
680 }
681
682 /*
683 * If content protection is enabled, update mounts are not
684 * allowed to turn it off.
685 */
686 if ((mp->mnt_flag & MNT_CPROTECT) &&
687 ((flags & MNT_CPROTECT) == 0)) {
688 error = EINVAL;
689 goto out1;
690 }
691
692 /*
693 * can't turn off MNT_REMOVABLE either but it may be an unexpected
694 * failure to return an error for this so we'll just silently
695 * add it if it is not passed in.
696 */
697 if ((mp->mnt_flag & MNT_REMOVABLE) &&
698 ((flags & MNT_REMOVABLE) == 0)) {
699 flags |= MNT_REMOVABLE;
700 }
701
702 #ifdef CONFIG_IMGSRC_ACCESS
703 /* Can't downgrade the backer of the root FS */
704 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
705 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
706 error = ENOTSUP;
707 goto out1;
708 }
709 #endif /* CONFIG_IMGSRC_ACCESS */
710
711 /*
712 * Only root, or the user that did the original mount is
713 * permitted to update it.
714 */
715 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
716 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
717 goto out1;
718 }
719 #if CONFIG_MACF
720 error = mac_mount_check_remount(ctx, mp);
721 if (error != 0) {
722 goto out1;
723 }
724 #endif
725 /*
726 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
727 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
728 */
729 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
730 flags |= MNT_NOSUID | MNT_NODEV;
731 if (mp->mnt_flag & MNT_NOEXEC) {
732 flags |= MNT_NOEXEC;
733 }
734 }
735 flag = mp->mnt_flag;
736
737
738
739 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
740
741 vfsp = mp->mnt_vtable;
742 goto update;
743 } // MNT_UPDATE
744
745 /*
746 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
747 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
748 */
749 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
750 flags |= MNT_NOSUID | MNT_NODEV;
751 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
752 flags |= MNT_NOEXEC;
753 }
754 }
755
756 /* XXXAUDIT: Should we capture the type on the error path as well? */
757 AUDIT_ARG(text, fstypename);
758 mount_list_lock();
759 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
760 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
761 vfsp->vfc_refcount++;
762 vfsp_ref = TRUE;
763 break;
764 }
765 }
766 mount_list_unlock();
767 if (vfsp == NULL) {
768 error = ENODEV;
769 goto out1;
770 }
771
772 /*
773 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
774 * except in ROSV configs.
775 */
776 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
777 ((internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL)) == 0)) {
778 error = EINVAL; /* unsupported request */
779 goto out1;
780 }
781
782 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
783 if (error != 0) {
784 goto out1;
785 }
786
787 /*
788 * Allocate and initialize the filesystem (mount_t)
789 */
790 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
791 M_MOUNT, M_WAITOK);
792 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
793 mntalloc = 1;
794
795 /* Initialize the default IO constraints */
796 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
797 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
798 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
799 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
800 mp->mnt_devblocksize = DEV_BSIZE;
801 mp->mnt_alignmentmask = PAGE_MASK;
802 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
803 mp->mnt_ioscale = 1;
804 mp->mnt_ioflags = 0;
805 mp->mnt_realrootvp = NULLVP;
806 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
807
808 TAILQ_INIT(&mp->mnt_vnodelist);
809 TAILQ_INIT(&mp->mnt_workerqueue);
810 TAILQ_INIT(&mp->mnt_newvnodes);
811 mount_lock_init(mp);
812 lck_rw_lock_exclusive(&mp->mnt_rwlock);
813 is_rwlock_locked = TRUE;
814 mp->mnt_op = vfsp->vfc_vfsops;
815 mp->mnt_vtable = vfsp;
816 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
817 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
818 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
819 do {
820 int pathlen = MAXPATHLEN;
821
822 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
823 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
824 }
825 } while (0);
826 mp->mnt_vnodecovered = vp;
827 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
828 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
829 mp->mnt_devbsdunit = 0;
830
831 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
832 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
833
834 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
835 if (kernelmount) {
836 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
837 }
838 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
839 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
840 }
841 #endif /* CONFIG_NFS_CLIENT || DEVFS */
842
843 update:
844
845 /*
846 * Set the mount level flags.
847 */
848 if (flags & MNT_RDONLY) {
849 mp->mnt_flag |= MNT_RDONLY;
850 } else if (mp->mnt_flag & MNT_RDONLY) {
851 // disallow read/write upgrades of file systems that
852 // had the TYPENAME_OVERRIDE feature set.
853 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
854 error = EPERM;
855 goto out1;
856 }
857 mp->mnt_kern_flag |= MNTK_WANTRDWR;
858 }
859 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
860 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
861 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
862 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
863 MNT_QUARANTINE | MNT_CPROTECT);
864
865 #if SECURE_KERNEL
866 #if !CONFIG_MNT_SUID
867 /*
868 * On release builds of iOS based platforms, always enforce NOSUID on
869 * all mounts. We do this here because we can catch update mounts as well as
870 * non-update mounts in this case.
871 */
872 mp->mnt_flag |= (MNT_NOSUID);
873 #endif
874 #endif
875
876 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
877 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
878 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
879 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
880 MNT_QUARANTINE | MNT_CPROTECT);
881
882 #if CONFIG_MACF
883 if (flags & MNT_MULTILABEL) {
884 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
885 error = EINVAL;
886 goto out1;
887 }
888 mp->mnt_flag |= MNT_MULTILABEL;
889 }
890 #endif
891 /*
892 * Process device path for local file systems if requested
893 */
894 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
895 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL))) {
896 //snapshot, vm, datavolume mounts are special
897 if (vfs_context_is64bit(ctx)) {
898 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
899 goto out1;
900 }
901 fsmountargs += sizeof(devpath);
902 } else {
903 user32_addr_t tmp;
904 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
905 goto out1;
906 }
907 /* munge into LP64 addr */
908 devpath = CAST_USER_ADDR_T(tmp);
909 fsmountargs += sizeof(tmp);
910 }
911
912 /* Lookup device and authorize access to it */
913 if ((devpath)) {
914 struct nameidata nd;
915
916 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
917 if ((error = namei(&nd))) {
918 goto out1;
919 }
920
921 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
922 devvp = nd.ni_vp;
923
924 nameidone(&nd);
925
926 if (devvp->v_type != VBLK) {
927 error = ENOTBLK;
928 goto out2;
929 }
930 if (major(devvp->v_rdev) >= nblkdev) {
931 error = ENXIO;
932 goto out2;
933 }
934 /*
935 * If mount by non-root, then verify that user has necessary
936 * permissions on the device.
937 */
938 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
939 mode_t accessmode = KAUTH_VNODE_READ_DATA;
940
941 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
942 accessmode |= KAUTH_VNODE_WRITE_DATA;
943 }
944 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
945 goto out2;
946 }
947 }
948 }
949 /* On first mount, preflight and open device */
950 if (devpath && ((flags & MNT_UPDATE) == 0)) {
951 if ((error = vnode_ref(devvp))) {
952 goto out2;
953 }
954 /*
955 * Disallow multiple mounts of the same device.
956 * Disallow mounting of a device that is currently in use
957 * (except for root, which might share swap device for miniroot).
958 * Flush out any old buffers remaining from a previous use.
959 */
960 if ((error = vfs_mountedon(devvp))) {
961 goto out3;
962 }
963
964 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
965 error = EBUSY;
966 goto out3;
967 }
968 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
969 error = ENOTBLK;
970 goto out3;
971 }
972 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
973 goto out3;
974 }
975
976 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
977 #if CONFIG_MACF
978 error = mac_vnode_check_open(ctx,
979 devvp,
980 ronly ? FREAD : FREAD | FWRITE);
981 if (error) {
982 goto out3;
983 }
984 #endif /* MAC */
985 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
986 goto out3;
987 }
988
989 mp->mnt_devvp = devvp;
990 device_vnode = devvp;
991 } else if ((mp->mnt_flag & MNT_RDONLY) &&
992 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
993 (device_vnode = mp->mnt_devvp)) {
994 dev_t dev;
995 int maj;
996 /*
997 * If upgrade to read-write by non-root, then verify
998 * that user has necessary permissions on the device.
999 */
1000 vnode_getalways(device_vnode);
1001
1002 if (suser(vfs_context_ucred(ctx), NULL) &&
1003 (error = vnode_authorize(device_vnode, NULL,
1004 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1005 ctx)) != 0) {
1006 vnode_put(device_vnode);
1007 goto out2;
1008 }
1009
1010 /* Tell the device that we're upgrading */
1011 dev = (dev_t)device_vnode->v_rdev;
1012 maj = major(dev);
1013
1014 if ((u_int)maj >= (u_int)nblkdev) {
1015 panic("Volume mounted on a device with invalid major number.");
1016 }
1017
1018 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1019 vnode_put(device_vnode);
1020 device_vnode = NULLVP;
1021 if (error != 0) {
1022 goto out2;
1023 }
1024 }
1025 } // localargs && !(snapshot | data | vm)
1026
1027 #if CONFIG_MACF
1028 if ((flags & MNT_UPDATE) == 0) {
1029 mac_mount_label_init(mp);
1030 mac_mount_label_associate(ctx, mp);
1031 }
1032 if (labelstr) {
1033 if ((flags & MNT_UPDATE) != 0) {
1034 error = mac_mount_check_label_update(ctx, mp);
1035 if (error != 0) {
1036 goto out3;
1037 }
1038 }
1039 }
1040 #endif
1041 /*
1042 * Mount the filesystem. We already asserted that internal_flags
1043 * cannot have more than one mount-by-role bit set.
1044 */
1045 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1046 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1047 (caddr_t)fsmountargs, 0, ctx);
1048 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1049 #if CONFIG_ROSV_STARTUP
1050 struct mount *origin_mp = (struct mount*)fsmountargs;
1051 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1052 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1053 if (error) {
1054 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1055 } else {
1056 /* Mark volume associated with system volume */
1057 mp->mnt_kern_flag |= MNTK_SYSTEM;
1058
1059 /* Attempt to acquire the mnt_devvp and set it up */
1060 struct vnode *mp_devvp = NULL;
1061 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1062 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1063 0, &mp_devvp, vfs_context_kernel());
1064 if (!lerr) {
1065 mp->mnt_devvp = mp_devvp;
1066 //vnode_lookup took an iocount, need to drop it.
1067 vnode_put(mp_devvp);
1068 // now set `device_vnode` to the devvp that was acquired.
1069 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1070 // note that though the iocount above was dropped, the mount acquires
1071 // an implicit reference against the device.
1072 device_vnode = mp_devvp;
1073 }
1074 }
1075 }
1076 #else
1077 error = EINVAL;
1078 #endif
1079 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1080 #if CONFIG_MOUNT_VM
1081 struct mount *origin_mp = (struct mount*)fsmountargs;
1082 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1083 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1084 if (error) {
1085 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1086 } else {
1087 /* Mark volume associated with system volume and a swap mount */
1088 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1089 /* Attempt to acquire the mnt_devvp and set it up */
1090 struct vnode *mp_devvp = NULL;
1091 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1092 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1093 0, &mp_devvp, vfs_context_kernel());
1094 if (!lerr) {
1095 mp->mnt_devvp = mp_devvp;
1096 //vnode_lookup took an iocount, need to drop it.
1097 vnode_put(mp_devvp);
1098
1099 // now set `device_vnode` to the devvp that was acquired.
1100 // note that though the iocount above was dropped, the mount acquires
1101 // an implicit reference against the device.
1102 device_vnode = mp_devvp;
1103 }
1104 }
1105 }
1106 #else
1107 error = EINVAL;
1108 #endif
1109 } else {
1110 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1111 }
1112
1113 if (flags & MNT_UPDATE) {
1114 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1115 mp->mnt_flag &= ~MNT_RDONLY;
1116 }
1117 mp->mnt_flag &= ~
1118 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1119 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1120 if (error) {
1121 mp->mnt_flag = flag; /* restore flag value */
1122 }
1123 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1124 lck_rw_done(&mp->mnt_rwlock);
1125 is_rwlock_locked = FALSE;
1126 if (!error) {
1127 enablequotas(mp, ctx);
1128 }
1129 goto exit;
1130 }
1131
1132 /*
1133 * Put the new filesystem on the mount list after root.
1134 */
1135 if (error == 0) {
1136 struct vfs_attr vfsattr;
1137 #if CONFIG_MACF
1138 error = mac_mount_check_mount_late(ctx, mp);
1139 if (error != 0) {
1140 goto out3;
1141 }
1142
1143 if (vfs_flags(mp) & MNT_MULTILABEL) {
1144 error = VFS_ROOT(mp, &rvp, ctx);
1145 if (error) {
1146 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1147 goto out3;
1148 }
1149 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1150 /*
1151 * drop reference provided by VFS_ROOT
1152 */
1153 vnode_put(rvp);
1154
1155 if (error) {
1156 goto out3;
1157 }
1158 }
1159 #endif /* MAC */
1160
1161 vnode_lock_spin(vp);
1162 CLR(vp->v_flag, VMOUNT);
1163 vp->v_mountedhere = mp;
1164 vnode_unlock(vp);
1165
1166 /*
1167 * taking the name_cache_lock exclusively will
1168 * insure that everyone is out of the fast path who
1169 * might be trying to use a now stale copy of
1170 * vp->v_mountedhere->mnt_realrootvp
1171 * bumping mount_generation causes the cached values
1172 * to be invalidated
1173 */
1174 name_cache_lock();
1175 mount_generation++;
1176 name_cache_unlock();
1177
1178 error = vnode_ref(vp);
1179 if (error != 0) {
1180 goto out4;
1181 }
1182
1183 have_usecount = TRUE;
1184
1185 error = checkdirs(vp, ctx);
1186 if (error != 0) {
1187 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1188 goto out4;
1189 }
1190 /*
1191 * there is no cleanup code here so I have made it void
1192 * we need to revisit this
1193 */
1194 (void)VFS_START(mp, 0, ctx);
1195
1196 if (mount_list_add(mp) != 0) {
1197 /*
1198 * The system is shutting down trying to umount
1199 * everything, so fail with a plausible errno.
1200 */
1201 error = EBUSY;
1202 goto out4;
1203 }
1204 lck_rw_done(&mp->mnt_rwlock);
1205 is_rwlock_locked = FALSE;
1206
1207 /* Check if this mounted file system supports EAs or named streams. */
1208 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1209 VFSATTR_INIT(&vfsattr);
1210 VFSATTR_WANTED(&vfsattr, f_capabilities);
1211 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1212 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1213 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1214 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1215 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1216 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1217 }
1218 #if NAMEDSTREAMS
1219 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1220 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1221 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1222 }
1223 #endif
1224 /* Check if this file system supports path from id lookups. */
1225 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1226 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1227 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1228 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1229 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1230 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1231 }
1232
1233 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1234 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1235 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1236 }
1237 }
1238 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1239 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1240 }
1241 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1242 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1243 }
1244 /* increment the operations count */
1245 OSAddAtomic(1, &vfs_nummntops);
1246 enablequotas(mp, ctx);
1247
1248 if (device_vnode) {
1249 device_vnode->v_specflags |= SI_MOUNTEDON;
1250
1251 /*
1252 * cache the IO attributes for the underlying physical media...
1253 * an error return indicates the underlying driver doesn't
1254 * support all the queries necessary... however, reasonable
1255 * defaults will have been set, so no reason to bail or care
1256 */
1257 vfs_init_io_attributes(device_vnode, mp);
1258 }
1259
1260 /* Now that mount is setup, notify the listeners */
1261 vfs_notify_mount(pvp);
1262 IOBSDMountChange(mp, kIOMountChangeMount);
1263 } else {
1264 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1265 if (mp->mnt_vnodelist.tqh_first != NULL) {
1266 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1267 mp->mnt_vtable->vfc_name, error);
1268 }
1269
1270 vnode_lock_spin(vp);
1271 CLR(vp->v_flag, VMOUNT);
1272 vnode_unlock(vp);
1273 mount_list_lock();
1274 mp->mnt_vtable->vfc_refcount--;
1275 mount_list_unlock();
1276
1277 if (device_vnode) {
1278 vnode_rele(device_vnode);
1279 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1280 }
1281 lck_rw_done(&mp->mnt_rwlock);
1282 is_rwlock_locked = FALSE;
1283
1284 /*
1285 * if we get here, we have a mount structure that needs to be freed,
1286 * but since the coveredvp hasn't yet been updated to point at it,
1287 * no need to worry about other threads holding a crossref on this mp
1288 * so it's ok to just free it
1289 */
1290 mount_lock_destroy(mp);
1291 #if CONFIG_MACF
1292 mac_mount_label_destroy(mp);
1293 #endif
1294 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1295 }
1296 exit:
1297 /*
1298 * drop I/O count on the device vp if there was one
1299 */
1300 if (devpath && devvp) {
1301 vnode_put(devvp);
1302 }
1303
1304 return error;
1305
1306 /* Error condition exits */
1307 out4:
1308 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1309
1310 /*
1311 * If the mount has been placed on the covered vp,
1312 * it may have been discovered by now, so we have
1313 * to treat this just like an unmount
1314 */
1315 mount_lock_spin(mp);
1316 mp->mnt_lflag |= MNT_LDEAD;
1317 mount_unlock(mp);
1318
1319 if (device_vnode != NULLVP) {
1320 vnode_rele(device_vnode);
1321 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1322 ctx);
1323 did_rele = TRUE;
1324 }
1325
1326 vnode_lock_spin(vp);
1327
1328 mp->mnt_crossref++;
1329 vp->v_mountedhere = (mount_t) 0;
1330
1331 vnode_unlock(vp);
1332
1333 if (have_usecount) {
1334 vnode_rele(vp);
1335 }
1336 out3:
1337 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1338 vnode_rele(devvp);
1339 }
1340 out2:
1341 if (devpath && devvp) {
1342 vnode_put(devvp);
1343 }
1344 out1:
1345 /* Release mnt_rwlock only when it was taken */
1346 if (is_rwlock_locked == TRUE) {
1347 lck_rw_done(&mp->mnt_rwlock);
1348 }
1349
1350 if (mntalloc) {
1351 if (mp->mnt_crossref) {
1352 mount_dropcrossref(mp, vp, 0);
1353 } else {
1354 mount_lock_destroy(mp);
1355 #if CONFIG_MACF
1356 mac_mount_label_destroy(mp);
1357 #endif
1358 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1359 }
1360 }
1361 if (vfsp_ref) {
1362 mount_list_lock();
1363 vfsp->vfc_refcount--;
1364 mount_list_unlock();
1365 }
1366
1367 return error;
1368 }
1369
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT
 *
 * On success the covered vnode `vp` is left with VMOUNT set (under the
 * vnode lock) to advertise the in-progress mount; the caller is
 * responsible for clearing it on its own failure paths.
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 * Note: a vnode_getattr() failure is also reported as EPERM.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push any dirty data for the soon-to-be-covered vnode to disk */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* Invalidate cached buffers, writing out remaining dirty ones */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Mounts may only cover directories */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/*
	 * Refuse if a mount is already in progress here AND something is
	 * already mounted on this vnode.
	 */
	if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	/* MAC hook: may veto mounting `fsname` on this directory */
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		goto out;
	}
#endif

	/* Advertise the in-progress mount on the covered vnode */
	vnode_lock_spin(vp);
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

out:
	return error;
}
1431
1432 #if CONFIG_IMGSRC_ACCESS
1433
1434 #define DEBUG_IMGSRC 0
1435
1436 #if DEBUG_IMGSRC
1437 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1438 #else
1439 #define IMGSRC_DEBUG(args...) do { } while(0)
1440 #endif
1441
/*
 * Validate a device path supplied for an imageboot-source relocation.
 * The path must resolve to a block device whose dev_t matches the
 * device already backing `mp`, and non-root callers must be authorized
 * on it.  On success, the resolved path is copied into
 * mp->mnt_vfsstat.f_mntfromname and *devvpp receives the device vnode
 * with the iocount taken by namei() (caller must vnode_put()).  On
 * failure that iocount is dropped here.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	mode_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* Kernel-context callers pass a kernel-space path (see mount_locker_protoboot) */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The supplied path must name the same underlying device as the mount */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: transfer the namei() iocount on vp to the caller */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	/* On failure, release the iocount namei() took on vp */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
1519
/*
 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 *
 * On success, `vp` holds a usecount for the lifetime of the mount.
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Publish the mount on the covered vnode; VMOUNT phase is over */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the duration of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/*
	 * NOTE(review): on failure mnt_vnodecovered is reset here, but
	 * vp->v_mountedhere is left pointing at mp — presumably the
	 * caller's cleanup path is expected to tear that down; verify
	 * against relocate_imageboot_source()'s out2 handling.
	 */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1569
/*
 * Undo place_mount_and_checkdirs(): drop the usecount it took on the
 * covered vnode and detach the mount from it again.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
1580
1581 static int
1582 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1583 {
1584 int error;
1585
1586 /* unmount in progress return error */
1587 mount_lock_spin(mp);
1588 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1589 mount_unlock(mp);
1590 return EBUSY;
1591 }
1592 mount_unlock(mp);
1593 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1594
1595 /*
1596 * We only allow the filesystem to be reloaded if it
1597 * is currently mounted read-only.
1598 */
1599 if ((flags & MNT_RELOAD) &&
1600 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1601 error = ENOTSUP;
1602 goto out;
1603 }
1604
1605 /*
1606 * Only root, or the user that did the original mount is
1607 * permitted to update it.
1608 */
1609 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1610 (!vfs_context_issuser(ctx))) {
1611 error = EPERM;
1612 goto out;
1613 }
1614 #if CONFIG_MACF
1615 error = mac_mount_check_remount(ctx, mp);
1616 if (error != 0) {
1617 goto out;
1618 }
1619 #endif
1620
1621 out:
1622 if (error) {
1623 lck_rw_done(&mp->mnt_rwlock);
1624 }
1625
1626 return error;
1627 }
1628
/*
 * Counterpart to mount_begin_update(): release the mount rwlock taken
 * there.  Call only after a successful mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1634
1635 static int
1636 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1637 {
1638 vnode_t vp;
1639
1640 if (height >= MAX_IMAGEBOOT_NESTING) {
1641 return EINVAL;
1642 }
1643
1644 vp = imgsrc_rootvnodes[height];
1645 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1646 *rvpp = vp;
1647 return 0;
1648 } else {
1649 return ENOENT;
1650 }
1651 }
1652
/*
 * Relocate the imageboot source filesystem onto the covered vnode `vp`,
 * making it visible at that path.  Only root may do this, and a given
 * mount may only be moved once (guarded by MNTK_HAS_MOVED).
 *
 * fsmountargs is interpreted either as a mnt_imgsrc_args structure
 * (by_index == TRUE, selecting the nesting height) or, for binary
 * compatibility, as a bare device path implying one level of nesting.
 * `pvp` is the parent of the mount point, used only for the mount
 * notification on success.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Height/flags/devpath come from a mnt_imgsrc_args structure */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are currently defined for this operation */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* On success, takes an iocount on rvp (dropped at out0/exit) */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* M_WAITOK: blocks rather than failing, so no NULL check needed */
	MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 * NOTE(review): this path leaves `error` at 0, so a caller that
	 * loses the race reports success without moving anything — the
	 * earlier (unlocked) check returns EBUSY for the same condition;
	 * confirm whether that asymmetry is intended.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the identity/authorization check; drop the iocount */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on name so it can be restored if list-add fails */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	FREE(old_mntonname, M_TEMP);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Restore the previous mount-on name and allow another move attempt */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	FREE(old_mntonname, M_TEMP);
	return error;
}
1873
1874 #if CONFIG_LOCKERBOOT
/*
 * Mount the locker protoboot volume: a kernel-context wrapper that
 * looks up `mntpoint` and relocates the imageboot source filesystem
 * named `fsname` onto it, using `pbdevpath` as the backing device path.
 * All paths are kernel-space strings (ctx is the kernel context, so
 * the relocate path copies arguments with UIO_SYSSPACE semantics).
 */
__private_extern__
int
mount_locker_protoboot(const char *fsname, const char *mntpoint,
    const char *pbdevpath)
{
	int error = -1;
	struct nameidata nd;
	boolean_t cleanup_nd = FALSE;
	vfs_context_t ctx = vfs_context_kernel();
	boolean_t is64 = TRUE;
	boolean_t by_index = TRUE;
	struct user64_mnt_imgsrc_args mia64 = {
		.mi_height = 0,
		.mi_flags = 0,
		.mi_devpath = CAST_USER_ADDR_T(pbdevpath),
	};
	user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64);

	/* WANTPARENT: relocate_imageboot_source needs the parent vnode too */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx);
	error = namei(&nd);
	if (error) {
		IMGSRC_DEBUG("namei: %d\n", error);
		goto out;
	}

	cleanup_nd = TRUE;
	error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp,
	    &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index);

out:
	if (cleanup_nd) {
		/* Preserve the relocate result across the vnode_put() calls */
		int stashed = error;

		error = vnode_put(nd.ni_vp);
		if (error) {
			panic("vnode_put() returned non-zero: %d", error);
		}

		if (nd.ni_dvp) {
			error = vnode_put(nd.ni_dvp);
			if (error) {
				panic("vnode_put() returned non-zero: %d", error);
			}
		}
		nameidone(&nd);

		error = stashed;
	}
	return error;
}
1926 #endif /* CONFIG_LOCKERBOOT */
1927 #endif /* CONFIG_IMGSRC_ACCESS */
1928
/*
 * Turn on disk quotas for a newly mounted (or updated) HFS filesystem
 * when the per-type quota trigger files exist at the mount's root.
 * Errors are deliberately ignored: quota setup must not interfere with
 * completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the ".opsname.<type>" trigger file; existence is the signal */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		/* Drop the lookup's iocount and name buffer; only existence mattered */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Now build the actual quota-file path and turn quotas on */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
1962
1963
/*
 * Per-process callback for checkdirs(): if this process's current or
 * root directory is the vnode just covered by a mount (cdrp->olddp),
 * redirect it to the new mount's root (cdrp->newdp), transferring
 * usecounts appropriately.  Always returns PROC_RETURNED so the
 * process iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs * cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp == NULL ||
	    (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp)) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp != NULL) {
		if (fdp->fd_cdir == olddp) {
			old_cvp = olddp;	/* olddp's cwd usecount is now ours to drop */
			fdp->fd_cdir = newdp;
			new_cvp = NULL;		/* this newdp ref is now owned by fd_cdir */
		}
		if (fdp->fd_rdir == olddp) {
			old_rvp = olddp;	/* olddp's rootdir usecount is now ours to drop */
			fdp->fd_rdir = newdp;
			new_rvp = NULL;		/* this newdp ref is now owned by fd_rdir */
		}
	}
	proc_fdunlock(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2046
2047
2048
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Sole usecount is the mount's own: no process can have olddp as cwd/root */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the new filesystem's root vnode (returned with an iocount) */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/*
	 * If the system root itself was just covered, swap the global
	 * rootvnode over to the new filesystem's root.
	 * NOTE(review): vnode_ref(newdp)'s return is ignored here —
	 * presumably it cannot fail while we hold an iocount; confirm.
	 */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		tvp = rootvnode;
		rootvnode = newdp;
		vnode_rele(tvp);
	}

	/* Drop the iocount from VFS_ROOT */
	vnode_put(newdp);
	return 0;
}
2089
/*
 * Unmount a file system.
 *
 * Note: unmount takes a path to the vnode mounted on as argument,
 * not special file (as before).
 */
/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Resolve the user-supplied path to the mounted-on vnode */
	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	/* MAC hook: may veto the unmount before any state changes */
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	/* Take a mount ref and drop the vnode iocount before unmounting */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2135
2136 int
2137 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2138 {
2139 mount_t mp;
2140
2141 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2142 if (mp == (mount_t)0) {
2143 return ENOENT;
2144 }
2145 mount_ref(mp, 0);
2146 mount_iterdrop(mp);
2147 /* safedounmount consumes the mount ref */
2148 return safedounmount(mp, flags, ctx);
2149 }
2150
2151
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * On every early-failure path the mount ref is dropped here; on the
 * success path dounmount() takes over ownership of the ref.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 * NOTE(review): MNT_LNOTRESP is an mnt_lflag bit but is tested
	 * against mnt_kern_flag here — confirm which flag word is intended.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization if the mount is tagged as permissive and
	 * this is not a forced-unmount attempt.
	 */
	if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system (or the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/* NOTE(review): `#ifdef` (not `#if`) — this block compiles in even if the config macro is defined to 0 */
#ifdef CONFIG_IMGSRC_ACCESS
	/* The imageboot backing mount may never be unmounted */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

	/* dounmount() takes ownership of (and consumes) the mount ref */
	return dounmount(mp, flags, 1, ctx);

out:
	mount_drop(mp, 0);
	return error;
}
2207
/*
 * Do the actual file system unmount.
 *
 * mp		the mount to tear down
 * flags	MNT_FORCE / MNT_NOBLOCK / MNT_LNOSUB etc.
 * withref	non-zero if the caller's mount ref should be consumed here
 * ctx		calling context
 *
 * On success the mount is removed from the mount list and either destroyed
 * here (MNT_ROOTFS case) or via mount_dropcrossref() once the covered
 * vnode's cross reference drains. On failure the MNTK_UNMOUNT /
 * MNT_LUNMOUNT / MNT_LFORCE state is rolled back and the mount stays usable.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & MNT_LUNMOUNT) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Don't let trigger resolution hang this (non-kernel) process. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		/* consume the ref the caller passed in (see safedounmount) */
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* sync failed: roll back the unmount-in-progress state */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* vnodes still busy: roll back the unmount-in-progress state */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure no one is left in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* filesystem refused the unmount: undo and allow iteration again */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		/* keep the mount alive until mount_dropcrossref() below */
		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp); /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* NOTE: on the error paths we arrive here holding the mount lock. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			/* may destroy mp if the crossref count hits zero */
			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* notify watchers of the parent directory */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* root has no covered vnode; tear the mount down directly */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2495
/*
 * Unmount any mounts in this filesystem.
 *
 * Builds an array of the fsids of every mount that is (transitively)
 * mounted on top of mp, then unmounts them in reverse (deepest-first)
 * order. Unmount errors are deliberately ignored.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* M_NOWAIT: we hold the mount list lock, so we must not block here */
	MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 * (m indexes the last entry collected so far; it grows as matches
	 * are found, which is what makes the closure transitive.)
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	// (Index 0 is mp itself, which the caller unmounts.)
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* take a ref for dounmount() to consume, drop the iter ref */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	if (fsids) {
		FREE(fsids, M_TEMP);
	}
}
2558
/*
 * Drop one cross reference on mp, taken against the covered vnode dp.
 *
 * If this was the last crossref and dp no longer points at mp (i.e. the
 * unmount has already disconnected v_mountedhere), the mount structure is
 * destroyed here. need_put additionally releases the caller's iocount on dp.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* last crossref gone and mount already detached from dp: free it */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_unlock(dp);

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_unlock(dp);
}
2587
2588
2589 /*
2590 * Sync each mounted filesystem.
2591 */
#if DIAGNOSTIC
int syncprt = 0;        /* non-zero: sync paths also dump buffer statistics */
#endif

int print_vmpage_stat = 0;      /* non-zero: sync paths report dirty VM page counts */
2597
2598 /*
2599 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2600 * mounted read-write with the passed waitfor value.
2601 *
2602 * Parameters: mp mount-point descriptor per mounted file-system instance.
2603 * arg user argument (please see below)
2604 *
2605 * User argument is a pointer to 32 bit unsigned integer which describes the
2606 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
2607 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2608 * waitfor value.
2609 *
2610 * Returns: VFS_RETURNED
2611 */
2612 static int
2613 sync_callback(mount_t mp, void *arg)
2614 {
2615 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2616 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2617 unsigned waitfor = MNT_NOWAIT;
2618
2619 if (arg) {
2620 waitfor = *(uint32_t*)arg;
2621 }
2622
2623 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
2624 if (waitfor != MNT_WAIT &&
2625 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2626 waitfor != MNT_NOWAIT &&
2627 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2628 waitfor != MNT_DWAIT &&
2629 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2630 panic("Passed inappropriate waitfor %u to "
2631 "sync_callback()", waitfor);
2632 }
2633
2634 mp->mnt_flag &= ~MNT_ASYNC;
2635 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2636 if (asyncflag) {
2637 mp->mnt_flag |= MNT_ASYNC;
2638 }
2639 }
2640
2641 return VFS_RETURNED;
2642 }
2643
2644 /* ARGSUSED */
2645 int
2646 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2647 {
2648 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2649
2650 if (print_vmpage_stat) {
2651 vm_countdirtypages();
2652 }
2653
2654 #if DIAGNOSTIC
2655 if (syncprt) {
2656 vfs_bufstats();
2657 }
2658 #endif /* DIAGNOSTIC */
2659 return 0;
2660 }
2661
/* Selects which class of mounts sync_internal_callback() should flush. */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* only local, non-virtual-device mounts */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* only virtual-device or non-local mounts */
} sync_type_t;
2667
2668 static int
2669 sync_internal_callback(mount_t mp, void *arg)
2670 {
2671 if (arg) {
2672 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2673 (mp->mnt_flag & MNT_LOCAL);
2674 sync_type_t sync_type = *((sync_type_t *)arg);
2675
2676 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2677 return VFS_RETURNED;
2678 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2679 return VFS_RETURNED;
2680 }
2681 }
2682
2683 (void)sync_callback(mp, NULL);
2684
2685 return VFS_RETURNED;
2686 }
2687
int sync_thread_state = 0;      /* SYNC_THREAD_* bits; protected by sync_mtx_lck */
int sync_timeout_seconds = 5;   /* how long sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN         0x0001  /* more work queued for the sync thread */
#define SYNC_THREAD_RUNNING     0x0002  /* a sync thread currently exists */
2693
/*
 * Body of the kernel thread started by sync_internal(). Loops while new
 * work keeps being posted (SYNC_THREAD_RUN), syncing reliable media first
 * and then unreliable media, then signals any waiter and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;

	lck_mtx_lock(sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(sync_mtx_lck);

		/* Reliable media first so the important data hits stable storage. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
	lck_mtx_unlock(sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2731
2732 struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2733
/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout_seconds: it posts work
 * to (and if necessary creates) the sync thread, then waits for either its
 * wakeup or the timeout. Always returns 0.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			/* couldn't start the worker; undo state and bail */
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP releases sync_mtx_lck; timeout bounds the wait */
	error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
2780
/*
 * Change filesystem quotas.
 *
 * Looks up the mount for uap->path, copies in whatever argument the quota
 * subcommand needs, dispatches to VFS_QUOTACTL(), then copies results back
 * out for the query subcommands.
 */
#if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* hold the mount, not the vnode, across the quota call */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = kalloc(MAXPATHLEN);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit user dqblk must be munged into the kernel layout */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* copy results back out / release per-command resources */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			kfree(datap, MAXPATHLEN);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

	mount_drop(mp, 0);
	return error;
}
#else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
#endif /* QUOTA */
2890
/*
 * Get filesystem statistics.
 *
 * Returns: 0 Success
 * namei:???
 * vfs_update_vfsstat:???
 * munge_statfs:EFAULT
 */
/* ARGSUSED */
int
statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;

	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error != 0) {
		return error;
	}
	/* keep the iocount on vp to pin mp while we read its statistics */
	vp = nd.ni_vp;
	mp = vp->v_mount;
	sp = &mp->mnt_vfsstat;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif

	/* refresh the cached statistics from the filesystem */
	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}

	/* copy out in the 32/64-bit layout the calling process expects */
	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
	vnode_put(vp);
	return error;
}
2939
/*
 * Get filesystem statistics for the filesystem containing the file
 * referenced by descriptor uap->fd.
 */
/* ARGSUSED */
int
fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* vnode is not attached to any mount (e.g. being recycled) */
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	sp = &mp->mnt_vfsstat;
	/* refresh the cached statistics before copying them out */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
2992
/*
 * Fill a zeroed struct statfs64 from mp's cached vfsstat. Does not refresh
 * the cache; callers run vfs_update_vfsstat() first if they need fresh data.
 */
void
vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
{
	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;

	bzero(sfs, sizeof(*sfs));

	sfs->f_bsize = vsfs->f_bsize;
	sfs->f_iosize = (int32_t)vsfs->f_iosize;
	sfs->f_blocks = vsfs->f_blocks;
	sfs->f_bfree = vsfs->f_bfree;
	sfs->f_bavail = vsfs->f_bavail;
	sfs->f_files = vsfs->f_files;
	sfs->f_ffree = vsfs->f_ffree;
	sfs->f_fsid = vsfs->f_fsid;
	sfs->f_owner = vsfs->f_owner;
	sfs->f_type = mp->mnt_vtable->vfc_typenum;
	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	sfs->f_fssubtype = vsfs->f_fssubtype;
	/* flag the root DATA volume (system mount that is neither swap nor the root fs) */
	sfs->f_flags_ext = ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS)) ? MNT_EXT_ROOT_DATA_VOL : 0;
	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
		/* mount requested a substitute fs type name */
		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
	} else {
		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
	}
	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
}
3021
/*
 * Get file system statistics in 64-bit mode
 */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata nd;
	struct statfs64 sfs;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;

	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(&nd);
	if (error != 0) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif

	/* refresh the cached statistics from the filesystem */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}

	vfs_get_statfs64(mp, &sfs);
	if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(&sfs, uap->buf, sizeof(sfs));
	vnode_put(vp);

	return error;
}
3070
/*
 * Get file system statistics in 64-bit mode
 * (fd-based variant of statfs64).
 */
int
fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
{
	struct vnode *vp;
	struct mount *mp;
	struct statfs64 sfs;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* vnode not attached to any mount */
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	/* refresh the cached statistics before snapshotting them */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	vfs_get_statfs64(mp, &sfs);
	if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(&sfs, uap->buf, sizeof(sfs));

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3127
/* Shared cursor for the getfsstat{,64} vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer position for the next statfs record */
	user_addr_t *mp;        /* optional array of user MAC-label buffers (or NULL) */
	int count;              /* number of mounts visited so far */
	int maxcount;           /* capacity of the user buffer, in records */
	int flags;              /* MNT_NOWAIT / MNT_WAIT / MNT_DWAIT from the caller */
	int error;              /* first error encountered, 0 if none */
};
3136
3137
/*
 * vfs_iterate() callback for getfsstat(): copy one mount's statfs record
 * (and, if requested, its MAC label) to the user buffer in fstp. Counts
 * every mount even once the buffer is full, so the caller can report the
 * total number of mounts.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 * Dead or unmounting mounts are skipped (cached data only).
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* advance by the size munge_statfs actually wrote */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3191
3192 /*
3193 * Get statistics on all filesystems.
3194 */
3195 int
3196 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3197 {
3198 struct __mac_getfsstat_args muap;
3199
3200 muap.buf = uap->buf;
3201 muap.bufsize = uap->bufsize;
3202 muap.mac = USER_ADDR_NULL;
3203 muap.macsize = 0;
3204 muap.flags = uap->flags;
3205
3206 return __mac_getfsstat(p, &muap, retval);
3207 }
3208
/*
 * __mac_getfsstat: Get MAC-related file system statistics
 *
 * Parameters: p (ignored)
 * uap User argument descriptor (see below)
 * retval Count of file system statistics (N stats)
 *
 * Indirect: uap->bufsize Buffer size
 * uap->macsize MAC info size
 * uap->buf Buffer where information will be returned
 * uap->mac MAC info
 * uap->flags File system flags
 *
 *
 * Returns: 0 Success
 * !0 Not success
 *
 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* reject absurd sizes before doing any arithmetic with them */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* record size depends on the caller's ABI */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* the MAC pointer array must have one entry per statfs slot */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			FREE(mp0, M_MACTEMP);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
		if (mp == NULL) {
			FREE(mp0, M_MACTEMP);
			return ENOMEM;
		}

		/* widen 32-bit user pointers; copy 64-bit ones through */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		FREE(mp0, M_MACTEMP);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = maxcount;


	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		FREE(mp, M_MACTEMP);
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* report at most maxcount when the buffer was too small */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3320
/*
 * vfs_iterate() callback for getfsstat64(): copy one mount's statfs64
 * record to the user buffer in fstp. Counts every mount even once the
 * buffer is full, so the caller can report the total number of mounts.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
3365
3366 /*
3367 * Get statistics on all file systems in 64 bit mode.
3368 */
3369 int
3370 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3371 {
3372 user_addr_t sfsp;
3373 int count, maxcount;
3374 struct getfsstat_struct fst;
3375
3376 maxcount = uap->bufsize / sizeof(struct statfs64);
3377
3378 sfsp = uap->buf;
3379 count = 0;
3380
3381 fst.sfsp = sfsp;
3382 fst.flags = uap->flags;
3383 fst.count = 0;
3384 fst.error = 0;
3385 fst.maxcount = maxcount;
3386
3387 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3388
3389 if (fst.error) {
3390 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3391 return fst.error;
3392 }
3393
3394 if (fst.sfsp && fst.count > fst.maxcount) {
3395 *retval = fst.maxcount;
3396 } else {
3397 *retval = fst.count;
3398 }
3399
3400 return 0;
3401 }
3402
3403 /*
3404 * gets the associated vnode with the file descriptor passed.
3405 * as input
3406 *
3407 * INPUT
3408 * ctx - vfs context of caller
3409 * fd - file descriptor for which vnode is required.
3410 * vpp - Pointer to pointer to vnode to be returned.
3411 *
3412 * The vnode is returned with an iocount so any vnode obtained
3413 * by this call needs a vnode_put
3414 *
3415 */
3416 int
3417 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3418 {
3419 int error;
3420 vnode_t vp;
3421 struct fileproc *fp;
3422 proc_t p = vfs_context_proc(ctx);
3423
3424 *vpp = NULLVP;
3425
3426 error = fp_getfvp(p, fd, &fp, &vp);
3427 if (error) {
3428 return error;
3429 }
3430
3431 error = vnode_getwithref(vp);
3432 if (error) {
3433 (void)fp_drop(p, fd, fp, 0);
3434 return error;
3435 }
3436
3437 (void)fp_drop(p, fd, fp, 0);
3438 *vpp = vp;
3439 return error;
3440 }
3441
3442 /*
3443 * Wrapper function around namei to start lookup from a directory
3444 * specified by a file descriptor ni_dirfd.
3445 *
3446 * In addition to all the errors returned by namei, this call can
 * return ENOTDIR if the file descriptor does not refer to a directory,
 * and EBADF if the file descriptor is not valid.
3449 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only consult dirfd for a fresh lookup of a relative path: a
	 * continued lookup (NAMEI_CONTLOOKUP) or a caller-supplied starting
	 * directory (USEDVP) already carries its own starting point.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: start the lookup at dirfd's vnode (with iocount). */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must refer to a directory (ENOTDIR otherwise). */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Hand the directory to namei() via USEDVP, then clear
			 * the flag so the caller's nameidata is not left
			 * pointing at the vnode we are about to release.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or continued lookup: plain namei(). */
	return namei(ndp);
}
3493
3494 /*
3495 * Change current working directory to a given file descriptor.
3496 */
3497 /* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;		/* previous cwd, released after the swap */
	struct mount *mp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The descriptor must name a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the target directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend through any stacked
	 * mounts so the cwd ends up at the root of the topmost file system.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Take a long-term usecount for the cwd, then drop our iocount. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			/* Mark the process as having at least one per-thread cwd. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/*
			 * NOTE(review): this path returns without file_drop(),
			 * which would leak the fd reference taken above;
			 * presumably unreachable since a syscall always runs on
			 * a thread — verify.
			 */
			vnode_rele(vp);
			return ENOENT;
		}
	} else {
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = vp;
		proc_fdunlock(p);
	}

	/* Release the previous cwd, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}
	file_drop(uap->fd);

	return 0;
out:
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
3607
3608 int
3609 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3610 {
3611 return common_fchdir(p, uap, 0);
3612 }
3613
3614 int
3615 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3616 {
3617 return common_fchdir(p, (void *)uap, 1);
3618 }
3619
3620
3621 /*
3622 * Change current working directory (".").
3623 *
3624 * Returns: 0 Success
3625 * change_dir:ENOTDIR
3626 * change_dir:???
3627 * vnode_ref:ENOENT No such file or directory
3628 */
3629 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	vnode_t tvp;		/* previous cwd, released after the swap */

	/* Resolve the path; on success ni_vp is a directory with an iocount. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert the transient iocount into a long-term usecount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			/* Mark the process as having at least one per-thread cwd. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
	}

	/* Release the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
3674
3675
3676 /*
3677 * Change current working directory (".").
3678 *
3679 * Returns: 0 Success
3680 * chdir_internal:ENOTDIR
3681 * chdir_internal:ENOENT No such file or directory
3682 * chdir_internal:???
3683 */
3684 /* ARGSUSED */
3685 static int
3686 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3687 {
3688 struct nameidata nd;
3689 vfs_context_t ctx = vfs_context_current();
3690
3691 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3692 UIO_USERSPACE, uap->path, ctx);
3693
3694 return chdir_internal(p, ctx, &nd, per_thread);
3695 }
3696
3697
3698 /*
3699 * chdir
3700 *
3701 * Change current working directory (".") for the entire process
3702 *
3703 * Parameters: p Process requesting the call
3704 * uap User argument descriptor (see below)
3705 * retval (ignored)
3706 *
3707 * Indirect parameters: uap->path Directory path
3708 *
3709 * Returns: 0 Success
3710 * common_chdir: ENOTDIR
3711 * common_chdir: ENOENT No such file or directory
3712 * common_chdir: ???
3713 *
3714 */
3715 int
3716 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3717 {
3718 return common_chdir(p, (void *)uap, 0);
3719 }
3720
3721 /*
3722 * __pthread_chdir
3723 *
3724 * Change current working directory (".") for a single thread
3725 *
3726 * Parameters: p Process requesting the call
3727 * uap User argument descriptor (see below)
3728 * retval (ignored)
3729 *
3730 * Indirect parameters: uap->path Directory path
3731 *
3732 * Returns: 0 Success
3733 * common_chdir: ENOTDIR
3734 * common_chdir: ENOENT No such file or directory
3735 * common_chdir: ???
3736 *
3737 */
3738 int
3739 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3740 {
3741 return common_chdir(p, (void *)uap, 1);
3742 }
3743
3744
3745 /*
3746 * Change notion of root (``/'') directory.
3747 */
3748 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;		/* previous root, released after the swap */
	vfs_context_t ctx = vfs_context_current();

	/* Changing the root directory is restricted to the superuser. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* On success ni_vp is an authorized directory with an iocount. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Swap the iocount from change_dir for a long-term usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/* Install the new root under the fd lock and flag the chroot. */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdp->fd_flags |= FD_CHROOT;
	proc_fdunlock(p);

	/* Release the previous root, if one was set. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
3796
3797 /*
3798 * Common routine for chroot and chdir.
3799 *
3800 * Returns: 0 Success
3801 * ENOTDIR Not a directory
3802 * namei:??? [anything namei can return]
3803 * vnode_authorize:??? [anything vnode_authorize can return]
3804 */
3805 static int
3806 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3807 {
3808 vnode_t vp;
3809 int error;
3810
3811 if ((error = namei(ndp))) {
3812 return error;
3813 }
3814 nameidone(ndp);
3815 vp = ndp->ni_vp;
3816
3817 if (vp->v_type != VDIR) {
3818 vnode_put(vp);
3819 return ENOTDIR;
3820 }
3821
3822 #if CONFIG_MACF
3823 error = mac_vnode_check_chdir(ctx, vp);
3824 if (error) {
3825 vnode_put(vp);
3826 return error;
3827 }
3828 #endif
3829
3830 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3831 if (error) {
3832 vnode_put(vp);
3833 return error;
3834 }
3835
3836 return error;
3837 }
3838
/*
 * Allocate the per-file-descriptor vnode data used for directories.
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
	    M_FD_VN_DATA, M_WAITOK | M_ZERO);
	/* M_WAITOK allocations block rather than fail, so no NULL check here. */
	lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
	return fvdata;
}
3853
3854 /*
3855 * Free the vnode data (for directories) associated with the file glob.
3856 */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/* Release the cached directory-read buffer, if one was allocated. */
	if (fvdata->fv_buf) {
		FREE(fvdata->fv_buf, M_FD_DIRBUF);
	}
	lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
	FREE(fvdata, M_FD_VN_DATA);
}
3868
3869 /*
3870 * Check permissions, allocate an open file structure,
3871 * and call the device open routine if any.
3872 *
3873 * Returns: 0 Success
3874 * EINVAL
3875 * EINTR
3876 * falloc:ENFILE
3877 * falloc:EMFILE
3878 * falloc:ENOMEM
3879 * vn_open_auth:???
3880 * dupfdopen:???
3881 * VNOP_ADVLOCK:???
3882 * vnode_setsize:???
3883 *
3884 * XXX Need to implement uid, gid
3885 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
    int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct flock lf;
	struct vfs_context context;

	oflags = uflags;

	/* O_ACCMODE fully set (O_RDONLY|O_WRONLY|O_RDWR together) is invalid. */
	if ((oflags & O_ACCMODE) == O_ACCMODE) {
		return EINVAL;
	}

	/* Convert open(2) flags to kernel fflags; callers may not request
	 * the encryption flags directly, they are set by vn_open_auth. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot and fileproc up front. */
	if ((error = falloc_withalloc(p,
	    &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
		return error;
	}
	/* Sentinel used by fdopen() (/dev/fd) to find the reserved slot. */
	uu->uu_dupfd = -indx - 1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {	/* XXX from fdopen */
			/* /dev/fd open: dup the existing descriptor instead. */
			if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
				fp_drop(p, indx, NULL, 0);
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the vnode into the fileglob; vp holds an iocount from vn_open_auth. */
	fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->f_fglob->fg_ops = &vnops;
	fp->f_fglob->fg_data = (caddr_t)vp;

	/* BSD-style whole-file advisory lock requested at open time. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->f_fglob->fg_flag |= FHASLOCK;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->f_fglob->fg_vn_data = NULL;
	}

	/* Drop our iocount; NOTE(review): vp is still dereferenced below for
	 * the tty check — presumably safe because the fileglob keeps the
	 * vnode referenced after vn_open_auth — verify. */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor: set per-fd flags, then release the slot. */
	proc_fdlock(p);
	if (flags & O_CLOEXEC) {
		*fdflags(p, indx) |= UF_EXCLOSE;
	}
	if (flags & O_CLOFORK) {
		*fdflags(p, indx) |= UF_FORKCLOSE;
	}
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/* Tune the file's eligibility for the secluded page pool. */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->f_fglob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
#if __arm64__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
#elif __arm__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
#else
/* not implemented... */
#endif
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) ||
			    !strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Failure after vn_open_auth: unlock, close, and free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->f_fglob->fg_cred;

	if ((fp->f_fglob->fg_flag & FHASLOCK) &&
	    (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->f_fglob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4103
4104 /*
4105 * While most of the *at syscall handlers can call nameiat() which
4106 * is a wrapper around namei, the use of namei and initialisation
4107 * of nameidata are far removed and in different functions - namei
4108 * gets called in vn_open_auth for open1. So we'll just do here what
4109 * nameiat() does.
4110 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
    int dirfd)
{
	/*
	 * Mirror nameiat(): when a dirfd is supplied and the path is
	 * relative, seed the lookup with dirfd's vnode via USEDVP.
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: start the lookup at dirfd's vnode. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must refer to a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
			    retval);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: plain open1(). */
	return open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval);
}
4154
4155 /*
4156 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4157 *
4158 * Parameters: p Process requesting the open
4159 * uap User argument descriptor (see below)
4160 * retval Pointer to an area to receive the
 * return value from the system call
4162 *
4163 * Indirect: uap->path Path to open (same as 'open')
4164 * uap->flags Flags to open (same as 'open'
4165 * uap->uid UID to set, if creating
4166 * uap->gid GID to set, if creating
4167 * uap->mode File mode, if creating (same as 'open')
4168 * uap->xsecurity ACL to set, if creating
4169 *
4170 * Returns: 0 Success
4171 * !0 errno value
4172 *
4173 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4174 *
 * XXX: We should enumerate the possible errno values here, and where
4176 * in the code they originated.
4177 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	xsecdst = NULL;
	/* Copy in the caller-supplied ACL, if any (kept in host byte order). */
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the process umask and strip the sticky bit from the create mode. */
	cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    fileproc_alloc_init, NULL, retval);
	/* open1 copies what it needs from va; the filesec can go now. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4220
4221 /*
4222 * Go through the data-protected atomically controlled open (2)
4223 *
4224 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4225 */
4226 int
4227 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4228 {
4229 int flags = uap->flags;
4230 int class = uap->class;
4231 int dpflags = uap->dpflags;
4232
4233 /*
4234 * Follow the same path as normal open(2)
4235 * Look up the item if it exists, and acquire the vnode.
4236 */
4237 struct filedesc *fdp = p->p_fd;
4238 struct vnode_attr va;
4239 struct nameidata nd;
4240 int cmode;
4241 int error;
4242
4243 VATTR_INIT(&va);
4244 /* Mask off all but regular access permissions */
4245 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4246 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4247
4248 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4249 uap->path, vfs_context_current());
4250
4251 /*
4252 * Initialize the extra fields in vnode_attr to pass down our
4253 * extra fields.
4254 * 1. target cprotect class.
4255 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4256 */
4257 if (flags & O_CREAT) {
4258 /* lower level kernel code validates that the class is valid before applying it. */
4259 if (class != PROTECTION_CLASS_DEFAULT) {
4260 /*
4261 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4262 * file behave the same as open (2)
4263 */
4264 VATTR_SET(&va, va_dataprotect_class, class);
4265 }
4266 }
4267
4268 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4269 if (flags & (O_RDWR | O_WRONLY)) {
4270 /* Not allowed to write raw encrypted bytes */
4271 return EINVAL;
4272 }
4273 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4274 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4275 }
4276 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4277 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4278 }
4279 }
4280
4281 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4282 fileproc_alloc_init, NULL, retval);
4283
4284 return error;
4285 }
4286
4287 static int
4288 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4289 int fd, enum uio_seg segflg, int *retval)
4290 {
4291 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
4292 struct vnode_attr va;
4293 struct nameidata nd;
4294 int cmode;
4295
4296 VATTR_INIT(&va);
4297 /* Mask off all but regular access permissions */
4298 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4299 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4300
4301 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4302 segflg, path, ctx);
4303
4304 return open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
4305 retval, fd);
4306 }
4307
4308 int
4309 open(proc_t p, struct open_args *uap, int32_t *retval)
4310 {
4311 __pthread_testcancel(1);
4312 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4313 }
4314
4315 int
4316 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4317 int32_t *retval)
4318 {
4319 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4320 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4321 }
4322
4323 int
4324 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4325 int32_t *retval)
4326 {
4327 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4328 uap->mode, uap->fd, UIO_USERSPACE, retval);
4329 }
4330
4331 int
4332 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4333 {
4334 __pthread_testcancel(1);
4335 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4336 }
4337
4338 /*
4339 * openbyid_np: open a file given a file system id and a file system object id
4340 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
4341 * file systems that don't support object ids it is a node id (uint64_t).
4342 *
4343 * Parameters: p Process requesting the open
4344 * uap User argument descriptor (see below)
4345 * retval Pointer to an area to receive the
 * return value from the system call
4347 *
4348 * Indirect: uap->path Path to open (same as 'open')
4349 *
4350 * uap->fsid id of target file system
4351 * uap->objid id of target file system object
4352 * uap->flags Flags to open (same as 'open')
4353 *
4354 * Returns: 0 Success
4355 * !0 errno value
4356 *
4357 *
 * XXX: We should enumerate the possible errno values here, and where
4359 * in the code they originated.
4360 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Opening by <fsid, objid> requires a dedicated privilege. */
	if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
		return error;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve a path from <fsid, objid>, growing the buffer by
	 * MAXPATHLEN on each ENOSPC until the full path fits.
	 * NOTE(review): the retry loop has no upper bound on buflen;
	 * presumably fsgetpath_internal caps path lengths — verify.
	 */
	do {
		MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			FREE(buf, M_TEMP);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate the resolved path. */
	buf[pathlen] = 0;

	/* Open the resolved path as a kernel-space string. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	FREE(buf, M_TEMP);

	return error;
}
4417
4418
4419 /*
4420 * Create a special file.
4421 */
4422 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4423
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, uap->path, &va);
	}

	AUDIT_ARG(mode, uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* Creating device nodes is restricted to the superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only char and block devices are valid here (FIFOs handled above). */
	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif

	/* The caller needs add-file permission in the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4526
4527 /*
4528 * Create a named pipe.
4529 *
4530 * Returns: 0 Success
4531 * EEXIST
4532 * namei:???
4533 * vnode_authorize:???
4534 * vn_create:???
4535 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the path, keeping the parent locked for the create. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	/* On success vp is returned with an iocount, released below. */
	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4578
4579
4580 /*
4581 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4582 *
4583 * Parameters: p Process requesting the open
4584 * uap User argument descriptor (see below)
4585 * retval (Ignored)
4586 *
4587 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4588 * uap->uid UID to set
4589 * uap->gid GID to set
4590 * uap->mode File mode to set (same as 'mkfifo')
4591 * uap->xsecurity ACL to set, if creating
4592 *
4593 * Returns: 0 Success
4594 * !0 errno value
4595 *
4596 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4597 *
 * XXX: We should enumerate the possible errno values here, and where
4599 * in the code they originated.
4600 */
4601 int
4602 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4603 {
4604 int ciferror;
4605 kauth_filesec_t xsecdst;
4606 struct vnode_attr va;
4607
4608 AUDIT_ARG(owner, uap->uid, uap->gid);
4609
4610 xsecdst = KAUTH_FILESEC_NONE;
4611 if (uap->xsecurity != USER_ADDR_NULL) {
4612 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
4613 return ciferror;
4614 }
4615 }
4616
4617 VATTR_INIT(&va);
4618 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4619 if (uap->uid != KAUTH_UID_NONE) {
4620 VATTR_SET(&va, va_uid, uap->uid);
4621 }
4622 if (uap->gid != KAUTH_GID_NONE) {
4623 VATTR_SET(&va, va_gid, uap->gid);
4624 }
4625 if (xsecdst != KAUTH_FILESEC_NONE) {
4626 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4627 }
4628
4629 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4630
4631 if (xsecdst != KAUTH_FILESEC_NONE) {
4632 kauth_filesec_free(xsecdst);
4633 }
4634 return ciferror;
4635 }
4636
4637 /* ARGSUSED */
4638 int
4639 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4640 {
4641 struct vnode_attr va;
4642
4643 VATTR_INIT(&va);
4644 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4645
4646 return mkfifo1(vfs_context_current(), uap->path, &va);
4647 }
4648
4649
/*
 * Return a pointer to the last occurrence of 'ch' in the NUL-terminated
 * string 'p', or NULL if it does not occur.  Like strrchr(), searching
 * for '\0' finds the terminator itself.
 */
static char *
my_strrchr(char *p, int ch)
{
	char *last = NULL;

	do {
		if (*p == ch) {
			last = p;
		}
	} while (*p++);

	return last;
}
4665
4666 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4667 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4668 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4669
/*
 * Build the path to dvp (with leafname appended, when non-NULL) into the
 * caller-supplied buffer 'path' of size _len.  Returns the length of the
 * resulting string including its NUL.  *truncated_path is set to 1 whenever
 * the returned string does not fully identify the object (truncation, or a
 * fallback to an ancestor/mount-point path after a lookup failure).
 * 'firmlink' selects vn_getpath() vs. vn_getpath_no_firmlink().
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	/* on success, len counts the directory string including its NUL */
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* overwrite the NUL with '/', then append the leaf after it */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = my_strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* directory path alone (nearly) fills the buffer: no room for a leaf */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the parent chain retrying until some ancestor's path
		 * fits, falling back to the mount point, and finally to "/".
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
4737
/* safe_getpath_new() with firmlink-crossing paths allowed (vn_getpath). */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
4743
/* safe_getpath_new() restricted to non-firmlink paths (vn_getpath_no_firmlink). */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
4749
4750 /*
4751 * Make a hard file link.
4752 *
4753 * Returns: 0 Success
4754 * EPERM
4755 * EEXIST
4756 * EXDEV
4757 * namei:???
4758 * vnode_authorize:???
4759 * VNOP_LINK:???
4760 */
/* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	int truncated = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		if (error == EPERM) {
			/* NOTE(review): the "XXX 54841485" printfs throughout this
			 * function appear to be temporary tracing for a bug report;
			 * confirm before relying on them. */
			printf("XXX 54841485: nameiat() src EPERM\n");
		}
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			printf("XXX 54841485: VDIR EPERM\n");
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: nameiat() dst EPERM\n");
		}
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: mac_vnode_check_link() EPERM\n");
		}
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: vnode_authorize() LINKTARGET EPERM\n");
		}
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: vnode_authorize() ADD_FILE EPERM\n");
		}
		goto out2;
	}

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		if (error == EPERM) {
			printf("XXX 54841485: VNOP_LINK() EPERM\n");
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* path strings are only needed for fsevents, fileop listeners or audit */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;

		/* build the path to the new link file */
		GET_PATH(target_path);
		if (target_path == NULL) {
			error = ENOMEM;
			goto out2;
		}

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);
			if (link_to_path == NULL) {
				error = ENOMEM;
				goto out2;
			}

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len, target_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on pvp in this case
			if (pvp && pvp != dvp) {
				error = vnode_get(pvp);
				if (error) {
					/* best-effort event: drop pvp, don't fail the link */
					pvp = NULLVP;
					error = 0;
				}
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	/* vp always holds an iocount here: all failures before it was set return directly */
	vnode_put(vp);
	return error;
}
4991
4992 int
4993 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4994 {
4995 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4996 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
4997 }
4998
4999 int
5000 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5001 {
5002 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5003 return EINVAL;
5004 }
5005
5006 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5007 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5008 }
5009
5010 /*
5011 * Make a symbolic link.
5012 *
5013 * We could add support for ACLs here too...
5014 */
/* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* copy the symlink *contents* into a kernel buffer when they are in user space */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	/* look up (and lock) the parent directory of the symlink being created */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* new symlinks get ACCESSPERMS filtered through the process umask */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* the target name must not already exist */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}
	if (error == 0) {
		/* note: the filesystem may or may not return a vnode in vp */
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/* redrive the lookup to obtain the vnode the FS just created */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			nd.ni_cnd.cn_flags = 0;
			error = nameiat(&nd, fd);
			vp = nd.ni_vp;

			if (vp == NULL) {
				goto skipit;
			}
		}

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* free the link-contents buffer only if we allocated it above */
	if (path && (path != (char *)path_data)) {
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
	}

	return error;
}
5171
5172 int
5173 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5174 {
5175 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5176 uap->link, UIO_USERSPACE);
5177 }
5178
5179 int
5180 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5181 __unused int32_t *retval)
5182 {
5183 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5184 uap->path2, UIO_USERSPACE);
5185 }
5186
5187 /*
5188 * Delete a whiteout from the filesystem.
5189 * No longer supported.
5190 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;  /* whiteout removal support has been removed entirely */
}
5196
5197 /*
5198 * Delete a name from the filesystem.
5199 */
/* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;               /* full (possibly firmlink) path, for fileop listeners */
	char *no_firmlink_path = NULL;   /* non-firmlink path, for fsevents */
	int len_path = 0;
	int len_no_firmlink_path = 0;
#if CONFIG_FSE
	fse_info finfo;
	struct vnode_attr va;
#endif
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;                     /* nonzero when the FS supports compound remove */
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

retry:
	/* per-attempt state: reset on every redrive of the whole operation */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	nd.ni_dvp = start_dvp;
	nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &nd.ni_cnd;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	dvp = nd.ni_dvp;
	vp = nd.ni_vp;


	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* only the kernel itself may unlink a swap file */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/* ENOENT here can be a stale cache hit; retry a bounded number of times */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* no vp: the FS will perform lookup+remove as one compound VNOP */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, &finfo, ctx);
			}
		} else {
			error = vfs_get_notify_attributes(&va);
			if (error) {
				goto out;
			}

			vap = &va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
			if (path == NULL) {
				error = ENOMEM;
				goto out;
			}
		}
		/* NOTE(review): len_path is computed but not consumed below */
		len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
			if (no_firmlink_path == NULL) {
				error = ENOMEM;
				goto out;
			}
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
		error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
		vp = nd.ni_vp;
		if (error == EKEEPLOOKING) {
			/* compound remove needs another pass through the lookup */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid. this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, &finfo, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, &finfo, vap);
			}
			if (truncated_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, &finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

	return error;
}
5471
/*
 * Kernel-internal unlink entry point: remove path_arg, resolving a
 * relative path against start_dvp when one is supplied (start_dvp
 * trumps the fd — see unlinkat_internal).
 */
int
unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
    enum uio_seg segflg, int unlink_flags)
{
	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
	    unlink_flags);
}
5479
5480 /*
5481 * Delete a name from the filesystem using Carbon semantics.
5482 */
5483 int
5484 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5485 {
5486 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5487 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5488 }
5489
5490 /*
5491 * Delete a name from the filesystem using POSIX semantics.
5492 */
5493 int
5494 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5495 {
5496 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5497 uap->path, UIO_USERSPACE, 0);
5498 }
5499
5500 int
5501 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5502 {
5503 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5504 return EINVAL;
5505 }
5506
5507 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5508 int unlink_flags = 0;
5509
5510 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5511 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5512 }
5513 return rmdirat_internal(vfs_context_current(), uap->fd,
5514 uap->path, UIO_USERSPACE, unlink_flags);
5515 } else {
5516 return unlinkat_internal(vfs_context_current(), uap->fd,
5517 NULLVP, uap->path, UIO_USERSPACE, 0);
5518 }
5519 }
5520
5521 /*
5522 * Reposition read/write file offset.
5523 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* non-vnode file types come back ENOTSUP; lseek reports ESPIPE */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	switch (uap->whence) {
	case L_INCR:            /* SEEK_CUR: relative to the current offset */
		offset += fp->f_fglob->fg_offset;
		break;
	case L_XTND:            /* SEEK_END: relative to the file size */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:             /* SEEK_SET: offset used as-is */
		break;
	case SEEK_HOLE:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->f_fglob->fg_offset = offset;
				*retval = fp->f_fglob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read." Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
5615
5616
5617 /*
5618 * Check access permissions.
5619 *
5620 * Returns: 0 Success
5621 * vnode_authorize:???
5622 */
5623 static int
5624 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5625 {
5626 kauth_action_t action;
5627 int error;
5628
5629 /*
5630 * If just the regular access bits, convert them to something
5631 * that vnode_authorize will understand.
5632 */
5633 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5634 action = 0;
5635 if (uflags & R_OK) {
5636 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5637 }
5638 if (uflags & W_OK) {
5639 if (vnode_isdir(vp)) {
5640 action |= KAUTH_VNODE_ADD_FILE |
5641 KAUTH_VNODE_ADD_SUBDIRECTORY;
5642 /* might want delete rights here too */
5643 } else {
5644 action |= KAUTH_VNODE_WRITE_DATA;
5645 }
5646 }
5647 if (uflags & X_OK) {
5648 if (vnode_isdir(vp)) {
5649 action |= KAUTH_VNODE_SEARCH;
5650 } else {
5651 action |= KAUTH_VNODE_EXECUTE;
5652 }
5653 }
5654 } else {
5655 /* take advantage of definition of uflags */
5656 action = uflags >> 8;
5657 }
5658
5659 #if CONFIG_MACF
5660 error = mac_vnode_check_access(ctx, vp, uflags);
5661 if (error) {
5662 return error;
5663 }
5664 #endif /* MAC */
5665
5666 /* action == 0 means only check for existence */
5667 if (action != 0) {
5668 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5669 } else {
5670 error = 0;
5671 }
5672
5673 return error;
5674 }
5675
5676
5677
5678 /*
5679 * access_extended: Check access permissions in bulk.
5680 *
5681 * Description: uap->entries Pointer to an array of accessx
5682 * descriptor structs, plus one or
5683 * more NULL terminated strings (see
5684 * "Notes" section below).
5685 * uap->size Size of the area pointed to by
5686 * uap->entries.
5687 * uap->results Pointer to the results array.
5688 *
5689 * Returns: 0 Success
5690 * ENOMEM Insufficient memory
5691 * EINVAL Invalid arguments
5692 * namei:EFAULT Bad address
5693 * namei:ENAMETOOLONG Filename too long
5694 * namei:ENOENT No such file or directory
5695 * namei:ELOOP Too many levels of symbolic links
5696 * namei:EBADF Bad file descriptor
5697 * namei:ENOTDIR Not a directory
5698 * namei:???
5699 * access1:
5700 *
5701 * Implicit returns:
5702 * uap->results Array contents modified
5703 *
5704 * Notes: The uap->entries are structured as an arbitrary length array
5705 * of accessx descriptors, followed by one or more NULL terminated
5706 * strings
5707 *
5708 * struct accessx_descriptor[0]
5709 * ...
5710 * struct accessx_descriptor[n]
5711 * char name_data[0];
5712 *
5713 * We determine the entry count by walking the buffer containing
5714 * the uap->entries argument descriptor. For each descriptor we
5715 * see, the valid values for the offset ad_name_offset will be
5716 * in the byte range:
5717 *
5718 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5719 * to
5720 * [ uap->entries + uap->size - 2 ]
5721 *
5722 * since we must have at least one string, and the string must
5723 * be at least one character plus the NULL terminator in length.
5724 *
5725 * XXX: Need to support the check-as uid argument
5726 */
5727 int
5728 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5729 {
5730 struct accessx_descriptor *input = NULL;
5731 errno_t *result = NULL;
5732 errno_t error = 0;
5733 int wantdelete = 0;
5734 unsigned int desc_max, desc_actual, i, j;
5735 struct vfs_context context;
5736 struct nameidata nd;
5737 int niopts;
5738 vnode_t vp = NULL;
5739 vnode_t dvp = NULL;
5740 #define ACCESSX_MAX_DESCR_ON_STACK 10
5741 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5742
5743 context.vc_ucred = NULL;
5744
5745 /*
5746 * Validate parameters; if valid, copy the descriptor array and string
5747 * arguments into local memory. Before proceeding, the following
5748 * conditions must have been met:
5749 *
5750 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5751 * o There must be sufficient room in the request for at least one
 * descriptor and a one byte NUL terminated string.
5753 * o The allocation of local storage must not fail.
5754 */
5755 if (uap->size > ACCESSX_MAX_TABLESIZE) {
5756 return ENOMEM;
5757 }
5758 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
5759 return EINVAL;
5760 }
5761 if (uap->size <= sizeof(stack_input)) {
5762 input = stack_input;
5763 } else {
5764 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5765 if (input == NULL) {
5766 error = ENOMEM;
5767 goto out;
5768 }
5769 }
5770 error = copyin(uap->entries, input, uap->size);
5771 if (error) {
5772 goto out;
5773 }
5774
5775 AUDIT_ARG(opaque, input, uap->size);
5776
5777 /*
 * Force NUL termination of the copyin buffer to avoid namei() running
5779 * off the end. If the caller passes us bogus data, they may get a
5780 * bogus result.
5781 */
5782 ((char *)input)[uap->size - 1] = 0;
5783
5784 /*
5785 * Access is defined as checking against the process' real identity,
5786 * even if operations are checking the effective identity. This
5787 * requires that we use a local vfs context.
5788 */
5789 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5790 context.vc_thread = current_thread();
5791
5792 /*
5793 * Find out how many entries we have, so we can allocate the result
5794 * array by walking the list and adjusting the count downward by the
5795 * earliest string offset we see.
5796 */
5797 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5798 desc_actual = desc_max;
5799 for (i = 0; i < desc_actual; i++) {
5800 /*
5801 * Take the offset to the name string for this entry and
5802 * convert to an input array index, which would be one off
5803 * the end of the array if this entry was the lowest-addressed
5804 * name string.
5805 */
5806 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5807
5808 /*
5809 * An offset greater than the max allowable offset is an error.
5810 * It is also an error for any valid entry to point
5811 * to a location prior to the end of the current entry, if
5812 * it's not a reference to the string of the previous entry.
5813 */
5814 if (j > desc_max || (j != 0 && j <= i)) {
5815 error = EINVAL;
5816 goto out;
5817 }
5818
5819 /* Also do not let ad_name_offset point to something beyond the size of the input */
5820 if (input[i].ad_name_offset >= uap->size) {
5821 error = EINVAL;
5822 goto out;
5823 }
5824
5825 /*
5826 * An offset of 0 means use the previous descriptor's offset;
5827 * this is used to chain multiple requests for the same file
5828 * to avoid multiple lookups.
5829 */
5830 if (j == 0) {
5831 /* This is not valid for the first entry */
5832 if (i == 0) {
5833 error = EINVAL;
5834 goto out;
5835 }
5836 continue;
5837 }
5838
5839 /*
5840 * If the offset of the string for this descriptor is before
5841 * what we believe is the current actual last descriptor,
5842 * then we need to adjust our estimate downward; this permits
5843 * the string table following the last descriptor to be out
5844 * of order relative to the descriptor list.
5845 */
5846 if (j < desc_actual) {
5847 desc_actual = j;
5848 }
5849 }
5850
5851 /*
5852 * We limit the actual number of descriptors we are willing to process
5853 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
5854 * requested does not exceed this limit,
5855 */
5856 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5857 error = ENOMEM;
5858 goto out;
5859 }
5860 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
5861 if (result == NULL) {
5862 error = ENOMEM;
5863 goto out;
5864 }
5865
5866 /*
5867 * Do the work by iterating over the descriptor entries we know to
5868 * at least appear to contain valid data.
5869 */
5870 error = 0;
5871 for (i = 0; i < desc_actual; i++) {
5872 /*
5873 * If the ad_name_offset is 0, then we use the previous
5874 * results to make the check; otherwise, we are looking up
5875 * a new file name.
5876 */
5877 if (input[i].ad_name_offset != 0) {
5878 /* discard old vnodes */
5879 if (vp) {
5880 vnode_put(vp);
5881 vp = NULL;
5882 }
5883 if (dvp) {
5884 vnode_put(dvp);
5885 dvp = NULL;
5886 }
5887
5888 /*
5889 * Scan forward in the descriptor list to see if we
5890 * need the parent vnode. We will need it if we are
5891 * deleting, since we must have rights to remove
5892 * entries in the parent directory, as well as the
5893 * rights to delete the object itself.
5894 */
5895 wantdelete = input[i].ad_flags & _DELETE_OK;
5896 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
5897 if (input[j].ad_flags & _DELETE_OK) {
5898 wantdelete = 1;
5899 }
5900 }
5901
5902 niopts = FOLLOW | AUDITVNPATH1;
5903
5904 /* need parent for vnode_authorize for deletion test */
5905 if (wantdelete) {
5906 niopts |= WANTPARENT;
5907 }
5908
5909 /* do the lookup */
5910 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5911 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5912 &context);
5913 error = namei(&nd);
5914 if (!error) {
5915 vp = nd.ni_vp;
5916 if (wantdelete) {
5917 dvp = nd.ni_dvp;
5918 }
5919 }
5920 nameidone(&nd);
5921 }
5922
5923 /*
5924 * Handle lookup errors.
5925 */
5926 switch (error) {
5927 case ENOENT:
5928 case EACCES:
5929 case EPERM:
5930 case ENOTDIR:
5931 result[i] = error;
5932 break;
5933 case 0:
5934 /* run this access check */
5935 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5936 break;
5937 default:
5938 /* fatal lookup error */
5939
5940 goto out;
5941 }
5942 }
5943
5944 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5945
5946 /* copy out results */
5947 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5948
5949 out:
5950 if (input && input != stack_input) {
5951 FREE(input, M_TEMP);
5952 }
5953 if (result) {
5954 FREE(result, M_TEMP);
5955 }
5956 if (vp) {
5957 vnode_put(vp);
5958 }
5959 if (dvp) {
5960 vnode_put(dvp);
5961 }
5962 if (IS_VALID_CRED(context.vc_ucred)) {
5963 kauth_cred_unref(&context.vc_ucred);
5964 }
5965 return error;
5966 }
5967
5968
5969 /*
5970 * Returns: 0 Success
5971 * namei:EFAULT Bad address
5972 * namei:ENAMETOOLONG Filename too long
5973 * namei:ENOENT No such file or directory
5974 * namei:ELOOP Too many levels of symbolic links
5975 * namei:EBADF Bad file descriptor
5976 * namei:ENOTDIR Not a directory
5977 * namei:???
5978 * access1:
5979 */
/*
 * Common implementation for access(2)/faccessat(2): resolve `path'
 * relative to `fd' and check it against the access bits in `amode'.
 * The AT_EACCESS and AT_SYMLINK_NOFOLLOW bits of `flag' are honored.
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;	/* local context; may carry a substitute credential */
#if NAMEDRSRCFORK
	int is_namedstream = 0;		/* set when the target is a shadow stream file */
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a reference; must be released at "out" below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* borrow the caller's credential; no reference taken */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* the actual permission check, using the (possibly real-id) credential */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* ni_dvp only holds an iocount when WANTPARENT was requested above */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* drop the reference taken by kauth_cred_copy_real(), if any */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6058
6059 int
6060 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6061 {
6062 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6063 uap->path, uap->flags, 0, UIO_USERSPACE);
6064 }
6065
6066 int
6067 faccessat(__unused proc_t p, struct faccessat_args *uap,
6068 __unused int32_t *retval)
6069 {
6070 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
6071 return EINVAL;
6072 }
6073
6074 return faccessat_internal(vfs_context_current(), uap->fd,
6075 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6076 }
6077
6078 /*
6079 * Returns: 0 Success
6080 * EFAULT
6081 * copyout:EFAULT
6082 * namei:???
6083 * vn_stat:???
6084 */
6085 static int
6086 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6087 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6088 enum uio_seg segflg, int fd, int flag)
6089 {
6090 struct nameidata nd;
6091 int follow;
6092 union {
6093 struct stat sb;
6094 struct stat64 sb64;
6095 } source = {};
6096 union {
6097 struct user64_stat user64_sb;
6098 struct user32_stat user32_sb;
6099 struct user64_stat64 user64_sb64;
6100 struct user32_stat64 user32_sb64;
6101 } dest = {};
6102 caddr_t sbp;
6103 int error, my_size;
6104 kauth_filesec_t fsec;
6105 size_t xsecurity_bufsize;
6106 void * statptr;
6107 struct fileproc *fp = NULL;
6108 int needsrealdev = 0;
6109
6110 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6111 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6112 segflg, path, ctx);
6113
6114 #if NAMEDRSRCFORK
6115 int is_namedstream = 0;
6116 /* stat calls are allowed for resource forks. */
6117 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6118 #endif
6119
6120 if (flag & AT_FDONLY) {
6121 vnode_t fvp;
6122
6123 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6124 if (error) {
6125 return error;
6126 }
6127 if ((error = vnode_getwithref(fvp))) {
6128 file_drop(fd);
6129 return error;
6130 }
6131 nd.ni_vp = fvp;
6132 } else {
6133 error = nameiat(&nd, fd);
6134 if (error) {
6135 return error;
6136 }
6137 }
6138 fsec = KAUTH_FILESEC_NONE;
6139
6140 statptr = (void *)&source;
6141
6142 #if NAMEDRSRCFORK
6143 /* Grab reference on the shadow stream file vnode to
6144 * force an inactive on release which will mark it
6145 * for recycle.
6146 */
6147 if (vnode_isnamedstream(nd.ni_vp) &&
6148 (nd.ni_vp->v_parent != NULLVP) &&
6149 vnode_isshadow(nd.ni_vp)) {
6150 is_namedstream = 1;
6151 vnode_ref(nd.ni_vp);
6152 }
6153 #endif
6154
6155 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6156 if (fp && (xsecurity == USER_ADDR_NULL)) {
6157 /*
6158 * If the caller has the file open, and is not
6159 * requesting extended security information, we are
6160 * going to let them get the basic stat information.
6161 */
6162 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6163 fp->f_fglob->fg_cred);
6164 } else {
6165 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6166 isstat64, needsrealdev, ctx);
6167 }
6168
6169 #if NAMEDRSRCFORK
6170 if (is_namedstream) {
6171 vnode_rele(nd.ni_vp);
6172 }
6173 #endif
6174 vnode_put(nd.ni_vp);
6175 nameidone(&nd);
6176 if (fp) {
6177 file_drop(fd);
6178 fp = NULL;
6179 }
6180
6181 if (error) {
6182 return error;
6183 }
6184 /* Zap spare fields */
6185 if (isstat64 != 0) {
6186 source.sb64.st_lspare = 0;
6187 source.sb64.st_qspare[0] = 0LL;
6188 source.sb64.st_qspare[1] = 0LL;
6189 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6190 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6191 my_size = sizeof(dest.user64_sb64);
6192 sbp = (caddr_t)&dest.user64_sb64;
6193 } else {
6194 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6195 my_size = sizeof(dest.user32_sb64);
6196 sbp = (caddr_t)&dest.user32_sb64;
6197 }
6198 /*
6199 * Check if we raced (post lookup) against the last unlink of a file.
6200 */
6201 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6202 source.sb64.st_nlink = 1;
6203 }
6204 } else {
6205 source.sb.st_lspare = 0;
6206 source.sb.st_qspare[0] = 0LL;
6207 source.sb.st_qspare[1] = 0LL;
6208 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6209 munge_user64_stat(&source.sb, &dest.user64_sb);
6210 my_size = sizeof(dest.user64_sb);
6211 sbp = (caddr_t)&dest.user64_sb;
6212 } else {
6213 munge_user32_stat(&source.sb, &dest.user32_sb);
6214 my_size = sizeof(dest.user32_sb);
6215 sbp = (caddr_t)&dest.user32_sb;
6216 }
6217
6218 /*
6219 * Check if we raced (post lookup) against the last unlink of a file.
6220 */
6221 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6222 source.sb.st_nlink = 1;
6223 }
6224 }
6225 if ((error = copyout(sbp, ub, my_size)) != 0) {
6226 goto out;
6227 }
6228
6229 /* caller wants extended security information? */
6230 if (xsecurity != USER_ADDR_NULL) {
6231 /* did we get any? */
6232 if (fsec == KAUTH_FILESEC_NONE) {
6233 if (susize(xsecurity_size, 0) != 0) {
6234 error = EFAULT;
6235 goto out;
6236 }
6237 } else {
6238 /* find the user buffer size */
6239 xsecurity_bufsize = fusize(xsecurity_size);
6240
6241 /* copy out the actual data size */
6242 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6243 error = EFAULT;
6244 goto out;
6245 }
6246
6247 /* if the caller supplied enough room, copy out to it */
6248 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6249 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6250 }
6251 }
6252 }
6253 out:
6254 if (fsec != KAUTH_FILESEC_NONE) {
6255 kauth_filesec_free(fsec);
6256 }
6257 return error;
6258 }
6259
6260 /*
6261 * stat_extended: Get file status; with extended security (ACL).
6262 *
6263 * Parameters: p (ignored)
6264 * uap User argument descriptor (see below)
6265 * retval (ignored)
6266 *
6267 * Indirect: uap->path Path of file to get status from
6268 * uap->ub User buffer (holds file status info)
6269 * uap->xsecurity ACL to get (extended security)
6270 * uap->xsecurity_size Size of ACL
6271 *
6272 * Returns: 0 Success
6273 * !0 errno value
6274 *
6275 */
6276 int
6277 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6278 __unused int32_t *retval)
6279 {
6280 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6281 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6282 0);
6283 }
6284
6285 /*
6286 * Returns: 0 Success
6287 * fstatat_internal:??? [see fstatat_internal() in this file]
6288 */
6289 int
6290 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6291 {
6292 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6293 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6294 }
6295
6296 int
6297 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6298 {
6299 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6300 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6301 }
6302
6303 /*
6304 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6305 *
6306 * Parameters: p (ignored)
6307 * uap User argument descriptor (see below)
6308 * retval (ignored)
6309 *
6310 * Indirect: uap->path Path of file to get status from
6311 * uap->ub User buffer (holds file status info)
6312 * uap->xsecurity ACL to get (extended security)
6313 * uap->xsecurity_size Size of ACL
6314 *
6315 * Returns: 0 Success
6316 * !0 errno value
6317 *
6318 */
6319 int
6320 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6321 {
6322 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6323 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6324 0);
6325 }
6326
6327 /*
6328 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6329 *
6330 * Parameters: p (ignored)
6331 * uap User argument descriptor (see below)
6332 * retval (ignored)
6333 *
6334 * Indirect: uap->path Path of file to get status from
6335 * uap->ub User buffer (holds file status info)
6336 * uap->xsecurity ACL to get (extended security)
6337 * uap->xsecurity_size Size of ACL
6338 *
6339 * Returns: 0 Success
6340 * !0 errno value
6341 *
6342 */
6343 int
6344 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6345 {
6346 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6347 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6348 AT_SYMLINK_NOFOLLOW);
6349 }
6350
6351 /*
6352 * Get file status; this version does not follow links.
6353 */
6354 int
6355 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6356 {
6357 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6358 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6359 }
6360
6361 int
6362 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6363 {
6364 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6365 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6366 }
6367
6368 /*
6369 * lstat64_extended: Get file status; can handle large inode numbers; does not
6370 * follow links; with extended security (ACL).
6371 *
6372 * Parameters: p (ignored)
6373 * uap User argument descriptor (see below)
6374 * retval (ignored)
6375 *
6376 * Indirect: uap->path Path of file to get status from
6377 * uap->ub User buffer (holds file status info)
6378 * uap->xsecurity ACL to get (extended security)
6379 * uap->xsecurity_size Size of ACL
6380 *
6381 * Returns: 0 Success
6382 * !0 errno value
6383 *
6384 */
6385 int
6386 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6387 {
6388 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6389 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6390 AT_SYMLINK_NOFOLLOW);
6391 }
6392
6393 int
6394 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6395 {
6396 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6397 return EINVAL;
6398 }
6399
6400 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6401 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6402 }
6403
6404 int
6405 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6406 __unused int32_t *retval)
6407 {
6408 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6409 return EINVAL;
6410 }
6411
6412 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6413 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6414 }
6415
6416 /*
6417 * Get configurable pathname variables.
6418 *
6419 * Returns: 0 Success
6420 * namei:???
6421 * vn_pathconf:???
6422 *
6423 * Notes: Global implementation constants are intended to be
6424 * implemented in this function directly; all other constants
6425 * are per-FS implementation, and therefore must be handled in
6426 * each respective FS, instead.
6427 *
6428 * XXX We implement some things globally right now that should actually be
6429 * XXX per-FS; we will need to deal with this at some point.
6430 */
6431 /* ARGSUSED */
6432 int
6433 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6434 {
6435 int error;
6436 struct nameidata nd;
6437 vfs_context_t ctx = vfs_context_current();
6438
6439 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6440 UIO_USERSPACE, uap->path, ctx);
6441 error = namei(&nd);
6442 if (error) {
6443 return error;
6444 }
6445
6446 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6447
6448 vnode_put(nd.ni_vp);
6449 nameidone(&nd);
6450 return error;
6451 }
6452
6453 /*
6454 * Return target name of a symbolic link.
6455 */
6456 /* ARGSUSED */
6457 static int
6458 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
6459 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
6460 int *retval)
6461 {
6462 vnode_t vp;
6463 uio_t auio;
6464 int error;
6465 struct nameidata nd;
6466 char uio_buf[UIO_SIZEOF(1)];
6467
6468 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
6469 seg, path, ctx);
6470
6471 error = nameiat(&nd, fd);
6472 if (error) {
6473 return error;
6474 }
6475 vp = nd.ni_vp;
6476
6477 nameidone(&nd);
6478
6479 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
6480 &uio_buf[0], sizeof(uio_buf));
6481 uio_addiov(auio, buf, bufsize);
6482 if (vp->v_type != VLNK) {
6483 error = EINVAL;
6484 } else {
6485 #if CONFIG_MACF
6486 error = mac_vnode_check_readlink(ctx, vp);
6487 #endif
6488 if (error == 0) {
6489 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
6490 ctx);
6491 }
6492 if (error == 0) {
6493 error = VNOP_READLINK(vp, auio, ctx);
6494 }
6495 }
6496 vnode_put(vp);
6497
6498 *retval = bufsize - (int)uio_resid(auio);
6499 return error;
6500 }
6501
6502 int
6503 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6504 {
6505 enum uio_seg procseg;
6506
6507 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6508 return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6509 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6510 uap->count, procseg, retval);
6511 }
6512
6513 int
6514 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6515 {
6516 enum uio_seg procseg;
6517
6518 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6519 return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6520 procseg, uap->buf, uap->bufsize, procseg, retval);
6521 }
6522
6523 /*
6524 * Change file flags, the deep inner layer.
6525 */
6526 static int
6527 chflags0(vnode_t vp, struct vnode_attr *va,
6528 int (*setattr)(vnode_t, void *, vfs_context_t),
6529 void *arg, vfs_context_t ctx)
6530 {
6531 kauth_action_t action = 0;
6532 int error;
6533
6534 #if CONFIG_MACF
6535 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
6536 if (error) {
6537 goto out;
6538 }
6539 #endif
6540
6541 /* request authorisation, disregard immutability */
6542 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
6543 goto out;
6544 }
6545 /*
6546 * Request that the auth layer disregard those file flags it's allowed to when
6547 * authorizing this operation; we need to do this in order to be able to
6548 * clear immutable flags.
6549 */
6550 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
6551 goto out;
6552 }
6553 error = (*setattr)(vp, arg, ctx);
6554
6555 #if CONFIG_MACF
6556 if (error == 0) {
6557 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
6558 }
6559 #endif
6560
6561 out:
6562 return error;
6563 }
6564
6565 /*
6566 * Change file flags.
6567 *
6568 * NOTE: this will vnode_put() `vp'
6569 */
6570 static int
6571 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6572 {
6573 struct vnode_attr va;
6574 int error;
6575
6576 VATTR_INIT(&va);
6577 VATTR_SET(&va, va_flags, flags);
6578
6579 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6580 vnode_put(vp);
6581
6582 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6583 error = ENOTSUP;
6584 }
6585
6586 return error;
6587 }
6588
6589 /*
6590 * Change flags of a file given a path name.
6591 */
6592 /* ARGSUSED */
6593 int
6594 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6595 {
6596 vnode_t vp;
6597 vfs_context_t ctx = vfs_context_current();
6598 int error;
6599 struct nameidata nd;
6600
6601 AUDIT_ARG(fflags, uap->flags);
6602 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6603 UIO_USERSPACE, uap->path, ctx);
6604 error = namei(&nd);
6605 if (error) {
6606 return error;
6607 }
6608 vp = nd.ni_vp;
6609 nameidone(&nd);
6610
6611 /* we don't vnode_put() here because chflags1 does internally */
6612 error = chflags1(vp, uap->flags, ctx);
6613
6614 return error;
6615 }
6616
6617 /*
6618 * Change flags of a file given a file descriptor.
6619 */
6620 /* ARGSUSED */
6621 int
6622 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6623 {
6624 vnode_t vp;
6625 int error;
6626
6627 AUDIT_ARG(fd, uap->fd);
6628 AUDIT_ARG(fflags, uap->flags);
6629 if ((error = file_vnode(uap->fd, &vp))) {
6630 return error;
6631 }
6632
6633 if ((error = vnode_getwithref(vp))) {
6634 file_drop(uap->fd);
6635 return error;
6636 }
6637
6638 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6639
6640 /* we don't vnode_put() here because chflags1 does internally */
6641 error = chflags1(vp, uap->flags, vfs_context_current());
6642
6643 file_drop(uap->fd);
6644 return error;
6645 }
6646
6647 /*
6648 * Change security information on a filesystem object.
6649 *
6650 * Returns: 0 Success
6651 * EPERM Operation not permitted
6652 * vnode_authattr:??? [anything vnode_authattr can return]
6653 * vnode_authorize:??? [anything vnode_authorize can return]
6654 * vnode_setattr:??? [anything vnode_setattr can return]
6655 *
6656 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6657 * translated to EPERM before being returned.
6658 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC policy may veto each class of change independently. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		/* -1 signals "not being changed" to the MAC hook */
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* authorization failures are reported as EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* notify MAC policies only after the change has actually been made */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	/* error is 0 here; returned as-is for clarity */
	return error;
}
6726
6727
6728 /*
6729 * Change mode of a file given a path name.
6730 *
6731 * Returns: 0 Success
6732 * namei:??? [anything namei can return]
6733 * chmod_vnode:??? [anything chmod_vnode can return]
6734 */
6735 static int
6736 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6737 int fd, int flag, enum uio_seg segflg)
6738 {
6739 struct nameidata nd;
6740 int follow, error;
6741
6742 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6743 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6744 segflg, path, ctx);
6745 if ((error = nameiat(&nd, fd))) {
6746 return error;
6747 }
6748 error = chmod_vnode(ctx, nd.ni_vp, vap);
6749 vnode_put(nd.ni_vp);
6750 nameidone(&nd);
6751 return error;
6752 }
6753
6754 /*
6755 * chmod_extended: Change the mode of a file given a path name; with extended
6756 * argument list (including extended security (ACL)).
6757 *
6758 * Parameters: p Process requesting the open
6759 * uap User argument descriptor (see below)
6760 * retval (ignored)
6761 *
6762 * Indirect: uap->path Path to object (same as 'chmod')
6763 * uap->uid UID to set
6764 * uap->gid GID to set
6765 * uap->mode File mode to set (same as 'chmod')
6766 * uap->xsecurity ACL to set (or delete)
6767 *
6768 * Returns: 0 Success
6769 * !0 errno value
6770 *
6771 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6772 *
6773 * XXX: We should enummerate the possible errno values here, and where
6774 * in the code they originated.
6775 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;	/* ACL copied in from userspace, if any */

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* only attributes explicitly requested by the caller are set */
	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	/*
	 * uap->xsecurity is a user address doubling as a command:
	 * (void *)1 means "remove the ACL", NULL means "leave it alone",
	 * anything else is a pointer to a filesec to copy in.
	 */
	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* explicit remove request */
	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case USER_ADDR_NULL:
		break;
	default:
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		/* va_acl points into xsecdst; freed after chmodat() below */
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
6821
6822 /*
6823 * Returns: 0 Success
6824 * chmodat:??? [anything chmodat can return]
6825 */
6826 static int
6827 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6828 int flag, enum uio_seg segflg)
6829 {
6830 struct vnode_attr va;
6831
6832 VATTR_INIT(&va);
6833 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6834
6835 return chmodat(ctx, path, &va, fd, flag, segflg);
6836 }
6837
6838 int
6839 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6840 {
6841 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6842 AT_FDCWD, 0, UIO_USERSPACE);
6843 }
6844
6845 int
6846 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6847 {
6848 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
6849 return EINVAL;
6850 }
6851
6852 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6853 uap->fd, uap->flag, UIO_USERSPACE);
6854 }
6855
6856 /*
6857 * Change mode of a file given a file descriptor.
6858 */
6859 static int
6860 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6861 {
6862 vnode_t vp;
6863 int error;
6864
6865 AUDIT_ARG(fd, fd);
6866
6867 if ((error = file_vnode(fd, &vp)) != 0) {
6868 return error;
6869 }
6870 if ((error = vnode_getwithref(vp)) != 0) {
6871 file_drop(fd);
6872 return error;
6873 }
6874 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6875
6876 error = chmod_vnode(vfs_context_current(), vp, vap);
6877 (void)vnode_put(vp);
6878 file_drop(fd);
6879
6880 return error;
6881 }
6882
6883 /*
6884 * fchmod_extended: Change mode of a file given a file descriptor; with
6885 * extended argument list (including extended security (ACL)).
6886 *
6887 * Parameters: p Process requesting to change file mode
6888 * uap User argument descriptor (see below)
6889 * retval (ignored)
6890 *
6891 * Indirect: uap->mode File mode to set (same as 'chmod')
6892 * uap->uid UID to set
6893 * uap->gid GID to set
6894 * uap->xsecurity ACL to set (or delete)
6895 * uap->fd File descriptor of file to change mode
6896 *
6897 * Returns: 0 Success
6898 * !0 errno value
6899 *
6900 */
6901 int
6902 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
6903 {
6904 int error;
6905 struct vnode_attr va;
6906 kauth_filesec_t xsecdst;
6907
6908 AUDIT_ARG(owner, uap->uid, uap->gid);
6909
6910 VATTR_INIT(&va);
6911 if (uap->mode != -1) {
6912 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6913 }
6914 if (uap->uid != KAUTH_UID_NONE) {
6915 VATTR_SET(&va, va_uid, uap->uid);
6916 }
6917 if (uap->gid != KAUTH_GID_NONE) {
6918 VATTR_SET(&va, va_gid, uap->gid);
6919 }
6920
6921 xsecdst = NULL;
6922 switch (uap->xsecurity) {
6923 case USER_ADDR_NULL:
6924 VATTR_SET(&va, va_acl, NULL);
6925 break;
6926 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6927 VATTR_SET(&va, va_acl, NULL);
6928 break;
6929 /* not being set */
6930 case CAST_USER_ADDR_T(-1):
6931 break;
6932 default:
6933 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
6934 return error;
6935 }
6936 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6937 }
6938
6939 error = fchmod1(p, uap->fd, &va);
6940
6941
6942 switch (uap->xsecurity) {
6943 case USER_ADDR_NULL:
6944 case CAST_USER_ADDR_T(-1):
6945 break;
6946 default:
6947 if (xsecdst != NULL) {
6948 kauth_filesec_free(xsecdst);
6949 }
6950 }
6951 return error;
6952 }
6953
6954 int
6955 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6956 {
6957 struct vnode_attr va;
6958
6959 VATTR_INIT(&va);
6960 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6961
6962 return fchmod1(p, uap->fd, &va);
6963 }
6964
6965
6966 /*
6967 * Set ownership given a path name.
6968 */
6969 /* ARGSUSED */
/*
 * Common implementation for chown(2)/lchown(2)/fchownat(2): resolve
 * `path' relative to `fd' and change its owner and/or group.  A uid or
 * gid of VNOVAL means "not being changed".
 */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* only set the attributes the caller actually asked to change */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* notify MAC policies only after a successful change */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

	/* drop the iocount taken by nameiat() */
	vnode_put(vp);
	return error;
}
7036
7037 int
7038 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7039 {
7040 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7041 uap->uid, uap->gid, 0, UIO_USERSPACE);
7042 }
7043
7044 int
7045 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7046 {
7047 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7048 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7049 }
7050
7051 int
7052 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7053 {
7054 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7055 return EINVAL;
7056 }
7057
7058 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7059 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7060 }
7061
/*
 * Set ownership given a file descriptor.
 *
 * Like chown(), but operates on the vnode already referenced by uap->fd.
 * A uid/gid equal to VNOVAL leaves that field unchanged.  Unlike
 * fchownat_internal(), only an EACCES from vnode_authorize() is
 * translated to EPERM here.
 */
/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* file_vnode() takes a reference on the fd; dropped on all exits. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Need an iocount on the vnode before touching it. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* Only request the attribute changes the caller asked for. */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failure on an open fd reports EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7134
7135 static int
7136 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7137 {
7138 int error;
7139
7140 if (usrtvp == USER_ADDR_NULL) {
7141 struct timeval old_tv;
7142 /* XXX Y2038 bug because of microtime argument */
7143 microtime(&old_tv);
7144 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7145 tsp[1] = tsp[0];
7146 } else {
7147 if (IS_64BIT_PROCESS(current_proc())) {
7148 struct user64_timeval tv[2];
7149 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7150 if (error) {
7151 return error;
7152 }
7153 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7154 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7155 } else {
7156 struct user32_timeval tv[2];
7157 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7158 if (error) {
7159 return error;
7160 }
7161 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7162 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7163 }
7164 }
7165 return 0;
7166 }
7167
/*
 * Apply access/modification times to a vnode.  ts[0] is the access time,
 * ts[1] the modification time.  'nullflag' is set when the caller passed a
 * NULL times pointer (i.e. "set to now"), which relaxes the permission
 * check via VA_UTIMES_NULL and suppresses the EACCES->EPERM translation.
 * The caller must hold an iocount on 'vp'.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		/* Let the FS apply the weaker "utimes(path, NULL)" check. */
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* Explicit-times failure is a permissions error: EPERM. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
7224
/*
 * Set the access and modification times of a file given its path.
 * uap->tptr may be USER_ADDR_NULL, meaning "set both times to now".
 */
/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	nameidone(&nd);

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* Release the iocount namei() took on the looked-up vnode. */
	vnode_put(nd.ni_vp);
	return error;
}
7265
7266 /*
7267 * Set the access and modification times of a file.
7268 */
7269 /* ARGSUSED */
7270 int
7271 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
7272 {
7273 struct timespec ts[2];
7274 vnode_t vp;
7275 user_addr_t usrtvp;
7276 int error;
7277
7278 AUDIT_ARG(fd, uap->fd);
7279 usrtvp = uap->tptr;
7280 if ((error = getutimes(usrtvp, ts)) != 0) {
7281 return error;
7282 }
7283 if ((error = file_vnode(uap->fd, &vp)) != 0) {
7284 return error;
7285 }
7286 if ((error = vnode_getwithref(vp))) {
7287 file_drop(uap->fd);
7288 return error;
7289 }
7290
7291 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
7292 vnode_put(vp);
7293 file_drop(uap->fd);
7294 return error;
7295 }
7296
/*
 * Truncate a file given its path name.
 *
 * Sets va_data_size on the looked-up vnode after MAC and kauth checks.
 * A negative length is rejected with EINVAL before any lookup happens.
 */
/* ARGSUSED */
int
truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;

	if (uap->length < 0) {
		return EINVAL;
	}
	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	/* No fd here, so the check runs with no file credential (NOCRED). */
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error) {
		goto out;
	}
#endif

	/* Preflight which rights the size change needs, then authorize. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
	}
#endif

out:
	vnode_put(vp);
	return error;
}
7351
/*
 * Truncate a file given a file descriptor.
 *
 * The descriptor must refer to a vnode opened for writing, or to a POSIX
 * shared-memory object (which is handed off to pshm_truncate()).  Unlike
 * truncate(), no kauth authorization is done here: having the fd open
 * with FWRITE is taken as sufficient.
 */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error;
	int fd = uap->fd;

	AUDIT_ARG(fd, uap->fd);
	if (uap->length < 0) {
		return EINVAL;
	}

	/* Take a reference on the fileproc; dropped at 'out'. */
	if ((error = fp_lookup(p, fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->f_fglob)) {
	case DTYPE_PSXSHM:
		/* POSIX shm objects have their own truncate path. */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp->f_fglob->fg_data;

	/* Historical behavior: not-open-for-write is EINVAL, not EBADF. */
	if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx,
	    fp->f_fglob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
	}
#endif

	(void)vnode_put(vp);
out:
	file_drop(fd);
	return error;
}
7423
7424
7425 /*
7426 * Sync an open file with synchronized I/O _file_ integrity completion
7427 */
7428 /* ARGSUSED */
7429 int
7430 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
7431 {
7432 __pthread_testcancel(1);
7433 return fsync_common(p, uap, MNT_WAIT);
7434 }
7435
7436
7437 /*
7438 * Sync an open file with synchronized I/O _file_ integrity completion
7439 *
7440 * Notes: This is a legacy support function that does not test for
7441 * thread cancellation points.
7442 */
7443 /* ARGSUSED */
7444 int
7445 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
7446 {
7447 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
7448 }
7449
7450
7451 /*
7452 * Sync an open file with synchronized I/O _data_ integrity completion
7453 */
7454 /* ARGSUSED */
7455 int
7456 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
7457 {
7458 __pthread_testcancel(1);
7459 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
7460 }
7461
7462
7463 /*
7464 * fsync_common
7465 *
7466 * Common fsync code to support both synchronized I/O file integrity completion
7467 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7468 *
7469 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7470 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
7472 * includes additional metadata unnecessary for retrieving the file data
7473 * contents, such as atime, mtime, ctime, etc., also be committed to stable
7474 * storage.
7475 *
7476 * Parameters: p The process
7477 * uap->fd The descriptor to synchronize
7478 * flags The data integrity flags
7479 *
7480 * Returns: int Success
7481 * fp_getfvp:EBADF Bad file descriptor
7482 * fp_getfvp:ENOTSUP fd does not refer to a vnode
7483 * VNOP_FSYNC:??? unspecified
7484 *
7485 * Notes: We use struct fsync_args because it is a short name, and all
7486 * caller argument structures are otherwise identical.
7487 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Takes a reference on the fd and returns its vnode. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->f_flags & FP_WRITTEN)) {
		/* Best effort: a shadow-file flush failure is not reported. */
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7525
/*
 * Duplicate files. Source must be a file, target must be a file or
 * must not exist.
 *
 * Without CPF_OVERWRITE an existing target fails with EEXIST; directories
 * on either side fail with EISDIR.  The actual copy is delegated to the
 * filesystem via VNOP_COPYFILE.
 *
 * XXX Copyfile authorisation checking is woefully inadequate, and will not
 * perform inheritance correctly.
 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();
#if CONFIG_MACF
	struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
	struct vnode_attr va;
#endif

	/* Check that the flags are valid. */

	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * SAVESTART keeps tond.ni_startdir referenced so it can be released
	 * at 'out' along with tdvp/tvp.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* This calls existing MAC hooks for open */
	if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
	    NULL))) {
		goto out;
	}

	if (tvp) {
		/*
		 * See unlinkat_internal for an explanation of the potential
		 * ENOENT from the MAC hook but the gist is that the MAC hook
		 * can fail because vn_getpath isn't able to return the full
		 * path. We choose to ignore this failure.
		 */
		error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
		if (error && error != ENOENT) {
			goto out;
		}
		error = 0;
	}

#if CONFIG_MACF
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, fvp->v_type);
	/* Mask off all but regular access permissions */
	VATTR_SET(&va, va_mode,
	    ((((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
	error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif /* CONFIG_MACF */

	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a file onto its own parent directory makes no sense. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		/* -1 is an internal "silent success" marker, see below. */
		error = -1;
	}
	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Translate the internal "nothing to do" marker into success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
7653
7654 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7655
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 *
 * 'fvp' is the source (regular file, symlink, or non-root directory);
 * 'dst' names the clone target, looked up relative to 'dst_dirfd', and must
 * not already exist.  Source and target must be on the same mount (EXDEV
 * otherwise).  'data_read_authorised' lets fclonefileat() skip the
 * READ_DATA check when the source fd was already opened for reading.
 * CLONE_NOFOLLOW and CLONE_NOOWNERCOPY are honored from 'flags'.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* Only files, symlinks and ordinary directories can be cloned. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Volume roots and mount points cannot be cloned. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The clone target must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* Authorize creating the new entry in the target directory... */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* ...and reading the source (READ_DATA may be pre-authorized). */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	VATTR_WANTED(&va, va_acl);

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		/* The getattr-allocated ACL must be freed at 'out'. */
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 *
		 * NOTE(review): this tests &va (the attrs fetched from the
		 * source) rather than &nva (the attrs handed to the VNOP);
		 * confirm whether &nva was intended here.
		 */
		if (!VATTR_ALL_SUPPORTED(&va)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
7880
7881 /*
7882 * clone files or directories, target must not exist.
7883 */
7884 /* ARGSUSED */
7885 int
7886 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7887 __unused int32_t *retval)
7888 {
7889 vnode_t fvp;
7890 struct nameidata fromnd;
7891 int follow;
7892 int error;
7893 vfs_context_t ctx = vfs_context_current();
7894
7895 /* Check that the flags are valid. */
7896 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
7897 return EINVAL;
7898 }
7899
7900 AUDIT_ARG(fd, uap->src_dirfd);
7901
7902 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7903 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7904 UIO_USERSPACE, uap->src, ctx);
7905 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
7906 return error;
7907 }
7908
7909 fvp = fromnd.ni_vp;
7910 nameidone(&fromnd);
7911
7912 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7913 uap->flags, ctx);
7914
7915 vnode_put(fvp);
7916 return error;
7917 }
7918
/*
 * Clone the file referenced by an open fd to a new path.  The source fd
 * must be open for reading (EBADF otherwise); since that proves read
 * access, clonefile_internal() is told to skip the READ_DATA check.
 */
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	/* Takes a reference on the fd; dropped at 'out'. */
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* Read access already proven by the open fd: TRUE skips READ_DATA. */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
7959
7960 static int
7961 rename_submounts_callback(mount_t mp, void *arg)
7962 {
7963 int error = 0;
7964 mount_t pmp = (mount_t)arg;
7965 int prefix_len = strlen(pmp->mnt_vfsstat.f_mntonname);
7966
7967 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
7968 return 0;
7969 }
7970
7971 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
7972 return 0;
7973 }
7974
7975 if ((error = vfs_busy(mp, LK_NOWAIT))) {
7976 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
7977 return -1;
7978 }
7979
7980 int pathlen = MAXPATHLEN;
7981 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
7982 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
7983 }
7984
7985 vfs_unbusy(mp);
7986
7987 return error;
7988 }
7989
7990 /*
7991 * Rename files. Source and destination must either both be directories,
7992 * or both not be directories. If target is a directory, it must be empty.
7993 */
7994 /* ARGSUSED */
7995 static int
7996 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
7997 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
7998 {
7999 if (flags & ~VFS_RENAME_FLAGS_MASK) {
8000 return EINVAL;
8001 }
8002
8003 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
8004 return EINVAL;
8005 }
8006
8007 vnode_t tvp, tdvp;
8008 vnode_t fvp, fdvp;
8009 struct nameidata *fromnd, *tond;
8010 int error;
8011 int do_retry;
8012 int retry_count;
8013 int mntrename;
8014 int need_event;
8015 int need_kpath2;
8016 int has_listeners;
8017 const char *oname = NULL;
8018 char *from_name = NULL, *to_name = NULL;
8019 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8020 int from_len = 0, to_len = 0;
8021 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8022 int holding_mntlock;
8023 mount_t locked_mp = NULL;
8024 vnode_t oparent = NULLVP;
8025 #if CONFIG_FSE
8026 fse_info from_finfo, to_finfo;
8027 #endif
8028 int from_truncated = 0, to_truncated = 0;
8029 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8030 int batched = 0;
8031 struct vnode_attr *fvap, *tvap;
8032 int continuing = 0;
8033 /* carving out a chunk for structs that are too big to be on stack. */
8034 struct {
8035 struct nameidata from_node, to_node;
8036 struct vnode_attr fv_attr, tv_attr;
8037 } * __rename_data;
8038 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
8039 fromnd = &__rename_data->from_node;
8040 tond = &__rename_data->to_node;
8041
8042 holding_mntlock = 0;
8043 do_retry = 0;
8044 retry_count = 0;
8045 retry:
8046 fvp = tvp = NULL;
8047 fdvp = tdvp = NULL;
8048 fvap = tvap = NULL;
8049 mntrename = FALSE;
8050
8051 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8052 segflg, from, ctx);
8053 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
8054
8055 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8056 segflg, to, ctx);
8057 tond->ni_flag = NAMEI_COMPOUNDRENAME;
8058
8059 continue_lookup:
8060 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8061 if ((error = nameiat(fromnd, fromfd))) {
8062 goto out1;
8063 }
8064 fdvp = fromnd->ni_dvp;
8065 fvp = fromnd->ni_vp;
8066
8067 if (fvp && fvp->v_type == VDIR) {
8068 tond->ni_cnd.cn_flags |= WILLBEDIR;
8069 }
8070 }
8071
8072 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8073 if ((error = nameiat(tond, tofd))) {
8074 /*
8075 * Translate error code for rename("dir1", "dir2/.").
8076 */
8077 if (error == EISDIR && fvp->v_type == VDIR) {
8078 error = EINVAL;
8079 }
8080 goto out1;
8081 }
8082 tdvp = tond->ni_dvp;
8083 tvp = tond->ni_vp;
8084 }
8085
8086 #if DEVELOPMENT || DEBUG
8087 /*
8088 * XXX VSWAP: Check for entitlements or special flag here
8089 * so we can restrict access appropriately.
8090 */
8091 #else /* DEVELOPMENT || DEBUG */
8092
8093 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8094 error = EPERM;
8095 goto out1;
8096 }
8097
8098 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8099 error = EPERM;
8100 goto out1;
8101 }
8102 #endif /* DEVELOPMENT || DEBUG */
8103
8104 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8105 error = ENOENT;
8106 goto out1;
8107 }
8108
8109 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8110 error = EEXIST;
8111 goto out1;
8112 }
8113
8114 batched = vnode_compound_rename_available(fdvp);
8115
8116 #if CONFIG_FSE
8117 need_event = need_fsevent(FSE_RENAME, fdvp);
8118 if (need_event) {
8119 if (fvp) {
8120 get_fse_info(fvp, &from_finfo, ctx);
8121 } else {
8122 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8123 if (error) {
8124 goto out1;
8125 }
8126
8127 fvap = &__rename_data->fv_attr;
8128 }
8129
8130 if (tvp) {
8131 get_fse_info(tvp, &to_finfo, ctx);
8132 } else if (batched) {
8133 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8134 if (error) {
8135 goto out1;
8136 }
8137
8138 tvap = &__rename_data->tv_attr;
8139 }
8140 }
8141 #else
8142 need_event = 0;
8143 #endif /* CONFIG_FSE */
8144
8145 has_listeners = kauth_authorize_fileop_has_listeners();
8146
8147 need_kpath2 = 0;
8148 #if CONFIG_AUDIT
8149 if (AUDIT_RECORD_EXISTS()) {
8150 need_kpath2 = 1;
8151 }
8152 #endif
8153
8154 if (need_event || has_listeners) {
8155 if (from_name == NULL) {
8156 GET_PATH(from_name);
8157 if (from_name == NULL) {
8158 error = ENOMEM;
8159 goto out1;
8160 }
8161 }
8162
8163 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8164
8165 if (from_name_no_firmlink == NULL) {
8166 GET_PATH(from_name_no_firmlink);
8167 if (from_name_no_firmlink == NULL) {
8168 error = ENOMEM;
8169 goto out1;
8170 }
8171 }
8172
8173 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8174 }
8175
8176 if (need_event || need_kpath2 || has_listeners) {
8177 if (to_name == NULL) {
8178 GET_PATH(to_name);
8179 if (to_name == NULL) {
8180 error = ENOMEM;
8181 goto out1;
8182 }
8183 }
8184
8185 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8186
8187 if (to_name_no_firmlink == NULL) {
8188 GET_PATH(to_name_no_firmlink);
8189 if (to_name_no_firmlink == NULL) {
8190 error = ENOMEM;
8191 goto out1;
8192 }
8193 }
8194
8195 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8196 if (to_name && need_kpath2) {
8197 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8198 }
8199 }
8200 if (!fvp) {
8201 /*
8202 * Claim: this check will never reject a valid rename.
8203 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8204 * Suppose fdvp and tdvp are not on the same mount.
8205 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8206 * then you can't move it to within another dir on the same mountpoint.
8207 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8208 *
8209 * If this check passes, then we are safe to pass these vnodes to the same FS.
8210 */
8211 if (fdvp->v_mount != tdvp->v_mount) {
8212 error = EXDEV;
8213 goto out1;
8214 }
8215 goto skipped_lookup;
8216 }
8217
8218 if (!batched) {
8219 error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
8220 if (error) {
8221 if (error == ENOENT) {
8222 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8223 /*
8224 * We encountered a race where after doing the namei, tvp stops
8225 * being valid. If so, simply re-drive the rename call from the
8226 * top.
8227 */
8228 do_retry = 1;
8229 retry_count += 1;
8230 }
8231 }
8232 goto out1;
8233 }
8234 }
8235
8236 /*
8237 * If the source and destination are the same (i.e. they're
8238 * links to the same vnode) and the target file system is
8239 * case sensitive, then there is nothing to do.
8240 *
8241 * XXX Come back to this.
8242 */
8243 if (fvp == tvp) {
8244 int pathconf_val;
8245
8246 /*
8247 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8248 * then assume that this file system is case sensitive.
8249 */
8250 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8251 pathconf_val != 0) {
8252 goto out1;
8253 }
8254 }
8255
8256 /*
8257 * Allow the renaming of mount points.
8258 * - target must not exist
8259 * - target must reside in the same directory as source
8260 * - union mounts cannot be renamed
8261 * - "/" cannot be renamed
8262 *
8263 * XXX Handle this in VFS after a continued lookup (if we missed
8264 * in the cache to start off)
8265 *
8266 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8267 * we'll skip past here. The file system is responsible for
8268 * checking that @tvp is not a descendent of @fvp and vice versa
8269 * so it should always return EINVAL if either @tvp or @fvp is the
8270 * root of a volume.
8271 */
8272 if ((fvp->v_flag & VROOT) &&
8273 (fvp->v_type == VDIR) &&
8274 (tvp == NULL) &&
8275 (fvp->v_mountedhere == NULL) &&
8276 (fdvp == tdvp) &&
8277 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8278 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8279 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8280 vnode_t coveredvp;
8281
8282 /* switch fvp to the covered vnode */
8283 coveredvp = fvp->v_mount->mnt_vnodecovered;
8284 if ((vnode_getwithref(coveredvp))) {
8285 error = ENOENT;
8286 goto out1;
8287 }
8288 vnode_put(fvp);
8289
8290 fvp = coveredvp;
8291 mntrename = TRUE;
8292 }
8293 /*
8294 * Check for cross-device rename.
8295 */
8296 if ((fvp->v_mount != tdvp->v_mount) ||
8297 (tvp && (fvp->v_mount != tvp->v_mount))) {
8298 error = EXDEV;
8299 goto out1;
8300 }
8301
8302 /*
8303 * If source is the same as the destination (that is the
8304 * same inode number) then there is nothing to do...
8305 * EXCEPT if the underlying file system supports case
8306 * insensitivity and is case preserving. In this case
8307 * the file system needs to handle the special case of
8308 * getting the same vnode as target (fvp) and source (tvp).
8309 *
8310 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8311 * and _PC_CASE_PRESERVING can have this exception, and they need to
8312 * handle the special case of getting the same vnode as target and
8313 * source. NOTE: Then the target is unlocked going into vnop_rename,
8314 * so not to cause locking problems. There is a single reference on tvp.
8315 *
8316 * NOTE - that fvp == tvp also occurs if they are hard linked and
8317 * that correct behaviour then is just to return success without doing
8318 * anything.
8319 *
8320 * XXX filesystem should take care of this itself, perhaps...
8321 */
8322 if (fvp == tvp && fdvp == tdvp) {
8323 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8324 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8325 fromnd->ni_cnd.cn_namelen)) {
8326 goto out1;
8327 }
8328 }
8329
8330 if (holding_mntlock && fvp->v_mount != locked_mp) {
8331 /*
8332 * we're holding a reference and lock
8333 * on locked_mp, but it no longer matches
8334 * what we want to do... so drop our hold
8335 */
8336 mount_unlock_renames(locked_mp);
8337 mount_drop(locked_mp, 0);
8338 holding_mntlock = 0;
8339 }
8340 if (tdvp != fdvp && fvp->v_type == VDIR) {
8341 /*
8342 * serialize renames that re-shape
8343 * the tree... if holding_mntlock is
8344 * set, then we're ready to go...
8345 * otherwise we
8346 * first need to drop the iocounts
8347 * we picked up, second take the
8348 * lock to serialize the access,
8349 * then finally start the lookup
8350 * process over with the lock held
8351 */
8352 if (!holding_mntlock) {
8353 /*
8354 * need to grab a reference on
8355 * the mount point before we
8356 * drop all the iocounts... once
8357 * the iocounts are gone, the mount
8358 * could follow
8359 */
8360 locked_mp = fvp->v_mount;
8361 mount_ref(locked_mp, 0);
8362
8363 /*
8364 * nameidone has to happen before we vnode_put(tvp)
8365 * since it may need to release the fs_nodelock on the tvp
8366 */
8367 nameidone(tond);
8368
8369 if (tvp) {
8370 vnode_put(tvp);
8371 }
8372 vnode_put(tdvp);
8373
8374 /*
8375 * nameidone has to happen before we vnode_put(fdvp)
8376 * since it may need to release the fs_nodelock on the fvp
8377 */
8378 nameidone(fromnd);
8379
8380 vnode_put(fvp);
8381 vnode_put(fdvp);
8382
8383 mount_lock_renames(locked_mp);
8384 holding_mntlock = 1;
8385
8386 goto retry;
8387 }
8388 } else {
8389 /*
8390 * when we dropped the iocounts to take
8391 * the lock, we allowed the identity of
8392 * the various vnodes to change... if they did,
8393 * we may no longer be dealing with a rename
8394 * that reshapes the tree... once we're holding
8395 * the iocounts, the vnodes can't change type
8396 * so we're free to drop the lock at this point
8397 * and continue on
8398 */
8399 if (holding_mntlock) {
8400 mount_unlock_renames(locked_mp);
8401 mount_drop(locked_mp, 0);
8402 holding_mntlock = 0;
8403 }
8404 }
8405
8406 // save these off so we can later verify that fvp is the same
8407 oname = fvp->v_name;
8408 oparent = fvp->v_parent;
8409
8410 skipped_lookup:
8411 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8412 tdvp, &tvp, &tond->ni_cnd, tvap,
8413 flags, ctx);
8414
8415 if (holding_mntlock) {
8416 /*
8417 * we can drop our serialization
8418 * lock now
8419 */
8420 mount_unlock_renames(locked_mp);
8421 mount_drop(locked_mp, 0);
8422 holding_mntlock = 0;
8423 }
8424 if (error) {
8425 if (error == EDATALESS) {
8426 /*
8427 * If we've been here before, something has gone
8428 * horribly wrong and we should just get out lest
8429 * we spiral around the drain forever.
8430 */
8431 if (flags & VFS_RENAME_DATALESS) {
8432 error = EIO;
8433 goto out1;
8434 }
8435
8436 /*
8437 * The object we're renaming is dataless (or has a
8438 * dataless descendent) and requires materialization
8439 * before the rename occurs. But we're holding the
8440 * mount point's rename lock, so it's not safe to
8441 * make the upcall.
8442 *
8443 * In this case, we release the lock, perform the
8444 * materialization, and start the whole thing over.
8445 */
8446 error = vnode_materialize_dataless_file(fvp,
8447 NAMESPACE_HANDLER_RENAME_OP);
8448
8449 if (error == 0) {
8450 /*
8451 * The next time around we need to tell the
8452 * file system that the materialization has
8453 * been performed.
8454 */
8455 flags |= VFS_RENAME_DATALESS;
8456 do_retry = 1;
8457 }
8458 goto out1;
8459 }
8460 if (error == EKEEPLOOKING) {
8461 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8462 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8463 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8464 }
8465 }
8466
8467 fromnd->ni_vp = fvp;
8468 tond->ni_vp = tvp;
8469
8470 goto continue_lookup;
8471 }
8472
8473 /*
8474 * We may encounter a race in the VNOP where the destination didn't
8475 * exist when we did the namei, but it does by the time we go and
8476 * try to create the entry. In this case, we should re-drive this rename
8477 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8478 * but other filesystems susceptible to this race could return it, too.
8479 */
8480 if (error == ERECYCLE) {
8481 do_retry = 1;
8482 }
8483
8484 /*
8485 * For compound VNOPs, the authorization callback may return
8486 * ENOENT in case of racing hardlink lookups hitting the name
8487 * cache, redrive the lookup.
8488 */
8489 if (batched && error == ENOENT) {
8490 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8491 do_retry = 1;
8492 retry_count += 1;
8493 }
8494 }
8495
8496 goto out1;
8497 }
8498
8499 /* call out to allow 3rd party notification of rename.
8500 * Ignore result of kauth_authorize_fileop call.
8501 */
8502 kauth_authorize_fileop(vfs_context_ucred(ctx),
8503 KAUTH_FILEOP_RENAME,
8504 (uintptr_t)from_name, (uintptr_t)to_name);
8505 if (flags & VFS_RENAME_SWAP) {
8506 kauth_authorize_fileop(vfs_context_ucred(ctx),
8507 KAUTH_FILEOP_RENAME,
8508 (uintptr_t)to_name, (uintptr_t)from_name);
8509 }
8510
8511 #if CONFIG_FSE
8512 if (from_name != NULL && to_name != NULL) {
8513 if (from_truncated || to_truncated) {
8514 // set it here since only the from_finfo gets reported up to user space
8515 from_finfo.mode |= FSE_TRUNCATED_PATH;
8516 }
8517
8518 if (tvap && tvp) {
8519 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8520 }
8521 if (fvap) {
8522 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8523 }
8524
8525 if (tvp) {
8526 add_fsevent(FSE_RENAME, ctx,
8527 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8528 FSE_ARG_FINFO, &from_finfo,
8529 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8530 FSE_ARG_FINFO, &to_finfo,
8531 FSE_ARG_DONE);
8532 if (flags & VFS_RENAME_SWAP) {
8533 /*
8534 * Strictly speaking, swap is the equivalent of
8535 * *three* renames. FSEvents clients should only take
8536 * the events as a hint, so we only bother reporting
8537 * two.
8538 */
8539 add_fsevent(FSE_RENAME, ctx,
8540 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8541 FSE_ARG_FINFO, &to_finfo,
8542 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8543 FSE_ARG_FINFO, &from_finfo,
8544 FSE_ARG_DONE);
8545 }
8546 } else {
8547 add_fsevent(FSE_RENAME, ctx,
8548 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8549 FSE_ARG_FINFO, &from_finfo,
8550 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8551 FSE_ARG_DONE);
8552 }
8553 }
8554 #endif /* CONFIG_FSE */
8555
8556 /*
8557 * update filesystem's mount point data
8558 */
8559 if (mntrename) {
8560 char *cp, *pathend, *mpname;
8561 char * tobuf;
8562 struct mount *mp;
8563 int maxlen;
8564 size_t len = 0;
8565
8566 mp = fvp->v_mountedhere;
8567
8568 if (vfs_busy(mp, LK_NOWAIT)) {
8569 error = EBUSY;
8570 goto out1;
8571 }
8572 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
8573
8574 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8575 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8576 } else {
8577 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8578 }
8579 if (!error) {
8580 /* find current mount point prefix */
8581 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8582 for (cp = pathend; *cp != '\0'; ++cp) {
8583 if (*cp == '/') {
8584 pathend = cp + 1;
8585 }
8586 }
8587 /* find last component of target name */
8588 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8589 if (*cp == '/') {
8590 mpname = cp + 1;
8591 }
8592 }
8593
8594 /* Update f_mntonname of sub mounts */
8595 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8596
8597 /* append name to prefix */
8598 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
8599 bzero(pathend, maxlen);
8600
8601 strlcpy(pathend, mpname, maxlen);
8602 }
8603 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
8604
8605 vfs_unbusy(mp);
8606
8607 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8608 }
8609 /*
8610 * fix up name & parent pointers. note that we first
8611 * check that fvp has the same name/parent pointers it
8612 * had before the rename call... this is a 'weak' check
8613 * at best...
8614 *
8615 * XXX oparent and oname may not be set in the compound vnop case
8616 */
8617 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8618 int update_flags;
8619
8620 update_flags = VNODE_UPDATE_NAME;
8621
8622 if (fdvp != tdvp) {
8623 update_flags |= VNODE_UPDATE_PARENT;
8624 }
8625
8626 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8627 }
8628 out1:
8629 if (to_name != NULL) {
8630 RELEASE_PATH(to_name);
8631 to_name = NULL;
8632 }
8633 if (to_name_no_firmlink != NULL) {
8634 RELEASE_PATH(to_name_no_firmlink);
8635 to_name_no_firmlink = NULL;
8636 }
8637 if (from_name != NULL) {
8638 RELEASE_PATH(from_name);
8639 from_name = NULL;
8640 }
8641 if (from_name_no_firmlink != NULL) {
8642 RELEASE_PATH(from_name_no_firmlink);
8643 from_name_no_firmlink = NULL;
8644 }
8645 if (holding_mntlock) {
8646 mount_unlock_renames(locked_mp);
8647 mount_drop(locked_mp, 0);
8648 holding_mntlock = 0;
8649 }
8650 if (tdvp) {
8651 /*
8652 * nameidone has to happen before we vnode_put(tdvp)
8653 * since it may need to release the fs_nodelock on the tdvp
8654 */
8655 nameidone(tond);
8656
8657 if (tvp) {
8658 vnode_put(tvp);
8659 }
8660 vnode_put(tdvp);
8661 }
8662 if (fdvp) {
8663 /*
8664 * nameidone has to happen before we vnode_put(fdvp)
8665 * since it may need to release the fs_nodelock on the fdvp
8666 */
8667 nameidone(fromnd);
8668
8669 if (fvp) {
8670 vnode_put(fvp);
8671 }
8672 vnode_put(fdvp);
8673 }
8674
8675 /*
8676 * If things changed after we did the namei, then we will re-drive
8677 * this rename call from the top.
8678 */
8679 if (do_retry) {
8680 do_retry = 0;
8681 goto retry;
8682 }
8683
8684 FREE(__rename_data, M_TEMP);
8685 return error;
8686 }
8687
8688 int
8689 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
8690 {
8691 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
8692 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
8693 }
8694
8695 int
8696 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
8697 {
8698 return renameat_internal(
8699 vfs_context_current(),
8700 uap->fromfd, uap->from,
8701 uap->tofd, uap->to,
8702 UIO_USERSPACE, uap->flags);
8703 }
8704
8705 int
8706 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
8707 {
8708 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
8709 uap->tofd, uap->to, UIO_USERSPACE, 0);
8710 }
8711
8712 /*
8713 * Make a directory file.
8714 *
8715 * Returns: 0 Success
8716 * EEXIST
8717 * namei:???
8718 * vnode_authorize:???
8719 * vn_create:???
8720 */
8721 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;          /* non-zero if the FS supports compound mkdir */
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/* Resolve the parent directory; the leaf is being created. */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	/* Ask namei to attempt lookup+mkdir as a single compound VNOP. */
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Lookup found an existing object at the target name. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the first lookup's state before re-driving. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target really absent: keep the original EACCES/EPERM. */
				goto out;
			} else {
				/* Target exists after all: report EEXIST; vp is put in out:. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP needs the lookup continued; loop back. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
8833
8834 /*
8835 * mkdir_extended: Create a directory; with extended security (ACL).
8836 *
8837 * Parameters: p Process requesting to create the directory
8838 * uap User argument descriptor (see below)
8839 * retval (ignored)
8840 *
8841 * Indirect: uap->path Path of directory to create
8842 * uap->mode Access permissions to set
8843 * uap->xsecurity ACL to set
8844 *
8845 * Returns: 0 Success
8846 * !0 Not success
8847 *
8848 */
8849 int
8850 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
8851 {
8852 int ciferror;
8853 kauth_filesec_t xsecdst;
8854 struct vnode_attr va;
8855
8856 AUDIT_ARG(owner, uap->uid, uap->gid);
8857
8858 xsecdst = NULL;
8859 if ((uap->xsecurity != USER_ADDR_NULL) &&
8860 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
8861 return ciferror;
8862 }
8863
8864 VATTR_INIT(&va);
8865 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8866 if (xsecdst != NULL) {
8867 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
8868 }
8869
8870 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8871 UIO_USERSPACE);
8872 if (xsecdst != NULL) {
8873 kauth_filesec_free(xsecdst);
8874 }
8875 return ciferror;
8876 }
8877
8878 int
8879 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8880 {
8881 struct vnode_attr va;
8882
8883 VATTR_INIT(&va);
8884 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8885
8886 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8887 UIO_USERSPACE);
8888 }
8889
8890 int
8891 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8892 {
8893 struct vnode_attr va;
8894
8895 VATTR_INIT(&va);
8896 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8897
8898 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8899 UIO_USERSPACE);
8900 }
8901
/*
 * rmdirat_internal: common implementation behind rmdir(2) and
 * directory-removal via unlinkat(2).
 *
 * Looks up `dirpath' relative to `fd', authorizes the removal, emits
 * fsevents/kauth notifications when anyone is listening, and handles two
 * restart situations: compound-VNOP continuation (EKEEPLOOKING / racing
 * ENOENT) and the AppleDouble orphan-cleanup retry loop.
 *
 * Returns: 0 on success, otherwise an errno (EBUSY for a mount root,
 * EPERM for a swap file on non-DEBUG kernels, or whatever the lookup,
 * authorization, or VNOP layers report).
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;
	char *path = NULL;                 /* pathname for notifications */
	char *no_firmlink_path = NULL;     /* same, with firmlinks unresolved */
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
#if CONFIG_FSE
	struct vnode_attr va;
#endif /* CONFIG_FSE */
	struct vnode_attr *vap = NULL;
	int restart_count = 0;             /* bounds ENOENT-race redrives */
	int batched;                       /* non-zero for compound rmdir */

	int restart_flag;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		nd.ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(&nd, fd);
		if (error) {
			return error;
		}

		dvp = nd.ni_dvp;
		vp = nd.ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Only the kernel may remove a swap-backing vnode. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Raced with the entry disappearing: redrive (bounded). */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the FS deferred lookup to the compound rmdir VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo;

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound path: ask the FS to fill attrs during the VNOP. */
				error = vfs_get_notify_attributes(&va);
				if (error) {
					goto out;
				}

				vap = &va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Build the pathnames the notifications will carry. */
			if (path == NULL) {
				GET_PATH(path);
				if (path == NULL) {
					error = ENOMEM;
					goto out;
				}
			}

			len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
				if (no_firmlink_path == NULL) {
					error = ENOMEM;
					goto out;
				}
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

		error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
		nd.ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR(). So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, &nd,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(&nd);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/*
			 * NOTE(review): vp is used only as a wait-channel address
			 * here (paired with the tsleep below); the iocount has
			 * already been dropped — confirm this is intentional.
			 */
			wakeup_one((caddr_t)vp);
			return error;
		}
		/* Briefly yield before retrying the AppleDouble cleanup race. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

	return error;
}
9174
9175 /*
9176 * Remove a directory file.
9177 */
9178 /* ARGSUSED */
9179 int
9180 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9181 {
9182 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9183 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9184 }
9185
/*
 * Get direntry length padded to 8 byte alignment.
 * (Presumably struct direntry declares d_name[MAXPATHLEN], so subtracting
 * MAXPATHLEN-1 leaves room for namlen name bytes plus the NUL — confirm
 * against sys/dirent.h.)
 */
#define DIRENT64_LEN(namlen) \
((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * (Same idea for the legacy struct dirent, whose name field is presumably
 * __DARWIN_MAXNAMLEN+1 bytes — confirm against sys/dirent.h.)
 */
#define DIRENT_LEN(namelen) \
((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent: address of its last byte per d_reclen. */
#define DIRENT_END(dep) \
(((char *)(dep)) + (dep)->d_reclen - 1)
9197
9198 errno_t
9199 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9200 int *numdirent, vfs_context_t ctxp)
9201 {
9202 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9203 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9204 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9205 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9206 } else {
9207 size_t bufsize;
9208 void * bufptr;
9209 uio_t auio;
9210 struct direntry *entry64;
9211 struct dirent *dep;
9212 int bytesread;
9213 int error;
9214
9215 /*
9216 * We're here because the underlying file system does not
9217 * support direnties or we mounted denying support so we must
9218 * fall back to dirents and convert them to direntries.
9219 *
9220 * Our kernel buffer needs to be smaller since re-packing will
9221 * expand each dirent. The worse case (when the name length
9222 * is 3 or less) corresponds to a struct direntry size of 32
9223 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9224 * (4-byte aligned). So having a buffer that is 3/8 the size
9225 * will prevent us from reading more than we can pack.
9226 *
9227 * Since this buffer is wired memory, we will limit the
9228 * buffer size to a maximum of 32K. We would really like to
9229 * use 32K in the MIN(), but we use magic number 87371 to
9230 * prevent uio_resid() * 3 / 8 from overflowing.
9231 */
9232 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9233 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
9234 if (bufptr == NULL) {
9235 return ENOMEM;
9236 }
9237
9238 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9239 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9240 auio->uio_offset = uio->uio_offset;
9241
9242 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9243
9244 dep = (struct dirent *)bufptr;
9245 bytesread = bufsize - uio_resid(auio);
9246
9247 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
9248 M_TEMP, M_WAITOK);
9249 /*
9250 * Convert all the entries and copy them out to user's buffer.
9251 */
9252 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9253 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9254
9255 if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9256 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9257 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9258 vp->v_mount->mnt_vfsstat.f_mntonname,
9259 vp->v_name ? vp->v_name : "<unknown>");
9260 error = EIO;
9261 break;
9262 }
9263
9264 bzero(entry64, enbufsize);
9265 /* Convert a dirent to a dirent64. */
9266 entry64->d_ino = dep->d_ino;
9267 entry64->d_seekoff = 0;
9268 entry64->d_reclen = enbufsize;
9269 entry64->d_namlen = dep->d_namlen;
9270 entry64->d_type = dep->d_type;
9271 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9272
9273 /* Move to next entry. */
9274 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9275
9276 /* Copy entry64 to user's buffer. */
9277 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9278 }
9279
9280 /* Update the real offset using the offset we got from VNOP_READDIR. */
9281 if (error == 0) {
9282 uio->uio_offset = auio->uio_offset;
9283 }
9284 uio_free(auio);
9285 FREE(bufptr, M_TEMP);
9286 FREE(entry64, M_TEMP);
9287 return error;
9288 }
9289 }
9290
9291 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9292
9293 /*
9294 * Read a block of directory entries in a file system independent format.
9295 */
/*
 * getdirentries_common: shared back end for getdirentries(2) and
 * getdirentries64(2).
 *
 * Reads up to `bufsize' bytes of directory entries from the directory
 * open on `fd' into the user buffer `bufp', advancing the file offset.
 * On success *bytesread is the byte count delivered, *offset is the file
 * offset at which this read started, and *eofflag reflects whatever the
 * readdir VNOP reported.  If VNODE_READDIR_EXTENDED is set in `flags',
 * entries are delivered in struct direntry format via vnode_readdir64().
 *
 * Handles union mounts: when a read returns nothing, it may switch the
 * fileglob to the covered directory and retry (the unionread: loop).
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	char uio_buf[UIO_SIZEOF(1)];

	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}
	/* The descriptor must have been opened for reading. */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Remember where this read starts; reported back via *offset. */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

	/* Nothing was read: possibly fall through to a covered union layer. */
	if ((user_ssize_t)bufsize == uio_resid(auio)) {
		if (union_dircheckp) {
			/* -1 means "switched layers, read again". */
			error = union_dircheckp(&vp, fp, &context);
			if (error == -1) {
				goto unionread;
			}
			if (error) {
				(void)vnode_put(vp);
				goto out;
			}
		}

		if ((vp->v_mount->mnt_flag & MNT_UNION)) {
			struct vnode *tvp = vp;
			/* Descend to the vnode this union layer covers and retry. */
			if (lookup_traverse_union(tvp, &vp, &context) == 0) {
				vnode_ref(vp);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0;
				vnode_rele(tvp);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	file_drop(fd);
	return error;
}
9401
9402
9403 int
9404 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9405 {
9406 off_t offset;
9407 ssize_t bytesread;
9408 int error, eofflag;
9409
9410 AUDIT_ARG(fd, uap->fd);
9411 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9412 &bytesread, &offset, &eofflag, 0);
9413
9414 if (error == 0) {
9415 if (proc_is64bit(p)) {
9416 user64_long_t base = (user64_long_t)offset;
9417 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9418 } else {
9419 user32_long_t base = (user32_long_t)offset;
9420 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9421 }
9422 *retval = bytesread;
9423 }
9424 return error;
9425 }
9426
/*
 * getdirentries64: read directory entries in the extended (64-bit) format.
 *
 * Parameters:	p	(unused) calling process
 *		uap	fd, user buffer, buffer size, and position out-pointer
 *		retval	set to the number of bytes of entries produced
 *
 * Returns:	0	Success
 *		!0	errno from the common readdir path or from copyout
 */
int
getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
{
	off_t offset;
	ssize_t bytesread;
	int error, eofflag;
	user_size_t bufsize;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
	 * then the kernel carves out the last 4 bytes to return extended
	 * information to userspace (namely whether we reached EOF with this call).
	 */
	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
	} else {
		bufsize = uap->bufsize;
	}

	error = getdirentries_common(uap->fd, uap->buf, bufsize,
	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);

	if (error == 0) {
		*retval = bytesread;
		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));

		/* Report EOF via the carved-out trailing flags word, if present. */
		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
			getdirentries64_flags_t flags = 0;
			if (eofflag) {
				flags |= GETDIRENTRIES64_EOF;
			}
			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
			    sizeof(flags));
		}
	}
	return error;
}
9466
9467
9468 /*
9469 * Set the mode mask for creation of filesystem nodes.
9470 * XXX implement xsecurity
9471 */
9472 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9473 static int
9474 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9475 {
9476 struct filedesc *fdp;
9477
9478 AUDIT_ARG(mask, newmask);
9479 proc_fdlock(p);
9480 fdp = p->p_fd;
9481 *retval = fdp->fd_cmask;
9482 fdp->fd_cmask = newmask & ALLPERMS;
9483 proc_fdunlock(p);
9484 return 0;
9485 }
9486
9487 /*
9488 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9489 *
9490 * Parameters: p Process requesting to set the umask
9491 * uap User argument descriptor (see below)
9492 * retval umask of the process (parameter p)
9493 *
9494 * Indirect: uap->newmask umask to set
9495 * uap->xsecurity ACL to set
9496 *
9497 * Returns: 0 Success
9498 * !0 Not success
9499 *
9500 */
9501 int
9502 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9503 {
9504 int ciferror;
9505 kauth_filesec_t xsecdst;
9506
9507 xsecdst = KAUTH_FILESEC_NONE;
9508 if (uap->xsecurity != USER_ADDR_NULL) {
9509 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
9510 return ciferror;
9511 }
9512 } else {
9513 xsecdst = KAUTH_FILESEC_NONE;
9514 }
9515
9516 ciferror = umask1(p, uap->newmask, xsecdst, retval);
9517
9518 if (xsecdst != KAUTH_FILESEC_NONE) {
9519 kauth_filesec_free(xsecdst);
9520 }
9521 return ciferror;
9522 }
9523
/*
 * umask: POSIX umask(2); sets the file-creation mode mask without
 * touching any extended security (ACL) state.
 */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
9529
9530 /*
9531 * Void all references to file by ripping underlying filesystem
9532 * away from vnode.
9533 */
9534 /* ARGSUSED */
/*
 * revoke: void all references to a character/block special file by
 * ripping the underlying filesystem away from the vnode.
 *
 * Returns:	0	Success
 *		ENOTSUP	vnode is not a character or block special file
 *		EBUSY	block device has a filesystem mounted on it
 *		EPERM/errno from MAC, getattr, or the ownership check
 */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) is only supported on character or block special files */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* refuse to revoke a block device that has a mounted filesystem */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* only the device's owner or the superuser may revoke it */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* only bother if someone still holds the vnode (or an alias of it) */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
9587
9588
9589 /*
 * HFS/HFS PLUS SPECIFIC SYSTEM CALLS
9591 * The following system calls are designed to support features
9592 * which are specific to the HFS & HFS Plus volume formats
9593 */
9594
9595
9596 /*
9597 * Obtain attribute information on objects in a directory while enumerating
9598 * the directory.
9599 */
9600 /* ARGSUSED */
/*
 * getdirentriesattr: enumerate a directory while returning attribute
 * information for each entry (HFS-era API; see getattrlistbulk for the
 * modern equivalent).  *retval is set to the eofflag, similar in spirit
 * to getdirentries(2).
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	uint32_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	char uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* remember the caller's requested count for union-mount restarts */
	savecount = count;
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}
	/* the fd must have been opened for reading */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->f_fglob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
				vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0; // reset index for new dir
				count = savecount;
				vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	(void)vnode_put(vp);

	/* NOTE(review): error appears to always be 0 here (checked above and
	 * not set by the union-mount block) — this check looks vestigial. */
	if (error) {
		goto out;
	}
	fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag; /* similar to getdirentries */
	error = 0;
out:
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
9745
9746 /*
9747 * Exchange data between two files
9748 */
9749
9750 /* ARGSUSED */
/*
 * exchangedata: atomically exchange the data of two regular files on the
 * same volume (delegated to the filesystem via VNOP_EXCHANGE), then swap
 * the cached names/parents and emit fsevents / fileop notifications.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* look up the first path */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* look up the second path */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* caller needs read AND write access to both files */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Gather paths and file info up front only if somebody is listening
	 * (fsevents watcher or fileop listener) — path resolution is costly.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);
		if (fpath == NULL || spath == NULL) {
			error = ENOMEM;
			goto out;
		}

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/* swap the cached names (and parents, if they differ) so the
		 * name cache matches the post-exchange on-disk identities */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
9905
9906 /*
9907 * Return (in MB) the amount of freespace on the given vnode's volume.
9908 */
9909 uint32_t freespace_mb(vnode_t vp);
9910
9911 uint32_t
9912 freespace_mb(vnode_t vp)
9913 {
9914 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
9915 return ((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
9916 vp->v_mount->mnt_vfsstat.f_bsize) >> 20;
9917 }
9918
9919 #if CONFIG_SEARCHFS
9920
9921 /* ARGSUSED */
9922
/*
 * searchfs: fast catalog search, delegated to the filesystem via
 * VNOP_SEARCHFS.  Copies in the (32- or 64-bit) fssearchblock, validates
 * the user-supplied search parameters, descends union-mount layers as
 * tracked in the opaque searchstate, and copies results/state back out.
 * An EAGAIN from the filesystem means "call again to continue".
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	int mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	char uio_buf[UIO_SIZEOF(1)];

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block. */
	/* */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
	/* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
	/* assumes the size is still 556 bytes it will continue to work */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 *
	 * NOTE(review): only searchparams1 is validated below even though the
	 * comment above mentions both buffers — confirm whether searchparams2
	 * can also carry an ATTR_CMN_NAME attrreference_t.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * All right, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (u_long)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (u_long)uap->scriptcode,
	    (u_long)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++; // search next layer down
		fserror = EAGAIN;
	}

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	error = fserror;

freeandexit:

	FREE(searchparams1, M_TEMP);

	return error;
} /* end of searchfs system call */
10201
10202 #else /* CONFIG_SEARCHFS */
10203
/* searchfs(2) stub for kernels built without CONFIG_SEARCHFS. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
10209
10210 #endif /* CONFIG_SEARCHFS */
10211
10212
10213 #if CONFIG_DATALESS_FILES
10214
10215 /*
10216 * === Namespace Resolver Up-call Mechanism ===
10217 *
10218 * When I/O is performed to a dataless file or directory (read, write,
10219 * lookup-in, etc.), the file system performs an upcall to the namespace
10220 * resolver (filecoordinationd) to materialize the object.
10221 *
10222 * We need multiple up-calls to be in flight at once, and we need these
10223 * up-calls to be interruptible, thus the following implementation:
10224 *
10225 * => The nspace_resolver_request represents the in-kernel request state.
10226 * It contains a request ID, storage space for the errno code returned
10227 * by filecoordinationd, and flags.
10228 *
10229 * => The request ID is simply a global monotonically incrementing 32-bit
10230 * number. Outstanding requests are stored in a hash table, and the
10231 * hash function is extremely simple.
10232 *
10233 * => When an upcall is to be made to filecoordinationd, a request structure
10234 * is allocated on the stack (it is small, and needs to live only during
10235 * the duration of the call to resolve_nspace_item_ext()). It is
10236 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
10238 * can be inserted into the table (and thus limiting the number of
10239 * outstanding requests issued to filecoordinationd); waiting for an
10240 * available slot is interruptible.
10241 *
10242 * => Once the request has been inserted into the table, the up-call is made
10243 * to filecoordinationd via a MiG-generated stub. The up-call returns
10244 * immediately and filecoordinationd processes the request asynchronously.
10245 *
 * => The caller now waits for the request to complete.  This is achieved by
10247 * sleeping on the address of the request structure and waiting for
10248 * filecoordinationd to mark the request structure as complete. This
10249 * is an interruptible sleep call; if interrupted, the request structure
10250 * is removed from the table and EINTR is returned to the caller. If
10251 * this occurs, an advisory up-call is made to filecoordinationd with
10252 * the request ID to indicate that the request can be aborted or
10253 * de-prioritized at the discretion of filecoordinationd.
10254 *
10255 * => When filecoordinationd has completed the request, it signals completion
10256 * by writing to the vfs.nspace.complete sysctl node. Only a process
10257 * decorated as a namespace resolver can write to this sysctl node. The
10258 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10259 * The request ID is looked up in the table, and if the request is found,
10260 * the error code is stored in the request structure and a wakeup()
10261 * issued on the address of the request structure. If the request is not
10262 * found, we simply drop the completion notification, assuming that the
10263 * caller was interrupted.
10264 *
10265 * => When the waiting thread wakes up, it extracts the error code from the
10266 * request structure, removes the request from the table, and returns the
10267 * error code to the calling function. Fini!
10268 */
10269
/*
 * In-kernel state for one outstanding materialization request issued to
 * the namespace resolver (filecoordinationd); see the big comment above.
 * Entries live on the stack of the requesting thread and are linked into
 * the request hash table for the duration of the up-call.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-table linkage */
	uint32_t r_req_id;        /* ID echoed back by the resolver */
	int r_resolver_error;     /* errno reported by the resolver */
	int r_flags;              /* RRF_* flags */
};

#define RRF_COMPLETE 0x0001 /* resolver has completed this request */
10278
10279 static uint32_t
10280 next_nspace_req_id(void)
10281 {
10282 static uint32_t next_req_id;
10283
10284 return OSAddAtomic(1, &next_req_id);
10285 }
10286
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (backpressure accounting). */
static u_int nspace_resolver_request_count;
/* True when some thread is waiting for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static lck_grp_t *nspace_resolver_request_lck_grp;
/* Serializes access to the request table and the counters above. */
static lck_mtx_t nspace_resolver_request_hash_mutex;

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Trivial hash: low bits of the monotonically-assigned request ID. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
10306
10307 static struct nspace_resolver_request *
10308 nspace_resolver_req_lookup(uint32_t req_id)
10309 {
10310 struct nspace_resolver_requesthead *bucket;
10311 struct nspace_resolver_request *req;
10312
10313 bucket = NSPACE_RESOLVER_HASH(req_id);
10314 LIST_FOREACH(req, bucket, r_hashlink) {
10315 if (req->r_req_id == req_id) {
10316 return req;
10317 }
10318 }
10319
10320 return NULL;
10321 }
10322
10323 static int
10324 nspace_resolver_req_add(struct nspace_resolver_request *req)
10325 {
10326 struct nspace_resolver_requesthead *bucket;
10327 int error;
10328
10329 while (nspace_resolver_request_count >=
10330 NSPACE_RESOLVER_MAX_OUTSTANDING) {
10331 nspace_resolver_request_wait_slot = true;
10332 error = msleep(&nspace_resolver_request_count,
10333 &nspace_resolver_request_hash_mutex,
10334 PVFS | PCATCH, "nspacerq", NULL);
10335 if (error) {
10336 return error;
10337 }
10338 }
10339
10340 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10341 #if DIAGNOSTIC
10342 assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
10343 #endif /* DIAGNOSTIC */
10344 LIST_INSERT_HEAD(bucket, req, r_hashlink);
10345 nspace_resolver_request_count++;
10346
10347 return 0;
10348 }
10349
10350 static void
10351 nspace_resolver_req_remove(struct nspace_resolver_request *req)
10352 {
10353 struct nspace_resolver_requesthead *bucket;
10354
10355 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10356 #if DIAGNOSTIC
10357 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
10358 #endif /* DIAGNOSTIC */
10359 LIST_REMOVE(req, r_hashlink);
10360 nspace_resolver_request_count--;
10361
10362 if (nspace_resolver_request_wait_slot) {
10363 nspace_resolver_request_wait_slot = false;
10364 wakeup(&nspace_resolver_request_count);
10365 }
10366 }
10367
10368 static void
10369 nspace_resolver_req_cancel(uint32_t req_id)
10370 {
10371 kern_return_t kr;
10372 mach_port_t mp;
10373
10374 // Failures here aren't fatal -- the cancellation message
10375 // sent to the resolver is merely advisory.
10376
10377 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10378 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10379 return;
10380 }
10381
10382 kr = send_nspace_resolve_cancel(mp, req_id);
10383 if (kr != KERN_SUCCESS) {
10384 os_log_error(OS_LOG_DEFAULT,
10385 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10386 }
10387
10388 ipc_port_release_send(mp);
10389 }
10390
/*
 * Sleep until the resolver marks the request complete (or the wait is
 * interrupted), remove the request from the table, and return the
 * resolver's error code.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: synthesize an error result and plan to
			 * advise the resolver that it may abandon the work. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* Advisory only; see nspace_resolver_req_cancel(). */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10420
/*
 * Record the resolver's result and wake the thread sleeping on the
 * request in nspace_resolver_req_wait().  Callers in this file invoke
 * this with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
10430
/*
 * Completion notification: the resolver reported a result for
 * req_id.  Look up the outstanding request and mark it complete.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();
}
10449
/* The process currently registered as the dataless-file resolver;
 * set and cleared under NSPACE_REQ_LOCK. */
static struct proc *nspace_resolver_proc;
10451
10452 static int
10453 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
10454 {
10455 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10456 p == nspace_resolver_proc) ? 1 : 0;
10457 return 0;
10458 }
10459
/*
 * Register (is_resolver != 0) or unregister the process as the
 * system's dataless-file resolver.  Requires superuser credentials
 * plus the PRIV_VFS_DATALESS_RESOLVER privilege.  Only one resolver
 * may be registered at a time; a second registration fails EBUSY.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	error = priv_check_cred(vfs_context_ucred(ctx),
	    PRIV_VFS_DATALESS_RESOLVER, 0);
	if (error) {
		return error;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			// Another process already holds the role.
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
10504
10505 static int
10506 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10507 {
10508 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10509 (p->p_vfs_iopolicy &
10510 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10511 *is_prevented = 1;
10512 } else {
10513 *is_prevented = 0;
10514 }
10515 return 0;
10516 }
10517
10518 static int
10519 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
10520 {
10521 if (p->p_lflag & P_LNSPACE_RESOLVER) {
10522 return is_prevented ? 0 : EBUSY;
10523 }
10524
10525 if (is_prevented) {
10526 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
10527 } else {
10528 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
10529 }
10530 return 0;
10531 }
10532
10533 static int
10534 nspace_materialization_get_thread_state(int *is_prevented)
10535 {
10536 uthread_t ut = get_bsdthread_info(current_thread());
10537
10538 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10539 return 0;
10540 }
10541
10542 static int
10543 nspace_materialization_set_thread_state(int is_prevented)
10544 {
10545 uthread_t ut = get_bsdthread_info(current_thread());
10546
10547 if (is_prevented) {
10548 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10549 } else {
10550 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10551 }
10552 return 0;
10553 }
10554
/*
 * Decide whether the current thread may materialize a dataless
 * object.  Returns:
 *   0           materialization may proceed
 *   EDEADLK     materialization is prevented
 *   EJUSTRETURN caller is a dataless manipulator; proceed as if the
 *               object were not dataless
 *
 * Precedence: kernel context, then the dataless-manipulation
 * entitlement, then per-thread decorations, then the process-wide
 * I/O policy, then prevented-by-default.
 */
static int
nspace_materialization_is_prevented(void)
{
	proc_t p = current_proc();
	uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
	vfs_context_t ctx = vfs_context_current();

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
10608
10609 /* the vfs.nspace branch */
10610 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10611
10612 static int
10613 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10614 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10615 {
10616 struct proc *p = req->p;
10617 int new_value, old_value, changed = 0;
10618 int error;
10619
10620 error = nspace_resolver_get_proc_state(p, &old_value);
10621 if (error) {
10622 return error;
10623 }
10624
10625 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10626 &changed);
10627 if (error == 0 && changed) {
10628 error = nspace_resolver_set_proc_state(p, new_value);
10629 }
10630 return error;
10631 }
10632
10633 /* decorate this process as the dataless file resolver */
10634 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
10635 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10636 0, 0, sysctl_nspace_resolver, "I", "");
10637
10638 static int
10639 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
10640 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10641 {
10642 struct proc *p = req->p;
10643 int new_value, old_value, changed = 0;
10644 int error;
10645
10646 error = nspace_materialization_get_proc_state(p, &old_value);
10647 if (error) {
10648 return error;
10649 }
10650
10651 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10652 &changed);
10653 if (error == 0 && changed) {
10654 error = nspace_materialization_set_proc_state(p, new_value);
10655 }
10656 return error;
10657 }
10658
10659 /* decorate this process as not wanting to materialize dataless files */
10660 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
10661 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10662 0, 0, sysctl_nspace_prevent_materialization, "I", "");
10663
10664 static int
10665 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
10666 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10667 {
10668 int new_value, old_value, changed = 0;
10669 int error;
10670
10671 error = nspace_materialization_get_thread_state(&old_value);
10672 if (error) {
10673 return error;
10674 }
10675
10676 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10677 &changed);
10678 if (error == 0 && changed) {
10679 error = nspace_materialization_set_thread_state(new_value);
10680 }
10681 return error;
10682 }
10683
10684 /* decorate this thread as not wanting to materialize dataless files */
10685 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
10686 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10687 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
10688
10689 static int
10690 sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
10691 __unused int arg2, struct sysctl_req *req)
10692 {
10693 struct proc *p = req->p;
10694 uint32_t req_status[2] = { 0, 0 };
10695 int error, is_resolver, changed = 0;
10696
10697 error = nspace_resolver_get_proc_state(p, &is_resolver);
10698 if (error) {
10699 return error;
10700 }
10701
10702 if (!is_resolver) {
10703 return EPERM;
10704 }
10705
10706 error = sysctl_io_opaque(req, req_status, sizeof(req_status),
10707 &changed);
10708 if (error) {
10709 return error;
10710 }
10711
10712 /*
10713 * req_status[0] is the req_id
10714 *
10715 * req_status[1] is the errno
10716 */
10717 if (error == 0 && changed) {
10718 nspace_resolver_req_completed(req_status[0],
10719 (int)req_status[1]);
10720 }
10721 return error;
10722 }
10723
10724 /* Resolver reports completed reqs here. */
10725 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
10726 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10727 0, 0, sysctl_nspace_complete, "-", "");
10728
10729 #endif /* CONFIG_DATALESS_FILES */
10730
10731 #if CONFIG_DATALESS_FILES
10732 #define __no_dataless_unused /* nothing */
10733 #else
10734 #define __no_dataless_unused __unused
10735 #endif
10736
/*
 * One-time initialization of the resolver request machinery: the
 * lock group and mutex protecting the request table, and the
 * request hash table itself.  No-op unless CONFIG_DATALESS_FILES.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	/* Lock group must exist before the mutex is initialized. */
	nspace_resolver_request_lck_grp =
	    lck_grp_alloc_init("file namespace resolver", NULL);

	lck_mtx_init(&nspace_resolver_request_hash_mutex,
	    nspace_resolver_request_lck_grp, NULL);

	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
10752
/*
 * Called when process p exits (and from explicit unregistration via
 * nspace_resolver_set_proc_state()).  If p is the registered
 * resolver, complete all outstanding requests with ETIMEDOUT so
 * their waiters do not hang, and clear the resolver global.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Fail every pending request; the resolver is gone. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
10778
/*
 * Resolve (materialize) the dataless object at vp for operation op.
 * Convenience wrapper around resolve_nspace_item_ext() with no
 * extra argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
10784
10785 #define DATALESS_RESOLVER_ENTITLEMENT \
10786 "com.apple.private.vfs.dataless-resolver"
10787 #define DATALESS_MANIPULATION_ENTITLEMENT \
10788 "com.apple.private.vfs.dataless-manipulation"
10789
10790 /*
10791 * Return TRUE if the vfs context is associated with a process entitled
10792 * for dataless manipulation.
10793 *
10794 * XXX Arguably belongs in vfs_subr.c, but is here because of the
10795 * complication around CONFIG_DATALESS_FILES.
10796 */
10797 boolean_t
10798 vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
10799 {
10800 #if CONFIG_DATALESS_FILES
10801 assert(ctx->vc_thread == current_thread());
10802 task_t const task = current_task();
10803 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
10804 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
10805 #else
10806 return false;
10807 #endif /* CONFIG_DATALESS_FILES */
10808 }
10809
/*
 * Ask the user-space resolver to materialize the dataless object at
 * vp for operation op, blocking (interruptibly) until the resolver
 * reports completion.  Returns 0 on success; EFTYPE for unsupported
 * vnode types; ENOTSUP for snapshot events; the result of
 * nspace_materialization_is_prevented() when materialization is not
 * allowed; ETIMEDOUT when the resolver cannot be reached; otherwise
 * the resolver's reported error.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	error = nspace_materialization_is_prevented();
	if (error) {
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process/thread is decorated as no-materialization");
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
	if (path == NULL) {
		error = ENOMEM;
		goto out_release_port;
	}
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223;   /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		/* Register before sending, so a fast completion finds us. */
		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    current_proc()->p_pid, (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);
	}

out_release_port:
	if (path != NULL) {
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
10921
/*
 * Stub: snapshot events are not acted upon here; always returns 0.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
10928
10929 #if 0
10930 static int
10931 build_volfs_path(struct vnode *vp, char *path, int *len)
10932 {
10933 struct vnode_attr va;
10934 int ret;
10935
10936 VATTR_INIT(&va);
10937 VATTR_WANTED(&va, va_fsid);
10938 VATTR_WANTED(&va, va_fileid);
10939
10940 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
10941 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
10942 ret = -1;
10943 } else {
10944 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
10945 ret = 0;
10946 }
10947
10948 return ret;
10949 }
10950 #endif
10951
/*
 * Compatibility shim: map known bare base command numbers (as
 * produced by IOCBASECMD, i.e. with the size/direction bits
 * stripped) back to their full ioctl values.  Any other command is
 * returned unchanged.
 */
static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)
{
	switch (cmd) {
	case IOCBASECMD(FSIOC_SYNC_VOLUME):
		return FSIOC_SYNC_VOLUME;
	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
		return FSIOC_ROUTEFS_SETROUTEID;
	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
		return FSIOC_SET_PACKAGE_EXTS;
	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
		return FSIOC_SET_FSTYPENAME_OVERRIDE;
	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
		return DISK_CONDITIONER_IOC_GET;
	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
		return DISK_CONDITIONER_IOC_SET;
	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
		return FSIOC_FIOSEEKHOLE;
	case IOCBASECMD(FSIOC_FIOSEEKDATA):
		return FSIOC_FIOSEEKDATA;
	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
		return SPOTLIGHT_IOC_GET_LAST_MTIME;
	}

	return cmd;
}
10980
/*
 * chflags0() setattr callback: perform the compare-and-swap of BSD
 * flags via the filesystem's FSIOC_CAS_BSDFLAGS ioctl.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
10986
10987 /*
10988 * Make a filesystem-specific control call:
10989 */
10990 /* ARGSUSED */
/*
 * Common implementation for fsctl(2)/ffsctl(2).  Marshals the ioctl
 * argument into a kernel buffer (stack for small, heap for large),
 * handles a set of generic FSIOC_* commands inline, and forwards
 * everything else to the filesystem via VNOP_IOCTL().
 *
 * NOTE: the FSIOC_SYNC_VOLUME case releases the caller's vnode
 * iocount and sets *arg_vp to NULL; callers must re-check vp.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* No fsctl on device nodes. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Large arguments go on the heap, small ones on the stack. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc(size)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree(memp, size);
				}
				return error;
			}
		} else {
			/* Sizeless IOC_IN: the "argument" is udata itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME: {
		struct vfs_attr vfa;
		mount_t mp = vp->v_mount;
		unsigned arg;


		/* record vid of vp so we can drop it below. */
		uint32_t vvid = vp->v_id;

		/*
		 * Then grab mount_iterref so that we can release the vnode.
		 * Without this, a thread may call vnode_iterate_prepare then
		 * get into a deadlock because we've never released the root vp
		 */
		error = mount_iterref(mp, 0);
		if (error) {
			break;
		}
		vnode_put(vp);

		arg = MNT_NOWAIT;
		if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
			arg = MNT_WAIT;
		}

		/*
		 * If the filesystem supports multiple file systems in a
		 * partition (e.g. APFS volumes in a container), it knows
		 * that the waitfor argument to VFS_SYNC are flags.
		 */
		VFSATTR_INIT(&vfa);
		VFSATTR_WANTED(&vfa, f_capabilities);
		if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
		    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
		    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
		    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
			arg |= MNT_VOLUME;
		}

		/* issue the sync for this volume */
		(void)sync_callback(mp, &arg);

		/*
		 * Then release the mount_iterref once we're done syncing; it's not
		 * needed for the VNOP_IOCTL below
		 */
		mount_iterdrop(mp);

		// NOTE(review): 'arg' holds MNT_* wait flags at this point but
		// is tested against the userspace FSCTL_SYNC_FULLSYNC flag;
		// presumably the intent was to test *(uint32_t*)data -- confirm.
		if (arg & FSCTL_SYNC_FULLSYNC) {
			/* re-obtain vnode iocount on the root vp, if possible */
			error = vnode_getwithvid(vp, vvid);
			if (error == 0) {
				error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
				vnode_put(vp);
			}
		}
		/* mark the argument VP as having been released */
		*arg_vp = NULL;
	}
	break;

	case FSIOC_ROUTEFS_SETROUTEID: {
#if ROUTEFS
		char routepath[MAXPATHLEN];
		size_t len = 0;

		/* Superuser only. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		bzero(routepath, MAXPATHLEN);
		error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
		if (error) {
			break;
		}
		error = routefs_kernel_mount(routepath);
		if (error) {
			break;
		}
#endif
	}
	break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			ext_strings = ((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* Superuser only. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* Empty name clears the override. */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS: {
		struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
		struct vnode_attr va;

		VATTR_INIT(&va);
		VATTR_SET(&va, va_flags, cas->new_flags);

		error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
	}
	break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* EBUSY unless the caller holds the only open. */
		if (vnode_usecount(vp) > 1) {
			error = EBUSY;
		} else {
			error = 0;
		}
	}
	break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree(memp, size);
	}

	return error;
}
11276
11277 /* ARGSUSED */
/*
 * fsctl(2): filesystem-specific control call by path.  Looks up the
 * path (honoring FSOPT_NOFOLLOW and firmlink handling), performs
 * the MAC check, and dispatches to fsctl_internal(), which may drop
 * the vnode iocount and NULL out vp.
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	u_long nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on: */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone else's).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* fsctl_internal() may have released the iocount and cleared vp. */
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
11329 /* ARGSUSED */
/*
 * ffsctl(2): filesystem-specific control call by file descriptor.
 * Takes an iocount on the fd's vnode, performs the MAC check, and
 * dispatches to fsctl_internal(), which may drop the iocount and
 * NULL out vp.
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on: */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
11371 /* end of fsctl system call */
11372
11373 /*
11374 * Retrieve the data of an extended attribute.
11375 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];

	/* These two options are not accepted from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	if (xattr_protected(attrname)) {
		/* Of the protected names, only the superuser may read
		 * com.apple.system.Security; everything else is EPERM. */
		if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
			error = EPERM;
			goto out;
		}
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp the request to the kernel's wired-memory limit. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a buffer: bytes read; without: the attribute's size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
11460
11461 /*
11462 * Retrieve the data of an extended attribute.
11463 */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	char uio_buf[UIO_SIZEOF(1)];

	/* These options are rejected for the fd-based variant. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Unlike getxattr(), no superuser exception here. */
	if (xattr_protected(attrname)) {
		error = EPERM;
		goto out;
	}
	if (uap->value && uap->size > 0) {
		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	/* With a buffer: bytes read; without: the attribute's size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
11513
11514 /*
11515 * Set the data of an extended attribute.
11516 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];

	/* These two options are not accepted from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	/* A non-zero size requires a buffer to read the value from. */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		/* Publish an fsevent so watchers see the xattr change. */
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
11575
11576 /*
11577 * Set the data of an extended attribute.
11578 */
11579 int
11580 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
11581 {
11582 vnode_t vp;
11583 char attrname[XATTR_MAXNAMELEN + 1];
11584 uio_t auio = NULL;
11585 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11586 size_t namelen;
11587 int error;
11588 char uio_buf[UIO_SIZEOF(1)];
11589 #if CONFIG_FSE
11590 vfs_context_t ctx = vfs_context_current();
11591 #endif
11592
11593 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11594 return EINVAL;
11595 }
11596
11597 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11598 if (error != 0) {
11599 if (error == EPERM) {
11600 /* if the string won't fit in attrname, copyinstr emits EPERM */
11601 return ENAMETOOLONG;
11602 }
11603 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11604 return error;
11605 }
11606 if (xattr_protected(attrname)) {
11607 return EPERM;
11608 }
11609 if (uap->size != 0 && uap->value == 0) {
11610 return EINVAL;
11611 }
11612 if ((error = file_vnode(uap->fd, &vp))) {
11613 return error;
11614 }
11615 if ((error = vnode_getwithref(vp))) {
11616 file_drop(uap->fd);
11617 return error;
11618 }
11619 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11620 &uio_buf[0], sizeof(uio_buf));
11621 uio_addiov(auio, uap->value, uap->size);
11622
11623 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
11624 #if CONFIG_FSE
11625 if (error == 0) {
11626 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11627 FSE_ARG_VNODE, vp,
11628 FSE_ARG_DONE);
11629 }
11630 #endif
11631 vnode_put(vp);
11632 file_drop(uap->fd);
11633 *retval = 0;
11634 return error;
11635 }
11636
11637 /*
11638 * Remove an extended attribute.
11639 * XXX Code duplication here.
11640 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];	/* NUL-terminated kernel copy of the name */
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* These two options are rejected for this call. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		/*
		 * NOTE(review): unlike setxattr/fsetxattr above, a too-long
		 * name (copyinstr's EPERM) is NOT remapped to ENAMETOOLONG
		 * here — confirm whether that asymmetry is intentional.
		 */
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;	/* vp carries the iocount from namei; released below */
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	/* Notify fseventsd only on a successful removal. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
11684
11685 /*
11686 * Remove an extended attribute.
11687 * XXX Code duplication here.
11688 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];	/* NUL-terminated kernel copy of the name */
	size_t namelen;
	int error;
#if CONFIG_FSE
	/*
	 * NOTE(review): ctx is only captured for the fsevent; the
	 * vn_removexattr call below makes its own vfs_context_current()
	 * call — both resolve the current thread's context.
	 */
	vfs_context_t ctx = vfs_context_current();
#endif

	/* NOFOLLOW is meaningless on an fd; the other two are rejected too. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		/* No EPERM->ENAMETOOLONG remap here (matches removexattr). */
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	/* Notify fseventsd only on a successful removal. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
11732
11733 /*
11734 * Retrieve the list of extended attribute names.
11735 * XXX Code duplication here.
11736 */
11737 int
11738 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
11739 {
11740 vnode_t vp;
11741 struct nameidata nd;
11742 vfs_context_t ctx = vfs_context_current();
11743 uio_t auio = NULL;
11744 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11745 size_t attrsize = 0;
11746 u_int32_t nameiflags;
11747 int error;
11748 char uio_buf[UIO_SIZEOF(1)];
11749
11750 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11751 return EINVAL;
11752 }
11753
11754 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11755 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
11756 if ((error = namei(&nd))) {
11757 return error;
11758 }
11759 vp = nd.ni_vp;
11760 nameidone(&nd);
11761 if (uap->namebuf != 0 && uap->bufsize > 0) {
11762 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
11763 &uio_buf[0], sizeof(uio_buf));
11764 uio_addiov(auio, uap->namebuf, uap->bufsize);
11765 }
11766
11767 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
11768
11769 vnode_put(vp);
11770 if (auio) {
11771 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11772 } else {
11773 *retval = (user_ssize_t)attrsize;
11774 }
11775 return error;
11776 }
11777
11778 /*
11779 * Retrieve the list of extended attribute names.
11780 * XXX Code duplication here.
11781 */
11782 int
11783 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
11784 {
11785 vnode_t vp;
11786 uio_t auio = NULL;
11787 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11788 size_t attrsize = 0;
11789 int error;
11790 char uio_buf[UIO_SIZEOF(1)];
11791
11792 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11793 return EINVAL;
11794 }
11795
11796 if ((error = file_vnode(uap->fd, &vp))) {
11797 return error;
11798 }
11799 if ((error = vnode_getwithref(vp))) {
11800 file_drop(uap->fd);
11801 return error;
11802 }
11803 if (uap->namebuf != 0 && uap->bufsize > 0) {
11804 auio = uio_createwithbuffer(1, 0, spacetype,
11805 UIO_READ, &uio_buf[0], sizeof(uio_buf));
11806 uio_addiov(auio, uap->namebuf, uap->bufsize);
11807 }
11808
11809 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
11810
11811 vnode_put(vp);
11812 file_drop(uap->fd);
11813 if (auio) {
11814 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11815 } else {
11816 *retval = (user_ssize_t)attrsize;
11817 }
11818 return error;
11819 }
11820
11821 static int
11822 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
11823 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
11824 {
11825 int error;
11826 struct mount *mp = NULL;
11827 vnode_t vp;
11828 int length;
11829 int bpflags;
11830 /* maximum number of times to retry build_path */
11831 unsigned int retries = 0x10;
11832
11833 if (bufsize > PAGE_SIZE) {
11834 return EINVAL;
11835 }
11836
11837 if (buf == NULL) {
11838 return ENOMEM;
11839 }
11840
11841 retry:
11842 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11843 error = ENOTSUP; /* unexpected failure */
11844 return ENOTSUP;
11845 }
11846
11847 unionget:
11848 if (objid == 2) {
11849 struct vfs_attr vfsattr;
11850 int use_vfs_root = TRUE;
11851
11852 VFSATTR_INIT(&vfsattr);
11853 VFSATTR_WANTED(&vfsattr, f_capabilities);
11854 if (!(options & FSOPT_ISREALFSID) &&
11855 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
11856 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
11857 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
11858 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
11859 use_vfs_root = FALSE;
11860 }
11861 }
11862
11863 if (use_vfs_root) {
11864 error = VFS_ROOT(mp, &vp, ctx);
11865 } else {
11866 error = VFS_VGET(mp, objid, &vp, ctx);
11867 }
11868 } else {
11869 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11870 }
11871
11872 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11873 /*
11874 * If the fileid isn't found and we're in a union
11875 * mount volume, then see if the fileid is in the
11876 * mounted-on volume.
11877 */
11878 struct mount *tmp = mp;
11879 mp = vnode_mount(tmp->mnt_vnodecovered);
11880 vfs_unbusy(tmp);
11881 if (vfs_busy(mp, LK_NOWAIT) == 0) {
11882 goto unionget;
11883 }
11884 } else {
11885 vfs_unbusy(mp);
11886 }
11887
11888 if (error) {
11889 return error;
11890 }
11891
11892 #if CONFIG_MACF
11893 error = mac_vnode_check_fsgetpath(ctx, vp);
11894 if (error) {
11895 vnode_put(vp);
11896 return error;
11897 }
11898 #endif
11899
11900 /* Obtain the absolute path to this vnode. */
11901 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11902 if (options & FSOPT_NOFIRMLINKPATH) {
11903 bpflags |= BUILDPATH_NO_FIRMLINK;
11904 }
11905 bpflags |= BUILDPATH_CHECK_MOVED;
11906 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11907 vnode_put(vp);
11908
11909 if (error) {
11910 /* there was a race building the path, try a few more times */
11911 if (error == EAGAIN) {
11912 --retries;
11913 if (retries > 0) {
11914 goto retry;
11915 }
11916
11917 error = ENOENT;
11918 }
11919 goto out;
11920 }
11921
11922 AUDIT_ARG(text, buf);
11923
11924 if (kdebug_enable) {
11925 long dbg_parms[NUMPARMS];
11926 int dbg_namelen;
11927
11928 dbg_namelen = (int)sizeof(dbg_parms);
11929
11930 if (length < dbg_namelen) {
11931 memcpy((char *)dbg_parms, buf, length);
11932 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11933
11934 dbg_namelen = length;
11935 } else {
11936 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11937 }
11938
11939 kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
11940 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
11941 }
11942
11943 *pathlen = (user_ssize_t)length; /* may be superseded by error */
11944
11945 out:
11946 return error;
11947 }
11948
11949 /*
11950 * Obtain the full pathname of a file system object by id.
11951 */
/*
 * Common implementation for fsgetpath() and fsgetpath_ext(): copy in the
 * fsid, resolve (fsid, objid) to a path via fsgetpath_internal(), and copy
 * the result out to the user buffer.  Returns the path length via retval.
 */
static int
fsgetpath_extended(user_addr_t buf, int bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* Only these two options are recognized. */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	if (bufsize > PAGE_SIZE || bufsize <= 0) {
		return EINVAL;
	}
	MALLOC(realpath, char *, bufsize, M_TEMP, M_WAITOK | M_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	/* Only the first fsid word (the volfs id) is needed for the lookup. */
	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	if (realpath) {
		FREE(realpath, M_TEMP);
	}
	return error;
}
11997
/* fsgetpath(2): legacy entry point, no option flags. */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
12004
/* fsgetpath_ext(2): same as fsgetpath(2) but passes caller options through. */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
12011
12012 /*
12013 * Common routine to handle various flavors of statfs data heading out
12014 * to user space.
12015 *
12016 * Returns: 0 Success
12017 * EFAULT
12018 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* MNTK_TYPENAME_OVERRIDE lets the mount report a substitute fs name. */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* A partial copy omits the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* MNTK_TYPENAME_OVERRIDE lets the mount report a substitute fs name. */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* A partial copy omits the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* Report the full (non-partial) structure size to the caller. */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
12140
12141 /*
12142 * copy stat structure into user_stat structure.
12143 */
/* Field-by-field copy of a kernel struct stat into the 64-bit user layout. */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE: timespec structs
	 * vs. split sec/nsec members; the values copied are the same. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12180
/* Field-by-field copy of a kernel struct stat into the 32-bit user layout. */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE: timespec structs
	 * vs. split sec/nsec members; the values copied are the same. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12217
12218 /*
12219 * copy stat64 structure into user_stat64 structure.
12220 */
/* Field-by-field copy of a kernel struct stat64 into the 64-bit user layout
 * (adds birthtime relative to the plain stat variants above). */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE: timespec structs
	 * vs. split sec/nsec members; the values copied are the same. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12261
/* Field-by-field copy of a kernel struct stat64 into the 32-bit user layout
 * (adds birthtime relative to the plain stat variants above). */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE: timespec structs
	 * vs. split sec/nsec members; the values copied are the same. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12302
12303 /*
12304 * Purge buffer cache for simulating cold starts
12305 */
/*
 * Per-vnode iterator callback for vfs_purge(): push and invalidate all of
 * the vnode's cached UBC pages.
 */
static int
vnode_purge_callback(struct vnode *vp, __unused void *cargs)
{
	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);

	return VNODE_RETURNED;
}
12313
/*
 * Per-mount iterator callback for vfs_purge(): run the vnode purge
 * callback over every vnode on the mount.
 */
static int
vfs_purge_callback(mount_t mp, __unused void * arg)
{
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	return VFS_RETURNED;
}
12321
12322 int
12323 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
12324 {
12325 if (!kauth_cred_issuser(kauth_cred_get())) {
12326 return EPERM;
12327 }
12328
12329 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
12330
12331 return 0;
12332 }
12333
12334 /*
12335 * gets the vnode associated with the (unnamed) snapshot directory
12336 * for a Filesystem. The snapshot directory vnode is returned with
12337 * an iocount on it.
12338 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/* Delegate to the filesystem; *sdvpp comes back with an iocount. */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
12344
12345 /*
12346 * Get the snapshot vnode.
12347 *
12348 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
12349 * needs nameidone() on ndp.
12350 *
12351 * If the snapshot vnode exists it is returned in ndp->ni_vp.
12352 *
12353 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
12354 * not needed.
12355 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Pre-clear outputs so the error path below can unwind uniformly. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* dirfd must refer to the root of a mounted volume. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/': the snapshot name must be a single component. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	FREE(name_buf, M_TEMP);
out:
	/* On failure, release any iocounts taken above and NULL the outputs. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
12458
12459 /*
12460 * create a filesystem snapshot (for supporting filesystems)
12461 *
12462 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
12463 * We get to the (unnamed) snapshot directory vnode and create the vnode
12464 * for the snapshot in it.
12465 *
12466 * Restrictions:
12467 *
12468 * a) Passed in name for snapshot cannot have slashes.
12469 * b) name can't be "." or ".."
12470 *
12471 * Since this requires superuser privileges, vnode_authorize calls are not
12472 * made.
12473 */
static int
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata namend;

	/* Resolve the volume root and snapshot directory; look up the name. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
	    OP_LINK, ctx);
	if (error) {
		return error;
	}

	if (namend.ni_vp) {
		/* The lookup found an existing snapshot with that name. */
		vnode_put(namend.ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr va;
		vnode_t vp = NULLVP;

		/* Create the snapshot as a regular file with mode 0. */
		VATTR_INIT(&va);
		VATTR_SET(&va, va_type, VREG);
		VATTR_SET(&va, va_mode, 0);

		/* Superuser-only path: skip authorization and inheritance. */
		error = vn_create(snapdvp, &vp, &namend, &va,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}
	}

	nameidone(&namend);
	vnode_put(snapdvp);
	vnode_put(rvp);
	return error;
}
12511
12512 /*
12513 * Delete a Filesystem snapshot
12514 *
12515 * get the vnode for the unnamed snapshot directory and the snapshot and
12516 * delete the snapshot.
12517 */
12518 static int
12519 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
12520 vfs_context_t ctx)
12521 {
12522 vnode_t rvp, snapdvp;
12523 int error;
12524 struct nameidata namend;
12525
12526 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
12527 OP_UNLINK, ctx);
12528 if (error) {
12529 goto out;
12530 }
12531
12532 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
12533 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
12534
12535 vnode_put(namend.ni_vp);
12536 nameidone(&namend);
12537 vnode_put(snapdvp);
12538 vnode_put(rvp);
12539 out:
12540 return error;
12541 }
12542
12543 /*
12544 * Revert a filesystem to a snapshot
12545 *
12546 * Marks the filesystem to revert to the given snapshot on next mount.
12547 */
static int
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		FREE(name_buf, M_TEMP);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		FREE(name_buf, M_TEMP);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		FREE(name_buf, M_TEMP);
		return error;
	}

	/* Hand the snapshot name to the FS as a componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	FREE(name_buf, M_TEMP);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Re-resolve everything: the snapshot vnode itself is needed now. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
12631
12632 /*
12633 * rename a Filesystem snapshot
12634 *
12635 * get the vnode for the unnamed snapshot directory and the snapshot and
12636 * rename the snapshot. This is a very specialised (and simple) case of
12637 * rename(2) (which has to deal with a lot more complications). It differs
12638 * slightly from rename(2) in that EEXIST is returned if the new name exists.
12639 */
static int
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Resolve the source snapshot (DELETE semantics, as for rename(2)'s
	 * "from" side); on success we hold iocounts on rvp, snapdvp and
	 * fromnd->ni_vp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	/* Copy in the destination name for validation and the "to" lookup. */
	MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/': if the loop stops early, the name contained one. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is treated as creating that name. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/*
	 * Look up the destination name relative to the snapshot directory
	 * (USEDVP: namei starts at tond->ni_dvp rather than a path root).
	 */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both source and destination live in snapdvp, so it is both dvps. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

	/* Unwind in the reverse order resources were acquired. */
out2:
	nameidone(tond);
out1:
	FREE(newname_buf, M_TEMP);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	FREE(__rename_data, M_TEMP);
	return error;
}
12734
12735 /*
12736 * Mount a Filesystem snapshot
12737 *
12738 * get the vnode for the unnamed snapshot directory and the snapshot and
12739 * mount the snapshot.
12740 */
static int
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
	    M_TEMP, M_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Resolve the snapshot to be mounted; on success we hold iocounts on
	 * rvp, snapdvp and the snapshot vnode itself (snapndp->ni_vp).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* Refuse if the source filesystem is gone or being torn down. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Disallow covering the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Hand the source mount and snapshot name to mount_common(), which
	 * performs the actual (read-only, KERNEL_MOUNT_SNAPSHOT) mount.
	 * Only MNT_DONTBROWSE is honored from the caller's flags.
	 */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);

	/* Unwind in the reverse order resources were acquired. */
out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	FREE(__snapshot_mount_data, M_TEMP);
	return error;
}
12817
12818 /*
12819 * Root from a snapshot of the filesystem
12820 *
12821 * Marks the filesystem to root from the given snapshot on next boot.
12822 */
12823 static int
12824 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
12825 vfs_context_t ctx)
12826 {
12827 int error;
12828 vnode_t rvp;
12829 mount_t mp;
12830 struct fs_snapshot_root_args root_data;
12831 struct componentname cnp;
12832 caddr_t name_buf;
12833 size_t name_len;
12834
12835 error = vnode_getfromfd(ctx, dirfd, &rvp);
12836 if (error) {
12837 return error;
12838 }
12839 mp = vnode_mount(rvp);
12840
12841 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12842 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12843 if (error) {
12844 FREE(name_buf, M_TEMP);
12845 vnode_put(rvp);
12846 return error;
12847 }
12848
12849 // XXX MAC checks ?
12850
12851 /*
12852 * Grab mount_iterref so that we can release the vnode,
12853 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
12854 */
12855 error = mount_iterref(mp, 0);
12856 vnode_put(rvp);
12857 if (error) {
12858 FREE(name_buf, M_TEMP);
12859 return error;
12860 }
12861
12862 memset(&cnp, 0, sizeof(cnp));
12863 cnp.cn_pnbuf = (char *)name_buf;
12864 cnp.cn_nameiop = LOOKUP;
12865 cnp.cn_flags = ISLASTCN | HASBUF;
12866 cnp.cn_pnlen = MAXPATHLEN;
12867 cnp.cn_nameptr = cnp.cn_pnbuf;
12868 cnp.cn_namelen = (int)name_len;
12869 root_data.sr_cnp = &cnp;
12870
12871 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
12872
12873 mount_iterdrop(mp);
12874 FREE(name_buf, M_TEMP);
12875
12876 return error;
12877 }
12878
12879 /*
12880 * FS snapshot operations dispatcher
12881 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot ops require the VFS snapshot privilege. */
	error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
	if (error) {
		return error;
	}

	/*
	 * Enforce user authorization for snapshot modification operations
	 */
	if ((uap->op != SNAPSHOT_OP_MOUNT) &&
	    (uap->op != SNAPSHOT_OP_ROOT)) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			/* No device vnode cached; resolve it from the mount-from name. */
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Non-superuser callers must also be authorized to write the
		 * underlying device — modifying snapshots modifies the volume.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0)) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		/* Unknown op, or SNAPSHOT_OP_ROOT on a non-root-snapshot config. */
		error = ENOSYS;
	}

	return error;
}