1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* @(#)hfs_readwrite.c 1.0
29 *
30 * (c) 1998-2001 Apple Computer, Inc. All Rights Reserved
31 *
32 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
33 *
34 */
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/resourcevar.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/filedesc.h>
42 #include <sys/stat.h>
43 #include <sys/buf.h>
44 #include <sys/buf_internal.h>
45 #include <sys/proc.h>
46 #include <sys/kauth.h>
47 #include <sys/vnode.h>
48 #include <sys/vnode_internal.h>
49 #include <sys/uio.h>
50 #include <sys/vfs_context.h>
51 #include <sys/fsevents.h>
52 #include <kern/kalloc.h>
53 #include <sys/disk.h>
54 #include <sys/sysctl.h>
55 #include <sys/fsctl.h>
56 #include <sys/mount_internal.h>
57 #include <sys/file_internal.h>
58
59 #include <libkern/OSDebug.h>
60
61 #include <miscfs/specfs/specdev.h>
62
63 #include <sys/ubc.h>
64 #include <sys/ubc_internal.h>
65
66 #include <vm/vm_pageout.h>
67 #include <vm/vm_kern.h>
68
69 #include <IOKit/IOBSD.h>
70
71 #include <sys/kdebug.h>
72
73 #include "hfs.h"
74 #include "hfs_attrlist.h"
75 #include "hfs_endian.h"
76 #include "hfs_fsctl.h"
77 #include "hfs_quota.h"
78 #include "hfscommon/headers/FileMgrInternal.h"
79 #include "hfscommon/headers/BTreesInternal.h"
80 #include "hfs_cnode.h"
81 #include "hfs_dbg.h"
82
83
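/*
 * A size qualifies for cluster I/O only if it is a multiple of 4KB and no
 * larger than MAXPHYSIO/2.
 */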
84 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
85
86 enum {
87 MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */
88 };
89
90 /* from bsd/hfs/hfs_vfsops.c */
91 extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
92
93 /* from hfs_hotfiles.c */
94 extern int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid,
95 uint8_t forktype, uint32_t *pinned);
96
97 static int hfs_clonefile(struct vnode *, int, int, int);
98 static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
99 static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
100
101 /* from bsd/hfs/hfs_vnops.c */
102 extern decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp);
103
104
105
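/*
 * The tunable below is published as kern.flush_cache_on_write (CTLFLAG_RW,
 * so it can be changed at runtime), e.g. from the shell:
 *
 *     sysctl -w kern.flush_cache_on_write=1
 */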
106 int flush_cache_on_write = 0;
107 SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
108
109 /*
110 * Read data from a file.
111 */
112 int
113 hfs_vnop_read(struct vnop_read_args *ap)
114 {
115 /*
116 struct vnop_read_args {
117 struct vnodeop_desc *a_desc;
118 vnode_t a_vp;
119 struct uio *a_uio;
120 int a_ioflag;
121 vfs_context_t a_context;
122 };
123 */
124
125 uio_t uio = ap->a_uio;
126 struct vnode *vp = ap->a_vp;
127 struct cnode *cp;
128 struct filefork *fp;
129 struct hfsmount *hfsmp;
130 off_t filesize;
131 off_t filebytes;
132 off_t start_resid = uio_resid(uio);
133 off_t offset = uio_offset(uio);
134 int retval = 0;
135 int took_truncate_lock = 0;
136 int io_throttle = 0;
137 int throttled_count = 0;
138
139 /* Preflight checks */
140 if (!vnode_isreg(vp)) {
141 /* can only read regular files */
142 if (vnode_isdir(vp))
143 return (EISDIR);
144 else
145 return (EPERM);
146 }
147 if (start_resid == 0)
148 return (0); /* Nothing left to do */
149 if (offset < 0)
150 return (EINVAL); /* can't read from a negative offset */
151
152 #if SECURE_KERNEL
153 if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
154 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
155 /* Don't allow unencrypted io request from user space */
156 return EPERM;
157 }
158 #endif
159
160 #if HFS_COMPRESSION
161 if (VNODE_IS_RSRC(vp)) {
162 if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
163 return 0;
164 }
165 /* otherwise read the resource fork normally */
166 } else {
167 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
168 if (compressed) {
169 retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
170 if (retval == 0 && !(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
171 (void) hfs_addhotfile(vp);
172 }
173 if (compressed) {
174 if (retval == 0) {
175 /* successful read, update the access time */
176 VTOC(vp)->c_touch_acctime = TRUE;
177
178 //
179 // compressed files are not traditional hot file candidates
180 // but they may be for CF (which ignores the ff_bytesread
181 // field)
182 //
183 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
184 VTOF(vp)->ff_bytesread = 0;
185 }
186 }
187 return retval;
188 }
189 /* otherwise the file was converted back to a regular file while we were reading it */
190 retval = 0;
191 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
192 int error;
193
194 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
195 if (error) {
196 return error;
197 }
198
199 }
200 }
201 #endif /* HFS_COMPRESSION */
202
203 cp = VTOC(vp);
204 fp = VTOF(vp);
205 hfsmp = VTOHFS(vp);
206
207 #if CONFIG_PROTECT
208 if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
209 goto exit;
210 }
211
212 #endif // CONFIG_PROTECT
213
214 /*
215 * If this read request originated from a syscall (as opposed to
216 * an in-kernel page fault or something), then set it up for
217 * throttle checks
218 */
219 if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
220 io_throttle = IO_RETURN_ON_THROTTLE;
221 }
222
223 read_again:
224
225 /* Protect against a size change. */
226 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
227 took_truncate_lock = 1;
228
229 filesize = fp->ff_size;
230 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
231
232 /*
233 * Check the file size. Note that per POSIX spec, we return 0 at
234 * file EOF, so attempting a read at an offset that is too big
235 * should just return 0 on HFS+. Since the return value was initialized
236 * to 0 above, we just jump to exit. HFS Standard has its own behavior.
237 */
238 if (offset > filesize) {
239 if ((hfsmp->hfs_flags & HFS_STANDARD) &&
240 (offset > (off_t)MAXHFSFILESIZE)) {
241 retval = EFBIG;
242 }
243 goto exit;
244 }
245
246 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START,
247 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
248
249 retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle);
250
251 cp->c_touch_acctime = TRUE;
252
253 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END,
254 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
255
256 /*
257 * Keep track of blocks read
258 */
259 if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
260 int took_cnode_lock = 0;
261 off_t bytesread;
262
263 bytesread = start_resid - uio_resid(uio);
264
265 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
266 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
267 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
268 took_cnode_lock = 1;
269 }
270 /*
271 * If this file hasn't been seen since the start of
272 * the current sampling period then start over.
273 */
274 if (cp->c_atime < hfsmp->hfc_timebase) {
275 struct timeval tv;
276
277 fp->ff_bytesread = bytesread;
278 microtime(&tv);
279 cp->c_atime = tv.tv_sec;
280 } else {
281 fp->ff_bytesread += bytesread;
282 }
283
284 if (!(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
285 //
286 // We don't add hotfiles for processes doing IO_EVTONLY I/O
287 // on the assumption that they're system processes such as
288 // mdworker which scan everything in the system (and thus
289 // do not represent user-initiated access to files)
290 //
291 (void) hfs_addhotfile(vp);
292 }
293 if (took_cnode_lock)
294 hfs_unlock(cp);
295 }
296 exit:
297 if (took_truncate_lock) {
298 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
299 }
300 if (retval == EAGAIN) {
301 throttle_lowpri_io(1);
302 throttled_count++;
303
304 retval = 0;
305 goto read_again;
306 }
307 if (throttled_count) {
308 throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
309 }
310 return (retval);
311 }
312
313 /*
314 * Ideally, this wouldn't be necessary; the cluster code should be
315 * able to handle this on the read-side. See <rdar://20420068>.
316 */
317 static errno_t hfs_zero_eof_page(vnode_t vp, off_t zero_up_to)
318 {
319 assert(VTOC(vp)->c_lockowner != current_thread());
320 assert(VTOC(vp)->c_truncatelockowner == current_thread());
321
322 struct filefork *fp = VTOF(vp);
323
324 if (!(fp->ff_size & PAGE_MASK_64) || zero_up_to <= fp->ff_size) {
325 // Nothing to do
326 return 0;
327 }
328
329 zero_up_to = MIN(zero_up_to, (off_t)round_page_64(fp->ff_size));
330
331 /* N.B. At present, @zero_up_to is not important because the cluster
332 code will always zero up to the end of the page anyway. */
333 return cluster_write(vp, NULL, fp->ff_size, zero_up_to,
334 fp->ff_size, 0, IO_HEADZEROFILL);
335 }
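/*
 * Example (with 4KB pages): if ff_size is 0x1800 (EOF in the middle of a
 * page) and zero_up_to is 0x3000, the routine above clamps zero_up_to to
 * round_page_64(0x1800) == 0x2000 and asks the cluster layer to zero-fill
 * from 0x1800 up to 0x2000, i.e. the remainder of the EOF page.
 */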
336
337 /*
338 * Write data to a file.
339 */
340 int
341 hfs_vnop_write(struct vnop_write_args *ap)
342 {
343 uio_t uio = ap->a_uio;
344 struct vnode *vp = ap->a_vp;
345 struct cnode *cp;
346 struct filefork *fp;
347 struct hfsmount *hfsmp;
348 kauth_cred_t cred = NULL;
349 off_t origFileSize;
350 off_t writelimit;
351 off_t bytesToAdd = 0;
352 off_t actualBytesAdded;
353 off_t filebytes;
354 off_t offset;
355 ssize_t resid;
356 int eflags;
357 int ioflag = ap->a_ioflag;
358 int retval = 0;
359 int lockflags;
360 int cnode_locked = 0;
361 int partialwrite = 0;
362 int do_snapshot = 1;
363 time_t orig_ctime=VTOC(vp)->c_ctime;
364 int took_truncate_lock = 0;
365 int io_return_on_throttle = 0;
366 int throttled_count = 0;
367
368 #if HFS_COMPRESSION
369 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
370 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
371 switch(state) {
372 case FILE_IS_COMPRESSED:
373 return EACCES;
374 case FILE_IS_CONVERTING:
375 /* if FILE_IS_CONVERTING, we allow writes but do not
376 bother with snapshots or else we will deadlock.
377 */
378 do_snapshot = 0;
379 break;
380 default:
381 printf("invalid state %d for compressed file\n", state);
382 /* fall through */
383 }
384 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
385 int error;
386
387 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
388 if (error != 0) {
389 return error;
390 }
391 }
392
393 if (do_snapshot) {
394 check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
395 }
396
397 #endif
398
399 #if SECURE_KERNEL
400 if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
401 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
402 /* Don't allow unencrypted io request from user space */
403 return EPERM;
404 }
405 #endif
406
407 resid = uio_resid(uio);
408 offset = uio_offset(uio);
409
410 if (offset < 0)
411 return (EINVAL);
412 if (resid == 0)
413 return (E_NONE);
414 if (!vnode_isreg(vp))
415 return (EPERM); /* Can only write regular files */
416
417 cp = VTOC(vp);
418 fp = VTOF(vp);
419 hfsmp = VTOHFS(vp);
420
421 #if CONFIG_PROTECT
422 if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
423 goto exit;
424 }
425 #endif
426
427 eflags = kEFDeferMask; /* defer file block allocations */
428 #if HFS_SPARSE_DEV
429 /*
430 * When the underlying device is sparse and free space is low
431 * (< 2048 allocation blocks, roughly 8MB at a 4KB block size), stop doing delayed allocations
432 * and begin doing synchronous I/O.
433 */
434 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
435 (hfs_freeblks(hfsmp, 0) < 2048)) {
436 eflags &= ~kEFDeferMask;
437 ioflag |= IO_SYNC;
438 }
439 #endif /* HFS_SPARSE_DEV */
440
441 if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) ==
442 (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
443 io_return_on_throttle = IO_RETURN_ON_THROTTLE;
444 }
445
446 again:
447 /*
448 * Protect against a size change.
449 *
450 * Note: If took_truncate_lock is true, then we previously got the lock shared
451 * but needed to upgrade to exclusive. So try getting it exclusive from the
452 * start.
453 */
454 if (ioflag & IO_APPEND || took_truncate_lock) {
455 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
456 }
457 else {
458 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
459 }
460 took_truncate_lock = 1;
461
462 /* Update UIO */
463 if (ioflag & IO_APPEND) {
464 uio_setoffset(uio, fp->ff_size);
465 offset = fp->ff_size;
466 }
467 if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
468 retval = EPERM;
469 goto exit;
470 }
471
472 cred = vfs_context_ucred(ap->a_context);
473 if (cred && suser(cred, NULL) != 0)
474 eflags |= kEFReserveMask;
475
476 origFileSize = fp->ff_size;
477 writelimit = offset + resid;
478
479 /*
480 * We may need an exclusive truncate lock for several reasons, all
481 * of which are because we may be writing to a (portion of a) block
482 * for the first time, and we need to make sure no readers see the
483 * prior, uninitialized contents of the block. The cases are:
484 *
485 * 1. We have unallocated (delayed allocation) blocks. We may be
486 * allocating new blocks to the file and writing to them.
487 * (A more precise check would be whether the range we're writing
488 * to contains delayed allocation blocks.)
489 * 2. We need to extend the file. The bytes between the old EOF
490 * and the new EOF are not yet initialized. This is important
491 * even if we're not allocating new blocks to the file. If the
492 * old EOF and new EOF are in the same block, we still need to
493 * protect that range of bytes until they are written for the
494 * first time.
495 *
496 * If we had a shared lock with the above cases, we need to try to upgrade
497 * to an exclusive lock. If the upgrade fails, we will lose the shared
498 * lock, and will need to take the truncate lock again; the took_truncate_lock
499 * flag will still be set, causing us to try for an exclusive lock next time.
500 */
501 if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
502 ((fp->ff_unallocblocks != 0) ||
503 (writelimit > origFileSize))) {
504 if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
505 /*
506 * Lock upgrade failed and we lost our shared lock, try again.
507 * Note: we do not set took_truncate_lock=0 here. Leaving it
508 * set to 1 will cause us to try to get the lock exclusive.
509 */
510 goto again;
511 }
512 else {
513 /* Store the owner in the c_truncatelockowner field if we successfully upgrade */
514 cp->c_truncatelockowner = current_thread();
515 }
516 }
517
518 if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
519 goto exit;
520 }
521 cnode_locked = 1;
522
523 filebytes = hfs_blk_to_bytes(fp->ff_blocks, hfsmp->blockSize);
524
525 if (offset > filebytes
526 && (hfs_blk_to_bytes(hfs_freeblks(hfsmp, ISSET(eflags, kEFReserveMask)),
527 hfsmp->blockSize) < offset - filebytes)) {
528 retval = ENOSPC;
529 goto exit;
530 }
531
532 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START,
533 (int)offset, uio_resid(uio), (int)fp->ff_size,
534 (int)filebytes, 0);
535
536 /* Check if we do not need to extend the file */
537 if (writelimit <= filebytes) {
538 goto sizeok;
539 }
540
541 bytesToAdd = writelimit - filebytes;
542
543 #if QUOTA
544 retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
545 cred, 0);
546 if (retval)
547 goto exit;
548 #endif /* QUOTA */
549
550 if (hfs_start_transaction(hfsmp) != 0) {
551 retval = EINVAL;
552 goto exit;
553 }
554
555 while (writelimit > filebytes) {
556 bytesToAdd = writelimit - filebytes;
557
558 /* Protect extents b-tree and allocation bitmap */
559 lockflags = SFL_BITMAP;
560 if (overflow_extents(fp))
561 lockflags |= SFL_EXTENTS;
562 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
563
564 /* Files that are changing size are not hot file candidates. */
565 if (hfsmp->hfc_stage == HFC_RECORDING) {
566 fp->ff_bytesread = 0;
567 }
568 retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
569 0, eflags, &actualBytesAdded));
570
571 hfs_systemfile_unlock(hfsmp, lockflags);
572
573 if ((actualBytesAdded == 0) && (retval == E_NONE))
574 retval = ENOSPC;
575 if (retval != E_NONE)
576 break;
577 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
578 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE,
579 (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
580 }
581 (void) hfs_update(vp, 0);
582 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
583 (void) hfs_end_transaction(hfsmp);
584
585 /*
586 * If we didn't grow the file enough, try a partial write.
587 * POSIX expects this behavior.
588 */
589 if ((retval == ENOSPC) && (filebytes > offset)) {
590 retval = 0;
591 partialwrite = 1;
592 uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
593 resid -= bytesToAdd;
594 writelimit = filebytes;
595 }
596 sizeok:
597 if (retval == E_NONE) {
598 off_t filesize;
599 off_t head_off;
600 int lflag;
601
602 if (writelimit > fp->ff_size) {
603 filesize = writelimit;
604 struct timeval tv;
605 rl_add(fp->ff_size, writelimit - 1 , &fp->ff_invalidranges);
606 microuptime(&tv);
607 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
608 } else
609 filesize = fp->ff_size;
610
611 lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
612
613 /*
614 * We no longer use IO_HEADZEROFILL or IO_TAILZEROFILL (except
615 * for one case below). For the regions that lie before the
616 * beginning and after the end of this write that are in the
617 * same page, we let the cluster code handle zeroing that out
618 * if necessary. If those areas are not cached, the cluster
619 * code will try and read those areas in, and in the case
620 * where those regions have never been written to,
621 * hfs_vnop_blockmap will consult the invalid ranges and then
622 * indicate that. The cluster code will zero out those areas.
623 */
624
625 head_off = trunc_page_64(offset);
626
627 if (head_off < offset && head_off >= fp->ff_size) {
628 /*
629 * The first page is beyond current EOF, so as an
630 * optimisation, we can pass IO_HEADZEROFILL.
631 */
632 lflag |= IO_HEADZEROFILL;
633 }
634
635 hfs_unlock(cp);
636 cnode_locked = 0;
637
638 /*
639 * We need to tell UBC the fork's new size BEFORE calling
640 * cluster_write, in case any of the new pages need to be
641 * paged out before cluster_write completes (which does happen
642 * in embedded systems due to extreme memory pressure).
643 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
644 * will be, so that it can pass that on to cluster_pageout, and
645 * allow those pageouts.
646 *
647 * We don't update ff_size yet since we don't want pageins to
648 * be able to see uninitialized data between the old and new
649 * EOF, until cluster_write has completed and initialized that
650 * part of the file.
651 *
652 * The vnode pager relies on the file size last given to UBC via
653 * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or
654 * ff_size (whichever is larger). NOTE: ff_new_size is always
655 * zero, unless we are extending the file via write.
656 */
657 if (filesize > fp->ff_size) {
658 retval = hfs_zero_eof_page(vp, offset);
659 if (retval)
660 goto exit;
661 fp->ff_new_size = filesize;
662 ubc_setsize(vp, filesize);
663 }
664 retval = cluster_write(vp, uio, fp->ff_size, filesize, head_off,
665 0, lflag | IO_NOZERODIRTY | io_return_on_throttle);
666 if (retval) {
667 fp->ff_new_size = 0; /* no longer extending; use ff_size */
668
669 if (retval == EAGAIN) {
670 /*
671 * EAGAIN indicates that we still have I/O to do, but
672 * that we now need to be throttled
673 */
674 if (resid != uio_resid(uio)) {
675 /*
676 * did manage to do some I/O before returning EAGAIN
677 */
678 resid = uio_resid(uio);
679 offset = uio_offset(uio);
680
681 cp->c_touch_chgtime = TRUE;
682 cp->c_touch_modtime = TRUE;
683 hfs_incr_gencount(cp);
684 }
685 if (filesize > fp->ff_size) {
686 /*
687 * we called ubc_setsize before the call to
688 * cluster_write... since we only partially
689 * completed the I/O, we need to
690 * re-adjust our idea of the filesize based
691 * on our interim EOF
692 */
693 ubc_setsize(vp, offset);
694
695 fp->ff_size = offset;
696 }
697 goto exit;
698 }
699 if (filesize > origFileSize) {
700 ubc_setsize(vp, origFileSize);
701 }
702 goto ioerr_exit;
703 }
704
705 if (filesize > origFileSize) {
706 fp->ff_size = filesize;
707
708 /* Files that are changing size are not hot file candidates. */
709 if (hfsmp->hfc_stage == HFC_RECORDING) {
710 fp->ff_bytesread = 0;
711 }
712 }
713 fp->ff_new_size = 0; /* ff_size now has the correct size */
714 }
715 if (partialwrite) {
716 uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
717 resid += bytesToAdd;
718 }
719
720 // XXXdbg - see radar 4871353 for more info
721 {
722 if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
723 hfs_flush(hfsmp, HFS_FLUSH_CACHE);
724 }
725 }
726
727 ioerr_exit:
728 if (!cnode_locked) {
729 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
730 cnode_locked = 1;
731 }
732
733 if (resid > uio_resid(uio)) {
734 cp->c_touch_chgtime = TRUE;
735 cp->c_touch_modtime = TRUE;
736 hfs_incr_gencount(cp);
737
738 /*
739 * If we successfully wrote any data and we are not the superuser,
740 * we clear the setuid and setgid bits as a precaution against
741 * tampering.
742 */
743 if (cp->c_mode & (S_ISUID | S_ISGID)) {
744 cred = vfs_context_ucred(ap->a_context);
745 if (cred && suser(cred, NULL)) {
746 cp->c_mode &= ~(S_ISUID | S_ISGID);
747 }
748 }
749 }
750 if (retval) {
751 if (ioflag & IO_UNIT) {
752 (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
753 0, ap->a_context);
754 uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
755 uio_setresid(uio, resid);
756 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
757 }
758 } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio)))
759 retval = hfs_update(vp, 0);
760
761 /* Updating vcbWrCnt doesn't need to be atomic. */
762 hfsmp->vcbWrCnt++;
763
764 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END,
765 (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
766 exit:
767 if (retval && took_truncate_lock
768 && cp->c_truncatelockowner == current_thread()) {
769 fp->ff_new_size = 0;
770 rl_remove(fp->ff_size, RL_INFINITY, &fp->ff_invalidranges);
771 }
772
773 if (cnode_locked)
774 hfs_unlock(cp);
775
776 if (took_truncate_lock) {
777 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
778 }
779 if (retval == EAGAIN) {
780 throttle_lowpri_io(1);
781 throttled_count++;
782
783 retval = 0;
784 goto again;
785 }
786 if (throttled_count) {
787 throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
788 }
789 return (retval);
790 }
791
792 /* support for the "bulk-access" fcntl */
793
794 #define CACHE_LEVELS 16
795 #define NUM_CACHE_ENTRIES (64*16)
796 #define PARENT_IDS_FLAG 0x100
797
798 struct access_cache {
799 int numcached;
800 int cachehits; /* these two for statistics gathering */
801 int lookups;
802 unsigned int *acache;
803 unsigned char *haveaccess;
804 };
805
806 struct access_t {
807 uid_t uid; /* IN: effective user id */
808 short flags; /* IN: access requested (i.e. R_OK) */
809 short num_groups; /* IN: number of groups user belongs to */
810 int num_files; /* IN: number of files to process */
811 int *file_ids; /* IN: array of file ids */
812 gid_t *groups; /* IN: array of groups */
813 short *access; /* OUT: access info for each file (0 for 'has access') */
814 } __attribute__((unavailable)); // this structure is for reference purposes only
815
816 struct user32_access_t {
817 uid_t uid; /* IN: effective user id */
818 short flags; /* IN: access requested (i.e. R_OK) */
819 short num_groups; /* IN: number of groups user belongs to */
820 int num_files; /* IN: number of files to process */
821 user32_addr_t file_ids; /* IN: array of file ids */
822 user32_addr_t groups; /* IN: array of groups */
823 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
824 };
825
826 struct user64_access_t {
827 uid_t uid; /* IN: effective user id */
828 short flags; /* IN: access requested (i.e. R_OK) */
829 short num_groups; /* IN: number of groups user belongs to */
830 int num_files; /* IN: number of files to process */
831 user64_addr_t file_ids; /* IN: array of file ids */
832 user64_addr_t groups; /* IN: array of groups */
833 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
834 };
835
836
837 // these are the "extended" versions of the above structures
838 // note that it is crucial that they be sized differently than
839 // the regular versions
840 struct ext_access_t {
841 uint32_t flags; /* IN: access requested (i.e. R_OK) */
842 uint32_t num_files; /* IN: number of files to process */
843 uint32_t map_size; /* IN: size of the bit map */
844 uint32_t *file_ids; /* IN: Array of file ids */
845 char *bitmap; /* OUT: hash-bitmap of interesting directory ids */
846 short *access; /* OUT: access info for each file (0 for 'has access') */
847 uint32_t num_parents; /* future use */
848 cnid_t *parents; /* future use */
849 } __attribute__((unavailable)); // this structure is for reference purposes only
850
851 struct user32_ext_access_t {
852 uint32_t flags; /* IN: access requested (i.e. R_OK) */
853 uint32_t num_files; /* IN: number of files to process */
854 uint32_t map_size; /* IN: size of the bit map */
855 user32_addr_t file_ids; /* IN: Array of file ids */
856 user32_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */
857 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
858 uint32_t num_parents; /* future use */
859 user32_addr_t parents; /* future use */
860 };
861
862 struct user64_ext_access_t {
863 uint32_t flags; /* IN: access requested (i.e. R_OK) */
864 uint32_t num_files; /* IN: number of files to process */
865 uint32_t map_size; /* IN: size of the bit map */
866 user64_addr_t file_ids; /* IN: array of file ids */
867 user64_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */
868 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
869 uint32_t num_parents;/* future use */
870 user64_addr_t parents;/* future use */
871 };
872
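/*
 * Rough userspace sketch (illustrative; assumes the user-visible ext_access_t
 * layout and the HFS_EXT_BULKACCESS selector exported by hfs_fsctl.h):
 *
 *     uint32_t ids[NFILES];      // catalog node ids to test
 *     short results[NFILES];     // 0 == caller has access
 *     struct ext_access_t args = {
 *         .flags = R_OK, .num_files = NFILES,
 *         .file_ids = ids, .access = results,
 *         .map_size = 0, .bitmap = NULL,
 *         .num_parents = 0, .parents = NULL,
 *     };
 *     fsctl("/Volumes/MyVolume", HFS_EXT_BULKACCESS, &args, 0);
 *
 * do_bulk_access_check() below copies in the file ids, walks each parent
 * chain with do_access_check(), and copies the per-file results back out.
 */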
873
874 /*
875 * Perform a binary search for the given parent_id. Return value is
876 * the index if there is a match. If no_match_indexp is non-NULL, it
877 * will be set to the index at which to insert the item (even if no
878 * match was found).
879 */
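/*
 * Example: for array = {2, 5, 9} and hi == 2, searching for 5 returns
 * index 1; searching for 7 returns -1 and sets *no_match_indexp to 2
 * (where 7 would be inserted).
 */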
880 static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
881 {
882 int index=-1;
883 unsigned int lo=0;
884
885 do {
886 unsigned int mid = ((hi - lo)/2) + lo;
887 unsigned int this_id = array[mid];
888
889 if (parent_id == this_id) {
890 hi = mid;
891 break;
892 }
893
894 if (parent_id < this_id) {
895 hi = mid;
896 continue;
897 }
898
899 if (parent_id > this_id) {
900 lo = mid + 1;
901 continue;
902 }
903 } while(lo < hi);
904
905 /* check if lo and hi converged on the match */
906 if (parent_id == array[hi]) {
907 index = hi;
908 }
909
910 if (no_match_indexp) {
911 *no_match_indexp = hi;
912 }
913
914 return index;
915 }
916
917
918 static int
919 lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
920 {
921 unsigned int hi;
922 int matches = 0;
923 int index, no_match_index;
924
925 if (cache->numcached == 0) {
926 *indexp = 0;
927 return 0; // table is empty, so insert at index=0 and report no match
928 }
929
930 if (cache->numcached > NUM_CACHE_ENTRIES) {
931 cache->numcached = NUM_CACHE_ENTRIES;
932 }
933
934 hi = cache->numcached - 1;
935
936 index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
937
938 /* if no existing entry found, find index for new one */
939 if (index == -1) {
940 index = no_match_index;
941 matches = 0;
942 } else {
943 matches = 1;
944 }
945
946 *indexp = index;
947 return matches;
948 }
949
950 /*
951 * Add a node to the access_cache at the given index (or do a lookup first
952 * to find the index if -1 is passed in). We currently do a replace rather
953 * than an insert if the cache is full.
954 */
955 static void
956 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
957 {
958 int lookup_index = -1;
959
960 /* need to do a lookup first if -1 passed for index */
961 if (index == -1) {
962 if (lookup_bucket(cache, &lookup_index, nodeID)) {
963 if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
964 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
965 cache->haveaccess[lookup_index] = access;
966 }
967
968 /* mission accomplished */
969 return;
970 } else {
971 index = lookup_index;
972 }
973
974 }
975
976 /* if the cache is full, do a replace rather than an insert */
977 if (cache->numcached >= NUM_CACHE_ENTRIES) {
978 cache->numcached = NUM_CACHE_ENTRIES-1;
979
980 if (index > cache->numcached) {
981 index = cache->numcached;
982 }
983 }
984
985 if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
986 index++;
987 }
988
989 if (index >= 0 && index < cache->numcached) {
990 /* only do bcopy if we're inserting */
991 bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
992 bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
993 }
994
995 cache->acache[index] = nodeID;
996 cache->haveaccess[index] = access;
997 cache->numcached++;
998 }
999
1000
1001 struct cinfo {
1002 uid_t uid;
1003 gid_t gid;
1004 mode_t mode;
1005 cnid_t parentcnid;
1006 u_int16_t recflags;
1007 };
1008
1009 static int
1010 snoop_callback(const cnode_t *cp, void *arg)
1011 {
1012 struct cinfo *cip = arg;
1013
1014 cip->uid = cp->c_uid;
1015 cip->gid = cp->c_gid;
1016 cip->mode = cp->c_mode;
1017 cip->parentcnid = cp->c_parentcnid;
1018 cip->recflags = cp->c_attr.ca_recflags;
1019
1020 return (0);
1021 }
1022
1023 /*
1024 * Look up the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
1025 * isn't in core, then go to the catalog.
1026 */
1027 static int
1028 do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
1029 struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
1030 {
1031 int error = 0;
1032
1033 /* if this id matches the one the fsctl was called with, skip the lookup */
1034 if (cnid == skip_cp->c_cnid) {
1035 cnattrp->ca_uid = skip_cp->c_uid;
1036 cnattrp->ca_gid = skip_cp->c_gid;
1037 cnattrp->ca_mode = skip_cp->c_mode;
1038 cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
1039 keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
1040 } else {
1041 struct cinfo c_info;
1042
1043 /* otherwise, check the cnode hash in case the file/dir is in core */
1044 error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info);
1045
1046 if (error == EACCES) {
1047 // File is deleted
1048 return ENOENT;
1049 } else if (!error) {
1050 cnattrp->ca_uid = c_info.uid;
1051 cnattrp->ca_gid = c_info.gid;
1052 cnattrp->ca_mode = c_info.mode;
1053 cnattrp->ca_recflags = c_info.recflags;
1054 keyp->hfsPlus.parentID = c_info.parentcnid;
1055 } else {
1056 int lockflags;
1057
1058 if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
1059 throttle_lowpri_io(1);
1060
1061 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
1062
1063 /* lookup this cnid in the catalog */
1064 error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
1065
1066 hfs_systemfile_unlock(hfsmp, lockflags);
1067
1068 cache->lookups++;
1069 }
1070 }
1071
1072 return (error);
1073 }
1074
1075
1076 /*
1077 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
1078 * up to CACHE_LEVELS as we progress towards the root.
1079 */
1080 static int
1081 do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
1082 struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
1083 struct vfs_context *my_context,
1084 char *bitmap,
1085 uint32_t map_size,
1086 cnid_t* parents,
1087 uint32_t num_parents)
1088 {
1089 int myErr = 0;
1090 int myResult;
1091 HFSCatalogNodeID thisNodeID;
1092 unsigned int myPerms;
1093 struct cat_attr cnattr;
1094 int cache_index = -1, scope_index = -1, scope_idx_start = -1;
1095 CatalogKey catkey;
1096
1097 int i = 0, ids_to_cache = 0;
1098 int parent_ids[CACHE_LEVELS];
1099
1100 thisNodeID = nodeID;
1101 while (thisNodeID >= kRootDirID) {
1102 myResult = 0; /* default to "no access" */
1103
1104 /* check the cache before resorting to hitting the catalog */
1105
1106 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
1107 * to look any further after hitting cached dir */
1108
1109 if (lookup_bucket(cache, &cache_index, thisNodeID)) {
1110 cache->cachehits++;
1111 myErr = cache->haveaccess[cache_index];
1112 if (scope_index != -1) {
1113 if (myErr == ESRCH) {
1114 myErr = 0;
1115 }
1116 } else {
1117 scope_index = 0; // so we'll just use the cache result
1118 scope_idx_start = ids_to_cache;
1119 }
1120 myResult = (myErr == 0) ? 1 : 0;
1121 goto ExitThisRoutine;
1122 }
1123
1124
1125 if (parents) {
1126 int tmp;
1127 tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
1128 if (scope_index == -1)
1129 scope_index = tmp;
1130 if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
1131 scope_idx_start = ids_to_cache;
1132 }
1133 }
1134
1135 /* remember which parents we want to cache */
1136 if (ids_to_cache < CACHE_LEVELS) {
1137 parent_ids[ids_to_cache] = thisNodeID;
1138 ids_to_cache++;
1139 }
1140 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
1141 if (bitmap && map_size) {
1142 bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
1143 }
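/*
 * Example of the hashing above: with thisNodeID == 1234 and map_size == 100,
 * this sets bit (1234 & 7) == 2 in bitmap[(1234/8) % 100] == bitmap[54].
 */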
1144
1145
1146 /* do the lookup (checks the cnode hash, then the catalog) */
1147 myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
1148 if (myErr) {
1149 goto ExitThisRoutine; /* no access */
1150 }
1151
1152 /* Root always gets access. */
1153 if (suser(myp_ucred, NULL) == 0) {
1154 thisNodeID = catkey.hfsPlus.parentID;
1155 myResult = 1;
1156 continue;
1157 }
1158
1159 // if the thing has acl's, do the full permission check
1160 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1161 struct vnode *vp;
1162
1163 /* get the vnode for this cnid */
1164 myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
1165 if ( myErr ) {
1166 myResult = 0;
1167 goto ExitThisRoutine;
1168 }
1169
1170 thisNodeID = VTOC(vp)->c_parentcnid;
1171
1172 hfs_unlock(VTOC(vp));
1173
1174 if (vnode_vtype(vp) == VDIR) {
1175 myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
1176 } else {
1177 myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
1178 }
1179
1180 vnode_put(vp);
1181 if (myErr) {
1182 myResult = 0;
1183 goto ExitThisRoutine;
1184 }
1185 } else {
1186 unsigned int flags;
1187 int mode = cnattr.ca_mode & S_IFMT;
1188 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);
1189
1190 if (mode == S_IFDIR) {
1191 flags = R_OK | X_OK;
1192 } else {
1193 flags = R_OK;
1194 }
1195 if ( (myPerms & flags) != flags) {
1196 myResult = 0;
1197 myErr = EACCES;
1198 goto ExitThisRoutine; /* no access */
1199 }
1200
1201 /* up the hierarchy we go */
1202 thisNodeID = catkey.hfsPlus.parentID;
1203 }
1204 }
1205
1206 /* if here, we have access to this node */
1207 myResult = 1;
1208
1209 ExitThisRoutine:
1210 if (parents && myErr == 0 && scope_index == -1) {
1211 myErr = ESRCH;
1212 }
1213
1214 if (myErr) {
1215 myResult = 0;
1216 }
1217 *err = myErr;
1218
1219 /* cache the parent directory(ies) */
1220 for (i = 0; i < ids_to_cache; i++) {
1221 if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
1222 add_node(cache, -1, parent_ids[i], ESRCH);
1223 } else {
1224 add_node(cache, -1, parent_ids[i], myErr);
1225 }
1226 }
1227
1228 return (myResult);
1229 }
1230
1231 static int
1232 do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
1233 struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
1234 {
1235 boolean_t is64bit;
1236
1237 /*
1238 * NOTE: on entry, the vnode has an io_ref. In case this vnode
1239 * happens to be in our list of file_ids, we'll note it and
1240 * avoid calling hfs_chashget_nowait() on that id as that
1241 * will cause a "locking against myself" panic.
1242 */
1243 Boolean check_leaf = true;
1244
1245 struct user64_ext_access_t *user_access_structp;
1246 struct user64_ext_access_t tmp_user_access;
1247 struct access_cache cache;
1248
1249 int error = 0, prev_parent_check_ok=1;
1250 unsigned int i;
1251
1252 short flags;
1253 unsigned int num_files = 0;
1254 int map_size = 0;
1255 int num_parents = 0;
1256 int *file_ids=NULL;
1257 short *access=NULL;
1258 char *bitmap=NULL;
1259 cnid_t *parents=NULL;
1260 int leaf_index;
1261
1262 cnid_t cnid;
1263 cnid_t prevParent_cnid = 0;
1264 unsigned int myPerms;
1265 short myaccess = 0;
1266 struct cat_attr cnattr;
1267 CatalogKey catkey;
1268 struct cnode *skip_cp = VTOC(vp);
1269 kauth_cred_t cred = vfs_context_ucred(context);
1270 proc_t p = vfs_context_proc(context);
1271
1272 is64bit = proc_is64bit(p);
1273
1274 /* initialize the local cache and buffers */
1275 cache.numcached = 0;
1276 cache.cachehits = 0;
1277 cache.lookups = 0;
1278 cache.acache = NULL;
1279 cache.haveaccess = NULL;
1280
1281 /* struct copyin done during dispatch... need to copy file_id array separately */
1282 if (ap->a_data == NULL) {
1283 error = EINVAL;
1284 goto err_exit_bulk_access;
1285 }
1286
1287 if (is64bit) {
1288 if (arg_size != sizeof(struct user64_ext_access_t)) {
1289 error = EINVAL;
1290 goto err_exit_bulk_access;
1291 }
1292
1293 user_access_structp = (struct user64_ext_access_t *)ap->a_data;
1294
1295 } else if (arg_size == sizeof(struct user32_access_t)) {
1296 struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
1297
1298 // convert an old style bulk-access struct to the new style
1299 tmp_user_access.flags = accessp->flags;
1300 tmp_user_access.num_files = accessp->num_files;
1301 tmp_user_access.map_size = 0;
1302 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1303 tmp_user_access.bitmap = USER_ADDR_NULL;
1304 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1305 tmp_user_access.num_parents = 0;
1306 user_access_structp = &tmp_user_access;
1307
1308 } else if (arg_size == sizeof(struct user32_ext_access_t)) {
1309 struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
1310
1311 // up-cast from a 32-bit version of the struct
1312 tmp_user_access.flags = accessp->flags;
1313 tmp_user_access.num_files = accessp->num_files;
1314 tmp_user_access.map_size = accessp->map_size;
1315 tmp_user_access.num_parents = accessp->num_parents;
1316
1317 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1318 tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
1319 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1320 tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);
1321
1322 user_access_structp = &tmp_user_access;
1323 } else {
1324 error = EINVAL;
1325 goto err_exit_bulk_access;
1326 }
1327
1328 map_size = user_access_structp->map_size;
1329
1330 num_files = user_access_structp->num_files;
1331
1332 num_parents= user_access_structp->num_parents;
1333
1334 if (num_files < 1) {
1335 goto err_exit_bulk_access;
1336 }
1337 if (num_files > 1024) {
1338 error = EINVAL;
1339 goto err_exit_bulk_access;
1340 }
1341
1342 if (num_parents > 1024) {
1343 error = EINVAL;
1344 goto err_exit_bulk_access;
1345 }
1346
1347 file_ids = (int *) kalloc(sizeof(int) * num_files);
1348 access = (short *) kalloc(sizeof(short) * num_files);
1349 if (map_size) {
1350 bitmap = (char *) kalloc(sizeof(char) * map_size);
1351 }
1352
1353 if (num_parents) {
1354 parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
1355 }
1356
1357 cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
1358 cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1359
1360 if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
1361 if (file_ids) {
1362 kfree(file_ids, sizeof(int) * num_files);
1363 }
1364 if (bitmap) {
1365 kfree(bitmap, sizeof(char) * map_size);
1366 }
1367 if (access) {
1368 kfree(access, sizeof(short) * num_files);
1369 }
1370 if (cache.acache) {
1371 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1372 }
1373 if (cache.haveaccess) {
1374 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1375 }
1376 if (parents) {
1377 kfree(parents, sizeof(cnid_t) * num_parents);
1378 }
1379 return ENOMEM;
1380 }
1381
1382 // make sure the bitmap is zeroed out...
1383 if (bitmap) {
1384 bzero(bitmap, (sizeof(char) * map_size));
1385 }
1386
1387 if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1388 num_files * sizeof(int)))) {
1389 goto err_exit_bulk_access;
1390 }
1391
1392 if (num_parents) {
1393 if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1394 num_parents * sizeof(cnid_t)))) {
1395 goto err_exit_bulk_access;
1396 }
1397 }
1398
1399 flags = user_access_structp->flags;
1400 if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1401 flags = R_OK;
1402 }
1403
1404 /* check if we've been passed leaf node ids or parent ids */
1405 if (flags & PARENT_IDS_FLAG) {
1406 check_leaf = false;
1407 }
1408
1409 /* Check access to each file_id passed in */
1410 for (i = 0; i < num_files; i++) {
1411 leaf_index=-1;
1412 cnid = (cnid_t) file_ids[i];
1413
1414 /* root always has access */
1415 if ((!parents) && (!suser(cred, NULL))) {
1416 access[i] = 0;
1417 continue;
1418 }
1419
1420 if (check_leaf) {
1421 /* do the lookup (checks the cnode hash, then the catalog) */
1422 error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
1423 if (error) {
1424 access[i] = (short) error;
1425 continue;
1426 }
1427
1428 if (parents) {
1429 // Check if the leaf matches one of the parent scopes
1430 leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1431 if (leaf_index >= 0 && parents[leaf_index] == cnid)
1432 prev_parent_check_ok = 0;
1433 else if (leaf_index >= 0)
1434 prev_parent_check_ok = 1;
1435 }
1436
1437 // if the thing has acl's, do the full permission check
1438 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1439 struct vnode *cvp;
1440 int myErr = 0;
1441 /* get the vnode for this cnid */
1442 myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
1443 if ( myErr ) {
1444 access[i] = myErr;
1445 continue;
1446 }
1447
1448 hfs_unlock(VTOC(cvp));
1449
1450 if (vnode_vtype(cvp) == VDIR) {
1451 myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1452 } else {
1453 myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1454 }
1455
1456 vnode_put(cvp);
1457 if (myErr) {
1458 access[i] = myErr;
1459 continue;
1460 }
1461 } else {
1462 /* before calling CheckAccess(), check the target file for read access */
1463 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1464 cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1465
1466 /* fail fast if no access */
1467 if ((myPerms & flags) == 0) {
1468 access[i] = EACCES;
1469 continue;
1470 }
1471 }
1472 } else {
1473 /* we were passed an array of parent ids */
1474 catkey.hfsPlus.parentID = cnid;
1475 }
1476
1477 /* if the last guy had the same parent and had access, we're done */
1478 if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
1479 cache.cachehits++;
1480 access[i] = 0;
1481 continue;
1482 }
1483
1484 myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1485 skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
1486
1487 if (myaccess || (error == ESRCH && leaf_index != -1)) {
1488 access[i] = 0; // have access.. no errors to report
1489 } else {
1490 access[i] = (error != 0 ? (short) error : EACCES);
1491 }
1492
1493 prevParent_cnid = catkey.hfsPlus.parentID;
1494 }
1495
1496 /* copyout the access array */
1497 if ((error = copyout((caddr_t)access, user_access_structp->access,
1498 num_files * sizeof (short)))) {
1499 goto err_exit_bulk_access;
1500 }
1501 if (map_size && bitmap) {
1502 if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1503 map_size * sizeof (char)))) {
1504 goto err_exit_bulk_access;
1505 }
1506 }
1507
1508
1509 err_exit_bulk_access:
1510
1511 if (file_ids)
1512 kfree(file_ids, sizeof(int) * num_files);
1513 if (parents)
1514 kfree(parents, sizeof(cnid_t) * num_parents);
1515 if (bitmap)
1516 kfree(bitmap, sizeof(char) * map_size);
1517 if (access)
1518 kfree(access, sizeof(short) * num_files);
1519 if (cache.acache)
1520 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1521 if (cache.haveaccess)
1522 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1523
1524 return (error);
1525 }
1526
1527
1528 /* end "bulk-access" support */
1529
1530
1531 /*
1532 * Control filesystem operating characteristics.
1533 */
1534 int
1535 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1536 vnode_t a_vp;
1537 long a_command;
1538 caddr_t a_data;
1539 int a_fflag;
1540 vfs_context_t a_context;
1541 } */ *ap)
1542 {
1543 struct vnode * vp = ap->a_vp;
1544 struct hfsmount *hfsmp = VTOHFS(vp);
1545 vfs_context_t context = ap->a_context;
1546 kauth_cred_t cred = vfs_context_ucred(context);
1547 proc_t p = vfs_context_proc(context);
1548 struct vfsstatfs *vfsp;
1549 boolean_t is64bit;
1550 off_t jnl_start, jnl_size;
1551 struct hfs_journal_info *jip;
1552 #if HFS_COMPRESSION
1553 int compressed = 0;
1554 off_t uncompressed_size = -1;
1555 int decmpfs_error = 0;
1556
1557 if (ap->a_command == F_RDADVISE) {
1558 /* we need to inspect the decmpfs state of the file as early as possible */
1559 compressed = hfs_file_is_compressed(VTOC(vp), 0);
1560 if (compressed) {
1561 if (VNODE_IS_RSRC(vp)) {
1562 /* if this is the resource fork, treat it as if it were empty */
1563 uncompressed_size = 0;
1564 } else {
1565 decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
1566 if (decmpfs_error != 0) {
1567 /* failed to get the uncompressed size, we'll check for this later */
1568 uncompressed_size = -1;
1569 }
1570 }
1571 }
1572 }
1573 #endif /* HFS_COMPRESSION */
1574
1575 is64bit = proc_is64bit(p);
1576
1577 #if CONFIG_PROTECT
1578 {
1579 int error = 0;
1580 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
1581 return error;
1582 }
1583 }
1584 #endif /* CONFIG_PROTECT */
1585
1586 switch (ap->a_command) {
1587
1588 case HFS_GETPATH:
1589 {
1590 struct vnode *file_vp;
1591 cnid_t cnid;
1592 int outlen;
1593 char *bufptr;
1594 int error;
1595 int flags = 0;
1596
1597 /* Caller must be owner of file system. */
1598 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1599 if (suser(cred, NULL) &&
1600 kauth_cred_getuid(cred) != vfsp->f_owner) {
1601 return (EACCES);
1602 }
1603 /* Target vnode must be file system's root. */
1604 if (!vnode_isvroot(vp)) {
1605 return (EINVAL);
1606 }
1607 bufptr = (char *)ap->a_data;
1608 cnid = strtoul(bufptr, NULL, 10);
1609 if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
1610 flags |= BUILDPATH_VOLUME_RELATIVE;
1611 }
1612
1613 /* We need to call hfs_vfs_vget to leverage the code that will
1614 * fix the origin list for us if needed, as opposed to calling
1615 * hfs_vget, since we will need the parent for the build_path call.
1616 */
1617
1618 if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1619 return (error);
1620 }
1621 error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context);
1622 vnode_put(file_vp);
1623
1624 return (error);
1625 }
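/*
 * Userspace sketch for the case above (illustrative; assumes the HFS_GETPATH
 * selector exported by hfs_fsctl.h): the caller passes the target cnid as a
 * decimal string in a MAXPATHLEN buffer and receives the path in the same
 * buffer:
 *
 *     char buf[MAXPATHLEN];
 *     snprintf(buf, sizeof(buf), "%u", cnid);
 *     fsctl("/Volumes/MyVolume", HFS_GETPATH, buf, 0);
 */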
1626
1627 case HFS_TRANSFER_DOCUMENT_ID:
1628 {
1629 struct cnode *cp = NULL;
1630 int error;
1631 u_int32_t to_fd = *(u_int32_t *)ap->a_data;
1632 struct fileproc *to_fp;
1633 struct vnode *to_vp;
1634 struct cnode *to_cp;
1635
1636 cp = VTOC(vp);
1637
1638 if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
1639 //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
1640 return error;
1641 }
1642 if ( (error = vnode_getwithref(to_vp)) ) {
1643 file_drop(to_fd);
1644 return error;
1645 }
1646
1647 if (VTOHFS(to_vp) != hfsmp) {
1648 error = EXDEV;
1649 goto transfer_cleanup;
1650 }
1651
1652 int need_unlock = 1;
1653 to_cp = VTOC(to_vp);
1654 error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1655 if (error != 0) {
1656 //printf("could not lock the pair of cnodes (error %d)\n", error);
1657 goto transfer_cleanup;
1658 }
1659
1660 if (!(cp->c_bsdflags & UF_TRACKED)) {
1661 error = EINVAL;
1662 } else if (to_cp->c_bsdflags & UF_TRACKED) {
1663 //
1664 // if the destination is already tracked, return an error
1665 // as otherwise it's a silent deletion of the target's
1666 // document-id
1667 //
1668 error = EEXIST;
1669 } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
1670 //
1671 // we can use the FndrExtendedFileInfo because the doc-id is the first
1672 // thing in both it and the ExtendedDirInfo struct which is fixed in
1673 // format and can not change layout
1674 //
1675 struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
1676 struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16);
1677
1678 if (f_extinfo->document_id == 0) {
1679 uint32_t new_id;
1680
1681 hfs_unlockpair(cp, to_cp); // have to unlock to be able to get a new-id
1682
1683 if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
1684 //
1685 // re-lock the pair now that we have the document-id
1686 //
1687 hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1688 f_extinfo->document_id = new_id;
1689 } else {
1690 goto transfer_cleanup;
1691 }
1692 }
1693
1694 to_extinfo->document_id = f_extinfo->document_id;
1695 f_extinfo->document_id = 0;
1696 //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
1697
1698 // make sure the destination is also UF_TRACKED
1699 to_cp->c_bsdflags |= UF_TRACKED;
1700 cp->c_bsdflags &= ~UF_TRACKED;
1701
1702 // mark the cnodes dirty
1703 cp->c_flag |= C_MODIFIED;
1704 to_cp->c_flag |= C_MODIFIED;
1705
1706 int lockflags;
1707 if ((error = hfs_start_transaction(hfsmp)) == 0) {
1708
1709 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
1710
1711 (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
1712 (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);
1713
1714 hfs_systemfile_unlock (hfsmp, lockflags);
1715 (void) hfs_end_transaction(hfsmp);
1716 }
1717
1718 #if CONFIG_FSE
1719 add_fsevent(FSE_DOCID_CHANGED, context,
1720 FSE_ARG_DEV, hfsmp->hfs_raw_dev,
1721 FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode #
1722 FSE_ARG_INO, (ino64_t)to_cp->c_fileid, // dst inode #
1723 FSE_ARG_INT32, to_extinfo->document_id,
1724 FSE_ARG_DONE);
1725
1726 hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents
1727 need_unlock = 0;
1728
1729 if (need_fsevent(FSE_STAT_CHANGED, vp)) {
1730 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
1731 }
1732 if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
1733 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
1734 }
1735 #else
1736 hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents
1737 need_unlock = 0;
1738 #endif
1739 }
1740
1741 if (need_unlock) {
1742 hfs_unlockpair(cp, to_cp);
1743 }
1744
1745 transfer_cleanup:
1746 vnode_put(to_vp);
1747 file_drop(to_fd);
1748
1749 return error;
1750 }
1751
1752
1753
1754 case HFS_PREV_LINK:
1755 case HFS_NEXT_LINK:
1756 {
1757 cnid_t linkfileid;
1758 cnid_t nextlinkid;
1759 cnid_t prevlinkid;
1760 int error;
1761
1762 /* Caller must be owner of file system. */
1763 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1764 if (suser(cred, NULL) &&
1765 kauth_cred_getuid(cred) != vfsp->f_owner) {
1766 return (EACCES);
1767 }
1768 /* Target vnode must be file system's root. */
1769 if (!vnode_isvroot(vp)) {
1770 return (EINVAL);
1771 }
1772 linkfileid = *(cnid_t *)ap->a_data;
1773 if (linkfileid < kHFSFirstUserCatalogNodeID) {
1774 return (EINVAL);
1775 }
1776 if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1777 return (error);
1778 }
1779 if (ap->a_command == HFS_NEXT_LINK) {
1780 *(cnid_t *)ap->a_data = nextlinkid;
1781 } else {
1782 *(cnid_t *)ap->a_data = prevlinkid;
1783 }
1784 return (0);
1785 }
1786
1787 case HFS_RESIZE_PROGRESS: {
1788
1789 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1790 if (suser(cred, NULL) &&
1791 kauth_cred_getuid(cred) != vfsp->f_owner) {
1792 return (EACCES); /* must be owner of file system */
1793 }
1794 if (!vnode_isvroot(vp)) {
1795 return (EINVAL);
1796 }
1797 /* file system must not be mounted read-only */
1798 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1799 return (EROFS);
1800 }
1801
1802 return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1803 }
1804
1805 case HFS_RESIZE_VOLUME: {
1806 u_int64_t newsize;
1807 u_int64_t cursize;
1808 int ret;
1809
1810 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1811 if (suser(cred, NULL) &&
1812 kauth_cred_getuid(cred) != vfsp->f_owner) {
1813 return (EACCES); /* must be owner of file system */
1814 }
1815 if (!vnode_isvroot(vp)) {
1816 return (EINVAL);
1817 }
1818
1819 /* filesystem must not be mounted read only */
1820 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1821 return (EROFS);
1822 }
1823 newsize = *(u_int64_t *)ap->a_data;
1824 cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1825
1826 if (newsize == cursize) {
1827 return (0);
1828 }
1829 IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeWillResize);
1830 if (newsize > cursize) {
1831 ret = hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1832 } else {
1833 ret = hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1834 }
1835 IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeDidResize);
1836 return (ret);
1837 }
1838 case HFS_CHANGE_NEXT_ALLOCATION: {
1839 int error = 0; /* Assume success */
1840 u_int32_t location;
1841
1842 if (vnode_vfsisrdonly(vp)) {
1843 return (EROFS);
1844 }
1845 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1846 if (suser(cred, NULL) &&
1847 kauth_cred_getuid(cred) != vfsp->f_owner) {
1848 return (EACCES); /* must be owner of file system */
1849 }
1850 if (!vnode_isvroot(vp)) {
1851 return (EINVAL);
1852 }
1853 hfs_lock_mount(hfsmp);
1854 location = *(u_int32_t *)ap->a_data;
1855 if ((location >= hfsmp->allocLimit) &&
1856 (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1857 error = EINVAL;
1858 goto fail_change_next_allocation;
1859 }
1860 /* Return previous value. */
1861 *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1862 if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1863 /* On magic value for location, set nextAllocation to next block
1864 * after metadata zone and set flag in mount structure to indicate
1865 * that nextAllocation should not be updated again.
1866 */
1867 if (hfsmp->hfs_metazone_end != 0) {
1868 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1869 }
1870 hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1871 } else {
1872 hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1873 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1874 }
1875 MarkVCBDirty(hfsmp);
1876 fail_change_next_allocation:
1877 hfs_unlock_mount(hfsmp);
1878 return (error);
1879 }
1880
1881 #if HFS_SPARSE_DEV
1882 case HFS_SETBACKINGSTOREINFO: {
1883 struct vnode * bsfs_rootvp;
1884 struct vnode * di_vp;
1885 struct hfs_backingstoreinfo *bsdata;
1886 int error = 0;
1887
1888 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1889 return (EROFS);
1890 }
1891 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1892 return (EALREADY);
1893 }
1894 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1895 if (suser(cred, NULL) &&
1896 kauth_cred_getuid(cred) != vfsp->f_owner) {
1897 return (EACCES); /* must be owner of file system */
1898 }
1899 bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1900 if (bsdata == NULL) {
1901 return (EINVAL);
1902 }
1903 if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1904 return (error);
1905 }
1906 if ((error = vnode_getwithref(di_vp))) {
1907 file_drop(bsdata->backingfd);
1908 return(error);
1909 }
1910
1911 if (vnode_mount(vp) == vnode_mount(di_vp)) {
1912 (void)vnode_put(di_vp);
1913 file_drop(bsdata->backingfd);
1914 return (EINVAL);
1915 }
1916
1917 /*
1918 * Obtain the backing fs root vnode and keep a reference
1919 * on it. This reference will be dropped in hfs_unmount.
1920 */
1921 error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
1922 if (error) {
1923 (void)vnode_put(di_vp);
1924 file_drop(bsdata->backingfd);
1925 return (error);
1926 }
1927 vnode_ref(bsfs_rootvp);
1928 vnode_put(bsfs_rootvp);
1929
1930 hfs_lock_mount(hfsmp);
1931 hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
1932 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1933 hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4;
1934 hfs_unlock_mount(hfsmp);
1935
1936 /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
1937
1938 /*
1939 * If the sparse image is on a sparse image file (as opposed to a sparse
1940 * bundle), then we may need to limit the free space to the maximum size
1941 * of a file on that volume. So we query (using pathconf), and if we get
1942 * a meaningful result, we cache the number of blocks for later use in
1943 * hfs_freeblks().
1944 */
1945 hfsmp->hfs_backingfs_maxblocks = 0;
1946 if (vnode_vtype(di_vp) == VREG) {
1947 int terr;
1948 int hostbits;
1949 terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
1950 if (terr == 0 && hostbits != 0 && hostbits < 64) {
1951 u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
1952
1953 hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
1954 }
1955 }
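		/*
		 * Worked example (illustrative): if the backing file system reports
		 * _PC_FILESIZEBITS == 31, then hostfilesizemax = 1 << 31 = 2 GiB; with a
		 * 4 KiB allocation block size this caches hfs_backingfs_maxblocks =
		 * 2 GiB / 4 KiB = 524288 blocks for later use in hfs_freeblks().
		 */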
1956
1957 /* The free extent cache is managed differently for sparse devices.
1958 * There is a window between when the volume is mounted and when the
1959 * device is marked as sparse, so the free extent cache for this
1960 * volume is currently initialized as a normal volume (sorted by block
1961 * count). Reset the cache so that it will be rebuilt again
1962 * for a sparse device (sorted by start block).
1963 */
1964 ResetVCBFreeExtCache(hfsmp);
1965
1966 (void)vnode_put(di_vp);
1967 file_drop(bsdata->backingfd);
1968 return (0);
1969 }
1970 case HFS_CLRBACKINGSTOREINFO: {
1971 struct vnode * tmpvp;
1972
1973 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1974 if (suser(cred, NULL) &&
1975 kauth_cred_getuid(cred) != vfsp->f_owner) {
1976 return (EACCES); /* must be owner of file system */
1977 }
1978 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1979 return (EROFS);
1980 }
1981
1982 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
1983 hfsmp->hfs_backingfs_rootvp) {
1984
1985 hfs_lock_mount(hfsmp);
1986 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
1987 tmpvp = hfsmp->hfs_backingfs_rootvp;
1988 hfsmp->hfs_backingfs_rootvp = NULLVP;
1989 hfsmp->hfs_sparsebandblks = 0;
1990 hfs_unlock_mount(hfsmp);
1991
1992 vnode_rele(tmpvp);
1993 }
1994 return (0);
1995 }
1996 #endif /* HFS_SPARSE_DEV */
1997
1998 /* Change the next CNID stored in the VH */
1999 case HFS_CHANGE_NEXTCNID: {
2000 int error = 0; /* Assume success */
2001 u_int32_t fileid;
2002 int wraparound = 0;
2003 int lockflags = 0;
2004
2005 if (vnode_vfsisrdonly(vp)) {
2006 return (EROFS);
2007 }
2008 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2009 if (suser(cred, NULL) &&
2010 kauth_cred_getuid(cred) != vfsp->f_owner) {
2011 return (EACCES); /* must be owner of file system */
2012 }
2013
2014 fileid = *(u_int32_t *)ap->a_data;
2015
2016 /* Must have catalog lock excl. to advance the CNID pointer */
2017 lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK);
2018
2019 hfs_lock_mount(hfsmp);
2020
2021 /* If it is less than the current next CNID, force the wraparound bit to be set */
2022 if (fileid < hfsmp->vcbNxtCNID) {
2023 wraparound=1;
2024 }
2025
2026 /* Return previous value. */
2027 *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
2028
2029 hfsmp->vcbNxtCNID = fileid;
2030
2031 if (wraparound) {
2032 hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
2033 }
2034
2035 MarkVCBDirty(hfsmp);
2036 hfs_unlock_mount(hfsmp);
2037 hfs_systemfile_unlock (hfsmp, lockflags);
2038
2039 return (error);
2040 }
2041
2042 case F_FREEZE_FS: {
2043 struct mount *mp;
2044
2045 mp = vnode_mount(vp);
2046 hfsmp = VFSTOHFS(mp);
2047
2048 if (!(hfsmp->jnl))
2049 return (ENOTSUP);
2050
2051 vfsp = vfs_statfs(mp);
2052
2053 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2054 !kauth_cred_issuser(cred))
2055 return (EACCES);
2056
2057 return hfs_freeze(hfsmp);
2058 }
2059
2060 case F_THAW_FS: {
2061 vfsp = vfs_statfs(vnode_mount(vp));
2062 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2063 !kauth_cred_issuser(cred))
2064 return (EACCES);
2065
2066 return hfs_thaw(hfsmp, current_proc());
2067 }
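	/*
	 * Illustrative sketch (not part of the original source): the freeze/thaw
	 * pair above is driven from user space with fcntl(2) by the volume owner
	 * or the superuser, typically bracketing a block-level snapshot. The
	 * descriptor and error handling are assumptions for illustration only.
	 *
	 *	#include <fcntl.h>
	 *
	 *	if (fcntl(fd, F_FREEZE_FS, 0) == 0) {	// block new writes, flush the journal
	 *		// ... take the snapshot ...
	 *		(void) fcntl(fd, F_THAW_FS, 0);	// allow writes again
	 *	}
	 */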
2068
2069 case HFS_EXT_BULKACCESS_FSCTL: {
2070 int size;
2071
2072 if (hfsmp->hfs_flags & HFS_STANDARD) {
2073 return EINVAL;
2074 }
2075
2076 if (is64bit) {
2077 size = sizeof(struct user64_ext_access_t);
2078 } else {
2079 size = sizeof(struct user32_ext_access_t);
2080 }
2081
2082 return do_bulk_access_check(hfsmp, vp, ap, size, context);
2083 }
2084
2085 case HFS_SET_XATTREXTENTS_STATE: {
2086 int state;
2087
2088 if (ap->a_data == NULL) {
2089 return (EINVAL);
2090 }
2091
2092 state = *(int *)ap->a_data;
2093
2094 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2095 return (EROFS);
2096 }
2097
2098 /* Super-user can enable or disable extent-based extended
2099 * attribute support on a volume.
2100 * Note: Starting with Mac OS X 10.7, extent-based extended attributes
2101 * are enabled by default, so any change will be transient only
2102 * until the volume is remounted.
2103 */
2104 if (!kauth_cred_issuser(kauth_cred_get())) {
2105 return (EPERM);
2106 }
2107 if (state == 0 || state == 1)
2108 return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
2109 else
2110 return (EINVAL);
2111 }
2112
2113 case F_SETSTATICCONTENT: {
2114 int error;
2115 int enable_static = 0;
2116 struct cnode *cp = NULL;
2117 /*
2118 * lock the cnode, decorate the cnode flag, and bail out.
2119 * VFS should have already authenticated the caller for us.
2120 */
2121
2122 if (ap->a_data) {
2123 /*
2124 * Note that even though ap->a_data is of type caddr_t,
2125 * the fcntl layer at the syscall handler will pass in NULL
2126 * or 1 depending on what the argument supplied to the fcntl
2127 * was. So it is in fact correct to check the ap->a_data
2128 * argument for zero or non-zero value when deciding whether or not
2129 * to enable the static bit in the cnode.
2130 */
2131 enable_static = 1;
2132 }
2133 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2134 return EROFS;
2135 }
2136 cp = VTOC(vp);
2137
2138 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2139 if (error == 0) {
2140 if (enable_static) {
2141 cp->c_flag |= C_SSD_STATIC;
2142 }
2143 else {
2144 cp->c_flag &= ~C_SSD_STATIC;
2145 }
2146 hfs_unlock (cp);
2147 }
2148 return error;
2149 }
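	/*
	 * Illustrative sketch (not part of the original source): user space can
	 * toggle the static-content hint handled above with fcntl(2) on an open
	 * file; a nonzero third argument sets C_SSD_STATIC and zero clears it
	 * (F_SETSTATICCONTENT is a private command). The descriptor and error
	 * handling are assumptions for illustration only.
	 *
	 *	#include <fcntl.h>
	 *
	 *	if (fcntl(fd, F_SETSTATICCONTENT, 1) == -1)	// mark the file's data as static
	 *		perror("F_SETSTATICCONTENT");
	 */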
2150
2151 case F_SET_GREEDY_MODE: {
2152 int error;
2153 int enable_greedy_mode = 0;
2154 struct cnode *cp = NULL;
2155 /*
2156 * lock the cnode, decorate the cnode flag, and bail out.
2157 * VFS should have already authenticated the caller for us.
2158 */
2159
2160 if (ap->a_data) {
2161 /*
2162 * Note that even though ap->a_data is of type caddr_t,
2163 * the fcntl layer at the syscall handler will pass in NULL
2164 * or 1 depending on what the argument supplied to the fcntl
2165 * was. So it is in fact correct to check the ap->a_data
2166 * argument for zero or non-zero value when deciding whether or not
2167 * to enable the greedy mode bit in the cnode.
2168 */
2169 enable_greedy_mode = 1;
2170 }
2171 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2172 return EROFS;
2173 }
2174 cp = VTOC(vp);
2175
2176 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2177 if (error == 0) {
2178 if (enable_greedy_mode) {
2179 cp->c_flag |= C_SSD_GREEDY_MODE;
2180 }
2181 else {
2182 cp->c_flag &= ~C_SSD_GREEDY_MODE;
2183 }
2184 hfs_unlock (cp);
2185 }
2186 return error;
2187 }
2188
2189 case F_SETIOTYPE: {
2190 int error;
2191 uint32_t iotypeflag = 0;
2192
2193 struct cnode *cp = NULL;
2194 /*
2195 * lock the cnode, decorate the cnode flag, and bail out.
2196 * VFS should have already authenticated the caller for us.
2197 */
2198
2199 if (ap->a_data == NULL) {
2200 return EINVAL;
2201 }
2202
2203 /*
2204 * Note that even though ap->a_data is of type caddr_t, we
2205 * can only use 32 bits of flag values.
2206 */
2207 iotypeflag = (uint32_t) ap->a_data;
2208 switch (iotypeflag) {
2209 case F_IOTYPE_ISOCHRONOUS:
2210 break;
2211 default:
2212 return EINVAL;
2213 }
2214
2215
2216 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2217 return EROFS;
2218 }
2219 cp = VTOC(vp);
2220
2221 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2222 if (error == 0) {
2223 switch (iotypeflag) {
2224 case F_IOTYPE_ISOCHRONOUS:
2225 cp->c_flag |= C_IO_ISOCHRONOUS;
2226 break;
2227 default:
2228 break;
2229 }
2230 hfs_unlock (cp);
2231 }
2232 return error;
2233 }
2234
2235 case F_MAKECOMPRESSED: {
2236 int error = 0;
2237 uint32_t gen_counter;
2238 struct cnode *cp = NULL;
2239 int reset_decmp = 0;
2240
2241 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2242 return EROFS;
2243 }
2244
2245 /*
2246 * acquire & lock the cnode.
2247 * VFS should have already authenticated the caller for us.
2248 */
2249
2250 if (ap->a_data) {
2251 /*
2252 * Cast the pointer into a uint32_t so we can extract the
2253 * supplied generation counter.
2254 */
2255 gen_counter = *((uint32_t*)ap->a_data);
2256 }
2257 else {
2258 return EINVAL;
2259 }
2260
2261 #if HFS_COMPRESSION
2262 cp = VTOC(vp);
2263 /* Grab truncate lock first; we may truncate the file */
2264 hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2265
2266 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2267 if (error) {
2268 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2269 return error;
2270 }
2271
2272 /* Are there any other usecounts/FDs? */
2273 if (vnode_isinuse(vp, 1)) {
2274 hfs_unlock(cp);
2275 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2276 return EBUSY;
2277 }
2278
2279 /* now we have the cnode locked down; Validate arguments */
2280 if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
2281 /* EINVAL if you are trying to manipulate an IMMUTABLE file */
2282 hfs_unlock(cp);
2283 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2284 return EINVAL;
2285 }
2286
2287 if ((hfs_get_gencount (cp)) == gen_counter) {
2288 /*
2289 * OK, the gen_counter matched. Go for it:
2290 * Toggle state bits, truncate file, and suppress mtime update
2291 */
2292 reset_decmp = 1;
2293 cp->c_bsdflags |= UF_COMPRESSED;
2294
2295 error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES,
2296 ap->a_context);
2297 }
2298 else {
2299 error = ESTALE;
2300 }
2301
2302 /* Unlock cnode before calling into decmpfs; it may need to get an EA */
2303 hfs_unlock(cp);
2304
2305 /*
2306 * Reset the decmp state while still holding the truncate lock. We need to
2307 * serialize here against a listxattr on this node which may occur at any
2308 * time.
2309 *
2310 * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed,
2311 * that will still potentially require getting the com.apple.decmpfs EA. If the
2312 * EA is required, then we can't hold the cnode lock, because the getxattr call is
2313 * generic(through VFS), and can't pass along any info telling it that we're already
2314 * holding it (the lock). If we don't serialize, then we risk listxattr stopping
2315 * and trying to fill in the hfs_file_is_compressed info during the callback
2316 * operation, which will result in deadlock against the b-tree node.
2317 *
2318 * So, to serialize against listxattr (which will grab buf_t meta references on
2319 * the b-tree blocks), we hold the truncate lock as we're manipulating the
2320 * decmpfs payload.
2321 */
2322 if ((reset_decmp) && (error == 0)) {
2323 decmpfs_cnode *dp = VTOCMP (vp);
2324 if (dp != NULL) {
2325 decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
2326 }
2327
2328 /* Initialize the decmpfs node as needed */
2329 (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */
2330 }
2331
2332 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2333
2334 #endif
2335 return error;
2336 }
2337
2338 case F_SETBACKINGSTORE: {
2339
2340 int error = 0;
2341
2342 /*
2343 * See comment in F_SETSTATICCONTENT re: using
2344 * a null check for a_data
2345 */
2346 if (ap->a_data) {
2347 error = hfs_set_backingstore (vp, 1);
2348 }
2349 else {
2350 error = hfs_set_backingstore (vp, 0);
2351 }
2352
2353 return error;
2354 }
2355
2356 case F_GETPATH_MTMINFO: {
2357 int error = 0;
2358
2359 int *data = (int*) ap->a_data;
2360
2361 /* Ask if this is a backingstore vnode */
2362 error = hfs_is_backingstore (vp, data);
2363
2364 return error;
2365 }
2366
2367 case F_FULLFSYNC: {
2368 int error;
2369
2370 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2371 return (EROFS);
2372 }
2373 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2374 if (error == 0) {
2375 error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_FULL, p);
2376 hfs_unlock(VTOC(vp));
2377 }
2378
2379 return error;
2380 }
2381
2382 case F_BARRIERFSYNC: {
2383 int error;
2384
2385 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2386 return (EROFS);
2387 }
2388 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2389 if (error == 0) {
2390 error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_BARRIER, p);
2391 hfs_unlock(VTOC(vp));
2392 }
2393
2394 return error;
2395 }
2396
2397 case F_CHKCLEAN: {
2398 register struct cnode *cp;
2399 int error;
2400
2401 if (!vnode_isreg(vp))
2402 return EINVAL;
2403
2404 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2405 if (error == 0) {
2406 cp = VTOC(vp);
2407 /*
2408 * used by regression tests to determine if
2409 * all the dirty pages (via write) have been cleaned
2410 * after a call to 'fsync'.
2411 */
2412 error = is_file_clean(vp, VTOF(vp)->ff_size);
2413 hfs_unlock(cp);
2414 }
2415 return (error);
2416 }
2417
2418 case F_RDADVISE: {
2419 register struct radvisory *ra;
2420 struct filefork *fp;
2421 int error;
2422
2423 if (!vnode_isreg(vp))
2424 return EINVAL;
2425
2426 ra = (struct radvisory *)(ap->a_data);
2427 fp = VTOF(vp);
2428
2429 /* Protect against a size change. */
2430 hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2431
2432 #if HFS_COMPRESSION
2433 if (compressed && (uncompressed_size == -1)) {
2434 /* fetching the uncompressed size failed above, so return the error */
2435 error = decmpfs_error;
2436 } else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
2437 (!compressed && (ra->ra_offset >= fp->ff_size))) {
2438 error = EFBIG;
2439 }
2440 #else /* HFS_COMPRESSION */
2441 if (ra->ra_offset >= fp->ff_size) {
2442 error = EFBIG;
2443 }
2444 #endif /* HFS_COMPRESSION */
2445 else {
2446 error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
2447 }
2448
2449 hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
2450 return (error);
2451 }
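	/*
	 * Illustrative sketch (not part of the original source): F_RDADVISE is a
	 * public fcntl(2) command; a reader expecting to need a region of the file
	 * soon can ask for it to be read ahead, which lands in the case above.
	 * The chosen offset and count are assumptions for illustration only.
	 *
	 *	#include <fcntl.h>
	 *
	 *	struct radvisory ra = { .ra_offset = 0, .ra_count = 1024 * 1024 };
	 *	if (fcntl(fd, F_RDADVISE, &ra) == -1)	// hint: read the first 1 MiB ahead
	 *		perror("F_RDADVISE");
	 */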
2452
2453 case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */
2454 {
2455 if (is64bit) {
2456 *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2457 }
2458 else {
2459 *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2460 }
2461 return 0;
2462 }
2463
2464 case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
2465 *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
2466 break;
2467
2468 case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
2469 *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
2470 break;
2471
2472 case HFS_FSCTL_GET_VERY_LOW_DISK:
2473 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
2474 break;
2475
2476 case HFS_FSCTL_SET_VERY_LOW_DISK:
2477 if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
2478 return EINVAL;
2479 }
2480
2481 hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
2482 break;
2483
2484 case HFS_FSCTL_GET_LOW_DISK:
2485 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
2486 break;
2487
2488 case HFS_FSCTL_SET_LOW_DISK:
2489 if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
2490 || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
2491
2492 return EINVAL;
2493 }
2494
2495 hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
2496 break;
2497
2498 case HFS_FSCTL_GET_DESIRED_DISK:
2499 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
2500 break;
2501
2502 case HFS_FSCTL_SET_DESIRED_DISK:
2503 if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
2504 return EINVAL;
2505 }
2506
2507 hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
2508 break;
2509
2510 case HFS_VOLUME_STATUS:
2511 *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
2512 break;
2513
2514 case HFS_SET_BOOT_INFO:
2515 if (!vnode_isvroot(vp))
2516 return(EINVAL);
2517 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
2518 return(EACCES); /* must be superuser or owner of filesystem */
2519 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2520 return (EROFS);
2521 }
2522 hfs_lock_mount (hfsmp);
2523 bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
2524 hfs_unlock_mount (hfsmp);
2525 (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT);
2526 break;
2527
2528 case HFS_GET_BOOT_INFO:
2529 if (!vnode_isvroot(vp))
2530 return(EINVAL);
2531 hfs_lock_mount (hfsmp);
2532 bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
2533 hfs_unlock_mount(hfsmp);
2534 break;
2535
2536 case HFS_MARK_BOOT_CORRUPT:
2537 /* Mark the boot volume corrupt by setting
2538 * kHFSVolumeInconsistentBit in the volume header. This will
2539 * force fsck_hfs on next mount.
2540 */
2541 if (!kauth_cred_issuser(kauth_cred_get())) {
2542 return EACCES;
2543 }
2544
2545 /* Allowed only on the root vnode of the boot volume */
2546 if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
2547 !vnode_isvroot(vp)) {
2548 return EINVAL;
2549 }
2550 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2551 return (EROFS);
2552 }
2553 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
2554 hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED);
2555 break;
2556
2557 case HFS_FSCTL_GET_JOURNAL_INFO:
2558 jip = (struct hfs_journal_info*)ap->a_data;
2559
2560 if (vp == NULLVP)
2561 return EINVAL;
2562
2563 if (hfsmp->jnl == NULL) {
2564 jnl_start = 0;
2565 jnl_size = 0;
2566 } else {
2567 jnl_start = hfs_blk_to_bytes(hfsmp->jnl_start, hfsmp->blockSize) + hfsmp->hfsPlusIOPosOffset;
2568 jnl_size = hfsmp->jnl_size;
2569 }
2570
2571 jip->jstart = jnl_start;
2572 jip->jsize = jnl_size;
2573 break;
2574
2575 case HFS_SET_ALWAYS_ZEROFILL: {
2576 struct cnode *cp = VTOC(vp);
2577
2578 if (*(int *)ap->a_data) {
2579 cp->c_flag |= C_ALWAYS_ZEROFILL;
2580 } else {
2581 cp->c_flag &= ~C_ALWAYS_ZEROFILL;
2582 }
2583 break;
2584 }
2585
2586 case HFS_DISABLE_METAZONE: {
2587 /* Only root can disable metadata zone */
2588 if (!kauth_cred_issuser(kauth_cred_get())) {
2589 return EACCES;
2590 }
2591 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2592 return (EROFS);
2593 }
2594
2595 /* Disable metadata zone now */
2596 (void) hfs_metadatazone_init(hfsmp, true);
2597 printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
2598 break;
2599 }
2600
2601
2602 case HFS_FSINFO_METADATA_BLOCKS: {
2603 int error;
2604 struct hfsinfo_metadata *hinfo;
2605
2606 hinfo = (struct hfsinfo_metadata *)ap->a_data;
2607
2608 /* Get information about number of metadata blocks */
2609 error = hfs_getinfo_metadata_blocks(hfsmp, hinfo);
2610 if (error) {
2611 return error;
2612 }
2613
2614 break;
2615 }
2616
2617 case HFS_GET_FSINFO: {
2618 hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;
2619
2620 /* Only root is allowed to get fsinfo */
2621 if (!kauth_cred_issuser(kauth_cred_get())) {
2622 return EACCES;
2623 }
2624
2625 /*
2626 * Make sure that the caller's version number matches
2627 * the kernel's version number. This will make sure that
2628 * if the structures being read/written into are changed
2629 * by the kernel, the caller will not read incorrect data.
2630 *
2631 * The first three fields --- request_type, version and
2632 * flags are the same for all the hfs_fsinfo structures, so
2633 * we can access the version number by assuming any
2634 * structure for now.
2635 */
2636 if (fsinfo->header.version != HFS_FSINFO_VERSION) {
2637 return ENOTSUP;
2638 }
2639
2640 /* Make sure that the current file system is not marked inconsistent */
2641 if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
2642 return EIO;
2643 }
2644
2645 return hfs_get_fsinfo(hfsmp, ap->a_data);
2646 }
2647
2648 case HFS_CS_FREESPACE_TRIM: {
2649 int error = 0;
2650 int lockflags = 0;
2651
2652 /* Only root allowed */
2653 if (!kauth_cred_issuser(kauth_cred_get())) {
2654 return EACCES;
2655 }
2656
2657 /*
2658 * This core functionality is similar to hfs_scan_blocks().
2659 * The main difference is that hfs_scan_blocks() is called
2660 * as part of mount where we are assured that the journal is
2661 * empty to start with. This fcntl() can be called on a
2662 * mounted volume, therefore it has to flush the content of
2663 * the journal as well as ensure the state of the summary table.
2664 *
2665 * This fcntl scans over the entire allocation bitmap,
2666 * creates list of all the free blocks, and issues TRIM
2667 * down to the underlying device. This can take a long time
2668 * as it can generate up to 512MB of read I/O.
2669 */
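		/*
		 * Illustrative sketch (not part of the original source): a maintenance
		 * tool could trigger this scan with fsctl(2) on the mounted volume.
		 * This command ignores its input data, so a scratch word is passed
		 * only to satisfy the interface; the path and headers are assumptions.
		 *
		 *	#include <sys/fsctl.h>
		 *	#include <hfs/hfs_fsctl.h>
		 *
		 *	uint32_t unused = 0;
		 *	if (fsctl("/Volumes/MyVol", HFS_CS_FREESPACE_TRIM, &unused, 0) != 0)
		 *		perror("HFS_CS_FREESPACE_TRIM");
		 */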
2670
2671 if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
2672 error = hfs_init_summary(hfsmp);
2673 if (error) {
2674 printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN);
2675 return error;
2676 }
2677 }
2678
2679 /*
2680 * The journal maintains a list of recently deallocated blocks to
2681 * issue DKIOCUNMAPs when the corresponding journal transaction is
2682 * flushed to the disk. To avoid any race conditions, we only
2683 * want one active trim list and only one thread issuing DKIOCUNMAPs.
2684 * Therefore we make sure that the journal trim list is sync'ed,
2685 * empty, and not modifiable for the duration of our scan.
2686 *
2687 * Take the journal lock before flushing the journal to the disk.
2688 * We will keep holding the journal lock until we get the
2689 * bitmap lock, to make sure that no new journal transactions can
2690 * start. This will make sure that the journal trim list is not
2691 * modified after the journal flush and before getting bitmap lock.
2692 * We can release the journal lock after we acquire the bitmap
2693 * lock as it will prevent any further block deallocations.
2694 */
2695 hfs_journal_lock(hfsmp);
2696
2697 /* Flush the journal and wait for all I/Os to finish up */
2698 error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META);
2699 if (error) {
2700 hfs_journal_unlock(hfsmp);
2701 return error;
2702 }
2703
2704 /* Take bitmap lock to ensure it is not being modified */
2705 lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
2706
2707 /* Release the journal lock */
2708 hfs_journal_unlock(hfsmp);
2709
2710 /*
2711 * ScanUnmapBlocks reads the bitmap in a large block size
2712 * (up to 1MB), unlike the runtime, which reads the bitmap
2713 * in 4K blocks. This can cause buf_t collisions
2714 * and potential data corruption. To avoid this, we
2715 * invalidate all the existing buffers associated with
2716 * the bitmap vnode before scanning it.
2717 *
2718 * Note: ScanUnmapBlocks() cleans up all the buffers
2719 * after itself, so there won't be any large buffers left
2720 * for us to clean up after it returns.
2721 */
2722 error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
2723 if (error) {
2724 hfs_systemfile_unlock(hfsmp, lockflags);
2725 return error;
2726 }
2727
2728 /* Traverse bitmap and issue DKIOCUNMAPs */
2729 error = ScanUnmapBlocks(hfsmp);
2730 hfs_systemfile_unlock(hfsmp, lockflags);
2731 if (error) {
2732 return error;
2733 }
2734
2735 break;
2736 }
2737
2738 case HFS_SET_HOTFILE_STATE: {
2739 int error;
2740 struct cnode *cp = VTOC(vp);
2741 uint32_t hf_state = *((uint32_t*)ap->a_data);
2742 uint32_t num_unpinned = 0;
2743
2744 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2745 if (error) {
2746 return error;
2747 }
2748
2749 // printf("hfs: setting hotfile state %d on %s\n", hf_state, vp->v_name);
2750 if (hf_state == HFS_MARK_FASTDEVCANDIDATE) {
2751 vnode_setfastdevicecandidate(vp);
2752
2753 cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask;
2754 cp->c_attr.ca_recflags &= ~kHFSDoNotFastDevPinMask;
2755 cp->c_flag |= C_MODIFIED;
2756 } else if (hf_state == HFS_UNMARK_FASTDEVCANDIDATE || hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
2757 vnode_clearfastdevicecandidate(vp);
2758 hfs_removehotfile(vp);
2759
2760 if (cp->c_attr.ca_recflags & kHFSFastDevPinnedMask) {
2761 hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &num_unpinned, ap->a_context);
2762 }
2763
2764 if (hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
2765 cp->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask;
2766 }
2767 cp->c_attr.ca_recflags &= ~(kHFSFastDevCandidateMask|kHFSFastDevPinnedMask);
2768 cp->c_flag |= C_MODIFIED;
2769
2770 } else {
2771 error = EINVAL;
2772 }
2773
2774 if (num_unpinned != 0) {
2775 lck_mtx_lock(&hfsmp->hfc_mutex);
2776 hfsmp->hfs_hotfile_freeblks += num_unpinned;
2777 lck_mtx_unlock(&hfsmp->hfc_mutex);
2778 }
2779
2780 hfs_unlock(cp);
2781 return error;
2782 break;
2783 }
2784
2785 case HFS_REPIN_HOTFILE_STATE: {
2786 int error=0;
2787 uint32_t repin_what = *((uint32_t*)ap->a_data);
2788
2789 /* Only root allowed */
2790 if (!kauth_cred_issuser(kauth_cred_get())) {
2791 return EACCES;
2792 }
2793
2794 if (!(hfsmp->hfs_flags & (HFS_CS_METADATA_PIN | HFS_CS_HOTFILE_PIN))) {
2795 // this system is neither regular Fusion nor Cooperative Fusion
2796 // so this fsctl makes no sense.
2797 return EINVAL;
2798 }
2799
2800 //
2801 // After converting a CoreStorage volume to be encrypted, the
2802 // extents could have moved around underneath us. This call
2803 // allows corestoraged to re-pin everything that should be
2804 // pinned (it would happen on the next reboot too but that could
2805 // be a long time away).
2806 //
2807 if ((repin_what & HFS_REPIN_METADATA) && (hfsmp->hfs_flags & HFS_CS_METADATA_PIN)) {
2808 hfs_pin_fs_metadata(hfsmp);
2809 }
2810 if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) {
2811 hfs_repin_hotfiles(hfsmp);
2812 }
2813 if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_SWAPFILE_PIN)) {
2814 //XXX Swapfiles (marked SWAP_PINNED) may have moved too.
2815 //XXX Do we care? They have a more transient/dynamic nature/lifetime.
2816 }
2817
2818 return error;
2819 break;
2820 }
2821
2822
2823 default:
2824 return (ENOTTY);
2825 }
2826
2827 return 0;
2828 }
2829
2830 /*
2831 * select
2832 */
2833 int
2834 hfs_vnop_select(__unused struct vnop_select_args *ap)
2835 /*
2836 struct vnop_select_args {
2837 vnode_t a_vp;
2838 int a_which;
2839 int a_fflags;
2840 void *a_wql;
2841 vfs_context_t a_context;
2842 };
2843 */
2844 {
2845 /*
2846 * We should really check to see if I/O is possible.
2847 */
2848 return (1);
2849 }
2850
2851 /*
2852 * Converts a logical block number to a physical block, and optionally returns
2853 * the number of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
2854 * The physical block number is based on the device block size, currently it's 512.
2855 * The block run is returned in logical blocks, and is the REMAINING number of blocks in the run.
2856 */
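/*
 * Worked example (illustrative): with a logical block size of 4096 bytes,
 * logical block 10 maps to byte offset 40960. If MapFileBlockC reports
 * 98304 contiguous bytes available at that position, the run returned in
 * *runp is (98304 / 4096) - 1 = 23 logical blocks beyond the one mapped.
 */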
2857 int
2858 hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
2859 {
2860 struct filefork *fp = VTOF(vp);
2861 struct hfsmount *hfsmp = VTOHFS(vp);
2862 int retval = E_NONE;
2863 u_int32_t logBlockSize;
2864 size_t bytesContAvail = 0;
2865 off_t blockposition;
2866 int lockExtBtree;
2867 int lockflags = 0;
2868
2869 /*
2870 * Check for underlying vnode requests and ensure that logical
2871 * to physical mapping is requested.
2872 */
2873 if (vpp != NULL)
2874 *vpp = hfsmp->hfs_devvp;
2875 if (bnp == NULL)
2876 return (0);
2877
2878 logBlockSize = GetLogicalBlockSize(vp);
2879 blockposition = (off_t)bn * logBlockSize;
2880
2881 lockExtBtree = overflow_extents(fp);
2882
2883 if (lockExtBtree)
2884 lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2885
2886 retval = MacToVFSError(
2887 MapFileBlockC (HFSTOVCB(hfsmp),
2888 (FCB*)fp,
2889 MAXPHYSIO,
2890 blockposition,
2891 bnp,
2892 &bytesContAvail));
2893
2894 if (lockExtBtree)
2895 hfs_systemfile_unlock(hfsmp, lockflags);
2896
2897 if (retval == E_NONE) {
2898 /* Figure out how many read ahead blocks there are */
2899 if (runp != NULL) {
2900 if (can_cluster(logBlockSize)) {
2901 /* Make sure this result never goes negative: */
2902 *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
2903 } else {
2904 *runp = 0;
2905 }
2906 }
2907 }
2908 return (retval);
2909 }
2910
2911 /*
2912 * Convert logical block number to file offset.
2913 */
2914 int
2915 hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
2916 /*
2917 struct vnop_blktooff_args {
2918 vnode_t a_vp;
2919 daddr64_t a_lblkno;
2920 off_t *a_offset;
2921 };
2922 */
2923 {
2924 if (ap->a_vp == NULL)
2925 return (EINVAL);
2926 *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
2927
2928 return(0);
2929 }
2930
2931 /*
2932 * Convert file offset to logical block number.
2933 */
2934 int
2935 hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
2936 /*
2937 struct vnop_offtoblk_args {
2938 vnode_t a_vp;
2939 off_t a_offset;
2940 daddr64_t *a_lblkno;
2941 };
2942 */
2943 {
2944 if (ap->a_vp == NULL)
2945 return (EINVAL);
2946 *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
2947
2948 return(0);
2949 }
2950
2951 /*
2952 * Map file offset to physical block number.
2953 *
2954 * If this function is called for a write operation, and if the file
2955 * had virtual blocks allocated (delayed allocation), real blocks
2956 * are allocated by calling ExtendFileC().
2957 *
2958 * If this function is called for a read operation, and if the file
2959 * had virtual blocks allocated (delayed allocation), no change
2960 * to the size of the file is made, and if required, the rangelist is
2961 * searched for a mapping.
2962 *
2963 * System file cnodes are expected to be locked (shared or exclusive).
2964 *
2965 * -- INVALID RANGES --
2966 *
2967 * Invalid ranges are used to keep track of where we have extended a
2968 * file, but have not yet written that data to disk. In the past we
2969 * would clear up the invalid ranges as we wrote to those areas, but
2970 * before data was actually flushed to disk. The problem with that
2971 * approach is that the data can be left in the cache and is therefore
2972 * still not valid on disk. So now we clear up the ranges here, when
2973 * the flags field has VNODE_WRITE set, indicating a write is about to
2974 * occur. This isn't ideal (ideally we want to clear them up when
2975 * we know the data has been successfully written), but it's the best we
2976 * can do.
2977 *
2978 * For reads, we use the invalid ranges here in block map to indicate
2979 * to the caller that the data should be zeroed (a_bpn == -1). We
2980 * have to be careful about what ranges we return to the cluster code.
2981 * Currently the cluster code can only handle non-rounded values for
2982 * the EOF; it cannot handle funny sized ranges in the middle of the
2983 * file (the main problem is that it sends down odd sized I/Os to the
2984 * disk). Our code currently works because whilst the very first
2985 * offset and the last offset in the invalid ranges are not aligned,
2986 * gaps in the invalid ranges between the first and last have to be
2987 * aligned (because we always write page sized blocks). For example,
2988 * consider this arrangement:
2989 *
2990 * +-------------+-----+-------+------+
2991 * | |XXXXX| |XXXXXX|
2992 * +-------------+-----+-------+------+
2993 * a b c d
2994 *
2995 * This shows two invalid ranges <a, b> and <c, d>. Whilst a and d
2996 * are not necessarily aligned, b and c *must* be.
2997 *
2998 * Zero-filling occurs in a number of ways:
2999 *
3000 * 1. When a read occurs and we return with a_bpn == -1.
3001 *
3002 * 2. When hfs_fsync or hfs_filedone calls hfs_flush_invalid_ranges
3003 * which will cause us to iterate over the ranges bringing in
3004 * pages that are not present in the cache and zeroing them. Any
3005 * pages that are already in the cache are left untouched. Note
3006 * that hfs_fsync does not always flush invalid ranges.
3007 *
3008 * 3. When we extend a file we zero out from the old EOF to the end
3009 * of the page. It would be nice if we didn't have to do this if
3010 * the page wasn't present (and could defer it), but because of
3011 * the problem described above, we have to.
3012 *
3013 * The invalid ranges are also used to restrict the size that we write
3014 * out on disk: see hfs_prepare_fork_for_update.
3015 *
3016 * Note that invalid ranges are ignored when neither the VNODE_READ nor
3017 * the VNODE_WRITE flag is specified. This is useful for the
3018 * F_LOG2PHYS* fcntls which are not interested in invalid ranges: they
3019 * just want to know whether blocks are physically allocated or not.
3020 */
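/*
 * Illustrative sketch (not part of the original source): the F_LOG2PHYS
 * fcntl mentioned above is public; it maps the file's current offset to a
 * device offset through this blockmap path without consulting invalid
 * ranges. The descriptor and error handling are assumptions for illustration.
 *
 *	#include <stdio.h>
 *	#include <fcntl.h>
 *
 *	struct log2phys l2p = { 0 };
 *	if (fcntl(fd, F_LOG2PHYS, &l2p) == 0)	// uses the descriptor's current offset
 *		printf("device offset: %lld\n", (long long)l2p.l2p_devoffset);
 */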
3021 int
3022 hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
3023 /*
3024 struct vnop_blockmap_args {
3025 vnode_t a_vp;
3026 off_t a_foffset;
3027 size_t a_size;
3028 daddr64_t *a_bpn;
3029 size_t *a_run;
3030 void *a_poff;
3031 int a_flags;
3032 vfs_context_t a_context;
3033 };
3034 */
3035 {
3036 struct vnode *vp = ap->a_vp;
3037 struct cnode *cp;
3038 struct filefork *fp;
3039 struct hfsmount *hfsmp;
3040 size_t bytesContAvail = ap->a_size;
3041 int retval = E_NONE;
3042 int syslocks = 0;
3043 int lockflags = 0;
3044 struct rl_entry *invalid_range;
3045 enum rl_overlaptype overlaptype;
3046 int started_tr = 0;
3047 int tooklock = 0;
3048
3049 #if HFS_COMPRESSION
3050 if (VNODE_IS_RSRC(vp)) {
3051 /* allow blockmaps to the resource fork */
3052 } else {
3053 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
3054 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
3055 switch(state) {
3056 case FILE_IS_COMPRESSED:
3057 return ENOTSUP;
3058 case FILE_IS_CONVERTING:
3059 /* if FILE_IS_CONVERTING, we allow blockmap */
3060 break;
3061 default:
3062 printf("invalid state %d for compressed file\n", state);
3063 /* fall through */
3064 }
3065 }
3066 }
3067 #endif /* HFS_COMPRESSION */
3068
3069 /* Do not allow blockmap operation on a directory */
3070 if (vnode_isdir(vp)) {
3071 return (ENOTSUP);
3072 }
3073
3074 /*
3075 * Check for underlying vnode requests and ensure that logical
3076 * to physical mapping is requested.
3077 */
3078 if (ap->a_bpn == NULL)
3079 return (0);
3080
3081 hfsmp = VTOHFS(vp);
3082 cp = VTOC(vp);
3083 fp = VTOF(vp);
3084
3085 if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
3086 if (cp->c_lockowner != current_thread()) {
3087 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3088 tooklock = 1;
3089 }
3090
3091 // For reads, check the invalid ranges
3092 if (ISSET(ap->a_flags, VNODE_READ)) {
3093 if (ap->a_foffset >= fp->ff_size) {
3094 retval = ERANGE;
3095 goto exit;
3096 }
3097
3098 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
3099 ap->a_foffset + (off_t)bytesContAvail - 1,
3100 &invalid_range);
3101 switch(overlaptype) {
3102 case RL_MATCHINGOVERLAP:
3103 case RL_OVERLAPCONTAINSRANGE:
3104 case RL_OVERLAPSTARTSBEFORE:
3105 /* There's no valid block for this byte offset */
3106 *ap->a_bpn = (daddr64_t)-1;
3107 /* There's no point limiting the amount to be returned
3108 * if the invalid range that was hit extends all the way
3109 * to the EOF (i.e. there are no valid bytes between the
3110 * end of this range and the file's EOF):
3111 */
3112 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3113 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3114 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3115 }
3116
3117 retval = 0;
3118 goto exit;
3119
3120 case RL_OVERLAPISCONTAINED:
3121 case RL_OVERLAPENDSAFTER:
3122 /* The range of interest hits an invalid block before the end: */
3123 if (invalid_range->rl_start == ap->a_foffset) {
3124 /* There's actually no valid information to be had starting here: */
3125 *ap->a_bpn = (daddr64_t)-1;
3126 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3127 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3128 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3129 }
3130
3131 retval = 0;
3132 goto exit;
3133 } else {
3134 /*
3135 * Sadly, the lower layers don't like us to
3136 * return unaligned ranges, so we skip over
3137 * any invalid ranges here that are less than
3138 * a page: zeroing of those bits is not our
3139 * responsibility (it's dealt with elsewhere).
3140 */
3141 do {
3142 off_t rounded_start = round_page_64(invalid_range->rl_start);
3143 if ((off_t)bytesContAvail < rounded_start - ap->a_foffset)
3144 break;
3145 if (rounded_start < invalid_range->rl_end + 1) {
3146 bytesContAvail = rounded_start - ap->a_foffset;
3147 break;
3148 }
3149 } while ((invalid_range = TAILQ_NEXT(invalid_range,
3150 rl_link)));
3151 }
3152 break;
3153
3154 case RL_NOOVERLAP:
3155 break;
3156 } // switch
3157 }
3158 }
3159
3160 #if CONFIG_PROTECT
3161 if (cp->c_cpentry) {
3162 const int direction = (ISSET(ap->a_flags, VNODE_WRITE)
3163 ? VNODE_WRITE : VNODE_READ);
3164
3165 cp_io_params_t io_params;
3166 cp_io_params(hfsmp, cp->c_cpentry,
3167 off_rsrc_make(ap->a_foffset, VNODE_IS_RSRC(vp)),
3168 direction, &io_params);
3169
3170 if (io_params.max_len < (off_t)bytesContAvail)
3171 bytesContAvail = io_params.max_len;
3172
3173 if (io_params.phys_offset != -1) {
3174 *ap->a_bpn = ((io_params.phys_offset + hfsmp->hfsPlusIOPosOffset)
3175 / hfsmp->hfs_logical_block_size);
3176
3177 retval = 0;
3178 goto exit;
3179 }
3180 }
3181 #endif
3182
3183 retry:
3184
3185 /* Check virtual blocks only when performing write operation */
3186 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
3187 if (hfs_start_transaction(hfsmp) != 0) {
3188 retval = EINVAL;
3189 goto exit;
3190 } else {
3191 started_tr = 1;
3192 }
3193 syslocks = SFL_EXTENTS | SFL_BITMAP;
3194
3195 } else if (overflow_extents(fp)) {
3196 syslocks = SFL_EXTENTS;
3197 }
3198
3199 if (syslocks)
3200 lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
3201
3202 /*
3203 * Check for any delayed allocations.
3204 */
3205 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
3206 int64_t actbytes;
3207 u_int32_t loanedBlocks;
3208
3209 //
3210 // Make sure we have a transaction. It's possible
3211 // that we came in and fp->ff_unallocblocks was zero
3212 // but during the time we blocked acquiring the extents
3213 // btree, ff_unallocblocks became non-zero and so we
3214 // will need to start a transaction.
3215 //
3216 if (started_tr == 0) {
3217 if (syslocks) {
3218 hfs_systemfile_unlock(hfsmp, lockflags);
3219 syslocks = 0;
3220 }
3221 goto retry;
3222 }
3223
3224 /*
3225 * Note: ExtendFileC will release any blocks on loan and
3226 * acquire real blocks. So we ask to extend by zero bytes
3227 * since ExtendFileC will account for the virtual blocks.
3228 */
3229
3230 loanedBlocks = fp->ff_unallocblocks;
3231 retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
3232 kEFAllMask | kEFNoClumpMask, &actbytes);
3233
3234 if (retval) {
3235 fp->ff_unallocblocks = loanedBlocks;
3236 cp->c_blocks += loanedBlocks;
3237 fp->ff_blocks += loanedBlocks;
3238
3239 hfs_lock_mount (hfsmp);
3240 hfsmp->loanedBlocks += loanedBlocks;
3241 hfs_unlock_mount (hfsmp);
3242
3243 hfs_systemfile_unlock(hfsmp, lockflags);
3244 cp->c_flag |= C_MODIFIED;
3245 if (started_tr) {
3246 (void) hfs_update(vp, 0);
3247 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3248
3249 hfs_end_transaction(hfsmp);
3250 started_tr = 0;
3251 }
3252 goto exit;
3253 }
3254 }
3255
3256 retval = MapFileBlockC(hfsmp, (FCB *)fp, bytesContAvail, ap->a_foffset,
3257 ap->a_bpn, &bytesContAvail);
3258 if (syslocks) {
3259 hfs_systemfile_unlock(hfsmp, lockflags);
3260 syslocks = 0;
3261 }
3262
3263 if (retval) {
3264 /* On write, always return an error because virtual blocks, if any,
3265 * should have been allocated in ExtendFileC(). We do not
3266 * allocate virtual blocks on read, therefore return an error
3267 * only if no virtual blocks are allocated. Otherwise we search
3268 * the rangelist for zero-fills.
3269 */
3270 if ((MacToVFSError(retval) != ERANGE) ||
3271 (ap->a_flags & VNODE_WRITE) ||
3272 ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
3273 goto exit;
3274 }
3275
3276 /* Validate if the start offset is within logical file size */
3277 if (ap->a_foffset >= fp->ff_size) {
3278 goto exit;
3279 }
3280
3281 /*
3282 * At this point, we have encountered a failure during
3283 * MapFileBlockC that resulted in ERANGE, and we are not
3284 * servicing a write, and there are borrowed blocks.
3285 *
3286 * However, the cluster layer will not call blockmap for
3287 * blocks that are borrowed and in-cache. We have to assume
3288 * that because we observed ERANGE being emitted from
3289 * MapFileBlockC, this extent range is not valid on-disk. So
3290 * we treat this as a mapping that needs to be zero-filled
3291 * prior to reading.
3292 */
3293
3294 if (fp->ff_size - ap->a_foffset < (off_t)bytesContAvail)
3295 bytesContAvail = fp->ff_size - ap->a_foffset;
3296
3297 *ap->a_bpn = (daddr64_t) -1;
3298 retval = 0;
3299
3300 goto exit;
3301 }
3302
3303 exit:
3304 if (retval == 0) {
3305 if (ISSET(ap->a_flags, VNODE_WRITE)) {
3306 struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
3307
3308 // See if we might be overlapping invalid ranges...
3309 if (r && (ap->a_foffset + (off_t)bytesContAvail) > r->rl_start) {
3310 /*
3311 * Mark the file as needing an update if we think the
3312 * on-disk EOF has changed.
3313 */
3314 if (ap->a_foffset <= r->rl_start)
3315 SET(cp->c_flag, C_MODIFIED);
3316
3317 /*
3318 * This isn't the ideal place to put this. Ideally, we
3319 * should do something *after* we have successfully
3320 * written to the range, but that's difficult to do
3321 * because we cannot take locks in the callback. At
3322 * present, the cluster code will call us with VNODE_WRITE
3323 * set just before it's about to write the data so we know
3324 * that data is about to be written. If we get an I/O
3325 * error at this point then chances are the metadata
3326 * update to follow will also have an I/O error so the
3327 * risk here is small.
3328 */
3329 rl_remove(ap->a_foffset, ap->a_foffset + bytesContAvail - 1,
3330 &fp->ff_invalidranges);
3331
3332 if (!TAILQ_FIRST(&fp->ff_invalidranges)) {
3333 cp->c_flag &= ~C_ZFWANTSYNC;
3334 cp->c_zftimeout = 0;
3335 }
3336 }
3337 }
3338
3339 if (ap->a_run)
3340 *ap->a_run = bytesContAvail;
3341
3342 if (ap->a_poff)
3343 *(int *)ap->a_poff = 0;
3344 }
3345
3346 if (started_tr) {
3347 hfs_update(vp, TRUE);
3348 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3349 hfs_end_transaction(hfsmp);
3350 started_tr = 0;
3351 }
3352
3353 if (tooklock)
3354 hfs_unlock(cp);
3355
3356 return (MacToVFSError(retval));
3357 }
3358
3359 /*
3360 * prepare and issue the I/O
3361 * buf_strategy knows how to deal
3362 * with requests that require
3363 * fragmented I/Os
3364 */
3365 int
3366 hfs_vnop_strategy(struct vnop_strategy_args *ap)
3367 {
3368 buf_t bp = ap->a_bp;
3369 vnode_t vp = buf_vnode(bp);
3370 int error = 0;
3371
3372 /* Mark buffer as containing static data if cnode flag set */
3373 if (VTOC(vp)->c_flag & C_SSD_STATIC) {
3374 buf_markstatic(bp);
3375 }
3376
3377 /* Mark buffer for greedy mode writes if cnode flag set */
3378 if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
3379 bufattr_markgreedymode(&bp->b_attr);
3380 }
3381
3382 /* mark buffer as containing burst mode data if cnode flag set */
3383 if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) {
3384 bufattr_markisochronous(&bp->b_attr);
3385 }
3386
3387 #if CONFIG_PROTECT
3388 error = cp_handle_strategy(bp);
3389
3390 if (error)
3391 return error;
3392 #endif
3393
3394 error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
3395
3396 return error;
3397 }
3398
3399 int
3400 do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
3401 {
3402 register struct cnode *cp = VTOC(vp);
3403 struct filefork *fp = VTOF(vp);
3404 kauth_cred_t cred = vfs_context_ucred(context);
3405 int retval;
3406 off_t bytesToAdd;
3407 off_t actualBytesAdded;
3408 off_t filebytes;
3409 u_int32_t fileblocks;
3410 int blksize;
3411 struct hfsmount *hfsmp;
3412 int lockflags;
3413 int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);
3414
3415 blksize = VTOVCB(vp)->blockSize;
3416 fileblocks = fp->ff_blocks;
3417 filebytes = (off_t)fileblocks * (off_t)blksize;
3418
3419 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START,
3420 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3421
3422 if (length < 0)
3423 return (EINVAL);
3424
3425 /* This should only happen with a corrupt filesystem */
3426 if ((off_t)fp->ff_size < 0)
3427 return (EINVAL);
3428
3429 if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
3430 return (EFBIG);
3431
3432 hfsmp = VTOHFS(vp);
3433
3434 retval = E_NONE;
3435
3436 /* Files that are changing size are not hot file candidates. */
3437 if (hfsmp->hfc_stage == HFC_RECORDING) {
3438 fp->ff_bytesread = 0;
3439 }
3440
3441 /*
3442 * We cannot just check if fp->ff_size == length (as an optimization)
3443 * since there may be extra physical blocks that also need truncation.
3444 */
3445 #if QUOTA
3446 if ((retval = hfs_getinoquota(cp)))
3447 return(retval);
3448 #endif /* QUOTA */
3449
3450 /*
3451 * Lengthen the size of the file. We must ensure that the
3452 * last byte of the file is allocated. Since the smallest
3453 * value of ff_size is 0, length will be at least 1.
3454 */
3455 if (length > (off_t)fp->ff_size) {
3456 #if QUOTA
3457 retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
3458 cred, 0);
3459 if (retval)
3460 goto Err_Exit;
3461 #endif /* QUOTA */
3462 /*
3463 * If we don't have enough physical space then
3464 * we need to extend the physical size.
3465 */
3466 if (length > filebytes) {
3467 int eflags;
3468 u_int32_t blockHint = 0;
3469
3470 /* All or nothing and don't round up to clumpsize. */
3471 eflags = kEFAllMask | kEFNoClumpMask;
3472
3473 if (cred && (suser(cred, NULL) != 0)) {
3474 eflags |= kEFReserveMask; /* keep a reserve */
3475 }
3476
3477 /*
3478 * Allocate Journal and Quota files in metadata zone.
3479 */
3480 if (filebytes == 0 &&
3481 hfsmp->hfs_flags & HFS_METADATA_ZONE &&
3482 hfs_virtualmetafile(cp)) {
3483 eflags |= kEFMetadataMask;
3484 blockHint = hfsmp->hfs_metazone_start;
3485 }
3486 if (hfs_start_transaction(hfsmp) != 0) {
3487 retval = EINVAL;
3488 goto Err_Exit;
3489 }
3490
3491 /* Protect extents b-tree and allocation bitmap */
3492 lockflags = SFL_BITMAP;
3493 if (overflow_extents(fp))
3494 lockflags |= SFL_EXTENTS;
3495 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3496
3497 /*
3498 * Keep growing the file as long as the current EOF is
3499 * less than the desired value.
3500 */
3501 while ((length > filebytes) && (retval == E_NONE)) {
3502 bytesToAdd = length - filebytes;
3503 retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
3504 (FCB*)fp,
3505 bytesToAdd,
3506 blockHint,
3507 eflags,
3508 &actualBytesAdded));
3509
3510 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3511 if (actualBytesAdded == 0 && retval == E_NONE) {
3512 if (length > filebytes)
3513 length = filebytes;
3514 break;
3515 }
3516 } /* endwhile */
3517
3518 hfs_systemfile_unlock(hfsmp, lockflags);
3519
3520 if (hfsmp->jnl) {
3521 hfs_update(vp, 0);
3522 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3523 }
3524
3525 hfs_end_transaction(hfsmp);
3526
3527 if (retval)
3528 goto Err_Exit;
3529
3530 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3531 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3532 }
3533
3534 if (ISSET(flags, IO_NOZEROFILL)) {
3535 // An optimisation for the hibernation file
3536 if (vnode_isswap(vp))
3537 rl_remove_all(&fp->ff_invalidranges);
3538 } else {
3539 if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) {
3540 if (length > (off_t)fp->ff_size) {
3541 struct timeval tv;
3542
3543 /* Extending the file: time to fill out the current last page w. zeroes? */
3544 if (fp->ff_size & PAGE_MASK_64) {
3545 /* There might be some valid data at the start of the (current) last page
3546 of the file, so zero out the remainder of that page to ensure the
3547 entire page contains valid data. */
3548 hfs_unlock(cp);
3549 retval = hfs_zero_eof_page(vp, length);
3550 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3551 if (retval) goto Err_Exit;
3552 }
3553 microuptime(&tv);
3554 rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
3555 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3556 }
3557 } else {
3558 panic("hfs_truncate: invoked on non-UBC object?!");
3559 };
3560 }
3561 if (suppress_times == 0) {
3562 cp->c_touch_modtime = TRUE;
3563 }
3564 fp->ff_size = length;
3565
3566 } else { /* Shorten the size of the file */
3567
3568 // An optimisation for the hibernation file
3569 if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) {
3570 rl_remove_all(&fp->ff_invalidranges);
3571 } else if ((off_t)fp->ff_size > length) {
3572 /* Any space previously marked as invalid is now irrelevant: */
3573 rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
3574 }
3575
3576 /*
3577 * Account for any unmapped blocks. Note that the new
3578 * file length can still end up with unmapped blocks.
3579 */
3580 if (fp->ff_unallocblocks > 0) {
3581 u_int32_t finalblks;
3582 u_int32_t loanedBlocks;
3583
3584 hfs_lock_mount(hfsmp);
3585 loanedBlocks = fp->ff_unallocblocks;
3586 cp->c_blocks -= loanedBlocks;
3587 fp->ff_blocks -= loanedBlocks;
3588 fp->ff_unallocblocks = 0;
3589
3590 hfsmp->loanedBlocks -= loanedBlocks;
3591
3592 finalblks = (length + blksize - 1) / blksize;
3593 if (finalblks > fp->ff_blocks) {
3594 /* calculate required unmapped blocks */
3595 loanedBlocks = finalblks - fp->ff_blocks;
3596 hfsmp->loanedBlocks += loanedBlocks;
3597
3598 fp->ff_unallocblocks = loanedBlocks;
3599 cp->c_blocks += loanedBlocks;
3600 fp->ff_blocks += loanedBlocks;
3601 }
3602 hfs_unlock_mount (hfsmp);
3603 }
3604
3605 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
3606 if (hfs_start_transaction(hfsmp) != 0) {
3607 retval = EINVAL;
3608 goto Err_Exit;
3609 }
3610
3611 if (fp->ff_unallocblocks == 0) {
3612 /* Protect extents b-tree and allocation bitmap */
3613 lockflags = SFL_BITMAP;
3614 if (overflow_extents(fp))
3615 lockflags |= SFL_EXTENTS;
3616 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3617
3618 retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0,
3619 FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
3620
3621 hfs_systemfile_unlock(hfsmp, lockflags);
3622 }
3623 if (hfsmp->jnl) {
3624 if (retval == 0) {
3625 fp->ff_size = length;
3626 }
3627 hfs_update(vp, 0);
3628 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3629 }
3630 hfs_end_transaction(hfsmp);
3631
3632 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3633 if (retval)
3634 goto Err_Exit;
3635 #if QUOTA
3636 /* These are bytesreleased */
3637 (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
3638 #endif /* QUOTA */
3639
3640 //
3641 // Unlike when growing a file, we adjust the hotfile block count here
3642 // instead of deeper down in the block allocation code because we do
3643 // not necessarily have a vnode or "fcb" at the time we're deleting
3644 // the file and so we wouldn't know if it was hotfile cached or not
3645 //
3646 hfs_hotfile_adjust_blocks(vp, (int64_t)((savedbytes - filebytes) / blksize));
3647
3648
3649 /*
3650 * Only set update flag if the logical length changes & we aren't
3651 * suppressing modtime updates.
3652 */
3653 if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
3654 cp->c_touch_modtime = TRUE;
3655 }
3656 fp->ff_size = length;
3657 }
3658 if (cp->c_mode & (S_ISUID | S_ISGID)) {
3659 if (!vfs_context_issuser(context))
3660 cp->c_mode &= ~(S_ISUID | S_ISGID);
3661 }
3662 cp->c_flag |= C_MODIFIED;
3663 cp->c_touch_chgtime = TRUE; /* status changed */
3664 if (suppress_times == 0) {
3665 cp->c_touch_modtime = TRUE; /* file data was modified */
3666
3667 /*
3668 * If we are not suppressing the modtime update, then
3669 * update the gen count as well.
3670 */
3671 if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
3672 hfs_incr_gencount(cp);
3673 }
3674 }
3675
3676 retval = hfs_update(vp, 0);
3677 if (retval) {
3678 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3679 -1, -1, -1, retval, 0);
3680 }
3681
3682 Err_Exit:
3683
3684 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END,
3685 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
3686
3687 return (retval);
3688 }
3689
3690 /*
3691 * Preparation which must be done prior to deleting the catalog record
3692 * of a file or directory. In order to make the on-disk state as safe as possible,
3693 * we remove the catalog entry before releasing the bitmap blocks and the
3694 * overflow extent records. However, some work must be done prior to deleting
3695 * the catalog record.
3696 *
3697 * When calling this function, the cnode must exist both in memory and on-disk.
3698 * If there are both resource fork and data fork vnodes, this function should
3699 * be called on both.
3700 */
3701
3702 int
3703 hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
3704
3705 struct filefork *fp = VTOF(vp);
3706 struct cnode *cp = VTOC(vp);
3707 #if QUOTA
3708 int retval = 0;
3709 #endif /* QUOTA */
3710
3711 /* Cannot truncate an HFS directory! */
3712 if (vnode_isdir(vp)) {
3713 return (EISDIR);
3714 }
3715
3716 /*
3717 * See the comment below in hfs_truncate for why we need to call
3718 * setsize here. Essentially we want to avoid pending IO if we
3719 * already know that the blocks are going to be released here.
3720 * This function is only called when totally removing all storage for a file, so
3721 * we can take a shortcut and immediately setsize (0);
3722 */
3723 ubc_setsize(vp, 0);
3724
3725 /* This should only happen with a corrupt filesystem */
3726 if ((off_t)fp->ff_size < 0)
3727 return (EINVAL);
3728
3729 /*
3730 * We cannot just check if fp->ff_size == length (as an optimization)
3731 * since there may be extra physical blocks that also need truncation.
3732 */
3733 #if QUOTA
3734 if ((retval = hfs_getinoquota(cp))) {
3735 return(retval);
3736 }
3737 #endif /* QUOTA */
3738
3739 /* Wipe out any invalid ranges which have yet to be backed by disk */
3740 rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);
3741
3742 /*
3743 * Account for any unmapped blocks. Since we're deleting the
3744 * entire file, we don't have to worry about just shrinking
3745 * to a smaller number of borrowed blocks.
3746 */
3747 if (fp->ff_unallocblocks > 0) {
3748 u_int32_t loanedBlocks;
3749
3750 hfs_lock_mount (hfsmp);
3751 loanedBlocks = fp->ff_unallocblocks;
3752 cp->c_blocks -= loanedBlocks;
3753 fp->ff_blocks -= loanedBlocks;
3754 fp->ff_unallocblocks = 0;
3755
3756 hfsmp->loanedBlocks -= loanedBlocks;
3757
3758 hfs_unlock_mount (hfsmp);
3759 }
3760
3761 return 0;
3762 }
3763
3764
3765 /*
3766 * Special wrapper around calling TruncateFileC. This function is usable
3767 * even when the catalog record does not exist any longer, making it ideal
3768 * for use when deleting a file. The simplification here is that we know
3769 * that we are releasing all blocks.
3770 *
3771 * Note that this function may be called when there is no vnode backing
3772 * the file fork in question. We may call this from hfs_vnop_inactive
3773 * to clear out resource fork data (and may not want to clear out the data
3774 * fork yet). As a result, we pointer-check both sets of inputs before
3775 * doing anything with them.
3776 *
3777 * The caller is responsible for saving off a copy of the filefork(s)
3778 * embedded within the cnode prior to calling this function. The pointers
3779 * supplied as arguments must be valid even if the cnode is no longer valid.
3780 */
3781
3782 int
3783 hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
3784 struct filefork *rsrcfork, u_int32_t fileid) {
3785
3786 off_t filebytes;
3787 u_int32_t fileblocks;
3788 int blksize = 0;
3789 int error = 0;
3790 int lockflags;
3791
3792 blksize = hfsmp->blockSize;
3793
3794 /* Data Fork */
3795 if (datafork) {
3796 off_t prev_filebytes;
3797 datafork->ff_size = 0;
3798
3799 fileblocks = datafork->ff_blocks;
3800 filebytes = (off_t)fileblocks * (off_t)blksize;
3801 prev_filebytes = filebytes;
3802
3803 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3804
3805 while (filebytes > 0) {
3806 if (filebytes > HFS_BIGFILE_SIZE) {
3807 filebytes -= HFS_BIGFILE_SIZE;
3808 } else {
3809 filebytes = 0;
3810 }
3811
3812 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3813 if (hfs_start_transaction(hfsmp) != 0) {
3814 error = EINVAL;
3815 break;
3816 }
3817
3818 if (datafork->ff_unallocblocks == 0) {
3819 /* Protect extents b-tree and allocation bitmap */
3820 lockflags = SFL_BITMAP;
3821 if (overflow_extents(datafork))
3822 lockflags |= SFL_EXTENTS;
3823 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3824
3825 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));
3826
3827 hfs_systemfile_unlock(hfsmp, lockflags);
3828 }
3829 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3830
3831 struct cnode *cp = datafork ? FTOC(datafork) : NULL;
3832 struct vnode *vp;
3833 vp = cp ? CTOV(cp, 0) : NULL;
3834 hfs_hotfile_adjust_blocks(vp, (int64_t)((prev_filebytes - filebytes) / blksize));
3835 prev_filebytes = filebytes;
3836
3837 /* Finish the transaction and start over if necessary */
3838 hfs_end_transaction(hfsmp);
3839
3840 if (error) {
3841 break;
3842 }
3843 }
3844 }
3845
3846 /* Resource fork */
3847 if (error == 0 && rsrcfork) {
3848 rsrcfork->ff_size = 0;
3849
3850 fileblocks = rsrcfork->ff_blocks;
3851 filebytes = (off_t)fileblocks * (off_t)blksize;
3852
3853 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3854
3855 while (filebytes > 0) {
3856 if (filebytes > HFS_BIGFILE_SIZE) {
3857 filebytes -= HFS_BIGFILE_SIZE;
3858 } else {
3859 filebytes = 0;
3860 }
3861
3862 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3863 if (hfs_start_transaction(hfsmp) != 0) {
3864 error = EINVAL;
3865 break;
3866 }
3867
3868 if (rsrcfork->ff_unallocblocks == 0) {
3869 /* Protect extents b-tree and allocation bitmap */
3870 lockflags = SFL_BITMAP;
3871 if (overflow_extents(rsrcfork))
3872 lockflags |= SFL_EXTENTS;
3873 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3874
3875 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));
3876
3877 hfs_systemfile_unlock(hfsmp, lockflags);
3878 }
3879 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3880
3881 /* Finish the transaction and start over if necessary */
3882 hfs_end_transaction(hfsmp);
3883
3884 if (error) {
3885 break;
3886 }
3887 }
3888 }
3889
3890 return error;
3891 }
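/*
 * Hypothetical sketch (illustrative only, not part of the build) of how a
 * delete path could pair hfs_prepare_release_storage() with
 * hfs_release_storage(), following the comments above: prepare each fork's
 * vnode, save copies of the fork structures, remove the catalog record,
 * then free the blocks. The variable names (dvp, rvp, cp) are made up for
 * the example.
 */
#if 0
	struct filefork dfork_copy, rfork_copy;
	u_int32_t fileid = cp->c_fileid;

	if (dvp)
		(void) hfs_prepare_release_storage(hfsmp, dvp);
	if (rvp)
		(void) hfs_prepare_release_storage(hfsmp, rvp);

	if (dvp)
		dfork_copy = *VTOF(dvp);	/* copies must outlive the cnode */
	if (rvp)
		rfork_copy = *VTOF(rvp);

	/* ... delete the catalog record here ... */

	(void) hfs_release_storage(hfsmp, dvp ? &dfork_copy : NULL,
	                           rvp ? &rfork_copy : NULL, fileid);
#endif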
3892
3893 errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock)
3894 {
3895 errno_t error;
3896
3897 /*
3898 * Call ubc_setsize to give the VM subsystem a chance to do
3899 * whatever it needs to with existing pages before we delete
3900 * blocks. Note that symlinks don't use the UBC so we'll
3901 * get back ENOENT in that case.
3902 */
3903 if (have_cnode_lock) {
3904 error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY);
3905 if (error == EAGAIN) {
3906 cnode_t *cp = VTOC(vp);
3907
3908 if (cp->c_truncatelockowner != current_thread()) {
3909 #if DEVELOPMENT || DEBUG
3910 panic("hfs: hfs_ubc_setsize called without exclusive truncate lock!");
3911 #else
3912 printf("hfs: hfs_ubc_setsize called without exclusive truncate lock!\n");
3913 #endif
3914 }
3915
3916 hfs_unlock(cp);
3917 error = ubc_setsize_ex(vp, len, 0);
3918 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
3919 }
3920 } else
3921 error = ubc_setsize_ex(vp, len, 0);
3922
3923 return error == ENOENT ? 0 : error;
3924 }
3925
3926 /*
3927 * Truncate a cnode to at most length size, freeing (or adding) the
3928 * disk blocks.
3929 */
3930 int
3931 hfs_truncate(struct vnode *vp, off_t length, int flags,
3932 int truncateflags, vfs_context_t context)
3933 {
3934 struct filefork *fp = VTOF(vp);
3935 off_t filebytes;
3936 u_int32_t fileblocks;
3937 int blksize;
3938 errno_t error = 0;
3939 struct cnode *cp = VTOC(vp);
3940 hfsmount_t *hfsmp = VTOHFS(vp);
3941
3942 /* Cannot truncate an HFS directory! */
3943 if (vnode_isdir(vp)) {
3944 return (EISDIR);
3945 }
3946 /* A swap file cannot change size. */
3947 if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) {
3948 return (EPERM);
3949 }
3950
3951 blksize = hfsmp->blockSize;
3952 fileblocks = fp->ff_blocks;
3953 filebytes = (off_t)fileblocks * (off_t)blksize;
3954
3955 bool caller_has_cnode_lock = (cp->c_lockowner == current_thread());
3956
3957 error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock);
3958 if (error)
3959 return error;
3960
3961 if (!caller_has_cnode_lock) {
3962 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
3963 if (error)
3964 return error;
3965 }
3966
3967 // have to loop truncating or growing files that are
3968 // really big because otherwise transactions can get
3969 // enormous and consume too many kernel resources.
3970
3971 if (length < filebytes) {
3972 while (filebytes > length) {
3973 if ((filebytes - length) > HFS_BIGFILE_SIZE) {
3974 filebytes -= HFS_BIGFILE_SIZE;
3975 } else {
3976 filebytes = length;
3977 }
3978 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3979 if (error)
3980 break;
3981 }
3982 } else if (length > filebytes) {
3983 kauth_cred_t cred = vfs_context_ucred(context);
3984 const bool keep_reserve = cred && suser(cred, NULL) != 0;
3985
3986 if (hfs_freeblks(hfsmp, keep_reserve)
3987 < howmany(length - filebytes, blksize)) {
3988 error = ENOSPC;
3989 } else {
3990 while (filebytes < length) {
3991 if ((length - filebytes) > HFS_BIGFILE_SIZE) {
3992 filebytes += HFS_BIGFILE_SIZE;
3993 } else {
3994 filebytes = length;
3995 }
3996 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3997 if (error)
3998 break;
3999 }
4000 }
4001 } else /* Same logical size */ {
4002
4003 error = do_hfs_truncate(vp, length, flags, truncateflags, context);
4004 }
4005 /* Files that are changing size are not hot file candidates. */
4006 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4007 fp->ff_bytesread = 0;
4008 }
4009
4010
4011 if (!caller_has_cnode_lock)
4012 hfs_unlock(cp);
4013
4014 // Make sure UBC's size matches up (in case we didn't completely succeed)
4015 errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock);
4016 if (!error)
4017 error = err2;
4018
4019 return error;
4020 }
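/*
 * Worked example (illustrative): with HFS_BIGFILE_SIZE-sized steps, shrinking
 * a file whose allocated size is a little over 2 * HFS_BIGFILE_SIZE down to 0
 * results in three do_hfs_truncate() calls -- two full-sized steps plus a
 * final step to the requested length -- each in its own transaction, so no
 * single journal transaction has to free an unbounded number of extents.
 */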
4021
4022
4023 /*
4024 * Preallocate file storage space.
4025 */
4026 int
4027 hfs_vnop_allocate(struct vnop_allocate_args /* {
4028 vnode_t a_vp;
4029 off_t a_length;
4030 u_int32_t a_flags;
4031 off_t *a_bytesallocated;
4032 off_t a_offset;
4033 vfs_context_t a_context;
4034 } */ *ap)
4035 {
4036 struct vnode *vp = ap->a_vp;
4037 struct cnode *cp;
4038 struct filefork *fp;
4039 ExtendedVCB *vcb;
4040 off_t length = ap->a_length;
4041 off_t startingPEOF;
4042 off_t moreBytesRequested;
4043 off_t actualBytesAdded;
4044 off_t filebytes;
4045 u_int32_t fileblocks;
4046 int retval, retval2;
4047 u_int32_t blockHint;
4048 u_int32_t extendFlags; /* For call to ExtendFileC */
4049 struct hfsmount *hfsmp;
4050 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
4051 int lockflags;
4052 time_t orig_ctime;
4053
4054 *(ap->a_bytesallocated) = 0;
4055
4056 if (!vnode_isreg(vp))
4057 return (EISDIR);
4058 if (length < (off_t)0)
4059 return (EINVAL);
4060
4061 cp = VTOC(vp);
4062
4063 orig_ctime = VTOC(vp)->c_ctime;
4064
4065 check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
4066
4067 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
4068
4069 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4070 goto Err_Exit;
4071 }
4072
4073 fp = VTOF(vp);
4074 hfsmp = VTOHFS(vp);
4075 vcb = VTOVCB(vp);
4076
4077 fileblocks = fp->ff_blocks;
4078 filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
4079
4080 if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
4081 retval = EINVAL;
4082 goto Err_Exit;
4083 }
4084
4085 /* Fill in the flags word for the call to Extend the file */
4086
4087 extendFlags = kEFNoClumpMask;
4088 if (ap->a_flags & ALLOCATECONTIG)
4089 extendFlags |= kEFContigMask;
4090 if (ap->a_flags & ALLOCATEALL)
4091 extendFlags |= kEFAllMask;
4092 if (cred && suser(cred, NULL) != 0)
4093 extendFlags |= kEFReserveMask;
4094 if (hfs_virtualmetafile(cp))
4095 extendFlags |= kEFMetadataMask;
4096
4097 retval = E_NONE;
4098 blockHint = 0;
4099 startingPEOF = filebytes;
4100
4101 if (ap->a_flags & ALLOCATEFROMPEOF)
4102 length += filebytes;
4103 else if (ap->a_flags & ALLOCATEFROMVOL)
4104 blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
4105
4106 /* If no changes are necessary, then we're done */
4107 if (filebytes == length)
4108 goto Std_Exit;
4109
4110 /*
4111 * Lengthen the size of the file. We must ensure that the
4112 * last byte of the file is allocated. Since the smallest
4113 * value of filebytes is 0, length will be at least 1.
4114 */
4115 if (length > filebytes) {
4116 if (ISSET(extendFlags, kEFAllMask)
4117 && (hfs_freeblks(hfsmp, ISSET(extendFlags, kEFReserveMask))
4118 < howmany(length - filebytes, hfsmp->blockSize))) {
4119 retval = ENOSPC;
4120 goto Err_Exit;
4121 }
4122
4123 off_t total_bytes_added = 0, orig_request_size;
4124
4125 orig_request_size = moreBytesRequested = length - filebytes;
4126
4127 #if QUOTA
4128 retval = hfs_chkdq(cp,
4129 (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
4130 cred, 0);
4131 if (retval)
4132 goto Err_Exit;
4133
4134 #endif /* QUOTA */
4135 /*
4136 * Metadata zone checks.
4137 */
4138 if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
4139 /*
4140 * Allocate Journal and Quota files in metadata zone.
4141 */
4142 if (hfs_virtualmetafile(cp)) {
4143 blockHint = hfsmp->hfs_metazone_start;
4144 } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
4145 (blockHint <= hfsmp->hfs_metazone_end)) {
4146 /*
4147 * Move blockHint outside metadata zone.
4148 */
4149 blockHint = hfsmp->hfs_metazone_end + 1;
4150 }
4151 }
4152
4153
4154 while ((length > filebytes) && (retval == E_NONE)) {
4155 off_t bytesRequested;
4156
4157 if (hfs_start_transaction(hfsmp) != 0) {
4158 retval = EINVAL;
4159 goto Err_Exit;
4160 }
4161
4162 /* Protect extents b-tree and allocation bitmap */
4163 lockflags = SFL_BITMAP;
4164 if (overflow_extents(fp))
4165 lockflags |= SFL_EXTENTS;
4166 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4167
4168 if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
4169 bytesRequested = HFS_BIGFILE_SIZE;
4170 } else {
4171 bytesRequested = moreBytesRequested;
4172 }
4173
4174 if (extendFlags & kEFContigMask) {
4175 // if we're on a sparse device, this will force it to do a
4176 // full scan to find the space needed.
4177 hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
4178 }
4179
4180 retval = MacToVFSError(ExtendFileC(vcb,
4181 (FCB*)fp,
4182 bytesRequested,
4183 blockHint,
4184 extendFlags,
4185 &actualBytesAdded));
4186
4187 if (retval == E_NONE) {
4188 *(ap->a_bytesallocated) += actualBytesAdded;
4189 total_bytes_added += actualBytesAdded;
4190 moreBytesRequested -= actualBytesAdded;
4191 if (blockHint != 0) {
4192 blockHint += actualBytesAdded / vcb->blockSize;
4193 }
4194 }
4195 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4196
4197 hfs_systemfile_unlock(hfsmp, lockflags);
4198
4199 if (hfsmp->jnl) {
4200 (void) hfs_update(vp, 0);
4201 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
4202 }
4203
4204 hfs_end_transaction(hfsmp);
4205 }
4206
4207
4208 /*
4209 * if we get an error and no changes were made then exit
4210 * otherwise we must do the hfs_update to reflect the changes
4211 */
4212 if (retval && (startingPEOF == filebytes))
4213 goto Err_Exit;
4214
4215 /*
4216 * Adjust actualBytesAdded to be allocation block aligned, not
4217 * clump size aligned.
4218 * NOTE: So what we are reporting does not affect reality
4219 * until the file is closed, when we truncate the file to allocation
4220 * block size.
4221 */
4222 if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
4223 *(ap->a_bytesallocated) =
4224 roundup(orig_request_size, (off_t)vcb->blockSize);
4225
4226 } else { /* Shorten the size of the file */
4227
4228 /*
4229 * N.B. At present, this code is never called. If and when we
4230 * do start using it, it looks like there might be slightly
4231 * strange semantics with the file size: it's possible for the
4232 * file size to *increase* e.g. if current file size is 5,
4233 * length is 1024 and filebytes is 4096, the file size will
4234 * end up being 1024 bytes. This isn't necessarily a problem
4235 * but it's not consistent with the code above which doesn't
4236 * change the file size.
4237 */
4238
4239 retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
4240 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4241
4242 /*
4243 * if we get an error and no changes were made then exit
4244 * otherwise we must do the hfs_update to reflect the changes
4245 */
4246 if (retval && (startingPEOF == filebytes)) goto Err_Exit;
4247 #if QUOTA
4248 /* These are bytesreleased */
4249 (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
4250 #endif /* QUOTA */
4251
4252 if (fp->ff_size > filebytes) {
4253 fp->ff_size = filebytes;
4254
4255 hfs_ubc_setsize(vp, fp->ff_size, true);
4256 }
4257 }
4258
4259 Std_Exit:
4260 cp->c_flag |= C_MODIFIED;
4261 cp->c_touch_chgtime = TRUE;
4262 cp->c_touch_modtime = TRUE;
4263 retval2 = hfs_update(vp, 0);
4264
4265 if (retval == 0)
4266 retval = retval2;
4267 Err_Exit:
4268 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4269 hfs_unlock(cp);
4270 return (retval);
4271 }
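/*
 * Illustrative userspace sketch (not part of this file): preallocation
 * requests typically reach hfs_vnop_allocate() via fcntl(F_PREALLOCATE),
 * whose fstore flags correspond to the ALLOCATECONTIG/ALLOCATEALL/
 * ALLOCATEFROMPEOF cases handled above. Treat this as an assumption-laden
 * example rather than a specification of the call path.
 */
#if 0
	fstore_t fst = {
		.fst_flags   = F_ALLOCATECONTIG | F_ALLOCATEALL,
		.fst_posmode = F_PEOFPOSMODE,		/* allocate from the physical EOF */
		.fst_offset  = 0,
		.fst_length  = 16 * 1024 * 1024,	/* grow by 16 MiB */
	};
	if (fcntl(fd, F_PREALLOCATE, &fst) == -1) {
		/* contiguous allocation failed; retry without the contiguity hint */
		fst.fst_flags = F_ALLOCATEALL;
		(void) fcntl(fd, F_PREALLOCATE, &fst);
	}
	/* fst.fst_bytesalloc now reports how many bytes were actually reserved */
#endif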
4272
4273
4274 /*
4275 * Pagein for HFS filesystem
4276 */
4277 int
4278 hfs_vnop_pagein(struct vnop_pagein_args *ap)
4279 /*
4280 struct vnop_pagein_args {
4281 vnode_t a_vp,
4282 upl_t a_pl,
4283 vm_offset_t a_pl_offset,
4284 off_t a_f_offset,
4285 size_t a_size,
4286 int a_flags
4287 vfs_context_t a_context;
4288 };
4289 */
4290 {
4291 vnode_t vp;
4292 struct cnode *cp;
4293 struct filefork *fp;
4294 int error = 0;
4295 upl_t upl;
4296 upl_page_info_t *pl;
4297 off_t f_offset;
4298 off_t page_needed_f_offset;
4299 int offset;
4300 int isize;
4301 int upl_size;
4302 int pg_index;
4303 boolean_t truncate_lock_held = FALSE;
4304 boolean_t file_converted = FALSE;
4305 kern_return_t kret;
4306
4307 vp = ap->a_vp;
4308 cp = VTOC(vp);
4309 fp = VTOF(vp);
4310
4311 #if CONFIG_PROTECT
4312 if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
4313 /*
4314 * If we errored here, then this means that one of two things occurred:
4315 * 1. there was a problem with the decryption of the key.
4316 * 2. the device is locked and we are not allowed to access this particular file.
4317 *
4318 * Either way, this means that we need to shut down this UPL now. As long as
4319 * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves),
4320 * we create a UPL and immediately abort it.
4321 */
4322 if (ap->a_pl == NULL) {
4323 /* create the upl */
4324 ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl,
4325 UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4326 /* mark the range as needed so it doesn't immediately get discarded upon abort */
4327 ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);
4328
4329 /* Abort the range */
4330 ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4331 }
4332
4333
4334 return error;
4335 }
4336 #endif /* CONFIG_PROTECT */
4337
4338 if (ap->a_pl != NULL) {
4339 /*
4340 * this can only happen for swap files now that
4341 * we're asking for V2 paging behavior...
4342 * so we don't need to worry about decompression, or
4343 * keeping track of blocks read or taking the truncate lock
4344 */
4345 error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
4346 ap->a_size, (off_t)fp->ff_size, ap->a_flags);
4347 goto pagein_done;
4348 }
4349
4350 page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset;
4351
4352 retry_pagein:
4353 /*
4354 * take truncate lock (shared/recursive) to guard against
4355 * zero-fill thru fsync interfering, but only for v2
4356 *
4357 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
4358 * lock shared and we are allowed to recurse 1 level if this thread already
4359 * owns the lock exclusively... this can legally occur
4360 * if we are doing a shrinking ftruncate against a file
4361 * that is mapped private, and the pages being truncated
4362 * do not currently exist in the cache... in that case
4363 * we will have to page-in the missing pages in order
4364 * to provide them to the private mapping... we must
4365 * also call hfs_unlock_truncate with a positive been_recursed
4366 * arg to indicate that if we have recursed, there is no need to drop
4367 * the lock. Allowing this simple recursion is necessary
4368 * in order to avoid a certain deadlock... since the ftruncate
4369 * already holds the truncate lock exclusively, if we try
4370 * to acquire it shared to protect the pagein path, we will
4371 * hang this thread
4372 *
4373 * NOTE: The if () block below is a workaround in order to prevent a
4374 * VM deadlock. See rdar://7853471.
4375 *
4376 * If we are in a forced unmount, then launchd will still have the
4377 * dyld_shared_cache file mapped as it is trying to reboot. If we
4378 * take the truncate lock here to service a page fault, then our
4379 * thread could deadlock with the forced-unmount. The forced unmount
4380 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
4381 * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount
4382 * thread will think it needs to copy all of the data out of the file
4383 * and into a VM copy object. If we hold the cnode lock here, then that
4384 * VM operation will not be able to proceed, because we'll set a busy page
4385 * before attempting to grab the lock. Note that this isn't as simple as "don't
4386 * call ubc_setsize" because doing that would just shift the problem to the
4387 * ubc_msync done before the vnode is reclaimed.
4388 *
4389 * So, if a forced unmount on this volume is in flight AND the cnode is
4390 * marked C_DELETED, then just go ahead and do the page in without taking
4391 * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file
4392 * that is not going to be available on the next mount, this seems like an
4393 * OK solution from a correctness point of view, even though it is hacky.
4394 */
4395 if (vfs_isforce(vp->v_mount)) {
4396 if (cp->c_flag & C_DELETED) {
4397 /* If we don't get it, then just go ahead and operate without the lock */
4398 truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4399 }
4400 }
4401 else {
4402 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4403 truncate_lock_held = TRUE;
4404 }
4405
4406 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4407
4408 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4409 error = EINVAL;
4410 goto pagein_done;
4411 }
4412 ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
4413
4414 upl_size = isize = ap->a_size;
4415
4416 /*
4417 * Scan from the back to find the last page in the UPL, so that we
4418 * aren't looking at a UPL that may have already been freed by the
4419 * preceding aborts/completions.
4420 */
4421 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4422 if (upl_page_present(pl, --pg_index))
4423 break;
4424 if (pg_index == 0) {
4425 /*
4426 * no absent pages were found in the range specified
4427 * just abort the UPL to get rid of it and then we're done
4428 */
4429 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4430 goto pagein_done;
4431 }
4432 }
4433 /*
4434 * initialize the offset variables before we touch the UPL.
4435 * f_offset is the position into the file, in bytes
4436 * offset is the position into the UPL, in bytes
4437 * pg_index is the pg# of the UPL we're operating on
4438 * isize is the offset into the UPL of the last page that is present.
4439 */
4440 isize = ((pg_index + 1) * PAGE_SIZE);
4441 pg_index = 0;
4442 offset = 0;
4443 f_offset = ap->a_f_offset;
4444
4445 while (isize) {
4446 int xsize;
4447 int num_of_pages;
4448
4449 if ( !upl_page_present(pl, pg_index)) {
4450 /*
4451 * we asked for RET_ONLY_ABSENT, so it's possible
4452 * to get back empty slots in the UPL.
4453 * just skip over them
4454 */
4455 f_offset += PAGE_SIZE;
4456 offset += PAGE_SIZE;
4457 isize -= PAGE_SIZE;
4458 pg_index++;
4459
4460 continue;
4461 }
4462 /*
4463 * We know that we have at least one absent page.
4464 * Now checking to see how many in a row we have
4465 */
4466 num_of_pages = 1;
4467 xsize = isize - PAGE_SIZE;
4468
4469 while (xsize) {
4470 if ( !upl_page_present(pl, pg_index + num_of_pages))
4471 break;
4472 num_of_pages++;
4473 xsize -= PAGE_SIZE;
4474 }
4475 xsize = num_of_pages * PAGE_SIZE;
4476
4477 #if HFS_COMPRESSION
4478 if (VNODE_IS_RSRC(vp)) {
4479 /* allow pageins of the resource fork */
4480 } else {
4481 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
4482
4483 if (compressed) {
4484
4485 if (truncate_lock_held) {
4486 /*
4487 * can't hold the truncate lock when calling into the decmpfs layer
4488 * since it calls back into this layer... even though we're only
4489 * holding the lock in shared mode, and the re-entrant path only
4490 * takes the lock shared, we can deadlock if some other thread
4491 * tries to grab the lock exclusively in between.
4492 */
4493 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4494 truncate_lock_held = FALSE;
4495 }
4496 ap->a_pl = upl;
4497 ap->a_pl_offset = offset;
4498 ap->a_f_offset = f_offset;
4499 ap->a_size = xsize;
4500
4501 error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
4502 /*
4503 * note that decmpfs_pagein_compressed can change the state of
4504 * 'compressed'... it will set it to 0 if the file is no longer
4505 * compressed once the compression lock is successfully taken
4506 * i.e. we would block on that lock while the file is being inflated
4507 */
4508 if (error == 0 && vnode_isfastdevicecandidate(vp)) {
4509 (void) hfs_addhotfile(vp);
4510 }
4511 if (compressed) {
4512 if (error == 0) {
4513 /* successful page-in, update the access time */
4514 VTOC(vp)->c_touch_acctime = TRUE;
4515
4516 //
4517 // compressed files are not traditional hot file candidates
4518 // but they may be for CF (which ignores the ff_bytesread
4519 // field)
4520 //
4521 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4522 fp->ff_bytesread = 0;
4523 }
4524 } else if (error == EAGAIN) {
4525 /*
4526 * EAGAIN indicates someone else already holds the compression lock...
4527 * to avoid deadlocking, we'll abort this range of pages with an
4528 * indication that the pagein needs to be redriven
4529 */
4530 ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
4531 } else if (error == ENOSPC) {
4532
4533 if (upl_size == PAGE_SIZE)
4534 panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n");
4535
4536 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4537
4538 ap->a_size = PAGE_SIZE;
4539 ap->a_pl = NULL;
4540 ap->a_pl_offset = 0;
4541 ap->a_f_offset = page_needed_f_offset;
4542
4543 goto retry_pagein;
4544 }
4545 goto pagein_next_range;
4546 }
4547 else {
4548 /*
4549 * Set file_converted only if the file became decompressed while we were
4550 * paging in. If it were still compressed, we would re-start the loop using the goto
4551 * in the above block. This avoids overloading truncate_lock_held as our retry_pagein
4552 * condition below, since we could have avoided taking the truncate lock to prevent
4553 * a deadlock in the force unmount case.
4554 */
4555 file_converted = TRUE;
4556 }
4557 }
4558 if (file_converted == TRUE) {
4559 /*
4560 * the file was converted back to a regular file after we first saw it as compressed
4561 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
4562 * reset a_size so that we consider what remains of the original request
4563 * and null out a_upl and a_pl_offset.
4564 *
4565 * We should only be able to get into this block if the decmpfs_pagein_compressed
4566 * successfully decompressed the range in question for this file.
4567 */
4568 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4569
4570 ap->a_size = isize;
4571 ap->a_pl = NULL;
4572 ap->a_pl_offset = 0;
4573
4574 /* Reset file_converted back to false so that we don't infinite-loop. */
4575 file_converted = FALSE;
4576 goto retry_pagein;
4577 }
4578 }
4579 #endif
4580 error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);
4581
4582 /*
4583 * Keep track of blocks read.
4584 */
4585 if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
4586 int bytesread;
4587 int took_cnode_lock = 0;
4588
4589 if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
4590 bytesread = fp->ff_size;
4591 else
4592 bytesread = xsize;
4593
4594 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
4595 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
4596 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4597 took_cnode_lock = 1;
4598 }
4599 /*
4600 * If this file hasn't been seen since the start of
4601 * the current sampling period then start over.
4602 */
4603 if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
4604 struct timeval tv;
4605
4606 fp->ff_bytesread = bytesread;
4607 microtime(&tv);
4608 cp->c_atime = tv.tv_sec;
4609 } else {
4610 fp->ff_bytesread += bytesread;
4611 }
4612 cp->c_touch_acctime = TRUE;
4613
4614 if (vnode_isfastdevicecandidate(vp)) {
4615 (void) hfs_addhotfile(vp);
4616 }
4617 if (took_cnode_lock)
4618 hfs_unlock(cp);
4619 }
4620 pagein_next_range:
4621 f_offset += xsize;
4622 offset += xsize;
4623 isize -= xsize;
4624 pg_index += num_of_pages;
4625
4626 error = 0;
4627 }
4628
4629 pagein_done:
4630 if (truncate_lock_held == TRUE) {
4631 /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
4632 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4633 }
4634
4635 return (error);
4636 }
4637
4638 /*
4639 * Pageout for HFS filesystem.
4640 */
4641 int
4642 hfs_vnop_pageout(struct vnop_pageout_args *ap)
4643 /*
4644 struct vnop_pageout_args {
4645 vnode_t a_vp,
4646 upl_t a_pl,
4647 vm_offset_t a_pl_offset,
4648 off_t a_f_offset,
4649 size_t a_size,
4650 int a_flags
4651 vfs_context_t a_context;
4652 };
4653 */
4654 {
4655 vnode_t vp = ap->a_vp;
4656 struct cnode *cp;
4657 struct filefork *fp;
4658 int retval = 0;
4659 off_t filesize;
4660 upl_t upl;
4661 upl_page_info_t* pl;
4662 vm_offset_t a_pl_offset;
4663 int a_flags;
4664 int is_pageoutv2 = 0;
4665 kern_return_t kret;
4666
4667 cp = VTOC(vp);
4668 fp = VTOF(vp);
4669
4670 a_flags = ap->a_flags;
4671 a_pl_offset = ap->a_pl_offset;
4672
4673 /*
4674 * we can tell if we're getting the new or old behavior from the UPL
4675 */
4676 if ((upl = ap->a_pl) == NULL) {
4677 int request_flags;
4678
4679 is_pageoutv2 = 1;
4680 /*
4681 * we're in control of any UPL we commit
4682 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
4683 */
4684 a_flags &= ~UPL_NOCOMMIT;
4685 a_pl_offset = 0;
4686
4687 /*
4688 * For V2 semantics, we want to take the cnode truncate lock
4689 * shared to guard against the file size changing via zero-filling.
4690 *
4691 * However, we have to be careful because we may be invoked
4692 * via the ubc_msync path to write out dirty mmap'd pages
4693 * in response to a lock event on a content-protected
4694 * filesystem (e.g. to write out class A files).
4695 * As a result, we want to take the truncate lock 'SHARED' with
4696 * the mini-recursion locktype so that we don't deadlock/panic
4697 * because we may be already holding the truncate lock exclusive to force any other
4698 * IOs to have blocked behind us.
4699 */
4700 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4701
4702 if (a_flags & UPL_MSYNC) {
4703 request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
4704 }
4705 else {
4706 request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
4707 }
4708
4709 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
4710
4711 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4712 retval = EINVAL;
4713 goto pageout_done;
4714 }
4715 }
4716 /*
4717 * from this point forward upl points at the UPL we're working with
4718 * it was either passed in or we successfully created it
4719 */
4720
4721 /*
4722 * Figure out where the file ends, for pageout purposes. If
4723 * ff_new_size > ff_size, then we're in the middle of extending the
4724 * file via a write, so it is safe (and necessary) that we be able
4725 * to pageout up to that point.
4726 */
4727 filesize = fp->ff_size;
4728 if (fp->ff_new_size > filesize)
4729 filesize = fp->ff_new_size;
4730
4731 /*
4732 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
4733 * UPL instead of relying on the UPL passed into us. We go ahead and do that here,
4734 * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for
4735 * N dirty ranges in the UPL. Note that this is almost a direct copy of the
4736 * logic in vnode_pageout except that we need to do it after grabbing the truncate
4737 * lock in HFS so that we don't lock invert ourselves.
4738 *
4739 * Note that we can still get into this function on behalf of the default pager with
4740 * non-V2 behavior (swapfiles). However in that case, we did not grab locks above
4741 * since fsync and other writing threads will grab the locks, then mark the
4742 * relevant pages as busy. But the pageout codepath marks the pages as busy,
4743 * and THEN would attempt to grab the truncate lock, which would result in deadlock. So
4744 * we do not try to grab anything for the pre-V2 case, which should only be accessed
4745 * by the paging/VM system.
4746 */
4747
4748 if (is_pageoutv2) {
4749 off_t f_offset;
4750 int offset;
4751 int isize;
4752 int pg_index;
4753 int error;
4754 int error_ret = 0;
4755
4756 isize = ap->a_size;
4757 f_offset = ap->a_f_offset;
4758
4759 /*
4760 * Scan from the back to find the last page in the UPL, so that we
4761 * aren't looking at a UPL that may have already been freed by the
4762 * preceding aborts/completions.
4763 */
4764 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4765 if (upl_page_present(pl, --pg_index))
4766 break;
4767 if (pg_index == 0) {
4768 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4769 goto pageout_done;
4770 }
4771 }
4772
4773 /*
4774 * initialize the offset variables before we touch the UPL.
4775 * a_f_offset is the position into the file, in bytes
4776 * offset is the position into the UPL, in bytes
4777 * pg_index is the pg# of the UPL we're operating on.
4778 * isize is the offset into the UPL of the last non-clean page.
4779 */
4780 isize = ((pg_index + 1) * PAGE_SIZE);
4781
4782 offset = 0;
4783 pg_index = 0;
4784
4785 while (isize) {
4786 int xsize;
4787 int num_of_pages;
4788
4789 if ( !upl_page_present(pl, pg_index)) {
4790 /*
4791 * we asked for RET_ONLY_DIRTY, so it's possible
4792 * to get back empty slots in the UPL.
4793 * just skip over them
4794 */
4795 f_offset += PAGE_SIZE;
4796 offset += PAGE_SIZE;
4797 isize -= PAGE_SIZE;
4798 pg_index++;
4799
4800 continue;
4801 }
4802 if ( !upl_dirty_page(pl, pg_index)) {
4803 panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
4804 }
4805
4806 /*
4807 * We know that we have at least one dirty page.
4808 * Now checking to see how many in a row we have
4809 */
4810 num_of_pages = 1;
4811 xsize = isize - PAGE_SIZE;
4812
4813 while (xsize) {
4814 if ( !upl_dirty_page(pl, pg_index + num_of_pages))
4815 break;
4816 num_of_pages++;
4817 xsize -= PAGE_SIZE;
4818 }
4819 xsize = num_of_pages * PAGE_SIZE;
4820
4821 if ((error = cluster_pageout(vp, upl, offset, f_offset,
4822 xsize, filesize, a_flags))) {
4823 if (error_ret == 0)
4824 error_ret = error;
4825 }
4826 f_offset += xsize;
4827 offset += xsize;
4828 isize -= xsize;
4829 pg_index += num_of_pages;
4830 }
4831 /* capture errnos bubbled out of cluster_pageout if they occurred */
4832 if (error_ret != 0) {
4833 retval = error_ret;
4834 }
4835 } /* end block for v2 pageout behavior */
4836 else {
4837 /*
4838 * just call cluster_pageout for old pre-v2 behavior
4839 */
4840 retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
4841 ap->a_size, filesize, a_flags);
4842 }
4843
4844 /*
4845 * If data was written, update the modification time of the file
4846 * but only if it's mapped writable; we will have touched the
4847 * modification time for direct writes.
4848 */
4849 if (retval == 0 && (ubc_is_mapped_writable(vp)
4850 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) {
4851 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4852
4853 // Check again with lock
4854 bool mapped_writable = ubc_is_mapped_writable(vp);
4855 if (mapped_writable
4856 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) {
4857 cp->c_touch_modtime = TRUE;
4858 cp->c_touch_chgtime = TRUE;
4859
4860 /*
4861 * We only need to increment the generation counter if
4862 * it's currently mapped writable because we incremented
4863 * the counter in hfs_vnop_mnomap.
4864 */
4865 if (mapped_writable)
4866 hfs_incr_gencount(VTOC(vp));
4867
4868 /*
4869 * If setuid or setgid bits are set and this process is
4870 * not the superuser then clear the setuid and setgid bits
4871 * as a precaution against tampering.
4872 */
4873 if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
4874 (vfs_context_suser(ap->a_context) != 0)) {
4875 cp->c_mode &= ~(S_ISUID | S_ISGID);
4876 }
4877 }
4878
4879 hfs_unlock(cp);
4880 }
4881
4882 pageout_done:
4883 if (is_pageoutv2) {
4884 /*
4885 * Release the truncate lock. Note that because
4886 * we may have taken the lock recursively by
4887 * being invoked via ubc_msync due to lockdown,
4888 * we should release it recursively, too.
4889 */
4890 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4891 }
4892 return (retval);
4893 }
4894
4895 /*
4896 * Intercept B-Tree node writes to unswap them if necessary.
4897 */
4898 int
4899 hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
4900 {
4901 int retval = 0;
4902 register struct buf *bp = ap->a_bp;
4903 register struct vnode *vp = buf_vnode(bp);
4904 BlockDescriptor block;
4905
4906 /* Trap B-Tree writes */
4907 if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
4908 (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
4909 (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
4910 (vp == VTOHFS(vp)->hfc_filevp)) {
4911
4912 /*
4913 * Swap and validate the node if it is in native byte order.
4914 * This is always true on big endian, so we always validate
4915 * before writing here. On little endian, the node typically has
4916 * been swapped and validated when it was written to the journal,
4917 * so we won't do anything here.
4918 */
4919 if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
4920 /* Prepare the block pointer */
4921 block.blockHeader = bp;
4922 block.buffer = (char *)buf_dataptr(bp);
4923 block.blockNum = buf_lblkno(bp);
4924 /* not found in cache ==> came from disk */
4925 block.blockReadFromDisk = (buf_fromcache(bp) == 0);
4926 block.blockSize = buf_count(bp);
4927
4928 /* Endian un-swap B-Tree node */
4929 retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
4930 if (retval)
4931 panic("hfs_vnop_bwrite: about to write corrupt node!\n");
4932 }
4933 }
4934
4935 /* This buffer shouldn't be locked anymore but if it is clear it */
4936 if ((buf_flags(bp) & B_LOCKED)) {
4937 // XXXdbg
4938 if (VTOHFS(vp)->jnl) {
4939 panic("hfs: CLEARING the lock bit on bp %p\n", bp);
4940 }
4941 buf_clearflags(bp, B_LOCKED);
4942 }
4943 retval = vn_bwrite (ap);
4944
4945 return (retval);
4946 }
4947
4948
4949 int
4950 hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks, vfs_context_t ctx)
4951 {
4952 _dk_cs_pin_t pin;
4953 unsigned ioc;
4954 int err;
4955
4956 memset(&pin, 0, sizeof(pin));
4957 pin.cp_extent.offset = ((uint64_t)start_block) * HFSTOVCB(hfsmp)->blockSize;
4958 pin.cp_extent.length = ((uint64_t)nblocks) * HFSTOVCB(hfsmp)->blockSize;
4959 switch (pin_state) {
4960 case HFS_PIN_IT:
4961 ioc = _DKIOCCSPINEXTENT;
4962 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA;
4963 break;
4964 case HFS_PIN_IT | HFS_TEMP_PIN:
4965 ioc = _DKIOCCSPINEXTENT;
4966 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSTEMPORARYPIN;
4967 break;
4968 case HFS_PIN_IT | HFS_DATALESS_PIN:
4969 ioc = _DKIOCCSPINEXTENT;
4970 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSPINFORSWAPFILE;
4971 break;
4972 case HFS_UNPIN_IT:
4973 ioc = _DKIOCCSUNPINEXTENT;
4974 pin.cp_flags = 0;
4975 break;
4976 case HFS_UNPIN_IT | HFS_EVICT_PIN:
4977 ioc = _DKIOCCSPINEXTENT;
4978 pin.cp_flags = _DKIOCCSPINTOSLOWMEDIA;
4979 break;
4980 default:
4981 return EINVAL;
4982 }
4983 err = VNOP_IOCTL(hfsmp->hfs_devvp, ioc, (caddr_t)&pin, 0, ctx);
4984 return err;
4985 }
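/*
 * Example (illustrative): with a 4 KiB allocation block size, pinning
 * start_block 100 for nblocks 25 issues a _DKIOCCSPINEXTENT ioctl for the
 * byte extent { offset = 409600, length = 102400 } on the underlying device.
 */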
4986
4987 //
4988 // The cnode lock should already be held on entry to this function
4989 //
4990 int
4991 hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned, vfs_context_t ctx)
4992 {
4993 struct filefork *fp = VTOF(vp);
4994 int i, err=0, need_put=0;
4995 struct vnode *rsrc_vp=NULL;
4996 uint32_t npinned = 0;
4997 off_t offset;
4998
4999 if (num_blocks_pinned) {
5000 *num_blocks_pinned = 0;
5001 }
5002
5003 if (vnode_vtype(vp) != VREG) {
5004 /* Not allowed to pin directories or symlinks */
5005 printf("hfs: can't pin vnode of type %d\n", vnode_vtype(vp));
5006 return (EPERM);
5007 }
5008
5009 if (fp->ff_unallocblocks) {
5010 printf("hfs: can't pin a vnode w/unalloced blocks (%d)\n", fp->ff_unallocblocks);
5011 return (EINVAL);
5012 }
5013
5014 /*
5015 * It is possible that if the caller unlocked/re-locked the cnode after checking
5016 * for C_NOEXISTS|C_DELETED that the file could have been deleted while the
5017 * cnode was unlocked. So check the condition again and return ENOENT so that
5018 * the caller knows why we failed to pin the vnode.
5019 */
5020 if (VTOC(vp)->c_flag & (C_NOEXISTS|C_DELETED)) {
5021 // makes no sense to pin something that's pending deletion
5022 return ENOENT;
5023 }
5024
5025 if (fp->ff_blocks == 0 && (VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
5026 if (!VNODE_IS_RSRC(vp) && hfs_vgetrsrc(hfsmp, vp, &rsrc_vp) == 0) {
5027 //printf("hfs: fileid %d resource fork nblocks: %d / size: %lld\n", VTOC(vp)->c_fileid,
5028 // VTOC(rsrc_vp)->c_rsrcfork->ff_blocks,VTOC(rsrc_vp)->c_rsrcfork->ff_size);
5029
5030 fp = VTOC(rsrc_vp)->c_rsrcfork;
5031 need_put = 1;
5032 }
5033 }
5034 if (fp->ff_blocks == 0) {
5035 if (need_put) {
5036 //
5037 // use a distinct error code for a compressed file that has no resource fork;
5038 // we return EALREADY to indicate that the data is already probably hot file
5039 // cached because it's in an EA and the attributes btree is on the ssd
5040 //
5041 err = EALREADY;
5042 } else {
5043 err = EINVAL;
5044 }
5045 goto out;
5046 }
5047
5048 offset = 0;
5049 for (i = 0; i < kHFSPlusExtentDensity; i++) {
5050 if (fp->ff_extents[i].startBlock == 0) {
5051 break;
5052 }
5053
5054 err = hfs_pin_block_range(hfsmp, pin_state, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, ctx);
5055 if (err) {
5056 break;
5057 } else {
5058 npinned += fp->ff_extents[i].blockCount;
5059 }
5060 }
5061
5062 if (err || npinned == 0) {
5063 goto out;
5064 }
5065
5066 if (fp->ff_extents[kHFSPlusExtentDensity-1].startBlock) {
5067 uint32_t pblocks;
5068 uint8_t forktype = 0;
5069
5070 if (fp == VTOC(vp)->c_rsrcfork) {
5071 forktype = 0xff;
5072 }
5073 /*
5074 * The file could have overflow extents, better pin them.
5075 *
5076 * We assume that since we are holding the cnode lock for this cnode,
5077 * the file's extents cannot be manipulated, but the tree could, so we
5078 * need to ensure that it doesn't change behind our back as we iterate it.
5079 */
5080 int lockflags = hfs_systemfile_lock (hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
5081 err = hfs_pin_overflow_extents(hfsmp, VTOC(vp)->c_fileid, forktype, &pblocks);
5082 hfs_systemfile_unlock (hfsmp, lockflags);
5083
5084 if (err) {
5085 goto out;
5086 }
5087 npinned += pblocks;
5088 }
5089
5090 out:
5091 if (num_blocks_pinned) {
5092 *num_blocks_pinned = npinned;
5093 }
5094
5095 if (need_put && rsrc_vp) {
5096 //
5097 // have to unlock the cnode since it's shared between the
5098 // resource fork vnode and the data fork vnode (and the
5099 // vnode_put() may need to re-acquire the cnode lock to
5100 // reclaim the resource fork vnode)
5101 //
5102 hfs_unlock(VTOC(vp));
5103 vnode_put(rsrc_vp);
5104 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5105 }
5106 return err;
5107 }
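/*
 * Hypothetical caller sketch (illustrative only): hfs_pin_vnode() expects
 * the cnode lock to be held on entry, as noted above. The variable names
 * here are made up for the example.
 */
#if 0
	uint32_t pinned = 0;
	int err;

	hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
	err = hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT | HFS_TEMP_PIN, &pinned, ctx);
	hfs_unlock(VTOC(vp));
	if (err == 0)
		printf("hfs: pinned %u blocks\n", pinned);
#endif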
5108
5109
5110 /*
5111 * Relocate a file to a new location on disk
5112 * cnode must be locked on entry
5113 *
5114 * Relocation occurs by cloning the file's data from its
5115 * current set of blocks to a new set of blocks. During
5116 * the relocation all of the blocks (old and new) are
5117 * owned by the file.
5118 *
5119 * -----------------
5120 * |///////////////|
5121 * -----------------
5122 * 0 N (file offset)
5123 *
5124 * ----------------- -----------------
5125 * |///////////////| | | STEP 1 (acquire new blocks)
5126 * ----------------- -----------------
5127 * 0 N N+1 2N
5128 *
5129 * ----------------- -----------------
5130 * |///////////////| |///////////////| STEP 2 (clone data)
5131 * ----------------- -----------------
5132 * 0 N N+1 2N
5133 *
5134 * -----------------
5135 * |///////////////| STEP 3 (head truncate blocks)
5136 * -----------------
5137 * 0 N
5138 *
5139 * During steps 2 and 3 page-outs to file offsets less
5140 * than or equal to N are suspended.
5141 *
5142 * During step 3 page-ins to the file get suspended.
5143 */
5144 int
5145 hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
5146 struct proc *p)
5147 {
5148 struct cnode *cp;
5149 struct filefork *fp;
5150 struct hfsmount *hfsmp;
5151 u_int32_t headblks;
5152 u_int32_t datablks;
5153 u_int32_t blksize;
5154 u_int32_t growsize;
5155 u_int32_t nextallocsave;
5156 daddr64_t sector_a, sector_b;
5157 int eflags;
5158 off_t newbytes;
5159 int retval;
5160 int lockflags = 0;
5161 int took_trunc_lock = 0;
5162 int started_tr = 0;
5163 enum vtype vnodetype;
5164
5165 vnodetype = vnode_vtype(vp);
5166 if (vnodetype != VREG) {
5167 /* Not allowed to move symlinks. */
5168 return (EPERM);
5169 }
5170
5171 hfsmp = VTOHFS(vp);
5172 if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
5173 return (ENOSPC);
5174 }
5175
5176 cp = VTOC(vp);
5177 fp = VTOF(vp);
5178 if (fp->ff_unallocblocks)
5179 return (EINVAL);
5180
5181 #if CONFIG_PROTECT
5182 /*
5183 * <rdar://problem/9118426>
5184 * Disable HFS file relocation on content-protected filesystems
5185 */
5186 if (cp_fs_protected (hfsmp->hfs_mp)) {
5187 return EINVAL;
5188 }
5189 #endif
5190 /* If it's an SSD, also disable HFS relocation */
5191 if (hfsmp->hfs_flags & HFS_SSD) {
5192 return EINVAL;
5193 }
5194
5195
5196 blksize = hfsmp->blockSize;
5197 if (blockHint == 0)
5198 blockHint = hfsmp->nextAllocation;
5199
5200 if (fp->ff_size > 0x7fffffff) {
5201 return (EFBIG);
5202 }
5203
5204 //
5205 // We do not believe that this call to hfs_fsync() is
5206 // necessary and it causes a journal transaction
5207 // deadlock so we are removing it.
5208 //
5209 //if (vnodetype == VREG && !vnode_issystem(vp)) {
5210 // retval = hfs_fsync(vp, MNT_WAIT, 0, p);
5211 // if (retval)
5212 // return (retval);
5213 //}
5214
5215 if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
5216 hfs_unlock(cp);
5217 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
5218 /* Force lock since callers expect the lock to be held. */
5219 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
5220 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5221 return (retval);
5222 }
5223 /* No need to continue if file was removed. */
5224 if (cp->c_flag & C_NOEXISTS) {
5225 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5226 return (ENOENT);
5227 }
5228 took_trunc_lock = 1;
5229 }
5230 headblks = fp->ff_blocks;
5231 datablks = howmany(fp->ff_size, blksize);
5232 growsize = datablks * blksize;
5233 eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
5234 if (blockHint >= hfsmp->hfs_metazone_start &&
5235 blockHint <= hfsmp->hfs_metazone_end)
5236 eflags |= kEFMetadataMask;
5237
5238 if (hfs_start_transaction(hfsmp) != 0) {
5239 if (took_trunc_lock)
5240 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5241 return (EINVAL);
5242 }
5243 started_tr = 1;
5244 /*
5245 * Protect the extents b-tree and the allocation bitmap
5246 * during MapFileBlockC and ExtendFileC operations.
5247 */
5248 lockflags = SFL_BITMAP;
5249 if (overflow_extents(fp))
5250 lockflags |= SFL_EXTENTS;
5251 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5252
5253 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
5254 if (retval) {
5255 retval = MacToVFSError(retval);
5256 goto out;
5257 }
5258
5259 /*
5260 * STEP 1 - acquire new allocation blocks.
5261 */
5262 nextallocsave = hfsmp->nextAllocation;
5263 retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
5264 if (eflags & kEFMetadataMask) {
5265 hfs_lock_mount(hfsmp);
5266 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
5267 MarkVCBDirty(hfsmp);
5268 hfs_unlock_mount(hfsmp);
5269 }
5270
5271 retval = MacToVFSError(retval);
5272 if (retval == 0) {
5273 cp->c_flag |= C_MODIFIED;
5274 if (newbytes < growsize) {
5275 retval = ENOSPC;
5276 goto restore;
5277 } else if (fp->ff_blocks < (headblks + datablks)) {
5278 printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
5279 retval = ENOSPC;
5280 goto restore;
5281 }
5282
5283 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
5284 if (retval) {
5285 retval = MacToVFSError(retval);
5286 } else if ((sector_a + 1) == sector_b) {
5287 retval = ENOSPC;
5288 goto restore;
5289 } else if ((eflags & kEFMetadataMask) &&
5290 ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
5291 hfsmp->hfs_metazone_end)) {
5292 #if 0
5293 const char * filestr;
5294 char emptystr = '\0';
5295
5296 if (cp->c_desc.cd_nameptr != NULL) {
5297 filestr = (const char *)&cp->c_desc.cd_nameptr[0];
5298 } else if (vnode_name(vp) != NULL) {
5299 filestr = vnode_name(vp);
5300 } else {
5301 filestr = &emptystr;
5302 }
5303 #endif
5304 retval = ENOSPC;
5305 goto restore;
5306 }
5307 }
5308 /* Done with system locks and journal for now. */
5309 hfs_systemfile_unlock(hfsmp, lockflags);
5310 lockflags = 0;
5311 hfs_end_transaction(hfsmp);
5312 started_tr = 0;
5313
5314 if (retval) {
5315 /*
5316 * Check to see if failure is due to excessive fragmentation.
5317 */
5318 if ((retval == ENOSPC) &&
5319 (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
5320 hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
5321 }
5322 goto out;
5323 }
5324 /*
5325 * STEP 2 - clone file data into the new allocation blocks.
5326 */
5327
5328 if (vnodetype == VLNK)
5329 retval = EPERM;
5330 else if (vnode_issystem(vp))
5331 retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
5332 else
5333 retval = hfs_clonefile(vp, headblks, datablks, blksize);
5334
5335 /* Start transaction for step 3 or for a restore. */
5336 if (hfs_start_transaction(hfsmp) != 0) {
5337 retval = EINVAL;
5338 goto out;
5339 }
5340 started_tr = 1;
5341 if (retval)
5342 goto restore;
5343
5344 /*
5345 * STEP 3 - switch to cloned data and remove old blocks.
5346 */
5347 lockflags = SFL_BITMAP;
5348 if (overflow_extents(fp))
5349 lockflags |= SFL_EXTENTS;
5350 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5351
5352 retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
5353
5354 hfs_systemfile_unlock(hfsmp, lockflags);
5355 lockflags = 0;
5356 if (retval)
5357 goto restore;
5358 out:
5359 if (took_trunc_lock)
5360 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5361
5362 if (lockflags) {
5363 hfs_systemfile_unlock(hfsmp, lockflags);
5364 lockflags = 0;
5365 }
5366
5367 /* Push cnode's new extent data to disk. */
5368 if (retval == 0) {
5369 hfs_update(vp, 0);
5370 }
5371 if (hfsmp->jnl) {
5372 if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
5373 (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT);
5374 else
5375 (void) hfs_flushvolumeheader(hfsmp, 0);
5376 }
5377 exit:
5378 if (started_tr)
5379 hfs_end_transaction(hfsmp);
5380
5381 return (retval);
5382
5383 restore:
5384 if (fp->ff_blocks == headblks) {
5385 if (took_trunc_lock)
5386 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5387 goto exit;
5388 }
5389 /*
5390 * Give back any newly allocated space.
5391 */
5392 if (lockflags == 0) {
5393 lockflags = SFL_BITMAP;
5394 if (overflow_extents(fp))
5395 lockflags |= SFL_EXTENTS;
5396 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5397 }
5398
5399 (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
5400 FTOC(fp)->c_fileid, false);
5401
5402 hfs_systemfile_unlock(hfsmp, lockflags);
5403 lockflags = 0;
5404
5405 if (took_trunc_lock)
5406 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5407 goto exit;
5408 }
5409
5410
5411 /*
5412 * Clone a file's data within the file.
5413 *
5414 */
5415 static int
5416 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
5417 {
5418 caddr_t bufp;
5419 size_t bufsize;
5420 size_t copysize;
5421 size_t iosize;
5422 size_t offset;
5423 off_t writebase;
5424 uio_t auio;
5425 int error = 0;
5426
5427 writebase = blkstart * blksize;
5428 copysize = blkcnt * blksize;
5429 iosize = bufsize = MIN(copysize, 128 * 1024);
5430 offset = 0;
5431
5432 hfs_unlock(VTOC(vp));
5433
5434 #if CONFIG_PROTECT
5435 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
5436 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5437 return (error);
5438 }
5439 #endif /* CONFIG_PROTECT */
5440
5441 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize, VM_KERN_MEMORY_FILE)) {
5442 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5443 return (ENOMEM);
5444 }
5445
5446 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
5447
5448 while (offset < copysize) {
5449 iosize = MIN(copysize - offset, iosize);
5450
5451 uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
5452 uio_addiov(auio, (uintptr_t)bufp, iosize);
5453
5454 error = cluster_read(vp, auio, copysize, IO_NOCACHE);
5455 if (error) {
5456 printf("hfs_clonefile: cluster_read failed - %d\n", error);
5457 break;
5458 }
5459 if (uio_resid(auio) != 0) {
5460 printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
5461 error = EIO;
5462 break;
5463 }
5464
5465 uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
5466 uio_addiov(auio, (uintptr_t)bufp, iosize);
5467
5468 error = cluster_write(vp, auio, writebase + offset,
5469 writebase + offset + iosize,
5470 uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
5471 if (error) {
5472 printf("hfs_clonefile: cluster_write failed - %d\n", error);
5473 break;
5474 }
5475 if (uio_resid(auio) != 0) {
5476 printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
5477 error = EIO;
5478 break;
5479 }
5480 offset += iosize;
5481 }
5482 uio_free(auio);
5483
5484 if ((blksize & PAGE_MASK)) {
5485 /*
5486 * since the copy may not have started on a PAGE
5487 * boundary (or may not have ended on one), we
5488 * may have pages left in the cache since NOCACHE
5489 * will let partially written pages linger...
5490 * let's just flush the entire range to make sure
5491 * we don't have any pages left that are beyond
5492 * (or intersect) the real LEOF of this file
5493 */
5494 ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
5495 } else {
5496 /*
5497 * No need to call ubc_msync or hfs_invalbuf
5498 * since the file was copied using IO_NOCACHE and
5499 * the copy was done starting and ending on a page
5500 * boundary in the file.
5501 */
5502 }
5503 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5504
5505 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5506 return (error);
5507 }
5508
5509 /*
5510 * Clone a system (metadata) file.
5511 *
5512 */
5513 static int
5514 hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
5515 kauth_cred_t cred, struct proc *p)
5516 {
5517 caddr_t bufp;
5518 char * offset;
5519 size_t bufsize;
5520 size_t iosize;
5521 struct buf *bp = NULL;
5522 daddr64_t blkno;
5523 daddr64_t blk;
5524 daddr64_t start_blk;
5525 daddr64_t last_blk;
5526 int breadcnt;
5527 int i;
5528 int error = 0;
5529
5530
5531 iosize = GetLogicalBlockSize(vp);
5532 bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
5533 breadcnt = bufsize / iosize;
5534
5535 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize, VM_KERN_MEMORY_FILE)) {
5536 return (ENOMEM);
5537 }
5538 start_blk = ((daddr64_t)blkstart * blksize) / iosize;
5539 last_blk = ((daddr64_t)blkcnt * blksize) / iosize;
5540 blkno = 0;
5541
5542 while (blkno < last_blk) {
5543 /*
5544 * Read up to a megabyte
5545 */
5546 offset = bufp;
5547 for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
5548 error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
5549 if (error) {
5550 printf("hfs_clonesysfile: meta_bread error %d\n", error);
5551 goto out;
5552 }
5553 if (buf_count(bp) != iosize) {
5554 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
5555 goto out;
5556 }
5557 bcopy((char *)buf_dataptr(bp), offset, iosize);
5558
5559 buf_markinvalid(bp);
5560 buf_brelse(bp);
5561 bp = NULL;
5562
5563 offset += iosize;
5564 }
5565
5566 /*
5567 * Write up to a megabyte
5568 */
5569 offset = bufp;
5570 for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
5571 bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
5572 if (bp == NULL) {
5573 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
5574 error = EIO;
5575 goto out;
5576 }
5577 bcopy(offset, (char *)buf_dataptr(bp), iosize);
5578 error = (int)buf_bwrite(bp);
5579 bp = NULL;
5580 if (error)
5581 goto out;
5582 offset += iosize;
5583 }
5584 }
5585 out:
5586 if (bp) {
5587 buf_brelse(bp);
5588 }
5589
5590 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5591
5592 error = hfs_fsync(vp, MNT_WAIT, 0, p);
5593
5594 return (error);
5595 }
5596
5597 errno_t hfs_flush_invalid_ranges(vnode_t vp)
5598 {
5599 cnode_t *cp = VTOC(vp);
5600
5601 assert(cp->c_lockowner == current_thread());
5602 assert(cp->c_truncatelockowner == current_thread());
5603
5604 if (!ISSET(cp->c_flag, C_ZFWANTSYNC) && !cp->c_zftimeout)
5605 return 0;
5606
5607 filefork_t *fp = VTOF(vp);
5608
5609 /*
5610 * We can't hold the cnode lock whilst we call cluster_write so we
5611 * need to copy the extents into a local buffer.
5612 */
5613 int max_exts = 16;
5614 struct ext {
5615 off_t start, end;
5616 } exts_buf[max_exts]; // 256 bytes
5617 struct ext *exts = exts_buf;
5618 int ext_count = 0;
5619 errno_t ret;
5620
5621 struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
5622
5623 while (r) {
5624 /* If we have more than can fit in our stack buffer, switch
5625 to a heap buffer. */
5626 if (exts == exts_buf && ext_count == max_exts) {
5627 max_exts = 256;
5628 MALLOC(exts, struct ext *, sizeof(struct ext) * max_exts,
5629 M_TEMP, M_WAITOK);
5630 memcpy(exts, exts_buf, ext_count * sizeof(struct ext));
5631 }
5632
5633 struct rl_entry *next = TAILQ_NEXT(r, rl_link);
5634
5635 exts[ext_count++] = (struct ext){ r->rl_start, r->rl_end };
5636
5637 if (!next || (ext_count == max_exts && exts != exts_buf)) {
5638 hfs_unlock(cp);
5639 for (int i = 0; i < ext_count; ++i) {
5640 ret = cluster_write(vp, NULL, fp->ff_size, exts[i].end + 1,
5641 exts[i].start, 0,
5642 IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE);
5643 if (ret) {
5644 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5645 goto exit;
5646 }
5647 }
5648
5649 if (!next) {
5650 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5651 break;
5652 }
5653
5654 /* Push any existing clusters which should clean up our invalid
5655 ranges as they go through hfs_vnop_blockmap. */
5656 cluster_push(vp, 0);
5657
5658 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5659
5660 /*
5661 * Get back to where we were (given we dropped the lock).
5662 * This shouldn't be many because we pushed above.
5663 */
5664 TAILQ_FOREACH(r, &fp->ff_invalidranges, rl_link) {
5665 if (r->rl_end > exts[ext_count - 1].end)
5666 break;
5667 }
5668
5669 ext_count = 0;
5670 } else
5671 r = next;
5672 }
5673
5674 ret = 0;
5675
5676 exit:
5677
5678 if (exts != exts_buf)
5679 FREE(exts, M_TEMP);
5680
5681 return ret;
5682 }