bsd/hfs/hfs_vfsops.c

   1 /*
   2  * Copyright (c) 1999-2013 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1991, 1993, 1994
  30  *      The Regents of the University of California.  All rights reserved.
  31  * (c) UNIX System Laboratories, Inc.
  32  * All or some portions of this file are derived from material licensed
  33  * to the University of California by American Telephone and Telegraph
  34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  35  * the permission of UNIX System Laboratories, Inc.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. All advertising materials mentioning features or use of this software
  46  *    must display the following acknowledgement:
  47  *      This product includes software developed by the University of
  48  *      California, Berkeley and its contributors.
  49  * 4. Neither the name of the University nor the names of its contributors
  50  *    may be used to endorse or promote products derived from this software
  51  *    without specific prior written permission.
  52  *
  53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  63  * SUCH DAMAGE.
  64  *
  65  *      hfs_vfsops.c
  66  *  derived from        @(#)ufs_vfsops.c        8.8 (Berkeley) 5/20/95
  67  *
  68  *      (c) Copyright 1997-2002 Apple Computer, Inc. All rights reserved.
  69  *
  70  *      hfs_vfsops.c -- VFS layer for loadable HFS file system.
  71  *
  72  */
  73 #include <sys/param.h>
  74 #include <sys/systm.h>
  75 #include <sys/kauth.h>
  76
  77 #include <sys/ubc.h>
  78 #include <sys/ubc_internal.h>
  79 #include <sys/vnode_internal.h>
  80 #include <sys/mount_internal.h>
  81 #include <sys/sysctl.h>
  82 #include <sys/malloc.h>
  83 #include <sys/stat.h>
  84 #include <sys/quota.h>
  85 #include <sys/disk.h>
  86 #include <sys/paths.h>
  87 #include <sys/utfconv.h>
  88 #include <sys/kdebug.h>
  89 #include <sys/fslog.h>
  90 #include <sys/ubc.h>
  91 #include <sys/buf_internal.h>
  92
  93 #include <kern/locks.h>
  94
  95 #include <vfs/vfs_journal.h>
  96
  97 #include <miscfs/specfs/specdev.h>
  98 #include <hfs/hfs_mount.h>
  99
 100 #include <libkern/crypto/md5.h>
 101 #include <uuid/uuid.h>
 102
 103 #include "hfs.h"
 104 #include "hfs_catalog.h"
 105 #include "hfs_cnode.h"
 106 #include "hfs_dbg.h"
 107 #include "hfs_endian.h"
 108 #include "hfs_hotfiles.h"
 109 #include "hfs_quota.h"
 110 #include "hfs_btreeio.h"
 111
 112 #include "hfscommon/headers/FileMgrInternal.h"
 113 #include "hfscommon/headers/BTreesInternal.h"
 114
 115 #if CONFIG_PROTECT
 116 #include <sys/cprotect.h>
 117 #endif
 118
 119 #if CONFIG_HFS_ALLOC_RBTREE
 120 #include "hfscommon/headers/HybridAllocator.h"
 121 #endif
 122
      /*
       * Non-zero enables the diagnostic printf() calls that the mount paths in
       * this file guard with "if (HFS_MOUNT_DEBUG)".
       */
  123 #define HFS_MOUNT_DEBUG 1
  124
      /* Debug switches that exist only in HFS_DIAGNOSTIC builds; their consumers are outside this view. */
  125 #if     HFS_DIAGNOSTIC
  126 int hfs_dbg_all = 0;
  127 int hfs_dbg_err = 0;
  128 #endif
  129
  130 /* Enable/disable debugging code for live volume resizing */
  131 int hfs_resize_debug = 0;
  132
      /*
       * Lock attribute and lock group handles with external linkage.  They are
       * not initialized in the visible part of this file (presumably done by
       * hfs_init -- TODO confirm).
       */
  133 lck_grp_attr_t *  hfs_group_attr;
  134 lck_attr_t *  hfs_lock_attr;
  135 lck_grp_t *  hfs_mutex_group;
  136 lck_grp_t *  hfs_rwlock_group;
  137 lck_grp_t *  hfs_spinlock_group;
  138
      /* Vnode operation vector descriptors, defined in another translation unit. */
  139 extern struct vnodeopv_desc hfs_vnodeop_opv_desc;
  140 extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc;
  141
  142 /* not static so we can re-use in hfs_readwrite.c for build_path calls */
  143 int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
  144
      /* Forward declarations of helpers private to this file. */
  145 static int hfs_changefs(struct mount *mp, struct hfs_mount_args *args);
  146 static int hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, vfs_context_t context);
  147 static int hfs_flushfiles(struct mount *, int, struct proc *);
  148 static int hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush);
  149 static int hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp);
  150 static int hfs_init(struct vfsconf *vfsp);
  151 static int hfs_vfs_root(struct mount *mp, struct vnode **vpp, vfs_context_t context);
  152 static int hfs_quotactl(struct mount *, int, uid_t, caddr_t, vfs_context_t context);
  153 static int hfs_start(struct mount *mp, int flags, vfs_context_t context);
  154 static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context);
  155 static int hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec);
  156 static int hfs_journal_replay(vnode_t devvp, vfs_context_t context);
  157 static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context);
  158 static int hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context);
  159
      /* Allocator helpers with external linkage (hfs_mount below uses hfs_teardown_allocator and hfs_initialize_allocator under CONFIG_HFS_ALLOC_RBTREE). */
  160 void hfs_initialize_allocator (struct hfsmount *hfsmp);
  161 int hfs_teardown_allocator (struct hfsmount *hfsmp);
  162 void hfs_unmap_blocks (struct hfsmount *hfsmp);
  163
      /* Entry points with external linkage; hfs_mount is defined below in this file. */
  164 int hfs_mount(struct mount *mp, vnode_t  devvp, user_addr_t data, vfs_context_t context);
  165 int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context);
  166 int hfs_reload(struct mount *mp);
  167 int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context);
  168 int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context);
  169 int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
  170                       user_addr_t newp, size_t newlen, vfs_context_t context);
  171 int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context);
 172
 173 /*
 174  * Called by vfs_mountroot when mounting HFS Plus as root.
 175  */
 176
 177 int
 178 hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context)
 179 {
 180         struct hfsmount *hfsmp;
 181         ExtendedVCB *vcb;
 182         struct vfsstatfs *vfsp;
 183         int error;
 184
 185         if ((error = hfs_mountfs(rvp, mp, NULL, 0, context))) {
 186                 if (HFS_MOUNT_DEBUG) {
 187                         printf("hfs_mountroot: hfs_mountfs returned %d, rvp (%p) name (%s) \n",
 188                                         error, rvp, (rvp->v_name ? rvp->v_name : "unknown device"));
 189                 }
 190                 return (error);
 191         }
 192
 193         /* Init hfsmp */
 194         hfsmp = VFSTOHFS(mp);
 195
 196         hfsmp->hfs_uid = UNKNOWNUID;
 197         hfsmp->hfs_gid = UNKNOWNGID;
 198         hfsmp->hfs_dir_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */
 199         hfsmp->hfs_file_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */
 200
 201         /* Establish the free block reserve. */
 202         vcb = HFSTOVCB(hfsmp);
 203         vcb->reserveBlocks = ((u_int64_t)vcb->totalBlocks * HFS_MINFREE) / 100;
 204         vcb->reserveBlocks = MIN(vcb->reserveBlocks, HFS_MAXRESERVE / vcb->blockSize);
 205
 206         vfsp = vfs_statfs(mp);
 207         (void)hfs_statfs(mp, vfsp, NULL);
 208
 209         return (0);
 210 }
 211
 212
  213 /*
  214  * VFS Operations.
  215  *
  216  * mount system call
       *
       * hfs_mount -- mount a new HFS volume or update an existing mount.
       *
       *   mp      - mount structure being created or updated
       *   devvp   - device vnode; only used on the non-update path, where it
       *             is passed to hfs_mountfs
       *   data    - user address of a struct hfs_mount_args, copied in first
       *   context - VFS context of the caller
       *
       * With MNT_UPDATE set in the mount's command flags this handles, in order:
       * MNT_RELOAD (read-only mounts only), a downgrade to read-only, an
       * upgrade to read-write, and finally hfs_changefs for the new arguments.
       * Without MNT_UPDATE it sets MNT_DOVOLFS and performs a fresh mount via
       * hfs_mountfs (plus a content-protection version check when
       * CONFIG_PROTECT is built in).
       *
       * Returns 0 on success or an errno value.  On success via the "out"
       * label the mount's statfs data is refreshed with hfs_statfs.
  217  */
  218
  219 int
  220 hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context)
  221 {
  222         struct proc *p = vfs_context_proc(context);
  223         struct hfsmount *hfsmp = NULL;
  224         struct hfs_mount_args args;
  225         int retval = E_NONE;
  226         u_int32_t cmdflags;
  227
  228         if ((retval = copyin(data, (caddr_t)&args, sizeof(args)))) {
  229                 if (HFS_MOUNT_DEBUG) {
  230                         printf("hfs_mount: copyin returned %d for fs\n", retval);
  231                 }
  232                 return (retval);
  233         }
  234         cmdflags = (u_int32_t)vfs_flags(mp) & MNT_CMDFLAGS;
  235         if (cmdflags & MNT_UPDATE) {
  236                 hfsmp = VFSTOHFS(mp);
  237
  238                 /* Reload incore data after an fsck. */
  239                 if (cmdflags & MNT_RELOAD) {
  240                         if (vfs_isrdonly(mp)) {
  241                                 int error = hfs_reload(mp);
  242                                 if (error && HFS_MOUNT_DEBUG) {
  243                                         printf("hfs_mount: hfs_reload returned %d on %s \n", error, hfsmp->vcbVN);
  244                                 }
                                      /* Returns directly rather than via "out", so the statfs refresh is skipped for reloads. */
  245                                 return error;
  246                         }
  247                         else {
  248                                 if (HFS_MOUNT_DEBUG) {
  249                                         printf("hfs_mount: MNT_RELOAD not supported on rdwr filesystem %s\n", hfsmp->vcbVN);
  250                                 }
  251                                 return (EINVAL);
  252                         }
  253                 }
  254
  255                 /* Change to a read-only file system. */
  256                 if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) &&
  257                     vfs_isrdonly(mp)) {
  258                         int flags;
  259
  260                         /* Set flag to indicate that a downgrade to read-only
  261                          * is in progress and therefore block any further
  262                          * modifications to the file system.
  263                          */
  264                         hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
  265                         hfsmp->hfs_flags |= HFS_RDONLY_DOWNGRADE;
  266                         hfsmp->hfs_downgrading_proc = current_thread();
  267                         hfs_unlock_global (hfsmp);
  268
  269                         /* use VFS_SYNC to push out System (btree) files */
  270                         retval = VFS_SYNC(mp, MNT_WAIT, context);
                              /*
                               * A sync failure aborts the downgrade unless MNT_FORCE was
                               * requested; the in-progress markers set above are rolled back
                               * (without retaking the global lock).
                               */
  271                         if (retval && ((cmdflags & MNT_FORCE) == 0)) {
  272                                 hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
  273                                 hfsmp->hfs_downgrading_proc = NULL;
  274                                 if (HFS_MOUNT_DEBUG) {
  275                                         printf("hfs_mount: VFS_SYNC returned %d during b-tree sync of %s \n", retval, hfsmp->vcbVN);
  276                                 }
  277                                 goto out;
  278                         }
  279
                              /* Flush vnodes open for writing; MNT_FORCE additionally requests FORCECLOSE. */
  280                         flags = WRITECLOSE;
  281                         if (cmdflags & MNT_FORCE)
  282                                 flags |= FORCECLOSE;
  283
  284                         if ((retval = hfs_flushfiles(mp, flags, p))) {
  285                                 hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
  286                                 hfsmp->hfs_downgrading_proc = NULL;
  287                                 if (HFS_MOUNT_DEBUG) {
  288                                         printf("hfs_mount: hfs_flushfiles returned %d on %s \n", retval, hfsmp->vcbVN);
  289                                 }
  290                                 goto out;
  291                         }
  292
  293                         /* mark the volume cleanly unmounted */
  294                         hfsmp->vcbAtrb |= kHFSVolumeUnmountedMask;
  295                         retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
                              /*
                               * HFS_READ_ONLY is set whether or not the header flush
                               * succeeded; a failure stays in retval, skips the fsync below
                               * and is handled by the rollback after the journal is closed.
                               */
  296                         hfsmp->hfs_flags |= HFS_READ_ONLY;
  297
  298                         /*
  299                          * Close down the journal.
  300                          *
  301                          * NOTE: It is critically important to close down the journal
  302                          * and have it issue all pending I/O prior to calling VNOP_FSYNC below.
  303                          * In a journaled environment it is expected that the journal be
  304                          * the only actor permitted to issue I/O for metadata blocks in HFS.
  305                          * If we were to call VNOP_FSYNC prior to closing down the journal,
  306                          * we would inadvertently issue (and wait for) the I/O we just
  307                          * initiated above as part of the flushvolumeheader call.
  308                          *
  309                          * To avoid this, we follow the same order of operations as in
  310                          * unmount and issue the journal_close prior to calling VNOP_FSYNC.
  311                          */
  312
  313                         if (hfsmp->jnl) {
  314                                 hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
  315
  316                             journal_close(hfsmp->jnl);
  317                             hfsmp->jnl = NULL;
  318
  319                             // Note: we explicitly don't want to shutdown
  320                             //       access to the jvp because we may need
  321                             //       it later if we go back to being read-write.
  322
  323                                 hfs_unlock_global (hfsmp);
  324                         }
  325
  326
  327                         /*
  328                          * Write out any pending I/O still outstanding against the device node
  329                          * now that the journal has been closed.
  330                          */
  331                         if (!retval) {
  332                                 if (vnode_mount(hfsmp->hfs_devvp) == mp) {
  333                                         retval = hfs_fsync(hfsmp->hfs_devvp, MNT_WAIT, 0, p);
  334                                 } else {
  335                                         vnode_get(hfsmp->hfs_devvp);
  336                                         retval = VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context);
  337                                         vnode_put(hfsmp->hfs_devvp);
  338                                 }
  339                         }
  340
                              /*
                               * retval may come from hfs_flushvolumeheader above as well as
                               * from the fsync, although the message only mentions FSYNC.
                               * NOTE(review): this rollback clears the read-only flags but
                               * does not reopen the journal closed above -- confirm intended.
                               */
  341                         if (retval) {
  342                                 if (HFS_MOUNT_DEBUG) {
  343                                         printf("hfs_mount: FSYNC on devvp returned %d for fs %s\n", retval, hfsmp->vcbVN);
  344                                 }
  345                                 hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
  346                                 hfsmp->hfs_downgrading_proc = NULL;
  347                                 hfsmp->hfs_flags &= ~HFS_READ_ONLY;
  348                                 goto out;
  349                         }
  350
  351 #if CONFIG_HFS_ALLOC_RBTREE
  352                         (void) hfs_teardown_allocator(hfsmp);
  353 #endif
                              /* Downgrade finished: HFS_RDONLY_DOWNGRADE stays set; only the owning thread is cleared. */
  354                         hfsmp->hfs_downgrading_proc = NULL;
  355                 }
  356
  357                 /* Change to a writable file system. */
  358                 if (vfs_iswriteupgrade(mp)) {
  359 #if CONFIG_HFS_ALLOC_RBTREE
  360                                 thread_t allocator_thread;
  361 #endif
  362
  363                         /*
  364                          * On inconsistent disks, do not allow read-write mount
  365                          * unless it is the boot volume being mounted.
  366                          */
  367                         if (!(vfs_flags(mp) & MNT_ROOTFS) &&
  368                                         (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask)) {
  369                                 if (HFS_MOUNT_DEBUG) {
  370                                         printf("hfs_mount: attempting to mount inconsistent non-root volume %s\n",  (hfsmp->vcbVN));
  371                                 }
  372                                 retval = EINVAL;
  373                                 goto out;
  374                         }
  375
  376                         // If the journal was shut-down previously because we were
  377                         // asked to be read-only, let's start it back up again now
  378
  379                         if (   (HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask)
  380                             && hfsmp->jnl == NULL
  381                             && hfsmp->jvp != NULL) {
  382                             int jflags;
  383
  384                             if (hfsmp->hfs_flags & HFS_NEED_JNL_RESET) {
  385                                         jflags = JOURNAL_RESET;
  386                                 } else {
  387                                         jflags = 0;
  388                                 }
  389
  390                                 hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
  391
  392                                 hfsmp->jnl = journal_open(hfsmp->jvp,
  393                                                 (hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset,
  394                                                 hfsmp->jnl_size,
  395                                                 hfsmp->hfs_devvp,
  396                                                 hfsmp->hfs_logical_block_size,
  397                                                 jflags,
  398                                                 0,
  399                                                 hfs_sync_metadata, hfsmp->hfs_mp);
  400
  401                                 /*
  402                                  * Set up the trim callback function so that we can add
  403                                  * recently freed extents to the free extent cache once
  404                                  * the transaction that freed them is written to the
  405                                  * journal on disk.
  406                                  */
  407                                 if (hfsmp->jnl)
  408                                         journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp);
  409
  410                                 hfs_unlock_global (hfsmp);
  411
  412                                 if (hfsmp->jnl == NULL) {
  413                                         if (HFS_MOUNT_DEBUG) {
  414                                                 printf("hfs_mount: journal_open == NULL; couldn't be opened on %s \n", (hfsmp->vcbVN));
  415                                         }
  416                                         retval = EINVAL;
  417                                         goto out;
  418                                 } else {
                                              /* The journal opened, so a pending reset request has been consumed. */
  419                                         hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET;
  420                                 }
  421
  422                         }
  423
  424                         /* See if we need to erase unused Catalog nodes due to <rdar://problem/6947811>. */
  425                         retval = hfs_erase_unused_nodes(hfsmp);
  426                         if (retval != E_NONE) {
  427                                 if (HFS_MOUNT_DEBUG) {
  428                                         printf("hfs_mount: hfs_erase_unused_nodes returned %d for fs %s\n", retval, hfsmp->vcbVN);
  429                                 }
  430                                 goto out;
  431                         }
  432
  433                         /* If this mount point was downgraded from read-write
  434                          * to read-only, clear that information as we are now
  435                          * moving back to read-write.
  436                          */
  437                         hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
  438                         hfsmp->hfs_downgrading_proc = NULL;
  439
  440                         /* mark the volume dirty (clear clean unmount bit) */
  441                         hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask;
  442
  443                         retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
  444                         if (retval != E_NONE) {
  445                                 if (HFS_MOUNT_DEBUG) {
  446                                         printf("hfs_mount: hfs_flushvolumeheader returned %d for fs %s\n", retval, hfsmp->vcbVN);
  447                                 }
  448                                 goto out;
  449                         }
  450
  451                         /* Only clear HFS_READ_ONLY after a successful write */
  452                         hfsmp->hfs_flags &= ~HFS_READ_ONLY;
  453
  454
                              /* HFS_READ_ONLY was cleared just above, so this test effectively selects volumes without HFS_STANDARD. */
  455                         if (!(hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_STANDARD))) {
  456                                 /* Setup private/hidden directories for hardlinks. */
  457                                 hfs_privatedir_init(hfsmp, FILE_HARDLINKS);
  458                                 hfs_privatedir_init(hfsmp, DIR_HARDLINKS);
  459
  460                                 hfs_remove_orphans(hfsmp);
  461
  462                                 /*
  463                                  * Allow hot file clustering if conditions allow.
  464                                  */
  465                                 if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) &&
  466                                            ((hfsmp->hfs_mp->mnt_kern_flag & MNTK_SSD) == 0))    {
  467                                         (void) hfs_recording_init(hfsmp);
  468                                 }
  469                                 /* Force ACLs on HFS+ file systems. */
  470                                 if (vfs_extendedsecurity(HFSTOVFS(hfsmp)) == 0) {
  471                                         vfs_setextendedsecurity(HFSTOVFS(hfsmp));
  472                                 }
  473                         }
  474
  475 #if CONFIG_HFS_ALLOC_RBTREE
  476                         /*
  477                          * Like the normal mount case, we need to handle creation of the allocation red-black tree
  478                          * if we're upgrading from read-only to read-write.
  479                          *
  480                          * We spawn a thread to create the pair of red-black trees for this volume.
  481                          * However, in so doing, we must be careful to ensure that if this thread is still
  482                          * running after mount has finished, it doesn't interfere with an unmount. Specifically,
  483                          * we'll need to set a bit that indicates we're in progress building the trees here.
  484                          * Unmount will check for this bit, and then if it's set, mark a corresponding bit that
  485                          * notifies the tree generation code that an unmount is waiting.  Also, mark the extent
  486                          * tree flags that the allocator is enabled for use before we spawn the thread that will start
  487                          * scanning the RB tree.
  488                          *
  489                          * Only do this if we're operating on a read-write mount (we wouldn't care for read-only),
  490                          * which has not previously encountered a bad error on the red-black tree code.  Also, don't
  491                          * try to re-build a tree that already exists.
  492                          *
  493                          * When this is enabled, we must re-integrate the above function into our bitmap iteration
  494                          * so that we accurately send TRIMs down to the underlying disk device as needed.
  495                          */
  496
  497                         if (hfsmp->extent_tree_flags == 0) {
  498                                 hfsmp->extent_tree_flags |= (HFS_ALLOC_TREEBUILD_INFLIGHT | HFS_ALLOC_RB_ENABLED);
  499                                 /* Initialize EOF counter so that the thread can assume it started at initial values */
  500                                 hfsmp->offset_block_end = 0;
  501
  502                                 InitTree(hfsmp);
  503
                                      /* NOTE(review): the result of kernel_thread_start is not checked before allocator_thread is released. */
  504                                 kernel_thread_start ((thread_continue_t) hfs_initialize_allocator , hfsmp, &allocator_thread);
  505                                 thread_deallocate(allocator_thread);
  506                         }
  507
  508 #endif
  509                 }
  510
  511                 /* Update file system parameters. */
  512                 retval = hfs_changefs(mp, &args);
  513                 if (retval &&  HFS_MOUNT_DEBUG) {
  514                         printf("hfs_mount: hfs_changefs returned %d for %s\n", retval, hfsmp->vcbVN);
  515                 }
  516
  517         } else /* not an update request */ {
  518
  519                 /* Set the mount flag to indicate that we support volfs  */
  520                 vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_DOVOLFS));
  521
  522                 retval = hfs_mountfs(devvp, mp, &args, 0, context);
  523                 if (retval && HFS_MOUNT_DEBUG) {
  524                         printf("hfs_mount: hfs_mountfs returned %d\n", retval);
  525                 }
  526 #if CONFIG_PROTECT
  527                 /*
  528                  * If above mount call was successful, and this mount is content protection
  529                  * enabled, then verify the on-disk EA on the root to ensure that the filesystem
  530                  * is of a suitable vintage to allow the mount to proceed.
  531                  */
  532                 if ((retval == 0) && (cp_fs_protected (mp))) {
  533                         int err = 0;
  534
  535                         struct cp_root_xattr *xattr = NULL;
  536                         MALLOC (xattr, struct cp_root_xattr*, sizeof(struct cp_root_xattr), M_TEMP, M_WAITOK);
                              /*
                               * NOTE(review): err is not copied into retval on this path, so
                               * the mount would still return success (without the version
                               * check) if the allocation ever failed.
                               */
  537                         if (xattr == NULL) {
  538                                 err = ENOMEM;
  539                                 goto badalloc;
  540                         }
  541                         bzero (xattr, sizeof(struct cp_root_xattr));
  542                         hfsmp = vfs_fsprivate(mp);
  543
  544                         /* go get the EA to get the version information */
  545                         err = cp_getrootxattr (hfsmp, xattr);
  546                         /*
  547                          * If there was no EA there, then write one out.
  548                          * Assuming EA is not present on the root means
  549                          * this is an erase install or a very old FS
  550                          */
  551                         if (err == ENOATTR) {
  552                                 printf("No root EA set, creating new EA with new version: %d\n", CP_NEW_MAJOR_VERS);
  553                                 bzero(xattr, sizeof(struct cp_root_xattr));
  554                                 xattr->major_version = CP_NEW_MAJOR_VERS;
  555                                 xattr->minor_version = CP_MINOR_VERS;
  556                                 xattr->flags = 0;
  557
  558                                 err = cp_setrootxattr (hfsmp, xattr);
  559                         }
  560
  561                         /*
  562                          * For any other error, including having an out of date CP version in the
  563                          * EA, or for an error out of cp_setrootxattr, deny the mount
  564                          * and do not proceed further.
  565                          */
  566                         if (err || (xattr->major_version != CP_NEW_MAJOR_VERS && xattr->major_version != CP_PREV_MAJOR_VERS))  {
  567                                 /* Deny the mount and tear down. */
  568                                 retval = EPERM;
  569                                 (void) hfs_unmount (mp, MNT_FORCE, context);
  570                         }
                              /* Printed on both the accepted and the denied path; xattr is only freed below. */
  571                         printf("Running with CP root xattr: %d.%d\n", xattr->major_version, xattr->minor_version);
  572 badalloc:
  573                         if(xattr) {
  574                                 FREE(xattr, M_TEMP);
  575                         }
  576                 }
  577 #endif
  578         }
  579 out:
              /* Refresh the mount's statfs data only when the operation succeeded. */
  580         if (retval == 0) {
  581                 (void)hfs_statfs(mp, vfs_statfs(mp), context);
  582         }
  583         return (retval);
  584 }
 585
 586
      /*
       * Argument bundle handed to hfs_changefs_callback through its void *cargs
       * parameter.
       */
  587 struct hfs_changefs_cargs {
  588         struct hfsmount *hfsmp;         /* mount whose catalog is consulted for each vnode */
  589         int             namefix;        /* non-zero: purge the name cache entry and replace the cnode's descriptor */
  590         int             permfix;        /* non-zero: reload uid/gid/mode from the catalog (how it is computed is outside this view) */
  591         int             permswitch;     /* non-zero: reload uid/gid/mode from the catalog, same effect as permfix in the callback */
  592 };
 593
 594 static int
 595 hfs_changefs_callback(struct vnode *vp, void *cargs)
 596 {
 597         ExtendedVCB *vcb;
 598         struct cnode *cp;
 599         struct cat_desc cndesc;
 600         struct cat_attr cnattr;
 601         struct hfs_changefs_cargs *args;
 602         int lockflags;
 603         int error;
 604
 605         args = (struct hfs_changefs_cargs *)cargs;
 606
 607         cp = VTOC(vp);
 608         vcb = HFSTOVCB(args->hfsmp);
 609
 610         lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
 611         error = cat_lookup(args->hfsmp, &cp->c_desc, 0, &cndesc, &cnattr, NULL, NULL);
 612         hfs_systemfile_unlock(args->hfsmp, lockflags);
 613         if (error) {
 614                 /*
 615                  * If we couldn't find this guy skip to the next one
 616                  */
 617                 if (args->namefix)
 618                         cache_purge(vp);
 619
 620                 return (VNODE_RETURNED);
 621         }
 622         /*
 623          * Get the real uid/gid and perm mask from disk.
 624          */
 625         if (args->permswitch || args->permfix) {
 626                 cp->c_uid = cnattr.ca_uid;
 627                 cp->c_gid = cnattr.ca_gid;
 628                 cp->c_mode = cnattr.ca_mode;
 629         }
 630         /*
 631          * If we're switching name converters then...
 632          *   Remove the existing entry from the namei cache.
 633          *   Update name to one based on new encoder.
 634          */
 635         if (args->namefix) {
 636                 cache_purge(vp);
 637                 replace_desc(cp, &cndesc);
 638
 639                 if (cndesc.cd_cnid == kHFSRootFolderID) {
 640                         strlcpy((char *)vcb->vcbVN, (const char *)cp->c_desc.cd_nameptr, NAME_MAX+1);
 641                         cp->c_desc.cd_encoding = args->hfsmp->hfs_encoding;
 642                 }
 643         } else {
 644                 cat_releasedesc(&cndesc);
 645         }
 646         return (VNODE_RETURNED);
 647 }
 648
 649 /* Change fs mount parameters */
 650 static int
 651 hfs_changefs(struct mount *mp, struct hfs_mount_args *args)
 652 {
 653         int retval = 0;
 654         int namefix, permfix, permswitch;
 655         struct hfsmount *hfsmp;
 656         ExtendedVCB *vcb;
 657         hfs_to_unicode_func_t   get_unicode_func;
 658         unicode_to_hfs_func_t   get_hfsname_func;
 659         u_int32_t old_encoding = 0;
 660         struct hfs_changefs_cargs cargs;
 661         u_int32_t mount_flags;
 662
 663         hfsmp = VFSTOHFS(mp);
 664         vcb = HFSTOVCB(hfsmp);
 665         mount_flags = (unsigned int)vfs_flags(mp);
 666
 667         hfsmp->hfs_flags |= HFS_IN_CHANGEFS;
 668
 669         permswitch = (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) &&
 670                        ((mount_flags & MNT_UNKNOWNPERMISSIONS) == 0)) ||
 671                       (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) == 0) &&
 672                        (mount_flags & MNT_UNKNOWNPERMISSIONS)));
 673
 674         /* The root filesystem must operate with actual permissions: */
 675         if (permswitch && (mount_flags & MNT_ROOTFS) && (mount_flags & MNT_UNKNOWNPERMISSIONS)) {
 676                 vfs_clearflags(mp, (u_int64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS));  /* Just say "No". */
 677                 retval = EINVAL;
 678                 goto exit;
 679         }
 680         if (mount_flags & MNT_UNKNOWNPERMISSIONS)
 681                 hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS;
 682         else
 683                 hfsmp->hfs_flags &= ~HFS_UNKNOWN_PERMS;
 684
 685         namefix = permfix = 0;
 686
 687         /*
 688          * Tracking of hot files requires up-to-date access times.  So if
 689          * access time updates are disabled, we must also disable hot files.
 690          */
 691         if (mount_flags & MNT_NOATIME) {
 692                 (void) hfs_recording_suspend(hfsmp);
 693         }
 694
 695         /* Change the timezone (Note: this affects all hfs volumes and hfs+ volume create dates) */
 696         if (args->hfs_timezone.tz_minuteswest != VNOVAL) {
 697                 gTimeZone = args->hfs_timezone;
 698         }
 699
 700         /* Change the default uid, gid and/or mask */
 701         if ((args->hfs_uid != (uid_t)VNOVAL) && (hfsmp->hfs_uid != args->hfs_uid)) {
 702                 hfsmp->hfs_uid = args->hfs_uid;
 703                 if (vcb->vcbSigWord == kHFSPlusSigWord)
 704                         ++permfix;
 705         }
 706         if ((args->hfs_gid != (gid_t)VNOVAL) && (hfsmp->hfs_gid != args->hfs_gid)) {
 707                 hfsmp->hfs_gid = args->hfs_gid;
 708                 if (vcb->vcbSigWord == kHFSPlusSigWord)
 709                         ++permfix;
 710         }
 711         if (args->hfs_mask != (mode_t)VNOVAL) {
 712                 if (hfsmp->hfs_dir_mask != (args->hfs_mask & ALLPERMS)) {
 713                         hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS;
 714                         hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS;
 715                         if ((args->flags != VNOVAL) && (args->flags & HFSFSMNT_NOXONFILES))
 716                                 hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE);
 717                         if (vcb->vcbSigWord == kHFSPlusSigWord)
 718                                 ++permfix;
 719                 }
 720         }
 721
 722         /* Change the hfs encoding value (hfs only) */
 723         if ((vcb->vcbSigWord == kHFSSigWord)    &&
 724             (args->hfs_encoding != (u_int32_t)VNOVAL)              &&
 725             (hfsmp->hfs_encoding != args->hfs_encoding)) {
 726
 727                 retval = hfs_getconverter(args->hfs_encoding, &get_unicode_func, &get_hfsname_func);
 728                 if (retval)
 729                         goto exit;
 730
 731                 /*
 732                  * Connect the new hfs_get_unicode converter but leave
 733                  * the old hfs_get_hfsname converter in place so that
 734                  * we can lookup existing vnodes to get their correctly
 735                  * encoded names.
 736                  *
 737                  * When we're all finished, we can then connect the new
 738                  * hfs_get_hfsname converter and release our interest
 739                  * in the old converters.
 740                  */
 741                 hfsmp->hfs_get_unicode = get_unicode_func;
 742                 old_encoding = hfsmp->hfs_encoding;
 743                 hfsmp->hfs_encoding = args->hfs_encoding;
 744                 ++namefix;
 745         }
 746
 747         if (!(namefix || permfix || permswitch))
 748                 goto exit;
 749
 750         /* XXX 3762912 hack to support HFS filesystem 'owner' */
 751         if (permfix)
 752                 vfs_setowner(mp,
 753                     hfsmp->hfs_uid == UNKNOWNUID ? KAUTH_UID_NONE : hfsmp->hfs_uid,
 754                     hfsmp->hfs_gid == UNKNOWNGID ? KAUTH_GID_NONE : hfsmp->hfs_gid);
 755
 756         /*
 757          * For each active vnode fix things that changed
 758          *
 759          * Note that we can visit a vnode more than once
 760          * and we can race with fsync.
 761          *
 762          * hfs_changefs_callback will be called for each vnode
 763          * hung off of this mount point
 764          *
 765          * The vnode will be properly referenced and unreferenced
 766          * around the callback
 767          */
 768         cargs.hfsmp = hfsmp;
 769         cargs.namefix = namefix;
 770         cargs.permfix = permfix;
 771         cargs.permswitch = permswitch;
 772
 773         vnode_iterate(mp, 0, hfs_changefs_callback, (void *)&cargs);
 774
 775         /*
 776          * If we're switching name converters we can now
 777          * connect the new hfs_get_hfsname converter and
 778          * release our interest in the old converters.
 779          */
 780         if (namefix) {
 781                 hfsmp->hfs_get_hfsname = get_hfsname_func;
 782                 vcb->volumeNameEncodingHint = args->hfs_encoding;
 783                 (void) hfs_relconverter(old_encoding);
 784         }
 785 exit:
 786         hfsmp->hfs_flags &= ~HFS_IN_CHANGEFS;
 787         return (retval);
 788 }
 789
 790
 791 struct hfs_reload_cargs {                   /* argument bundle hfs_reload() hands to hfs_reload_callback() through vnode_iterate() */
 792         struct hfsmount *hfsmp;             /* mount being reloaded */
 793         int             error;              /* result of the callback's most recent cat_idlookup(); hfs_reload() zeroes it first and returns it if non-zero */
 794 };
 795
 796 static int
 797 hfs_reload_callback(struct vnode *vp, void *cargs)
 798 {
 799         struct cnode *cp;
 800         struct hfs_reload_cargs *args;
 801         int lockflags;
 802
 803         args = (struct hfs_reload_cargs *)cargs;
 804         /*
 805          * flush all the buffers associated with this node
 806          */
 807         (void) buf_invalidateblks(vp, 0, 0, 0);
 808
 809         cp = VTOC(vp);
 810         /*
 811          * Remove any directory hints
 812          */
 813         if (vnode_isdir(vp))
 814                 hfs_reldirhints(cp, 0);
 815
 816         /*
 817          * Re-read cnode data for all active vnodes (non-metadata files).
 818          */
 819         if (!vnode_issystem(vp) && !VNODE_IS_RSRC(vp) && (cp->c_fileid >= kHFSFirstUserCatalogNodeID)) {
 820                 struct cat_fork *datafork;
 821                 struct cat_desc desc;
 822
 823                 datafork = cp->c_datafork ? &cp->c_datafork->ff_data : NULL;
 824
 825                 /* lookup by fileID since name could have changed */
 826                 lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
 827                 args->error = cat_idlookup(args->hfsmp, cp->c_fileid, 0, 0, &desc, &cp->c_attr, datafork);
 828                 hfs_systemfile_unlock(args->hfsmp, lockflags);
 829                 if (args->error) {
 830                         return (VNODE_RETURNED_DONE);
 831                 }
 832
 833                 /* update cnode's catalog descriptor */
 834                 (void) replace_desc(cp, &desc);
 835         }
 836         return (VNODE_RETURNED);
 837 }
 838
 839 /*
 840  * Reload all incore data for a filesystem (used after running fsck on
 841  * the root filesystem and finding things to fix). The filesystem must
 842  * be mounted read-only.
 843  *
 844  * Things to do to update the mount:
 845  *      invalidate all cached meta-data.
 846  *      invalidate all inactive vnodes.
 847  *      invalidate all cached file data.
 848  *      re-read volume header from disk.
 849  *      re-load meta-file info (extents, file size).
 850  *      re-load B-tree header data.
 851  *      re-read cnode data for all active vnodes.
 852  */
 853 int
 854 hfs_reload(struct mount *mountp)
 855 {
 856         register struct vnode *devvp;
 857         struct buf *bp;
 858         int error, i;
 859         struct hfsmount *hfsmp;
 860         struct HFSPlusVolumeHeader *vhp;
 861         ExtendedVCB *vcb;
 862         struct filefork *forkp;
 863         struct cat_desc cndesc;
 864         struct hfs_reload_cargs args;
 865         daddr64_t priIDSector;
 866
 867         hfsmp = VFSTOHFS(mountp);
 868         vcb = HFSTOVCB(hfsmp);
 869
 870         if (vcb->vcbSigWord == kHFSSigWord)
 871                 return (EINVAL);        /* rooting from HFS is not supported! */
 872
 873         /*
 874          * Invalidate all cached meta-data.
 875          */
 876         devvp = hfsmp->hfs_devvp;
 877         if (buf_invalidateblks(devvp, 0, 0, 0))
 878                 panic("hfs_reload: dirty1");
 879
 880         args.hfsmp = hfsmp;
 881         args.error = 0;
 882         /*
 883          * hfs_reload_callback will be called for each vnode
 884          * hung off of this mount point that can't be recycled...
 885          * vnode_iterate will recycle those that it can (the VNODE_RELOAD option)
 886          * the vnode will be in an 'unbusy' state (VNODE_WAIT) and
 887          * properly referenced and unreferenced around the callback
 888          */
 889         vnode_iterate(mountp, VNODE_RELOAD | VNODE_WAIT, hfs_reload_callback, (void *)&args);
 890
 891         if (args.error)
 892                 return (args.error);
 893
 894         /*
 895          * Re-read VolumeHeader from disk.
 896          */
 897         priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
 898                         HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));
 899
 900         error = (int)buf_meta_bread(hfsmp->hfs_devvp,
 901                         HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
 902                         hfsmp->hfs_physical_block_size, NOCRED, &bp);
 903         if (error) {
 904                 if (bp != NULL)
 905                         buf_brelse(bp);
 906                 return (error);
 907         }
 908
 909         vhp = (HFSPlusVolumeHeader *) (buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
 910
 911         /* Do a quick sanity check */
 912         if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord &&
 913              SWAP_BE16(vhp->signature) != kHFSXSigWord) ||
 914             (SWAP_BE16(vhp->version) != kHFSPlusVersion &&
 915              SWAP_BE16(vhp->version) != kHFSXVersion) ||
 916             SWAP_BE32(vhp->blockSize) != vcb->blockSize) {
 917                 buf_brelse(bp);
 918                 return (EIO);
 919         }
 920
 921         vcb->vcbLsMod           = to_bsd_time(SWAP_BE32(vhp->modifyDate));
 922         vcb->vcbAtrb            = SWAP_BE32 (vhp->attributes);
 923         vcb->vcbJinfoBlock  = SWAP_BE32(vhp->journalInfoBlock);
 924         vcb->vcbClpSiz          = SWAP_BE32 (vhp->rsrcClumpSize);
 925         vcb->vcbNxtCNID         = SWAP_BE32 (vhp->nextCatalogID);
 926         vcb->vcbVolBkUp         = to_bsd_time(SWAP_BE32(vhp->backupDate));
 927         vcb->vcbWrCnt           = SWAP_BE32 (vhp->writeCount);
 928         vcb->vcbFilCnt          = SWAP_BE32 (vhp->fileCount);
 929         vcb->vcbDirCnt          = SWAP_BE32 (vhp->folderCount);
 930         HFS_UPDATE_NEXT_ALLOCATION(vcb, SWAP_BE32 (vhp->nextAllocation));
 931         vcb->totalBlocks        = SWAP_BE32 (vhp->totalBlocks);
 932         vcb->freeBlocks         = SWAP_BE32 (vhp->freeBlocks);
 933         vcb->encodingsBitmap    = SWAP_BE64 (vhp->encodingsBitmap);
 934         bcopy(vhp->finderInfo, vcb->vcbFndrInfo, sizeof(vhp->finderInfo));
 935         vcb->localCreateDate    = SWAP_BE32 (vhp->createDate); /* hfs+ create date is in local time */
 936
 937         /*
 938          * Re-load meta-file vnode data (extent info, file size, etc).
 939          */
 940         forkp = VTOF((struct vnode *)vcb->extentsRefNum);
 941         for (i = 0; i < kHFSPlusExtentDensity; i++) {
 942                 forkp->ff_extents[i].startBlock =
 943                         SWAP_BE32 (vhp->extentsFile.extents[i].startBlock);
 944                 forkp->ff_extents[i].blockCount =
 945                         SWAP_BE32 (vhp->extentsFile.extents[i].blockCount);
 946         }
 947         forkp->ff_size      = SWAP_BE64 (vhp->extentsFile.logicalSize);
 948         forkp->ff_blocks    = SWAP_BE32 (vhp->extentsFile.totalBlocks);
 949         forkp->ff_clumpsize = SWAP_BE32 (vhp->extentsFile.clumpSize);
 950
 951
 952         forkp = VTOF((struct vnode *)vcb->catalogRefNum);
 953         for (i = 0; i < kHFSPlusExtentDensity; i++) {
 954                 forkp->ff_extents[i].startBlock =
 955                         SWAP_BE32 (vhp->catalogFile.extents[i].startBlock);
 956                 forkp->ff_extents[i].blockCount =
 957                         SWAP_BE32 (vhp->catalogFile.extents[i].blockCount);
 958         }
 959         forkp->ff_size      = SWAP_BE64 (vhp->catalogFile.logicalSize);
 960         forkp->ff_blocks    = SWAP_BE32 (vhp->catalogFile.totalBlocks);
 961         forkp->ff_clumpsize = SWAP_BE32 (vhp->catalogFile.clumpSize);
 962
 963         if (hfsmp->hfs_attribute_vp) {
 964                 forkp = VTOF(hfsmp->hfs_attribute_vp);
 965                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
 966                         forkp->ff_extents[i].startBlock =
 967                                 SWAP_BE32 (vhp->attributesFile.extents[i].startBlock);
 968                         forkp->ff_extents[i].blockCount =
 969                                 SWAP_BE32 (vhp->attributesFile.extents[i].blockCount);
 970                 }
 971                 forkp->ff_size      = SWAP_BE64 (vhp->attributesFile.logicalSize);
 972                 forkp->ff_blocks    = SWAP_BE32 (vhp->attributesFile.totalBlocks);
 973                 forkp->ff_clumpsize = SWAP_BE32 (vhp->attributesFile.clumpSize);
 974         }
 975
 976         forkp = VTOF((struct vnode *)vcb->allocationsRefNum);
 977         for (i = 0; i < kHFSPlusExtentDensity; i++) {
 978                 forkp->ff_extents[i].startBlock =
 979                         SWAP_BE32 (vhp->allocationFile.extents[i].startBlock);
 980                 forkp->ff_extents[i].blockCount =
 981                         SWAP_BE32 (vhp->allocationFile.extents[i].blockCount);
 982         }
 983         forkp->ff_size      = SWAP_BE64 (vhp->allocationFile.logicalSize);
 984         forkp->ff_blocks    = SWAP_BE32 (vhp->allocationFile.totalBlocks);
 985         forkp->ff_clumpsize = SWAP_BE32 (vhp->allocationFile.clumpSize);
 986
 987         buf_brelse(bp);
 988         vhp = NULL;
 989
 990         /*
 991          * Re-load B-tree header data
 992          */
 993         forkp = VTOF((struct vnode *)vcb->extentsRefNum);
 994         if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
 995                 return (error);
 996
 997         forkp = VTOF((struct vnode *)vcb->catalogRefNum);
 998         if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
 999                 return (error);
1000
1001         if (hfsmp->hfs_attribute_vp) {
1002                 forkp = VTOF(hfsmp->hfs_attribute_vp);
1003                 if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
1004                         return (error);
1005         }
1006
1007         /* Reload the volume name */
1008         if ((error = cat_idlookup(hfsmp, kHFSRootFolderID, 0, 0, &cndesc, NULL, NULL)))
1009                 return (error);
1010         vcb->volumeNameEncodingHint = cndesc.cd_encoding;
1011         bcopy(cndesc.cd_nameptr, vcb->vcbVN, min(255, cndesc.cd_namelen));
1012         cat_releasedesc(&cndesc);
1013
1014         /* Re-establish private/hidden directories. */
1015         hfs_privatedir_init(hfsmp, FILE_HARDLINKS);
1016         hfs_privatedir_init(hfsmp, DIR_HARDLINKS);
1017
1018         /* In case any volume information changed to trigger a notification */
1019         hfs_generate_volume_notifications(hfsmp);
1020
1021         return (0);
1022 }
1023
1024
1025
1026 static void
1027 hfs_syncer(void *arg0, void *unused)
1028 {
1029 #pragma unused(unused)
1030
1031     struct hfsmount *hfsmp = arg0;
1032     clock_sec_t secs;
1033     clock_usec_t usecs;
1034     uint32_t delay = HFS_META_DELAY;
1035     uint64_t now;
1036     static int no_max=1;
1037
1038     clock_get_calendar_microtime(&secs, &usecs);
1039     now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;
1040
1041     //
1042     // If the amount of pending writes is more than our limit, wait
1043     // for 2/3 of it to drain and then flush the journal.
1044     //
1045     if (hfsmp->hfs_mp->mnt_pending_write_size > hfsmp->hfs_max_pending_io) {
1046             int counter=0;
1047             uint64_t pending_io, start, rate = 0;
1048
1049             no_max = 0;
1050
1051             hfs_start_transaction(hfsmp);   // so we hold off any new i/o's
1052
1053             pending_io = hfsmp->hfs_mp->mnt_pending_write_size;
1054
1055             clock_get_calendar_microtime(&secs, &usecs);
1056             start = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;
1057
1058             while(hfsmp->hfs_mp->mnt_pending_write_size > (pending_io/3) && counter++ < 500) {
1059                     tsleep((caddr_t)hfsmp, PRIBIO, "hfs-wait-for-io-to-drain", 10);
1060             }
1061
1062             if (counter >= 500) {
1063                     printf("hfs: timed out waiting for io to drain (%lld)\n", (int64_t)hfsmp->hfs_mp->mnt_pending_write_size);
1064             }
1065
1066             if (hfsmp->jnl) {
1067                     journal_flush(hfsmp->jnl, FALSE);
1068             } else {
1069                     hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel());
1070             }
1071
1072             clock_get_calendar_microtime(&secs, &usecs);
1073             now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;
1074             hfsmp->hfs_last_sync_time = now;
1075             if (now != start) {
1076                     rate = ((pending_io * 1000000ULL) / (now - start));     // yields bytes per second
1077             }
1078
1079             hfs_end_transaction(hfsmp);
1080
1081             //
1082             // If a reasonable amount of time elapsed then check the
1083             // i/o rate.  If it's taking less than 1 second or more
1084             // than 2 seconds, adjust hfs_max_pending_io so that we
1085             // will allow about 1.5 seconds of i/o to queue up.
1086             //
1087             if (((now - start) >= 300000) && (rate != 0)) {
1088                     uint64_t scale = (pending_io * 100) / rate;
1089
1090                     if (scale < 100 || scale > 200) {
1091                             // set it so that it should take about 1.5 seconds to drain
1092                             hfsmp->hfs_max_pending_io = (rate * 150ULL) / 100ULL;
1093                     }
1094             }
1095
1096     } else if (   ((now - hfsmp->hfs_last_sync_time) >= 5000000ULL)
1097                || (((now - hfsmp->hfs_last_sync_time) >= 100000LL)
1098                    && ((now - hfsmp->hfs_last_sync_request_time) >= 100000LL)
1099                    && (hfsmp->hfs_active_threads == 0)
1100                    && (hfsmp->hfs_global_lock_nesting == 0))) {
1101
1102             //
1103             // Flush the journal if more than 5 seconds elapsed since
1104             // the last sync OR we have not sync'ed recently and the
1105             // last sync request time was more than 100 milliseconds
1106             // ago and no one is in the middle of a transaction right
1107             // now.  Else we defer the sync and reschedule it.
1108             //
1109             if (hfsmp->jnl) {
1110                         hfs_lock_global (hfsmp, HFS_SHARED_LOCK);
1111
1112                     journal_flush(hfsmp->jnl, FALSE);
1113
1114                         hfs_unlock_global (hfsmp);
1115             } else {
1116                     hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel());
1117             }
1118
1119             clock_get_calendar_microtime(&secs, &usecs);
1120             now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;
1121             hfsmp->hfs_last_sync_time = now;
1122
1123     } else if (hfsmp->hfs_active_threads == 0) {
1124             uint64_t deadline;
1125
1126             clock_interval_to_deadline(delay, HFS_MILLISEC_SCALE, &deadline);
1127             thread_call_enter_delayed(hfsmp->hfs_syncer, deadline);
1128
1129             // note: we intentionally return early here and do not
1130             // decrement the sync_scheduled and sync_incomplete
1131             // variables because we rescheduled the timer.
1132
1133             return;
1134     }
1135
1136     //
1137     // NOTE: we decrement these *after* we're done the journal_flush() since
1138     // it can take a significant amount of time and so we don't want more
1139     // callbacks scheduled until we're done this one.
1140     //
1141     OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled);
1142     OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete);
1143     wakeup((caddr_t)&hfsmp->hfs_sync_incomplete);
1144 }
1145
1146
1147 extern int IOBSDIsMediaEjectable( const char *cdev_name );    /* defined outside this file and declared locally here; NOTE(review): the parameter name suggests it takes a character-device name -- confirm against the definition */
1148
/*
 * Initialization code for Red-Black Tree Allocator
 *
 * This function will build the two red-black trees necessary for allocating space
 * from the metadata zone as well as normal allocations.  Currently, we use
 * an advisory read to get most of the data into the buffer cache.
 * This function is intended to be run in a separate thread so as not to slow down mount.
 *
 * When CONFIG_HFS_ALLOC_RBTREE is not set this function does nothing.
 */

void
hfs_initialize_allocator (struct hfsmount *hfsmp) {

#if CONFIG_HFS_ALLOC_RBTREE
	u_int32_t build_err;
	int bitmap_lock;

	/*
	 * Take the allocation file lock.  Journal transactions will block until
	 * we're done here.
	 */
	bitmap_lock = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);

	/*
	 * GenerateTree assumes that the bitmap lock is held when you call the function.
	 * It will drop and re-acquire the lock periodically as needed to let other allocations
	 * through.  It returns with the bitmap lock held. Since we only maintain one tree,
	 * we don't need to specify a start block (always starts at 0).
	 */
	build_err = GenerateTree(hfsmp, hfsmp->totalBlocks, &bitmap_lock, 1);

	if (build_err == 0) {
		/* Mark offset tree as built */
		hfsmp->extent_tree_flags |= HFS_ALLOC_RB_ACTIVE;
	}

	/*
	 * GenerateTree may drop the bitmap lock during operation in order to give other
	 * threads a chance to allocate blocks, but it will always return with the lock held, so
	 * we don't need to re-grab the lock in order to update the TREEBUILD_INFLIGHT bit.
	 */
	hfsmp->extent_tree_flags &= ~HFS_ALLOC_TREEBUILD_INFLIGHT;

	if (build_err != 0) {
		/* Wakeup any waiters on the allocation bitmap lock */
		wakeup((caddr_t)&hfsmp->extent_tree_flags);
	}

	hfs_systemfile_unlock(hfsmp, bitmap_lock);
#else
#pragma unused (hfsmp)
#endif
}
1201
1202 void hfs_unmap_blocks (struct hfsmount *hfsmp) {
1203         /*
1204          * Take the allocation file lock.  Journal transactions will block until
1205          * we're done here.
1206          */
1207         int flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
1208
1209         /*
1210          * UnmapBlocks assumes that the bitmap lock is held when you call the function.
1211          * We don't care if there were any error issuing unmaps yet.
1212          */
1213         (void) UnmapBlocks(hfsmp);
1214
1215         hfs_systemfile_unlock(hfsmp, flags);
1216 }
1217
1218
/*
 * Teardown code for the Red-Black Tree allocator.
 * This function consolidates the code which serializes with respect
 * to a thread that may be potentially still building the tree when we need to begin
 * tearing it down.   Since the red-black tree may not be live when we enter this function
 * we return:
 *		1 -> Tree was live.
 *		0 -> Tree was not active at time of call.
 *
 * When CONFIG_HFS_ALLOC_RBTREE is not set the function always returns 0.
 */

int
hfs_teardown_allocator (struct hfsmount *hfsmp) {
	int tree_was_live = 0;

#if CONFIG_HFS_ALLOC_RBTREE
	int bitmap_lock;

	bitmap_lock = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);

	/*
	 * Check to see if the tree-generation is still on-going.
	 * If it is, flag the teardown and block until it's done.
	 */
	for (;;) {
		if ((hfsmp->extent_tree_flags & HFS_ALLOC_TREEBUILD_INFLIGHT) == 0) {
			break;
		}

		hfsmp->extent_tree_flags |= HFS_ALLOC_TEARDOWN_INFLIGHT;

		lck_rw_sleep(&(VTOC(hfsmp->hfs_allocation_vp))->c_rwlock, LCK_SLEEP_EXCLUSIVE,
					 &hfsmp->extent_tree_flags, THREAD_UNINT);
	}

	if (hfs_isrbtree_active (hfsmp)) {
		/* Tear down the RB Trees while we have the bitmap locked */
		DestroyTrees(hfsmp);
		tree_was_live = 1;
	}

	hfs_systemfile_unlock(hfsmp, bitmap_lock);
#else
	#pragma unused (hfsmp)
#endif

	return tree_was_live;
}
1267
1268 static int hfs_root_unmounted_cleanly = 0;    /* starts at 0; the code that sets it is not visible in this part of the file */
1269
1270 SYSCTL_DECL(_vfs_generic);    /* declares the parent node used by the SYSCTL_INT below */
1271 SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &hfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");    /* exports the flag above as a CTLFLAG_RD integer named root_unmounted_cleanly under _vfs_generic */
1272
1273 /*
1274  * Common code for mount and mountroot
1275  */
1276 int
1277 hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args,
1278             int journal_replay_only, vfs_context_t context)
1279 {
1280         struct proc *p = vfs_context_proc(context);
1281         int retval = E_NONE;
1282         struct hfsmount *hfsmp = NULL;
1283         struct buf *bp;
1284         dev_t dev;
1285         HFSMasterDirectoryBlock *mdbp = NULL;
1286         int ronly;
1287 #if QUOTA
1288         int i;
1289 #endif
1290         int mntwrapper;
1291         kauth_cred_t cred;
1292         u_int64_t disksize;
1293         daddr64_t log_blkcnt;
1294         u_int32_t log_blksize;
1295         u_int32_t phys_blksize;
1296         u_int32_t minblksize;
1297         u_int32_t iswritable;
1298         daddr64_t mdb_offset;
1299         int isvirtual = 0;
1300         int isroot = 0;
1301         u_int32_t device_features = 0;
1302         int isssd;
1303 #if CONFIG_HFS_ALLOC_RBTREE
1304         thread_t allocator_thread;
1305 #endif
1306
1307         if (args == NULL) {
1308                 /* only hfs_mountroot passes us NULL as the 'args' argument */
1309                 isroot = 1;
1310         }
1311
1312         ronly = vfs_isrdonly(mp);
1313         dev = vnode_specrdev(devvp);
1314         cred = p ? vfs_context_ucred(context) : NOCRED;
1315         mntwrapper = 0;
1316
1317         bp = NULL;
1318         hfsmp = NULL;
1319         mdbp = NULL;
1320         minblksize = kHFSBlockSize;
1321
1322         /* Advisory locking should be handled at the VFS layer */
1323         vfs_setlocklocal(mp);
1324
1325         /* Get the logical block size (treated as physical block size everywhere) */
1326         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&log_blksize, 0, context)) {
1327                 if (HFS_MOUNT_DEBUG) {
1328                         printf("hfs_mountfs: DKIOCGETBLOCKSIZE failed\n");
1329                 }
1330                 retval = ENXIO;
1331                 goto error_exit;
1332         }
1333         if (log_blksize == 0 || log_blksize > 1024*1024*1024) {
1334                 printf("hfs: logical block size 0x%x looks bad.  Not mounting.\n", log_blksize);
1335                 retval = ENXIO;
1336                 goto error_exit;
1337         }
1338
1339         /* Get the physical block size. */
1340         retval = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_blksize, 0, context);
1341         if (retval) {
1342                 if ((retval != ENOTSUP) && (retval != ENOTTY)) {
1343                         if (HFS_MOUNT_DEBUG) {
1344                                 printf("hfs_mountfs: DKIOCGETPHYSICALBLOCKSIZE failed\n");
1345                         }
1346                         retval = ENXIO;
1347                         goto error_exit;
1348                 }
1349                 /* If device does not support this ioctl, assume that physical
1350                  * block size is same as logical block size
1351                  */
1352                 phys_blksize = log_blksize;
1353         }
1354         if (phys_blksize == 0 || phys_blksize > 1024*1024*1024) {
1355                 printf("hfs: physical block size 0x%x looks bad.  Not mounting.\n", phys_blksize);
1356                 retval = ENXIO;
1357                 goto error_exit;
1358         }
1359
1360         /* Switch to 512 byte sectors (temporarily) */
1361         if (log_blksize > 512) {
1362                 u_int32_t size512 = 512;
1363
1364                 if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, context)) {
1365                         if (HFS_MOUNT_DEBUG) {
1366                                 printf("hfs_mountfs: DKIOCSETBLOCKSIZE failed \n");
1367                         }
1368                         retval = ENXIO;
1369                         goto error_exit;
1370                 }
1371         }
1372         /* Get the number of 512 byte physical blocks. */
1373         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1374                 /* resetting block size may fail if getting block count did */
1375                 (void)VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context);
1376                 if (HFS_MOUNT_DEBUG) {
1377                         printf("hfs_mountfs: DKIOCGETBLOCKCOUNT failed\n");
1378                 }
1379                 retval = ENXIO;
1380                 goto error_exit;
1381         }
1382         /* Compute an accurate disk size (i.e. within 512 bytes) */
1383         disksize = (u_int64_t)log_blkcnt * (u_int64_t)512;
1384
1385         /*
1386          * On Tiger it is not necessary to switch the device
1387          * block size to be 4k if there are more than 31-bits
1388          * worth of blocks but to ensure compatibility with
1389          * pre-Tiger systems we have to do it.
1390          *
1391          * If the device size is not a multiple of 4K (8 * 512), then
1392          * switching the logical block size isn't going to help because
1393          * we will be unable to write the alternate volume header.
1394          * In this case, just leave the logical block size unchanged.
1395          */
1396         if (log_blkcnt > 0x000000007fffffff && (log_blkcnt & 7) == 0) {
1397                 minblksize = log_blksize = 4096;
1398                 if (phys_blksize < log_blksize)
1399                         phys_blksize = log_blksize;
1400         }
1401
1402         /*
1403          * The cluster layer is not currently prepared to deal with a logical
1404          * block size larger than the system's page size.  (It can handle
1405          * blocks per page, but not multiple pages per block.)  So limit the
1406          * logical block size to the page size.
1407          */
1408         if (log_blksize > PAGE_SIZE)
1409                 log_blksize = PAGE_SIZE;
1410
1411         /* Now switch to our preferred physical block size. */
1412         if (log_blksize > 512) {
1413                 if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1414                         if (HFS_MOUNT_DEBUG) {
1415                                 printf("hfs_mountfs: DKIOCSETBLOCKSIZE (2) failed\n");
1416                         }
1417                         retval = ENXIO;
1418                         goto error_exit;
1419                 }
1420                 /* Get the count of physical blocks. */
1421                 if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1422                         if (HFS_MOUNT_DEBUG) {
1423                                 printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (2) failed\n");
1424                         }
1425                         retval = ENXIO;
1426                         goto error_exit;
1427                 }
1428         }
1429         /*
1430          * At this point:
1431          *   minblksize is the minimum physical block size
1432          *   log_blksize has our preferred physical block size
1433          *   log_blkcnt has the total number of physical blocks
1434          */
1435
1436         mdb_offset = (daddr64_t)HFS_PRI_SECTOR(log_blksize);
1437         if ((retval = (int)buf_meta_bread(devvp,
1438                                 HFS_PHYSBLK_ROUNDDOWN(mdb_offset, (phys_blksize/log_blksize)),
1439                                 phys_blksize, cred, &bp))) {
1440                 if (HFS_MOUNT_DEBUG) {
1441                         printf("hfs_mountfs: buf_meta_bread failed with %d\n", retval);
1442                 }
1443                 goto error_exit;
1444         }
1445         MALLOC(mdbp, HFSMasterDirectoryBlock *, kMDBSize, M_TEMP, M_WAITOK);
1446         if (mdbp == NULL) {
1447                 retval = ENOMEM;
1448                 if (HFS_MOUNT_DEBUG) {
1449                         printf("hfs_mountfs: MALLOC failed\n");
1450                 }
1451                 goto error_exit;
1452         }
1453         bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, kMDBSize);
1454         buf_brelse(bp);
1455         bp = NULL;
1456
1457         MALLOC(hfsmp, struct hfsmount *, sizeof(struct hfsmount), M_HFSMNT, M_WAITOK);
1458         if (hfsmp == NULL) {
1459                 if (HFS_MOUNT_DEBUG) {
1460                         printf("hfs_mountfs: MALLOC (2) failed\n");
1461                 }
1462                 retval = ENOMEM;
1463                 goto error_exit;
1464         }
1465         bzero(hfsmp, sizeof(struct hfsmount));
1466
1467         hfs_chashinit_finish(hfsmp);
1468
1469         /*
1470          * See if the disk supports unmap (trim).
1471          *
1472          * NOTE: vfs_init_io_attributes has not been called yet, so we can't use the io_flags field
1473          * returned by vfs_ioattr.  We need to call VNOP_IOCTL ourselves.
1474          */
1475         if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&device_features, 0, context) == 0) {
1476                 if (device_features & DK_FEATURE_UNMAP) {
1477                         hfsmp->hfs_flags |= HFS_UNMAP;
1478                 }
1479         }
1480
1481         /*
1482          * See if the disk is a solid state device, too.  We need this to decide what to do about
1483          * hotfiles.
1484          */
1485         if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, context) == 0) {
1486                 if (isssd) {
1487                         hfsmp->hfs_flags |= HFS_SSD;
1488                 }
1489         }
1490
1491
1492         /*
1493          *  Init the volume information structure
1494          */
1495
1496         lck_mtx_init(&hfsmp->hfs_mutex, hfs_mutex_group, hfs_lock_attr);
1497         lck_mtx_init(&hfsmp->hfc_mutex, hfs_mutex_group, hfs_lock_attr);
1498         lck_rw_init(&hfsmp->hfs_global_lock, hfs_rwlock_group, hfs_lock_attr);
1499         lck_rw_init(&hfsmp->hfs_insync, hfs_rwlock_group, hfs_lock_attr);
1500         lck_spin_init(&hfsmp->vcbFreeExtLock, hfs_spinlock_group, hfs_lock_attr);
1501
1502         vfs_setfsprivate(mp, hfsmp);
1503         hfsmp->hfs_mp = mp;                     /* Make VFSTOHFS work */
1504         hfsmp->hfs_raw_dev = vnode_specrdev(devvp);
1505         hfsmp->hfs_devvp = devvp;
1506         vnode_ref(devvp);  /* Hold a ref on the device, dropped when hfsmp is freed. */
1507         hfsmp->hfs_logical_block_size = log_blksize;
1508         hfsmp->hfs_logical_block_count = log_blkcnt;
1509         hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt;
1510         hfsmp->hfs_physical_block_size = phys_blksize;
1511         hfsmp->hfs_log_per_phys = (phys_blksize / log_blksize);
1512         hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA;
1513         if (ronly)
1514                 hfsmp->hfs_flags |= HFS_READ_ONLY;
1515         if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS)
1516                 hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS;
1517
1518 #if QUOTA
1519         for (i = 0; i < MAXQUOTAS; i++)
1520                 dqfileinit(&hfsmp->hfs_qfiles[i]);
1521 #endif
1522
1523         if (args) {
1524                 hfsmp->hfs_uid = (args->hfs_uid == (uid_t)VNOVAL) ? UNKNOWNUID : args->hfs_uid;
1525                 if (hfsmp->hfs_uid == 0xfffffffd) hfsmp->hfs_uid = UNKNOWNUID;
1526                 hfsmp->hfs_gid = (args->hfs_gid == (gid_t)VNOVAL) ? UNKNOWNGID : args->hfs_gid;
1527                 if (hfsmp->hfs_gid == 0xfffffffd) hfsmp->hfs_gid = UNKNOWNGID;
1528                 vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid);                               /* tell the VFS */
1529                 if (args->hfs_mask != (mode_t)VNOVAL) {
1530                         hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS;
1531                         if (args->flags & HFSFSMNT_NOXONFILES) {
1532                                 hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE);
1533                         } else {
1534                                 hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS;
1535                         }
1536                 } else {
1537                         hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS;            /* 0777: rwx---rwx */
1538                         hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE;        /* 0666: no --x by default? */
1539                 }
1540                 if ((args->flags != (int)VNOVAL) && (args->flags & HFSFSMNT_WRAPPER))
1541                         mntwrapper = 1;
1542         } else {
1543                 /* Even w/o explicit mount arguments, MNT_UNKNOWNPERMISSIONS requires setting up uid, gid, and mask: */
1544                 if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS) {
1545                         hfsmp->hfs_uid = UNKNOWNUID;
1546                         hfsmp->hfs_gid = UNKNOWNGID;
1547                         vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid);                       /* tell the VFS */
1548                         hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS;            /* 0777: rwx---rwx */
1549                         hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE;        /* 0666: no --x by default? */
1550                 }
1551         }
1552
1553         /* Find out if disk media is writable. */
1554         if (VNOP_IOCTL(devvp, DKIOCISWRITABLE, (caddr_t)&iswritable, 0, context) == 0) {
1555                 if (iswritable)
1556                         hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA;
1557                 else
1558                         hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA;
1559         }
1560
1561         // record the current time at which we're mounting this volume
1562         struct timeval tv;
1563         microtime(&tv);
1564         hfsmp->hfs_mount_time = tv.tv_sec;
1565
1566         /* Mount a standard HFS disk */
1567         if ((SWAP_BE16(mdbp->drSigWord) == kHFSSigWord) &&
1568             (mntwrapper || (SWAP_BE16(mdbp->drEmbedSigWord) != kHFSPlusSigWord))) {
1569 #if CONFIG_HFS_STD
1570                 /* On 10.6 and beyond, non read-only mounts for HFS standard vols get rejected */
1571                 if (vfs_isrdwr(mp)) {
1572                         retval = EROFS;
1573                         goto error_exit;
1574                 }
1575
1576                 printf("hfs_mountfs: Mounting HFS Standard volumes was deprecated in Mac OS 10.7 \n");
1577
1578                 /* Treat it as if it's read-only and not writeable */
1579                 hfsmp->hfs_flags |= HFS_READ_ONLY;
1580                 hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA;
1581
1582                 /* If only journal replay is requested, exit immediately */
1583                 if (journal_replay_only) {
1584                         retval = 0;
1585                         goto error_exit;
1586                 }
1587
1588                 if ((vfs_flags(mp) & MNT_ROOTFS)) {
1589                         retval = EINVAL;  /* Cannot root from HFS standard disks */
1590                         goto error_exit;
1591                 }
1592                 /* HFS disks can only use 512 byte physical blocks */
1593                 if (log_blksize > kHFSBlockSize) {
1594                         log_blksize = kHFSBlockSize;
1595                         if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1596                                 retval = ENXIO;
1597                                 goto error_exit;
1598                         }
1599                         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1600                                 retval = ENXIO;
1601                                 goto error_exit;
1602                         }
1603                         hfsmp->hfs_logical_block_size = log_blksize;
1604                         hfsmp->hfs_logical_block_count = log_blkcnt;
1605                         hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt;
1606                         hfsmp->hfs_physical_block_size = log_blksize;
1607                         hfsmp->hfs_log_per_phys = 1;
1608                 }
1609                 if (args) {
1610                         hfsmp->hfs_encoding = args->hfs_encoding;
1611                         HFSTOVCB(hfsmp)->volumeNameEncodingHint = args->hfs_encoding;
1612
1613                         /* establish the timezone */
1614                         gTimeZone = args->hfs_timezone;
1615                 }
1616
1617                 retval = hfs_getconverter(hfsmp->hfs_encoding, &hfsmp->hfs_get_unicode,
1618                                         &hfsmp->hfs_get_hfsname);
1619                 if (retval)
1620                         goto error_exit;
1621
1622                 retval = hfs_MountHFSVolume(hfsmp, mdbp, p);
1623                 if (retval)
1624                         (void) hfs_relconverter(hfsmp->hfs_encoding);
1625 #else
1626                 /* On platforms where HFS Standard is not supported, deny the mount altogether */
1627                 retval = EINVAL;
1628                 goto error_exit;
1629 #endif
1630
1631         } else /* Mount an HFS Plus disk */ {
1632                 HFSPlusVolumeHeader *vhp;
1633                 off_t embeddedOffset;
1634                 int   jnl_disable = 0;
1635
1636                 /* Get the embedded Volume Header */
1637                 if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) {
1638                         embeddedOffset = SWAP_BE16(mdbp->drAlBlSt) * kHFSBlockSize;
1639                         embeddedOffset += (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.startBlock) *
1640                                           (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz);
1641
1642                         /*
1643                          * If the embedded volume doesn't start on a block
1644                          * boundary, then switch the device to a 512-byte
1645                          * block size so everything will line up on a block
1646                          * boundary.
1647                          */
1648                         if ((embeddedOffset % log_blksize) != 0) {
1649                                 printf("hfs_mountfs: embedded volume offset not"
1650                                     " a multiple of physical block size (%d);"
1651                                     " switching to 512\n", log_blksize);
1652                                 log_blksize = 512;
1653                                 if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE,
1654                                     (caddr_t)&log_blksize, FWRITE, context)) {
1655
1656                                         if (HFS_MOUNT_DEBUG) {
1657                                                 printf("hfs_mountfs: DKIOCSETBLOCKSIZE (3) failed\n");
1658                                         }
1659                                         retval = ENXIO;
1660                                         goto error_exit;
1661                                 }
1662                                 if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT,
1663                                     (caddr_t)&log_blkcnt, 0, context)) {
1664                                         if (HFS_MOUNT_DEBUG) {
1665                                                 printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (3) failed\n");
1666                                         }
1667                                         retval = ENXIO;
1668                                         goto error_exit;
1669                                 }
1670                                 /* Note: relative block count adjustment */
1671                                 hfsmp->hfs_logical_block_count *=
1672                                     hfsmp->hfs_logical_block_size / log_blksize;
1673
1674                                 /* Update logical /physical block size */
1675                                 hfsmp->hfs_logical_block_size = log_blksize;
1676                                 hfsmp->hfs_physical_block_size = log_blksize;
1677
1678                                 phys_blksize = log_blksize;
1679                                 hfsmp->hfs_log_per_phys = 1;
1680                         }
1681
1682                         disksize = (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.blockCount) *
1683                                    (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz);
1684
1685                         hfsmp->hfs_logical_block_count = disksize / log_blksize;
1686
1687                         hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
1688
1689                         mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1690                         retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1691                                         phys_blksize, cred, &bp);
1692                         if (retval) {
1693                                 if (HFS_MOUNT_DEBUG) {
1694                                         printf("hfs_mountfs: buf_meta_bread (2) failed with %d\n", retval);
1695                                 }
1696                                 goto error_exit;
1697                         }
1698                         bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, 512);
1699                         buf_brelse(bp);
1700                         bp = NULL;
1701                         vhp = (HFSPlusVolumeHeader*) mdbp;
1702
1703                 } else /* pure HFS+ */ {
1704                         embeddedOffset = 0;
1705                         vhp = (HFSPlusVolumeHeader*) mdbp;
1706                 }
1707
1708                 if (isroot) {
1709                         hfs_root_unmounted_cleanly = ((SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) != 0);
1710                 }
1711
1712                 /*
1713                  * On inconsistent disks, do not allow read-write mount
1714                  * unless it is the boot volume being mounted.  We also
1715                  * always want to replay the journal if the journal_replay_only
1716                  * flag is set because that will (most likely) get the
1717                  * disk into a consistent state before fsck_hfs starts
1718                  * looking at it.
1719                  */
1720                 if (  !(vfs_flags(mp) & MNT_ROOTFS)
1721                    && (SWAP_BE32(vhp->attributes) & kHFSVolumeInconsistentMask)
1722                    && !journal_replay_only
1723                    && !(hfsmp->hfs_flags & HFS_READ_ONLY)) {
1724
1725                         if (HFS_MOUNT_DEBUG) {
1726                                 printf("hfs_mountfs: failed to mount non-root inconsistent disk\n");
1727                         }
1728                         retval = EINVAL;
1729                         goto error_exit;
1730                 }
1731
1732
1733                 // XXXdbg
1734                 //
1735                 hfsmp->jnl = NULL;
1736                 hfsmp->jvp = NULL;
1737                 if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) &&
1738                     args->journal_disable) {
1739                     jnl_disable = 1;
1740                 }
1741
1742                 //
1743                 // We only initialize the journal here if the last person
1744                 // to mount this volume was journaling aware.  Otherwise
1745                 // we delay journal initialization until later at the end
1746                 // of hfs_MountHFSPlusVolume() because the last person who
1747                 // mounted it could have messed things up behind our back
1748                 // (so we need to go find the .journal file, make sure it's
1749                 // the right size, re-sync up if it was moved, etc).
1750                 //
1751                 if (   (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion)
1752                         && (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask)
1753                         && !jnl_disable) {
1754
1755                         // if we're able to init the journal, mark the mount
1756                         // point as journaled.
1757                         //
1758                         if ((retval = hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred)) == 0) {
1759                                 vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
1760                         } else {
1761                                 if (retval == EROFS) {
1762                                         // EROFS is a special error code that means the volume has an external
1763                                         // journal which we couldn't find.  in that case we do not want to
1764                                         // rewrite the volume header - we'll just refuse to mount the volume.
1765                                         if (HFS_MOUNT_DEBUG) {
1766                                                 printf("hfs_mountfs: hfs_early_journal_init indicated external jnl \n");
1767                                         }
1768                                         retval = EINVAL;
1769                                         goto error_exit;
1770                                 }
1771
1772                                 // if the journal failed to open, then set the lastMountedVersion
1773                                 // to be "FSK!" which fsck_hfs will see and force the fsck instead
1774                                 // of just bailing out because the volume is journaled.
1775                                 if (!ronly) {
1776                                         if (HFS_MOUNT_DEBUG) {
1777                                                 printf("hfs_mountfs: hfs_early_journal_init failed, setting to FSK \n");
1778                                         }
1779
1780                                         HFSPlusVolumeHeader *jvhp;
1781
1782                                     hfsmp->hfs_flags |= HFS_NEED_JNL_RESET;
1783
1784                                     if (mdb_offset == 0) {
1785                                         mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1786                                     }
1787
1788                                     bp = NULL;
1789                                     retval = (int)buf_meta_bread(devvp,
1790                                                     HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1791                                                     phys_blksize, cred, &bp);
1792                                     if (retval == 0) {
1793                                         jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize));
1794
1795                                         if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) {
1796                                                 printf ("hfs(1): Journal replay fail.  Writing lastMountVersion as FSK!\n");
1797                                             jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion);
1798                                             buf_bwrite(bp);
1799                                         } else {
1800                                             buf_brelse(bp);
1801                                         }
1802                                         bp = NULL;
1803                                     } else if (bp) {
1804                                         buf_brelse(bp);
1805                                         // clear this so the error exit path won't try to use it
1806                                         bp = NULL;
1807                                     }
1808                                 }
1809
1810                                 // if this isn't the root device just bail out.
1811                                 // If it is the root device we just continue on
1812                                 // in the hopes that fsck_hfs will be able to
1813                                 // fix any damage that exists on the volume.
1814                                 if ( !(vfs_flags(mp) & MNT_ROOTFS)) {
1815                                         if (HFS_MOUNT_DEBUG) {
1816                                                 printf("hfs_mountfs: hfs_early_journal_init failed, erroring out \n");
1817                                         }
1818                                     retval = EINVAL;
1819                                     goto error_exit;
1820                                 }
1821                         }
1822                 }
1823                 // XXXdbg
1824
1825                 /* Either the journal is replayed successfully, or there
1826                  * was nothing to replay, or no journal exists.  In any case,
1827                  * return success.
1828                  */
1829                 if (journal_replay_only) {
1830                         retval = 0;
1831                         goto error_exit;
1832                 }
1833
1834                 (void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname);
1835
1836                 retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred);
1837                 /*
1838                  * If the backend didn't like our physical blocksize
1839                  * then retry with physical blocksize of 512.
1840                  */
1841                 if ((retval == ENXIO) && (log_blksize > 512) && (log_blksize != minblksize)) {
1842                         printf("hfs_mountfs: could not use physical block size "
1843                                         "(%d) switching to 512\n", log_blksize);
1844                         log_blksize = 512;
1845                         if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1846                                 if (HFS_MOUNT_DEBUG) {
1847                                         printf("hfs_mountfs: DKIOCSETBLOCKSIZE (4) failed \n");
1848                                 }
1849                                 retval = ENXIO;
1850                                 goto error_exit;
1851                         }
1852                         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1853                                 if (HFS_MOUNT_DEBUG) {
1854                                         printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (4) failed \n");
1855                                 }
1856                                 retval = ENXIO;
1857                                 goto error_exit;
1858                         }
1859                         devvp->v_specsize = log_blksize;
1860                         /* Note: relative block count adjustment (in case this is an embedded volume). */
1861                         hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize;
1862                         hfsmp->hfs_logical_block_size = log_blksize;
1863                         hfsmp->hfs_log_per_phys = hfsmp->hfs_physical_block_size / log_blksize;
1864
1865                         hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
1866
1867                         if (hfsmp->jnl && hfsmp->jvp == devvp) {
1868                             // close and re-open this with the new block size
1869                             journal_close(hfsmp->jnl);
1870                             hfsmp->jnl = NULL;
1871                             if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) {
1872                                         vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
1873                                 } else {
1874                                         // if the journal failed to open, then set the lastMountedVersion
1875                                         // to be "FSK!" which fsck_hfs will see and force the fsck instead
1876                                         // of just bailing out because the volume is journaled.
1877                                         if (!ronly) {
1878                                                 if (HFS_MOUNT_DEBUG) {
1879                                                         printf("hfs_mountfs: hfs_early_journal_init (2) resetting.. \n");
1880                                                 }
1881                                         HFSPlusVolumeHeader *jvhp;
1882
1883                                         hfsmp->hfs_flags |= HFS_NEED_JNL_RESET;
1884
1885                                         if (mdb_offset == 0) {
1886                                                         mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1887                                         }
1888
1889                                                 bp = NULL;
1890                                         retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1891                                                         phys_blksize, cred, &bp);
1892                                         if (retval == 0) {
1893                                                         jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize));
1894
1895                                                         if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) {
1896                                                                 printf ("hfs(2): Journal replay fail.  Writing lastMountVersion as FSK!\n");
1897                                                         jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion);
1898                                                         buf_bwrite(bp);
1899                                                         } else {
1900                                                         buf_brelse(bp);
1901                                                         }
1902                                                         bp = NULL;
1903                                         } else if (bp) {
1904                                                         buf_brelse(bp);
1905                                                         // clear this so the error exit path won't try to use it
1906                                                         bp = NULL;
1907                                         }
1908                                         }
1909
1910                                         // if this isn't the root device just bail out.
1911                                         // If it is the root device we just continue on
1912                                         // in the hopes that fsck_hfs will be able to
1913                                         // fix any damage that exists on the volume.
1914                                         if ( !(vfs_flags(mp) & MNT_ROOTFS)) {
1915                                                 if (HFS_MOUNT_DEBUG) {
1916                                                         printf("hfs_mountfs: hfs_early_journal_init (2) failed \n");
1917                                                 }
1918                                         retval = EINVAL;
1919                                         goto error_exit;
1920                                         }
1921                                 }
1922                         }
1923
1924                         /* Try again with a smaller block size... */
1925                         retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred);
1926                         if (retval && HFS_MOUNT_DEBUG) {
1927                                 printf("hfs_MountHFSPlusVolume (late) returned %d\n",retval);
1928                         }
1929                 }
1930                 if (retval)
1931                         (void) hfs_relconverter(0);
1932         }
1933
1934         // save off a snapshot of the mtime from the previous mount
1935         // (for matador).
1936         hfsmp->hfs_last_mounted_mtime = hfsmp->hfs_mtime;
1937
1938         if ( retval ) {
1939                 if (HFS_MOUNT_DEBUG) {
1940                         printf("hfs_mountfs: encountered failure %d \n", retval);
1941                 }
1942                 goto error_exit;
1943         }
1944
1945         mp->mnt_vfsstat.f_fsid.val[0] = (long)dev;
1946         mp->mnt_vfsstat.f_fsid.val[1] = vfs_typenum(mp);
1947         vfs_setmaxsymlen(mp, 0);
1948
1949         mp->mnt_vtable->vfc_vfsflags |= VFC_VFSNATIVEXATTR;
1950 #if NAMEDSTREAMS
1951         mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1952 #endif
1953         if (!(hfsmp->hfs_flags & HFS_STANDARD)) {
1954                 /* Tell VFS that we support directory hard links. */
1955                 mp->mnt_vtable->vfc_vfsflags |= VFC_VFSDIRLINKS;
1956         } else {
1957                 /* HFS standard doesn't support extended readdir! */
1958                 mount_set_noreaddirext (mp);
1959         }
1960
1961         if (args) {
1962                 /*
1963                  * Set the free space warning levels for a non-root volume:
1964                  *
1965                  * Set the "danger" limit to 1% of the volume size or 100MB, whichever
1966                  * is less.  Set the "warning" limit to 2% of the volume size or 150MB,
1967                  * whichever is less.  And last, set the "desired" freespace level to
1968                  * 3% of the volume size or 200MB, whichever is less.
1969                  */
1970                 hfsmp->hfs_freespace_notify_dangerlimit =
1971                         MIN(HFS_VERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1972                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_VERYLOWDISKTRIGGERFRACTION);
1973                 hfsmp->hfs_freespace_notify_warninglimit =
1974                         MIN(HFS_LOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1975                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKTRIGGERFRACTION);
1976                 hfsmp->hfs_freespace_notify_desiredlevel =
1977                         MIN(HFS_LOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize,
1978                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKSHUTOFFFRACTION);
1979         } else {
1980                 /*
1981                  * Set the free space warning levels for the root volume:
1982                  *
1983                  * Set the "danger" limit to 5% of the volume size or 512MB, whichever
1984                  * is less.  Set the "warning" limit to 10% of the volume size or 1GB,
1985                  * whichever is less.  And last, set the "desired" freespace level to
1986                  * 11% of the volume size or 1.25GB, whichever is less.
1987                  */
1988                 hfsmp->hfs_freespace_notify_dangerlimit =
1989                         MIN(HFS_ROOTVERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1990                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTVERYLOWDISKTRIGGERFRACTION);
1991                 hfsmp->hfs_freespace_notify_warninglimit =
1992                         MIN(HFS_ROOTLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1993                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKTRIGGERFRACTION);
1994                 hfsmp->hfs_freespace_notify_desiredlevel =
1995                         MIN(HFS_ROOTLOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize,
1996                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKSHUTOFFFRACTION);
1997         };
1998
1999         /* Check if the file system exists on virtual device, like disk image */
2000         if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, context) == 0) {
2001                 if (isvirtual) {
2002                         hfsmp->hfs_flags |= HFS_VIRTUAL_DEVICE;
2003                 }
2004         }
2005
2006         /* do not allow ejectability checks on the root device */
2007         if (isroot == 0) {
2008                 if ((hfsmp->hfs_flags & HFS_VIRTUAL_DEVICE) == 0 &&
2009                                 IOBSDIsMediaEjectable(mp->mnt_vfsstat.f_mntfromname)) {
2010                         hfsmp->hfs_max_pending_io = 4096*1024;   // a reasonable value to start with.
2011                         hfsmp->hfs_syncer = thread_call_allocate(hfs_syncer, hfsmp);
2012                         if (hfsmp->hfs_syncer == NULL) {
2013                                 printf("hfs: failed to allocate syncer thread callback for %s (%s)\n",
2014                                                 mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname);
2015                         }
2016                 }
2017         }
2018
2019 #if CONFIG_HFS_MOUNT_UNMAP
2020         /* Enable UNMAPs for embedded SSDs only for now */
2021         /*
2022          * TODO: Should we enable this for CoreStorage volumes, too?
2023          */
2024         if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
2025                 if (hfsmp->hfs_flags & HFS_UNMAP) {
2026                         hfs_unmap_blocks(hfsmp);
2027                 }
2028         }
2029 #endif
2030
2031
2032 #if CONFIG_HFS_ALLOC_RBTREE
2033         /*
2034          * We spawn a thread to create the pair of red-black trees for this volume.
2035          * However, in so doing, we must be careful to ensure that if this thread is still
2036          * running after mount has finished, it doesn't interfere with an unmount. Specifically,
2037          * we'll need to set a bit that indicates we're in progress building the trees here.
2038          * Unmount will check for this bit, and then if it's set, mark a corresponding bit that
2039          * notifies the tree generation code that an unmount is waiting.  Also mark the bit that
2040          * indicates the tree is live and operating.
2041          *
2042          * Only do this if we're operating on a read-write mount (we wouldn't care for read-only).
2043          */
2044
2045         if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
2046                 hfsmp->extent_tree_flags |= (HFS_ALLOC_TREEBUILD_INFLIGHT | HFS_ALLOC_RB_ENABLED);
2047
2048                 /* Initialize EOF counter so that the thread can assume it started at initial values */
2049                 hfsmp->offset_block_end = 0;
2050                 InitTree(hfsmp);
2051
2052                 kernel_thread_start ((thread_continue_t) hfs_initialize_allocator , hfsmp, &allocator_thread);
2053                 thread_deallocate(allocator_thread);
2054         }
2055
2056 #endif
2057
2058         /*
2059          * Start looking for free space to drop below this level and generate a
2060          * warning immediately if needed:
2061          */
2062         hfsmp->hfs_notification_conditions = 0;
2063         hfs_generate_volume_notifications(hfsmp);
2064
2065         if (ronly == 0) {
2066                 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
2067         }
2068         FREE(mdbp, M_TEMP);
2069         return (0);
2070
2071 error_exit:
2072         if (bp)
2073                 buf_brelse(bp);
2074         if (mdbp)
2075                 FREE(mdbp, M_TEMP);
2076
2077         if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
2078                 vnode_clearmountedon(hfsmp->jvp);
2079                 (void)VNOP_CLOSE(hfsmp->jvp, ronly ? FREAD : FREAD|FWRITE, vfs_context_kernel());
2080                 hfsmp->jvp = NULL;
2081         }
2082         if (hfsmp) {
2083                 if (hfsmp->hfs_devvp) {
2084                         vnode_rele(hfsmp->hfs_devvp);
2085                 }
2086                 hfs_delete_chash(hfsmp);
2087
2088                 FREE(hfsmp, M_HFSMNT);
2089                 vfs_setfsprivate(mp, NULL);
2090         }
2091         return (retval);
2092 }
2093
2094
2095 /*
2096  * Make a filesystem operational.
2097  * Nothing to do at the moment.
2098  */
2099 /* ARGSUSED */
2100 static int
2101 hfs_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context)
2102 {
2103         return (0);
2104 }
2105
2106
2107 /*
2108  * unmount system call
2109  */
2110 int
2111 hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context)
2112 {
2113         struct proc *p = vfs_context_proc(context);
2114         struct hfsmount *hfsmp = VFSTOHFS(mp);
2115         int retval = E_NONE;
2116         int flags;
2117         int force;
2118         int started_tr = 0;
2119         int rb_used = 0;
2120
2121         flags = 0;
2122         force = 0;
2123         if (mntflags & MNT_FORCE) {
2124                 flags |= FORCECLOSE;
2125                 force = 1;
2126         }
2127
2128         if ((retval = hfs_flushfiles(mp, flags, p)) && !force)
2129                 return (retval);
2130
2131         if (hfsmp->hfs_flags & HFS_METADATA_ZONE)
2132                 (void) hfs_recording_suspend(hfsmp);
2133
2134         /*
2135          * Cancel any pending timers for this volume.  Then wait for any timers
2136          * which have fired, but whose callbacks have not yet completed.
2137          */
2138         if (hfsmp->hfs_syncer)
2139         {
2140                 struct timespec ts = {0, 100000000};    /* 0.1 seconds */
2141
2142                 /*
2143                  * Cancel any timers that have been scheduled, but have not
2144                  * fired yet.  NOTE: The kernel considers a timer complete as
2145                  * soon as it starts your callback, so the kernel does not
2146                  * keep track of the number of callbacks in progress.
2147                  */
2148                 if (thread_call_cancel(hfsmp->hfs_syncer))
2149                         OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete);
2150                 thread_call_free(hfsmp->hfs_syncer);
2151                 hfsmp->hfs_syncer = NULL;
2152
2153                 /*
2154                  * This waits for all of the callbacks that were entered before
2155                  * we did thread_call_cancel above, but have not completed yet.
2156                  */
2157                 while(hfsmp->hfs_sync_incomplete > 0)
2158                 {
2159                         msleep((caddr_t)&hfsmp->hfs_sync_incomplete, NULL, PWAIT, "hfs_unmount", &ts);
2160                 }
2161
2162                 if (hfsmp->hfs_sync_incomplete < 0)
2163                         panic("hfs_unmount: pm_sync_incomplete underflow!\n");
2164         }
2165
2166 #if CONFIG_HFS_ALLOC_RBTREE
2167         rb_used = hfs_teardown_allocator(hfsmp);
2168 #endif
2169
2170         /*
2171          * Flush out the b-trees, volume bitmap and Volume Header
2172          */
2173         if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
2174                 retval = hfs_start_transaction(hfsmp);
2175                 if (retval == 0) {
2176                     started_tr = 1;
2177                 } else if (!force) {
2178                     goto err_exit;
2179                 }
2180
2181                 if (hfsmp->hfs_startup_vp) {
2182                         (void) hfs_lock(VTOC(hfsmp->hfs_startup_vp), HFS_EXCLUSIVE_LOCK);
2183                         retval = hfs_fsync(hfsmp->hfs_startup_vp, MNT_WAIT, 0, p);
2184                         hfs_unlock(VTOC(hfsmp->hfs_startup_vp));
2185                         if (retval && !force)
2186                                 goto err_exit;
2187                 }
2188
2189                 if (hfsmp->hfs_attribute_vp) {
2190                         (void) hfs_lock(VTOC(hfsmp->hfs_attribute_vp), HFS_EXCLUSIVE_LOCK);
2191                         retval = hfs_fsync(hfsmp->hfs_attribute_vp, MNT_WAIT, 0, p);
2192                         hfs_unlock(VTOC(hfsmp->hfs_attribute_vp));
2193                         if (retval && !force)
2194                                 goto err_exit;
2195                 }
2196
2197                 (void) hfs_lock(VTOC(hfsmp->hfs_catalog_vp), HFS_EXCLUSIVE_LOCK);
2198                 retval = hfs_fsync(hfsmp->hfs_catalog_vp, MNT_WAIT, 0, p);
2199                 hfs_unlock(VTOC(hfsmp->hfs_catalog_vp));
2200                 if (retval && !force)
2201                         goto err_exit;
2202
2203                 (void) hfs_lock(VTOC(hfsmp->hfs_extents_vp), HFS_EXCLUSIVE_LOCK);
2204                 retval = hfs_fsync(hfsmp->hfs_extents_vp, MNT_WAIT, 0, p);
2205                 hfs_unlock(VTOC(hfsmp->hfs_extents_vp));
2206                 if (retval && !force)
2207                         goto err_exit;
2208
2209                 if (hfsmp->hfs_allocation_vp) {
2210                         (void) hfs_lock(VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK);
2211                         retval = hfs_fsync(hfsmp->hfs_allocation_vp, MNT_WAIT, 0, p);
2212                         hfs_unlock(VTOC(hfsmp->hfs_allocation_vp));
2213                         if (retval && !force)
2214                                 goto err_exit;
2215                 }
2216
2217                 if (hfsmp->hfc_filevp && vnode_issystem(hfsmp->hfc_filevp)) {
2218                         retval = hfs_fsync(hfsmp->hfc_filevp, MNT_WAIT, 0, p);
2219                         if (retval && !force)
2220                                 goto err_exit;
2221                 }
2222
2223                 /* If runtime corruption was detected, indicate that the volume
2224                  * was not unmounted cleanly.
2225                  */
2226                 if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
2227                         HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
2228                 } else {
2229                         HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask;
2230                 }
2231
2232
2233                 if (rb_used) {
2234                         /* If the rb-tree was live, just set min_start to 0 */
2235                         hfsmp->nextAllocation = 0;
2236                 }
2237                 else {
2238                         if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
2239                                 int i;
2240                                 u_int32_t min_start = hfsmp->totalBlocks;
2241
2242                                 // set the nextAllocation pointer to the smallest free block number
2243                                 // we've seen so on the next mount we won't rescan unnecessarily
2244                                 lck_spin_lock(&hfsmp->vcbFreeExtLock);
2245                                 for(i=0; i < (int)hfsmp->vcbFreeExtCnt; i++) {
2246                                         if (hfsmp->vcbFreeExt[i].startBlock < min_start) {
2247                                                 min_start = hfsmp->vcbFreeExt[i].startBlock;
2248                                         }
2249                                 }
2250                                 lck_spin_unlock(&hfsmp->vcbFreeExtLock);
2251                                 if (min_start < hfsmp->nextAllocation) {
2252                                         hfsmp->nextAllocation = min_start;
2253                                 }
2254                         }
2255                 }
2256
2257
2258                 retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
2259                 if (retval) {
2260                         HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
2261                         if (!force)
2262                                 goto err_exit;  /* could not flush everything */
2263                 }
2264
2265                 if (started_tr) {
2266                     hfs_end_transaction(hfsmp);
2267                     started_tr = 0;
2268                 }
2269         }
2270
2271         if (hfsmp->jnl) {
2272                 hfs_journal_flush(hfsmp, FALSE);
2273         }
2274
2275         /*
2276          *      Invalidate our caches and release metadata vnodes
2277          */
2278         (void) hfsUnmount(hfsmp, p);
2279
2280         if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
2281                 (void) hfs_relconverter(hfsmp->hfs_encoding);
2282
2283         // XXXdbg
2284         if (hfsmp->jnl) {
2285             journal_close(hfsmp->jnl);
2286             hfsmp->jnl = NULL;
2287         }
2288
2289         VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context);
2290
2291         if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
2292             vnode_clearmountedon(hfsmp->jvp);
2293             retval = VNOP_CLOSE(hfsmp->jvp,
2294                                hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE,
2295                                vfs_context_kernel());
2296             vnode_put(hfsmp->jvp);
2297             hfsmp->jvp = NULL;
2298         }
2299         // XXXdbg
2300
2301         /*
2302          * Last chance to dump unreferenced system files.
2303          */
2304         (void) vflush(mp, NULLVP, FORCECLOSE);
2305
2306 #if HFS_SPARSE_DEV
2307         /* Drop our reference on the backing fs (if any). */
2308         if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) {
2309                 struct vnode * tmpvp;
2310
2311                 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
2312                 tmpvp = hfsmp->hfs_backingfs_rootvp;
2313                 hfsmp->hfs_backingfs_rootvp = NULLVP;
2314                 vnode_rele(tmpvp);
2315         }
2316 #endif /* HFS_SPARSE_DEV */
2317         lck_mtx_destroy(&hfsmp->hfc_mutex, hfs_mutex_group);
2318         lck_spin_destroy(&hfsmp->vcbFreeExtLock, hfs_spinlock_group);
2319         vnode_rele(hfsmp->hfs_devvp);
2320
2321         hfs_delete_chash(hfsmp);
2322         FREE(hfsmp, M_HFSMNT);
2323
2324         return (0);
2325
2326   err_exit:
2327         if (started_tr) {
2328                 hfs_end_transaction(hfsmp);
2329         }
2330         return retval;
2331 }
2332
2333
2334 /*
2335  * Return the root of a filesystem.
2336  */
2337 static int
2338 hfs_vfs_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context)
2339 {
2340         return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1, 0);
2341 }
2342
2343
2344 /*
2345  * Do operations associated with quotas
2346  */
2347 #if !QUOTA
2348 static int
2349 hfs_quotactl(__unused struct mount *mp, __unused int cmds, __unused uid_t uid, __unused caddr_t datap, __unused vfs_context_t context)
2350 {
2351         return (ENOTSUP);
2352 }
2353 #else
2354 static int
2355 hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t context)
2356 {
2357         struct proc *p = vfs_context_proc(context);
2358         int cmd, type, error;
2359
2360         if (uid == ~0U)
2361                 uid = kauth_cred_getuid(vfs_context_ucred(context));
2362         cmd = cmds >> SUBCMDSHIFT;
2363
2364         switch (cmd) {
2365         case Q_SYNC:
2366         case Q_QUOTASTAT:
2367                 break;
2368         case Q_GETQUOTA:
2369                 if (uid == kauth_cred_getuid(vfs_context_ucred(context)))
2370                         break;
2371                 /* fall through */
2372         default:
2373                 if ( (error = vfs_context_suser(context)) )
2374                         return (error);
2375         }
2376
2377         type = cmds & SUBCMDMASK;
2378         if ((u_int)type >= MAXQUOTAS)
2379                 return (EINVAL);
2380         if (vfs_busy(mp, LK_NOWAIT))
2381                 return (0);
2382
2383         switch (cmd) {
2384
2385         case Q_QUOTAON:
2386                 error = hfs_quotaon(p, mp, type, datap);
2387                 break;
2388
2389         case Q_QUOTAOFF:
2390                 error = hfs_quotaoff(p, mp, type);
2391                 break;
2392
2393         case Q_SETQUOTA:
2394                 error = hfs_setquota(mp, uid, type, datap);
2395                 break;
2396
2397         case Q_SETUSE:
2398                 error = hfs_setuse(mp, uid, type, datap);
2399                 break;
2400
2401         case Q_GETQUOTA:
2402                 error = hfs_getquota(mp, uid, type, datap);
2403                 break;
2404
2405         case Q_SYNC:
2406                 error = hfs_qsync(mp);
2407                 break;
2408
2409         case Q_QUOTASTAT:
2410                 error = hfs_quotastat(mp, type, datap);
2411                 break;
2412
2413         default:
2414                 error = EINVAL;
2415                 break;
2416         }
2417         vfs_unbusy(mp);
2418
2419         return (error);
2420 }
2421 #endif /* QUOTA */
2422
2423 /* Subtype is composite of bits */
2424 #define HFS_SUBTYPE_JOURNALED      0x01
2425 #define HFS_SUBTYPE_CASESENSITIVE  0x02
2426 /* bits 2 - 6 reserved */
2427 #define HFS_SUBTYPE_STANDARDHFS    0x80
2428
2429 /*
2430  * Get file system statistics.
2431  */
2432 int
2433 hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, __unused vfs_context_t context)
2434 {
2435         ExtendedVCB *vcb = VFSTOVCB(mp);
2436         struct hfsmount *hfsmp = VFSTOHFS(mp);
2437         u_int32_t freeCNIDs;
2438         u_int16_t subtype = 0;
2439
2440         freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)vcb->vcbNxtCNID;
2441
2442         sbp->f_bsize = (u_int32_t)vcb->blockSize;
2443         sbp->f_iosize = (size_t)cluster_max_io_size(mp, 0);
2444         sbp->f_blocks = (u_int64_t)((u_int32_t)vcb->totalBlocks);
2445         sbp->f_bfree = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 0));
2446         sbp->f_bavail = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 1));
2447         sbp->f_files = (u_int64_t)((u_int32_t )(vcb->totalBlocks - 2));  /* max files is constrained by total blocks */
2448         sbp->f_ffree = (u_int64_t)((u_int32_t )(MIN(freeCNIDs, sbp->f_bavail)));
2449
2450         /*
2451          * Subtypes (flavors) for HFS
2452          *   0:   Mac OS Extended
2453          *   1:   Mac OS Extended (Journaled)
2454          *   2:   Mac OS Extended (Case Sensitive)
2455          *   3:   Mac OS Extended (Case Sensitive, Journaled)
2456          *   4 - 127:   Reserved
2457          * 128:   Mac OS Standard
2458          *
2459          */
2460         if (hfsmp->hfs_flags & HFS_STANDARD) {
2461                 subtype = HFS_SUBTYPE_STANDARDHFS;
2462         } else /* HFS Plus */ {
2463                 if (hfsmp->jnl)
2464                         subtype |= HFS_SUBTYPE_JOURNALED;
2465                 if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE)
2466                         subtype |= HFS_SUBTYPE_CASESENSITIVE;
2467         }
2468         sbp->f_fssubtype = subtype;
2469
2470         return (0);
2471 }
2472
2473
2474 //
2475 // XXXdbg -- this is a callback to be used by the journal to
2476 //           get meta data blocks flushed out to disk.
2477 //
2478 // XXXdbg -- be smarter and don't flush *every* block on each
2479 //           call.  try to only flush some so we don't wind up
2480 //           being too synchronous.
2481 //
2482 __private_extern__
2483 void
2484 hfs_sync_metadata(void *arg)
2485 {
2486         struct mount *mp = (struct mount *)arg;
2487         struct hfsmount *hfsmp;
2488         ExtendedVCB *vcb;
2489         buf_t   bp;
2490         int  retval;
2491         daddr64_t priIDSector;
2492         hfsmp = VFSTOHFS(mp);
2493         vcb = HFSTOVCB(hfsmp);
2494
2495         // now make sure the super block is flushed
2496         priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
2497                                   HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));
2498
2499         retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
2500                         HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
2501                         hfsmp->hfs_physical_block_size, NOCRED, &bp);
2502         if ((retval != 0 ) && (retval != ENXIO)) {
2503                 printf("hfs_sync_metadata: can't read volume header at %d! (retval 0x%x)\n",
2504                        (int)priIDSector, retval);
2505         }
2506
2507         if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) {
2508             buf_bwrite(bp);
2509         } else if (bp) {
2510             buf_brelse(bp);
2511         }
2512
2513         // the alternate super block...
2514         // XXXdbg - we probably don't need to do this each and every time.
2515         //          hfs_btreeio.c:FlushAlternate() should flag when it was
2516         //          written...
2517         if (hfsmp->hfs_alt_id_sector) {
2518                 retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
2519                                 HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
2520                                 hfsmp->hfs_physical_block_size, NOCRED, &bp);
2521                 if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) {
2522                     buf_bwrite(bp);
2523                 } else if (bp) {
2524                     buf_brelse(bp);
2525                 }
2526         }
2527 }
2528
2529
2530 struct hfs_sync_cargs {
2531         kauth_cred_t cred;
2532         struct proc  *p;
2533         int    waitfor;
2534         int    error;
2535 };
2536
2537
2538 static int
2539 hfs_sync_callback(struct vnode *vp, void *cargs)
2540 {
2541         struct cnode *cp;
2542         struct hfs_sync_cargs *args;
2543         int error;
2544
2545         args = (struct hfs_sync_cargs *)cargs;
2546
2547         if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK) != 0) {
2548                 return (VNODE_RETURNED);
2549         }
2550         cp = VTOC(vp);
2551
2552         if ((cp->c_flag & C_MODIFIED) ||
2553             (cp->c_touch_acctime | cp->c_touch_chgtime | cp->c_touch_modtime) ||
2554             vnode_hasdirtyblks(vp)) {
2555                 error = hfs_fsync(vp, args->waitfor, 0, args->p);
2556
2557                 if (error)
2558                         args->error = error;
2559         }
2560         hfs_unlock(cp);
2561         return (VNODE_RETURNED);
2562 }
2563
2564
2565
2566 /*
2567  * Go through the disk queues to initiate sandbagged IO;
2568  * go through the inodes to write those that have been modified;
2569  * initiate the writing of the super block if it has been modified.
2570  *
2571  * Note: we are always called with the filesystem marked `MPBUSY'.
2572  */
2573 int
2574 hfs_sync(struct mount *mp, int waitfor, vfs_context_t context)
2575 {
2576         struct proc *p = vfs_context_proc(context);
2577         struct cnode *cp;
2578         struct hfsmount *hfsmp;
2579         ExtendedVCB *vcb;
2580         struct vnode *meta_vp[4];
2581         int i;
2582         int error, allerror = 0;
2583         struct hfs_sync_cargs args;
2584
2585         hfsmp = VFSTOHFS(mp);
2586
2587         /*
2588          * hfs_changefs might be manipulating vnodes so back off
2589          */
2590         if (hfsmp->hfs_flags & HFS_IN_CHANGEFS)
2591                 return (0);
2592
2593         if (hfsmp->hfs_flags & HFS_READ_ONLY)
2594                 return (EROFS);
2595
2596         /* skip over frozen volumes */
2597         if (!lck_rw_try_lock_shared(&hfsmp->hfs_insync))
2598                 return 0;
2599
2600         args.cred = kauth_cred_get();
2601         args.waitfor = waitfor;
2602         args.p = p;
2603         args.error = 0;
2604         /*
2605          * hfs_sync_callback will be called for each vnode
2606          * hung off of this mount point... the vnode will be
2607          * properly referenced and unreferenced around the callback
2608          */
2609         vnode_iterate(mp, 0, hfs_sync_callback, (void *)&args);
2610
2611         if (args.error)
2612                 allerror = args.error;
2613
2614         vcb = HFSTOVCB(hfsmp);
2615
2616         meta_vp[0] = vcb->extentsRefNum;
2617         meta_vp[1] = vcb->catalogRefNum;
2618         meta_vp[2] = vcb->allocationsRefNum;  /* This is NULL for standard HFS */
2619         meta_vp[3] = hfsmp->hfs_attribute_vp; /* Optional file */
2620
2621         /* Now sync our three metadata files */
2622         for (i = 0; i < 4; ++i) {
2623                 struct vnode *btvp;
2624
2625                 btvp = meta_vp[i];;
2626                 if ((btvp==0) || (vnode_mount(btvp) != mp))
2627                         continue;
2628
2629                 /* XXX use hfs_systemfile_lock instead ? */
2630                 (void) hfs_lock(VTOC(btvp), HFS_EXCLUSIVE_LOCK);
2631                 cp = VTOC(btvp);
2632
2633                 if (((cp->c_flag &  C_MODIFIED) == 0) &&
2634                     (cp->c_touch_acctime == 0) &&
2635                     (cp->c_touch_chgtime == 0) &&
2636                     (cp->c_touch_modtime == 0) &&
2637                     vnode_hasdirtyblks(btvp) == 0) {
2638                         hfs_unlock(VTOC(btvp));
2639                         continue;
2640                 }
2641                 error = vnode_get(btvp);
2642                 if (error) {
2643                         hfs_unlock(VTOC(btvp));
2644                         continue;
2645                 }
2646                 if ((error = hfs_fsync(btvp, waitfor, 0, p)))
2647                         allerror = error;
2648
2649                 hfs_unlock(cp);
2650                 vnode_put(btvp);
2651         };
2652
2653         /*
2654          * Force stale file system control information to be flushed.
2655          */
2656         if (vcb->vcbSigWord == kHFSSigWord) {
2657                 if ((error = VNOP_FSYNC(hfsmp->hfs_devvp, waitfor, context))) {
2658                         allerror = error;
2659                 }
2660         }
2661 #if QUOTA
2662         hfs_qsync(mp);
2663 #endif /* QUOTA */
2664
2665         hfs_hotfilesync(hfsmp, vfs_context_kernel());
2666
2667         /*
2668          * Write back modified superblock.
2669          */
2670         if (IsVCBDirty(vcb)) {
2671                 error = hfs_flushvolumeheader(hfsmp, waitfor, 0);
2672                 if (error)
2673                         allerror = error;
2674         }
2675
2676         if (hfsmp->jnl) {
2677             hfs_journal_flush(hfsmp, FALSE);
2678         }
2679
2680         {
2681                 clock_sec_t secs;
2682                 clock_usec_t usecs;
2683                 uint64_t now;
2684
2685                 clock_get_calendar_microtime(&secs, &usecs);
2686                 now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;
2687                 hfsmp->hfs_last_sync_time = now;
2688         }
2689
2690         lck_rw_unlock_shared(&hfsmp->hfs_insync);
2691         return (allerror);
2692 }
2693
2694
2695 /*
2696  * File handle to vnode
2697  *
2698  * Have to be really careful about stale file handles:
2699  * - check that the cnode id is valid
2700  * - call hfs_vget() to get the locked cnode
2701  * - check for an unallocated cnode (i_mode == 0)
2702  * - check that the given client host has export rights and return
2703  *   those rights via. exflagsp and credanonp
2704  */
2705 static int
2706 hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, __unused vfs_context_t context)
2707 {
2708         struct hfsfid *hfsfhp;
2709         struct vnode *nvp;
2710         int result;
2711
2712         *vpp = NULL;
2713         hfsfhp = (struct hfsfid *)fhp;
2714
2715         if (fhlen < (int)sizeof(struct hfsfid))
2716                 return (EINVAL);
2717
2718         result = hfs_vget(VFSTOHFS(mp), ntohl(hfsfhp->hfsfid_cnid), &nvp, 0, 0);
2719         if (result) {
2720                 if (result == ENOENT)
2721                         result = ESTALE;
2722                 return result;
2723         }
2724
2725         /*
2726          * We used to use the create time as the gen id of the file handle,
2727          * but it is not static enough because it can change at any point
2728          * via system calls.  We still don't have another volume ID or other
2729          * unique identifier to use for a generation ID across reboots that
2730          * persists until the file is removed.  Using only the CNID exposes
2731          * us to the potential wrap-around case, but as of 2/2008, it would take
2732          * over 2 months to wrap around if the machine did nothing but allocate
2733          * CNIDs.  Using some kind of wrap counter would only be effective if
2734          * each file had the wrap counter associated with it.  For now,
2735          * we use only the CNID to identify the file as it's good enough.
2736          */
2737
2738         *vpp = nvp;
2739
2740         hfs_unlock(VTOC(nvp));
2741         return (0);
2742 }
2743
2744
2745 /*
2746  * Vnode pointer to File handle
2747  */
2748 /* ARGSUSED */
2749 static int
2750 hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, __unused vfs_context_t context)
2751 {
2752         struct cnode *cp;
2753         struct hfsfid *hfsfhp;
2754
2755         if (ISHFS(VTOVCB(vp)))
2756                 return (ENOTSUP);       /* hfs standard is not exportable */
2757
2758         if (*fhlenp < (int)sizeof(struct hfsfid))
2759                 return (EOVERFLOW);
2760
2761         cp = VTOC(vp);
2762         hfsfhp = (struct hfsfid *)fhp;
2763         /* only the CNID is used to identify the file now */
2764         hfsfhp->hfsfid_cnid = htonl(cp->c_fileid);
2765         hfsfhp->hfsfid_gen = htonl(cp->c_fileid);
2766         *fhlenp = sizeof(struct hfsfid);
2767
2768         return (0);
2769 }
2770
2771
2772 /*
2773  * Initial HFS filesystems, done only once.
2774  */
2775 static int
2776 hfs_init(__unused struct vfsconf *vfsp)
2777 {
2778         static int done = 0;
2779
2780         if (done)
2781                 return (0);
2782         done = 1;
2783         hfs_chashinit();
2784         hfs_converterinit();
2785
2786         BTReserveSetup();
2787
2788
2789         hfs_lock_attr    = lck_attr_alloc_init();
2790         hfs_group_attr   = lck_grp_attr_alloc_init();
2791         hfs_mutex_group  = lck_grp_alloc_init("hfs-mutex", hfs_group_attr);
2792         hfs_rwlock_group = lck_grp_alloc_init("hfs-rwlock", hfs_group_attr);
2793         hfs_spinlock_group = lck_grp_alloc_init("hfs-spinlock", hfs_group_attr);
2794
2795 #if HFS_COMPRESSION
2796     decmpfs_init();
2797 #endif
2798
2799         return (0);
2800 }
2801
2802 static int
2803 hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp)
2804 {
2805         struct hfsmount * hfsmp;
2806         char fstypename[MFSNAMELEN];
2807
2808         if (vp == NULL)
2809                 return (EINVAL);
2810
2811         if (!vnode_isvroot(vp))
2812                 return (EINVAL);
2813
2814         vnode_vfsname(vp, fstypename);
2815         if (strncmp(fstypename, "hfs", sizeof(fstypename)) != 0)
2816                 return (EINVAL);
2817
2818         hfsmp = VTOHFS(vp);
2819
2820         if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
2821                 return (EINVAL);
2822
2823         *hfsmpp = hfsmp;
2824
2825         return (0);
2826 }
2827
2828 // XXXdbg
2829 #include <sys/filedesc.h>
2830
2831 /*
2832  * HFS filesystem related variables.
2833  */
2834 int
2835 hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp,
2836                         user_addr_t newp, size_t newlen, vfs_context_t context)
2837 {
2838         struct proc *p = vfs_context_proc(context);
2839         int error;
2840         struct hfsmount *hfsmp;
2841
2842         /* all sysctl names at this level are terminal */
2843
2844         if (name[0] == HFS_ENCODINGBIAS) {
2845                 int bias;
2846
2847                 bias = hfs_getencodingbias();
2848                 error = sysctl_int(oldp, oldlenp, newp, newlen, &bias);
2849                 if (error == 0 && newp)
2850                         hfs_setencodingbias(bias);
2851                 return (error);
2852
2853         } else if (name[0] == HFS_EXTEND_FS) {
2854         u_int64_t  newsize;
2855                 vnode_t vp = vfs_context_cwd(context);
2856
2857                 if (newp == USER_ADDR_NULL || vp == NULLVP)
2858                         return (EINVAL);
2859                 if ((error = hfs_getmountpoint(vp, &hfsmp)))
2860                         return (error);
2861                 error = sysctl_quad(oldp, oldlenp, newp, newlen, (quad_t *)&newsize);
2862                 if (error)
2863                         return (error);
2864
2865                 error = hfs_extendfs(hfsmp, newsize, context);
2866                 return (error);
2867
2868         } else if (name[0] == HFS_ENCODINGHINT) {
2869                 size_t bufsize;
2870                 size_t bytes;
2871                 u_int32_t hint;
2872                 u_int16_t *unicode_name = NULL;
2873                 char *filename = NULL;
2874
2875                 if ((newlen <= 0) || (newlen > MAXPATHLEN))
2876                         return (EINVAL);
2877
2878                 bufsize = MAX(newlen * 3, MAXPATHLEN);
2879                 MALLOC(filename, char *, newlen, M_TEMP, M_WAITOK);
2880                 if (filename == NULL) {
2881                         error = ENOMEM;
2882                         goto encodinghint_exit;
2883                 }
2884                 MALLOC(unicode_name, u_int16_t *, bufsize, M_TEMP, M_WAITOK);
2885                 if (filename == NULL) {
2886                         error = ENOMEM;
2887                         goto encodinghint_exit;
2888                 }
2889
2890                 error = copyin(newp, (caddr_t)filename, newlen);
2891                 if (error == 0) {
2892                         error = utf8_decodestr((u_int8_t *)filename, newlen - 1, unicode_name,
2893                                                &bytes, bufsize, 0, UTF_DECOMPOSED);
2894                         if (error == 0) {
2895                                 hint = hfs_pickencoding(unicode_name, bytes / 2);
2896                                 error = sysctl_int(oldp, oldlenp, USER_ADDR_NULL, 0, (int32_t *)&hint);
2897                         }
2898                 }
2899
2900 encodinghint_exit:
2901                 if (unicode_name)
2902                         FREE(unicode_name, M_TEMP);
2903                 if (filename)
2904                         FREE(filename, M_TEMP);
2905                 return (error);
2906
2907         } else if (name[0] == HFS_ENABLE_JOURNALING) {
2908                 // make the file system journaled...
2909                 vnode_t vp = vfs_context_cwd(context);
2910                 vnode_t jvp;
2911                 ExtendedVCB *vcb;
2912                 struct cat_attr jnl_attr, jinfo_attr;
2913                 struct cat_fork jnl_fork, jinfo_fork;
2914                 void *jnl = NULL;
2915                 int lockflags;
2916
2917                 /* Only root can enable journaling */
2918                 if (!is_suser()) {
2919                         return (EPERM);
2920                 }
2921                 if (vp == NULLVP)
2922                         return EINVAL;
2923
2924                 hfsmp = VTOHFS(vp);
2925                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2926                         return EROFS;
2927                 }
2928                 if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) {
2929                         printf("hfs: can't make a plain hfs volume journaled.\n");
2930                         return EINVAL;
2931                 }
2932
2933                 if (hfsmp->jnl) {
2934                     printf("hfs: volume @ mp %p is already journaled!\n", vnode_mount(vp));
2935                     return EAGAIN;
2936                 }
2937
2938                 vcb = HFSTOVCB(hfsmp);
2939                 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2940                 if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 ||
2941                         BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) {
2942
2943                         printf("hfs: volume has a btree w/non-contiguous nodes.  can not enable journaling.\n");
2944                         hfs_systemfile_unlock(hfsmp, lockflags);
2945                         return EINVAL;
2946                 }
2947                 hfs_systemfile_unlock(hfsmp, lockflags);
2948
2949                 // make sure these both exist!
2950                 if (   GetFileInfo(vcb, kHFSRootFolderID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0
2951                         || GetFileInfo(vcb, kHFSRootFolderID, ".journal", &jnl_attr, &jnl_fork) == 0) {
2952
2953                         return EINVAL;
2954                 }
2955
2956                 hfs_sync(hfsmp->hfs_mp, MNT_WAIT, context);
2957
2958                 printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
2959                            (off_t)name[2], (off_t)name[3]);
2960
2961                 //
2962                 // XXXdbg - note that currently (Sept, 08) hfs_util does not support
2963                 //          enabling the journal on a separate device so it is safe
2964                 //          to just copy hfs_devvp here.  If hfs_util gets the ability
2965                 //          to dynamically enable the journal on a separate device then
2966                 //          we will have to do the same thing as hfs_early_journal_init()
2967                 //          to locate and open the journal device.
2968                 //
2969                 jvp = hfsmp->hfs_devvp;
2970                 jnl = journal_create(jvp,
2971                                                          (off_t)name[2] * (off_t)HFSTOVCB(hfsmp)->blockSize
2972                                                          + HFSTOVCB(hfsmp)->hfsPlusIOPosOffset,
2973                                                          (off_t)((unsigned)name[3]),
2974                                                          hfsmp->hfs_devvp,
2975                                                          hfsmp->hfs_logical_block_size,
2976                                                          0,
2977                                                          0,
2978                                                          hfs_sync_metadata, hfsmp->hfs_mp);
2979
2980                 /*
2981                  * Set up the trim callback function so that we can add
2982                  * recently freed extents to the free extent cache once
2983                  * the transaction that freed them is written to the
2984                  * journal on disk.
2985                  */
2986                 if (jnl)
2987                         journal_trim_set_callback(jnl, hfs_trim_callback, hfsmp);
2988
2989                 if (jnl == NULL) {
2990                         printf("hfs: FAILED to create the journal!\n");
2991                         if (jvp && jvp != hfsmp->hfs_devvp) {
2992                                 vnode_clearmountedon(jvp);
2993                                 VNOP_CLOSE(jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel());
2994                         }
2995                         jvp = NULL;
2996
2997                         return EINVAL;
2998                 }
2999
3000                 hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
3001
3002                 /*
3003                  * Flush all dirty metadata buffers.
3004                  */
3005                 buf_flushdirtyblks(hfsmp->hfs_devvp, TRUE, 0, "hfs_sysctl");
3006                 buf_flushdirtyblks(hfsmp->hfs_extents_vp, TRUE, 0, "hfs_sysctl");
3007                 buf_flushdirtyblks(hfsmp->hfs_catalog_vp, TRUE, 0, "hfs_sysctl");
3008                 buf_flushdirtyblks(hfsmp->hfs_allocation_vp, TRUE, 0, "hfs_sysctl");
3009                 if (hfsmp->hfs_attribute_vp)
3010                         buf_flushdirtyblks(hfsmp->hfs_attribute_vp, TRUE, 0, "hfs_sysctl");
3011
3012                 HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1];
3013                 HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask;
3014                 hfsmp->jvp = jvp;
3015                 hfsmp->jnl = jnl;
3016
3017                 // save this off for the hack-y check in hfs_remove()
3018                 hfsmp->jnl_start        = (u_int32_t)name[2];
3019                 hfsmp->jnl_size         = (off_t)((unsigned)name[3]);
3020                 hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid;
3021                 hfsmp->hfs_jnlfileid    = jnl_attr.ca_fileid;
3022
3023                 vfs_setflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
3024
3025                 hfs_unlock_global (hfsmp);
3026                 hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
3027
3028                 {
3029                         fsid_t fsid;
3030
3031                         fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev;
3032                         fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp));
3033                         vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL);
3034                 }
3035                 return 0;
3036         } else if (name[0] == HFS_DISABLE_JOURNALING) {
3037                 // clear the journaling bit
3038                 vnode_t vp = vfs_context_cwd(context);
3039
3040                 /* Only root can disable journaling */
3041                 if (!is_suser()) {
3042                         return (EPERM);
3043                 }
3044                 if (vp == NULLVP)
3045                         return EINVAL;
3046
3047                 hfsmp = VTOHFS(vp);
3048
3049                 /*
3050                  * Disabling journaling is disallowed on volumes with directory hard links
3051                  * because we have not tested the relevant code path.
3052                  */
3053                 if (hfsmp->hfs_private_attr[DIR_HARDLINKS].ca_entries != 0){
3054                         printf("hfs: cannot disable journaling on volumes with directory hardlinks\n");
3055                         return EPERM;
3056                 }
3057
3058                 printf("hfs: disabling journaling for mount @ %p\n", vnode_mount(vp));
3059
3060                 hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
3061
3062                 // Lights out for you buddy!
3063                 journal_close(hfsmp->jnl);
3064                 hfsmp->jnl = NULL;
3065
3066                 if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
3067                         vnode_clearmountedon(hfsmp->jvp);
3068                         VNOP_CLOSE(hfsmp->jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel());
3069                         vnode_put(hfsmp->jvp);
3070                 }
3071                 hfsmp->jvp = NULL;
3072                 vfs_clearflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
3073                 hfsmp->jnl_start        = 0;
3074                 hfsmp->hfs_jnlinfoblkid = 0;
3075                 hfsmp->hfs_jnlfileid    = 0;
3076
3077                 HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask;
3078
3079                 hfs_unlock_global (hfsmp);
3080
3081                 hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
3082
3083                 {
3084                         fsid_t fsid;
3085
3086                         fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev;
3087                         fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp));
3088                         vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL);
3089                 }
3090                 return 0;
3091         } else if (name[0] == HFS_GET_JOURNAL_INFO) {
3092                 vnode_t vp = vfs_context_cwd(context);
3093                 off_t jnl_start, jnl_size;
3094
3095                 if (vp == NULLVP)
3096                         return EINVAL;
3097
3098                 /* 64-bit processes won't work with this sysctl -- can't fit a pointer into an int! */
3099                 if (proc_is64bit(current_proc()))
3100                         return EINVAL;
3101
3102                 hfsmp = VTOHFS(vp);
3103             if (hfsmp->jnl == NULL) {
3104                         jnl_start = 0;
3105                         jnl_size  = 0;
3106             } else {
3107                         jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
3108                         jnl_size  = (off_t)hfsmp->jnl_size;
3109             }
3110
3111             if ((error = copyout((caddr_t)&jnl_start, CAST_USER_ADDR_T(name[1]), sizeof(off_t))) != 0) {
3112                         return error;
3113                 }
3114             if ((error = copyout((caddr_t)&jnl_size, CAST_USER_ADDR_T(name[2]), sizeof(off_t))) != 0) {
3115                         return error;
3116                 }
3117
3118                 return 0;
3119         } else if (name[0] == HFS_SET_PKG_EXTENSIONS) {
3120
3121             return set_package_extensions_table((user_addr_t)((unsigned)name[1]), name[2], name[3]);
3122
3123         } else if (name[0] == VFS_CTL_QUERY) {
3124         struct sysctl_req *req;
3125         union union_vfsidctl vc;
3126         struct mount *mp;
3127             struct vfsquery vq;
3128
3129                 req = CAST_DOWN(struct sysctl_req *, oldp);     /* we're new style vfs sysctl. */
3130
3131         error = SYSCTL_IN(req, &vc, proc_is64bit(p)? sizeof(vc.vc64):sizeof(vc.vc32));
3132                 if (error) return (error);
3133
3134                 mp = vfs_getvfs(&vc.vc32.vc_fsid); /* works for 32 and 64 */
3135         if (mp == NULL) return (ENOENT);
3136
3137                 hfsmp = VFSTOHFS(mp);
3138                 bzero(&vq, sizeof(vq));
3139                 vq.vq_flags = hfsmp->hfs_notification_conditions;
3140                 return SYSCTL_OUT(req, &vq, sizeof(vq));;
3141         } else if (name[0] == HFS_REPLAY_JOURNAL) {
3142                 vnode_t devvp = NULL;
3143                 int device_fd;
3144                 if (namelen != 2) {
3145                         return (EINVAL);
3146                 }
3147                 device_fd = name[1];
3148                 error = file_vnode(device_fd, &devvp);
3149                 if (error) {
3150                         return error;
3151                 }
3152                 error = vnode_getwithref(devvp);
3153                 if (error) {
3154                         file_drop(device_fd);
3155                         return error;
3156                 }
3157                 error = hfs_journal_replay(devvp, context);
3158                 file_drop(device_fd);
3159                 vnode_put(devvp);
3160                 return error;
3161         } else if (name[0] == HFS_ENABLE_RESIZE_DEBUG) {
3162                 hfs_resize_debug = 1;
3163                 printf ("hfs_sysctl: Enabled volume resize debugging.\n");
3164                 return 0;
3165         }
3166
3167         return (ENOTSUP);
3168 }
3169
3170 /*
3171  * hfs_vfs_vget is not static since it is used in hfs_readwrite.c to support
3172  * the build_path ioctl.  We use it to leverage the code below that updates
3173  * the origin list cache if necessary
3174  */
3175
3176 int
3177 hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_context_t context)
3178 {
3179         int error;
3180         int lockflags;
3181         struct hfsmount *hfsmp;
3182
3183         hfsmp = VFSTOHFS(mp);
3184
3185         error = hfs_vget(hfsmp, (cnid_t)ino, vpp, 1, 0);
3186         if (error)
3187                 return (error);
3188
3189         /*
3190          * ADLs may need to have their origin state updated
3191          * since build_path needs a valid parent.  The same is true
3192          * for hardlinked files as well.  There isn't a race window here
3193          * in re-acquiring the cnode lock since we aren't pulling any data
3194          * out of the cnode; instead, we're going to the catalog.
3195          */
3196         if ((VTOC(*vpp)->c_flag & C_HARDLINK) &&
3197             (hfs_lock(VTOC(*vpp), HFS_EXCLUSIVE_LOCK) == 0)) {
3198                 cnode_t *cp = VTOC(*vpp);
3199                 struct cat_desc cdesc;
3200
3201                 if (!hfs_haslinkorigin(cp)) {
3202                         lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
3203                         error = cat_findname(hfsmp, (cnid_t)ino, &cdesc);
3204                         hfs_systemfile_unlock(hfsmp, lockflags);
3205                         if (error == 0) {
3206                                 if ((cdesc.cd_parentcnid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) &&
3207                                         (cdesc.cd_parentcnid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid)) {
3208                                         hfs_savelinkorigin(cp, cdesc.cd_parentcnid);
3209                                 }
3210                                 cat_releasedesc(&cdesc);
3211                         }
3212                 }
3213                 hfs_unlock(cp);
3214         }
3215         return (0);
3216 }
3217
3218
3219 /*
3220  * Look up an HFS object by ID.
3221  *
3222  * The object is returned with an iocount reference and the cnode locked.
3223  *
3224  * If the object is a file then it will represent the data fork.
3225  */
3226 int
3227 hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock, int allow_deleted)
3228 {
3229         struct vnode *vp = NULLVP;
3230         struct cat_desc cndesc;
3231         struct cat_attr cnattr;
3232         struct cat_fork cnfork;
3233         u_int32_t linkref = 0;
3234         int error;
3235
3236         /* Check for cnids that should't be exported. */
3237         if ((cnid < kHFSFirstUserCatalogNodeID) &&
3238             (cnid != kHFSRootFolderID && cnid != kHFSRootParentID)) {
3239                 return (ENOENT);
3240         }
3241         /* Don't export our private directories. */
3242         if (cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid ||
3243             cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) {
3244                 return (ENOENT);
3245         }
3246         /*
3247          * Check the hash first
3248          */
3249         vp = hfs_chash_getvnode(hfsmp, cnid, 0, skiplock, allow_deleted);
3250         if (vp) {
3251                 *vpp = vp;
3252                 return(0);
3253         }
3254
3255         bzero(&cndesc, sizeof(cndesc));
3256         bzero(&cnattr, sizeof(cnattr));
3257         bzero(&cnfork, sizeof(cnfork));
3258
3259         /*
3260          * Not in hash, lookup in catalog
3261          */
3262         if (cnid == kHFSRootParentID) {
3263                 static char hfs_rootname[] = "/";
3264
3265                 cndesc.cd_nameptr = (const u_int8_t *)&hfs_rootname[0];
3266                 cndesc.cd_namelen = 1;
3267                 cndesc.cd_parentcnid = kHFSRootParentID;
3268                 cndesc.cd_cnid = kHFSRootFolderID;
3269                 cndesc.cd_flags = CD_ISDIR;
3270
3271                 cnattr.ca_fileid = kHFSRootFolderID;
3272                 cnattr.ca_linkcount = 1;
3273                 cnattr.ca_entries = 1;
3274                 cnattr.ca_dircount = 1;
3275                 cnattr.ca_mode = (S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO);
3276         } else {
3277                 int lockflags;
3278                 cnid_t pid;
3279                 const char *nameptr;
3280
3281                 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
3282                 error = cat_idlookup(hfsmp, cnid, 0, 0, &cndesc, &cnattr, &cnfork);
3283                 hfs_systemfile_unlock(hfsmp, lockflags);
3284
3285                 if (error) {
3286                         *vpp = NULL;
3287                         return (error);
3288                 }
3289
3290                 /*
3291                  * Check for a raw hardlink inode and save its linkref.
3292                  */
3293                 pid = cndesc.cd_parentcnid;
3294                 nameptr = (const char *)cndesc.cd_nameptr;
3295
3296                 if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) &&
3297                     (bcmp(nameptr, HFS_INODE_PREFIX, HFS_INODE_PREFIX_LEN) == 0)) {
3298                         linkref = strtoul(&nameptr[HFS_INODE_PREFIX_LEN], NULL, 10);
3299
3300                 } else if ((pid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) &&
3301                            (bcmp(nameptr, HFS_DIRINODE_PREFIX, HFS_DIRINODE_PREFIX_LEN) == 0)) {
3302                         linkref = strtoul(&nameptr[HFS_DIRINODE_PREFIX_LEN], NULL, 10);
3303
3304                 } else if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) &&
3305                            (bcmp(nameptr, HFS_DELETE_PREFIX, HFS_DELETE_PREFIX_LEN) == 0)) {
3306                         *vpp = NULL;
3307                         cat_releasedesc(&cndesc);
3308                         return (ENOENT);  /* open unlinked file */
3309                 }
3310         }
3311
3312         /*
3313          * Finish initializing cnode descriptor for hardlinks.
3314          *
3315          * We need a valid name and parent for reverse lookups.
3316          */
3317         if (linkref) {
3318                 cnid_t nextlinkid;
3319                 cnid_t prevlinkid;
3320                 struct cat_desc linkdesc;
3321                 int lockflags;
3322
3323                 cnattr.ca_linkref = linkref;
3324
3325                 /*
3326                  * Pick up the first link in the chain and get a descriptor for it.
3327                  * This allows blind volfs paths to work for hardlinks.
3328                  */
3329                 if ((hfs_lookup_siblinglinks(hfsmp, linkref, &prevlinkid,  &nextlinkid) == 0) &&
3330                     (nextlinkid != 0)) {
3331                         lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
3332                         error = cat_findname(hfsmp, nextlinkid, &linkdesc);
3333                         hfs_systemfile_unlock(hfsmp, lockflags);
3334                         if (error == 0) {
3335                                 cat_releasedesc(&cndesc);
3336                                 bcopy(&linkdesc, &cndesc, sizeof(linkdesc));
3337                         }
3338                 }
3339         }
3340
3341         if (linkref) {
3342                 int newvnode_flags = 0;
3343
3344                 error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr,
3345                                                                 &cnfork, &vp, &newvnode_flags);
3346                 if (error == 0) {
3347                         VTOC(vp)->c_flag |= C_HARDLINK;
3348                         vnode_setmultipath(vp);
3349                 }
3350         } else {
3351                 struct componentname cn;
3352                 int newvnode_flags = 0;
3353
3354                 /* Supply hfs_getnewvnode with a component name. */
3355                 MALLOC_ZONE(cn.cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
3356                 cn.cn_nameiop = LOOKUP;
3357                 cn.cn_flags = ISLASTCN | HASBUF;
3358                 cn.cn_context = NULL;
3359                 cn.cn_pnlen = MAXPATHLEN;
3360                 cn.cn_nameptr = cn.cn_pnbuf;
3361                 cn.cn_namelen = cndesc.cd_namelen;
3362                 cn.cn_hash = 0;
3363                 cn.cn_consume = 0;
3364                 bcopy(cndesc.cd_nameptr, cn.cn_nameptr, cndesc.cd_namelen + 1);
3365
3366                 error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr,
3367                                                                 &cnfork, &vp, &newvnode_flags);
3368
3369                 if (error == 0 && (VTOC(vp)->c_flag & C_HARDLINK)) {
3370                         hfs_savelinkorigin(VTOC(vp), cndesc.cd_parentcnid);
3371                 }
3372                 FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI);
3373         }
3374         cat_releasedesc(&cndesc);
3375
3376         *vpp = vp;
3377         if (vp && skiplock) {
3378                 hfs_unlock(VTOC(vp));
3379         }
3380         return (error);
3381 }
3382
3383
3384 /*
3385  * Flush out all the files in a filesystem.
3386  */
3387 static int
3388 #if QUOTA
3389 hfs_flushfiles(struct mount *mp, int flags, struct proc *p)
3390 #else
3391 hfs_flushfiles(struct mount *mp, int flags, __unused struct proc *p)
3392 #endif /* QUOTA */
3393 {
3394         struct hfsmount *hfsmp;
3395         struct vnode *skipvp = NULLVP;
3396         int error;
3397 #if QUOTA
3398         int quotafilecnt;
3399         int i;
3400 #endif
3401
3402         hfsmp = VFSTOHFS(mp);
3403
3404 #if QUOTA
3405         /*
3406          * The open quota files have an indirect reference on
3407          * the root directory vnode.  We must account for this
3408          * extra reference when doing the intial vflush.
3409          */
3410         quotafilecnt = 0;
3411         if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) {
3412
3413                 /* Find out how many quota files we have open. */
3414                 for (i = 0; i < MAXQUOTAS; i++) {
3415                         if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP)
3416                                 ++quotafilecnt;
3417                 }
3418
3419                 /* Obtain the root vnode so we can skip over it. */
3420                 skipvp = hfs_chash_getvnode(hfsmp, kHFSRootFolderID, 0, 0, 0);
3421         }
3422 #endif /* QUOTA */
3423
3424         error = vflush(mp, skipvp, SKIPSYSTEM | SKIPSWAP | flags);
3425         if (error != 0)
3426                 return(error);
3427
3428         error = vflush(mp, skipvp, SKIPSYSTEM | flags);
3429
3430 #if QUOTA
3431         if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) {
3432                 if (skipvp) {
3433                         /*
3434                          * See if there are additional references on the
3435                          * root vp besides the ones obtained from the open
3436                          * quota files and the hfs_chash_getvnode call above.
3437                          */
3438                         if ((error == 0) &&
3439                             (vnode_isinuse(skipvp,  quotafilecnt))) {
3440                                 error = EBUSY;  /* root directory is still open */
3441                         }
3442                         hfs_unlock(VTOC(skipvp));
3443                         vnode_put(skipvp);
3444                 }
3445                 if (error && (flags & FORCECLOSE) == 0)
3446                         return (error);
3447
3448                 for (i = 0; i < MAXQUOTAS; i++) {
3449                         if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP)
3450                                 continue;
3451                         hfs_quotaoff(p, mp, i);
3452                 }
3453                 error = vflush(mp, NULLVP, SKIPSYSTEM | flags);
3454         }
3455 #endif /* QUOTA */
3456
3457         return (error);
3458 }
3459
3460 /*
3461  * Update volume encoding bitmap (HFS Plus only)
3462  */
3463 __private_extern__
3464 void
3465 hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding)
3466 {
3467 #define  kIndexMacUkrainian     48  /* MacUkrainian encoding is 152 */
3468 #define  kIndexMacFarsi         49  /* MacFarsi encoding is 140 */
3469
3470         u_int32_t       index;
3471
3472         switch (encoding) {
3473         case kTextEncodingMacUkrainian:
3474                 index = kIndexMacUkrainian;
3475                 break;
3476         case kTextEncodingMacFarsi:
3477                 index = kIndexMacFarsi;
3478                 break;
3479         default:
3480                 index = encoding;
3481                 break;
3482         }
3483
3484         if (index < 64 && (hfsmp->encodingsBitmap & (u_int64_t)(1ULL << index)) == 0) {
3485                 HFS_MOUNT_LOCK(hfsmp, TRUE)
3486                 hfsmp->encodingsBitmap |= (u_int64_t)(1ULL << index);
3487                 MarkVCBDirty(hfsmp);
3488                 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
3489         }
3490 }
3491
3492 /*
3493  * Update volume stats
3494  *
3495  * On journal volumes this will cause a volume header flush
3496  */
3497 int
3498 hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot)
3499 {
3500         struct timeval tv;
3501
3502         microtime(&tv);
3503
3504         lck_mtx_lock(&hfsmp->hfs_mutex);
3505
3506         MarkVCBDirty(hfsmp);
3507         hfsmp->hfs_mtime = tv.tv_sec;
3508
3509         switch (op) {
3510         case VOL_UPDATE:
3511                 break;
3512         case VOL_MKDIR:
3513                 if (hfsmp->hfs_dircount != 0xFFFFFFFF)
3514                         ++hfsmp->hfs_dircount;
3515                 if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF)
3516                         ++hfsmp->vcbNmRtDirs;
3517                 break;
3518         case VOL_RMDIR:
3519                 if (hfsmp->hfs_dircount != 0)
3520                         --hfsmp->hfs_dircount;
3521                 if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF)
3522                         --hfsmp->vcbNmRtDirs;
3523                 break;
3524         case VOL_MKFILE:
3525                 if (hfsmp->hfs_filecount != 0xFFFFFFFF)
3526                         ++hfsmp->hfs_filecount;
3527                 if (inroot && hfsmp->vcbNmFls != 0xFFFF)
3528                         ++hfsmp->vcbNmFls;
3529                 break;
3530         case VOL_RMFILE:
3531                 if (hfsmp->hfs_filecount != 0)
3532                         --hfsmp->hfs_filecount;
3533                 if (inroot && hfsmp->vcbNmFls != 0xFFFF)
3534                         --hfsmp->vcbNmFls;
3535                 break;
3536         }
3537
3538         lck_mtx_unlock(&hfsmp->hfs_mutex);
3539
3540         if (hfsmp->jnl) {
3541                 hfs_flushvolumeheader(hfsmp, 0, 0);
3542         }
3543
3544         return (0);
3545 }
3546
3547
3548 static int
3549 hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush)
3550 {
3551         ExtendedVCB *vcb = HFSTOVCB(hfsmp);
3552         struct filefork *fp;
3553         HFSMasterDirectoryBlock *mdb;
3554         struct buf *bp = NULL;
3555         int retval;
3556         int sector_size;
3557         ByteCount namelen;
3558
3559         sector_size = hfsmp->hfs_logical_block_size;
3560         retval = (int)buf_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sector_size), sector_size, NOCRED, &bp);
3561         if (retval) {
3562                 if (bp)
3563                         buf_brelse(bp);
3564                 return retval;
3565         }
3566
3567         lck_mtx_lock(&hfsmp->hfs_mutex);
3568
3569         mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sector_size));
3570
3571         mdb->drCrDate   = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->hfs_itime)));
3572         mdb->drLsMod    = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbLsMod)));
3573         mdb->drAtrb     = SWAP_BE16 (vcb->vcbAtrb);
3574         mdb->drNmFls    = SWAP_BE16 (vcb->vcbNmFls);
3575         mdb->drAllocPtr = SWAP_BE16 (vcb->nextAllocation);
3576         mdb->drClpSiz   = SWAP_BE32 (vcb->vcbClpSiz);
3577         mdb->drNxtCNID  = SWAP_BE32 (vcb->vcbNxtCNID);
3578         mdb->drFreeBks  = SWAP_BE16 (vcb->freeBlocks);
3579
3580         namelen = strlen((char *)vcb->vcbVN);
3581         retval = utf8_to_hfs(vcb, namelen, vcb->vcbVN, mdb->drVN);
3582         /* Retry with MacRoman in case that's how it was exported. */
3583         if (retval)
3584                 retval = utf8_to_mac_roman(namelen, vcb->vcbVN, mdb->drVN);
3585
3586         mdb->drVolBkUp  = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbVolBkUp)));
3587         mdb->drWrCnt    = SWAP_BE32 (vcb->vcbWrCnt);
3588         mdb->drNmRtDirs = SWAP_BE16 (vcb->vcbNmRtDirs);
3589         mdb->drFilCnt   = SWAP_BE32 (vcb->vcbFilCnt);
3590         mdb->drDirCnt   = SWAP_BE32 (vcb->vcbDirCnt);
3591
3592         bcopy(vcb->vcbFndrInfo, mdb->drFndrInfo, sizeof(mdb->drFndrInfo));
3593
3594         fp = VTOF(vcb->extentsRefNum);
3595         mdb->drXTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock);
3596         mdb->drXTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount);
3597         mdb->drXTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock);
3598         mdb->drXTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount);
3599         mdb->drXTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock);
3600         mdb->drXTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount);
3601         mdb->drXTFlSize = SWAP_BE32 (fp->ff_blocks * vcb->blockSize);
3602         mdb->drXTClpSiz = SWAP_BE32 (fp->ff_clumpsize);
3603         FTOC(fp)->c_flag &= ~C_MODIFIED;
3604
3605         fp = VTOF(vcb->catalogRefNum);
3606         mdb->drCTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock);
3607         mdb->drCTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount);
3608         mdb->drCTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock);
3609         mdb->drCTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount);
3610         mdb->drCTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock);
3611         mdb->drCTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount);
3612         mdb->drCTFlSize = SWAP_BE32 (fp->ff_blocks * vcb->blockSize);
3613         mdb->drCTClpSiz = SWAP_BE32 (fp->ff_clumpsize);
3614         FTOC(fp)->c_flag &= ~C_MODIFIED;
3615
3616         MarkVCBClean( vcb );
3617
3618         lck_mtx_unlock(&hfsmp->hfs_mutex);
3619
3620         /* If requested, flush out the alternate MDB */
3621         if (altflush) {
3622                 struct buf *alt_bp = NULL;
3623
3624                 if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sector_size, NOCRED, &alt_bp) == 0) {
3625                         bcopy(mdb, (char *)buf_dataptr(alt_bp) + HFS_ALT_OFFSET(sector_size), kMDBSize);
3626
3627                         (void) VNOP_BWRITE(alt_bp);
3628                 } else if (alt_bp)
3629                         buf_brelse(alt_bp);
3630         }
3631
3632         if (waitfor != MNT_WAIT)
3633                 buf_bawrite(bp);
3634         else
3635                 retval = VNOP_BWRITE(bp);
3636
3637         return (retval);
3638 }
3639
3640 /*
3641  *  Flush any dirty in-memory mount data to the on-disk
3642  *  volume header.
3643  *
3644  *  Note: the on-disk volume signature is intentionally
3645  *  not flushed since the on-disk "H+" and "HX" signatures
3646  *  are always stored in-memory as "H+".
3647  */
3648 int
3649 hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush)
3650 {
3651         ExtendedVCB *vcb = HFSTOVCB(hfsmp);
3652         struct filefork *fp;
3653         HFSPlusVolumeHeader *volumeHeader, *altVH;
3654         int retval;
3655         struct buf *bp, *alt_bp;
3656         int i;
3657         daddr64_t priIDSector;
3658         int critical;
3659         u_int16_t  signature;
3660         u_int16_t  hfsversion;
3661
3662         if (hfsmp->hfs_flags & HFS_READ_ONLY) {
3663                 return(0);
3664         }
3665         if (hfsmp->hfs_flags & HFS_STANDARD) {
3666                 return hfs_flushMDB(hfsmp, waitfor, altflush);
3667         }
3668         critical = altflush;
3669         priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
3670                                   HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));
3671
3672         if (hfs_start_transaction(hfsmp) != 0) {
3673             return EINVAL;
3674         }
3675
3676         bp = NULL;
3677         alt_bp = NULL;
3678
3679         retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
3680                         HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
3681                         hfsmp->hfs_physical_block_size, NOCRED, &bp);
3682         if (retval) {
3683                 printf("hfs: err %d reading VH blk (%s)\n", retval, vcb->vcbVN);
3684                 goto err_exit;
3685         }
3686
3687         volumeHeader = (HFSPlusVolumeHeader *)((char *)buf_dataptr(bp) +
3688                         HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
3689
3690         /*
3691          * Sanity check what we just read.  If it's bad, try the alternate
3692          * instead.
3693          */
3694         signature = SWAP_BE16 (volumeHeader->signature);
3695         hfsversion   = SWAP_BE16 (volumeHeader->version);
3696         if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) ||
3697             (hfsversion < kHFSPlusVersion) || (hfsversion > 100) ||
3698             (SWAP_BE32 (volumeHeader->blockSize) != vcb->blockSize)) {
3699                 printf("hfs: corrupt VH on %s, sig 0x%04x, ver %d, blksize %d%s\n",
3700                       vcb->vcbVN, signature, hfsversion,
3701                       SWAP_BE32 (volumeHeader->blockSize),
3702                       hfsmp->hfs_alt_id_sector ? "; trying alternate" : "");
3703                 hfs_mark_volume_inconsistent(hfsmp);
3704
3705                 if (hfsmp->hfs_alt_id_sector) {
3706                         retval = buf_meta_bread(hfsmp->hfs_devvp,
3707                             HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
3708                             hfsmp->hfs_physical_block_size, NOCRED, &alt_bp);
3709                         if (retval) {
3710                                 printf("hfs: err %d reading alternate VH (%s)\n", retval, vcb->vcbVN);
3711                                 goto err_exit;
3712                         }
3713
3714                         altVH = (HFSPlusVolumeHeader *)((char *)buf_dataptr(alt_bp) +
3715                                 HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size));
3716                         signature = SWAP_BE16(altVH->signature);
3717                         hfsversion = SWAP_BE16(altVH->version);
3718
3719                         if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) ||
3720                             (hfsversion < kHFSPlusVersion) || (kHFSPlusVersion > 100) ||
3721                             (SWAP_BE32(altVH->blockSize) != vcb->blockSize)) {
3722                                 printf("hfs: corrupt alternate VH on %s, sig 0x%04x, ver %d, blksize %d\n",
3723                                     vcb->vcbVN, signature, hfsversion,
3724                                     SWAP_BE32(altVH->blockSize));
3725                                 retval = EIO;
3726                                 goto err_exit;
3727                         }
3728
3729                         /* The alternate is plausible, so use it. */
3730                         bcopy(altVH, volumeHeader, kMDBSize);
3731                         buf_brelse(alt_bp);
3732                         alt_bp = NULL;
3733                 } else {
3734                         /* No alternate VH, nothing more we can do. */
3735                         retval = EIO;
3736                         goto err_exit;
3737                 }
3738         }
3739
3740         if (hfsmp->jnl) {
3741                 journal_modify_block_start(hfsmp->jnl, bp);
3742         }
3743
3744         /*
3745          * For embedded HFS+ volumes, update create date if it changed
3746          * (ie from a setattrlist call)
3747          */
3748         if ((vcb->hfsPlusIOPosOffset != 0) &&
3749             (SWAP_BE32 (volumeHeader->createDate) != vcb->localCreateDate)) {
3750                 struct buf *bp2;
3751                 HFSMasterDirectoryBlock *mdb;
3752
3753                 retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
3754                                 HFS_PHYSBLK_ROUNDDOWN(HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size), hfsmp->hfs_log_per_phys),
3755                                 hfsmp->hfs_physical_block_size, NOCRED, &bp2);
3756                 if (retval) {
3757                         if (bp2)
3758                                 buf_brelse(bp2);
3759                         retval = 0;
3760                 } else {
3761                         mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp2) +
3762                                 HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
3763
3764                         if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate )
3765                           {
3766                                 if (hfsmp->jnl) {
3767                                     journal_modify_block_start(hfsmp->jnl, bp2);
3768                                 }
3769
3770                                 mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate);       /* pick up the new create date */
3771
3772                                 if (hfsmp->jnl) {
3773                                         journal_modify_block_end(hfsmp->jnl, bp2, NULL, NULL);
3774                                 } else {
3775                                         (void) VNOP_BWRITE(bp2);                /* write out the changes */
3776                                 }
3777                           }
3778                         else
3779                           {
3780                                 buf_brelse(bp2);                                                /* just release it */
3781                           }
3782                   }
3783         }
3784
3785         lck_mtx_lock(&hfsmp->hfs_mutex);
3786
3787         /* Note: only update the lower 16 bits worth of attributes */
3788         volumeHeader->attributes       = SWAP_BE32 (vcb->vcbAtrb);
3789         volumeHeader->journalInfoBlock = SWAP_BE32 (vcb->vcbJinfoBlock);
3790         if (hfsmp->jnl) {
3791                 volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion);
3792         } else {
3793                 volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion);
3794         }
3795         volumeHeader->createDate        = SWAP_BE32 (vcb->localCreateDate);  /* volume create date is in local time */
3796         volumeHeader->modifyDate        = SWAP_BE32 (to_hfs_time(vcb->vcbLsMod));
3797         volumeHeader->backupDate        = SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp));
3798         volumeHeader->fileCount         = SWAP_BE32 (vcb->vcbFilCnt);
3799         volumeHeader->folderCount       = SWAP_BE32 (vcb->vcbDirCnt);
3800         volumeHeader->totalBlocks       = SWAP_BE32 (vcb->totalBlocks);
3801         volumeHeader->freeBlocks        = SWAP_BE32 (vcb->freeBlocks);
3802         volumeHeader->nextAllocation    = SWAP_BE32 (vcb->nextAllocation);
3803         volumeHeader->rsrcClumpSize     = SWAP_BE32 (vcb->vcbClpSiz);
3804         volumeHeader->dataClumpSize     = SWAP_BE32 (vcb->vcbClpSiz);
3805         volumeHeader->nextCatalogID     = SWAP_BE32 (vcb->vcbNxtCNID);
3806         volumeHeader->writeCount        = SWAP_BE32 (vcb->vcbWrCnt);
3807         volumeHeader->encodingsBitmap   = SWAP_BE64 (vcb->encodingsBitmap);
3808
3809         if (bcmp(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)) != 0) {
3810                 bcopy(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo));
3811                 critical = 1;
3812         }
3813
3814         /*
3815          * System files are only dirty when altflush is set.
3816          */
3817         if (altflush == 0) {
3818                 goto done;
3819         }
3820
3821         /* Sync Extents over-flow file meta data */
3822         fp = VTOF(vcb->extentsRefNum);
3823         if (FTOC(fp)->c_flag & C_MODIFIED) {
3824                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3825                         volumeHeader->extentsFile.extents[i].startBlock =
3826                                 SWAP_BE32 (fp->ff_extents[i].startBlock);
3827                         volumeHeader->extentsFile.extents[i].blockCount =
3828                                 SWAP_BE32 (fp->ff_extents[i].blockCount);
3829                 }
3830                 volumeHeader->extentsFile.logicalSize = SWAP_BE64 (fp->ff_size);
3831                 volumeHeader->extentsFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3832                 volumeHeader->extentsFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3833                 FTOC(fp)->c_flag &= ~C_MODIFIED;
3834         }
3835
3836         /* Sync Catalog file meta data */
3837         fp = VTOF(vcb->catalogRefNum);
3838         if (FTOC(fp)->c_flag & C_MODIFIED) {
3839                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3840                         volumeHeader->catalogFile.extents[i].startBlock =
3841                                 SWAP_BE32 (fp->ff_extents[i].startBlock);
3842                         volumeHeader->catalogFile.extents[i].blockCount =
3843                                 SWAP_BE32 (fp->ff_extents[i].blockCount);
3844                 }
3845                 volumeHeader->catalogFile.logicalSize = SWAP_BE64 (fp->ff_size);
3846                 volumeHeader->catalogFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3847                 volumeHeader->catalogFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3848                 FTOC(fp)->c_flag &= ~C_MODIFIED;
3849         }
3850
3851         /* Sync Allocation file meta data */
3852         fp = VTOF(vcb->allocationsRefNum);
3853         if (FTOC(fp)->c_flag & C_MODIFIED) {
3854                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3855                         volumeHeader->allocationFile.extents[i].startBlock =
3856                                 SWAP_BE32 (fp->ff_extents[i].startBlock);
3857                         volumeHeader->allocationFile.extents[i].blockCount =
3858                                 SWAP_BE32 (fp->ff_extents[i].blockCount);
3859                 }
3860                 volumeHeader->allocationFile.logicalSize = SWAP_BE64 (fp->ff_size);
3861                 volumeHeader->allocationFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3862                 volumeHeader->allocationFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3863                 FTOC(fp)->c_flag &= ~C_MODIFIED;
3864         }
3865
3866         /* Sync Attribute file meta data */
3867         if (hfsmp->hfs_attribute_vp) {
3868                 fp = VTOF(hfsmp->hfs_attribute_vp);
3869                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3870                         volumeHeader->attributesFile.extents[i].startBlock =
3871                                 SWAP_BE32 (fp->ff_extents[i].startBlock);
3872                         volumeHeader->attributesFile.extents[i].blockCount =
3873                                 SWAP_BE32 (fp->ff_extents[i].blockCount);
3874                 }
3875                 FTOC(fp)->c_flag &= ~C_MODIFIED;
3876                 volumeHeader->attributesFile.logicalSize = SWAP_BE64 (fp->ff_size);
3877                 volumeHeader->attributesFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3878                 volumeHeader->attributesFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3879         }
3880
3881         /* Sync Startup file meta data */
3882         if (hfsmp->hfs_startup_vp) {
3883                 fp = VTOF(hfsmp->hfs_startup_vp);
3884                 if (FTOC(fp)->c_flag & C_MODIFIED) {
3885                         for (i = 0; i < kHFSPlusExtentDensity; i++) {
3886                                 volumeHeader->startupFile.extents[i].startBlock =
3887                                         SWAP_BE32 (fp->ff_extents[i].startBlock);
3888                                 volumeHeader->startupFile.extents[i].blockCount =
3889                                         SWAP_BE32 (fp->ff_extents[i].blockCount);
3890                         }
3891                         volumeHeader->startupFile.logicalSize = SWAP_BE64 (fp->ff_size);
3892                         volumeHeader->startupFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3893                         volumeHeader->startupFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3894                         FTOC(fp)->c_flag &= ~C_MODIFIED;
3895                 }
3896         }
3897
3898 done:
3899         MarkVCBClean(hfsmp);
3900         lck_mtx_unlock(&hfsmp->hfs_mutex);
3901
3902         /* If requested, flush out the alternate volume header */
3903         if (altflush && hfsmp->hfs_alt_id_sector) {
3904                 if (buf_meta_bread(hfsmp->hfs_devvp,
3905                                 HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
3906                                 hfsmp->hfs_physical_block_size, NOCRED, &alt_bp) == 0) {
3907                         if (hfsmp->jnl) {
3908                                 journal_modify_block_start(hfsmp->jnl, alt_bp);
3909                         }
3910
3911                         bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) +
3912                                         HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size),
3913                                         kMDBSize);
3914
3915                         if (hfsmp->jnl) {
3916                                 journal_modify_block_end(hfsmp->jnl, alt_bp, NULL, NULL);
3917                         } else {
3918                                 (void) VNOP_BWRITE(alt_bp);
3919                         }
3920                 } else if (alt_bp)
3921                         buf_brelse(alt_bp);
3922         }
3923
3924         if (hfsmp->jnl) {
3925                 journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL);
3926         } else {
3927                 if (waitfor != MNT_WAIT)
3928                         buf_bawrite(bp);
3929                 else {
3930                     retval = VNOP_BWRITE(bp);
3931                     /* When critical data changes, flush the device cache */
3932                     if (critical && (retval == 0)) {
3933                         (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE,
3934                                          NULL, FWRITE, NULL);
3935                     }
3936                 }
3937         }
3938         hfs_end_transaction(hfsmp);
3939
3940         return (retval);
3941
3942 err_exit:
3943         if (alt_bp)
3944                 buf_brelse(alt_bp);
3945         if (bp)
3946                 buf_brelse(bp);
3947         hfs_end_transaction(hfsmp);
3948         return retval;
3949 }
3950
3951
3952 /*
3953  * Extend a file system.
3954  */
3955 int
3956 hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
3957 {
3958         struct proc *p = vfs_context_proc(context);
3959         kauth_cred_t cred = vfs_context_ucred(context);
3960         struct  vnode *vp;
3961         struct  vnode *devvp;
3962         struct  buf *bp;
3963         struct  filefork *fp = NULL;
3964         ExtendedVCB  *vcb;
3965         struct  cat_fork forkdata;
3966         u_int64_t  oldsize;
3967         u_int64_t  newblkcnt;
3968         u_int64_t  prev_phys_block_count;
3969         u_int32_t  addblks;
3970         u_int64_t  sector_count;
3971         u_int32_t  sector_size;
3972         u_int32_t  phys_sector_size;
3973         u_int32_t  overage_blocks;
3974         daddr64_t  prev_alt_sector;
3975         daddr_t    bitmapblks;
3976         int  lockflags = 0;
3977         int  error;
3978         int64_t oldBitmapSize;
3979         Boolean  usedExtendFileC = false;
3980         int transaction_begun = 0;
3981
3982         devvp = hfsmp->hfs_devvp;
3983         vcb = HFSTOVCB(hfsmp);
3984
3985         /*
3986          * - HFS Plus file systems only.
3987          * - Journaling must be enabled.
3988          * - No embedded volumes.
3989          */
3990         if ((vcb->vcbSigWord == kHFSSigWord) ||
3991              (hfsmp->jnl == NULL) ||
3992              (vcb->hfsPlusIOPosOffset != 0)) {
3993                 return (EPERM);
3994         }
3995         /*
3996          * If extending file system by non-root, then verify
3997          * ownership and check permissions.
3998          */
3999         if (suser(cred, NULL)) {
4000                 error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0, 0);
4001
4002                 if (error)
4003                         return (error);
4004                 error = hfs_owner_rights(hfsmp, VTOC(vp)->c_uid, cred, p, 0);
4005                 if (error == 0) {
4006                         error = hfs_write_access(vp, cred, p, false);
4007                 }
4008                 hfs_unlock(VTOC(vp));
4009                 vnode_put(vp);
4010                 if (error)
4011                         return (error);
4012
4013                 error = vnode_authorize(devvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, context);
4014                 if (error)
4015                         return (error);
4016         }
4017         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sector_size, 0, context)) {
4018                 return (ENXIO);
4019         }
4020         if (sector_size != hfsmp->hfs_logical_block_size) {
4021                 return (ENXIO);
4022         }
4023         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sector_count, 0, context)) {
4024                 return (ENXIO);
4025         }
4026         if ((sector_size * sector_count) < newsize) {
4027                 printf("hfs_extendfs: not enough space on device\n");
4028                 return (ENOSPC);
4029         }
4030         error = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_sector_size, 0, context);
4031         if (error) {
4032                 if ((error != ENOTSUP) && (error != ENOTTY)) {
4033                         return (ENXIO);
4034                 }
4035                 /* If ioctl is not supported, force physical and logical sector size to be same */
4036                 phys_sector_size = sector_size;
4037         }
4038         oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
4039
4040         /*
4041          * Validate new size.
4042          */
4043         if ((newsize <= oldsize) || (newsize % sector_size) || (newsize % phys_sector_size)) {
4044                 printf("hfs_extendfs: invalid size\n");
4045                 return (EINVAL);
4046         }
4047         newblkcnt = newsize / vcb->blockSize;
4048         if (newblkcnt > (u_int64_t)0xFFFFFFFF)
4049                 return (EOVERFLOW);
4050
4051         addblks = newblkcnt - vcb->totalBlocks;
4052
4053         if (hfs_resize_debug) {
4054                 printf ("hfs_extendfs: old: size=%qu, blkcnt=%u\n", oldsize, hfsmp->totalBlocks);
4055                 printf ("hfs_extendfs: new: size=%qu, blkcnt=%u, addblks=%u\n", newsize, (u_int32_t)newblkcnt, addblks);
4056         }
4057         printf("hfs_extendfs: will extend \"%s\" by %d blocks\n", vcb->vcbVN, addblks);
4058
4059         HFS_MOUNT_LOCK(hfsmp, TRUE);
4060         if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {
4061                 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
4062                 error = EALREADY;
4063                 goto out;
4064         }
4065         hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
4066         HFS_MOUNT_UNLOCK(hfsmp, TRUE);
4067
4068         /* Start with a clean journal. */
4069         hfs_journal_flush(hfsmp, TRUE);
4070
4071         /*
4072          * Enclose changes inside a transaction.
4073          */
4074         if (hfs_start_transaction(hfsmp) != 0) {
4075                 error = EINVAL;
4076                 goto out;
4077         }
4078         transaction_begun = 1;
4079
4080
4081         /* Update the hfsmp fields for the physical information about the device */
4082         prev_phys_block_count = hfsmp->hfs_logical_block_count;
4083         prev_alt_sector = hfsmp->hfs_alt_id_sector;
4084
4085         hfsmp->hfs_logical_block_count = sector_count;
4086         /*
4087          * Note that the new AltVH location must be based on the device's EOF rather than the new
4088          * filesystem's EOF, so we use logical_block_count here rather than newsize.
4089          */
4090         hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / sector_size) +
4091                                   HFS_ALT_SECTOR(sector_size, hfsmp->hfs_logical_block_count);
4092         hfsmp->hfs_logical_bytes = (uint64_t) sector_count * (uint64_t) sector_size;
4093
4094
4095         /*
4096          * Note: we take the attributes lock in case we have an attribute data vnode
4097          * which needs to change size.
4098          */
4099         lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4100         vp = vcb->allocationsRefNum;
4101         fp = VTOF(vp);
4102         bcopy(&fp->ff_data, &forkdata, sizeof(forkdata));
4103
4104         /*
4105          * Calculate additional space required (if any) by allocation bitmap.
4106          */
4107         oldBitmapSize = fp->ff_size;
4108         bitmapblks = roundup((newblkcnt+7) / 8, vcb->vcbVBMIOSize) / vcb->blockSize;
4109         if (bitmapblks > (daddr_t)fp->ff_blocks)
4110                 bitmapblks -= fp->ff_blocks;
4111         else
4112                 bitmapblks = 0;
4113
4114         /*
4115          * The allocation bitmap can contain unused bits that are beyond end of
4116          * current volume's allocation blocks.  Usually they are supposed to be
4117          * zero'ed out but there can be cases where they might be marked as used.
4118          * After extending the file system, those bits can represent valid
4119          * allocation blocks, so we mark all the bits from the end of current
4120          * volume to end of allocation bitmap as "free".
4121          *
4122          * Figure out the number of overage blocks before proceeding though,
4123          * so we don't add more bytes to our I/O than necessary.
4124          * First figure out the total number of blocks representable by the
4125          * end of the bitmap file vs. the total number of blocks in the new FS.
4126          * Then subtract away the number of blocks in the current FS.  This is how much
4127          * we can mark as free right now without having to grow the bitmap file.
4128          */
4129         overage_blocks = fp->ff_blocks * vcb->blockSize * 8;
4130         overage_blocks = MIN (overage_blocks, newblkcnt);
4131         overage_blocks -= vcb->totalBlocks;
4132
4133         BlockMarkFreeUnused(vcb, vcb->totalBlocks, overage_blocks);
4134
4135         if (bitmapblks > 0) {
4136                 daddr64_t blkno;
4137                 daddr_t blkcnt;
4138                 off_t bytesAdded;
4139
4140                 /*
4141                  * Get the bitmap's current size (in allocation blocks) so we know
4142                  * where to start zero filling once the new space is added.  We've
4143                  * got to do this before the bitmap is grown.
4144                  */
4145                 blkno  = (daddr64_t)fp->ff_blocks;
4146
4147                 /*
4148                  * Try to grow the allocation file in the normal way, using allocation
4149                  * blocks already existing in the file system.  This way, we might be
4150                  * able to grow the bitmap contiguously, or at least in the metadata
4151                  * zone.
4152                  */
4153                 error = ExtendFileC(vcb, fp, bitmapblks * vcb->blockSize, 0,
4154                                 kEFAllMask | kEFNoClumpMask | kEFReserveMask
4155                                 | kEFMetadataMask | kEFContigMask, &bytesAdded);
4156
4157                 if (error == 0) {
4158                         usedExtendFileC = true;
4159                 } else {
4160                         /*
4161                          * If the above allocation failed, fall back to allocating the new
4162                          * extent of the bitmap from the space we're going to add.  Since those
4163                          * blocks don't yet belong to the file system, we have to update the
4164                          * extent list directly, and manually adjust the file size.
4165                          */
4166                         bytesAdded = 0;
4167                         error = AddFileExtent(vcb, fp, vcb->totalBlocks, bitmapblks);
4168                         if (error) {
4169                                 printf("hfs_extendfs: error %d adding extents\n", error);
4170                                 goto out;
4171                         }
4172                         fp->ff_blocks += bitmapblks;
4173                         VTOC(vp)->c_blocks = fp->ff_blocks;
4174                         VTOC(vp)->c_flag |= C_MODIFIED;
4175                 }
4176
4177                 /*
4178                  * Update the allocation file's size to include the newly allocated
4179                  * blocks.  Note that ExtendFileC doesn't do this, which is why this
4180                  * statement is outside the above "if" statement.
4181                  */
4182                 fp->ff_size += (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize;
4183
4184                 /*
4185                  * Zero out the new bitmap blocks.
4186                  */
4187                 {
4188
4189                         bp = NULL;
4190                         blkcnt = bitmapblks;
4191                         while (blkcnt > 0) {
4192                                 error = (int)buf_meta_bread(vp, blkno, vcb->blockSize, NOCRED, &bp);
4193                                 if (error) {
4194                                         if (bp) {
4195                                                 buf_brelse(bp);
4196                                         }
4197                                         break;
4198                                 }
4199                                 bzero((char *)buf_dataptr(bp), vcb->blockSize);
4200                                 buf_markaged(bp);
4201                                 error = (int)buf_bwrite(bp);
4202                                 if (error)
4203                                         break;
4204                                 --blkcnt;
4205                                 ++blkno;
4206                         }
4207                 }
4208                 if (error) {
4209                         printf("hfs_extendfs: error %d  clearing blocks\n", error);
4210                         goto out;
4211                 }
4212                 /*
4213                  * Mark the new bitmap space as allocated.
4214                  *
4215                  * Note that ExtendFileC will have marked any blocks it allocated, so
4216                  * this is only needed if we used AddFileExtent.  Also note that this
4217                  * has to come *after* the zero filling of new blocks in the case where
4218                  * we used AddFileExtent (since the part of the bitmap we're touching
4219                  * is in those newly allocated blocks).
4220                  */
4221                 if (!usedExtendFileC) {
4222                         error = BlockMarkAllocated(vcb, vcb->totalBlocks, bitmapblks);
4223                         if (error) {
4224                                 printf("hfs_extendfs: error %d setting bitmap\n", error);
4225                                 goto out;
4226                         }
4227                         vcb->freeBlocks -= bitmapblks;
4228                 }
4229         }
4230         /*
4231          * Mark the new alternate VH as allocated.
4232          */
4233         if (vcb->blockSize == 512)
4234                 error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 2, 2);
4235         else
4236                 error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 1, 1);
4237         if (error) {
4238                 printf("hfs_extendfs: error %d setting bitmap (VH)\n", error);
4239                 goto out;
4240         }
4241         /*
4242          * Mark the old alternate VH as free.
4243          */
4244         if (vcb->blockSize == 512)
4245                 (void) BlockMarkFree(vcb, vcb->totalBlocks - 2, 2);
4246         else
4247                 (void) BlockMarkFree(vcb, vcb->totalBlocks - 1, 1);
4248         /*
4249          * Adjust file system variables for new space.
4250          */
4251         vcb->totalBlocks += addblks;
4252         vcb->freeBlocks += addblks;
4253         MarkVCBDirty(vcb);
4254         error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
4255         if (error) {
4256                 printf("hfs_extendfs: couldn't flush volume headers (%d)", error);
4257                 /*
4258                  * Restore to old state.
4259                  */
4260                 if (usedExtendFileC) {
4261                         (void) TruncateFileC(vcb, fp, oldBitmapSize, 0, FORK_IS_RSRC(fp),
4262                                                                  FTOC(fp)->c_fileid, false);
4263                 } else {
4264                         fp->ff_blocks -= bitmapblks;
4265                         fp->ff_size -= (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize;
4266                         /*
4267                          * No need to mark the excess blocks free since those bitmap blocks
4268                          * are no longer part of the bitmap.  But we do need to undo the
4269                          * effect of the "vcb->freeBlocks -= bitmapblks" above.
4270                          */
4271                         vcb->freeBlocks += bitmapblks;
4272                 }
4273                 vcb->totalBlocks -= addblks;
4274                 vcb->freeBlocks -= addblks;
4275                 hfsmp->hfs_logical_block_count = prev_phys_block_count;
4276                 hfsmp->hfs_alt_id_sector = prev_alt_sector;
4277                 MarkVCBDirty(vcb);
4278                 if (vcb->blockSize == 512) {
4279                         if (BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2)) {
4280                                 hfs_mark_volume_inconsistent(hfsmp);
4281                         }
4282                 } else {
4283                         if (BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1)) {
4284                                 hfs_mark_volume_inconsistent(hfsmp);
4285                         }
4286                 }
4287                 goto out;
4288         }
4289         /*
4290          * Invalidate the old alternate volume header.
4291          */
4292         bp = NULL;
4293         if (prev_alt_sector) {
4294                 if (buf_meta_bread(hfsmp->hfs_devvp,
4295                                 HFS_PHYSBLK_ROUNDDOWN(prev_alt_sector, hfsmp->hfs_log_per_phys),
4296                                 hfsmp->hfs_physical_block_size, NOCRED, &bp) == 0) {
4297                         journal_modify_block_start(hfsmp->jnl, bp);
4298
4299                         bzero((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), kMDBSize);
4300
4301                         journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL);
4302                 } else if (bp) {
4303                         buf_brelse(bp);
4304                 }
4305         }
4306
4307         /*
4308          * Update the metadata zone size based on current volume size
4309          */
4310         hfs_metadatazone_init(hfsmp, false);
4311
4312         /*
4313          * Adjust the size of hfsmp->hfs_attrdata_vp
4314          */
4315         if (hfsmp->hfs_attrdata_vp) {
4316                 struct cnode *attr_cp;
4317                 struct filefork *attr_fp;
4318
4319                 if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) {
4320                         attr_cp = VTOC(hfsmp->hfs_attrdata_vp);
4321                         attr_fp = VTOF(hfsmp->hfs_attrdata_vp);
4322
4323                         attr_cp->c_blocks = newblkcnt;
4324                         attr_fp->ff_blocks = newblkcnt;
4325                         attr_fp->ff_extents[0].blockCount = newblkcnt;
4326                         attr_fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize;
4327                         ubc_setsize(hfsmp->hfs_attrdata_vp, attr_fp->ff_size);
4328                         vnode_put(hfsmp->hfs_attrdata_vp);
4329                 }
4330         }
4331
4332         /*
4333          * Update the R/B Tree if necessary.  Since we don't have to drop the systemfile
4334          * locks in the middle of these operations like we do in the truncate case
4335          * where we have to relocate files, we can only update the red-black tree
4336          * if there were actual changes made to the bitmap.  Also, we can't really scan the
4337          * new portion of the bitmap before it has been allocated. The BlockMarkAllocated
4338          * routines are smart enough to avoid the r/b tree if the portion they are manipulating is
4339          * not currently controlled by the tree.
4340          *
4341          * We only update hfsmp->allocLimit if totalBlocks actually increased.
4342          */
4343         if (error == 0) {
4344                 UpdateAllocLimit(hfsmp, hfsmp->totalBlocks);
4345         }
4346
4347         /* Release all locks and sync up journal content before
4348          * checking and extending, if required, the journal
4349          */
4350         if (lockflags) {
4351                 hfs_systemfile_unlock(hfsmp, lockflags);
4352                 lockflags = 0;
4353         }
4354         if (transaction_begun) {
4355                 hfs_end_transaction(hfsmp);
4356                 hfs_journal_flush(hfsmp, TRUE);
4357                 transaction_begun = 0;
4358         }
4359
4360         /* Increase the journal size, if required. */
4361         error = hfs_extend_journal(hfsmp, sector_size, sector_count, context);
4362         if (error) {
4363                 printf ("hfs_extendfs: Could not extend journal size\n");
4364                 goto out_noalloc;
4365         }
4366
4367         /* Log successful extending */
4368         printf("hfs_extendfs: extended \"%s\" to %d blocks (was %d blocks)\n",
4369                hfsmp->vcbVN, hfsmp->totalBlocks, (u_int32_t)(oldsize/hfsmp->blockSize));
4370
4371 out:
4372         if (error && fp) {
4373                 /* Restore allocation fork. */
4374                 bcopy(&forkdata, &fp->ff_data, sizeof(forkdata));
4375                 VTOC(vp)->c_blocks = fp->ff_blocks;
4376
4377         }
4378
4379 out_noalloc:
4380         HFS_MOUNT_LOCK(hfsmp, TRUE);
4381         hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS;
4382         HFS_MOUNT_UNLOCK(hfsmp, TRUE);
4383         if (lockflags) {
4384                 hfs_systemfile_unlock(hfsmp, lockflags);
4385         }
4386         if (transaction_begun) {
4387                 hfs_end_transaction(hfsmp);
4388                 hfs_journal_flush(hfsmp, FALSE);
4389                 /* Just to be sure, sync all data to the disk */
4390                 (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
4391         }
4392
4393         return MacToVFSError(error);
4394 }
4395
4396 #define HFS_MIN_SIZE  (32LL * 1024LL * 1024LL)	/* smallest newsize, in bytes (32 MiB), that hfs_truncatefs() accepts */
4397
4398 /*
4399  * Truncate a file system (while still mounted).
      *
      * Shrinks the mounted volume described by hfsmp to newsize bytes.  Only one
      * resize may be in flight: HFS_RESIZE_IN_PROGRESS is set on entry and is
      * cleared again at the "out" label, which every path except the EALREADY
      * early return passes through.
      *
      * hfsmp   - mount to shrink.  It must have a journal (hfsmp->jnl != NULL)
      *           and must not be embedded (hfsPlusIOPosOffset == 0).
      * newsize - requested volume size in bytes.  It must be at least
      *           HFS_MIN_SIZE, smaller than the current size, and a multiple of
      *           both the logical and the physical block size.
      * context - handed to hfs_reclaimspace() and to the final
      *           DKIOCSYNCHRONIZECACHE ioctl.
      *
      * Returns EALREADY directly if a resize is already in progress.  Otherwise
      * the result is passed through MacToVFSError(); the values set here are
      * 0 (success), EPERM (not journaled / embedded volume), EINVAL (bad
      * newsize, or a transaction could not be started), ENOSPC (not enough free
      * blocks, or hfs_reclaimspace() failed), EAGAIN (blocks are still allocated
      * past the new end after reclaiming), or the error from BlockMarkAllocated().
      * A failure to flush the volume header after the in-memory state has been
      * switched to the new size panics.
4400  */
4401 int
4402 hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
4403 {
4404         struct  buf *bp = NULL;
4405         u_int64_t oldsize;	/* current volume size in bytes */
4406         u_int32_t newblkcnt;	/* newsize expressed in blocks of hfsmp->blockSize */
4407         u_int32_t reclaimblks = 0;	/* number of blocks being cut off the end of the volume */
4408         int lockflags = 0;	/* non-zero while system file locks are held */
4409         int transaction_begun = 0;	/* non-zero while a transaction is open */
4410         Boolean updateFreeBlocks = false;	/* true once freeBlocks has been reduced by reclaimblks */
4411         Boolean disable_sparse = false;	/* true if HFS_HAS_SPARSE_DEVICE was cleared and must be restored */
4412         int error = 0;
4413
4414         lck_mtx_lock(&hfsmp->hfs_mutex);
4415         if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {	/* another resize already owns the flag */
4416                 lck_mtx_unlock(&hfsmp->hfs_mutex);
4417                 return (EALREADY);
4418         }
4419         hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
4420         hfsmp->hfs_resize_blocksmoved = 0;
4421         hfsmp->hfs_resize_totalblocks = 0;
4422         hfsmp->hfs_resize_progress = 0;
4423         lck_mtx_unlock(&hfsmp->hfs_mutex);
4424
4425         /*
4426          * - Journaled HFS Plus volumes only.
4427          * - No embedded volumes.
4428          */
4429         if ((hfsmp->jnl == NULL) ||
4430             (hfsmp->hfsPlusIOPosOffset != 0)) {
4431                 error = EPERM;
4432                 goto out;
4433         }
4434         oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
4435         newblkcnt = newsize / hfsmp->blockSize;
4436         reclaimblks = hfsmp->totalBlocks - newblkcnt;	/* only meaningful once newsize passes the validity check below */
4437
4438         if (hfs_resize_debug) {
4439                 printf ("hfs_truncatefs: old: size=%qu, blkcnt=%u, freeblks=%u\n", oldsize, hfsmp->totalBlocks, hfs_freeblks(hfsmp, 1));
4440                 printf ("hfs_truncatefs: new: size=%qu, blkcnt=%u, reclaimblks=%u\n", newsize, newblkcnt, reclaimblks);
4441         }
4442
4443         /* Make sure new size is valid. */
4444         if ((newsize < HFS_MIN_SIZE) ||
4445             (newsize >= oldsize) ||
4446             (newsize % hfsmp->hfs_logical_block_size) ||
4447             (newsize % hfsmp->hfs_physical_block_size)) {
4448                 printf ("hfs_truncatefs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize);
4449                 error = EINVAL;
4450                 goto out;
4451         }
4452
4453         /*
4454          * Make sure that the file system has enough free blocks to reclaim.
4455          *
4456          * Before resize, the disk is divided into four zones -
4457          *      A. Allocated_Stationary - These are allocated blocks that exist
4458          *         before the new end of disk.  These blocks will not be
4459          *         relocated or modified during resize.
4460          *      B. Free_Stationary - These are free blocks that exist before the
4461          *         new end of disk.  These blocks can be used for any new
4462          *         allocations during resize, including allocation for relocating
4463          *         data from the area of disk being reclaimed.
4464          *      C. Allocated_To-Reclaim - These are allocated blocks that exist
4465          *         beyond the new end of disk.  These blocks need to be reclaimed
4466          *         during resize by allocating equal number of blocks in Free
4467          *         Stationary zone and copying the data.
4468          *      D. Free_To-Reclaim - These are free blocks that exist beyond the
4469          *         new end of disk.  Nothing special needs to be done to reclaim
4470          *         them.
4471          *
4472          * Total number of blocks on the disk before resize:
4473          * ------------------------------------------------
4474          *      Total Blocks = Allocated_Stationary + Free_Stationary +
4475          *                     Allocated_To-Reclaim + Free_To-Reclaim
4476          *
4477          * Total number of blocks that need to be reclaimed:
4478          * ------------------------------------------------
4479          *      Blocks to Reclaim = Allocated_To-Reclaim + Free_To-Reclaim
4480          *
4481          * Note that the check below also makes sure that we have enough space
4482          * to relocate data from Allocated_To-Reclaim to Free_Stationary.
4483          * Therefore we do not need to check total number of blocks to relocate
4484          * later in the code.
4485          *
4486          * The condition below gets converted to:
4487          *
4488          * Allocated To-Reclaim + Free To-Reclaim >= Free Stationary + Free To-Reclaim
4489          *
4490          * which is equivalent to:
4491          *
4492          *              Allocated To-Reclaim >= Free Stationary
              *
              * This is the failure condition: when it holds, the resize is
              * rejected with ENOSPC.
4493          */
4494         if (reclaimblks >= hfs_freeblks(hfsmp, 1)) {
4495                 printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1));
4496                 error = ENOSPC;
4497                 goto out;
4498         }
4499
4500         /* Start with a clean journal. */
4501         hfs_journal_flush(hfsmp, TRUE);
4502
4503         if (hfs_start_transaction(hfsmp) != 0) {
4504                 error = EINVAL;
4505                 goto out;
4506         }
4507         transaction_begun = 1;
4508
4509         /* Take the bitmap lock to update the alloc limit field */
4510         lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4511
4512         /*
4513          * Prevent new allocations from using the part we're trying to truncate.
4514          *
4515          * NOTE: allocLimit is set to the allocation block number where the new
4516          * alternate volume header will be.  That way there will be no files to
4517          * interfere with allocating the new alternate volume header, and no files
4518          * in the allocation blocks beyond (i.e. the blocks we're trying to
4519          * truncate away).
4520          *
4521          * Also shrink the red-black tree if needed.
              *
              * NOTE(review): the value UpdateAllocLimit() stores in error is never
              * examined; error is reassigned (by hfs_reclaimspace() or
              * BlockMarkAllocated() below) before it is next tested.
4522          */
4523         if (hfsmp->blockSize == 512) {
4524                 error = UpdateAllocLimit (hfsmp, newblkcnt - 2);
4525         }
4526         else {
4527                 error = UpdateAllocLimit (hfsmp, newblkcnt - 1);
4528         }
4529
4530         /* Sparse devices use first fit allocation which is not ideal
4531          * for volume resize which requires best fit allocation.  If a
4532          * sparse device is being truncated, disable the sparse device
4533          * property temporarily for the duration of resize.  Also reset
4534          * the free extent cache so that it is rebuilt as sorted by
4535          * totalBlocks instead of startBlock.
4536          *
4537          * Note that this will affect all allocations on the volume and
4538          * ideal fix would be just to modify resize-related allocations,
4539          * but it will result in complexity like handling of two free
4540          * extent caches sorted differently, etc.  So we stick to this
4541          * solution for now.
4542          */
4543         HFS_MOUNT_LOCK(hfsmp, TRUE);
4544         if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
4545                 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
4546                 ResetVCBFreeExtCache(hfsmp);
4547                 disable_sparse = true;	/* restored at "out" */
4548         }
4549
4550         /*
4551          * Update the volume free block count to reflect the total number
4552          * of free blocks that will exist after a successful resize.
4553          * Relocation of extents will result in no net change in the total
4554          * free space on the disk.  Therefore the code that allocates
4555          * space for new extent and deallocates the old extent explicitly
4556          * prevents updating the volume free block count.  It will also
4557          * prevent false disk full error when the number of blocks in
4558          * an extent being relocated is more than the free blocks that
4559          * will exist after the volume is resized.
4560          */
4561         hfsmp->freeBlocks -= reclaimblks;
4562         updateFreeBlocks = true;	/* tells "out" to add reclaimblks back on failure */
4563         HFS_MOUNT_UNLOCK(hfsmp, TRUE);
4564
4565         if (lockflags) {
4566                 hfs_systemfile_unlock(hfsmp, lockflags);
4567                 lockflags = 0;
4568         }
4569
4570         /*
4571          * Update the metadata zone size to match the new volume size;
4572          * if the volume is now too small, the metadata zone might be disabled.
4573          */
4574         hfs_metadatazone_init(hfsmp, false);
4575
4576         /*
4577          * If some files have blocks at or beyond the location of the
4578          * new alternate volume header, recalculate free blocks and
4579          * reclaim blocks.  Otherwise just update free blocks count.
4580          *
4581          * The current allocLimit is set to the location of new alternate
4582          * volume header, and reclaimblks are the total number of blocks
4583          * that need to be reclaimed.  So the check below is really
4584          * ignoring the blocks allocated for old alternate volume header.
4585          */
4586         if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) {
4587                 /*
4588                  * hfs_reclaimspace will use separate transactions when
4589                  * relocating files (so we don't overwhelm the journal).
4590                  */
4591                 hfs_end_transaction(hfsmp);
4592                 transaction_begun = 0;
4593
4594                 /* Attempt to reclaim some space. */
4595                 error = hfs_reclaimspace(hfsmp, hfsmp->allocLimit, reclaimblks, context);
4596                 if (error != 0) {
4597                         printf("hfs_truncatefs: couldn't reclaim space on %s (error=%d)\n", hfsmp->vcbVN, error);
4598                         error = ENOSPC;	/* the original error is only logged above */
4599                         goto out;
4600                 }
4601                 if (hfs_start_transaction(hfsmp) != 0) {
4602                         error = EINVAL;
4603                         goto out;
4604                 }
4605                 transaction_begun = 1;
4606
4607                 /* Check if we're clear now. */
4608                 error = hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks);
4609                 if (error != 0) {
4610                         printf("hfs_truncatefs: didn't reclaim enough space on %s (error=%d)\n", hfsmp->vcbVN, error);
4611                         error = EAGAIN;  /* tell client to try again */
4612                         goto out;
4613                 }
4614         }
4615
4616         /*
4617          * Note: we take the attributes lock in case we have an attribute data vnode
4618          * which needs to change size.
4619          */
4620         lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4621
4622         /*
4623          * Allocate last 1KB for alternate volume header.
              * (Two blocks when blockSize is 512, otherwise one, starting at
              * allocLimit.)
4624          */
4625         error = BlockMarkAllocated(hfsmp, hfsmp->allocLimit, (hfsmp->blockSize == 512) ? 2 : 1);
4626         if (error) {
4627                 printf("hfs_truncatefs: Error %d allocating new alternate volume header\n", error);
4628                 goto out;
4629         }
4630
4631         /*
4632          * Mark the old alternate volume header as free.
4633          * We don't bother shrinking allocation bitmap file.
4634          */
4635         if (hfsmp->blockSize == 512)
4636                 (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2);
4637         else
4638                 (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1);
4639
4640         /*
4641          * Invalidate the existing alternate volume header.
4642          *
4643          * Don't include this in a transaction (don't call journal_modify_block)
4644          * since this block will be outside of the truncated file system!
              *
              * A read failure here is tolerated: the buffer (if any) is released
              * and the shrink continues.  The error value is overwritten by the
              * volume header flush below.
4645          */
4646         if (hfsmp->hfs_alt_id_sector) {
4647                 error = buf_meta_bread(hfsmp->hfs_devvp,
4648                                 HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
4649                                 hfsmp->hfs_physical_block_size, NOCRED, &bp);
4650                 if (error == 0) {
4651                         bzero((void*)((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size)), kMDBSize);
4652                         (void) VNOP_BWRITE(bp);
4653                 } else {
4654                         if (bp) {
4655                                 buf_brelse(bp);
4656                         }
4657                 }
4658                 bp = NULL;
4659         }
4660
4661         /* Log successful shrinking. */
4662         printf("hfs_truncatefs: shrank \"%s\" to %d blocks (was %d blocks)\n",
4663                hfsmp->vcbVN, newblkcnt, hfsmp->totalBlocks);
4664
4665         /*
4666          * Adjust file system variables and flush them to disk.
4667          */
4668         hfsmp->totalBlocks = newblkcnt;
4669         hfsmp->hfs_logical_block_count = newsize / hfsmp->hfs_logical_block_size;
4670         hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
4671
4672         /*
4673          * Note that although the logical block count is updated here, it is only done for
4674          * the benefit of the partition management software.  The logical block count change
4675          * has not actually been propagated to the disk device yet.
4676          */
4677
4678         hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count);
4679         MarkVCBDirty(hfsmp);
4680         error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
4681         if (error)	/* in-memory state already describes the smaller volume; no rollback is attempted */
4682                 panic("hfs_truncatefs: unexpected error flushing volume header (%d)\n", error);
4683
4684         /*
4685          * Adjust the size of hfsmp->hfs_attrdata_vp
              * so that its cnode, fork and UBC size all describe newblkcnt blocks.
4686          */
4687         if (hfsmp->hfs_attrdata_vp) {
4688                 struct cnode *cp;
4689                 struct filefork *fp;
4690
4691                 if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) {
4692                         cp = VTOC(hfsmp->hfs_attrdata_vp);
4693                         fp = VTOF(hfsmp->hfs_attrdata_vp);
4694
4695                         cp->c_blocks = newblkcnt;
4696                         fp->ff_blocks = newblkcnt;
4697                         fp->ff_extents[0].blockCount = newblkcnt;
4698                         fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize;
4699                         ubc_setsize(hfsmp->hfs_attrdata_vp, fp->ff_size);
4700                         vnode_put(hfsmp->hfs_attrdata_vp);
4701                 }
4702         }
4703
4704 out:
4705         /*
4706          * Update the allocLimit to acknowledge the last one or two blocks now.
4707          * Add it to the tree as well if necessary.
              *
              * This label is reached on success and on failure.  totalBlocks is
              * only changed on the success path, so after a failure this puts
              * allocLimit back to the original block count.
              *
              * NOTE(review): on paths that arrive here with lockflags == 0 (the
              * early validation failures and the reclaim failures) this call is
              * made without the bitmap lock that is taken for the earlier
              * UpdateAllocLimit() call -- confirm that this is intended.
4708          */
4709         UpdateAllocLimit (hfsmp, hfsmp->totalBlocks);
4710
4711         HFS_MOUNT_LOCK(hfsmp, TRUE);
4712         if (disable_sparse == true) {
4713                 /* Now that resize is completed, set the volume to be sparse
4714                  * device again so that all further allocations will be first
4715                  * fit instead of best fit.  Reset free extent cache so that
4716                  * it is rebuilt.
4717                  */
4718                 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
4719                 ResetVCBFreeExtCache(hfsmp);
4720         }
4721
4722         if (error && (updateFreeBlocks == true)) {	/* undo the "freeBlocks -= reclaimblks" done above */
4723                 hfsmp->freeBlocks += reclaimblks;
4724         }
4725
4726         if (hfsmp->nextAllocation >= hfsmp->allocLimit) {	/* allocation hint is at or past the limit: move it to just after the metadata zone */
4727                 hfsmp->nextAllocation = hfsmp->hfs_metazone_end + 1;
4728         }
4729         hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS;
4730         HFS_MOUNT_UNLOCK(hfsmp, TRUE);
4731
4732         /* On error, reset the metadata zone for original volume size */
4733         if (error && (updateFreeBlocks == true)) {
4734                 hfs_metadatazone_init(hfsmp, false);
4735         }
4736
4737         if (lockflags) {
4738                 hfs_systemfile_unlock(hfsmp, lockflags);
4739         }
4740         if (transaction_begun) {
4741                 hfs_end_transaction(hfsmp);
4742                 hfs_journal_flush(hfsmp, FALSE);
4743                 /* Just to be sure, sync all data to the disk */
4744                 (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
4745         }
4746
4747         return MacToVFSError(error);
4748 }
4749
4750
4751 /*
4752  * Invalidate the physical block numbers associated with buffer cache blocks
4753  * in the given extent of the given vnode.
      *
      * struct hfs_inval_blk_no carries the range from hfs_invalidate_sectors()
      * to hfs_invalidate_block_numbers_callback() through the opaque argument
      * pointer of buf_iterate().
4754  */
4755 struct hfs_inval_blk_no {
4756         daddr64_t sectorStart;	/* first block number (compared against buf_blkno()) in the range */
4757         daddr64_t sectorCount;	/* length of the range: [sectorStart, sectorStart + sectorCount) */
4758 };
/*
 * buf_iterate() callback used by hfs_invalidate_sectors().
 *
 * args_in points to a struct hfs_inval_blk_no.  If the buffer's current block
 * number (buf_blkno) lies in [sectorStart, sectorStart + sectorCount), it is
 * replaced with the buffer's logical block number (buf_lblkno); any other
 * buffer is left untouched.  Always returns BUF_RETURNED.
 */
4759 static int
4760 hfs_invalidate_block_numbers_callback(buf_t bp, void *args_in)
4761 {
4762         /* args_in is the range descriptor handed to buf_iterate(). */
4763         struct hfs_inval_blk_no *range = args_in;
4764         daddr64_t first = range->sectorStart;
4765         daddr64_t mapped = buf_blkno(bp);
4766         /* Buffers whose block number is outside the range are left alone. */
4767         if (mapped < first || mapped >= first + range->sectorCount)
4768                 return BUF_RETURNED;
4769         /* In range: fall back to the logical block number. */
4770         buf_setblkno(bp, buf_lblkno(bp));
4771         return BUF_RETURNED;
4772 }
/*
 * Walk the clean and dirty buffers of vp with buf_iterate() and let
 * hfs_invalidate_block_numbers_callback() reset the block number of every
 * buffer that falls in [sectorStart, sectorStart + sectorCount).
 */
4773 static void
4774 hfs_invalidate_sectors(struct vnode *vp, daddr64_t sectorStart, daddr64_t sectorCount)
4775 {
4776         struct hfs_inval_blk_no range;
4777
4778         range.sectorCount = sectorCount;
4779         range.sectorStart = sectorStart;
4780         buf_iterate(vp, hfs_invalidate_block_numbers_callback, BUF_SCAN_CLEAN | BUF_SCAN_DIRTY, &range);
4781 }
4782
4783
4784 /*
4785  * Copy the contents of an extent to a new location.  Also invalidates the
4786  * physical block number of any buffer cache block in the copied extent
4787  * (so that if the block is written, it will go through VNOP_BLOCKMAP to
4788  * determine the new physical block number).
4789  *
4790  * At this point, for regular files, we hold the truncate lock exclusive
4791  * and the cnode lock exclusive.
4792  */
4793 static int
4794 hfs_copy_extent(
4795         struct hfsmount *hfsmp,
4796         struct vnode *vp,               /* The file whose extent is being copied. */
4797         u_int32_t oldStart,             /* The start of the source extent. */
4798         u_int32_t newStart,             /* The start of the destination extent. */
4799         u_int32_t blockCount,   /* The number of allocation blocks to copy. */
4800         vfs_context_t context)
4801 {
4802         int err = 0;
4803         size_t bufferSize;
4804         void *buffer = NULL;
4805         struct vfsioattr ioattr;
4806         buf_t bp = NULL;
4807         off_t resid;
4808         size_t ioSize;
4809         u_int32_t ioSizeSectors;        /* Device sectors in this I/O */
4810         daddr64_t srcSector, destSector;
4811         u_int32_t sectorsPerBlock = hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4812 #if CONFIG_PROTECT
4813         int cpenabled = 0;
4814 #endif
4815
4816         /*
4817          * Sanity check that we have locked the vnode of the file we're copying.
4818          *
4819          * But since hfs_systemfile_lock() doesn't actually take the lock on
4820          * the allocation file if a journal is active, ignore the check if the
4821          * file being copied is the allocation file.
4822          */
4823         struct cnode *cp = VTOC(vp);
4824         if (cp != hfsmp->hfs_allocation_cp && cp->c_lockowner != current_thread())
4825                 panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp);
4826
4827 #if CONFIG_PROTECT
4828         /*
4829          * Prepare the CP blob and get it ready for use, if necessary.
4830          *
4831          * Note that we specifically *exclude* system vnodes (catalog, bitmap, extents, EAs),
4832          * because they are implicitly protected via the media key on iOS.  As such, they
4833          * must not be relocated except with the media key.  So it is OK to not pass down
4834          * a special cpentry to the IOMedia/LwVM code for handling.
4835          */
4836         if (!vnode_issystem (vp) && vnode_isreg(vp) && cp_fs_protected (hfsmp->hfs_mp)) {
4837                 int cp_err = 0;
4838                 /*
4839                  * Ideally, the file whose extents we are about to manipulate is using the
4840                  * newer offset-based IVs so that we can manipulate it regardless of the
4841                  * current lock state.  However, we must maintain support for older-style
4842                  * EAs.
4843                  *
4844                  * For the older EA case, the IV was tied to the device LBA for file content.
4845                  * This means that encrypted data cannot be moved from one location to another
4846                  * in the filesystem without garbling the IV data.  As a result, we need to
4847                  * access the file's plaintext because we cannot do our AES-symmetry trick
4848                  * here.  This requires that we attempt a key-unwrap here (via cp_handle_relocate)
4849                  * to make forward progress.  If the keys are unavailable then we will
4850                  * simply stop the resize in its tracks here since we cannot move
4851                  * this extent at this time.
4852                  */
4853                 if ((cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) == 0) {
4854                         cp_err = cp_handle_relocate(cp, hfsmp);
4855                 }
4856
4857                 if (cp_err) {
4858                         printf ("hfs_copy_extent: cp_handle_relocate failed (%d) \n", cp_err);
4859                         return cp_err;
4860                 }
4861
4862                 cpenabled = 1;
4863         }
4864 #endif
4865
4866
4867         /*
4868          * Determine the I/O size to use
4869          *
4870          * NOTE: Many external drives will result in an ioSize of 128KB.
4871          * TODO: Should we use a larger buffer, doing several consecutive
4872          * reads, then several consecutive writes?
4873          */
4874         vfs_ioattr(hfsmp->hfs_mp, &ioattr);
4875         bufferSize = MIN(ioattr.io_maxreadcnt, ioattr.io_maxwritecnt);
4876         if (kmem_alloc(kernel_map, (vm_offset_t*) &buffer, bufferSize))
4877                 return ENOMEM;
4878
4879         /* Get a buffer for doing the I/O */
4880         bp = buf_alloc(hfsmp->hfs_devvp);
4881         buf_setdataptr(bp, (uintptr_t)buffer);
4882
4883         resid = (off_t) blockCount * (off_t) hfsmp->blockSize;
4884         srcSector = (daddr64_t) oldStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4885         destSector = (daddr64_t) newStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4886         while (resid > 0) {
4887                 ioSize = MIN(bufferSize, (size_t) resid);
4888                 ioSizeSectors = ioSize / hfsmp->hfs_logical_block_size;
4889
4890                 /* Prepare the buffer for reading */
4891                 buf_reset(bp, B_READ);
4892                 buf_setsize(bp, ioSize);
4893                 buf_setcount(bp, ioSize);
4894                 buf_setblkno(bp, srcSector);
4895                 buf_setlblkno(bp, srcSector);
4896
4897                 /*
4898                  * Note that because this is an I/O to the device vp
4899                  * it is correct to have lblkno and blkno both point to the
4900                  * start sector being read from.  If it were being issued against the
4901                  * underlying file then that would be different.
4902                  */
4903
4904                 /* Attach the new CP blob  to the buffer if needed */
4905 #if CONFIG_PROTECT
4906                 if (cpenabled) {
4907                         if (cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
4908                                 /* attach the RELOCATION_INFLIGHT flag for the underlying call to VNOP_STRATEGY */
4909                                 cp->c_cpentry->cp_flags |= CP_RELOCATION_INFLIGHT;
4910                                 buf_setcpaddr(bp, hfsmp->hfs_resize_cpentry);
4911                         }
4912                         else {
4913                                 /*
4914                                  * Use the cnode's cp key.  This file is tied to the
4915                                  * LBAs of the physical blocks that it occupies.
4916                                  */
4917                                 buf_setcpaddr (bp, cp->c_cpentry);
4918                         }
4919
4920                         /* Initialize the content protection file offset to start at 0 */
4921                         buf_setcpoff (bp, 0);
4922                 }
4923 #endif
4924
4925                 /* Do the read */
4926                 err = VNOP_STRATEGY(bp);
4927                 if (!err)
4928                         err = buf_biowait(bp);
4929                 if (err) {
4930 #if CONFIG_PROTECT
4931                         /* Turn the flag off in error cases. */
4932                         if (cpenabled) {
4933                                 cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT;
4934                         }
4935 #endif
4936                         printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (read)\n", err);
4937                         break;
4938                 }
4939
4940                 /* Prepare the buffer for writing */
4941                 buf_reset(bp, B_WRITE);
4942                 buf_setsize(bp, ioSize);
4943                 buf_setcount(bp, ioSize);
4944                 buf_setblkno(bp, destSector);
4945                 buf_setlblkno(bp, destSector);
4946                 if (vnode_issystem(vp) && journal_uses_fua(hfsmp->jnl))
4947                         buf_markfua(bp);
4948
4949 #if CONFIG_PROTECT
4950                 /* Attach the CP to the buffer if needed */
4951                 if (cpenabled) {
4952                         if (cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
4953                                 buf_setcpaddr(bp, hfsmp->hfs_resize_cpentry);
4954                         }
4955                         else {
4956                                 /*
4957                                  * Use the cnode's CP key.  This file is still tied
4958                                  * to the LBAs of the physical blocks that it occupies.
4959                                  */
4960                                 buf_setcpaddr (bp, cp->c_cpentry);
4961                         }
4962                         /*
4963                          * The last STRATEGY call may have updated the cp file offset behind our
4964                          * back, so we cannot trust it.  Re-initialize the content protection
4965                          * file offset back to 0 before initiating the write portion of this I/O.
4966                          */
4967                         buf_setcpoff (bp, 0);
4968                 }
4969 #endif
4970
4971                 /* Do the write */
4972                 vnode_startwrite(hfsmp->hfs_devvp);
4973                 err = VNOP_STRATEGY(bp);
4974                 if (!err) {
4975                         err = buf_biowait(bp);
4976                 }
4977 #if CONFIG_PROTECT
4978                 /* Turn the flag off regardless once the strategy call finishes. */
4979                 if (cpenabled) {
4980                         cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT;
4981                 }
4982 #endif
4983                 if (err) {
4984                         printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (write)\n", err);
4985                         break;
4986                 }
4987
4988                 resid -= ioSize;
4989                 srcSector += ioSizeSectors;
4990                 destSector += ioSizeSectors;
4991         }
4992         if (bp)
4993                 buf_free(bp);
4994         if (buffer)
4995                 kmem_free(kernel_map, (vm_offset_t)buffer, bufferSize);
4996
4997         /* Make sure all writes have been flushed to disk. */
4998         if (vnode_issystem(vp) && !journal_uses_fua(hfsmp->jnl)) {
4999                 err = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
5000                 if (err) {
5001                         printf("hfs_copy_extent: DKIOCSYNCHRONIZECACHE failed (%d)\n", err);
5002                         err = 0;        /* Don't fail the copy. */
5003                 }
5004         }
5005
5006         if (!err)
5007                 hfs_invalidate_sectors(vp, (daddr64_t)oldStart*sectorsPerBlock, (daddr64_t)blockCount*sectorsPerBlock);
5008
5009         return err;
5010 }
5011
5012
5013 /* Shared state for reclaiming the extents of a given file (or
5014  * extent-based xattr).  hfs_reclaim_file()/hfs_reclaim_xattr()
5015  * initialize the values in this structure, which are then used
5016  * by the code that reclaims (relocates) and splits the extents.
5017  */
5018 struct hfs_reclaim_extent_info {
5019         struct vnode *vp;                    /* Vnode being processed; hfs_split_extent() marks its cnode C_MODIFIED for catalog extents */
5020         u_int32_t fileID;                    /* File ID; used as fileID in the overflow extent key */
5021         u_int8_t forkType;                   /* Fork type; used as forkType in the overflow extent key */
5022         u_int8_t is_dirlink;                 /* Extent belongs to directory hard link */
5023         u_int8_t is_sysfile;                 /* Extent belongs to system file */
5024         u_int8_t is_xattr;                   /* Extent belongs to extent-based xattr */
5025         u_int8_t extent_index;               /* Index into 'extents' of the extent entry currently being processed/split */
5026         int lockflags;                       /* Locks that reclaim and split code should grab before modifying the extent record */
5027         u_int32_t blocks_relocated;          /* Total blocks relocated for this file so far */
5028         u_int32_t recStartBlock;             /* File allocation block number (FABN) for current extent record */
5029         u_int32_t cur_blockCount;            /* Number of allocation blocks that have been checked for reclaim */
5030         struct filefork *catalog_fp;         /* If non-NULL, extent is from catalog record */
5031         union record {
5032                 HFSPlusExtentRecord overflow;/* Extent record from overflow extents btree */
5033                 HFSPlusAttrRecord xattr;     /* Attribute record for large EAs */
5034         } record;
5035         HFSPlusExtentDescriptor *extents;    /* Pointer to current extent record being processed.
5036                                               * For a catalog extent record, points to the correct
5037                                               * extent information in the filefork.  For an overflow
5038                                               * extent record or xattr record, points to the extent
5039                                               * record in the 'record' union above.
5040                                               */
5041         struct cat_desc *dirlink_desc;        /* NOTE(review): presumably catalog descriptor for a directory hard link -- confirm */
5042         struct cat_attr *dirlink_attr;        /* NOTE(review): presumably catalog attributes for a directory hard link -- confirm */
5043         struct filefork *dirlink_fork;        /* For directory hard links, fp actually points to this */
5044         struct BTreeIterator *iterator;       /* Shared read/write iterator, hfs_reclaim_file/xattr()
5045                                                * use it for reading and hfs_reclaim_extent()/hfs_split_extent()
5046                                                * use it for writing the updated extent record
5047                                                */
5048         struct FSBufferDescriptor btdata;     /* Shared btdata for reading/writing extent record, same as iterator above */
5049         u_int16_t recordlen;                  /* Record length passed to BTReplaceRecord() when the record is written back */
5050         int overflow_count;                   /* For debugging, counter for overflow extent record */
5051         FCB *fcb;                             /* Pointer to the current btree being traversed */
5052 };
5053
5054 /*
5055  * Split the current extent into two extents, with first extent
5056  * to contain given number of allocation blocks.  Splitting of
5057  * extent creates one new extent entry which can result in
5058  * shifting of many entries through all the extent records of a
5059  * file, and/or creating a new extent record in the overflow
5060  * extent btree.
5061  *
5062  * Example:
5063  * The diagram below represents two consecutive extent records,
5064  * for simplicity, let's call them record X and X+1 respectively.
5065  * Interesting extent entries have been denoted by letters.
5066  * If the letter is unchanged before and after split, it means
5067  * that the extent entry was not modified during the split.
5068  * A '.' means that the entry remains unchanged after the split
5069  * and is not relevant for our example.  A '0' means that the
5070  * extent entry is empty.
5071  *
5072  * If there isn't sufficient contiguous free space to relocate
5073  * an extent (extent "C" below), we will have to break the one
5074  * extent into multiple smaller extents, and relocate each of
5075  * the smaller extents individually.  The way we do this is by
5076  * finding the largest contiguous free space that is currently
5077  * available (N allocation blocks), and then convert extent "C"
5078  * into two extents, C1 and C2, that occupy exactly the same
5079  * allocation blocks as extent C.  Extent C1 is the first
5080  * N allocation blocks of extent C, and extent C2 is the remainder
5081  * of extent C.  Then we can relocate extent C1 since we know
5082  * we have enough contiguous free space to relocate it in its
5083  * entirety.  We then repeat the process starting with extent C2.
5084  *
5085  * In record X, only the entries following entry C are shifted, and
5086  * the original entry C is replaced with two entries C1 and C2 which
5087  * are actually two extent entries for contiguous allocation blocks.
5088  *
5089  * Note that the entry E from record X is shifted into record X+1 as
5090  * the new first entry.  Since the first entry of record X+1 is updated,
5091  * the FABN will also get updated with the blockCount of entry E.
5092  * This also results in shifting of all extent entries in record X+1.
5093  * Note that the number of empty entries after the split has been
5094  * changed from 3 to 2.
5095  *
5096  * Before:
5097  *               record X                           record X+1
5098  *  ---------------------===---------     ---------------------------------
5099  *  | A | . | . | . | B | C | D | E |     | F | . | . | . | G | 0 | 0 | 0 |
5100  *  ---------------------===---------     ---------------------------------
5101  *
5102  * After:
5103  *  ---------------------=======-----     ---------------------------------
5104  *  | A | . | . | . | B | C1| C2| D |     | E | F | . | . | . | G | 0 | 0 |
5105  *  ---------------------=======-----     ---------------------------------
5106  *
5107  *  C1.startBlock = C.startBlock
5108  *  C1.blockCount = N
5109  *
5110  *  C2.startBlock = C.startBlock + N
5111  *  C2.blockCount = C.blockCount - N
5112  *
5113  *                                        FABN = old FABN - E.blockCount
5114  *
5115  * Inputs:
5116  *      extent_info -   This is the structure that contains state about
5117  *                      the current file, extent, and extent record that
5118  *                      is being relocated.  This structure is shared
5119  *                      among code that traverses through all the extents
5120  *                      of the file, code that relocates extents, and
5121  *                      code that splits the extent.
5122  *      newBlockCount - The blockCount of the extent to be split after
5123  *                      a successful split operation.
5124  * Output:
5125  *      Zero on success, non-zero on failure.
5126  */
5127 static int
5128 hfs_split_extent(struct hfs_reclaim_extent_info *extent_info, uint32_t newBlockCount)
5129 {
5130         int error = 0;
5131         int index = extent_info->extent_index;
5132         int i;
5133         HFSPlusExtentDescriptor shift_extent; /* Extent entry that should be shifted into next extent record */
5134         HFSPlusExtentDescriptor last_extent;
5135         HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being manipulated */
5136         HFSPlusExtentRecord *extents_rec = NULL;
5137         HFSPlusExtentKey *extents_key = NULL;
5138         HFSPlusAttrRecord *xattr_rec = NULL;
5139         HFSPlusAttrKey *xattr_key = NULL;
5140         struct BTreeIterator iterator;
5141         struct FSBufferDescriptor btdata;
5142         uint16_t reclen;
5143         uint32_t read_recStartBlock;    /* Starting allocation block number to read old extent record */
5144         uint32_t write_recStartBlock;   /* Starting allocation block number to insert newly updated extent record */
5145         Boolean create_record = false;
5146         Boolean is_xattr;
5147         struct cnode *cp;
5148
5149         is_xattr = extent_info->is_xattr;
5150         extents = extent_info->extents;
5151         cp = VTOC(extent_info->vp);
5152
5153         if (newBlockCount == 0) {
5154                 if (hfs_resize_debug) {
5155                         printf ("hfs_split_extent: No splitting required for newBlockCount=0\n");
5156                 }
5157                 return error;
5158         }
5159
5160         if (hfs_resize_debug) {
5161                 printf ("hfs_split_extent: Split record:%u recStartBlock=%u %u:(%u,%u) for %u blocks\n", extent_info->overflow_count, extent_info->recStartBlock, index, extents[index].startBlock, extents[index].blockCount, newBlockCount);
5162         }
5163
5164         /* Extents overflow btree can not have more than 8 extents.
5165          * No split allowed if the 8th extent is already used.
5166          */
5167         if ((extent_info->fileID == kHFSExtentsFileID) && (extents[kHFSPlusExtentDensity - 1].blockCount != 0)) {
5168                 printf ("hfs_split_extent: Maximum 8 extents allowed for extents overflow btree, cannot split further.\n");
5169                 error = ENOSPC;
5170                 goto out;
5171         }
5172
5173         /* Determine the starting allocation block number for the following
5174          * overflow extent record, if any, before the current record
5175          * gets modified.
5176          */
5177         read_recStartBlock = extent_info->recStartBlock;
5178         for (i = 0; i < kHFSPlusExtentDensity; i++) {
5179                 if (extents[i].blockCount == 0) {
5180                         break;
5181                 }
5182                 read_recStartBlock += extents[i].blockCount;
5183         }
5184
5185         /* Shift and split */
5186         if (index == kHFSPlusExtentDensity-1) {
5187                 /* The new extent created after split will go into following overflow extent record */
5188                 shift_extent.startBlock = extents[index].startBlock + newBlockCount;
5189                 shift_extent.blockCount = extents[index].blockCount - newBlockCount;
5190
5191                 /* Last extent in the record will be split, so nothing to shift */
5192         } else {
5193                 /* Splitting of extents can result in at most of one
5194                  * extent entry to be shifted into following overflow extent
5195                  * record.  So, store the last extent entry for later.
5196                  */
5197                 shift_extent = extents[kHFSPlusExtentDensity-1];
5198                 if ((hfs_resize_debug) && (shift_extent.blockCount != 0)) {
5199                         printf ("hfs_split_extent: Save 7:(%u,%u) to shift into overflow record\n", shift_extent.startBlock, shift_extent.blockCount);
5200                 }
5201
5202                 /* Start shifting extent information from the end of the extent
5203                  * record to the index where we want to insert the new extent.
5204                  * Note that kHFSPlusExtentDensity-1 is already saved above, and
5205                  * does not need to be shifted.  The extent entry that is being
5206                  * split does not get shifted.
5207                  */
5208                 for (i = kHFSPlusExtentDensity-2; i > index; i--) {
5209                         if (hfs_resize_debug) {
5210                                 if (extents[i].blockCount) {
5211                                         printf ("hfs_split_extent: Shift %u:(%u,%u) to %u:(%u,%u)\n", i, extents[i].startBlock, extents[i].blockCount, i+1, extents[i].startBlock, extents[i].blockCount);
5212                                 }
5213                         }
5214                         extents[i+1] = extents[i];
5215                 }
5216         }
5217
5218         if (index == kHFSPlusExtentDensity-1) {
5219                 /* The second half of the extent being split will be the overflow
5220                  * entry that will go into following overflow extent record.  The
5221                  * value has been stored in 'shift_extent' above, so there is
5222                  * nothing to be done here.
5223                  */
5224         } else {
5225                 /* Update the values in the second half of the extent being split
5226                  * before updating the first half of the split.  Note that the
5227                  * extent to split or first half of the split is at index 'index'
5228                  * and a new extent or second half of the split will be inserted at
5229                  * 'index+1' or into following overflow extent record.
5230                  */
5231                 extents[index+1].startBlock = extents[index].startBlock + newBlockCount;
5232                 extents[index+1].blockCount = extents[index].blockCount - newBlockCount;
5233         }
5234         /* Update the extent being split, only the block count will change */
5235         extents[index].blockCount = newBlockCount;
5236
5237         if (hfs_resize_debug) {
5238                 printf ("hfs_split_extent: Split %u:(%u,%u) and ", index, extents[index].startBlock, extents[index].blockCount);
5239                 if (index != kHFSPlusExtentDensity-1) {
5240                         printf ("%u:(%u,%u)\n", index+1, extents[index+1].startBlock, extents[index+1].blockCount);
5241                 } else {
5242                         printf ("overflow:(%u,%u)\n", shift_extent.startBlock, shift_extent.blockCount);
5243                 }
5244         }
5245
5246         /* Write out information about the newly split extent to the disk */
5247         if (extent_info->catalog_fp) {
5248                 /* (extent_info->catalog_fp != NULL) means the newly split
5249                  * extent exists in the catalog record.  This means that
5250                  * the cnode was updated.  Therefore, to write out the changes,
5251                  * mark the cnode as modified.   We cannot call hfs_update()
5252                  * in this function because the caller hfs_reclaim_extent()
5253                  * is holding the catalog lock currently.
5254                  */
5255                 cp->c_flag |= C_MODIFIED;
5256         } else {
5257                 /* The newly split extent is for large EAs or is in overflow
5258                  * extent record, so update it directly in the btree using the
5259                  * iterator information from the shared extent_info structure
5260                  */
5261                 error = BTReplaceRecord(extent_info->fcb, extent_info->iterator,
5262                                 &(extent_info->btdata), extent_info->recordlen);
5263                 if (error) {
5264                         printf ("hfs_split_extent: fileID=%u BTReplaceRecord returned error=%d\n", extent_info->fileID, error);
5265                         goto out;
5266                 }
5267         }
5268
5269         /* No extent entry to be shifted into another extent overflow record */
5270         if (shift_extent.blockCount == 0) {
5271                 if (hfs_resize_debug) {
5272                         printf ("hfs_split_extent: No extent entry to be shifted into overflow records\n");
5273                 }
5274                 error = 0;
5275                 goto out;
5276         }
5277
5278         /* The overflow extent entry has to be shifted into an extent
5279          * overflow record.  This means that we might have to shift
5280          * extent entries from all subsequent overflow records by one.
5281          * We start iteration from the first record to the last record,
5282          * and shift the extent entry from one record to another.
5283          * We might have to create a new extent record for the last
5284          * extent entry for the file.
5285          */
5286
5287         /* Initialize iterator to search the next record */
5288         bzero(&iterator, sizeof(iterator));
5289         if (is_xattr) {
5290                 /* Copy the key from the iterator that was used to update the modified attribute record. */
5291                 xattr_key = (HFSPlusAttrKey *)&(iterator.key);
5292                 bcopy((HFSPlusAttrKey *)&(extent_info->iterator->key), xattr_key, sizeof(HFSPlusAttrKey));
5293                 /* Note: xattr_key->startBlock will be initialized later in the iteration loop */
5294
5295                 MALLOC(xattr_rec, HFSPlusAttrRecord *,
5296                                 sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK);
5297                 if (xattr_rec == NULL) {
5298                         error = ENOMEM;
5299                         goto out;
5300                 }
5301                 btdata.bufferAddress = xattr_rec;
5302                 btdata.itemSize = sizeof(HFSPlusAttrRecord);
5303                 btdata.itemCount = 1;
5304                 extents = xattr_rec->overflowExtents.extents;
5305         } else {
5306                 /* Initialize the extent key for the current file */
5307                 extents_key = (HFSPlusExtentKey *) &(iterator.key);
5308                 extents_key->keyLength = kHFSPlusExtentKeyMaximumLength;
5309                 extents_key->forkType = extent_info->forkType;
5310                 extents_key->fileID = extent_info->fileID;
5311                 /* Note: extents_key->startBlock will be initialized later in the iteration loop */
5312
5313                 MALLOC(extents_rec, HFSPlusExtentRecord *,
5314                                 sizeof(HFSPlusExtentRecord), M_TEMP, M_WAITOK);
5315                 if (extents_rec == NULL) {
5316                         error = ENOMEM;
5317                         goto out;
5318                 }
5319                 btdata.bufferAddress = extents_rec;
5320                 btdata.itemSize = sizeof(HFSPlusExtentRecord);
5321                 btdata.itemCount = 1;
5322                 extents = extents_rec[0];
5323         }
5324
5325         /* The overflow extent entry has to be shifted into an extent
5326          * overflow record.  This means that we might have to shift
5327          * extent entries from all subsequent overflow records by one.
5328          * We start iteration from the first record to the last record,
5329          * examine one extent record in each iteration and shift one
5330          * extent entry from one record to another.  We might have to
5331          * create a new extent record for the last extent entry for the
5332          * file.
5333          *
5334          * If shift_extent.blockCount is non-zero, it means that there is
5335          * an extent entry that needs to be shifted into the next
5336          * overflow extent record.  We keep on going till there are no such
5337          * entries left to be shifted.  This will also change the starting
5338          * allocation block number of the extent record which is part of
5339          * the key for the extent record in each iteration.  Note that
5340          * because the extent record key is changing while we are searching,
5341          * the record can not be updated directly, instead it has to be
5342          * deleted and inserted again.
5343          */
5344         while (shift_extent.blockCount) {
5345                 if (hfs_resize_debug) {
5346                         printf ("hfs_split_extent: Will shift (%u,%u) into overflow record with startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, read_recStartBlock);
5347                 }
5348
5349                 /* Search if there is any existing overflow extent record
5350                  * that matches the current file and the logical start block
5351                  * number.
5352                  *
5353                  * For this, the logical start block number in the key is
5354                  * the value calculated based on the logical start block
5355                  * number of the current extent record and the total number
5356                  * of blocks existing in the current extent record.
5357                  */
5358                 if (is_xattr) {
5359                         xattr_key->startBlock = read_recStartBlock;
5360                 } else {
5361                         extents_key->startBlock = read_recStartBlock;
5362                 }
5363                 error = BTSearchRecord(extent_info->fcb, &iterator, &btdata, &reclen, &iterator);
5364                 if (error) {
5365                         if (error != btNotFound) {
5366                                 printf ("hfs_split_extent: fileID=%u startBlock=%u BTSearchRecord error=%d\n", extent_info->fileID, read_recStartBlock, error);
5367                                 goto out;
5368                         }
5369                         /* No matching record was found, so create a new extent record.
5370                          * Note:  Since no record was found, we can't rely on the
5371                          * btree key in the iterator any longer.  This will be initialized
5372                          * later before we insert the record.
5373                          */
5374                         create_record = true;
5375                 }
5376
5377                 /* The extra extent entry from the previous record is being inserted
5378                  * as the first entry in the current extent record.  This will change
5379                  * the file allocation block number (FABN) of the current extent
5380                  * record, which is the startBlock value from the extent record key.
5381                  * Since one extra entry is being inserted in the record, the new
5382                  * FABN for the record will less than old FABN by the number of blocks
5383                  * in the new extent entry being inserted at the start.  We have to
5384                  * do this before we update read_recStartBlock to point at the
5385                  * startBlock of the following record.
5386                  */
5387                 write_recStartBlock = read_recStartBlock - shift_extent.blockCount;
5388                 if (hfs_resize_debug) {
5389                         if (create_record) {
5390                                 printf ("hfs_split_extent: No records found for startBlock=%u, will create new with startBlock=%u\n", read_recStartBlock, write_recStartBlock);
5391                         }
5392                 }
5393
5394                 /* Now update the read_recStartBlock to account for total number
5395                  * of blocks in this extent record.  It will now point to the
5396                  * starting allocation block number for the next extent record.
5397                  */
5398                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
5399                         if (extents[i].blockCount == 0) {
5400                                 break;
5401                         }
5402                         read_recStartBlock += extents[i].blockCount;
5403                 }
5404
5405                 if (create_record == true) {
5406                         /* Initialize new record content with only one extent entry */
5407                         bzero(extents, sizeof(HFSPlusExtentRecord));
5408                         /* The new record will contain only one extent entry */
5409                         extents[0] = shift_extent;
5410                         /* There are no more overflow extents to be shifted */
5411                         shift_extent.startBlock = shift_extent.blockCount = 0;
5412
5413                         if (is_xattr) {
5414                                 /* BTSearchRecord above returned btNotFound,
5415                                  * but since the attribute btree is never empty
5416                                  * if we are trying to insert new overflow
5417                                  * record for the xattrs, the extents_key will
5418                                  * contain correct data.  So we don't need to
5419                                  * re-initialize it again like below.
5420                                  */
5421
5422                                 /* Initialize the new xattr record */
5423                                 xattr_rec->recordType = kHFSPlusAttrExtents;
5424                                 xattr_rec->overflowExtents.reserved = 0;
5425                                 reclen = sizeof(HFSPlusAttrExtents);
5426                         } else {
5427                                 /* BTSearchRecord above returned btNotFound,
5428                                  * which means that extents_key content might
5429                                  * not correspond to the record that we are
5430                                  * trying to create, especially when the extents
5431                                  * overflow btree is empty.  So we reinitialize
5432                                  * the extents_key again always.
5433                                  */
5434                                 extents_key->keyLength = kHFSPlusExtentKeyMaximumLength;
5435                                 extents_key->forkType = extent_info->forkType;
5436                                 extents_key->fileID = extent_info->fileID;
5437
5438                                 /* Initialize the new extent record */
5439                                 reclen = sizeof(HFSPlusExtentRecord);
5440                         }
5441                 } else {
5442                         /* The overflow extent entry from previous record will be
5443                          * the first entry in this extent record.  If the last
5444                          * extent entry in this record is valid, it will be shifted
5445                          * into the following extent record as its first entry.  So
5446                          * save the last entry before shifting entries in current
5447                          * record.
5448                          */
5449                         last_extent = extents[kHFSPlusExtentDensity-1];
5450
5451                         /* Shift all entries by one index towards the end */
5452                         for (i = kHFSPlusExtentDensity-2; i >= 0; i--) {
5453                                 extents[i+1] = extents[i];
5454                         }
5455
5456                         /* Overflow extent entry saved from previous record
5457                          * is now the first entry in the current record.
5458                          */
5459                         extents[0] = shift_extent;
5460
5461                         if (hfs_resize_debug) {
5462                                 printf ("hfs_split_extent: Shift overflow=(%u,%u) to record with updated startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, write_recStartBlock);
5463                         }
5464
5465                         /* The last entry from current record will be the
5466                          * overflow entry which will be the first entry for
5467                          * the following extent record.
5468                          */
5469                         shift_extent = last_extent;
5470
5471                         /* Since the key->startBlock is being changed for this record,
5472                          * it should be deleted and inserted with the new key.
5473                          */
5474                         error = BTDeleteRecord(extent_info->fcb, &iterator);
5475                         if (error) {
5476                                 printf ("hfs_split_extent: fileID=%u startBlock=%u BTDeleteRecord error=%d\n", extent_info->fileID, read_recStartBlock, error);
5477                                 goto out;
5478                         }
5479                         if (hfs_resize_debug) {
5480                                 printf ("hfs_split_extent: Deleted extent record with startBlock=%u\n", (is_xattr ? xattr_key->startBlock : extents_key->startBlock));
5481                         }
5482                 }
5483
5484                 /* Insert the newly created or modified extent record */
5485                 bzero(&iterator.hint, sizeof(iterator.hint));
5486                 if (is_xattr) {
5487                         xattr_key->startBlock = write_recStartBlock;
5488                 } else {
5489                         extents_key->startBlock = write_recStartBlock;
5490                 }
5491                 error = BTInsertRecord(extent_info->fcb, &iterator, &btdata, reclen);
5492                 if (error) {
5493                         printf ("hfs_split_extent: fileID=%u, startBlock=%u BTInsertRecord error=%d\n", extent_info->fileID, write_recStartBlock, error);
5494                         goto out;
5495                 }
5496                 if (hfs_resize_debug) {
5497                         printf ("hfs_split_extent: Inserted extent record with startBlock=%u\n", write_recStartBlock);
5498                 }
5499         }
5500
5501 out:
5502         /*
5503          * Extents overflow btree or attributes btree headers might have
5504          * been modified during the split/shift operation, so flush the
5505          * changes to the disk while we are inside journal transaction.
5506          * We should only be able to generate I/O that modifies the B-Tree
5507          * header nodes while we're in the middle of a journal transaction.
5508          * Otherwise it might result in panic during unmount.
5509          */
5510         BTFlushPath(extent_info->fcb);
5511
5512         if (extents_rec) {
5513                 FREE (extents_rec, M_TEMP);
5514         }
5515         if (xattr_rec) {
5516                 FREE (xattr_rec, M_TEMP);
5517         }
5518         return error;
5519 }
5520
5521
5522 /*
5523  * Relocate an extent if it lies beyond the expected end of volume.
5524  *
5525  * This function is called for every extent of the file being relocated.
5526  * It allocates space for relocation, copies the data, deallocates
5527  * the old extent, and update corresponding on-disk extent.  If the function
5528  * does not find contiguous space to  relocate an extent, it splits the
5529  * extent in smaller size to be able to relocate it out of the area of
5530  * disk being reclaimed.  As an optimization, if an extent lies partially
5531  * in the area of the disk being reclaimed, it is split so that we only
5532  * have to relocate the area that was overlapping with the area of disk
5533  * being reclaimed.
5534  *
5535  * Note that every extent is relocated in its own transaction so that
5536  * they do not overwhelm the journal.  This function handles the extent
5537  * record that exists in the catalog record, extent record from overflow
5538  * extents btree, and extents for large EAs.
5539  *
5540  * Inputs:
5541  *      extent_info - This is the structure that contains state about
5542  *                    the current file, extent, and extent record that
5543  *                    is being relocated.  This structure is shared
5544  *                    among code that traverses through all the extents
5545  *                    of the file, code that relocates extents, and
5546  *                    code that splits the extent.
5547  */
5548 static int
5549 hfs_reclaim_extent(struct hfsmount *hfsmp, const u_long allocLimit, struct hfs_reclaim_extent_info *extent_info, vfs_context_t context)
5550 {
5551         int error = 0;
5552         int index;
5553         struct cnode *cp;
5554         u_int32_t oldStartBlock;
5555         u_int32_t oldBlockCount;
5556         u_int32_t newStartBlock;
5557         u_int32_t newBlockCount;
5558         u_int32_t roundedBlockCount;
5559         uint16_t node_size;
5560         uint32_t remainder_blocks;
5561         u_int32_t alloc_flags;
5562         int blocks_allocated = false;
5563
5564         index = extent_info->extent_index;
5565         cp = VTOC(extent_info->vp);
5566
5567         oldStartBlock = extent_info->extents[index].startBlock;
5568         oldBlockCount = extent_info->extents[index].blockCount;
5569
5570         if (0 && hfs_resize_debug) {
5571                 printf ("hfs_reclaim_extent: Examine record:%u recStartBlock=%u, %u:(%u,%u)\n", extent_info->overflow_count, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount);
5572         }
5573
5574         /* If the current extent lies completely within allocLimit,
5575          * it does not require any relocation.
5576          */
5577         if ((oldStartBlock + oldBlockCount) <= allocLimit) {
5578                 extent_info->cur_blockCount += oldBlockCount;
5579                 return error;
5580         }
5581
5582         /* Every extent should be relocated in its own transaction
5583          * to make sure that we don't overflow the journal buffer.
5584          */
5585         error = hfs_start_transaction(hfsmp);
5586         if (error) {
5587                 return error;
5588         }
5589         extent_info->lockflags = hfs_systemfile_lock(hfsmp, extent_info->lockflags, HFS_EXCLUSIVE_LOCK);
5590
5591         /* Check if the extent lies partially in the area to reclaim,
5592          * i.e. it starts before allocLimit and ends beyond allocLimit.
5593          * We have already skipped extents that lie completely within
5594          * allocLimit in the check above, so we only check for the
5595          * startBlock.  If it lies partially, split it so that we
5596          * only relocate part of the extent.
5597          */
5598         if (oldStartBlock < allocLimit) {
5599                 newBlockCount = allocLimit - oldStartBlock;
5600
5601                 if (hfs_resize_debug) {
5602                         int idx = extent_info->extent_index;
5603                         printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount);
5604                 }
5605
5606                 /* If the extent belongs to a btree, check and trim
5607                  * it to be multiple of the node size.
5608                  */
5609                 if (extent_info->is_sysfile) {
5610                         node_size = get_btree_nodesize(extent_info->vp);
5611                         /* If the btree node size is less than the block size,
5612                          * splitting this extent will not split a node across
5613                          * different extents.  So we only check and trim if
5614                          * node size is more than the allocation block size.
5615                          */
5616                         if (node_size > hfsmp->blockSize) {
5617                                 remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize);
5618                                 if (remainder_blocks) {
5619                                         newBlockCount -= remainder_blocks;
5620                                         if (hfs_resize_debug) {
5621                                                 printf ("hfs_reclaim_extent: Round-down newBlockCount to be multiple of nodeSize, node_allocblks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount);
5622                                         }
5623                                 }
5624                         }
5625                         /* The newBlockCount is zero because of rounding-down so that
5626                          * btree nodes are not split across extents.  Therefore this
5627                          * straddling extent across resize-boundary does not require
5628                          * splitting.  Skip over to relocating of complete extent.
5629                          */
5630                         if (newBlockCount == 0) {
5631                                 if (hfs_resize_debug) {
5632                                         printf ("hfs_reclaim_extent: After round-down newBlockCount=0, skip split, relocate full extent\n");
5633                                 }
5634                                 goto relocate_full_extent;
5635                         }
5636                 }
5637
5638                 /* Split the extents into two parts --- the first extent lies
5639                  * completely within allocLimit and therefore does not require
5640                  * relocation.  The second extent will require relocation which
5641                  * will be handled when the caller calls this function again
5642                  * for the next extent.
5643                  */
5644                 error = hfs_split_extent(extent_info, newBlockCount);
5645                 if (error == 0) {
5646                         /* Split success, no relocation required */
5647                         goto out;
5648                 }
5649                 /* Split failed, so try to relocate entire extent */
5650                 if (hfs_resize_debug) {
5651                         int idx = extent_info->extent_index;
5652                         printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks failed, relocate full extent\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount);
5653                 }
5654         }
5655
5656 relocate_full_extent:
5657         /* At this point, the current extent requires relocation.
5658          * We will try to allocate space equal to the size of the extent
5659          * being relocated first to try to relocate it without splitting.
5660          * If the allocation fails, we will try to allocate contiguous
5661          * blocks out of metadata zone.  If that allocation also fails,
5662          * then we will take a whatever contiguous block run is returned
5663          * by the allocation, split the extent into two parts, and then
5664          * relocate the first splitted extent.
5665          */
5666         alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS;
5667         if (extent_info->is_sysfile) {
5668                 alloc_flags |= HFS_ALLOC_METAZONE;
5669         }
5670
5671         error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags,
5672                         &newStartBlock, &newBlockCount);
5673         if ((extent_info->is_sysfile == false) &&
5674             ((error == dskFulErr) || (error == ENOSPC))) {
5675                 /* For non-system files, try reallocating space in metadata zone */
5676                 alloc_flags |= HFS_ALLOC_METAZONE;
5677                 error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
5678                                 alloc_flags, &newStartBlock, &newBlockCount);
5679         }
5680         if ((error == dskFulErr) || (error == ENOSPC)) {
5681                 /* We did not find desired contiguous space for this extent.
5682                  * So try to allocate the maximum contiguous space available.
5683                  */
5684                 alloc_flags &= ~HFS_ALLOC_FORCECONTIG;
5685
5686                 error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
5687                                 alloc_flags, &newStartBlock, &newBlockCount);
5688                 if (error) {
5689                         printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5690                         goto out;
5691                 }
5692                 blocks_allocated = true;
5693
5694                 /* The number of blocks allocated is less than the requested
5695                  * number of blocks.  For btree extents, check and trim the
5696                  * extent to be multiple of the node size.
5697                  */
5698                 if (extent_info->is_sysfile) {
5699                         node_size = get_btree_nodesize(extent_info->vp);
5700                         if (node_size > hfsmp->blockSize) {
5701                                 remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize);
5702                                 if (remainder_blocks) {
5703                                         roundedBlockCount = newBlockCount - remainder_blocks;
5704                                         /* Free tail-end blocks of the newly allocated extent */
5705                                         BlockDeallocate(hfsmp, newStartBlock + roundedBlockCount,
5706                                                                newBlockCount - roundedBlockCount,
5707                                                                HFS_ALLOC_SKIPFREEBLKS);
5708                                         newBlockCount = roundedBlockCount;
5709                                         if (hfs_resize_debug) {
5710                                                 printf ("hfs_reclaim_extent: Fixing extent block count, node_blks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount);
5711                                         }
5712                                         if (newBlockCount == 0) {
5713                                                 printf ("hfs_reclaim_extent: Not enough contiguous blocks available to relocate fileID=%d\n", extent_info->fileID);
5714                                                 error = ENOSPC;
5715                                                 goto out;
5716                                         }
5717                                 }
5718                         }
5719                 }
5720
5721                 /* The number of blocks allocated is less than the number of
5722                  * blocks requested, so split this extent --- the first extent
5723                  * will be relocated as part of this function call and the caller
5724                  * will handle relocating the second extent by calling this
5725                  * function again for the second extent.
5726                  */
5727                 error = hfs_split_extent(extent_info, newBlockCount);
5728                 if (error) {
5729                         printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) split error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5730                         goto out;
5731                 }
5732                 oldBlockCount = newBlockCount;
5733         }
5734         if (error) {
5735                 printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) contig BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5736                 goto out;
5737         }
5738         blocks_allocated = true;
5739
5740         /* Copy data from old location to new location */
5741         error = hfs_copy_extent(hfsmp, extent_info->vp, oldStartBlock,
5742                         newStartBlock, newBlockCount, context);
5743         if (error) {
5744                 printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u)=>(%u,%u) hfs_copy_extent error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount, error);
5745                 goto out;
5746         }
5747
5748         /* Update the extent record with the new start block information */
5749         extent_info->extents[index].startBlock = newStartBlock;
5750
5751         /* Sync the content back to the disk */
5752         if (extent_info->catalog_fp) {
5753                 /* Update the extents in catalog record */
5754                 if (extent_info->is_dirlink) {
5755                         error = cat_update_dirlink(hfsmp, extent_info->forkType,
5756                                         extent_info->dirlink_desc, extent_info->dirlink_attr,
5757                                         &(extent_info->dirlink_fork->ff_data));
5758                 } else {
5759                         cp->c_flag |= C_MODIFIED;
5760                         /* If this is a system file, sync volume headers on disk */
5761                         if (extent_info->is_sysfile) {
5762                                 error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
5763                         }
5764                 }
5765         } else {
5766                 /* Replace record for extents overflow or extents-based xattrs */
5767                 error = BTReplaceRecord(extent_info->fcb, extent_info->iterator,
5768                                 &(extent_info->btdata), extent_info->recordlen);
5769         }
5770         if (error) {
5771                 printf ("hfs_reclaim_extent: fileID=%u, update record error=%u\n", extent_info->fileID, error);
5772                 goto out;
5773         }
5774
5775         /* Deallocate the old extent */
5776         error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS);
5777         if (error) {
5778                 printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockDeallocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5779                 goto out;
5780         }
5781         extent_info->blocks_relocated += newBlockCount;
5782
5783         if (hfs_resize_debug) {
5784                 printf ("hfs_reclaim_extent: Relocated record:%u %u:(%u,%u) to (%u,%u)\n", extent_info->overflow_count, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
5785         }
5786
5787 out:
5788         if (error != 0) {
5789                 if (blocks_allocated == true) {
5790                         BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
5791                 }
5792         } else {
5793                 /* On success, increment the total allocation blocks processed */
5794                 extent_info->cur_blockCount += newBlockCount;
5795         }
5796
5797         hfs_systemfile_unlock(hfsmp, extent_info->lockflags);
5798
5799         /* For a non-system file, if an extent entry from catalog record
5800          * was modified, sync the in-memory changes to the catalog record
5801          * on disk before ending the transaction.
5802          */
5803          if ((extent_info->catalog_fp) &&
5804              (extent_info->is_sysfile == false)) {
5805                 (void) hfs_update(extent_info->vp, MNT_WAIT);
5806         }
5807
5808         hfs_end_transaction(hfsmp);
5809
5810         return error;
5811 }
5812
5813 /* Report intermediate progress during volume resize */
5814 static void
5815 hfs_truncatefs_progress(struct hfsmount *hfsmp)
5816 {
5817         u_int32_t cur_progress = 0;
5818
5819         hfs_resize_progress(hfsmp, &cur_progress);
5820         if (cur_progress > (hfsmp->hfs_resize_progress + 9)) {
5821                 printf("hfs_truncatefs: %d%% done...\n", cur_progress);
5822                 hfsmp->hfs_resize_progress = cur_progress;
5823         }
5824         return;
5825 }
5826
5827 /*
5828  * Reclaim space at the end of a volume for given file and forktype.
5829  *
5830  * This routine attempts to move any extent which contains allocation blocks
5831  * at or after "allocLimit."  A separate transaction is used for every extent
5832  * that needs to be moved.  If there is not contiguous space available for
5833  * moving an extent, it can be split into smaller extents.  The contents of
5834  * any moved extents are read and written via the volume's device vnode --
5835  * NOT via "vp."  During the move, moved blocks which are part of a transaction
5836  * have their physical block numbers invalidated so they will eventually be
5837  * written to their new locations.
5838  *
5839  * This function is also called for directory hard links.  Directory hard links
5840  * are regular files with no data fork and resource fork that contains alias
 * information for backward compatibility with pre-Leopard systems.  However,
 * non-Mac OS X implementations can add/modify data fork or resource fork
 * information of directory hard links, so we check and, if required, relocate
 * both the data fork and the resource fork.
5845  *
5846  * Inputs:
5847  *    hfsmp       The volume being resized.
5848  *    vp          The vnode for the system file.
5849  *    fileID      ID of the catalog record that needs to be relocated
 *    forktype    The type of fork that needs to be relocated,
5851  *                      kHFSResourceForkType for resource fork,
5852  *                      kHFSDataForkType for data fork
5853  *    allocLimit  Allocation limit for the new volume size,
5854  *                do not use this block or beyond.  All extents
5855  *                that use this block or any blocks beyond this limit
5856  *                will be relocated.
5857  *
5858  * Side Effects:
5859  * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
5860  * blocks that were relocated.
5861  */
5862 static int
5863 hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID,
5864                 u_int8_t forktype, u_long allocLimit, vfs_context_t context)
5865 {
5866         int error = 0;
5867         struct hfs_reclaim_extent_info *extent_info;
5868         int i;
5869         int lockflags = 0;
5870         struct cnode *cp;
5871         struct filefork *fp;
5872         int took_truncate_lock = false;
5873         int release_desc = false;
5874         HFSPlusExtentKey *key;
5875
5876         /* If there is no vnode for this file, then there's nothing to do. */
5877         if (vp == NULL) {
5878                 return 0;
5879         }
5880
5881         cp = VTOC(vp);
5882
5883         if (hfs_resize_debug) {
5884                 const char *filename = (const char *) cp->c_desc.cd_nameptr;
5885                 int namelen = cp->c_desc.cd_namelen;
5886
5887                 if (filename == NULL) {
5888                         filename = "";
5889                         namelen = 0;
5890                 }
5891                 printf("hfs_reclaim_file: reclaiming '%.*s'\n", namelen, filename);
5892         }
5893
5894         MALLOC(extent_info, struct hfs_reclaim_extent_info *,
5895                sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
5896         if (extent_info == NULL) {
5897                 return ENOMEM;
5898         }
5899         bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
5900         extent_info->vp = vp;
5901         extent_info->fileID = fileID;
5902         extent_info->forkType = forktype;
5903         extent_info->is_sysfile = vnode_issystem(vp);
5904         if (vnode_isdir(vp) && (cp->c_flag & C_HARDLINK)) {
5905                 extent_info->is_dirlink = true;
5906         }
5907         /* We always need allocation bitmap and extent btree lock */
5908         lockflags = SFL_BITMAP | SFL_EXTENTS;
5909         if ((fileID == kHFSCatalogFileID) || (extent_info->is_dirlink == true)) {
5910                 lockflags |= SFL_CATALOG;
5911         } else if (fileID == kHFSAttributesFileID) {
5912                 lockflags |= SFL_ATTRIBUTE;
5913         } else if (fileID == kHFSStartupFileID) {
5914                 lockflags |= SFL_STARTUP;
5915         }
5916         extent_info->lockflags = lockflags;
5917         extent_info->fcb = VTOF(hfsmp->hfs_extents_vp);
5918
5919         /* Flush data associated with current file on disk.
5920          *
5921          * If the current vnode is directory hard link, no flushing of
5922          * journal or vnode is required.  The current kernel does not
5923          * modify data/resource fork of directory hard links, so nothing
5924          * will be in the cache.  If a directory hard link is newly created,
5925          * the resource fork data is written directly using devvp and
5926          * the code that actually relocates data (hfs_copy_extent()) also
5927          * uses devvp for its I/O --- so they will see a consistent copy.
5928          */
5929         if (extent_info->is_sysfile) {
5930                 /* If the current vnode is system vnode, flush journal
5931                  * to make sure that all data is written to the disk.
5932                  */
5933                 error = hfs_journal_flush(hfsmp, TRUE);
5934                 if (error) {
5935                         printf ("hfs_reclaim_file: journal_flush returned %d\n", error);
5936                         goto out;
5937                 }
5938         } else if (extent_info->is_dirlink == false) {
5939                 /* Flush all blocks associated with this regular file vnode.
5940                  * Normally there should not be buffer cache blocks for regular
5941                  * files, but for objects like symlinks, we can have buffer cache
5942                  * blocks associated with the vnode.  Therefore we call
5943                  * buf_flushdirtyblks() also.
5944                  */
5945                 buf_flushdirtyblks(vp, 0, BUF_SKIP_LOCKED, "hfs_reclaim_file");
5946
5947                 hfs_unlock(cp);
5948                 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
5949                 took_truncate_lock = true;
5950                 (void) cluster_push(vp, 0);
5951                 error = hfs_lock(cp, HFS_FORCE_LOCK);
5952                 if (error) {
5953                         goto out;
5954                 }
5955
5956                 /* If the file no longer exists, nothing left to do */
5957                 if (cp->c_flag & C_NOEXISTS) {
5958                         error = 0;
5959                         goto out;
5960                 }
5961
5962                 /* Wait for any in-progress writes to this vnode to complete, so that we'll
5963                  * be copying consistent bits.  (Otherwise, it's possible that an async
5964                  * write will complete to the old extent after we read from it.  That
5965                  * could lead to corruption.)
5966                  */
5967                 error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file");
5968                 if (error) {
5969                         goto out;
5970                 }
5971         }
5972
5973         if (hfs_resize_debug) {
5974                 printf("hfs_reclaim_file: === Start reclaiming %sfork for %sid=%u ===\n", (forktype ? "rsrc" : "data"), (extent_info->is_dirlink ? "dirlink" : "file"), fileID);
5975         }
5976
5977         if (extent_info->is_dirlink) {
5978                 MALLOC(extent_info->dirlink_desc, struct cat_desc *,
5979                                 sizeof(struct cat_desc), M_TEMP, M_WAITOK);
5980                 MALLOC(extent_info->dirlink_attr, struct cat_attr *,
5981                                 sizeof(struct cat_attr), M_TEMP, M_WAITOK);
5982                 MALLOC(extent_info->dirlink_fork, struct filefork *,
5983                                 sizeof(struct filefork), M_TEMP, M_WAITOK);
5984                 if ((extent_info->dirlink_desc == NULL) ||
5985                     (extent_info->dirlink_attr == NULL) ||
5986                     (extent_info->dirlink_fork == NULL)) {
5987                         error = ENOMEM;
5988                         goto out;
5989                 }
5990
5991                 /* Lookup catalog record for directory hard link and
5992                  * create a fake filefork for the value looked up from
5993                  * the disk.
5994                  */
5995                 fp = extent_info->dirlink_fork;
5996                 bzero(extent_info->dirlink_fork, sizeof(struct filefork));
5997                 extent_info->dirlink_fork->ff_cp = cp;
5998                 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5999                 error = cat_lookup_dirlink(hfsmp, fileID, forktype,
6000                                 extent_info->dirlink_desc, extent_info->dirlink_attr,
6001                                 &(extent_info->dirlink_fork->ff_data));
6002                 hfs_systemfile_unlock(hfsmp, lockflags);
6003                 if (error) {
6004                         printf ("hfs_reclaim_file: cat_lookup_dirlink for fileID=%u returned error=%u\n", fileID, error);
6005                         goto out;
6006                 }
6007                 release_desc = true;
6008         } else {
6009                 fp = VTOF(vp);
6010         }
6011
6012         extent_info->catalog_fp = fp;
6013         extent_info->recStartBlock = 0;
6014         extent_info->extents = extent_info->catalog_fp->ff_extents;
6015         /* Relocate extents from the catalog record */
6016         for (i = 0; i < kHFSPlusExtentDensity; ++i) {
6017                 if (fp->ff_extents[i].blockCount == 0) {
6018                         break;
6019                 }
6020                 extent_info->extent_index = i;
6021                 error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
6022                 if (error) {
6023                         printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, error);
6024                         goto out;
6025                 }
6026         }
6027
6028         /* If the number of allocation blocks processed for reclaiming
6029          * are less than total number of blocks for the file, continuing
6030          * working on overflow extents record.
6031          */
6032         if (fp->ff_blocks <= extent_info->cur_blockCount) {
6033                 if (0 && hfs_resize_debug) {
6034                         printf ("hfs_reclaim_file: Nothing more to relocate, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
6035                 }
6036                 goto out;
6037         }
6038
6039         if (hfs_resize_debug) {
6040                 printf ("hfs_reclaim_file: Will check overflow records, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
6041         }
6042
6043         MALLOC(extent_info->iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
6044         if (extent_info->iterator == NULL) {
6045                 error = ENOMEM;
6046                 goto out;
6047         }
6048         bzero(extent_info->iterator, sizeof(struct BTreeIterator));
6049         key = (HFSPlusExtentKey *) &(extent_info->iterator->key);
6050         key->keyLength = kHFSPlusExtentKeyMaximumLength;
6051         key->forkType = forktype;
6052         key->fileID = fileID;
6053         key->startBlock = extent_info->cur_blockCount;
6054
6055         extent_info->btdata.bufferAddress = extent_info->record.overflow;
6056         extent_info->btdata.itemSize = sizeof(HFSPlusExtentRecord);
6057         extent_info->btdata.itemCount = 1;
6058
6059         extent_info->catalog_fp = NULL;
6060
6061         /* Search the first overflow extent with expected startBlock as 'cur_blockCount' */
6062         lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
6063         error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
6064                         &(extent_info->btdata), &(extent_info->recordlen),
6065                         extent_info->iterator);
6066         hfs_systemfile_unlock(hfsmp, lockflags);
6067         while (error == 0) {
6068                 extent_info->overflow_count++;
6069                 extent_info->recStartBlock = key->startBlock;
6070                 extent_info->extents = extent_info->record.overflow;
6071                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
6072                         if (extent_info->record.overflow[i].blockCount == 0) {
6073                                 goto out;
6074                         }
6075                         extent_info->extent_index = i;
6076                         error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
6077                         if (error) {
6078                                 printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, extent_info->record.overflow[i].startBlock, extent_info->record.overflow[i].blockCount, error);
6079                                 goto out;
6080                         }
6081                 }
6082
6083                 /* Look for more overflow records */
6084                 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
6085                 error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
6086                                 extent_info->iterator, &(extent_info->btdata),
6087                                 &(extent_info->recordlen));
6088                 hfs_systemfile_unlock(hfsmp, lockflags);
6089                 if (error) {
6090                         break;
6091                 }
6092                 /* Stop when we encounter a different file or fork. */
6093                 if ((key->fileID != fileID) || (key->forkType != forktype)) {
6094                         break;
6095                 }
6096         }
6097         if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
6098                 error = 0;
6099         }
6100
6101 out:
6102         /* If any blocks were relocated, account them and report progress */
6103         if (extent_info->blocks_relocated) {
6104                 hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated;
6105                 hfs_truncatefs_progress(hfsmp);
6106                 if (fileID < kHFSFirstUserCatalogNodeID) {
6107                         printf ("hfs_reclaim_file: Relocated %u blocks from fileID=%u on \"%s\"\n",
6108                                         extent_info->blocks_relocated, fileID, hfsmp->vcbVN);
6109                 }
6110         }
6111         if (extent_info->iterator) {
6112                 FREE(extent_info->iterator, M_TEMP);
6113         }
6114         if (release_desc == true) {
6115                 cat_releasedesc(extent_info->dirlink_desc);
6116         }
6117         if (extent_info->dirlink_desc) {
6118                 FREE(extent_info->dirlink_desc, M_TEMP);
6119         }
6120         if (extent_info->dirlink_attr) {
6121                 FREE(extent_info->dirlink_attr, M_TEMP);
6122         }
6123         if (extent_info->dirlink_fork) {
6124                 FREE(extent_info->dirlink_fork, M_TEMP);
6125         }
6126         if ((extent_info->blocks_relocated != 0) && (extent_info->is_sysfile == false)) {
6127                 (void) hfs_update(vp, MNT_WAIT);
6128         }
6129         if (took_truncate_lock) {
6130                 hfs_unlock_truncate(cp, 0);
6131         }
6132         if (extent_info) {
6133                 FREE(extent_info, M_TEMP);
6134         }
6135         if (hfs_resize_debug) {
6136                 printf("hfs_reclaim_file: === Finished relocating %sfork for fileid=%u (error=%d) ===\n", (forktype ? "rsrc" : "data"), fileID, error);
6137         }
6138
6139         return error;
6140 }
6141
6142
6143 /*
6144  * This journal_relocate callback updates the journal info block to point
6145  * at the new journal location.  This write must NOT be done using the
6146  * transaction.  We must write the block immediately.  We must also force
6147  * it to get to the media so that the new journal location will be seen by
6148  * the replay code before we can safely let journaled blocks be written
6149  * to their normal locations.
6150  *
6151  * The tests for journal_uses_fua below are mildly hacky.  Since the journal
6152  * and the file system are both on the same device, I'm leveraging what
6153  * the journal has decided about FUA.
6154  */
/*
 * Arguments handed to hfs_journal_relocate_callback() through its
 * opaque pointer parameter.
 */
struct hfs_journal_relocate_args {
	struct hfsmount *hfsmp;		/* mount whose journal info block is rewritten */
	vfs_context_t context;		/* supplies the credential for the read and the context for the cache-flush ioctl */
	u_int32_t newStartBlock;	/* new journal start, in allocation blocks */
	u_int32_t newBlockCount;	/* new journal length, in allocation blocks */
};
6161
6162 static errno_t
6163 hfs_journal_relocate_callback(void *_args)
6164 {
6165         int error;
6166         struct hfs_journal_relocate_args *args = _args;
6167         struct hfsmount *hfsmp = args->hfsmp;
6168         buf_t bp;
6169         JournalInfoBlock *jibp;
6170
6171         error = buf_meta_bread(hfsmp->hfs_devvp,
6172                 hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6173                 hfsmp->blockSize, vfs_context_ucred(args->context), &bp);
6174         if (error) {
6175                 printf("hfs_journal_relocate_callback: failed to read JIB (%d)\n", error);
6176                 if (bp) {
6177                         buf_brelse(bp);
6178                 }
6179                 return error;
6180         }
6181         jibp = (JournalInfoBlock*) buf_dataptr(bp);
6182         jibp->offset = SWAP_BE64((u_int64_t)args->newStartBlock * hfsmp->blockSize);
6183         jibp->size = SWAP_BE64((u_int64_t)args->newBlockCount * hfsmp->blockSize);
6184         if (journal_uses_fua(hfsmp->jnl))
6185                 buf_markfua(bp);
6186         error = buf_bwrite(bp);
6187         if (error) {
6188                 printf("hfs_journal_relocate_callback: failed to write JIB (%d)\n", error);
6189                 return error;
6190         }
6191         if (!journal_uses_fua(hfsmp->jnl)) {
6192                 error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, args->context);
6193                 if (error) {
6194                         printf("hfs_journal_relocate_callback: DKIOCSYNCHRONIZECACHE failed (%d)\n", error);
6195                         error = 0;              /* Don't fail the operation. */
6196                 }
6197         }
6198
6199         return error;
6200 }
6201
6202
/*
 * Type of resize operation in progress.  Passed as the resize_type
 * argument of hfs_relocate_journal_file().
 */
#define HFS_RESIZE_TRUNCATE	1
#define HFS_RESIZE_EXTEND	2
6206
6207 /*
6208  * Core function to relocate the journal file.  This function takes the
6209  * journal size of the newly relocated journal --- the caller can
6210  * provide a new journal size if they want to change the size of
6211  * the journal.  The function takes care of updating the journal info
6212  * block and all other data structures correctly.
6213  *
6214  * Note: This function starts a transaction and grabs the btree locks.
6215  */
6216 static int
6217 hfs_relocate_journal_file(struct hfsmount *hfsmp, u_int32_t jnl_size, int resize_type, vfs_context_t context)
6218 {
6219         int error;
6220         int journal_err;
6221         int lockflags;
6222         u_int32_t oldStartBlock;
6223         u_int32_t newStartBlock;
6224         u_int32_t oldBlockCount;
6225         u_int32_t newBlockCount;
6226         u_int32_t jnlBlockCount;
6227         u_int32_t alloc_skipfreeblks;
6228         struct cat_desc journal_desc;
6229         struct cat_attr journal_attr;
6230         struct cat_fork journal_fork;
6231         struct hfs_journal_relocate_args callback_args;
6232
6233         /* Calculate the number of allocation blocks required for the journal */
6234         jnlBlockCount = howmany(jnl_size, hfsmp->blockSize);
6235
6236         /*
6237          * During truncatefs(), the volume free block count is updated
6238          * before relocating data and reflects the total number of free
6239          * blocks that will exist on volume after the resize is successful.
6240          * This means that the allocation blocks required for relocation
6241          * have already been reserved and accounted for in the free block
6242          * count.  Therefore, block allocation and deallocation routines
6243          * can skip the free block check by passing HFS_ALLOC_SKIPFREEBLKS
6244          * flag.
6245          *
6246          * This special handling is not required when the file system
6247          * is being extended as we want all the allocated and deallocated
6248          * blocks to be accounted for correctly.
6249          */
6250         if (resize_type == HFS_RESIZE_TRUNCATE) {
6251                 alloc_skipfreeblks = HFS_ALLOC_SKIPFREEBLKS;
6252         } else {
6253                 alloc_skipfreeblks = 0;
6254         }
6255
6256         error = hfs_start_transaction(hfsmp);
6257         if (error) {
6258                 printf("hfs_relocate_journal_file: hfs_start_transaction returned %d\n", error);
6259                 return error;
6260         }
6261         lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
6262
6263         error = BlockAllocate(hfsmp, 1, jnlBlockCount, jnlBlockCount,
6264                         HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | alloc_skipfreeblks,
6265                          &newStartBlock, &newBlockCount);
6266         if (error) {
6267                 printf("hfs_relocate_journal_file: BlockAllocate returned %d\n", error);
6268                 goto fail;
6269         }
6270         if (newBlockCount != jnlBlockCount) {
6271                 printf("hfs_relocate_journal_file: newBlockCount != jnlBlockCount (%u, %u)\n", newBlockCount, jnlBlockCount);
6272                 goto free_fail;
6273         }
6274
6275         error = cat_idlookup(hfsmp, hfsmp->hfs_jnlfileid, 1, 0, &journal_desc, &journal_attr, &journal_fork);
6276         if (error) {
6277                 printf("hfs_relocate_journal_file: cat_idlookup returned %d\n", error);
6278                 goto free_fail;
6279         }
6280
6281         oldStartBlock = journal_fork.cf_extents[0].startBlock;
6282         oldBlockCount = journal_fork.cf_extents[0].blockCount;
6283         error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, alloc_skipfreeblks);
6284         if (error) {
6285                 printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error);
6286                 goto free_fail;
6287         }
6288
6289         /* Update the catalog record for .journal */
6290         journal_fork.cf_size = newBlockCount * hfsmp->blockSize;
6291         journal_fork.cf_extents[0].startBlock = newStartBlock;
6292         journal_fork.cf_extents[0].blockCount = newBlockCount;
6293         journal_fork.cf_blocks = newBlockCount;
6294         error = cat_update(hfsmp, &journal_desc, &journal_attr, &journal_fork, NULL);
6295         cat_releasedesc(&journal_desc);  /* all done with cat descriptor */
6296         if (error) {
6297                 printf("hfs_relocate_journal_file: cat_update returned %d\n", error);
6298                 goto free_fail;
6299         }
6300
6301         /*
6302          * If the journal is part of the file system, then tell the journal
6303          * code about the new location.  If the journal is on an external
6304          * device, then just keep using it as-is.
6305          */
6306         if (hfsmp->jvp == hfsmp->hfs_devvp) {
6307                 callback_args.hfsmp = hfsmp;
6308                 callback_args.context = context;
6309                 callback_args.newStartBlock = newStartBlock;
6310                 callback_args.newBlockCount = newBlockCount;
6311
6312                 error = journal_relocate(hfsmp->jnl, (off_t)newStartBlock*hfsmp->blockSize,
6313                         (off_t)newBlockCount*hfsmp->blockSize, 0,
6314                         hfs_journal_relocate_callback, &callback_args);
6315                 if (error) {
6316                         /* NOTE: journal_relocate will mark the journal invalid. */
6317                         printf("hfs_relocate_journal_file: journal_relocate returned %d\n", error);
6318                         goto fail;
6319                 }
6320                 if (hfs_resize_debug) {
6321                         printf ("hfs_relocate_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
6322                 }
6323                 hfsmp->jnl_start = newStartBlock;
6324                 hfsmp->jnl_size = (off_t)newBlockCount * hfsmp->blockSize;
6325         }
6326
6327         hfs_systemfile_unlock(hfsmp, lockflags);
6328         error = hfs_end_transaction(hfsmp);
6329         if (error) {
6330                 printf("hfs_relocate_journal_file: hfs_end_transaction returned %d\n", error);
6331         }
6332
6333         return error;
6334
6335 free_fail:
6336         journal_err = BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
6337         if (journal_err) {
6338                 printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error);
6339                 hfs_mark_volume_inconsistent(hfsmp);
6340         }
6341 fail:
6342         hfs_systemfile_unlock(hfsmp, lockflags);
6343         (void) hfs_end_transaction(hfsmp);
6344         if (hfs_resize_debug) {
6345                 printf ("hfs_relocate_journal_file: Error relocating journal file (error=%d)\n", error);
6346         }
6347         return error;
6348 }
6349
6350
6351 /*
6352  * Relocate the journal file when the file system is being truncated.
6353  * We do not down-size the journal when the file system size is
6354  * reduced, so we always provide the current journal size to the
6355  * relocate code.
6356  */
6357 static int
6358 hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6359 {
6360         int error = 0;
6361         u_int32_t startBlock;
6362         u_int32_t blockCount = hfsmp->jnl_size / hfsmp->blockSize;
6363
6364         /*
6365          * Figure out the location of the .journal file.  When the journal
6366          * is on an external device, we need to look up the .journal file.
6367          */
6368         if (hfsmp->jvp == hfsmp->hfs_devvp) {
6369                 startBlock = hfsmp->jnl_start;
6370                 blockCount = hfsmp->jnl_size / hfsmp->blockSize;
6371         } else {
6372                 u_int32_t fileid;
6373                 u_int32_t old_jnlfileid;
6374                 struct cat_attr attr;
6375                 struct cat_fork fork;
6376
6377                 /*
6378                  * The cat_lookup inside GetFileInfo will fail because hfs_jnlfileid
6379                  * is set, and it is trying to hide the .journal file.  So temporarily
6380                  * unset the field while calling GetFileInfo.
6381                  */
6382                 old_jnlfileid = hfsmp->hfs_jnlfileid;
6383                 hfsmp->hfs_jnlfileid = 0;
6384                 fileid = GetFileInfo(hfsmp, kHFSRootFolderID, ".journal", &attr, &fork);
6385                 hfsmp->hfs_jnlfileid = old_jnlfileid;
6386                 if (fileid != old_jnlfileid) {
6387                         printf("hfs_reclaim_journal_file: cannot find .journal file!\n");
6388                         return EIO;
6389                 }
6390
6391                 startBlock = fork.cf_extents[0].startBlock;
6392                 blockCount = fork.cf_extents[0].blockCount;
6393         }
6394
6395         if (startBlock + blockCount <= allocLimit) {
6396                 /* The journal file does not require relocation */
6397                 return 0;
6398         }
6399
6400         error = hfs_relocate_journal_file(hfsmp, blockCount * hfsmp->blockSize, HFS_RESIZE_TRUNCATE, context);
6401         if (error == 0) {
6402                 hfsmp->hfs_resize_blocksmoved += blockCount;
6403                 hfs_truncatefs_progress(hfsmp);
6404                 printf ("hfs_reclaim_journal_file: Relocated %u blocks from journal on \"%s\"\n",
6405                                 blockCount, hfsmp->vcbVN);
6406         }
6407
6408         return error;
6409 }
6410
6411
6412 /*
6413  * Move the journal info block to a new location.  We have to make sure the
6414  * new copy of the journal info block gets to the media first, then change
6415  * the field in the volume header and the catalog record.
6416  */
6417 static int
6418 hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6419 {
6420         int error;
6421         int journal_err;
6422         int lockflags;
6423         u_int32_t oldBlock;
6424         u_int32_t newBlock;
6425         u_int32_t blockCount;
6426         struct cat_desc jib_desc;
6427         struct cat_attr jib_attr;
6428         struct cat_fork jib_fork;
6429         buf_t old_bp, new_bp;
6430
6431         if (hfsmp->vcbJinfoBlock <= allocLimit) {
6432                 /* The journal info block does not require relocation */
6433                 return 0;
6434         }
6435
6436         error = hfs_start_transaction(hfsmp);
6437         if (error) {
6438                 printf("hfs_reclaim_journal_info_block: hfs_start_transaction returned %d\n", error);
6439                 return error;
6440         }
6441         lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
6442
6443         error = BlockAllocate(hfsmp, 1, 1, 1,
6444                         HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS,
6445                         &newBlock, &blockCount);
6446         if (error) {
6447                 printf("hfs_reclaim_journal_info_block: BlockAllocate returned %d\n", error);
6448                 goto fail;
6449         }
6450         if (blockCount != 1) {
6451                 printf("hfs_reclaim_journal_info_block: blockCount != 1 (%u)\n", blockCount);
6452                 goto free_fail;
6453         }
6454         error = BlockDeallocate(hfsmp, hfsmp->vcbJinfoBlock, 1, HFS_ALLOC_SKIPFREEBLKS);
6455         if (error) {
6456                 printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error);
6457                 goto free_fail;
6458         }
6459
6460         /* Copy the old journal info block content to the new location */
6461         error = buf_meta_bread(hfsmp->hfs_devvp,
6462                 hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6463                 hfsmp->blockSize, vfs_context_ucred(context), &old_bp);
6464         if (error) {
6465                 printf("hfs_reclaim_journal_info_block: failed to read JIB (%d)\n", error);
6466                 if (old_bp) {
6467                         buf_brelse(old_bp);
6468                 }
6469                 goto free_fail;
6470         }
6471         new_bp = buf_getblk(hfsmp->hfs_devvp,
6472                 newBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6473                 hfsmp->blockSize, 0, 0, BLK_META);
6474         bcopy((char*)buf_dataptr(old_bp), (char*)buf_dataptr(new_bp), hfsmp->blockSize);
6475         buf_brelse(old_bp);
6476         if (journal_uses_fua(hfsmp->jnl))
6477                 buf_markfua(new_bp);
6478         error = buf_bwrite(new_bp);
6479         if (error) {
6480                 printf("hfs_reclaim_journal_info_block: failed to write new JIB (%d)\n", error);
6481                 goto free_fail;
6482         }
6483         if (!journal_uses_fua(hfsmp->jnl)) {
6484                 error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
6485                 if (error) {
6486                         printf("hfs_reclaim_journal_info_block: DKIOCSYNCHRONIZECACHE failed (%d)\n", error);
6487                         /* Don't fail the operation. */
6488                 }
6489         }
6490
6491         /* Update the catalog record for .journal_info_block */
6492         error = cat_idlookup(hfsmp, hfsmp->hfs_jnlinfoblkid, 1, 0, &jib_desc, &jib_attr, &jib_fork);
6493         if (error) {
6494                 printf("hfs_reclaim_journal_info_block: cat_idlookup returned %d\n", error);
6495                 goto fail;
6496         }
6497         oldBlock = jib_fork.cf_extents[0].startBlock;
6498         jib_fork.cf_size = hfsmp->blockSize;
6499         jib_fork.cf_extents[0].startBlock = newBlock;
6500         jib_fork.cf_extents[0].blockCount = 1;
6501         jib_fork.cf_blocks = 1;
6502         error = cat_update(hfsmp, &jib_desc, &jib_attr, &jib_fork, NULL);
6503         cat_releasedesc(&jib_desc);  /* all done with cat descriptor */
6504         if (error) {
6505                 printf("hfs_reclaim_journal_info_block: cat_update returned %d\n", error);
6506                 goto fail;
6507         }
6508
6509         /* Update the pointer to the journal info block in the volume header. */
6510         hfsmp->vcbJinfoBlock = newBlock;
6511         error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
6512         if (error) {
6513                 printf("hfs_reclaim_journal_info_block: hfs_flushvolumeheader returned %d\n", error);
6514                 goto fail;
6515         }
6516         hfs_systemfile_unlock(hfsmp, lockflags);
6517         error = hfs_end_transaction(hfsmp);
6518         if (error) {
6519                 printf("hfs_reclaim_journal_info_block: hfs_end_transaction returned %d\n", error);
6520         }
6521         error = hfs_journal_flush(hfsmp, FALSE);
6522         if (error) {
6523                 printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error);
6524         }
6525
6526         /* Account for the block relocated and print progress */
6527         hfsmp->hfs_resize_blocksmoved += 1;
6528         hfs_truncatefs_progress(hfsmp);
6529         if (!error) {
6530                 printf ("hfs_reclaim_journal_info: Relocated 1 block from journal info on \"%s\"\n",
6531                                 hfsmp->vcbVN);
6532                 if (hfs_resize_debug) {
6533                         printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount);
6534                 }
6535         }
6536         return error;
6537
6538 free_fail:
6539         journal_err = BlockDeallocate(hfsmp, newBlock, blockCount, HFS_ALLOC_SKIPFREEBLKS);
6540         if (journal_err) {
6541                 printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error);
6542                 hfs_mark_volume_inconsistent(hfsmp);
6543         }
6544
6545 fail:
6546         hfs_systemfile_unlock(hfsmp, lockflags);
6547         (void) hfs_end_transaction(hfsmp);
6548         if (hfs_resize_debug) {
6549                 printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error);
6550         }
6551         return error;
6552 }
6553
6554
6555 static u_int64_t
6556 calculate_journal_size(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count)
6557 {
6558         u_int64_t journal_size;
6559         u_int32_t journal_scale;
6560
6561 #define DEFAULT_JOURNAL_SIZE (8*1024*1024)
6562 #define MAX_JOURNAL_SIZE     (512*1024*1024)
6563
6564         /* Calculate the journal size for this volume.   We want
6565          * at least 8 MB of journal for each 100 GB of disk space.
6566          * We cap the size at 512 MB, unless the allocation block
6567          * size is larger, in which case, we use one allocation
6568          * block.
6569          */
6570         journal_scale = (sector_size * sector_count) / ((u_int64_t)100 * 1024 * 1024 * 1024);
6571         journal_size = DEFAULT_JOURNAL_SIZE * (journal_scale + 1);
6572         if (journal_size > MAX_JOURNAL_SIZE) {
6573                 journal_size = MAX_JOURNAL_SIZE;
6574         }
6575         if (journal_size < hfsmp->blockSize) {
6576                 journal_size = hfsmp->blockSize;
6577         }
6578         return journal_size;
6579 }
6580
6581
6582 /*
6583  * Calculate the expected journal size based on current partition size.
6584  * If the size of the current journal is less than the calculated size,
6585  * force journal relocation with the new journal size.
6586  */
6587 static int
6588 hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context)
6589 {
6590         int error = 0;
6591         u_int64_t calc_journal_size;
6592
6593         if (hfsmp->jvp != hfsmp->hfs_devvp) {
6594                 if (hfs_resize_debug) {
6595                         printf("hfs_extend_journal: not resizing the journal because it is on an external device.\n");
6596                 }
6597                 return 0;
6598         }
6599
6600         calc_journal_size = calculate_journal_size(hfsmp, sector_size, sector_count);
6601         if (calc_journal_size <= hfsmp->jnl_size) {
6602                 /* The journal size requires no modification */
6603                 goto out;
6604         }
6605
6606         if (hfs_resize_debug) {
6607                 printf ("hfs_extend_journal: journal old=%u, new=%qd\n", hfsmp->jnl_size, calc_journal_size);
6608         }
6609
6610         /* Extend the journal to the new calculated size */
6611         error = hfs_relocate_journal_file(hfsmp, calc_journal_size, HFS_RESIZE_EXTEND, context);
6612         if (error == 0) {
6613                 printf ("hfs_extend_journal: Extended journal size to %u bytes on \"%s\"\n",
6614                                 hfsmp->jnl_size, hfsmp->vcbVN);
6615         }
6616 out:
6617         return error;
6618 }
6619
6620
6621 /*
6622  * This function traverses through all extended attribute records for a given
6623  * fileID, and calls function that reclaims data blocks that exist in the
6624  * area of the disk being reclaimed which in turn is responsible for allocating
 * new space, copying extent data, deallocating old space, and if required,
6626  * splitting the extent.
6627  *
6628  * Note: The caller has already acquired the cnode lock on the file.  Therefore
6629  * we are assured that no other thread would be creating/deleting/modifying
6630  * extended attributes for this file.
6631  *
6632  * Side Effects:
6633  * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
6634  * blocks that were relocated.
6635  *
6636  * Returns:
6637  *      0 on success, non-zero on failure.
6638  */
6639 static int
6640 hfs_reclaim_xattr(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, u_int32_t allocLimit, vfs_context_t context)
6641 {
6642         int error = 0;
6643         struct hfs_reclaim_extent_info *extent_info;
6644         int i;
6645         HFSPlusAttrKey *key;
6646         int *lockflags;
6647
6648         if (hfs_resize_debug) {
6649                 printf("hfs_reclaim_xattr: === Start reclaiming xattr for id=%u ===\n", fileID);
6650         }
6651
6652         MALLOC(extent_info, struct hfs_reclaim_extent_info *,
6653                sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
6654         if (extent_info == NULL) {
6655                 return ENOMEM;
6656         }
6657         bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
6658         extent_info->vp = vp;
6659         extent_info->fileID = fileID;
6660         extent_info->is_xattr = true;
6661         extent_info->is_sysfile = vnode_issystem(vp);
6662         extent_info->fcb = VTOF(hfsmp->hfs_attribute_vp);
6663         lockflags = &(extent_info->lockflags);
6664         *lockflags = SFL_ATTRIBUTE | SFL_BITMAP;
6665
6666         /* Initialize iterator from the extent_info structure */
6667         MALLOC(extent_info->iterator, struct BTreeIterator *,
6668                sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
6669         if (extent_info->iterator == NULL) {
6670                 error = ENOMEM;
6671                 goto out;
6672         }
6673         bzero(extent_info->iterator, sizeof(struct BTreeIterator));
6674
6675         /* Build attribute key */
6676         key = (HFSPlusAttrKey *)&(extent_info->iterator->key);
6677         error = hfs_buildattrkey(fileID, NULL, key);
6678         if (error) {
6679                 goto out;
6680         }
6681
6682         /* Initialize btdata from extent_info structure.  Note that the
6683          * buffer pointer actually points to the xattr record from the
6684          * extent_info structure itself.
6685          */
6686         extent_info->btdata.bufferAddress = &(extent_info->record.xattr);
6687         extent_info->btdata.itemSize = sizeof(HFSPlusAttrRecord);
6688         extent_info->btdata.itemCount = 1;
6689
6690         /*
6691          * Sync all extent-based attribute data to the disk.
6692          *
6693          * All extent-based attribute data I/O is performed via cluster
6694          * I/O using a virtual file that spans across entire file system
6695          * space.
6696          */
6697         hfs_lock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_EXCLUSIVE_LOCK);
6698         (void)cluster_push(hfsmp->hfs_attrdata_vp, 0);
6699         error = vnode_waitforwrites(hfsmp->hfs_attrdata_vp, 0, 0, 0, "hfs_reclaim_xattr");
6700         hfs_unlock_truncate(VTOC(hfsmp->hfs_attrdata_vp), 0);
6701         if (error) {
6702                 goto out;
6703         }
6704
6705         /* Search for extended attribute for current file.  This
6706          * will place the iterator before the first matching record.
6707          */
6708         *lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
6709         error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
6710                         &(extent_info->btdata), &(extent_info->recordlen),
6711                         extent_info->iterator);
6712         hfs_systemfile_unlock(hfsmp, *lockflags);
6713         if (error) {
6714                 if (error != btNotFound) {
6715                         goto out;
6716                 }
6717                 /* btNotFound is expected here, so just mask it */
6718                 error = 0;
6719         }
6720
6721         while (1) {
6722                 /* Iterate to the next record */
6723                 *lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
6724                 error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
6725                                 extent_info->iterator, &(extent_info->btdata),
6726                                 &(extent_info->recordlen));
6727                 hfs_systemfile_unlock(hfsmp, *lockflags);
6728
6729                 /* Stop the iteration if we encounter end of btree or xattr with different fileID */
6730                 if (error || key->fileID != fileID) {
6731                         if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
6732                                 error = 0;
6733                         }
6734                         break;
6735                 }
6736
6737                 /* We only care about extent-based EAs */
6738                 if ((extent_info->record.xattr.recordType != kHFSPlusAttrForkData) &&
6739                     (extent_info->record.xattr.recordType != kHFSPlusAttrExtents)) {
6740                         continue;
6741                 }
6742
6743                 if (extent_info->record.xattr.recordType == kHFSPlusAttrForkData) {
6744                         extent_info->overflow_count = 0;
6745                         extent_info->extents = extent_info->record.xattr.forkData.theFork.extents;
6746                 } else if (extent_info->record.xattr.recordType == kHFSPlusAttrExtents) {
6747                         extent_info->overflow_count++;
6748                         extent_info->extents = extent_info->record.xattr.overflowExtents.extents;
6749                 }
6750
6751                 extent_info->recStartBlock = key->startBlock;
6752                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
6753                         if (extent_info->extents[i].blockCount == 0) {
6754                                 break;
6755                         }
6756                         extent_info->extent_index = i;
6757                         error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
6758                         if (error) {
6759                                 printf ("hfs_reclaim_xattr: fileID=%u hfs_reclaim_extent error=%d\n", fileID, error);
6760                                 goto out;
6761                         }
6762                 }
6763         }
6764
6765 out:
6766         /* If any blocks were relocated, account them and report progress */
6767         if (extent_info->blocks_relocated) {
6768                 hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated;
6769                 hfs_truncatefs_progress(hfsmp);
6770         }
6771         if (extent_info->iterator) {
6772                 FREE(extent_info->iterator, M_TEMP);
6773         }
6774         if (extent_info) {
6775                 FREE(extent_info, M_TEMP);
6776         }
6777         if (hfs_resize_debug) {
6778                 printf("hfs_reclaim_xattr: === Finished relocating xattr for fileid=%u (error=%d) ===\n", fileID, error);
6779         }
6780         return error;
6781 }
6782
6783 /*
6784  * Reclaim any extent-based extended attributes allocation blocks from
6785  * the area of the disk that is being truncated.
6786  *
6787  * The function traverses the attribute btree to find out the fileIDs
6788  * of the extended attributes that need to be relocated.  For every
6789  * file whose large EA requires relocation, it looks up the cnode and
6790  * calls hfs_reclaim_xattr() to do all the work for allocating
6791  * new space, copying data, deallocating old space, and if required,
6792  * splitting the extents.
6793  *
6794  * Inputs:
6795  *      allocLimit    - starting block of the area being reclaimed
6796  *
6797  * Returns:
6798  *      returns 0 on success, non-zero on failure.
6799  */
6800 static int
6801 hfs_reclaim_xattrspace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6802 {
6803         int error = 0;
6804         FCB *fcb;
6805         struct BTreeIterator *iterator = NULL;
6806         struct FSBufferDescriptor btdata;
6807         HFSPlusAttrKey *key;
6808         HFSPlusAttrRecord rec;
6809         int lockflags = 0;
6810         cnid_t prev_fileid = 0;
6811         struct vnode *vp;
6812         int need_relocate;
6813         int btree_operation;
6814         u_int32_t files_moved = 0;
6815         u_int32_t prev_blocksmoved;
6816         int i;
6817
6818         fcb = VTOF(hfsmp->hfs_attribute_vp);
6819         /* Store the value to print total blocks moved by this function in end */
6820         prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;
6821
6822         if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
6823                 return ENOMEM;
6824         }
6825         bzero(iterator, sizeof(*iterator));
6826         key = (HFSPlusAttrKey *)&iterator->key;
6827         btdata.bufferAddress = &rec;
6828         btdata.itemSize = sizeof(rec);
6829         btdata.itemCount = 1;
6830
6831         need_relocate = false;
6832         btree_operation = kBTreeFirstRecord;
6833         /* Traverse the attribute btree to find extent-based EAs to reclaim */
6834         while (1) {
6835                 lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK);
6836                 error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
6837                 hfs_systemfile_unlock(hfsmp, lockflags);
6838                 if (error) {
6839                         if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
6840                                 error = 0;
6841                         }
6842                         break;
6843                 }
6844                 btree_operation = kBTreeNextRecord;
6845
6846                 /* If the extents of current fileID were already relocated, skip it */
6847                 if (prev_fileid == key->fileID) {
6848                         continue;
6849                 }
6850
6851                 /* Check if any of the extents in the current record need to be relocated */
6852                 need_relocate = false;
6853                 switch(rec.recordType) {
6854                         case kHFSPlusAttrForkData:
6855                                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
6856                                         if (rec.forkData.theFork.extents[i].blockCount == 0) {
6857                                                 break;
6858                                         }
6859                                         if ((rec.forkData.theFork.extents[i].startBlock +
6860                                              rec.forkData.theFork.extents[i].blockCount) > allocLimit) {
6861                                                 need_relocate = true;
6862                                                 break;
6863                                         }
6864                                 }
6865                                 break;
6866
6867                         case kHFSPlusAttrExtents:
6868                                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
6869                                         if (rec.overflowExtents.extents[i].blockCount == 0) {
6870                                                 break;
6871                                         }
6872                                         if ((rec.overflowExtents.extents[i].startBlock +
6873                                              rec.overflowExtents.extents[i].blockCount) > allocLimit) {
6874                                                 need_relocate = true;
6875                                                 break;
6876                                         }
6877                                 }
6878                                 break;
6879                 };
6880
6881                 /* Continue iterating to next attribute record */
6882                 if (need_relocate == false) {
6883                         continue;
6884                 }
6885
6886                 /* Look up the vnode for corresponding file.  The cnode
6887                  * will be locked which will ensure that no one modifies
6888                  * the xattrs when we are relocating them.
6889                  *
6890                  * We want to allow open-unlinked files to be moved,
6891                  * so provide allow_deleted == 1 for hfs_vget().
6892                  */
6893                 if (hfs_vget(hfsmp, key->fileID, &vp, 0, 1) != 0) {
6894                         continue;
6895                 }
6896
6897                 error = hfs_reclaim_xattr(hfsmp, vp, key->fileID, allocLimit, context);
6898                 hfs_unlock(VTOC(vp));
6899                 vnode_put(vp);
6900                 if (error) {
6901                         printf ("hfs_reclaim_xattrspace: Error relocating xattrs for fileid=%u (error=%d)\n", key->fileID, error);
6902                         break;
6903                 }
6904                 prev_fileid = key->fileID;
6905                 files_moved++;
6906         }
6907
6908         if (files_moved) {
6909                 printf("hfs_reclaim_xattrspace: Relocated %u xattr blocks from %u files on \"%s\"\n",
6910                                 (hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
6911                                 files_moved, hfsmp->vcbVN);
6912         }
6913
6914         kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
6915         return error;
6916 }
6917
6918 /*
6919  * Reclaim blocks from regular files.
6920  *
6921  * This function iterates over all the record in catalog btree looking
6922  * for files with extents that overlap into the space we're trying to
6923  * free up.  If a file extent requires relocation, it looks up the vnode
6924  * and calls function to relocate the data.
6925  *
6926  * Returns:
6927  *      Zero on success, non-zero on failure.
6928  */
6929 static int
6930 hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6931 {
6932         int error;
6933         FCB *fcb;
6934         struct BTreeIterator *iterator = NULL;
6935         struct FSBufferDescriptor btdata;
6936         int btree_operation;
6937         int lockflags;
6938         struct HFSPlusCatalogFile filerec;
6939         struct vnode *vp;
6940         struct vnode *rvp;
6941         struct filefork *datafork;
6942         u_int32_t files_moved = 0;
6943         u_int32_t prev_blocksmoved;
6944
6945         fcb = VTOF(hfsmp->hfs_catalog_vp);
6946         /* Store the value to print total blocks moved by this function at the end */
6947         prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;
6948
6949         if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
6950                 error = ENOMEM;
6951                 goto reclaim_filespace_done;
6952         }
6953
6954 #if CONFIG_PROTECT
6955         int keys_generated = 0;
6956         /*
6957          * For content-protected filesystems, we may need to relocate files that
6958          * are encrypted.  If they use the new-style offset-based IVs, then
6959          * we can move them regardless of the lock state.  We create a temporary
6960          * key here that we use to read/write the data, then we discard it at the
6961          * end of the function.
6962          */
6963         if (cp_fs_protected (hfsmp->hfs_mp)) {
6964                 error = cp_entry_gentempkeys(&hfsmp->hfs_resize_cpentry, hfsmp);
6965                 if (error) {
6966                         printf("hfs_reclaimspace: Error generating temporary keys for resize (%d)\n", error);
6967                         goto reclaim_filespace_done;
6968                 }
6969         }
6970 #endif
6971
6972         bzero(iterator, sizeof(*iterator));
6973
6974         btdata.bufferAddress = &filerec;
6975         btdata.itemSize = sizeof(filerec);
6976         btdata.itemCount = 1;
6977
6978         btree_operation = kBTreeFirstRecord;
6979         while (1) {
6980                 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
6981                 error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
6982                 hfs_systemfile_unlock(hfsmp, lockflags);
6983                 if (error) {
6984                         if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
6985                                 error = 0;
6986                         }
6987                         break;
6988                 }
6989                 btree_operation = kBTreeNextRecord;
6990
6991                 if (filerec.recordType != kHFSPlusFileRecord) {
6992                         continue;
6993                 }
6994
6995                 /* Check if any of the extents require relocation */
6996                 if (hfs_file_extent_overlaps(hfsmp, allocLimit, &filerec) == false) {
6997                         continue;
6998                 }
6999
7000                 /* We want to allow open-unlinked files to be moved, so allow_deleted == 1 */
7001                 if (hfs_vget(hfsmp, filerec.fileID, &vp, 0, 1) != 0) {
7002                         if (hfs_resize_debug) {
7003                                 printf("hfs_reclaim_filespace: hfs_vget(%u) failed.\n", filerec.fileID);
7004                         }
7005                         continue;
7006                 }
7007
7008                 /* If data fork exists or item is a directory hard link, relocate blocks */
7009                 datafork = VTOF(vp);
7010                 if ((datafork && datafork->ff_blocks > 0) || vnode_isdir(vp)) {
7011                         error = hfs_reclaim_file(hfsmp, vp, filerec.fileID,
7012                                         kHFSDataForkType, allocLimit, context);
7013                         if (error)  {
7014                                 printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
7015                                 hfs_unlock(VTOC(vp));
7016                                 vnode_put(vp);
7017                                 break;
7018                         }
7019                 }
7020
7021                 /* If resource fork exists or item is a directory hard link, relocate blocks */
7022                 if (((VTOC(vp)->c_blocks - (datafork ? datafork->ff_blocks : 0)) > 0) || vnode_isdir(vp)) {
7023                         if (vnode_isdir(vp)) {
7024                                 /* Resource fork vnode lookup is invalid for directory hard link.
7025                                  * So we fake data fork vnode as resource fork vnode.
7026                                  */
7027                                 rvp = vp;
7028                         } else {
7029                                 error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE);
7030                                 if (error) {
7031                                         printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", filerec.fileID, error);
7032                                         hfs_unlock(VTOC(vp));
7033                                         vnode_put(vp);
7034                                         break;
7035                                 }
7036                                 VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT;
7037                         }
7038
7039                         error = hfs_reclaim_file(hfsmp, rvp, filerec.fileID,
7040                                         kHFSResourceForkType, allocLimit, context);
7041                         if (error) {
7042                                 printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
7043                                 hfs_unlock(VTOC(vp));
7044                                 vnode_put(vp);
7045                                 break;
7046                         }
7047                 }
7048
7049                 /* The file forks were relocated successfully, now drop the
7050                  * cnode lock and vnode reference, and continue iterating to
7051                  * next catalog record.
7052                  */
7053                 hfs_unlock(VTOC(vp));
7054                 vnode_put(vp);
7055                 files_moved++;
7056         }
7057
7058         if (files_moved) {
7059                 printf("hfs_reclaim_filespace: Relocated %u blocks from %u files on \"%s\"\n",
7060                                 (hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
7061                                 files_moved, hfsmp->vcbVN);
7062         }
7063
7064 reclaim_filespace_done:
7065         if (iterator) {
7066                 kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
7067         }
7068
7069 #if CONFIG_PROTECT
7070         if (keys_generated) {
7071                 cp_entry_destroy(&hfsmp->hfs_resize_cpentry);
7072         }
7073 #endif
7074         return error;
7075 }
7076
7077 /*
7078  * Reclaim space at the end of a file system.
7079  *
7080  * Inputs -
7081  *      allocLimit      - start block of the space being reclaimed
7082  *      reclaimblks     - number of allocation blocks to reclaim
7083  */
7084 static int
7085 hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context)
7086 {
7087         int error = 0;
7088
7089         /*
7090          * Preflight the bitmap to find out total number of blocks that need
7091          * relocation.
7092          *
7093          * Note: Since allocLimit is set to the location of new alternate volume
7094          * header, the check below does not account for blocks allocated for old
7095          * alternate volume header.
7096          */
7097         error = hfs_count_allocated(hfsmp, allocLimit, reclaimblks, &(hfsmp->hfs_resize_totalblocks));
7098         if (error) {
7099                 printf ("hfs_reclaimspace: Unable to determine total blocks to reclaim error=%d\n", error);
7100                 return error;
7101         }
7102         if (hfs_resize_debug) {
7103                 printf ("hfs_reclaimspace: Total number of blocks to reclaim = %u\n", hfsmp->hfs_resize_totalblocks);
7104         }
7105
7106         /* Just to be safe, sync the content of the journal to the disk before we proceed */
7107         hfs_journal_flush(hfsmp, TRUE);
7108
7109         /* First, relocate journal file blocks if they're in the way.
7110          * Doing this first will make sure that journal relocate code
7111          * gets access to contiguous blocks on disk first.  The journal
7112          * file has to be contiguous on the disk, otherwise resize will
7113          * fail.
7114          */
7115         error = hfs_reclaim_journal_file(hfsmp, allocLimit, context);
7116         if (error) {
7117                 printf("hfs_reclaimspace: hfs_reclaim_journal_file failed (%d)\n", error);
7118                 return error;
7119         }
7120
7121         /* Relocate journal info block blocks if they're in the way. */
7122         error = hfs_reclaim_journal_info_block(hfsmp, allocLimit, context);
7123         if (error) {
7124                 printf("hfs_reclaimspace: hfs_reclaim_journal_info_block failed (%d)\n", error);
7125                 return error;
7126         }
7127
7128         /* Relocate extents of the Extents B-tree if they're in the way.
7129          * Relocating extents btree before other btrees is important as
7130          * this will provide access to largest contiguous block range on
7131          * the disk for relocating extents btree.  Note that extents btree
7132          * can only have maximum of 8 extents.
7133          */
7134         error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, kHFSExtentsFileID,
7135                         kHFSDataForkType, allocLimit, context);
7136         if (error) {
7137                 printf("hfs_reclaimspace: reclaim extents b-tree returned %d\n", error);
7138                 return error;
7139         }
7140
7141         /* Relocate extents of the Allocation file if they're in the way. */
7142         error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, kHFSAllocationFileID,
7143                         kHFSDataForkType, allocLimit, context);
7144         if (error) {
7145                 printf("hfs_reclaimspace: reclaim allocation file returned %d\n", error);
7146                 return error;
7147         }
7148
7149         /* Relocate extents of the Catalog B-tree if they're in the way. */
7150         error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, kHFSCatalogFileID,
7151                         kHFSDataForkType, allocLimit, context);
7152         if (error) {
7153                 printf("hfs_reclaimspace: reclaim catalog b-tree returned %d\n", error);
7154                 return error;
7155         }
7156
7157         /* Relocate extents of the Attributes B-tree if they're in the way. */
7158         error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, kHFSAttributesFileID,
7159                         kHFSDataForkType, allocLimit, context);
7160         if (error) {
7161                 printf("hfs_reclaimspace: reclaim attribute b-tree returned %d\n", error);
7162                 return error;
7163         }
7164
7165         /* Relocate extents of the Startup File if there is one and they're in the way. */
7166         error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, kHFSStartupFileID,
7167                         kHFSDataForkType, allocLimit, context);
7168         if (error) {
7169                 printf("hfs_reclaimspace: reclaim startup file returned %d\n", error);
7170                 return error;
7171         }
7172
7173         /*
7174          * We need to make sure the alternate volume header gets flushed if we moved
7175          * any extents in the volume header.  But we need to do that before
7176          * shrinking the size of the volume, or else the journal code will panic
7177          * with an invalid (too large) block number.
7178          *
7179          * Note that blks_moved will be set if ANY extent was moved, even
7180          * if it was just an overflow extent.  In this case, the journal_flush isn't
7181          * strictly required, but shouldn't hurt.
7182          */
7183         if (hfsmp->hfs_resize_blocksmoved) {
7184                 hfs_journal_flush(hfsmp, TRUE);
7185         }
7186
7187         /* Reclaim extents from catalog file records */
7188         error = hfs_reclaim_filespace(hfsmp, allocLimit, context);
7189         if (error) {
7190                 printf ("hfs_reclaimspace: hfs_reclaim_filespace returned error=%d\n", error);
7191                 return error;
7192         }
7193
7194         /* Reclaim extents from extent-based extended attributes, if any */
7195         error = hfs_reclaim_xattrspace(hfsmp, allocLimit, context);
7196         if (error) {
7197                 printf ("hfs_reclaimspace: hfs_reclaim_xattrspace returned error=%d\n", error);
7198                 return error;
7199         }
7200
7201         return error;
7202 }
7203
7204
7205 /*
7206  * Check if there are any extents (including overflow extents) that overlap
7207  * into the disk space that is being reclaimed.
7208  *
7209  * Output -
7210  *      true  - One of the extents need to be relocated
7211  *      false - No overflow extents need to be relocated, or there was an error
7212  */
7213 static int
7214 hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec)
7215 {
7216         struct BTreeIterator * iterator = NULL;
7217         struct FSBufferDescriptor btdata;
7218         HFSPlusExtentRecord extrec;
7219         HFSPlusExtentKey *extkeyptr;
7220         FCB *fcb;
7221         int overlapped = false;
7222         int i, j;
7223         int error;
7224         int lockflags = 0;
7225         u_int32_t endblock;
7226
7227         /* Check if data fork overlaps the target space */
7228         for (i = 0; i < kHFSPlusExtentDensity; ++i) {
7229                 if (filerec->dataFork.extents[i].blockCount == 0) {
7230                         break;
7231                 }
7232                 endblock = filerec->dataFork.extents[i].startBlock +
7233                         filerec->dataFork.extents[i].blockCount;
7234                 if (endblock > allocLimit) {
7235                         overlapped = true;
7236                         goto out;
7237                 }
7238         }
7239
7240         /* Check if resource fork overlaps the target space */
7241         for (j = 0; j < kHFSPlusExtentDensity; ++j) {
7242                 if (filerec->resourceFork.extents[j].blockCount == 0) {
7243                         break;
7244                 }
7245                 endblock = filerec->resourceFork.extents[j].startBlock +
7246                         filerec->resourceFork.extents[j].blockCount;
7247                 if (endblock > allocLimit) {
7248                         overlapped = true;
7249                         goto out;
7250                 }
7251         }
7252
7253         /* Return back if there are no overflow extents for this file */
7254         if ((i < kHFSPlusExtentDensity) && (j < kHFSPlusExtentDensity)) {
7255                 goto out;
7256         }
7257
7258         if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
7259                 return 0;
7260         }
7261         bzero(iterator, sizeof(*iterator));
7262         extkeyptr = (HFSPlusExtentKey *)&iterator->key;
7263         extkeyptr->keyLength = kHFSPlusExtentKeyMaximumLength;
7264         extkeyptr->forkType = 0;
7265         extkeyptr->fileID = filerec->fileID;
7266         extkeyptr->startBlock = 0;
7267
7268         btdata.bufferAddress = &extrec;
7269         btdata.itemSize = sizeof(extrec);
7270         btdata.itemCount = 1;
7271
7272         fcb = VTOF(hfsmp->hfs_extents_vp);
7273
7274         lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
7275
7276         /* This will position the iterator just before the first overflow
7277          * extent record for given fileID.  It will always return btNotFound,
7278          * so we special case the error code.
7279          */
7280         error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator);
7281         if (error && (error != btNotFound)) {
7282                 goto out;
7283         }
7284
7285         /* BTIterateRecord() might return error if the btree is empty, and
7286          * therefore we return that the extent does not overflow to the caller
7287          */
7288         error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
7289         while (error == 0) {
7290                 /* Stop when we encounter a different file. */
7291                 if (extkeyptr->fileID != filerec->fileID) {
7292                         break;
7293                 }
7294                 /* Check if any of the forks exist in the target space. */
7295                 for (i = 0; i < kHFSPlusExtentDensity; ++i) {
7296                         if (extrec[i].blockCount == 0) {
7297                                 break;
7298                         }
7299                         endblock = extrec[i].startBlock + extrec[i].blockCount;
7300                         if (endblock > allocLimit) {
7301                                 overlapped = true;
7302                                 goto out;
7303                         }
7304                 }
7305                 /* Look for more records. */
7306                 error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
7307         }
7308
7309 out:
7310         if (lockflags) {
7311                 hfs_systemfile_unlock(hfsmp, lockflags);
7312         }
7313         if (iterator) {
7314                 kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
7315         }
7316         return overlapped;
7317 }
7318
7319
7320 /*
7321  * Calculate the progress of a file system resize operation.
7322  */
7323 __private_extern__
7324 int
7325 hfs_resize_progress(struct hfsmount *hfsmp, u_int32_t *progress)
7326 {
7327         if ((hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) == 0) {
7328                 return (ENXIO);
7329         }
7330
7331         if (hfsmp->hfs_resize_totalblocks > 0) {
7332                 *progress = (u_int32_t)((hfsmp->hfs_resize_blocksmoved * 100ULL) / hfsmp->hfs_resize_totalblocks);
7333         } else {
7334                 *progress = 0;
7335         }
7336
7337         return (0);
7338 }
7339
7340
7341 /*
7342  * Creates a UUID from a unique "name" in the HFS UUID Name space.
7343  * See version 3 UUID.
7344  */
7345 static void
7346 hfs_getvoluuid(struct hfsmount *hfsmp, uuid_t result)
7347 {
7348         MD5_CTX  md5c;
7349         uint8_t  rawUUID[8];
7350
7351         ((uint32_t *)rawUUID)[0] = hfsmp->vcbFndrInfo[6];
7352         ((uint32_t *)rawUUID)[1] = hfsmp->vcbFndrInfo[7];
7353
7354         MD5Init( &md5c );
7355         MD5Update( &md5c, HFS_UUID_NAMESPACE_ID, sizeof( uuid_t ) );
7356         MD5Update( &md5c, rawUUID, sizeof (rawUUID) );
7357         MD5Final( result, &md5c );
7358
7359         result[6] = 0x30 | ( result[6] & 0x0F );
7360         result[8] = 0x80 | ( result[8] & 0x3F );
7361 }
7362
7363 /*
7364  * Get file system attributes.
7365  */
7366 static int
7367 hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context)
7368 {
7369 #define HFS_ATTR_CMN_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST))
7370 #define HFS_ATTR_FILE_VALIDMASK (ATTR_FILE_VALIDMASK & ~(ATTR_FILE_FILETYPE | ATTR_FILE_FORKCOUNT | ATTR_FILE_FORKLIST))
7371 #define HFS_ATTR_CMN_VOL_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST | ATTR_CMN_ACCTIME))
7372
7373         ExtendedVCB *vcb = VFSTOVCB(mp);
7374         struct hfsmount *hfsmp = VFSTOHFS(mp);
7375         u_int32_t freeCNIDs;
7376
7377         freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)hfsmp->vcbNxtCNID;
7378
7379         VFSATTR_RETURN(fsap, f_objcount, (u_int64_t)hfsmp->vcbFilCnt + (u_int64_t)hfsmp->vcbDirCnt);
7380         VFSATTR_RETURN(fsap, f_filecount, (u_int64_t)hfsmp->vcbFilCnt);
7381         VFSATTR_RETURN(fsap, f_dircount, (u_int64_t)hfsmp->vcbDirCnt);
7382         VFSATTR_RETURN(fsap, f_maxobjcount, (u_int64_t)0xFFFFFFFF);
7383         VFSATTR_RETURN(fsap, f_iosize, (size_t)cluster_max_io_size(mp, 0));
7384         VFSATTR_RETURN(fsap, f_blocks, (u_int64_t)hfsmp->totalBlocks);
7385         VFSATTR_RETURN(fsap, f_bfree, (u_int64_t)hfs_freeblks(hfsmp, 0));
7386         VFSATTR_RETURN(fsap, f_bavail, (u_int64_t)hfs_freeblks(hfsmp, 1));
7387         VFSATTR_RETURN(fsap, f_bsize, (u_int32_t)vcb->blockSize);
7388         /* XXX needs clarification */
7389         VFSATTR_RETURN(fsap, f_bused, hfsmp->totalBlocks - hfs_freeblks(hfsmp, 1));
7390         /* Maximum files is constrained by total blocks. */
7391         VFSATTR_RETURN(fsap, f_files, (u_int64_t)(hfsmp->totalBlocks - 2));
7392         VFSATTR_RETURN(fsap, f_ffree, MIN((u_int64_t)freeCNIDs, (u_int64_t)hfs_freeblks(hfsmp, 1)));
7393
7394         fsap->f_fsid.val[0] = hfsmp->hfs_raw_dev;
7395         fsap->f_fsid.val[1] = vfs_typenum(mp);
7396         VFSATTR_SET_SUPPORTED(fsap, f_fsid);
7397
7398         VFSATTR_RETURN(fsap, f_signature, vcb->vcbSigWord);
7399         VFSATTR_RETURN(fsap, f_carbon_fsid, 0);
7400
7401         if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) {
7402                 vol_capabilities_attr_t *cap;
7403
7404                 cap = &fsap->f_capabilities;
7405
7406                 if (hfsmp->hfs_flags & HFS_STANDARD) {
7407                         cap->capabilities[VOL_CAPABILITIES_FORMAT] =
7408                                 VOL_CAP_FMT_PERSISTENTOBJECTIDS |
7409                                 VOL_CAP_FMT_CASE_PRESERVING |
7410                                 VOL_CAP_FMT_FAST_STATFS |
7411                                 VOL_CAP_FMT_HIDDEN_FILES |
7412                                 VOL_CAP_FMT_PATH_FROM_ID;
7413                 } else {
7414                         cap->capabilities[VOL_CAPABILITIES_FORMAT] =
7415                                 VOL_CAP_FMT_PERSISTENTOBJECTIDS |
7416                                 VOL_CAP_FMT_SYMBOLICLINKS |
7417                                 VOL_CAP_FMT_HARDLINKS |
7418                                 VOL_CAP_FMT_JOURNAL |
7419                                 VOL_CAP_FMT_ZERO_RUNS |
7420                                 (hfsmp->jnl ? VOL_CAP_FMT_JOURNAL_ACTIVE : 0) |
7421                                 (hfsmp->hfs_flags & HFS_CASE_SENSITIVE ? VOL_CAP_FMT_CASE_SENSITIVE : 0) |
7422                                 VOL_CAP_FMT_CASE_PRESERVING |
7423                                 VOL_CAP_FMT_FAST_STATFS |
7424                                 VOL_CAP_FMT_2TB_FILESIZE |
7425                                 VOL_CAP_FMT_HIDDEN_FILES |
7426 #if HFS_COMPRESSION
7427                                 VOL_CAP_FMT_PATH_FROM_ID |
7428                                 VOL_CAP_FMT_DECMPFS_COMPRESSION;
7429 #else
7430                                 VOL_CAP_FMT_PATH_FROM_ID;
7431 #endif
7432                 }
7433                 cap->capabilities[VOL_CAPABILITIES_INTERFACES] =
7434                         VOL_CAP_INT_SEARCHFS |
7435                         VOL_CAP_INT_ATTRLIST |
7436                         VOL_CAP_INT_NFSEXPORT |
7437                         VOL_CAP_INT_READDIRATTR |
7438                         VOL_CAP_INT_EXCHANGEDATA |
7439                         VOL_CAP_INT_ALLOCATE |
7440                         VOL_CAP_INT_VOL_RENAME |
7441                         VOL_CAP_INT_ADVLOCK |
7442                         VOL_CAP_INT_FLOCK |
7443 #if NAMEDSTREAMS
7444                         VOL_CAP_INT_EXTENDED_ATTR |
7445                         VOL_CAP_INT_NAMEDSTREAMS;
7446 #else
7447                         VOL_CAP_INT_EXTENDED_ATTR;
7448 #endif
7449                 cap->capabilities[VOL_CAPABILITIES_RESERVED1] = 0;
7450                 cap->capabilities[VOL_CAPABILITIES_RESERVED2] = 0;
7451
7452                 cap->valid[VOL_CAPABILITIES_FORMAT] =
7453                         VOL_CAP_FMT_PERSISTENTOBJECTIDS |
7454                         VOL_CAP_FMT_SYMBOLICLINKS |
7455                         VOL_CAP_FMT_HARDLINKS |
7456                         VOL_CAP_FMT_JOURNAL |
7457                         VOL_CAP_FMT_JOURNAL_ACTIVE |
7458                         VOL_CAP_FMT_NO_ROOT_TIMES |
7459                         VOL_CAP_FMT_SPARSE_FILES |
7460                         VOL_CAP_FMT_ZERO_RUNS |
7461                         VOL_CAP_FMT_CASE_SENSITIVE |
7462                         VOL_CAP_FMT_CASE_PRESERVING |
7463                         VOL_CAP_FMT_FAST_STATFS |
7464                         VOL_CAP_FMT_2TB_FILESIZE |
7465                         VOL_CAP_FMT_OPENDENYMODES |
7466                         VOL_CAP_FMT_HIDDEN_FILES |
7467 #if HFS_COMPRESSION
7468                         VOL_CAP_FMT_PATH_FROM_ID |
7469                         VOL_CAP_FMT_DECMPFS_COMPRESSION;
7470 #else
7471                         VOL_CAP_FMT_PATH_FROM_ID;
7472 #endif
7473                 cap->valid[VOL_CAPABILITIES_INTERFACES] =
7474                         VOL_CAP_INT_SEARCHFS |
7475                         VOL_CAP_INT_ATTRLIST |
7476                         VOL_CAP_INT_NFSEXPORT |
7477                         VOL_CAP_INT_READDIRATTR |
7478                         VOL_CAP_INT_EXCHANGEDATA |
7479                         VOL_CAP_INT_COPYFILE |
7480                         VOL_CAP_INT_ALLOCATE |
7481                         VOL_CAP_INT_VOL_RENAME |
7482                         VOL_CAP_INT_ADVLOCK |
7483                         VOL_CAP_INT_FLOCK |
7484                         VOL_CAP_INT_MANLOCK |
7485 #if NAMEDSTREAMS
7486                         VOL_CAP_INT_EXTENDED_ATTR |
7487                         VOL_CAP_INT_NAMEDSTREAMS;
7488 #else
7489                         VOL_CAP_INT_EXTENDED_ATTR;
7490 #endif
7491                 cap->valid[VOL_CAPABILITIES_RESERVED1] = 0;
7492                 cap->valid[VOL_CAPABILITIES_RESERVED2] = 0;
7493                 VFSATTR_SET_SUPPORTED(fsap, f_capabilities);
7494         }
7495         if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) {
7496                 vol_attributes_attr_t *attrp = &fsap->f_attributes;
7497
7498                 attrp->validattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK;
7499                 attrp->validattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO;
7500                 attrp->validattr.dirattr = ATTR_DIR_VALIDMASK;
7501                 attrp->validattr.fileattr = HFS_ATTR_FILE_VALIDMASK;
7502                 attrp->validattr.forkattr = 0;
7503
7504                 attrp->nativeattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK;
7505                 attrp->nativeattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO;
7506                 attrp->nativeattr.dirattr = ATTR_DIR_VALIDMASK;
7507                 attrp->nativeattr.fileattr = HFS_ATTR_FILE_VALIDMASK;
7508                 attrp->nativeattr.forkattr = 0;
7509                 VFSATTR_SET_SUPPORTED(fsap, f_attributes);
7510         }
7511         fsap->f_create_time.tv_sec = hfsmp->hfs_itime;
7512         fsap->f_create_time.tv_nsec = 0;
7513         VFSATTR_SET_SUPPORTED(fsap, f_create_time);
7514         fsap->f_modify_time.tv_sec = hfsmp->vcbLsMod;
7515         fsap->f_modify_time.tv_nsec = 0;
7516         VFSATTR_SET_SUPPORTED(fsap, f_modify_time);
7517
7518         fsap->f_backup_time.tv_sec = hfsmp->vcbVolBkUp;
7519         fsap->f_backup_time.tv_nsec = 0;
7520         VFSATTR_SET_SUPPORTED(fsap, f_backup_time);
7521         if (VFSATTR_IS_ACTIVE(fsap, f_fssubtype)) {
7522                 u_int16_t subtype = 0;
7523
7524                 /*
7525                  * Subtypes (flavors) for HFS
7526                  *   0:   Mac OS Extended
7527                  *   1:   Mac OS Extended (Journaled)
7528                  *   2:   Mac OS Extended (Case Sensitive)
7529                  *   3:   Mac OS Extended (Case Sensitive, Journaled)
7530                  *   4 - 127:   Reserved
7531                  * 128:   Mac OS Standard
7532                  *
7533                  */
7534                 if (hfsmp->hfs_flags & HFS_STANDARD) {
7535                         subtype = HFS_SUBTYPE_STANDARDHFS;
7536                 } else /* HFS Plus */ {
7537                         if (hfsmp->jnl)
7538                                 subtype |= HFS_SUBTYPE_JOURNALED;
7539                         if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE)
7540                                 subtype |= HFS_SUBTYPE_CASESENSITIVE;
7541                 }
7542                 fsap->f_fssubtype = subtype;
7543                 VFSATTR_SET_SUPPORTED(fsap, f_fssubtype);
7544         }
7545
7546         if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
7547                 strlcpy(fsap->f_vol_name, (char *) hfsmp->vcbVN, MAXPATHLEN);
7548                 VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
7549         }
7550         if (VFSATTR_IS_ACTIVE(fsap, f_uuid)) {
7551                 hfs_getvoluuid(hfsmp, fsap->f_uuid);
7552                 VFSATTR_SET_SUPPORTED(fsap, f_uuid);
7553         }
7554         return (0);
7555 }
7556
7557 /*
7558  * Perform a volume rename.  Requires the FS' root vp.
7559  */
7560 static int
7561 hfs_rename_volume(struct vnode *vp, const char *name, proc_t p)
7562 {
7563         ExtendedVCB *vcb = VTOVCB(vp);
7564         struct cnode *cp = VTOC(vp);
7565         struct hfsmount *hfsmp = VTOHFS(vp);
7566         struct cat_desc to_desc;
7567         struct cat_desc todir_desc;
7568         struct cat_desc new_desc;
7569         cat_cookie_t cookie;
7570         int lockflags;
7571         int error = 0;
7572         char converted_volname[256];
7573         size_t volname_length = 0;
7574         size_t conv_volname_length = 0;
7575
7576
7577         /*
7578          * Ignore attempts to rename a volume to a zero-length name.
7579          */
7580         if (name[0] == 0)
7581                 return(0);
7582
7583         bzero(&to_desc, sizeof(to_desc));
7584         bzero(&todir_desc, sizeof(todir_desc));
7585         bzero(&new_desc, sizeof(new_desc));
7586         bzero(&cookie, sizeof(cookie));
7587
7588         todir_desc.cd_parentcnid = kHFSRootParentID;
7589         todir_desc.cd_cnid = kHFSRootFolderID;
7590         todir_desc.cd_flags = CD_ISDIR;
7591
7592         to_desc.cd_nameptr = (const u_int8_t *)name;
7593         to_desc.cd_namelen = strlen(name);
7594         to_desc.cd_parentcnid = kHFSRootParentID;
7595         to_desc.cd_cnid = cp->c_cnid;
7596         to_desc.cd_flags = CD_ISDIR;
7597
7598         if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK)) == 0) {
7599                 if ((error = hfs_start_transaction(hfsmp)) == 0) {
7600                         if ((error = cat_preflight(hfsmp, CAT_RENAME, &cookie, p)) == 0) {
7601                                 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
7602
7603                                 error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, &new_desc);
7604
7605                                 /*
7606                                  * If successful, update the name in the VCB, ensure it's terminated.
7607                                  */
7608                                 if (!error) {
7609                                         strlcpy((char *)vcb->vcbVN, name, sizeof(vcb->vcbVN));
7610                                         volname_length = strlen ((const char*)vcb->vcbVN);
7611 #define DKIOCCSSETLVNAME _IOW('d', 198, char[256])
7612                                         /* Send the volume name down to CoreStorage if necessary */
7613                                         error = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED);
7614                                         if (error == 0) {
7615                                                 (void) VNOP_IOCTL (hfsmp->hfs_devvp, DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current());
7616                                         }
7617                                         error = 0;
7618                                 }
7619
7620                                 hfs_systemfile_unlock(hfsmp, lockflags);
7621                                 cat_postflight(hfsmp, &cookie, p);
7622
7623                                 if (error)
7624                                         MarkVCBDirty(vcb);
7625                                 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
7626                         }
7627                         hfs_end_transaction(hfsmp);
7628                 }
7629                 if (!error) {
7630                         /* Release old allocated name buffer */
7631                         if (cp->c_desc.cd_flags & CD_HASBUF) {
7632                                 const char *tmp_name = (const char *)cp->c_desc.cd_nameptr;
7633
7634                                 cp->c_desc.cd_nameptr = 0;
7635                                 cp->c_desc.cd_namelen = 0;
7636                                 cp->c_desc.cd_flags &= ~CD_HASBUF;
7637                                 vfs_removename(tmp_name);
7638                         }
7639                         /* Update cnode's catalog descriptor */
7640                         replace_desc(cp, &new_desc);
7641                         vcb->volumeNameEncodingHint = new_desc.cd_encoding;
7642                         cp->c_touch_chgtime = TRUE;
7643                 }
7644
7645                 hfs_unlock(cp);
7646         }
7647
7648         return(error);
7649 }
7650
7651 /*
7652  * Get file system attributes.
7653  */
7654 static int
7655 hfs_vfs_setattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context)
7656 {
7657         kauth_cred_t cred = vfs_context_ucred(context);
7658         int error = 0;
7659
7660         /*
7661          * Must be superuser or owner of filesystem to change volume attributes
7662          */
7663         if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(mp)->f_owner))
7664                 return(EACCES);
7665
7666         if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
7667                 vnode_t root_vp;
7668
7669                 error = hfs_vfs_root(mp, &root_vp, context);
7670                 if (error)
7671                         goto out;
7672
7673                 error = hfs_rename_volume(root_vp, fsap->f_vol_name, vfs_context_proc(context));
7674                 (void) vnode_put(root_vp);
7675                 if (error)
7676                         goto out;
7677
7678                 VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
7679         }
7680
7681 out:
7682         return error;
7683 }
7684
7685 /* If a runtime corruption is detected, set the volume inconsistent
7686  * bit in the volume attributes.  The volume inconsistent bit is a persistent
7687  * bit which represents that the volume is corrupt and needs repair.
7688  * The volume inconsistent bit can be set from the kernel when it detects
7689  * runtime corruption or from file system repair utilities like fsck_hfs when
7690  * a repair operation fails.  The bit should be cleared only from file system
7691  * verify/repair utility like fsck_hfs when a verify/repair succeeds.
7692  */
7693 void hfs_mark_volume_inconsistent(struct hfsmount *hfsmp)
7694 {
7695         HFS_MOUNT_LOCK(hfsmp, TRUE);
7696         if ((hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) == 0) {
7697                 hfsmp->vcbAtrb |= kHFSVolumeInconsistentMask;
7698                 MarkVCBDirty(hfsmp);
7699         }
7700         if ((hfsmp->hfs_flags & HFS_READ_ONLY)==0) {
7701                 /* Log information to ASL log */
7702                 fslog_fs_corrupt(hfsmp->hfs_mp);
7703                 printf("hfs: Runtime corruption detected on %s, fsck will be forced on next mount.\n", hfsmp->vcbVN);
7704         }
7705         HFS_MOUNT_UNLOCK(hfsmp, TRUE);
7706 }
7707
7708 /* Replay the journal on the device node provided.  Returns zero if
7709  * journal replay succeeded or no journal was supposed to be replayed.
7710  */
7711 static int hfs_journal_replay(vnode_t devvp, vfs_context_t context)
7712 {
7713         int retval = 0;
7714         int error = 0;
7715         struct mount *mp = NULL;
7716         struct hfs_mount_args *args = NULL;
7717
7718         /* Replay allowed only on raw devices */
7719         if (!vnode_ischr(devvp) && !vnode_isblk(devvp)) {
7720                 retval = EINVAL;
7721                 goto out;
7722         }
7723
7724         /* Create dummy mount structures */
7725         MALLOC(mp, struct mount *, sizeof(struct mount), M_TEMP, M_WAITOK);
7726         if (mp == NULL) {
7727                 retval = ENOMEM;
7728                 goto out;
7729         }
7730         bzero(mp, sizeof(struct mount));
7731         mount_lock_init(mp);
7732
7733         MALLOC(args, struct hfs_mount_args *, sizeof(struct hfs_mount_args), M_TEMP, M_WAITOK);
7734         if (args == NULL) {
7735                 retval = ENOMEM;
7736                 goto out;
7737         }
7738         bzero(args, sizeof(struct hfs_mount_args));
7739
7740         retval = hfs_mountfs(devvp, mp, args, 1, context);
7741         buf_flushdirtyblks(devvp, TRUE, 0, "hfs_journal_replay");
7742
7743         /* FSYNC the devnode to be sure all data has been flushed */
7744         error = VNOP_FSYNC(devvp, MNT_WAIT, context);
7745         if (error) {
7746                 retval = error;
7747         }
7748
7749 out:
7750         if (mp) {
7751                 mount_lock_destroy(mp);
7752                 FREE(mp, M_TEMP);
7753         }
7754         if (args) {
7755                 FREE(args, M_TEMP);
7756         }
7757         return retval;
7758 }
7759
7760 /*
7761  * hfs vfs operations.
7762  */
7763 struct vfsops hfs_vfsops = {
7764         hfs_mount,
7765         hfs_start,
7766         hfs_unmount,
7767         hfs_vfs_root,
7768         hfs_quotactl,
7769         hfs_vfs_getattr,        /* was hfs_statfs */
7770         hfs_sync,
7771         hfs_vfs_vget,
7772         hfs_fhtovp,
7773         hfs_vptofh,
7774         hfs_init,
7775         hfs_sysctl,
7776         hfs_vfs_setattr,
7777         {NULL}
7778 };