]> git.saurik.com Git - apple/xnu.git/blob - bsd/hfs/hfs_vfsops.c
4e5b76b1457d9b84ebf44a1c14dc11869f6af7ce
[apple/xnu.git] / bsd / hfs / hfs_vfsops.c
1 /*
2 * Copyright (c) 1999-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1991, 1993, 1994
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * hfs_vfsops.c
66 * derived from @(#)ufs_vfsops.c 8.8 (Berkeley) 5/20/95
67 *
68 * (c) Copyright 1997-2002 Apple Computer, Inc. All rights reserved.
69 *
70 * hfs_vfsops.c -- VFS layer for loadable HFS file system.
71 *
72 */
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/kauth.h>
76
77 #include <sys/ubc.h>
78 #include <sys/ubc_internal.h>
79 #include <sys/vnode_internal.h>
80 #include <sys/mount_internal.h>
81 #include <sys/sysctl.h>
82 #include <sys/malloc.h>
83 #include <sys/stat.h>
84 #include <sys/quota.h>
85 #include <sys/disk.h>
86 #include <sys/paths.h>
87 #include <sys/utfconv.h>
88 #include <sys/kdebug.h>
89 #include <sys/fslog.h>
90 #include <sys/ubc.h>
91
92 #include <kern/locks.h>
93
94 #include <vfs/vfs_journal.h>
95
96 #include <miscfs/specfs/specdev.h>
97 #include <hfs/hfs_mount.h>
98
99 #include <libkern/crypto/md5.h>
100 #include <uuid/uuid.h>
101
102 #include "hfs.h"
103 #include "hfs_catalog.h"
104 #include "hfs_cnode.h"
105 #include "hfs_dbg.h"
106 #include "hfs_endian.h"
107 #include "hfs_hotfiles.h"
108 #include "hfs_quota.h"
109 #include "hfs_btreeio.h"
110
111 #include "hfscommon/headers/FileMgrInternal.h"
112 #include "hfscommon/headers/BTreesInternal.h"
113
114 #if CONFIG_PROTECT
115 #include <sys/cprotect.h>
116 #endif
117
118 #if CONFIG_HFS_ALLOC_RBTREE
119 #include "hfscommon/headers/HybridAllocator.h"
120 #endif
121
122 #define HFS_MOUNT_DEBUG 1
123
124 #if HFS_DIAGNOSTIC
125 int hfs_dbg_all = 0;
126 int hfs_dbg_err = 0;
127 #endif
128
129 /* Enable/disable debugging code for live volume resizing */
130 int hfs_resize_debug = 0;
131
132 lck_grp_attr_t * hfs_group_attr;
133 lck_attr_t * hfs_lock_attr;
134 lck_grp_t * hfs_mutex_group;
135 lck_grp_t * hfs_rwlock_group;
136 lck_grp_t * hfs_spinlock_group;
137
138 extern struct vnodeopv_desc hfs_vnodeop_opv_desc;
139 extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc;
140
141 /* not static so we can re-use in hfs_readwrite.c for build_path calls */
142 int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
143
144 static int hfs_changefs(struct mount *mp, struct hfs_mount_args *args);
145 static int hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, vfs_context_t context);
146 static int hfs_flushfiles(struct mount *, int, struct proc *);
147 static int hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush);
148 static int hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp);
149 static int hfs_init(struct vfsconf *vfsp);
150 static int hfs_vfs_root(struct mount *mp, struct vnode **vpp, vfs_context_t context);
151 static int hfs_quotactl(struct mount *, int, uid_t, caddr_t, vfs_context_t context);
152 static int hfs_start(struct mount *mp, int flags, vfs_context_t context);
153 static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context);
154 static int hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec);
155 static int hfs_journal_replay(vnode_t devvp, vfs_context_t context);
156 static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context);
157
158 void hfs_initialize_allocator (struct hfsmount *hfsmp);
159 int hfs_teardown_allocator (struct hfsmount *hfsmp);
160
161 int hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context);
162 int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context);
163 int hfs_reload(struct mount *mp);
164 int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context);
165 int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context);
166 int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
167 user_addr_t newp, size_t newlen, vfs_context_t context);
168 int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context);
169
170 /*
171 * Called by vfs_mountroot when mounting HFS Plus as root.
172 */
173
174 int
175 hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context)
176 {
177 struct hfsmount *hfsmp;
178 ExtendedVCB *vcb;
179 struct vfsstatfs *vfsp;
180 int error;
181
182 if ((error = hfs_mountfs(rvp, mp, NULL, 0, context))) {
183 if (HFS_MOUNT_DEBUG) {
184 printf("hfs_mountroot: hfs_mountfs returned %d, rvp (%p) name (%s) \n",
185 error, rvp, (rvp->v_name ? rvp->v_name : "unknown device"));
186 }
187 return (error);
188 }
189
190 /* Init hfsmp */
191 hfsmp = VFSTOHFS(mp);
192
193 hfsmp->hfs_uid = UNKNOWNUID;
194 hfsmp->hfs_gid = UNKNOWNGID;
195 hfsmp->hfs_dir_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */
196 hfsmp->hfs_file_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */
197
198 /* Establish the free block reserve. */
199 vcb = HFSTOVCB(hfsmp);
200 vcb->reserveBlocks = ((u_int64_t)vcb->totalBlocks * HFS_MINFREE) / 100;
201 vcb->reserveBlocks = MIN(vcb->reserveBlocks, HFS_MAXRESERVE / vcb->blockSize);
202
203 vfsp = vfs_statfs(mp);
204 (void)hfs_statfs(mp, vfsp, NULL);
205
206 return (0);
207 }
208
209
/*
 * VFS Operations.
 *
 * mount system call
 *
 * 'data' is a user-space pointer to a struct hfs_mount_args; it is
 * copied in first.  Three cases are handled, selected by the mount
 * command flags:
 *   - MNT_UPDATE | MNT_RELOAD: re-read in-core data after an fsck
 *     (only permitted on a read-only mount).
 *   - MNT_UPDATE: downgrade to read-only and/or upgrade to read-write,
 *     then apply changed mount arguments via hfs_changefs().
 *   - otherwise: a fresh mount via hfs_mountfs().
 *
 * Returns 0 on success or an errno value.
 */

int
hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context)
{
	struct proc *p = vfs_context_proc(context);
	struct hfsmount *hfsmp = NULL;
	struct hfs_mount_args args;
	int retval = E_NONE;
	u_int32_t cmdflags;

	if ((retval = copyin(data, (caddr_t)&args, sizeof(args)))) {
		if (HFS_MOUNT_DEBUG) {
			printf("hfs_mount: copyin returned %d for fs\n", retval);
		}
		return (retval);
	}
	cmdflags = (u_int32_t)vfs_flags(mp) & MNT_CMDFLAGS;
	if (cmdflags & MNT_UPDATE) {
		hfsmp = VFSTOHFS(mp);

		/* Reload incore data after an fsck. */
		if (cmdflags & MNT_RELOAD) {
			if (vfs_isrdonly(mp)) {
				int error = hfs_reload(mp);
				if (error && HFS_MOUNT_DEBUG) {
					printf("hfs_mount: hfs_reload returned %d on %s \n", error, hfsmp->vcbVN);
				}
				return error;
			}
			else {
				/* Reloading is only safe when nothing can be dirtying the fs. */
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: MNT_RELOAD not supported on rdwr filesystem %s\n", hfsmp->vcbVN);
				}
				return (EINVAL);
			}
		}

		/* Change to a read-only file system. */
		if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) &&
		    vfs_isrdonly(mp)) {
			int flags;

			/* Set flag to indicate that a downgrade to read-only
			 * is in progress and therefore block any further
			 * modifications to the file system.
			 */
			hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
			hfsmp->hfs_flags |= HFS_RDONLY_DOWNGRADE;
			hfsmp->hfs_downgrading_proc = current_thread();
			hfs_unlock_global (hfsmp);

			/* use VFS_SYNC to push out System (btree) files */
			retval = VFS_SYNC(mp, MNT_WAIT, context);
			if (retval && ((cmdflags & MNT_FORCE) == 0)) {
				/* Sync failed and not forced: abandon the downgrade. */
				hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
				hfsmp->hfs_downgrading_proc = NULL;
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: VFS_SYNC returned %d during b-tree sync of %s \n", retval, hfsmp->vcbVN);
				}
				goto out;
			}

			flags = WRITECLOSE;
			if (cmdflags & MNT_FORCE)
				flags |= FORCECLOSE;

			if ((retval = hfs_flushfiles(mp, flags, p))) {
				/* Could not close the open-for-write files: abandon the downgrade. */
				hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
				hfsmp->hfs_downgrading_proc = NULL;
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: hfs_flushfiles returned %d on %s \n", retval, hfsmp->vcbVN);
				}
				goto out;
			}

			/* mark the volume cleanly unmounted */
			hfsmp->vcbAtrb |= kHFSVolumeUnmountedMask;
			retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
			hfsmp->hfs_flags |= HFS_READ_ONLY;

			/* also get the volume bitmap blocks */
			if (!retval) {
				if (vnode_mount(hfsmp->hfs_devvp) == mp) {
					retval = hfs_fsync(hfsmp->hfs_devvp, MNT_WAIT, 0, p);
				} else {
					/* Device vnode belongs to another mount; take an iocount around the fsync. */
					vnode_get(hfsmp->hfs_devvp);
					retval = VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context);
					vnode_put(hfsmp->hfs_devvp);
				}
			}
			if (retval) {
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: FSYNC on devvp returned %d for fs %s\n", retval, hfsmp->vcbVN);
				}
				/* Roll back the downgrade, including the READ_ONLY flag set above. */
				hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
				hfsmp->hfs_downgrading_proc = NULL;
				hfsmp->hfs_flags &= ~HFS_READ_ONLY;
				goto out;
			}
			if (hfsmp->jnl) {
				hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);

				journal_close(hfsmp->jnl);
				hfsmp->jnl = NULL;

				// Note: we explicitly don't want to shutdown
				//       access to the jvp because we may need
				//       it later if we go back to being read-write.

				hfs_unlock_global (hfsmp);
			}

#if CONFIG_HFS_ALLOC_RBTREE
			/* The allocator red-black trees are not needed on a read-only mount. */
			(void) hfs_teardown_allocator(hfsmp);
#endif
			hfsmp->hfs_downgrading_proc = NULL;
		}

		/* Change to a writable file system. */
		if (vfs_iswriteupgrade(mp)) {
#if CONFIG_HFS_ALLOC_RBTREE
			thread_t allocator_thread;
#endif

			/*
			 * On inconsistent disks, do not allow read-write mount
			 * unless it is the boot volume being mounted.
			 */
			if (!(vfs_flags(mp) & MNT_ROOTFS) &&
					(hfsmp->vcbAtrb & kHFSVolumeInconsistentMask)) {
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: attempting to mount inconsistent non-root volume %s\n",  (hfsmp->vcbVN));
				}
				retval = EINVAL;
				goto out;
			}

			// If the journal was shut-down previously because we were
			// asked to be read-only, let's start it back up again now

			if (   (HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask)
			    && hfsmp->jnl == NULL
			    && hfsmp->jvp != NULL) {
				int jflags;

				if (hfsmp->hfs_flags & HFS_NEED_JNL_RESET) {
					jflags = JOURNAL_RESET;
				} else {
					jflags = 0;
				}

				hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);

				hfsmp->jnl = journal_open(hfsmp->jvp,
						(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset,
						hfsmp->jnl_size,
						hfsmp->hfs_devvp,
						hfsmp->hfs_logical_block_size,
						jflags,
						0,
						hfs_sync_metadata, hfsmp->hfs_mp);

				/*
				 * Set up the trim callback function so that we can add
				 * recently freed extents to the free extent cache once
				 * the transaction that freed them is written to the
				 * journal on disk.
				 */
				if (hfsmp->jnl)
					journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp);

				hfs_unlock_global (hfsmp);

				if (hfsmp->jnl == NULL) {
					if (HFS_MOUNT_DEBUG) {
						printf("hfs_mount: journal_open == NULL; couldn't be opened on %s \n", (hfsmp->vcbVN));
					}
					retval = EINVAL;
					goto out;
				} else {
					hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET;
				}

			}

			/* See if we need to erase unused Catalog nodes due to <rdar://problem/6947811>. */
			retval = hfs_erase_unused_nodes(hfsmp);
			if (retval != E_NONE) {
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: hfs_erase_unused_nodes returned %d for fs %s\n", retval, hfsmp->vcbVN);
				}
				goto out;
			}

			/* If this mount point was downgraded from read-write
			 * to read-only, clear that information as we are now
			 * moving back to read-write.
			 */
			hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
			hfsmp->hfs_downgrading_proc = NULL;

			/* mark the volume dirty (clear clean unmount bit) */
			hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask;

			retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
			if (retval != E_NONE) {
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: hfs_flushvolumeheader returned %d for fs %s\n", retval, hfsmp->vcbVN);
				}
				goto out;
			}

			/* Only clear HFS_READ_ONLY after a successful write */
			hfsmp->hfs_flags &= ~HFS_READ_ONLY;


			if (!(hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_STANDARD))) {
				/* Setup private/hidden directories for hardlinks. */
				hfs_privatedir_init(hfsmp, FILE_HARDLINKS);
				hfs_privatedir_init(hfsmp, DIR_HARDLINKS);

				hfs_remove_orphans(hfsmp);

				/*
				 * Allow hot file clustering if conditions allow.
				 */
				if ((hfsmp->hfs_flags & HFS_METADATA_ZONE)  &&
				    ((hfsmp->hfs_flags & HFS_SSD) == 0)) {
					(void) hfs_recording_init(hfsmp);
				}
				/* Force ACLs on HFS+ file systems. */
				if (vfs_extendedsecurity(HFSTOVFS(hfsmp)) == 0) {
					vfs_setextendedsecurity(HFSTOVFS(hfsmp));
				}
			}

#if CONFIG_HFS_ALLOC_RBTREE
			/*
			 * Like the normal mount case, we need to handle creation of the allocation red-black tree
			 * if we're upgrading from read-only to read-write.
			 *
			 * We spawn a thread to create the pair of red-black trees for this volume.
			 * However, in so doing, we must be careful to ensure that if this thread is still
			 * running after mount has finished, it doesn't interfere with an unmount. Specifically,
			 * we'll need to set a bit that indicates we're in progress building the trees here.
			 * Unmount will check for this bit, and then if it's set, mark a corresponding bit that
			 * notifies the tree generation code that an unmount is waiting.  Also, mark the extent
			 * tree flags that the allocator is enabled for use before we spawn the thread that will start
			 * scanning the RB tree.
			 *
			 * Only do this if we're operating on a read-write mount (we wouldn't care for read-only),
			 * which has not previously encountered a bad error on the red-black tree code.  Also, don't
			 * try to re-build a tree that already exists.
			 */

			if (hfsmp->extent_tree_flags == 0) {
				hfsmp->extent_tree_flags |= (HFS_ALLOC_TREEBUILD_INFLIGHT | HFS_ALLOC_RB_ENABLED);
				/* Initialize EOF counter so that the thread can assume it started at initial values */
				hfsmp->offset_block_end = 0;

				InitTree(hfsmp);

				kernel_thread_start ((thread_continue_t) hfs_initialize_allocator , hfsmp, &allocator_thread);
				thread_deallocate(allocator_thread);
			}

#endif
		}

		/* Update file system parameters. */
		retval = hfs_changefs(mp, &args);
		if (retval &&  HFS_MOUNT_DEBUG) {
			printf("hfs_mount: hfs_changefs returned %d for %s\n", retval, hfsmp->vcbVN);
		}

	} else /* not an update request */ {

		/* Set the mount flag to indicate that we support volfs  */
		vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_DOVOLFS));

		retval = hfs_mountfs(devvp, mp, &args, 0, context);
		if (retval && HFS_MOUNT_DEBUG) {
			printf("hfs_mount: hfs_mountfs returned %d\n", retval);
		}
#if CONFIG_PROTECT
		/*
		 * If above mount call was successful, and this mount is content protection
		 * enabled, then verify the on-disk EA on the root to ensure that the filesystem
		 * is of a suitable vintage to allow the mount to proceed.
		 */
		if ((retval == 0) && (cp_fs_protected (mp))) {
			int err = 0;
			struct cp_root_xattr xattr;
			bzero (&xattr, sizeof(struct cp_root_xattr));
			hfsmp = vfs_fsprivate(mp);

			/* go get the EA to get the version information */
			err = cp_getrootxattr (hfsmp, &xattr);
			/* If there was no EA there, then write one out. */
			if (err == ENOATTR) {
				bzero(&xattr, sizeof(struct cp_root_xattr));
				xattr.major_version = CP_CURRENT_MAJOR_VERS;
				xattr.minor_version = CP_CURRENT_MINOR_VERS;
				xattr.flags = 0;

				err = cp_setrootxattr (hfsmp, &xattr);
			}
			/*
			 * For any other error, including having an out of date CP version in the
			 * EA, or for an error out of cp_setrootxattr, deny the mount
			 * and do not proceed further.
			 */
			if (err || xattr.major_version != CP_CURRENT_MAJOR_VERS) {
				/* Deny the mount and tear down. */
				retval = EPERM;
				(void) hfs_unmount (mp, MNT_FORCE, context);
			}
		}
#endif
	}
out:
	if (retval == 0) {
		(void)hfs_statfs(mp, vfs_statfs(mp), context);
	}
	return (retval);
}
541
542
/*
 * Per-mount context handed to hfs_changefs_callback() by vnode_iterate()
 * from hfs_changefs().  The three flags record which mount parameters
 * changed and therefore what per-vnode fixups are needed.
 */
struct hfs_changefs_cargs {
	struct hfsmount *hfsmp;		/* mount whose vnodes are being visited */
	int		namefix;	/* non-zero: text encoding changed, refresh names */
	int		permfix;	/* non-zero: default uid/gid/mask changed */
	int		permswitch;	/* non-zero: MNT_UNKNOWNPERMISSIONS was toggled */
};
549
550 static int
551 hfs_changefs_callback(struct vnode *vp, void *cargs)
552 {
553 ExtendedVCB *vcb;
554 struct cnode *cp;
555 struct cat_desc cndesc;
556 struct cat_attr cnattr;
557 struct hfs_changefs_cargs *args;
558 int lockflags;
559 int error;
560
561 args = (struct hfs_changefs_cargs *)cargs;
562
563 cp = VTOC(vp);
564 vcb = HFSTOVCB(args->hfsmp);
565
566 lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
567 error = cat_lookup(args->hfsmp, &cp->c_desc, 0, &cndesc, &cnattr, NULL, NULL);
568 hfs_systemfile_unlock(args->hfsmp, lockflags);
569 if (error) {
570 /*
571 * If we couldn't find this guy skip to the next one
572 */
573 if (args->namefix)
574 cache_purge(vp);
575
576 return (VNODE_RETURNED);
577 }
578 /*
579 * Get the real uid/gid and perm mask from disk.
580 */
581 if (args->permswitch || args->permfix) {
582 cp->c_uid = cnattr.ca_uid;
583 cp->c_gid = cnattr.ca_gid;
584 cp->c_mode = cnattr.ca_mode;
585 }
586 /*
587 * If we're switching name converters then...
588 * Remove the existing entry from the namei cache.
589 * Update name to one based on new encoder.
590 */
591 if (args->namefix) {
592 cache_purge(vp);
593 replace_desc(cp, &cndesc);
594
595 if (cndesc.cd_cnid == kHFSRootFolderID) {
596 strlcpy((char *)vcb->vcbVN, (const char *)cp->c_desc.cd_nameptr, NAME_MAX+1);
597 cp->c_desc.cd_encoding = args->hfsmp->hfs_encoding;
598 }
599 } else {
600 cat_releasedesc(&cndesc);
601 }
602 return (VNODE_RETURNED);
603 }
604
/*
 * Change fs mount parameters (the MNT_UPDATE path of hfs_mount).
 *
 * Applies the updatable settings from 'args': timezone, default
 * uid/gid/mask, and (HFS standard volumes only) the text encoding.
 * When permissions defaults or the encoding change, every active vnode
 * is visited via hfs_changefs_callback() to refresh its in-core state.
 * HFS_IN_CHANGEFS is held in hfs_flags for the duration.
 * Returns 0 on success or an errno value.
 */
static int
hfs_changefs(struct mount *mp, struct hfs_mount_args *args)
{
	int retval = 0;
	int namefix, permfix, permswitch;
	struct hfsmount *hfsmp;
	ExtendedVCB *vcb;
	hfs_to_unicode_func_t	get_unicode_func;
	unicode_to_hfs_func_t	get_hfsname_func;
	u_int32_t old_encoding = 0;
	struct hfs_changefs_cargs cargs;
	u_int32_t mount_flags;

	hfsmp = VFSTOHFS(mp);
	vcb = HFSTOVCB(hfsmp);
	mount_flags = (unsigned int)vfs_flags(mp);

	hfsmp->hfs_flags |= HFS_IN_CHANGEFS;

	/* True when MNT_UNKNOWNPERMISSIONS disagrees with the current HFS_UNKNOWN_PERMS state. */
	permswitch = (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) &&
	               ((mount_flags & MNT_UNKNOWNPERMISSIONS) == 0)) ||
	              (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) == 0) &&
	               (mount_flags & MNT_UNKNOWNPERMISSIONS)));

	/* The root filesystem must operate with actual permissions: */
	if (permswitch && (mount_flags & MNT_ROOTFS) && (mount_flags & MNT_UNKNOWNPERMISSIONS)) {
		vfs_clearflags(mp, (u_int64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS));	/* Just say "No". */
		retval = EINVAL;
		goto exit;
	}
	if (mount_flags & MNT_UNKNOWNPERMISSIONS)
		hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS;
	else
		hfsmp->hfs_flags &= ~HFS_UNKNOWN_PERMS;

	namefix = permfix = 0;

	/*
	 * Tracking of hot files requires up-to-date access times.  So if
	 * access time updates are disabled, we must also disable hot files.
	 */
	if (mount_flags & MNT_NOATIME) {
		(void) hfs_recording_suspend(hfsmp);
	}

	/* Change the timezone (Note: this affects all hfs volumes and hfs+ volume create dates) */
	if (args->hfs_timezone.tz_minuteswest != VNOVAL) {
		gTimeZone = args->hfs_timezone;
	}

	/* Change the default uid, gid and/or mask */
	if ((args->hfs_uid != (uid_t)VNOVAL) && (hfsmp->hfs_uid != args->hfs_uid)) {
		hfsmp->hfs_uid = args->hfs_uid;
		if (vcb->vcbSigWord == kHFSPlusSigWord)
			++permfix;
	}
	if ((args->hfs_gid != (gid_t)VNOVAL) && (hfsmp->hfs_gid != args->hfs_gid)) {
		hfsmp->hfs_gid = args->hfs_gid;
		if (vcb->vcbSigWord == kHFSPlusSigWord)
			++permfix;
	}
	if (args->hfs_mask != (mode_t)VNOVAL) {
		if (hfsmp->hfs_dir_mask != (args->hfs_mask & ALLPERMS)) {
			hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS;
			hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS;
			/* NOXONFILES strips the execute bits from the file mask only. */
			if ((args->flags != VNOVAL) && (args->flags & HFSFSMNT_NOXONFILES))
				hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE);
			if (vcb->vcbSigWord == kHFSPlusSigWord)
				++permfix;
		}
	}

	/* Change the hfs encoding value (hfs only) */
	if ((vcb->vcbSigWord == kHFSSigWord)	&&
	    (args->hfs_encoding != (u_int32_t)VNOVAL)              &&
	    (hfsmp->hfs_encoding != args->hfs_encoding)) {

		retval = hfs_getconverter(args->hfs_encoding, &get_unicode_func, &get_hfsname_func);
		if (retval)
			goto exit;

		/*
		 * Connect the new hfs_get_unicode converter but leave
		 * the old hfs_get_hfsname converter in place so that
		 * we can lookup existing vnodes to get their correctly
		 * encoded names.
		 *
		 * When we're all finished, we can then connect the new
		 * hfs_get_hfsname converter and release our interest
		 * in the old converters.
		 */
		hfsmp->hfs_get_unicode = get_unicode_func;
		old_encoding = hfsmp->hfs_encoding;
		hfsmp->hfs_encoding = args->hfs_encoding;
		++namefix;
	}

	if (!(namefix || permfix || permswitch))
		goto exit;

	/* XXX 3762912 hack to support HFS filesystem 'owner' */
	if (permfix)
		vfs_setowner(mp,
		    hfsmp->hfs_uid == UNKNOWNUID ? KAUTH_UID_NONE : hfsmp->hfs_uid,
		    hfsmp->hfs_gid == UNKNOWNGID ? KAUTH_GID_NONE : hfsmp->hfs_gid);

	/*
	 * For each active vnode fix things that changed
	 *
	 * Note that we can visit a vnode more than once
	 * and we can race with fsync.
	 *
	 * hfs_changefs_callback will be called for each vnode
	 * hung off of this mount point
	 *
	 * The vnode will be properly referenced and unreferenced
	 * around the callback
	 */
	cargs.hfsmp = hfsmp;
	cargs.namefix = namefix;
	cargs.permfix = permfix;
	cargs.permswitch = permswitch;

	vnode_iterate(mp, 0, hfs_changefs_callback, (void *)&cargs);

	/*
	 * If we're switching name converters we can now
	 * connect the new hfs_get_hfsname converter and
	 * release our interest in the old converters.
	 */
	if (namefix) {
		hfsmp->hfs_get_hfsname = get_hfsname_func;
		vcb->volumeNameEncodingHint = args->hfs_encoding;
		(void) hfs_relconverter(old_encoding);
	}
exit:
	hfsmp->hfs_flags &= ~HFS_IN_CHANGEFS;
	return (retval);
}
745
746
/*
 * Context handed to hfs_reload_callback() by vnode_iterate()
 * from hfs_reload().
 */
struct hfs_reload_cargs {
	struct hfsmount *hfsmp;		/* mount being reloaded */
	int		error;		/* first cat_idlookup error, if any; stops iteration */
};
751
752 static int
753 hfs_reload_callback(struct vnode *vp, void *cargs)
754 {
755 struct cnode *cp;
756 struct hfs_reload_cargs *args;
757 int lockflags;
758
759 args = (struct hfs_reload_cargs *)cargs;
760 /*
761 * flush all the buffers associated with this node
762 */
763 (void) buf_invalidateblks(vp, 0, 0, 0);
764
765 cp = VTOC(vp);
766 /*
767 * Remove any directory hints
768 */
769 if (vnode_isdir(vp))
770 hfs_reldirhints(cp, 0);
771
772 /*
773 * Re-read cnode data for all active vnodes (non-metadata files).
774 */
775 if (!vnode_issystem(vp) && !VNODE_IS_RSRC(vp) && (cp->c_fileid >= kHFSFirstUserCatalogNodeID)) {
776 struct cat_fork *datafork;
777 struct cat_desc desc;
778
779 datafork = cp->c_datafork ? &cp->c_datafork->ff_data : NULL;
780
781 /* lookup by fileID since name could have changed */
782 lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
783 args->error = cat_idlookup(args->hfsmp, cp->c_fileid, 0, &desc, &cp->c_attr, datafork);
784 hfs_systemfile_unlock(args->hfsmp, lockflags);
785 if (args->error) {
786 return (VNODE_RETURNED_DONE);
787 }
788
789 /* update cnode's catalog descriptor */
790 (void) replace_desc(cp, &desc);
791 }
792 return (VNODE_RETURNED);
793 }
794
/*
 * Reload all incore data for a filesystem (used after running fsck on
 * the root filesystem and finding things to fix).  The filesystem must
 * be mounted read-only.
 *
 * Things to do to update the mount:
 *	invalidate all cached meta-data.
 *	invalidate all inactive vnodes.
 *	invalidate all cached file data.
 *	re-read volume header from disk.
 *	re-load meta-file info (extents, file size).
 *	re-load B-tree header data.
 *	re-read cnode data for all active vnodes.
 *
 * Only supported for HFS Plus volumes; returns EINVAL for HFS standard.
 * Returns 0 on success or an errno value.
 */
int
hfs_reload(struct mount *mountp)
{
	register struct vnode *devvp;
	struct buf *bp;
	int error, i;
	struct hfsmount *hfsmp;
	struct HFSPlusVolumeHeader *vhp;
	ExtendedVCB *vcb;
	struct filefork *forkp;
	struct cat_desc cndesc;
	struct hfs_reload_cargs args;
	daddr64_t priIDSector;

	hfsmp = VFSTOHFS(mountp);
	vcb = HFSTOVCB(hfsmp);

	if (vcb->vcbSigWord == kHFSSigWord)
		return (EINVAL);	/* rooting from HFS is not supported! */

	/*
	 * Invalidate all cached meta-data.
	 */
	devvp = hfsmp->hfs_devvp;
	if (buf_invalidateblks(devvp, 0, 0, 0))
		panic("hfs_reload: dirty1");

	args.hfsmp = hfsmp;
	args.error = 0;
	/*
	 * hfs_reload_callback will be called for each vnode
	 * hung off of this mount point that can't be recycled...
	 * vnode_iterate will recycle those that it can (the VNODE_RELOAD option)
	 * the vnode will be in an 'unbusy' state (VNODE_WAIT) and
	 * properly referenced and unreferenced around the callback
	 */
	vnode_iterate(mountp, VNODE_RELOAD | VNODE_WAIT, hfs_reload_callback, (void *)&args);

	if (args.error)
		return (args.error);

	/*
	 * Re-read VolumeHeader from disk.
	 */
	priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
			HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));

	error = (int)buf_meta_bread(hfsmp->hfs_devvp,
			HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
			hfsmp->hfs_physical_block_size, NOCRED, &bp);
	if (error) {
		if (bp != NULL)
			buf_brelse(bp);
		return (error);
	}

	vhp = (HFSPlusVolumeHeader *) (buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));

	/* Do a quick sanity check */
	if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord &&
	     SWAP_BE16(vhp->signature) != kHFSXSigWord) ||
	    (SWAP_BE16(vhp->version) != kHFSPlusVersion &&
	     SWAP_BE16(vhp->version) != kHFSXVersion) ||
	    SWAP_BE32(vhp->blockSize) != vcb->blockSize) {
		buf_brelse(bp);
		return (EIO);
	}

	/* On-disk volume header fields are big-endian; swap into the in-core VCB. */
	vcb->vcbLsMod		= to_bsd_time(SWAP_BE32(vhp->modifyDate));
	vcb->vcbAtrb		= SWAP_BE32 (vhp->attributes);
	vcb->vcbJinfoBlock  = SWAP_BE32(vhp->journalInfoBlock);
	vcb->vcbClpSiz		= SWAP_BE32 (vhp->rsrcClumpSize);
	vcb->vcbNxtCNID		= SWAP_BE32 (vhp->nextCatalogID);
	vcb->vcbVolBkUp		= to_bsd_time(SWAP_BE32(vhp->backupDate));
	vcb->vcbWrCnt		= SWAP_BE32 (vhp->writeCount);
	vcb->vcbFilCnt		= SWAP_BE32 (vhp->fileCount);
	vcb->vcbDirCnt		= SWAP_BE32 (vhp->folderCount);
	HFS_UPDATE_NEXT_ALLOCATION(vcb, SWAP_BE32 (vhp->nextAllocation));
	vcb->totalBlocks	= SWAP_BE32 (vhp->totalBlocks);
	vcb->freeBlocks		= SWAP_BE32 (vhp->freeBlocks);
	vcb->encodingsBitmap	= SWAP_BE64 (vhp->encodingsBitmap);
	bcopy(vhp->finderInfo, vcb->vcbFndrInfo, sizeof(vhp->finderInfo));
	vcb->localCreateDate	= SWAP_BE32 (vhp->createDate); /* hfs+ create date is in local time */

	/*
	 * Re-load meta-file vnode data (extent info, file size, etc).
	 */
	forkp = VTOF((struct vnode *)vcb->extentsRefNum);
	for (i = 0; i < kHFSPlusExtentDensity; i++) {
		forkp->ff_extents[i].startBlock =
			SWAP_BE32 (vhp->extentsFile.extents[i].startBlock);
		forkp->ff_extents[i].blockCount =
			SWAP_BE32 (vhp->extentsFile.extents[i].blockCount);
	}
	forkp->ff_size      = SWAP_BE64 (vhp->extentsFile.logicalSize);
	forkp->ff_blocks    = SWAP_BE32 (vhp->extentsFile.totalBlocks);
	forkp->ff_clumpsize = SWAP_BE32 (vhp->extentsFile.clumpSize);


	forkp = VTOF((struct vnode *)vcb->catalogRefNum);
	for (i = 0; i < kHFSPlusExtentDensity; i++) {
		forkp->ff_extents[i].startBlock	=
			SWAP_BE32 (vhp->catalogFile.extents[i].startBlock);
		forkp->ff_extents[i].blockCount	=
			SWAP_BE32 (vhp->catalogFile.extents[i].blockCount);
	}
	forkp->ff_size      = SWAP_BE64 (vhp->catalogFile.logicalSize);
	forkp->ff_blocks    = SWAP_BE32 (vhp->catalogFile.totalBlocks);
	forkp->ff_clumpsize = SWAP_BE32 (vhp->catalogFile.clumpSize);

	/* The attributes B-tree is optional; only reload it if it is open. */
	if (hfsmp->hfs_attribute_vp) {
		forkp = VTOF(hfsmp->hfs_attribute_vp);
		for (i = 0; i < kHFSPlusExtentDensity; i++) {
			forkp->ff_extents[i].startBlock	=
				SWAP_BE32 (vhp->attributesFile.extents[i].startBlock);
			forkp->ff_extents[i].blockCount	=
				SWAP_BE32 (vhp->attributesFile.extents[i].blockCount);
		}
		forkp->ff_size      = SWAP_BE64 (vhp->attributesFile.logicalSize);
		forkp->ff_blocks    = SWAP_BE32 (vhp->attributesFile.totalBlocks);
		forkp->ff_clumpsize = SWAP_BE32 (vhp->attributesFile.clumpSize);
	}

	forkp = VTOF((struct vnode *)vcb->allocationsRefNum);
	for (i = 0; i < kHFSPlusExtentDensity; i++) {
		forkp->ff_extents[i].startBlock	=
			SWAP_BE32 (vhp->allocationFile.extents[i].startBlock);
		forkp->ff_extents[i].blockCount	=
			SWAP_BE32 (vhp->allocationFile.extents[i].blockCount);
	}
	forkp->ff_size      = SWAP_BE64 (vhp->allocationFile.logicalSize);
	forkp->ff_blocks    = SWAP_BE32 (vhp->allocationFile.totalBlocks);
	forkp->ff_clumpsize = SWAP_BE32 (vhp->allocationFile.clumpSize);

	buf_brelse(bp);
	vhp = NULL;

	/*
	 * Re-load B-tree header data
	 */
	forkp = VTOF((struct vnode *)vcb->extentsRefNum);
	if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
		return (error);

	forkp = VTOF((struct vnode *)vcb->catalogRefNum);
	if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
		return (error);

	if (hfsmp->hfs_attribute_vp) {
		forkp = VTOF(hfsmp->hfs_attribute_vp);
		if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
			return (error);
	}

	/* Reload the volume name */
	if ((error = cat_idlookup(hfsmp, kHFSRootFolderID, 0, &cndesc, NULL, NULL)))
		return (error);
	vcb->volumeNameEncodingHint = cndesc.cd_encoding;
	bcopy(cndesc.cd_nameptr, vcb->vcbVN, min(255, cndesc.cd_namelen));
	cat_releasedesc(&cndesc);

	/* Re-establish private/hidden directories. */
	hfs_privatedir_init(hfsmp, FILE_HARDLINKS);
	hfs_privatedir_init(hfsmp, DIR_HARDLINKS);

	/* In case any volume information changed to trigger a notification */
	hfs_generate_volume_notifications(hfsmp);

	return (0);
}
979
980
981
/*
 * hfs_syncer - thread_call callback that periodically flushes dirty
 * metadata for one mounted HFS volume.
 *
 * arg0 is the volume's struct hfsmount; 'unused' is the second
 * thread_call parameter and is ignored.  Three mutually exclusive
 * cases are handled:
 *   1. Pending write I/O exceeds hfs_max_pending_io: hold off new
 *      transactions, wait for ~2/3 of the backlog to drain, flush,
 *      and re-tune hfs_max_pending_io from the measured drain rate.
 *   2. Enough time has elapsed since the last sync (or a sync was
 *      requested and the volume is quiescent): flush the journal, or
 *      do a full hfs_sync() for non-journaled volumes.
 *   3. Otherwise, if no transactions are active, reschedule ourselves
 *      and return early WITHOUT touching the scheduling counters.
 *
 * On the non-reschedule paths, hfs_sync_scheduled/hfs_sync_incomplete
 * are decremented at the very end so no new callback is queued while a
 * (potentially long) journal_flush() is still running.
 */
static void
hfs_syncer(void *arg0, void *unused)
{
#pragma unused(unused)

	struct hfsmount *hfsmp = arg0;
	clock_sec_t secs;
	clock_usec_t usecs;
	uint32_t delay = HFS_META_DELAY;	/* reschedule interval, in HFS_MILLISEC_SCALE units */
	uint64_t now;				/* wall-clock time in microseconds */
	/* NOTE(review): static, so this single flag is shared by every mounted
	 * volume; it is written below but never read in this function —
	 * TODO confirm whether it is still needed. */
	static int no_max=1;

	clock_get_calendar_microtime(&secs, &usecs);
	now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;

	//
	// If the amount of pending writes is more than our limit, wait
	// for 2/3 of it to drain and then flush the journal.
	//
	if (hfsmp->hfs_mp->mnt_pending_write_size > hfsmp->hfs_max_pending_io) {
		int counter=0;
		uint64_t pending_io, start, rate = 0;

		no_max = 0;

		hfs_start_transaction(hfsmp);	// so we hold off any new i/o's

		pending_io = hfsmp->hfs_mp->mnt_pending_write_size;

		clock_get_calendar_microtime(&secs, &usecs);
		start = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;

		/* Poll until the backlog drops below 1/3 of its starting size,
		 * sleeping 10 ticks per iteration, bounded at 500 iterations. */
		while(hfsmp->hfs_mp->mnt_pending_write_size > (pending_io/3) && counter++ < 500) {
			tsleep((caddr_t)hfsmp, PRIBIO, "hfs-wait-for-io-to-drain", 10);
		}

		if (counter >= 500) {
			printf("hfs: timed out waiting for io to drain (%lld)\n", (int64_t)hfsmp->hfs_mp->mnt_pending_write_size);
		}

		if (hfsmp->jnl) {
			journal_flush(hfsmp->jnl, FALSE);
		} else {
			hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel());
		}

		clock_get_calendar_microtime(&secs, &usecs);
		now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;
		hfsmp->hfs_last_sync_time = now;
		if (now != start) {
			/* guard against division by zero when no time elapsed */
			rate = ((pending_io * 1000000ULL) / (now - start));	// yields bytes per second
		}

		hfs_end_transaction(hfsmp);

		//
		// If a reasonable amount of time elapsed then check the
		// i/o rate.  If it's taking less than 1 second or more
		// than 2 seconds, adjust hfs_max_pending_io so that we
		// will allow about 1.5 seconds of i/o to queue up.
		//
		if (((now - start) >= 300000) && (rate != 0)) {
			/* scale = drain time in hundredths of a second */
			uint64_t scale = (pending_io * 100) / rate;

			if (scale < 100 || scale > 200) {
				// set it so that it should take about 1.5 seconds to drain
				hfsmp->hfs_max_pending_io = (rate * 150ULL) / 100ULL;
			}
		}

	} else if ( ((now - hfsmp->hfs_last_sync_time) >= 5000000ULL)
		    || (((now - hfsmp->hfs_last_sync_time) >= 100000LL)
			&& ((now - hfsmp->hfs_last_sync_request_time) >= 100000LL)
			&& (hfsmp->hfs_active_threads == 0)
			&& (hfsmp->hfs_global_lock_nesting == 0))) {

		//
		// Flush the journal if more than 5 seconds elapsed since
		// the last sync OR we have not sync'ed recently and the
		// last sync request time was more than 100 milliseconds
		// ago and no one is in the middle of a transaction right
		// now.  Else we defer the sync and reschedule it.
		//
		if (hfsmp->jnl) {
			/* shared global lock keeps transactions out while we flush */
			hfs_lock_global (hfsmp, HFS_SHARED_LOCK);

			journal_flush(hfsmp->jnl, FALSE);

			hfs_unlock_global (hfsmp);
		} else {
			hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel());
		}

		clock_get_calendar_microtime(&secs, &usecs);
		now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;
		hfsmp->hfs_last_sync_time = now;

	} else if (hfsmp->hfs_active_threads == 0) {
		/* Volume is quiescent but it's too soon to sync: re-arm the timer. */
		uint64_t deadline;

		clock_interval_to_deadline(delay, HFS_MILLISEC_SCALE, &deadline);
		thread_call_enter_delayed(hfsmp->hfs_syncer, deadline);

		// note: we intentionally return early here and do not
		// decrement the sync_scheduled and sync_incomplete
		// variables because we rescheduled the timer.

		return;
	}

	//
	// NOTE: we decrement these *after* we're done the journal_flush() since
	// it can take a significant amount of time and so we don't want more
	// callbacks scheduled until we're done this one.
	//
	OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled);
	OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete);
	wakeup((caddr_t)&hfsmp->hfs_sync_incomplete);
}
1101
1102
1103 extern int IOBSDIsMediaEjectable( const char *cdev_name );
1104
1105 /*
1106 * Initialization code for Red-Black Tree Allocator
1107 *
1108 * This function will build the two red-black trees necessary for allocating space
1109 * from the metadata zone as well as normal allocations. Currently, we use
1110 * an advisory read to get most of the data into the buffer cache.
1111 * This function is intended to be run in a separate thread so as not to slow down mount.
1112 *
1113 */
1114
void
hfs_initialize_allocator (struct hfsmount *hfsmp) {

#if CONFIG_HFS_ALLOC_RBTREE
	u_int32_t build_err;
	int lockflags;

	/*
	 * Grab the allocation (bitmap) file lock exclusively; journal
	 * transactions will block until we release it at the bottom.
	 */
	lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);

	/*
	 * GenerateTree expects the bitmap lock to be held on entry.  It may
	 * drop and re-take the lock along the way so other allocations can
	 * proceed, but it always returns with the lock held.  Only one tree
	 * is maintained, so the scan always starts at block 0.
	 */
	build_err = GenerateTree(hfsmp, hfsmp->totalBlocks, &lockflags, 1);
	if (build_err == 0) {
		/* Tree built successfully: mark the offset tree as live. */
		hfsmp->extent_tree_flags |= HFS_ALLOC_RB_ACTIVE;
	}

	/*
	 * The bitmap lock is still held here (see above), so the
	 * TREEBUILD_INFLIGHT bit can be cleared without re-acquiring it.
	 */
	hfsmp->extent_tree_flags &= ~HFS_ALLOC_TREEBUILD_INFLIGHT;
	if (build_err != 0) {
		/* Build failed: wake any waiters on the allocation bitmap lock. */
		wakeup((caddr_t)&hfsmp->extent_tree_flags);
	}

	hfs_systemfile_unlock(hfsmp, lockflags);
#else
#pragma unused (hfsmp)
#endif
}
1157
1158
1159 /*
1160 * Teardown code for the Red-Black Tree allocator.
1161 * This function consolidates the code which serializes with respect
1162 * to a thread that may be potentially still building the tree when we need to begin
1163 * tearing it down. Since the red-black tree may not be live when we enter this function
1164 * we return:
1165 * 1 -> Tree was live.
1166 * 0 -> Tree was not active at time of call.
1167 */
1168
int
hfs_teardown_allocator (struct hfsmount *hfsmp) {
	int tree_was_live = 0;

#if CONFIG_HFS_ALLOC_RBTREE
	int lockflags;

	/*
	 * Serialize against a tree build that may still be in flight:
	 * take the bitmap lock, then sleep until the builder finishes.
	 */
	lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);

	while ((hfsmp->extent_tree_flags & HFS_ALLOC_TREEBUILD_INFLIGHT) != 0) {
		/* Tell the builder a teardown is pending, then wait on the flags. */
		hfsmp->extent_tree_flags |= HFS_ALLOC_TEARDOWN_INFLIGHT;

		lck_rw_sleep(&(VTOC(hfsmp->hfs_allocation_vp))->c_rwlock, LCK_SLEEP_EXCLUSIVE,
		    &hfsmp->extent_tree_flags, THREAD_UNINT);
	}

	if (hfs_isrbtree_active (hfsmp)) {
		/* Tree was live at time of call; tear it down under the bitmap lock. */
		tree_was_live = 1;
		DestroyTrees(hfsmp);
	}

	hfs_systemfile_unlock(hfsmp, lockflags);
#else
#pragma unused (hfsmp)
#endif
	/* 1 -> tree was live, 0 -> tree was not active at time of call */
	return tree_was_live;
}
1207
1208
/*
 * Records whether the root volume was unmounted cleanly on the previous
 * boot.  Set in hfs_mountfs() (from the volume header's
 * kHFSVolumeUnmountedMask bit) when the root volume is mounted, and
 * exported read-only as the vfs.generic.root_unmounted_cleanly sysctl.
 */
static int hfs_root_unmounted_cleanly = 0;

SYSCTL_DECL(_vfs_generic);
SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &hfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
1213
1214 /*
1215 * Common code for mount and mountroot
1216 */
1217 int
1218 hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args,
1219 int journal_replay_only, vfs_context_t context)
1220 {
1221 struct proc *p = vfs_context_proc(context);
1222 int retval = E_NONE;
1223 struct hfsmount *hfsmp = NULL;
1224 struct buf *bp;
1225 dev_t dev;
1226 HFSMasterDirectoryBlock *mdbp = NULL;
1227 int ronly;
1228 #if QUOTA
1229 int i;
1230 #endif
1231 int mntwrapper;
1232 kauth_cred_t cred;
1233 u_int64_t disksize;
1234 daddr64_t log_blkcnt;
1235 u_int32_t log_blksize;
1236 u_int32_t phys_blksize;
1237 u_int32_t minblksize;
1238 u_int32_t iswritable;
1239 daddr64_t mdb_offset;
1240 int isvirtual = 0;
1241 int isroot = 0;
1242 int isssd;
1243 #if CONFIG_HFS_ALLOC_RBTREE
1244 thread_t allocator_thread;
1245 #endif
1246
1247 if (args == NULL) {
1248 /* only hfs_mountroot passes us NULL as the 'args' argument */
1249 isroot = 1;
1250 }
1251
1252 ronly = vfs_isrdonly(mp);
1253 dev = vnode_specrdev(devvp);
1254 cred = p ? vfs_context_ucred(context) : NOCRED;
1255 mntwrapper = 0;
1256
1257 bp = NULL;
1258 hfsmp = NULL;
1259 mdbp = NULL;
1260 minblksize = kHFSBlockSize;
1261
1262 /* Advisory locking should be handled at the VFS layer */
1263 vfs_setlocklocal(mp);
1264
1265 /* Get the logical block size (treated as physical block size everywhere) */
1266 if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&log_blksize, 0, context)) {
1267 if (HFS_MOUNT_DEBUG) {
1268 printf("hfs_mountfs: DKIOCGETBLOCKSIZE failed\n");
1269 }
1270 retval = ENXIO;
1271 goto error_exit;
1272 }
1273 if (log_blksize == 0 || log_blksize > 1024*1024*1024) {
1274 printf("hfs: logical block size 0x%x looks bad. Not mounting.\n", log_blksize);
1275 retval = ENXIO;
1276 goto error_exit;
1277 }
1278
1279 /* Get the physical block size. */
1280 retval = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_blksize, 0, context);
1281 if (retval) {
1282 if ((retval != ENOTSUP) && (retval != ENOTTY)) {
1283 if (HFS_MOUNT_DEBUG) {
1284 printf("hfs_mountfs: DKIOCGETPHYSICALBLOCKSIZE failed\n");
1285 }
1286 retval = ENXIO;
1287 goto error_exit;
1288 }
1289 /* If device does not support this ioctl, assume that physical
1290 * block size is same as logical block size
1291 */
1292 phys_blksize = log_blksize;
1293 }
1294 if (phys_blksize == 0 || phys_blksize > 1024*1024*1024) {
1295 printf("hfs: physical block size 0x%x looks bad. Not mounting.\n", phys_blksize);
1296 retval = ENXIO;
1297 goto error_exit;
1298 }
1299
1300 /* Switch to 512 byte sectors (temporarily) */
1301 if (log_blksize > 512) {
1302 u_int32_t size512 = 512;
1303
1304 if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, context)) {
1305 if (HFS_MOUNT_DEBUG) {
1306 printf("hfs_mountfs: DKIOCSETBLOCKSIZE failed \n");
1307 }
1308 retval = ENXIO;
1309 goto error_exit;
1310 }
1311 }
1312 /* Get the number of 512 byte physical blocks. */
1313 if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1314 /* resetting block size may fail if getting block count did */
1315 (void)VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context);
1316 if (HFS_MOUNT_DEBUG) {
1317 printf("hfs_mountfs: DKIOCGETBLOCKCOUNT failed\n");
1318 }
1319 retval = ENXIO;
1320 goto error_exit;
1321 }
1322 /* Compute an accurate disk size (i.e. within 512 bytes) */
1323 disksize = (u_int64_t)log_blkcnt * (u_int64_t)512;
1324
1325 /*
1326 * On Tiger it is not necessary to switch the device
1327 * block size to be 4k if there are more than 31-bits
1328 * worth of blocks but to insure compatibility with
1329 * pre-Tiger systems we have to do it.
1330 *
1331 * If the device size is not a multiple of 4K (8 * 512), then
1332 * switching the logical block size isn't going to help because
1333 * we will be unable to write the alternate volume header.
1334 * In this case, just leave the logical block size unchanged.
1335 */
1336 if (log_blkcnt > 0x000000007fffffff && (log_blkcnt & 7) == 0) {
1337 minblksize = log_blksize = 4096;
1338 if (phys_blksize < log_blksize)
1339 phys_blksize = log_blksize;
1340 }
1341
1342 /*
1343 * The cluster layer is not currently prepared to deal with a logical
1344 * block size larger than the system's page size. (It can handle
1345 * blocks per page, but not multiple pages per block.) So limit the
1346 * logical block size to the page size.
1347 */
1348 if (log_blksize > PAGE_SIZE)
1349 log_blksize = PAGE_SIZE;
1350
1351 /* Now switch to our preferred physical block size. */
1352 if (log_blksize > 512) {
1353 if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1354 if (HFS_MOUNT_DEBUG) {
1355 printf("hfs_mountfs: DKIOCSETBLOCKSIZE (2) failed\n");
1356 }
1357 retval = ENXIO;
1358 goto error_exit;
1359 }
1360 /* Get the count of physical blocks. */
1361 if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1362 if (HFS_MOUNT_DEBUG) {
1363 printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (2) failed\n");
1364 }
1365 retval = ENXIO;
1366 goto error_exit;
1367 }
1368 }
1369 /*
1370 * At this point:
1371 * minblksize is the minimum physical block size
1372 * log_blksize has our preferred physical block size
1373 * log_blkcnt has the total number of physical blocks
1374 */
1375
1376 mdb_offset = (daddr64_t)HFS_PRI_SECTOR(log_blksize);
1377 if ((retval = (int)buf_meta_bread(devvp,
1378 HFS_PHYSBLK_ROUNDDOWN(mdb_offset, (phys_blksize/log_blksize)),
1379 phys_blksize, cred, &bp))) {
1380 if (HFS_MOUNT_DEBUG) {
1381 printf("hfs_mountfs: buf_meta_bread failed with %d\n", retval);
1382 }
1383 goto error_exit;
1384 }
1385 MALLOC(mdbp, HFSMasterDirectoryBlock *, kMDBSize, M_TEMP, M_WAITOK);
1386 if (mdbp == NULL) {
1387 retval = ENOMEM;
1388 if (HFS_MOUNT_DEBUG) {
1389 printf("hfs_mountfs: MALLOC failed\n");
1390 }
1391 goto error_exit;
1392 }
1393 bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, kMDBSize);
1394 buf_brelse(bp);
1395 bp = NULL;
1396
1397 MALLOC(hfsmp, struct hfsmount *, sizeof(struct hfsmount), M_HFSMNT, M_WAITOK);
1398 if (hfsmp == NULL) {
1399 if (HFS_MOUNT_DEBUG) {
1400 printf("hfs_mountfs: MALLOC (2) failed\n");
1401 }
1402 retval = ENOMEM;
1403 goto error_exit;
1404 }
1405 bzero(hfsmp, sizeof(struct hfsmount));
1406
1407 hfs_chashinit_finish(hfsmp);
1408
1409 /*
1410 * See if the disk is a solid state device. We need this to decide what to do about
1411 * hotfiles.
1412 */
1413 if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, context) == 0) {
1414 if (isssd) {
1415 hfsmp->hfs_flags |= HFS_SSD;
1416 }
1417 }
1418
1419
1420 /*
1421 * Init the volume information structure
1422 */
1423
1424 lck_mtx_init(&hfsmp->hfs_mutex, hfs_mutex_group, hfs_lock_attr);
1425 lck_mtx_init(&hfsmp->hfc_mutex, hfs_mutex_group, hfs_lock_attr);
1426 lck_rw_init(&hfsmp->hfs_global_lock, hfs_rwlock_group, hfs_lock_attr);
1427 lck_rw_init(&hfsmp->hfs_insync, hfs_rwlock_group, hfs_lock_attr);
1428 lck_spin_init(&hfsmp->vcbFreeExtLock, hfs_spinlock_group, hfs_lock_attr);
1429
1430 vfs_setfsprivate(mp, hfsmp);
1431 hfsmp->hfs_mp = mp; /* Make VFSTOHFS work */
1432 hfsmp->hfs_raw_dev = vnode_specrdev(devvp);
1433 hfsmp->hfs_devvp = devvp;
1434 vnode_ref(devvp); /* Hold a ref on the device, dropped when hfsmp is freed. */
1435 hfsmp->hfs_logical_block_size = log_blksize;
1436 hfsmp->hfs_logical_block_count = log_blkcnt;
1437 hfsmp->hfs_physical_block_size = phys_blksize;
1438 hfsmp->hfs_log_per_phys = (phys_blksize / log_blksize);
1439 hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA;
1440 if (ronly)
1441 hfsmp->hfs_flags |= HFS_READ_ONLY;
1442 if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS)
1443 hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS;
1444
1445 #if QUOTA
1446 for (i = 0; i < MAXQUOTAS; i++)
1447 dqfileinit(&hfsmp->hfs_qfiles[i]);
1448 #endif
1449
1450 if (args) {
1451 hfsmp->hfs_uid = (args->hfs_uid == (uid_t)VNOVAL) ? UNKNOWNUID : args->hfs_uid;
1452 if (hfsmp->hfs_uid == 0xfffffffd) hfsmp->hfs_uid = UNKNOWNUID;
1453 hfsmp->hfs_gid = (args->hfs_gid == (gid_t)VNOVAL) ? UNKNOWNGID : args->hfs_gid;
1454 if (hfsmp->hfs_gid == 0xfffffffd) hfsmp->hfs_gid = UNKNOWNGID;
1455 vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid); /* tell the VFS */
1456 if (args->hfs_mask != (mode_t)VNOVAL) {
1457 hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS;
1458 if (args->flags & HFSFSMNT_NOXONFILES) {
1459 hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE);
1460 } else {
1461 hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS;
1462 }
1463 } else {
1464 hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS; /* 0777: rwx---rwx */
1465 hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE; /* 0666: no --x by default? */
1466 }
1467 if ((args->flags != (int)VNOVAL) && (args->flags & HFSFSMNT_WRAPPER))
1468 mntwrapper = 1;
1469 } else {
1470 /* Even w/o explicit mount arguments, MNT_UNKNOWNPERMISSIONS requires setting up uid, gid, and mask: */
1471 if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS) {
1472 hfsmp->hfs_uid = UNKNOWNUID;
1473 hfsmp->hfs_gid = UNKNOWNGID;
1474 vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid); /* tell the VFS */
1475 hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS; /* 0777: rwx---rwx */
1476 hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE; /* 0666: no --x by default? */
1477 }
1478 }
1479
1480 /* Find out if disk media is writable. */
1481 if (VNOP_IOCTL(devvp, DKIOCISWRITABLE, (caddr_t)&iswritable, 0, context) == 0) {
1482 if (iswritable)
1483 hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA;
1484 else
1485 hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA;
1486 }
1487
1488 // record the current time at which we're mounting this volume
1489 struct timeval tv;
1490 microtime(&tv);
1491 hfsmp->hfs_mount_time = tv.tv_sec;
1492
1493 /* Mount a standard HFS disk */
1494 if ((SWAP_BE16(mdbp->drSigWord) == kHFSSigWord) &&
1495 (mntwrapper || (SWAP_BE16(mdbp->drEmbedSigWord) != kHFSPlusSigWord))) {
1496
1497 /* On 10.6 and beyond, non read-only mounts for HFS standard vols get rejected */
1498 if (vfs_isrdwr(mp)) {
1499 retval = EROFS;
1500 goto error_exit;
1501 }
1502
1503 printf("hfs_mountfs: Mounting HFS Standard volumes was deprecated in Mac OS 10.7 \n");
1504
1505 /* Treat it as if it's read-only and not writeable */
1506 hfsmp->hfs_flags |= HFS_READ_ONLY;
1507 hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA;
1508
1509 /* If only journal replay is requested, exit immediately */
1510 if (journal_replay_only) {
1511 retval = 0;
1512 goto error_exit;
1513 }
1514
1515 if ((vfs_flags(mp) & MNT_ROOTFS)) {
1516 retval = EINVAL; /* Cannot root from HFS standard disks */
1517 goto error_exit;
1518 }
1519 /* HFS disks can only use 512 byte physical blocks */
1520 if (log_blksize > kHFSBlockSize) {
1521 log_blksize = kHFSBlockSize;
1522 if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1523 retval = ENXIO;
1524 goto error_exit;
1525 }
1526 if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1527 retval = ENXIO;
1528 goto error_exit;
1529 }
1530 hfsmp->hfs_logical_block_size = log_blksize;
1531 hfsmp->hfs_logical_block_count = log_blkcnt;
1532 hfsmp->hfs_physical_block_size = log_blksize;
1533 hfsmp->hfs_log_per_phys = 1;
1534 }
1535 if (args) {
1536 hfsmp->hfs_encoding = args->hfs_encoding;
1537 HFSTOVCB(hfsmp)->volumeNameEncodingHint = args->hfs_encoding;
1538
1539 /* establish the timezone */
1540 gTimeZone = args->hfs_timezone;
1541 }
1542
1543 retval = hfs_getconverter(hfsmp->hfs_encoding, &hfsmp->hfs_get_unicode,
1544 &hfsmp->hfs_get_hfsname);
1545 if (retval)
1546 goto error_exit;
1547
1548 retval = hfs_MountHFSVolume(hfsmp, mdbp, p);
1549 if (retval)
1550 (void) hfs_relconverter(hfsmp->hfs_encoding);
1551
1552 } else /* Mount an HFS Plus disk */ {
1553 HFSPlusVolumeHeader *vhp;
1554 off_t embeddedOffset;
1555 int jnl_disable = 0;
1556
1557 /* Get the embedded Volume Header */
1558 if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) {
1559 embeddedOffset = SWAP_BE16(mdbp->drAlBlSt) * kHFSBlockSize;
1560 embeddedOffset += (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.startBlock) *
1561 (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz);
1562
1563 /*
1564 * If the embedded volume doesn't start on a block
1565 * boundary, then switch the device to a 512-byte
1566 * block size so everything will line up on a block
1567 * boundary.
1568 */
1569 if ((embeddedOffset % log_blksize) != 0) {
1570 printf("hfs_mountfs: embedded volume offset not"
1571 " a multiple of physical block size (%d);"
1572 " switching to 512\n", log_blksize);
1573 log_blksize = 512;
1574 if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE,
1575 (caddr_t)&log_blksize, FWRITE, context)) {
1576
1577 if (HFS_MOUNT_DEBUG) {
1578 printf("hfs_mountfs: DKIOCSETBLOCKSIZE (3) failed\n");
1579 }
1580 retval = ENXIO;
1581 goto error_exit;
1582 }
1583 if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT,
1584 (caddr_t)&log_blkcnt, 0, context)) {
1585 if (HFS_MOUNT_DEBUG) {
1586 printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (3) failed\n");
1587 }
1588 retval = ENXIO;
1589 goto error_exit;
1590 }
1591 /* Note: relative block count adjustment */
1592 hfsmp->hfs_logical_block_count *=
1593 hfsmp->hfs_logical_block_size / log_blksize;
1594
1595 /* Update logical /physical block size */
1596 hfsmp->hfs_logical_block_size = log_blksize;
1597 hfsmp->hfs_physical_block_size = log_blksize;
1598 phys_blksize = log_blksize;
1599 hfsmp->hfs_log_per_phys = 1;
1600 }
1601
1602 disksize = (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.blockCount) *
1603 (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz);
1604
1605 hfsmp->hfs_logical_block_count = disksize / log_blksize;
1606
1607 mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1608 retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1609 phys_blksize, cred, &bp);
1610 if (retval) {
1611 if (HFS_MOUNT_DEBUG) {
1612 printf("hfs_mountfs: buf_meta_bread (2) failed with %d\n", retval);
1613 }
1614 goto error_exit;
1615 }
1616 bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, 512);
1617 buf_brelse(bp);
1618 bp = NULL;
1619 vhp = (HFSPlusVolumeHeader*) mdbp;
1620
1621 } else /* pure HFS+ */ {
1622 embeddedOffset = 0;
1623 vhp = (HFSPlusVolumeHeader*) mdbp;
1624 }
1625
1626 if (isroot) {
1627 hfs_root_unmounted_cleanly = (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) != 0;
1628 }
1629
1630 /*
1631 * On inconsistent disks, do not allow read-write mount
1632 * unless it is the boot volume being mounted. We also
1633 * always want to replay the journal if the journal_replay_only
1634 * flag is set because that will (most likely) get the
1635 * disk into a consistent state before fsck_hfs starts
1636 * looking at it.
1637 */
1638 if ( !(vfs_flags(mp) & MNT_ROOTFS)
1639 && (SWAP_BE32(vhp->attributes) & kHFSVolumeInconsistentMask)
1640 && !journal_replay_only
1641 && !(hfsmp->hfs_flags & HFS_READ_ONLY)) {
1642
1643 if (HFS_MOUNT_DEBUG) {
1644 printf("hfs_mountfs: failed to mount non-root inconsistent disk\n");
1645 }
1646 retval = EINVAL;
1647 goto error_exit;
1648 }
1649
1650
1651 // XXXdbg
1652 //
1653 hfsmp->jnl = NULL;
1654 hfsmp->jvp = NULL;
1655 if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) &&
1656 args->journal_disable) {
1657 jnl_disable = 1;
1658 }
1659
1660 //
1661 // We only initialize the journal here if the last person
1662 // to mount this volume was journaling aware. Otherwise
1663 // we delay journal initialization until later at the end
1664 // of hfs_MountHFSPlusVolume() because the last person who
1665 // mounted it could have messed things up behind our back
1666 // (so we need to go find the .journal file, make sure it's
1667 // the right size, re-sync up if it was moved, etc).
1668 //
1669 if ( (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion)
1670 && (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask)
1671 && !jnl_disable) {
1672
1673 // if we're able to init the journal, mark the mount
1674 // point as journaled.
1675 //
1676 if ((retval = hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred)) == 0) {
1677 vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
1678 } else {
1679 if (retval == EROFS) {
1680 // EROFS is a special error code that means the volume has an external
1681 // journal which we couldn't find. in that case we do not want to
1682 // rewrite the volume header - we'll just refuse to mount the volume.
1683 if (HFS_MOUNT_DEBUG) {
1684 printf("hfs_mountfs: hfs_early_journal_init indicated external jnl \n");
1685 }
1686 retval = EINVAL;
1687 goto error_exit;
1688 }
1689
1690 // if the journal failed to open, then set the lastMountedVersion
1691 // to be "FSK!" which fsck_hfs will see and force the fsck instead
1692 // of just bailing out because the volume is journaled.
1693 if (!ronly) {
1694 if (HFS_MOUNT_DEBUG) {
1695 printf("hfs_mountfs: hfs_early_journal_init failed, setting to FSK \n");
1696 }
1697
1698 HFSPlusVolumeHeader *jvhp;
1699
1700 hfsmp->hfs_flags |= HFS_NEED_JNL_RESET;
1701
1702 if (mdb_offset == 0) {
1703 mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1704 }
1705
1706 bp = NULL;
1707 retval = (int)buf_meta_bread(devvp,
1708 HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1709 phys_blksize, cred, &bp);
1710 if (retval == 0) {
1711 jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize));
1712
1713 if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) {
1714 printf ("hfs(1): Journal replay fail. Writing lastMountVersion as FSK!\n");
1715 jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion);
1716 buf_bwrite(bp);
1717 } else {
1718 buf_brelse(bp);
1719 }
1720 bp = NULL;
1721 } else if (bp) {
1722 buf_brelse(bp);
1723 // clear this so the error exit path won't try to use it
1724 bp = NULL;
1725 }
1726 }
1727
1728 // if this isn't the root device just bail out.
1729 // If it is the root device we just continue on
1730 // in the hopes that fsck_hfs will be able to
1731 // fix any damage that exists on the volume.
1732 if ( !(vfs_flags(mp) & MNT_ROOTFS)) {
1733 if (HFS_MOUNT_DEBUG) {
1734 printf("hfs_mountfs: hfs_early_journal_init failed, erroring out \n");
1735 }
1736 retval = EINVAL;
1737 goto error_exit;
1738 }
1739 }
1740 }
1741 // XXXdbg
1742
1743 /* Either the journal is replayed successfully, or there
1744 * was nothing to replay, or no journal exists. In any case,
1745 * return success.
1746 */
1747 if (journal_replay_only) {
1748 retval = 0;
1749 goto error_exit;
1750 }
1751
1752 (void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname);
1753
1754 retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred);
1755 /*
1756 * If the backend didn't like our physical blocksize
1757 * then retry with physical blocksize of 512.
1758 */
1759 if ((retval == ENXIO) && (log_blksize > 512) && (log_blksize != minblksize)) {
1760 printf("hfs_mountfs: could not use physical block size "
1761 "(%d) switching to 512\n", log_blksize);
1762 log_blksize = 512;
1763 if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1764 if (HFS_MOUNT_DEBUG) {
1765 printf("hfs_mountfs: DKIOCSETBLOCKSIZE (4) failed \n");
1766 }
1767 retval = ENXIO;
1768 goto error_exit;
1769 }
1770 if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1771 if (HFS_MOUNT_DEBUG) {
1772 printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (4) failed \n");
1773 }
1774 retval = ENXIO;
1775 goto error_exit;
1776 }
1777 devvp->v_specsize = log_blksize;
1778 /* Note: relative block count adjustment (in case this is an embedded volume). */
1779 hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize;
1780 hfsmp->hfs_logical_block_size = log_blksize;
1781 hfsmp->hfs_log_per_phys = hfsmp->hfs_physical_block_size / log_blksize;
1782
1783 if (hfsmp->jnl && hfsmp->jvp == devvp) {
1784 // close and re-open this with the new block size
1785 journal_close(hfsmp->jnl);
1786 hfsmp->jnl = NULL;
1787 if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) {
1788 vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
1789 } else {
1790 // if the journal failed to open, then set the lastMountedVersion
1791 // to be "FSK!" which fsck_hfs will see and force the fsck instead
1792 // of just bailing out because the volume is journaled.
1793 if (!ronly) {
1794 if (HFS_MOUNT_DEBUG) {
1795 printf("hfs_mountfs: hfs_early_journal_init (2) resetting.. \n");
1796 }
1797 HFSPlusVolumeHeader *jvhp;
1798
1799 hfsmp->hfs_flags |= HFS_NEED_JNL_RESET;
1800
1801 if (mdb_offset == 0) {
1802 mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1803 }
1804
1805 bp = NULL;
1806 retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1807 phys_blksize, cred, &bp);
1808 if (retval == 0) {
1809 jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize));
1810
1811 if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) {
1812 printf ("hfs(2): Journal replay fail. Writing lastMountVersion as FSK!\n");
1813 jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion);
1814 buf_bwrite(bp);
1815 } else {
1816 buf_brelse(bp);
1817 }
1818 bp = NULL;
1819 } else if (bp) {
1820 buf_brelse(bp);
1821 // clear this so the error exit path won't try to use it
1822 bp = NULL;
1823 }
1824 }
1825
1826 // if this isn't the root device just bail out.
1827 // If it is the root device we just continue on
1828 // in the hopes that fsck_hfs will be able to
1829 // fix any damage that exists on the volume.
1830 if ( !(vfs_flags(mp) & MNT_ROOTFS)) {
1831 if (HFS_MOUNT_DEBUG) {
1832 printf("hfs_mountfs: hfs_early_journal_init (2) failed \n");
1833 }
1834 retval = EINVAL;
1835 goto error_exit;
1836 }
1837 }
1838 }
1839
1840 /* Try again with a smaller block size... */
1841 retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred);
1842 if (retval && HFS_MOUNT_DEBUG) {
1843 printf("hfs_MountHFSPlusVolume (late) returned %d\n",retval);
1844 }
1845 }
1846 if (retval)
1847 (void) hfs_relconverter(0);
1848 }
1849
1850 // save off a snapshot of the mtime from the previous mount
1851 // (for matador).
1852 hfsmp->hfs_last_mounted_mtime = hfsmp->hfs_mtime;
1853
1854 if ( retval ) {
1855 if (HFS_MOUNT_DEBUG) {
1856 printf("hfs_mountfs: encountered failure %d \n", retval);
1857 }
1858 goto error_exit;
1859 }
1860
1861 mp->mnt_vfsstat.f_fsid.val[0] = (long)dev;
1862 mp->mnt_vfsstat.f_fsid.val[1] = vfs_typenum(mp);
1863 vfs_setmaxsymlen(mp, 0);
1864
1865 mp->mnt_vtable->vfc_vfsflags |= VFC_VFSNATIVEXATTR;
1866 #if NAMEDSTREAMS
1867 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1868 #endif
1869 if (!(hfsmp->hfs_flags & HFS_STANDARD)) {
1870 /* Tell VFS that we support directory hard links. */
1871 mp->mnt_vtable->vfc_vfsflags |= VFC_VFSDIRLINKS;
1872 } else {
1873 /* HFS standard doesn't support extended readdir! */
1874 mount_set_noreaddirext (mp);
1875 }
1876
1877 if (args) {
1878 /*
1879 * Set the free space warning levels for a non-root volume:
1880 *
1881 * Set the "danger" limit to 1% of the volume size or 100MB, whichever
1882 * is less. Set the "warning" limit to 2% of the volume size or 150MB,
1883 * whichever is less. And last, set the "desired" freespace level to
1884 * to 3% of the volume size or 200MB, whichever is less.
1885 */
1886 hfsmp->hfs_freespace_notify_dangerlimit =
1887 MIN(HFS_VERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1888 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_VERYLOWDISKTRIGGERFRACTION);
1889 hfsmp->hfs_freespace_notify_warninglimit =
1890 MIN(HFS_LOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1891 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKTRIGGERFRACTION);
1892 hfsmp->hfs_freespace_notify_desiredlevel =
1893 MIN(HFS_LOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize,
1894 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKSHUTOFFFRACTION);
1895 } else {
1896 /*
1897 * Set the free space warning levels for the root volume:
1898 *
1899 * Set the "danger" limit to 5% of the volume size or 512MB, whichever
1900 * is less. Set the "warning" limit to 10% of the volume size or 1GB,
1901 * whichever is less. And last, set the "desired" freespace level to
1902 * to 11% of the volume size or 1.25GB, whichever is less.
1903 */
1904 hfsmp->hfs_freespace_notify_dangerlimit =
1905 MIN(HFS_ROOTVERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1906 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTVERYLOWDISKTRIGGERFRACTION);
1907 hfsmp->hfs_freespace_notify_warninglimit =
1908 MIN(HFS_ROOTLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1909 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKTRIGGERFRACTION);
1910 hfsmp->hfs_freespace_notify_desiredlevel =
1911 MIN(HFS_ROOTLOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize,
1912 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKSHUTOFFFRACTION);
1913 };
1914
1915 /* Check if the file system exists on virtual device, like disk image */
1916 if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, context) == 0) {
1917 if (isvirtual) {
1918 hfsmp->hfs_flags |= HFS_VIRTUAL_DEVICE;
1919 }
1920 }
1921
1922 /* do not allow ejectability checks on the root device */
1923 if (isroot == 0) {
1924 if ((hfsmp->hfs_flags & HFS_VIRTUAL_DEVICE) == 0 &&
1925 IOBSDIsMediaEjectable(mp->mnt_vfsstat.f_mntfromname)) {
1926 hfsmp->hfs_max_pending_io = 4096*1024; // a reasonable value to start with.
1927 hfsmp->hfs_syncer = thread_call_allocate(hfs_syncer, hfsmp);
1928 if (hfsmp->hfs_syncer == NULL) {
1929 printf("hfs: failed to allocate syncer thread callback for %s (%s)\n",
1930 mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname);
1931 }
1932 }
1933 }
1934
1935 #if CONFIG_HFS_ALLOC_RBTREE
1936 /*
1937 * We spawn a thread to create the pair of red-black trees for this volume.
1938 * However, in so doing, we must be careful to ensure that if this thread is still
1939 * running after mount has finished, it doesn't interfere with an unmount. Specifically,
1940 * we'll need to set a bit that indicates we're in progress building the trees here.
1941 * Unmount will check for this bit, and then if it's set, mark a corresponding bit that
1942 * notifies the tree generation code that an unmount is waiting. Also mark the bit that
1943 * indicates the tree is live and operating.
1944 *
1945 * Only do this if we're operating on a read-write mount (we wouldn't care for read-only).
1946 */
1947
1948 if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
1949 hfsmp->extent_tree_flags |= (HFS_ALLOC_TREEBUILD_INFLIGHT | HFS_ALLOC_RB_ENABLED);
1950
1951 /* Initialize EOF counter so that the thread can assume it started at initial values */
1952 hfsmp->offset_block_end = 0;
1953 InitTree(hfsmp);
1954
1955 kernel_thread_start ((thread_continue_t) hfs_initialize_allocator , hfsmp, &allocator_thread);
1956 thread_deallocate(allocator_thread);
1957 }
1958
1959 #endif
1960
1961 /*
1962 * Start looking for free space to drop below this level and generate a
1963 * warning immediately if needed:
1964 */
1965 hfsmp->hfs_notification_conditions = 0;
1966 hfs_generate_volume_notifications(hfsmp);
1967
1968 if (ronly == 0) {
1969 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
1970 }
1971 FREE(mdbp, M_TEMP);
1972 return (0);
1973
1974 error_exit:
1975 if (bp)
1976 buf_brelse(bp);
1977 if (mdbp)
1978 FREE(mdbp, M_TEMP);
1979
1980 if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
1981 vnode_clearmountedon(hfsmp->jvp);
1982 (void)VNOP_CLOSE(hfsmp->jvp, ronly ? FREAD : FREAD|FWRITE, vfs_context_kernel());
1983 hfsmp->jvp = NULL;
1984 }
1985 if (hfsmp) {
1986 if (hfsmp->hfs_devvp) {
1987 vnode_rele(hfsmp->hfs_devvp);
1988 }
1989 hfs_delete_chash(hfsmp);
1990
1991 FREE(hfsmp, M_HFSMNT);
1992 vfs_setfsprivate(mp, NULL);
1993 }
1994 return (retval);
1995 }
1996
1997
1998 /*
1999 * Make a filesystem operational.
2000 * Nothing to do at the moment.
2001 */
2002 /* ARGSUSED */
2003 static int
2004 hfs_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context)
2005 {
2006 return (0);
2007 }
2008
2009
/*
 * unmount system call
 *
 * Flushes user files, stops the background syncer, syncs every metadata
 * B-tree file, writes the volume header, closes the journal, and finally
 * releases the device vnode and the hfsmount itself.  With MNT_FORCE the
 * teardown continues past individual flush failures.
 */
int
hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context)
{
	struct proc *p = vfs_context_proc(context);
	struct hfsmount *hfsmp = VFSTOHFS(mp);
	int retval = E_NONE;
	int flags;		/* vflush()/hfs_flushfiles() flags */
	int force;		/* non-zero for MNT_FORCE: ignore flush errors */
	int started_tr = 0;	/* non-zero once a journal transaction is open */
	int rb_used = 0;	/* non-zero if the red-black allocator tree was live */

	flags = 0;
	force = 0;
	if (mntflags & MNT_FORCE) {
		flags |= FORCECLOSE;
		force = 1;
	}

	/* Push out user file data first; on failure bail unless forced. */
	if ((retval = hfs_flushfiles(mp, flags, p)) && !force)
		return (retval);

	if (hfsmp->hfs_flags & HFS_METADATA_ZONE)
		(void) hfs_recording_suspend(hfsmp);

	/*
	 * Cancel any pending timers for this volume.  Then wait for any timers
	 * which have fired, but whose callbacks have not yet completed.
	 */
	if (hfsmp->hfs_syncer)
	{
		struct timespec ts = {0, 100000000};	/* 0.1 seconds */

		/*
		 * Cancel any timers that have been scheduled, but have not
		 * fired yet.  NOTE: The kernel considers a timer complete as
		 * soon as it starts your callback, so the kernel does not
		 * keep track of the number of callbacks in progress.
		 */
		if (thread_call_cancel(hfsmp->hfs_syncer))
			OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete);
		thread_call_free(hfsmp->hfs_syncer);
		hfsmp->hfs_syncer = NULL;

		/*
		 * This waits for all of the callbacks that were entered before
		 * we did thread_call_cancel above, but have not completed yet.
		 * Callbacks wake this channel when they finish.
		 */
		while(hfsmp->hfs_sync_incomplete > 0)
		{
			msleep((caddr_t)&hfsmp->hfs_sync_incomplete, NULL, PWAIT, "hfs_unmount", &ts);
		}

		if (hfsmp->hfs_sync_incomplete < 0)
			panic("hfs_unmount: pm_sync_incomplete underflow!\n");
	}

#if CONFIG_HFS_ALLOC_RBTREE
	/* Tear down the in-memory allocator trees; remember if they were live. */
	rb_used = hfs_teardown_allocator(hfsmp);
#endif

	/*
	 * Flush out the b-trees, volume bitmap and Volume Header
	 * (read-write mounts only).
	 */
	if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
		retval = hfs_start_transaction(hfsmp);
		if (retval == 0) {
			started_tr = 1;
		} else if (!force) {
			goto err_exit;
		}

		/* Each system file: take the cnode lock, fsync, drop the lock. */
		if (hfsmp->hfs_startup_vp) {
			(void) hfs_lock(VTOC(hfsmp->hfs_startup_vp), HFS_EXCLUSIVE_LOCK);
			retval = hfs_fsync(hfsmp->hfs_startup_vp, MNT_WAIT, 0, p);
			hfs_unlock(VTOC(hfsmp->hfs_startup_vp));
			if (retval && !force)
				goto err_exit;
		}

		if (hfsmp->hfs_attribute_vp) {
			(void) hfs_lock(VTOC(hfsmp->hfs_attribute_vp), HFS_EXCLUSIVE_LOCK);
			retval = hfs_fsync(hfsmp->hfs_attribute_vp, MNT_WAIT, 0, p);
			hfs_unlock(VTOC(hfsmp->hfs_attribute_vp));
			if (retval && !force)
				goto err_exit;
		}

		(void) hfs_lock(VTOC(hfsmp->hfs_catalog_vp), HFS_EXCLUSIVE_LOCK);
		retval = hfs_fsync(hfsmp->hfs_catalog_vp, MNT_WAIT, 0, p);
		hfs_unlock(VTOC(hfsmp->hfs_catalog_vp));
		if (retval && !force)
			goto err_exit;

		(void) hfs_lock(VTOC(hfsmp->hfs_extents_vp), HFS_EXCLUSIVE_LOCK);
		retval = hfs_fsync(hfsmp->hfs_extents_vp, MNT_WAIT, 0, p);
		hfs_unlock(VTOC(hfsmp->hfs_extents_vp));
		if (retval && !force)
			goto err_exit;

		if (hfsmp->hfs_allocation_vp) {
			(void) hfs_lock(VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK);
			retval = hfs_fsync(hfsmp->hfs_allocation_vp, MNT_WAIT, 0, p);
			hfs_unlock(VTOC(hfsmp->hfs_allocation_vp));
			if (retval && !force)
				goto err_exit;
		}

		/* Hot-files B-tree, if present and still a system vnode. */
		if (hfsmp->hfc_filevp && vnode_issystem(hfsmp->hfc_filevp)) {
			retval = hfs_fsync(hfsmp->hfc_filevp, MNT_WAIT, 0, p);
			if (retval && !force)
				goto err_exit;
		}

		/* If runtime corruption was detected, indicate that the volume
		 * was not unmounted cleanly.
		 */
		if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
			HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
		} else {
			HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask;
		}


		if (rb_used) {
			/* If the rb-tree was live, just set min_start to 0 */
			hfsmp->nextAllocation = 0;
		}
		else {
			if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
				int i;
				u_int32_t min_start = hfsmp->totalBlocks;

				// set the nextAllocation pointer to the smallest free block number
				// we've seen so on the next mount we won't rescan unnecessarily
				lck_spin_lock(&hfsmp->vcbFreeExtLock);
				for(i=0; i < (int)hfsmp->vcbFreeExtCnt; i++) {
					if (hfsmp->vcbFreeExt[i].startBlock < min_start) {
						min_start = hfsmp->vcbFreeExt[i].startBlock;
					}
				}
				lck_spin_unlock(&hfsmp->vcbFreeExtLock);
				if (min_start < hfsmp->nextAllocation) {
					hfsmp->nextAllocation = min_start;
				}
			}
		}


		retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
		if (retval) {
			/* Header didn't make it out: volume is not cleanly unmounted. */
			HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
			if (!force)
				goto err_exit;	/* could not flush everything */
		}

		if (started_tr) {
			hfs_end_transaction(hfsmp);
			started_tr = 0;
		}
	}

	if (hfsmp->jnl) {
		hfs_journal_flush(hfsmp, FALSE);
	}

	/*
	 * Invalidate our caches and release metadata vnodes
	 */
	(void) hfsUnmount(hfsmp, p);

	/* Plain HFS (not HFS+) used a text-encoding converter; release it. */
	if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
		(void) hfs_relconverter(hfsmp->hfs_encoding);

	// XXXdbg
	if (hfsmp->jnl) {
		journal_close(hfsmp->jnl);
		hfsmp->jnl = NULL;
	}

	VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context);

	/* Close a journal that lived on a separate device from the volume. */
	if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
		vnode_clearmountedon(hfsmp->jvp);
		retval = VNOP_CLOSE(hfsmp->jvp,
		                    hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE,
		                    vfs_context_kernel());
		vnode_put(hfsmp->jvp);
		hfsmp->jvp = NULL;
	}
	// XXXdbg

	/*
	 * Last chance to dump unreferenced system files.
	 */
	(void) vflush(mp, NULLVP, FORCECLOSE);

#if HFS_SPARSE_DEV
	/* Drop our reference on the backing fs (if any). */
	if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) {
		struct vnode * tmpvp;

		hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
		tmpvp = hfsmp->hfs_backingfs_rootvp;
		hfsmp->hfs_backingfs_rootvp = NULLVP;
		vnode_rele(tmpvp);
	}
#endif /* HFS_SPARSE_DEV */
	lck_mtx_destroy(&hfsmp->hfc_mutex, hfs_mutex_group);
	lck_spin_destroy(&hfsmp->vcbFreeExtLock, hfs_spinlock_group);
	vnode_rele(hfsmp->hfs_devvp);

	hfs_delete_chash(hfsmp);
	FREE(hfsmp, M_HFSMNT);

	return (0);

err_exit:
	/* Error path: close any open transaction before returning the error. */
	if (started_tr) {
		hfs_end_transaction(hfsmp);
	}
	return retval;
}
2235
2236
2237 /*
2238 * Return the root of a filesystem.
2239 */
2240 static int
2241 hfs_vfs_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context)
2242 {
2243 return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1, 0);
2244 }
2245
2246
/*
 * Do operations associated with quotas
 *
 * When the kernel is built without QUOTA support this is a stub
 * returning ENOTSUP; otherwise it decodes the composite command word
 * and dispatches to the hfs_quota* helpers.
 */
#if !QUOTA
static int
hfs_quotactl(__unused struct mount *mp, __unused int cmds, __unused uid_t uid, __unused caddr_t datap, __unused vfs_context_t context)
{
	return (ENOTSUP);
}
#else
static int
hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t context)
{
	struct proc *p = vfs_context_proc(context);
	int cmd, type, error;

	/* ~0U means "the calling user" — substitute the caller's uid. */
	if (uid == ~0U)
		uid = kauth_cred_getuid(vfs_context_ucred(context));
	cmd = cmds >> SUBCMDSHIFT;

	/*
	 * Permission check: Q_SYNC and Q_QUOTASTAT are unprivileged;
	 * Q_GETQUOTA on one's own uid is allowed; everything else
	 * requires superuser.
	 */
	switch (cmd) {
	case Q_SYNC:
	case Q_QUOTASTAT:
		break;
	case Q_GETQUOTA:
		if (uid == kauth_cred_getuid(vfs_context_ucred(context)))
			break;
		/* fall through */
	default:
		if ( (error = vfs_context_suser(context)) )
			return (error);
	}

	/* Low bits of cmds select the quota type (user/group). */
	type = cmds & SUBCMDMASK;
	if ((u_int)type >= MAXQUOTAS)
		return (EINVAL);
	/* Note: a busy mount quietly returns success without doing anything. */
	if (vfs_busy(mp, LK_NOWAIT))
		return (0);

	switch (cmd) {

	case Q_QUOTAON:
		error = hfs_quotaon(p, mp, type, datap);
		break;

	case Q_QUOTAOFF:
		error = hfs_quotaoff(p, mp, type);
		break;

	case Q_SETQUOTA:
		error = hfs_setquota(mp, uid, type, datap);
		break;

	case Q_SETUSE:
		error = hfs_setuse(mp, uid, type, datap);
		break;

	case Q_GETQUOTA:
		error = hfs_getquota(mp, uid, type, datap);
		break;

	case Q_SYNC:
		error = hfs_qsync(mp);
		break;

	case Q_QUOTASTAT:
		error = hfs_quotastat(mp, type, datap);
		break;

	default:
		error = EINVAL;
		break;
	}
	vfs_unbusy(mp);

	return (error);
}
#endif /* QUOTA */
2325
/* Subtype is composite of bits */
/* These values are combined into sbp->f_fssubtype by hfs_statfs() below. */
#define HFS_SUBTYPE_JOURNALED 0x01
#define HFS_SUBTYPE_CASESENSITIVE 0x02
/* bits 2 - 6 reserved */
#define HFS_SUBTYPE_STANDARDHFS 0x80
2331
2332 /*
2333 * Get file system statistics.
2334 */
2335 int
2336 hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, __unused vfs_context_t context)
2337 {
2338 ExtendedVCB *vcb = VFSTOVCB(mp);
2339 struct hfsmount *hfsmp = VFSTOHFS(mp);
2340 u_int32_t freeCNIDs;
2341 u_int16_t subtype = 0;
2342
2343 freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)vcb->vcbNxtCNID;
2344
2345 sbp->f_bsize = (u_int32_t)vcb->blockSize;
2346 sbp->f_iosize = (size_t)cluster_max_io_size(mp, 0);
2347 sbp->f_blocks = (u_int64_t)((u_int32_t)vcb->totalBlocks);
2348 sbp->f_bfree = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 0));
2349 sbp->f_bavail = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 1));
2350 sbp->f_files = (u_int64_t)((u_int32_t )(vcb->totalBlocks - 2)); /* max files is constrained by total blocks */
2351 sbp->f_ffree = (u_int64_t)((u_int32_t )(MIN(freeCNIDs, sbp->f_bavail)));
2352
2353 /*
2354 * Subtypes (flavors) for HFS
2355 * 0: Mac OS Extended
2356 * 1: Mac OS Extended (Journaled)
2357 * 2: Mac OS Extended (Case Sensitive)
2358 * 3: Mac OS Extended (Case Sensitive, Journaled)
2359 * 4 - 127: Reserved
2360 * 128: Mac OS Standard
2361 *
2362 */
2363 if (hfsmp->hfs_flags & HFS_STANDARD) {
2364 subtype = HFS_SUBTYPE_STANDARDHFS;
2365 } else /* HFS Plus */ {
2366 if (hfsmp->jnl)
2367 subtype |= HFS_SUBTYPE_JOURNALED;
2368 if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE)
2369 subtype |= HFS_SUBTYPE_CASESENSITIVE;
2370 }
2371 sbp->f_fssubtype = subtype;
2372
2373 return (0);
2374 }
2375
2376
//
// XXXdbg -- this is a callback to be used by the journal to
//           get meta data blocks flushed out to disk.
//
// XXXdbg -- be smarter and don't flush *every* block on each
//           call.  try to only flush some so we don't wind up
//           being too synchronous.
//
__private_extern__
void
hfs_sync_metadata(void *arg)
{
	struct mount *mp = (struct mount *)arg;	/* journal passes the mount as an opaque arg */
	struct hfsmount *hfsmp;
	ExtendedVCB *vcb;
	buf_t bp;
	int retval;
	daddr64_t priIDSector;	/* logical block of the primary volume header */
	hfsmp = VFSTOHFS(mp);
	vcb = HFSTOVCB(hfsmp);

	// now make sure the super block is flushed
	priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
				  HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));

	retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
			HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
			hfsmp->hfs_physical_block_size, NOCRED, &bp);
	/* ENXIO (device gone) is expected during e.g. forced ejects — stay quiet. */
	if ((retval != 0 ) && (retval != ENXIO)) {
		printf("hfs_sync_metadata: can't read volume header at %d! (retval 0x%x)\n",
		       (int)priIDSector, retval);
	}

	/* Write the buffer only if it is dirty (B_DELWRI) and not locked. */
	if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) {
		buf_bwrite(bp);
	} else if (bp) {
		buf_brelse(bp);
	}

	// the alternate super block...
	// XXXdbg - we probably don't need to do this each and every time.
	//          hfs_btreeio.c:FlushAlternate() should flag when it was
	//          written...
	if (hfsmp->hfs_alt_id_sector) {
		retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
				HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
				hfsmp->hfs_physical_block_size, NOCRED, &bp);
		/* Same dirty-and-unlocked test as the primary header above. */
		if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) {
			buf_bwrite(bp);
		} else if (bp) {
			buf_brelse(bp);
		}
	}
}
2431
2432
/*
 * Per-call context handed to hfs_sync_callback() for each vnode
 * visited by vnode_iterate() in hfs_sync().
 */
struct hfs_sync_cargs {
	kauth_cred_t cred;	/* caller's credential (set by hfs_sync; not read by the callback in this file) */
	struct proc *p;		/* process on whose behalf the sync runs */
	int waitfor;		/* MNT_WAIT/MNT_NOWAIT, forwarded to hfs_fsync() */
	int error;		/* last non-zero error returned by hfs_fsync(), if any */
};
2439
2440
2441 static int
2442 hfs_sync_callback(struct vnode *vp, void *cargs)
2443 {
2444 struct cnode *cp;
2445 struct hfs_sync_cargs *args;
2446 int error;
2447
2448 args = (struct hfs_sync_cargs *)cargs;
2449
2450 if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK) != 0) {
2451 return (VNODE_RETURNED);
2452 }
2453 cp = VTOC(vp);
2454
2455 if ((cp->c_flag & C_MODIFIED) ||
2456 (cp->c_touch_acctime | cp->c_touch_chgtime | cp->c_touch_modtime) ||
2457 vnode_hasdirtyblks(vp)) {
2458 error = hfs_fsync(vp, args->waitfor, 0, args->p);
2459
2460 if (error)
2461 args->error = error;
2462 }
2463 hfs_unlock(cp);
2464 return (VNODE_RETURNED);
2465 }
2466
2467
2468
2469 /*
2470 * Go through the disk queues to initiate sandbagged IO;
2471 * go through the inodes to write those that have been modified;
2472 * initiate the writing of the super block if it has been modified.
2473 *
2474 * Note: we are always called with the filesystem marked `MPBUSY'.
2475 */
2476 int
2477 hfs_sync(struct mount *mp, int waitfor, vfs_context_t context)
2478 {
2479 struct proc *p = vfs_context_proc(context);
2480 struct cnode *cp;
2481 struct hfsmount *hfsmp;
2482 ExtendedVCB *vcb;
2483 struct vnode *meta_vp[4];
2484 int i;
2485 int error, allerror = 0;
2486 struct hfs_sync_cargs args;
2487
2488 hfsmp = VFSTOHFS(mp);
2489
2490 /*
2491 * hfs_changefs might be manipulating vnodes so back off
2492 */
2493 if (hfsmp->hfs_flags & HFS_IN_CHANGEFS)
2494 return (0);
2495
2496 if (hfsmp->hfs_flags & HFS_READ_ONLY)
2497 return (EROFS);
2498
2499 /* skip over frozen volumes */
2500 if (!lck_rw_try_lock_shared(&hfsmp->hfs_insync))
2501 return 0;
2502
2503 args.cred = kauth_cred_get();
2504 args.waitfor = waitfor;
2505 args.p = p;
2506 args.error = 0;
2507 /*
2508 * hfs_sync_callback will be called for each vnode
2509 * hung off of this mount point... the vnode will be
2510 * properly referenced and unreferenced around the callback
2511 */
2512 vnode_iterate(mp, 0, hfs_sync_callback, (void *)&args);
2513
2514 if (args.error)
2515 allerror = args.error;
2516
2517 vcb = HFSTOVCB(hfsmp);
2518
2519 meta_vp[0] = vcb->extentsRefNum;
2520 meta_vp[1] = vcb->catalogRefNum;
2521 meta_vp[2] = vcb->allocationsRefNum; /* This is NULL for standard HFS */
2522 meta_vp[3] = hfsmp->hfs_attribute_vp; /* Optional file */
2523
2524 /* Now sync our three metadata files */
2525 for (i = 0; i < 4; ++i) {
2526 struct vnode *btvp;
2527
2528 btvp = meta_vp[i];;
2529 if ((btvp==0) || (vnode_mount(btvp) != mp))
2530 continue;
2531
2532 /* XXX use hfs_systemfile_lock instead ? */
2533 (void) hfs_lock(VTOC(btvp), HFS_EXCLUSIVE_LOCK);
2534 cp = VTOC(btvp);
2535
2536 if (((cp->c_flag & C_MODIFIED) == 0) &&
2537 (cp->c_touch_acctime == 0) &&
2538 (cp->c_touch_chgtime == 0) &&
2539 (cp->c_touch_modtime == 0) &&
2540 vnode_hasdirtyblks(btvp) == 0) {
2541 hfs_unlock(VTOC(btvp));
2542 continue;
2543 }
2544 error = vnode_get(btvp);
2545 if (error) {
2546 hfs_unlock(VTOC(btvp));
2547 continue;
2548 }
2549 if ((error = hfs_fsync(btvp, waitfor, 0, p)))
2550 allerror = error;
2551
2552 hfs_unlock(cp);
2553 vnode_put(btvp);
2554 };
2555
2556 /*
2557 * Force stale file system control information to be flushed.
2558 */
2559 if (vcb->vcbSigWord == kHFSSigWord) {
2560 if ((error = VNOP_FSYNC(hfsmp->hfs_devvp, waitfor, context))) {
2561 allerror = error;
2562 }
2563 }
2564 #if QUOTA
2565 hfs_qsync(mp);
2566 #endif /* QUOTA */
2567
2568 hfs_hotfilesync(hfsmp, vfs_context_kernel());
2569
2570 /*
2571 * Write back modified superblock.
2572 */
2573 if (IsVCBDirty(vcb)) {
2574 error = hfs_flushvolumeheader(hfsmp, waitfor, 0);
2575 if (error)
2576 allerror = error;
2577 }
2578
2579 if (hfsmp->jnl) {
2580 hfs_journal_flush(hfsmp, FALSE);
2581 }
2582
2583 {
2584 clock_sec_t secs;
2585 clock_usec_t usecs;
2586 uint64_t now;
2587
2588 clock_get_calendar_microtime(&secs, &usecs);
2589 now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;
2590 hfsmp->hfs_last_sync_time = now;
2591 }
2592
2593 lck_rw_unlock_shared(&hfsmp->hfs_insync);
2594 return (allerror);
2595 }
2596
2597
2598 /*
2599 * File handle to vnode
2600 *
2601 * Have to be really careful about stale file handles:
2602 * - check that the cnode id is valid
2603 * - call hfs_vget() to get the locked cnode
2604 * - check for an unallocated cnode (i_mode == 0)
2605 * - check that the given client host has export rights and return
2606 * those rights via. exflagsp and credanonp
2607 */
2608 static int
2609 hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, __unused vfs_context_t context)
2610 {
2611 struct hfsfid *hfsfhp;
2612 struct vnode *nvp;
2613 int result;
2614
2615 *vpp = NULL;
2616 hfsfhp = (struct hfsfid *)fhp;
2617
2618 if (fhlen < (int)sizeof(struct hfsfid))
2619 return (EINVAL);
2620
2621 result = hfs_vget(VFSTOHFS(mp), ntohl(hfsfhp->hfsfid_cnid), &nvp, 0, 0);
2622 if (result) {
2623 if (result == ENOENT)
2624 result = ESTALE;
2625 return result;
2626 }
2627
2628 /*
2629 * We used to use the create time as the gen id of the file handle,
2630 * but it is not static enough because it can change at any point
2631 * via system calls. We still don't have another volume ID or other
2632 * unique identifier to use for a generation ID across reboots that
2633 * persists until the file is removed. Using only the CNID exposes
2634 * us to the potential wrap-around case, but as of 2/2008, it would take
2635 * over 2 months to wrap around if the machine did nothing but allocate
2636 * CNIDs. Using some kind of wrap counter would only be effective if
2637 * each file had the wrap counter associated with it. For now,
2638 * we use only the CNID to identify the file as it's good enough.
2639 */
2640
2641 *vpp = nvp;
2642
2643 hfs_unlock(VTOC(nvp));
2644 return (0);
2645 }
2646
2647
2648 /*
2649 * Vnode pointer to File handle
2650 */
2651 /* ARGSUSED */
2652 static int
2653 hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, __unused vfs_context_t context)
2654 {
2655 struct cnode *cp;
2656 struct hfsfid *hfsfhp;
2657
2658 if (ISHFS(VTOVCB(vp)))
2659 return (ENOTSUP); /* hfs standard is not exportable */
2660
2661 if (*fhlenp < (int)sizeof(struct hfsfid))
2662 return (EOVERFLOW);
2663
2664 cp = VTOC(vp);
2665 hfsfhp = (struct hfsfid *)fhp;
2666 /* only the CNID is used to identify the file now */
2667 hfsfhp->hfsfid_cnid = htonl(cp->c_fileid);
2668 hfsfhp->hfsfid_gen = htonl(cp->c_fileid);
2669 *fhlenp = sizeof(struct hfsfid);
2670
2671 return (0);
2672 }
2673
2674
/*
 * Initial HFS filesystems, done only once.
 *
 * Sets up the cnode hash, text-encoding converters, B-tree reserve
 * bookkeeping, and the lock groups used throughout HFS.  Guarded by a
 * static flag so repeat calls are no-ops.
 * NOTE(review): the `done` guard is not itself synchronized — presumably
 * this is only invoked once during VFS registration; confirm with caller.
 */
static int
hfs_init(__unused struct vfsconf *vfsp)
{
	static int done = 0;

	if (done)
		return (0);
	done = 1;
	hfs_chashinit();
	hfs_converterinit();

	BTReserveSetup();


	/* Lock attributes/groups must exist before any hfsmount locks are made. */
	hfs_lock_attr = lck_attr_alloc_init();
	hfs_group_attr = lck_grp_attr_alloc_init();
	hfs_mutex_group = lck_grp_alloc_init("hfs-mutex", hfs_group_attr);
	hfs_rwlock_group = lck_grp_alloc_init("hfs-rwlock", hfs_group_attr);
	hfs_spinlock_group = lck_grp_alloc_init("hfs-spinlock", hfs_group_attr);

#if HFS_COMPRESSION
	decmpfs_init();
#endif

	return (0);
}
2704
2705 static int
2706 hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp)
2707 {
2708 struct hfsmount * hfsmp;
2709 char fstypename[MFSNAMELEN];
2710
2711 if (vp == NULL)
2712 return (EINVAL);
2713
2714 if (!vnode_isvroot(vp))
2715 return (EINVAL);
2716
2717 vnode_vfsname(vp, fstypename);
2718 if (strncmp(fstypename, "hfs", sizeof(fstypename)) != 0)
2719 return (EINVAL);
2720
2721 hfsmp = VTOHFS(vp);
2722
2723 if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
2724 return (EINVAL);
2725
2726 *hfsmpp = hfsmp;
2727
2728 return (0);
2729 }
2730
2731 // XXXdbg
2732 #include <sys/filedesc.h>
2733
2734 /*
2735 * HFS filesystem related variables.
2736 */
2737 int
2738 hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp,
2739 user_addr_t newp, size_t newlen, vfs_context_t context)
2740 {
2741 struct proc *p = vfs_context_proc(context);
2742 int error;
2743 struct hfsmount *hfsmp;
2744
2745 /* all sysctl names at this level are terminal */
2746
2747 if (name[0] == HFS_ENCODINGBIAS) {
2748 int bias;
2749
2750 bias = hfs_getencodingbias();
2751 error = sysctl_int(oldp, oldlenp, newp, newlen, &bias);
2752 if (error == 0 && newp)
2753 hfs_setencodingbias(bias);
2754 return (error);
2755
2756 } else if (name[0] == HFS_EXTEND_FS) {
2757 u_int64_t newsize;
2758 vnode_t vp = vfs_context_cwd(context);
2759
2760 if (newp == USER_ADDR_NULL || vp == NULLVP)
2761 return (EINVAL);
2762 if ((error = hfs_getmountpoint(vp, &hfsmp)))
2763 return (error);
2764 error = sysctl_quad(oldp, oldlenp, newp, newlen, (quad_t *)&newsize);
2765 if (error)
2766 return (error);
2767
2768 error = hfs_extendfs(hfsmp, newsize, context);
2769 return (error);
2770
2771 } else if (name[0] == HFS_ENCODINGHINT) {
2772 size_t bufsize;
2773 size_t bytes;
2774 u_int32_t hint;
2775 u_int16_t *unicode_name = NULL;
2776 char *filename = NULL;
2777
2778 if ((newlen <= 0) || (newlen > MAXPATHLEN))
2779 return (EINVAL);
2780
2781 bufsize = MAX(newlen * 3, MAXPATHLEN);
2782 MALLOC(filename, char *, newlen, M_TEMP, M_WAITOK);
2783 if (filename == NULL) {
2784 error = ENOMEM;
2785 goto encodinghint_exit;
2786 }
2787 MALLOC(unicode_name, u_int16_t *, bufsize, M_TEMP, M_WAITOK);
2788 if (filename == NULL) {
2789 error = ENOMEM;
2790 goto encodinghint_exit;
2791 }
2792
2793 error = copyin(newp, (caddr_t)filename, newlen);
2794 if (error == 0) {
2795 error = utf8_decodestr((u_int8_t *)filename, newlen - 1, unicode_name,
2796 &bytes, bufsize, 0, UTF_DECOMPOSED);
2797 if (error == 0) {
2798 hint = hfs_pickencoding(unicode_name, bytes / 2);
2799 error = sysctl_int(oldp, oldlenp, USER_ADDR_NULL, 0, (int32_t *)&hint);
2800 }
2801 }
2802
2803 encodinghint_exit:
2804 if (unicode_name)
2805 FREE(unicode_name, M_TEMP);
2806 if (filename)
2807 FREE(filename, M_TEMP);
2808 return (error);
2809
2810 } else if (name[0] == HFS_ENABLE_JOURNALING) {
2811 // make the file system journaled...
2812 vnode_t vp = vfs_context_cwd(context);
2813 vnode_t jvp;
2814 ExtendedVCB *vcb;
2815 struct cat_attr jnl_attr, jinfo_attr;
2816 struct cat_fork jnl_fork, jinfo_fork;
2817 void *jnl = NULL;
2818 int lockflags;
2819
2820 /* Only root can enable journaling */
2821 if (!is_suser()) {
2822 return (EPERM);
2823 }
2824 if (vp == NULLVP)
2825 return EINVAL;
2826
2827 hfsmp = VTOHFS(vp);
2828 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2829 return EROFS;
2830 }
2831 if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) {
2832 printf("hfs: can't make a plain hfs volume journaled.\n");
2833 return EINVAL;
2834 }
2835
2836 if (hfsmp->jnl) {
2837 printf("hfs: volume @ mp %p is already journaled!\n", vnode_mount(vp));
2838 return EAGAIN;
2839 }
2840
2841 vcb = HFSTOVCB(hfsmp);
2842 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2843 if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 ||
2844 BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) {
2845
2846 printf("hfs: volume has a btree w/non-contiguous nodes. can not enable journaling.\n");
2847 hfs_systemfile_unlock(hfsmp, lockflags);
2848 return EINVAL;
2849 }
2850 hfs_systemfile_unlock(hfsmp, lockflags);
2851
2852 // make sure these both exist!
2853 if ( GetFileInfo(vcb, kHFSRootFolderID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0
2854 || GetFileInfo(vcb, kHFSRootFolderID, ".journal", &jnl_attr, &jnl_fork) == 0) {
2855
2856 return EINVAL;
2857 }
2858
2859 hfs_sync(hfsmp->hfs_mp, MNT_WAIT, context);
2860
2861 printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
2862 (off_t)name[2], (off_t)name[3]);
2863
2864 //
2865 // XXXdbg - note that currently (Sept, 08) hfs_util does not support
2866 // enabling the journal on a separate device so it is safe
2867 // to just copy hfs_devvp here. If hfs_util gets the ability
2868 // to dynamically enable the journal on a separate device then
2869 // we will have to do the same thing as hfs_early_journal_init()
2870 // to locate and open the journal device.
2871 //
2872 jvp = hfsmp->hfs_devvp;
2873 jnl = journal_create(jvp,
2874 (off_t)name[2] * (off_t)HFSTOVCB(hfsmp)->blockSize
2875 + HFSTOVCB(hfsmp)->hfsPlusIOPosOffset,
2876 (off_t)((unsigned)name[3]),
2877 hfsmp->hfs_devvp,
2878 hfsmp->hfs_logical_block_size,
2879 0,
2880 0,
2881 hfs_sync_metadata, hfsmp->hfs_mp);
2882
2883 /*
2884 * Set up the trim callback function so that we can add
2885 * recently freed extents to the free extent cache once
2886 * the transaction that freed them is written to the
2887 * journal on disk.
2888 */
2889 if (jnl)
2890 journal_trim_set_callback(jnl, hfs_trim_callback, hfsmp);
2891
2892 if (jnl == NULL) {
2893 printf("hfs: FAILED to create the journal!\n");
2894 if (jvp && jvp != hfsmp->hfs_devvp) {
2895 vnode_clearmountedon(jvp);
2896 VNOP_CLOSE(jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel());
2897 }
2898 jvp = NULL;
2899
2900 return EINVAL;
2901 }
2902
2903 hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
2904
2905 /*
2906 * Flush all dirty metadata buffers.
2907 */
2908 buf_flushdirtyblks(hfsmp->hfs_devvp, TRUE, 0, "hfs_sysctl");
2909 buf_flushdirtyblks(hfsmp->hfs_extents_vp, TRUE, 0, "hfs_sysctl");
2910 buf_flushdirtyblks(hfsmp->hfs_catalog_vp, TRUE, 0, "hfs_sysctl");
2911 buf_flushdirtyblks(hfsmp->hfs_allocation_vp, TRUE, 0, "hfs_sysctl");
2912 if (hfsmp->hfs_attribute_vp)
2913 buf_flushdirtyblks(hfsmp->hfs_attribute_vp, TRUE, 0, "hfs_sysctl");
2914
2915 HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1];
2916 HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask;
2917 hfsmp->jvp = jvp;
2918 hfsmp->jnl = jnl;
2919
2920 // save this off for the hack-y check in hfs_remove()
2921 hfsmp->jnl_start = (u_int32_t)name[2];
2922 hfsmp->jnl_size = (off_t)((unsigned)name[3]);
2923 hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid;
2924 hfsmp->hfs_jnlfileid = jnl_attr.ca_fileid;
2925
2926 vfs_setflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
2927
2928 hfs_unlock_global (hfsmp);
2929 hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
2930
2931 {
2932 fsid_t fsid;
2933
2934 fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev;
2935 fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp));
2936 vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL);
2937 }
2938 return 0;
2939 } else if (name[0] == HFS_DISABLE_JOURNALING) {
2940 // clear the journaling bit
2941 vnode_t vp = vfs_context_cwd(context);
2942
2943 /* Only root can disable journaling */
2944 if (!is_suser()) {
2945 return (EPERM);
2946 }
2947 if (vp == NULLVP)
2948 return EINVAL;
2949
2950 hfsmp = VTOHFS(vp);
2951
2952 /*
2953 * Disabling journaling is disallowed on volumes with directory hard links
2954 * because we have not tested the relevant code path.
2955 */
2956 if (hfsmp->hfs_private_attr[DIR_HARDLINKS].ca_entries != 0){
2957 printf("hfs: cannot disable journaling on volumes with directory hardlinks\n");
2958 return EPERM;
2959 }
2960
2961 printf("hfs: disabling journaling for mount @ %p\n", vnode_mount(vp));
2962
2963 hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
2964
2965 // Lights out for you buddy!
2966 journal_close(hfsmp->jnl);
2967 hfsmp->jnl = NULL;
2968
2969 if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
2970 vnode_clearmountedon(hfsmp->jvp);
2971 VNOP_CLOSE(hfsmp->jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel());
2972 vnode_put(hfsmp->jvp);
2973 }
2974 hfsmp->jvp = NULL;
2975 vfs_clearflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
2976 hfsmp->jnl_start = 0;
2977 hfsmp->hfs_jnlinfoblkid = 0;
2978 hfsmp->hfs_jnlfileid = 0;
2979
2980 HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask;
2981
2982 hfs_unlock_global (hfsmp);
2983
2984 hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
2985
2986 {
2987 fsid_t fsid;
2988
2989 fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev;
2990 fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp));
2991 vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL);
2992 }
2993 return 0;
2994 } else if (name[0] == HFS_GET_JOURNAL_INFO) {
2995 vnode_t vp = vfs_context_cwd(context);
2996 off_t jnl_start, jnl_size;
2997
2998 if (vp == NULLVP)
2999 return EINVAL;
3000
3001 /* 64-bit processes won't work with this sysctl -- can't fit a pointer into an int! */
3002 if (proc_is64bit(current_proc()))
3003 return EINVAL;
3004
3005 hfsmp = VTOHFS(vp);
3006 if (hfsmp->jnl == NULL) {
3007 jnl_start = 0;
3008 jnl_size = 0;
3009 } else {
3010 jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
3011 jnl_size = (off_t)hfsmp->jnl_size;
3012 }
3013
3014 if ((error = copyout((caddr_t)&jnl_start, CAST_USER_ADDR_T(name[1]), sizeof(off_t))) != 0) {
3015 return error;
3016 }
3017 if ((error = copyout((caddr_t)&jnl_size, CAST_USER_ADDR_T(name[2]), sizeof(off_t))) != 0) {
3018 return error;
3019 }
3020
3021 return 0;
3022 } else if (name[0] == HFS_SET_PKG_EXTENSIONS) {
3023
3024 return set_package_extensions_table((user_addr_t)((unsigned)name[1]), name[2], name[3]);
3025
3026 } else if (name[0] == VFS_CTL_QUERY) {
3027 struct sysctl_req *req;
3028 union union_vfsidctl vc;
3029 struct mount *mp;
3030 struct vfsquery vq;
3031
3032 req = CAST_DOWN(struct sysctl_req *, oldp); /* we're new style vfs sysctl. */
3033
3034 error = SYSCTL_IN(req, &vc, proc_is64bit(p)? sizeof(vc.vc64):sizeof(vc.vc32));
3035 if (error) return (error);
3036
3037 mp = vfs_getvfs(&vc.vc32.vc_fsid); /* works for 32 and 64 */
3038 if (mp == NULL) return (ENOENT);
3039
3040 hfsmp = VFSTOHFS(mp);
3041 bzero(&vq, sizeof(vq));
3042 vq.vq_flags = hfsmp->hfs_notification_conditions;
3043 return SYSCTL_OUT(req, &vq, sizeof(vq));;
3044 } else if (name[0] == HFS_REPLAY_JOURNAL) {
3045 vnode_t devvp = NULL;
3046 int device_fd;
3047 if (namelen != 2) {
3048 return (EINVAL);
3049 }
3050 device_fd = name[1];
3051 error = file_vnode(device_fd, &devvp);
3052 if (error) {
3053 return error;
3054 }
3055 error = vnode_getwithref(devvp);
3056 if (error) {
3057 file_drop(device_fd);
3058 return error;
3059 }
3060 error = hfs_journal_replay(devvp, context);
3061 file_drop(device_fd);
3062 vnode_put(devvp);
3063 return error;
3064 } else if (name[0] == HFS_ENABLE_RESIZE_DEBUG) {
3065 hfs_resize_debug = 1;
3066 printf ("hfs_sysctl: Enabled volume resize debugging.\n");
3067 return 0;
3068 }
3069
3070 return (ENOTSUP);
3071 }
3072
3073 /*
3074 * hfs_vfs_vget is not static since it is used in hfs_readwrite.c to support
3075 * the build_path ioctl. We use it to leverage the code below that updates
3076 * the origin list cache if necessary
3077 */
3078
3079 int
3080 hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_context_t context)
3081 {
3082 int error;
3083 int lockflags;
3084 struct hfsmount *hfsmp;
3085
3086 hfsmp = VFSTOHFS(mp);
3087
3088 error = hfs_vget(hfsmp, (cnid_t)ino, vpp, 1, 0);
3089 if (error)
3090 return (error);
3091
3092 /*
3093 * ADLs may need to have their origin state updated
3094 * since build_path needs a valid parent. The same is true
3095 * for hardlinked files as well. There isn't a race window here
3096 * in re-acquiring the cnode lock since we aren't pulling any data
3097 * out of the cnode; instead, we're going to the catalog.
3098 */
3099 if ((VTOC(*vpp)->c_flag & C_HARDLINK) &&
3100 (hfs_lock(VTOC(*vpp), HFS_EXCLUSIVE_LOCK) == 0)) {
3101 cnode_t *cp = VTOC(*vpp);
3102 struct cat_desc cdesc;
3103
3104 if (!hfs_haslinkorigin(cp)) {
3105 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
3106 error = cat_findname(hfsmp, (cnid_t)ino, &cdesc);
3107 hfs_systemfile_unlock(hfsmp, lockflags);
3108 if (error == 0) {
3109 if ((cdesc.cd_parentcnid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) &&
3110 (cdesc.cd_parentcnid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid)) {
3111 hfs_savelinkorigin(cp, cdesc.cd_parentcnid);
3112 }
3113 cat_releasedesc(&cdesc);
3114 }
3115 }
3116 hfs_unlock(cp);
3117 }
3118 return (0);
3119 }
3120
3121
/*
 * Look up an HFS object by catalog node ID.
 *
 * The object is returned with an iocount reference and the cnode locked
 * (unless 'skiplock' is nonzero, in which case the cnode is unlocked
 * before returning).
 *
 * If the object is a file then it will represent the data fork.
 *
 * hfsmp         - mount to search
 * cnid          - catalog node ID of the object
 * vpp           - (out) the resulting vnode, or NULL on error
 * skiplock      - if nonzero, return the cnode unlocked
 * allow_deleted - passed to the cnode-hash lookup; permits finding a
 *                 vnode in the open-unlinked state
 *
 * Returns 0 on success, ENOENT for unexported/private/unlinked IDs,
 * or another errno from the catalog/vnode layers.
 */
int
hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock, int allow_deleted)
{
	struct vnode *vp = NULLVP;
	struct cat_desc cndesc;
	struct cat_attr cnattr;
	struct cat_fork cnfork;
	u_int32_t linkref = 0;
	int error;

	/* Check for cnids that shouldn't be exported. */
	if ((cnid < kHFSFirstUserCatalogNodeID) &&
	    (cnid != kHFSRootFolderID && cnid != kHFSRootParentID)) {
		return (ENOENT);
	}
	/* Don't export our private directories. */
	if (cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid ||
	    cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) {
		return (ENOENT);
	}
	/*
	 * Check the cnode hash first; a hit avoids the catalog lookup entirely.
	 */
	vp = hfs_chash_getvnode(hfsmp, cnid, 0, skiplock, allow_deleted);
	if (vp) {
		*vpp = vp;
		return(0);
	}

	bzero(&cndesc, sizeof(cndesc));
	bzero(&cnattr, sizeof(cnattr));
	bzero(&cnfork, sizeof(cnfork));

	/*
	 * Not in hash, lookup in catalog.
	 *
	 * kHFSRootParentID has no catalog record; synthesize a descriptor
	 * and attributes for the root folder instead.
	 */
	if (cnid == kHFSRootParentID) {
		static char hfs_rootname[] = "/";

		cndesc.cd_nameptr = (const u_int8_t *)&hfs_rootname[0];
		cndesc.cd_namelen = 1;
		cndesc.cd_parentcnid = kHFSRootParentID;
		cndesc.cd_cnid = kHFSRootFolderID;
		cndesc.cd_flags = CD_ISDIR;

		cnattr.ca_fileid = kHFSRootFolderID;
		cnattr.ca_linkcount = 1;
		cnattr.ca_entries = 1;
		cnattr.ca_dircount = 1;
		cnattr.ca_mode = (S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO);
	} else {
		int lockflags;
		cnid_t pid;
		const char *nameptr;

		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
		error = cat_idlookup(hfsmp, cnid, 0, &cndesc, &cnattr, &cnfork);
		hfs_systemfile_unlock(hfsmp, lockflags);

		if (error) {
			*vpp = NULL;
			return (error);
		}

		/*
		 * Check for a raw hardlink inode and save its linkref.
		 * Inodes live in the private metadata directories with names
		 * of the form "<prefix><linkref>"; the decimal suffix is the
		 * link reference number.
		 */
		pid = cndesc.cd_parentcnid;
		nameptr = (const char *)cndesc.cd_nameptr;

		if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) &&
		    (bcmp(nameptr, HFS_INODE_PREFIX, HFS_INODE_PREFIX_LEN) == 0)) {
			linkref = strtoul(&nameptr[HFS_INODE_PREFIX_LEN], NULL, 10);

		} else if ((pid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) &&
		           (bcmp(nameptr, HFS_DIRINODE_PREFIX, HFS_DIRINODE_PREFIX_LEN) == 0)) {
			linkref = strtoul(&nameptr[HFS_DIRINODE_PREFIX_LEN], NULL, 10);

		} else if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) &&
		           (bcmp(nameptr, HFS_DELETE_PREFIX, HFS_DELETE_PREFIX_LEN) == 0)) {
			*vpp = NULL;
			cat_releasedesc(&cndesc);
			return (ENOENT);	/* open unlinked file */
		}
	}

	/*
	 * Finish initializing cnode descriptor for hardlinks.
	 *
	 * We need a valid name and parent for reverse lookups.
	 */
	if (linkref) {
		cnid_t nextlinkid;
		cnid_t prevlinkid;
		struct cat_desc linkdesc;
		int lockflags;

		cnattr.ca_linkref = linkref;

		/*
		 * Pick up the first link in the chain and get a descriptor for it.
		 * This allows blind volfs paths to work for hardlinks.
		 */
		if ((hfs_lookup_siblinglinks(hfsmp, linkref, &prevlinkid, &nextlinkid) == 0) &&
		    (nextlinkid != 0)) {
			lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
			error = cat_findname(hfsmp, nextlinkid, &linkdesc);
			hfs_systemfile_unlock(hfsmp, lockflags);
			if (error == 0) {
				/* Replace the inode descriptor with the link's. */
				cat_releasedesc(&cndesc);
				bcopy(&linkdesc, &cndesc, sizeof(linkdesc));
			}
		}
	}

	if (linkref) {
		int newvnode_flags = 0;

		/* Hardlink: create the vnode without a component name. */
		error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr,
								&cnfork, &vp, &newvnode_flags);
		if (error == 0) {
			VTOC(vp)->c_flag |= C_HARDLINK;
			vnode_setmultipath(vp);
		}
	} else {
		struct componentname cn;
		int newvnode_flags = 0;

		/* Supply hfs_getnewvnode with a component name. */
		MALLOC_ZONE(cn.cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
		cn.cn_nameiop = LOOKUP;
		cn.cn_flags = ISLASTCN | HASBUF;
		cn.cn_context = NULL;
		cn.cn_pnlen = MAXPATHLEN;
		cn.cn_nameptr = cn.cn_pnbuf;
		cn.cn_namelen = cndesc.cd_namelen;
		cn.cn_hash = 0;
		cn.cn_consume = 0;
		/* +1 copies the NUL terminator along with the name. */
		bcopy(cndesc.cd_nameptr, cn.cn_nameptr, cndesc.cd_namelen + 1);

		error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr,
								&cnfork, &vp, &newvnode_flags);

		if (error == 0 && (VTOC(vp)->c_flag & C_HARDLINK)) {
			hfs_savelinkorigin(VTOC(vp), cndesc.cd_parentcnid);
		}
		FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI);
	}
	cat_releasedesc(&cndesc);

	*vpp = vp;
	/* Caller asked for an unlocked cnode; drop the lock before returning. */
	if (vp && skiplock) {
		hfs_unlock(VTOC(vp));
	}
	return (error);
}
3285
3286
/*
 * Flush out all the files in a filesystem.
 *
 * Performs a vflush() over the mount (skipping system and, on the first
 * pass, swap vnodes).  When quotas are enabled, the open quota files hold
 * indirect references on the root directory vnode, so the root vnode is
 * skipped on the initial passes, quota files are closed, and a final
 * vflush() reclaims what remains.
 *
 * Returns 0 on success, EBUSY if the root directory is still open (with
 * quotas), or the error from vflush().
 */
static int
#if QUOTA
hfs_flushfiles(struct mount *mp, int flags, struct proc *p)
#else
hfs_flushfiles(struct mount *mp, int flags, __unused struct proc *p)
#endif /* QUOTA */
{
	struct hfsmount *hfsmp;
	struct vnode *skipvp = NULLVP;
	int error;
#if QUOTA
	int quotafilecnt;
	int i;
#endif

	hfsmp = VFSTOHFS(mp);

#if QUOTA
	/*
	 * The open quota files have an indirect reference on
	 * the root directory vnode.  We must account for this
	 * extra reference when doing the intial vflush.
	 */
	quotafilecnt = 0;
	if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) {

		/* Find out how many quota files we have open. */
		for (i = 0; i < MAXQUOTAS; i++) {
			if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP)
				++quotafilecnt;
		}

		/* Obtain the root vnode so we can skip over it. */
		skipvp = hfs_chash_getvnode(hfsmp, kHFSRootFolderID, 0, 0, 0);
	}
#endif /* QUOTA */

	/* First pass: also skip swap files. */
	error = vflush(mp, skipvp, SKIPSYSTEM | SKIPSWAP | flags);
	if (error != 0)
		return(error);

	/* Second pass: swap files included this time. */
	error = vflush(mp, skipvp, SKIPSYSTEM | flags);

#if QUOTA
	if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) {
		if (skipvp) {
			/*
			 * See if there are additional references on the
			 * root vp besides the ones obtained from the open
			 * quota files and the hfs_chash_getvnode call above.
			 */
			if ((error == 0) &&
			    (vnode_isinuse(skipvp, quotafilecnt))) {
				error = EBUSY;  /* root directory is still open */
			}
			hfs_unlock(VTOC(skipvp));
			vnode_put(skipvp);
		}
		if (error && (flags & FORCECLOSE) == 0)
			return (error);

		/* Close the quota files so the root vnode can be reclaimed. */
		for (i = 0; i < MAXQUOTAS; i++) {
			if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP)
				continue;
			hfs_quotaoff(p, mp, i);
		}
		/* Final pass now that the quota references are gone. */
		error = vflush(mp, NULLVP, SKIPSYSTEM | flags);
	}
#endif /* QUOTA */

	return (error);
}
3362
3363 /*
3364 * Update volume encoding bitmap (HFS Plus only)
3365 */
3366 __private_extern__
3367 void
3368 hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding)
3369 {
3370 #define kIndexMacUkrainian 48 /* MacUkrainian encoding is 152 */
3371 #define kIndexMacFarsi 49 /* MacFarsi encoding is 140 */
3372
3373 u_int32_t index;
3374
3375 switch (encoding) {
3376 case kTextEncodingMacUkrainian:
3377 index = kIndexMacUkrainian;
3378 break;
3379 case kTextEncodingMacFarsi:
3380 index = kIndexMacFarsi;
3381 break;
3382 default:
3383 index = encoding;
3384 break;
3385 }
3386
3387 if (index < 64 && (hfsmp->encodingsBitmap & (u_int64_t)(1ULL << index)) == 0) {
3388 HFS_MOUNT_LOCK(hfsmp, TRUE)
3389 hfsmp->encodingsBitmap |= (u_int64_t)(1ULL << index);
3390 MarkVCBDirty(hfsmp);
3391 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
3392 }
3393 }
3394
3395 /*
3396 * Update volume stats
3397 *
3398 * On journal volumes this will cause a volume header flush
3399 */
3400 int
3401 hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot)
3402 {
3403 struct timeval tv;
3404
3405 microtime(&tv);
3406
3407 lck_mtx_lock(&hfsmp->hfs_mutex);
3408
3409 MarkVCBDirty(hfsmp);
3410 hfsmp->hfs_mtime = tv.tv_sec;
3411
3412 switch (op) {
3413 case VOL_UPDATE:
3414 break;
3415 case VOL_MKDIR:
3416 if (hfsmp->hfs_dircount != 0xFFFFFFFF)
3417 ++hfsmp->hfs_dircount;
3418 if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF)
3419 ++hfsmp->vcbNmRtDirs;
3420 break;
3421 case VOL_RMDIR:
3422 if (hfsmp->hfs_dircount != 0)
3423 --hfsmp->hfs_dircount;
3424 if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF)
3425 --hfsmp->vcbNmRtDirs;
3426 break;
3427 case VOL_MKFILE:
3428 if (hfsmp->hfs_filecount != 0xFFFFFFFF)
3429 ++hfsmp->hfs_filecount;
3430 if (inroot && hfsmp->vcbNmFls != 0xFFFF)
3431 ++hfsmp->vcbNmFls;
3432 break;
3433 case VOL_RMFILE:
3434 if (hfsmp->hfs_filecount != 0)
3435 --hfsmp->hfs_filecount;
3436 if (inroot && hfsmp->vcbNmFls != 0xFFFF)
3437 --hfsmp->vcbNmFls;
3438 break;
3439 }
3440
3441 lck_mtx_unlock(&hfsmp->hfs_mutex);
3442
3443 if (hfsmp->jnl) {
3444 hfs_flushvolumeheader(hfsmp, 0, 0);
3445 }
3446
3447 return (0);
3448 }
3449
3450
/*
 * Flush the in-memory VCB state out to the on-disk HFS (standard)
 * Master Directory Block, byte-swapping each field to big-endian.
 * If 'altflush' is set, the alternate MDB is rewritten as well.
 * 'waitfor' selects synchronous (MNT_WAIT) vs. asynchronous write.
 *
 * Returns 0 on success or an errno from the buffer-cache read/write.
 */
static int
hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush)
{
	ExtendedVCB *vcb = HFSTOVCB(hfsmp);
	struct filefork *fp;
	HFSMasterDirectoryBlock	*mdb;
	struct buf *bp = NULL;
	int retval;
	int sectorsize;
	ByteCount namelen;

	/* Read the sector containing the on-disk MDB. */
	sectorsize = hfsmp->hfs_logical_block_size;
	retval = (int)buf_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sectorsize), sectorsize, NOCRED, &bp);
	if (retval) {
		if (bp)
			buf_brelse(bp);
		return retval;
	}

	/* Hold the mount lock while copying VCB fields into the buffer. */
	lck_mtx_lock(&hfsmp->hfs_mutex);

	mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sectorsize));

	/* On-disk HFS dates are local time; in-memory dates are UTC. */
	mdb->drCrDate	= SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->hfs_itime)));
	mdb->drLsMod	= SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbLsMod)));
	mdb->drAtrb	= SWAP_BE16 (vcb->vcbAtrb);
	mdb->drNmFls	= SWAP_BE16 (vcb->vcbNmFls);
	mdb->drAllocPtr	= SWAP_BE16 (vcb->nextAllocation);
	mdb->drClpSiz	= SWAP_BE32 (vcb->vcbClpSiz);
	mdb->drNxtCNID	= SWAP_BE32 (vcb->vcbNxtCNID);
	mdb->drFreeBks	= SWAP_BE16 (vcb->freeBlocks);

	/* Volume name is stored UTF-8 in memory, HFS encoding on disk. */
	namelen = strlen((char *)vcb->vcbVN);
	retval = utf8_to_hfs(vcb, namelen, vcb->vcbVN, mdb->drVN);
	/* Retry with MacRoman in case that's how it was exported. */
	if (retval)
		retval = utf8_to_mac_roman(namelen, vcb->vcbVN, mdb->drVN);

	mdb->drVolBkUp	= SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbVolBkUp)));
	mdb->drWrCnt	= SWAP_BE32 (vcb->vcbWrCnt);
	mdb->drNmRtDirs	= SWAP_BE16 (vcb->vcbNmRtDirs);
	mdb->drFilCnt	= SWAP_BE32 (vcb->vcbFilCnt);
	mdb->drDirCnt	= SWAP_BE32 (vcb->vcbDirCnt);

	bcopy(vcb->vcbFndrInfo, mdb->drFndrInfo, sizeof(mdb->drFndrInfo));

	/* Sync the extents overflow B-tree file's metadata. */
	fp = VTOF(vcb->extentsRefNum);
	mdb->drXTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock);
	mdb->drXTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount);
	mdb->drXTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock);
	mdb->drXTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount);
	mdb->drXTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock);
	mdb->drXTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount);
	mdb->drXTFlSize	= SWAP_BE32 (fp->ff_blocks * vcb->blockSize);
	mdb->drXTClpSiz	= SWAP_BE32 (fp->ff_clumpsize);
	FTOC(fp)->c_flag &= ~C_MODIFIED;

	/* Sync the catalog B-tree file's metadata. */
	fp = VTOF(vcb->catalogRefNum);
	mdb->drCTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock);
	mdb->drCTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount);
	mdb->drCTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock);
	mdb->drCTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount);
	mdb->drCTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock);
	mdb->drCTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount);
	mdb->drCTFlSize	= SWAP_BE32 (fp->ff_blocks * vcb->blockSize);
	mdb->drCTClpSiz	= SWAP_BE32 (fp->ff_clumpsize);
	FTOC(fp)->c_flag &= ~C_MODIFIED;

	MarkVCBClean( vcb );

	lck_mtx_unlock(&hfsmp->hfs_mutex);

	/* If requested, flush out the alternate MDB */
	if (altflush) {
		struct buf *alt_bp = NULL;

		if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sectorsize, NOCRED, &alt_bp) == 0) {
			bcopy(mdb, (char *)buf_dataptr(alt_bp) + HFS_ALT_OFFSET(sectorsize), kMDBSize);

			(void) VNOP_BWRITE(alt_bp);
		} else if (alt_bp)
			buf_brelse(alt_bp);
	}

	/* Write the primary MDB sync or async per the caller's request. */
	if (waitfor != MNT_WAIT)
		buf_bawrite(bp);
	else
		retval = VNOP_BWRITE(bp);

	return (retval);
}
3542
3543 /*
3544 * Flush any dirty in-memory mount data to the on-disk
3545 * volume header.
3546 *
3547 * Note: the on-disk volume signature is intentionally
3548 * not flushed since the on-disk "H+" and "HX" signatures
3549 * are always stored in-memory as "H+".
3550 */
3551 int
3552 hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush)
3553 {
3554 ExtendedVCB *vcb = HFSTOVCB(hfsmp);
3555 struct filefork *fp;
3556 HFSPlusVolumeHeader *volumeHeader, *altVH;
3557 int retval;
3558 struct buf *bp, *alt_bp;
3559 int i;
3560 daddr64_t priIDSector;
3561 int critical;
3562 u_int16_t signature;
3563 u_int16_t hfsversion;
3564
3565 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
3566 return(0);
3567 }
3568 if (hfsmp->hfs_flags & HFS_STANDARD) {
3569 return hfs_flushMDB(hfsmp, waitfor, altflush);
3570 }
3571 critical = altflush;
3572 priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
3573 HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));
3574
3575 if (hfs_start_transaction(hfsmp) != 0) {
3576 return EINVAL;
3577 }
3578
3579 bp = NULL;
3580 alt_bp = NULL;
3581
3582 retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
3583 HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
3584 hfsmp->hfs_physical_block_size, NOCRED, &bp);
3585 if (retval) {
3586 printf("hfs: err %d reading VH blk (%s)\n", retval, vcb->vcbVN);
3587 goto err_exit;
3588 }
3589
3590 volumeHeader = (HFSPlusVolumeHeader *)((char *)buf_dataptr(bp) +
3591 HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
3592
3593 /*
3594 * Sanity check what we just read. If it's bad, try the alternate
3595 * instead.
3596 */
3597 signature = SWAP_BE16 (volumeHeader->signature);
3598 hfsversion = SWAP_BE16 (volumeHeader->version);
3599 if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) ||
3600 (hfsversion < kHFSPlusVersion) || (hfsversion > 100) ||
3601 (SWAP_BE32 (volumeHeader->blockSize) != vcb->blockSize)) {
3602 printf("hfs: corrupt VH on %s, sig 0x%04x, ver %d, blksize %d%s\n",
3603 vcb->vcbVN, signature, hfsversion,
3604 SWAP_BE32 (volumeHeader->blockSize),
3605 hfsmp->hfs_alt_id_sector ? "; trying alternate" : "");
3606 hfs_mark_volume_inconsistent(hfsmp);
3607
3608 if (hfsmp->hfs_alt_id_sector) {
3609 retval = buf_meta_bread(hfsmp->hfs_devvp,
3610 HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
3611 hfsmp->hfs_physical_block_size, NOCRED, &alt_bp);
3612 if (retval) {
3613 printf("hfs: err %d reading alternate VH (%s)\n", retval, vcb->vcbVN);
3614 goto err_exit;
3615 }
3616
3617 altVH = (HFSPlusVolumeHeader *)((char *)buf_dataptr(alt_bp) +
3618 HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size));
3619 signature = SWAP_BE16(altVH->signature);
3620 hfsversion = SWAP_BE16(altVH->version);
3621
3622 if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) ||
3623 (hfsversion < kHFSPlusVersion) || (kHFSPlusVersion > 100) ||
3624 (SWAP_BE32(altVH->blockSize) != vcb->blockSize)) {
3625 printf("hfs: corrupt alternate VH on %s, sig 0x%04x, ver %d, blksize %d\n",
3626 vcb->vcbVN, signature, hfsversion,
3627 SWAP_BE32(altVH->blockSize));
3628 retval = EIO;
3629 goto err_exit;
3630 }
3631
3632 /* The alternate is plausible, so use it. */
3633 bcopy(altVH, volumeHeader, kMDBSize);
3634 buf_brelse(alt_bp);
3635 alt_bp = NULL;
3636 } else {
3637 /* No alternate VH, nothing more we can do. */
3638 retval = EIO;
3639 goto err_exit;
3640 }
3641 }
3642
3643 if (hfsmp->jnl) {
3644 journal_modify_block_start(hfsmp->jnl, bp);
3645 }
3646
3647 /*
3648 * For embedded HFS+ volumes, update create date if it changed
3649 * (ie from a setattrlist call)
3650 */
3651 if ((vcb->hfsPlusIOPosOffset != 0) &&
3652 (SWAP_BE32 (volumeHeader->createDate) != vcb->localCreateDate)) {
3653 struct buf *bp2;
3654 HFSMasterDirectoryBlock *mdb;
3655
3656 retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
3657 HFS_PHYSBLK_ROUNDDOWN(HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size), hfsmp->hfs_log_per_phys),
3658 hfsmp->hfs_physical_block_size, NOCRED, &bp2);
3659 if (retval) {
3660 if (bp2)
3661 buf_brelse(bp2);
3662 retval = 0;
3663 } else {
3664 mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp2) +
3665 HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
3666
3667 if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate )
3668 {
3669 if (hfsmp->jnl) {
3670 journal_modify_block_start(hfsmp->jnl, bp2);
3671 }
3672
3673 mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate); /* pick up the new create date */
3674
3675 if (hfsmp->jnl) {
3676 journal_modify_block_end(hfsmp->jnl, bp2, NULL, NULL);
3677 } else {
3678 (void) VNOP_BWRITE(bp2); /* write out the changes */
3679 }
3680 }
3681 else
3682 {
3683 buf_brelse(bp2); /* just release it */
3684 }
3685 }
3686 }
3687
3688 lck_mtx_lock(&hfsmp->hfs_mutex);
3689
3690 /* Note: only update the lower 16 bits worth of attributes */
3691 volumeHeader->attributes = SWAP_BE32 (vcb->vcbAtrb);
3692 volumeHeader->journalInfoBlock = SWAP_BE32 (vcb->vcbJinfoBlock);
3693 if (hfsmp->jnl) {
3694 volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion);
3695 } else {
3696 volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion);
3697 }
3698 volumeHeader->createDate = SWAP_BE32 (vcb->localCreateDate); /* volume create date is in local time */
3699 volumeHeader->modifyDate = SWAP_BE32 (to_hfs_time(vcb->vcbLsMod));
3700 volumeHeader->backupDate = SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp));
3701 volumeHeader->fileCount = SWAP_BE32 (vcb->vcbFilCnt);
3702 volumeHeader->folderCount = SWAP_BE32 (vcb->vcbDirCnt);
3703 volumeHeader->totalBlocks = SWAP_BE32 (vcb->totalBlocks);
3704 volumeHeader->freeBlocks = SWAP_BE32 (vcb->freeBlocks);
3705 volumeHeader->nextAllocation = SWAP_BE32 (vcb->nextAllocation);
3706 volumeHeader->rsrcClumpSize = SWAP_BE32 (vcb->vcbClpSiz);
3707 volumeHeader->dataClumpSize = SWAP_BE32 (vcb->vcbClpSiz);
3708 volumeHeader->nextCatalogID = SWAP_BE32 (vcb->vcbNxtCNID);
3709 volumeHeader->writeCount = SWAP_BE32 (vcb->vcbWrCnt);
3710 volumeHeader->encodingsBitmap = SWAP_BE64 (vcb->encodingsBitmap);
3711
3712 if (bcmp(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)) != 0) {
3713 bcopy(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo));
3714 critical = 1;
3715 }
3716
3717 /*
3718 * System files are only dirty when altflush is set.
3719 */
3720 if (altflush == 0) {
3721 goto done;
3722 }
3723
3724 /* Sync Extents over-flow file meta data */
3725 fp = VTOF(vcb->extentsRefNum);
3726 if (FTOC(fp)->c_flag & C_MODIFIED) {
3727 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3728 volumeHeader->extentsFile.extents[i].startBlock =
3729 SWAP_BE32 (fp->ff_extents[i].startBlock);
3730 volumeHeader->extentsFile.extents[i].blockCount =
3731 SWAP_BE32 (fp->ff_extents[i].blockCount);
3732 }
3733 volumeHeader->extentsFile.logicalSize = SWAP_BE64 (fp->ff_size);
3734 volumeHeader->extentsFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3735 volumeHeader->extentsFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize);
3736 FTOC(fp)->c_flag &= ~C_MODIFIED;
3737 }
3738
3739 /* Sync Catalog file meta data */
3740 fp = VTOF(vcb->catalogRefNum);
3741 if (FTOC(fp)->c_flag & C_MODIFIED) {
3742 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3743 volumeHeader->catalogFile.extents[i].startBlock =
3744 SWAP_BE32 (fp->ff_extents[i].startBlock);
3745 volumeHeader->catalogFile.extents[i].blockCount =
3746 SWAP_BE32 (fp->ff_extents[i].blockCount);
3747 }
3748 volumeHeader->catalogFile.logicalSize = SWAP_BE64 (fp->ff_size);
3749 volumeHeader->catalogFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3750 volumeHeader->catalogFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize);
3751 FTOC(fp)->c_flag &= ~C_MODIFIED;
3752 }
3753
3754 /* Sync Allocation file meta data */
3755 fp = VTOF(vcb->allocationsRefNum);
3756 if (FTOC(fp)->c_flag & C_MODIFIED) {
3757 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3758 volumeHeader->allocationFile.extents[i].startBlock =
3759 SWAP_BE32 (fp->ff_extents[i].startBlock);
3760 volumeHeader->allocationFile.extents[i].blockCount =
3761 SWAP_BE32 (fp->ff_extents[i].blockCount);
3762 }
3763 volumeHeader->allocationFile.logicalSize = SWAP_BE64 (fp->ff_size);
3764 volumeHeader->allocationFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3765 volumeHeader->allocationFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize);
3766 FTOC(fp)->c_flag &= ~C_MODIFIED;
3767 }
3768
3769 /* Sync Attribute file meta data */
3770 if (hfsmp->hfs_attribute_vp) {
3771 fp = VTOF(hfsmp->hfs_attribute_vp);
3772 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3773 volumeHeader->attributesFile.extents[i].startBlock =
3774 SWAP_BE32 (fp->ff_extents[i].startBlock);
3775 volumeHeader->attributesFile.extents[i].blockCount =
3776 SWAP_BE32 (fp->ff_extents[i].blockCount);
3777 }
3778 FTOC(fp)->c_flag &= ~C_MODIFIED;
3779 volumeHeader->attributesFile.logicalSize = SWAP_BE64 (fp->ff_size);
3780 volumeHeader->attributesFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3781 volumeHeader->attributesFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize);
3782 }
3783
3784 /* Sync Startup file meta data */
3785 if (hfsmp->hfs_startup_vp) {
3786 fp = VTOF(hfsmp->hfs_startup_vp);
3787 if (FTOC(fp)->c_flag & C_MODIFIED) {
3788 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3789 volumeHeader->startupFile.extents[i].startBlock =
3790 SWAP_BE32 (fp->ff_extents[i].startBlock);
3791 volumeHeader->startupFile.extents[i].blockCount =
3792 SWAP_BE32 (fp->ff_extents[i].blockCount);
3793 }
3794 volumeHeader->startupFile.logicalSize = SWAP_BE64 (fp->ff_size);
3795 volumeHeader->startupFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3796 volumeHeader->startupFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize);
3797 FTOC(fp)->c_flag &= ~C_MODIFIED;
3798 }
3799 }
3800
3801 done:
3802 MarkVCBClean(hfsmp);
3803 lck_mtx_unlock(&hfsmp->hfs_mutex);
3804
3805 /* If requested, flush out the alternate volume header */
3806 if (altflush && hfsmp->hfs_alt_id_sector) {
3807 if (buf_meta_bread(hfsmp->hfs_devvp,
3808 HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
3809 hfsmp->hfs_physical_block_size, NOCRED, &alt_bp) == 0) {
3810 if (hfsmp->jnl) {
3811 journal_modify_block_start(hfsmp->jnl, alt_bp);
3812 }
3813
3814 bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) +
3815 HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size),
3816 kMDBSize);
3817
3818 if (hfsmp->jnl) {
3819 journal_modify_block_end(hfsmp->jnl, alt_bp, NULL, NULL);
3820 } else {
3821 (void) VNOP_BWRITE(alt_bp);
3822 }
3823 } else if (alt_bp)
3824 buf_brelse(alt_bp);
3825 }
3826
3827 if (hfsmp->jnl) {
3828 journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL);
3829 } else {
3830 if (waitfor != MNT_WAIT)
3831 buf_bawrite(bp);
3832 else {
3833 retval = VNOP_BWRITE(bp);
3834 /* When critical data changes, flush the device cache */
3835 if (critical && (retval == 0)) {
3836 (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE,
3837 NULL, FWRITE, NULL);
3838 }
3839 }
3840 }
3841 hfs_end_transaction(hfsmp);
3842
3843 return (retval);
3844
3845 err_exit:
3846 if (alt_bp)
3847 buf_brelse(alt_bp);
3848 if (bp)
3849 buf_brelse(bp);
3850 hfs_end_transaction(hfsmp);
3851 return retval;
3852 }
3853
3854
3855 /*
3856 * Extend a file system.
3857 */
/*
 * hfs_extendfs - grow a mounted HFS Plus volume to 'newsize' bytes.
 *
 * Requirements: journaled HFS Plus only, no embedded (wrapper) volumes.
 * Non-root callers must own the volume root and have write access to
 * both the root directory and the underlying device.
 *
 * Returns 0 on success, or a VFS errno (EPERM, ENXIO, ENOSPC, EINVAL,
 * EOVERFLOW, EALREADY, or a mapped Mac OS error) on failure.
 */
int
hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
{
	struct proc *p = vfs_context_proc(context);
	kauth_cred_t cred = vfs_context_ucred(context);
	struct vnode *vp;
	struct vnode *devvp;
	struct buf *bp;
	struct filefork *fp = NULL;
	ExtendedVCB *vcb;
	struct cat_fork forkdata;		/* snapshot of bitmap fork for rollback */
	u_int64_t oldsize;
	u_int64_t newblkcnt;
	u_int64_t prev_phys_block_count;
	u_int32_t addblks;
	u_int64_t sectorcnt;
	u_int32_t sectorsize;
	u_int32_t phys_sectorsize;
	daddr64_t prev_alt_sector;
	daddr_t bitmapblks;
	int lockflags = 0;
	int error;
	int64_t oldBitmapSize;
	Boolean usedExtendFileC = false;	/* true if bitmap grew from existing FS space */
	int transaction_begun = 0;

	devvp = hfsmp->hfs_devvp;
	vcb = HFSTOVCB(hfsmp);

	/*
	 * - HFS Plus file systems only.
	 * - Journaling must be enabled.
	 * - No embedded volumes.
	 */
	if ((vcb->vcbSigWord == kHFSSigWord) ||
	    (hfsmp->jnl == NULL) ||
	    (vcb->hfsPlusIOPosOffset != 0)) {
		return (EPERM);
	}
	/*
	 * If extending file system by non-root, then verify
	 * ownership and check permissions.
	 */
	if (suser(cred, NULL)) {
		error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0, 0);

		if (error)
			return (error);
		error = hfs_owner_rights(hfsmp, VTOC(vp)->c_uid, cred, p, 0);
		if (error == 0) {
			error = hfs_write_access(vp, cred, p, false);
		}
		hfs_unlock(VTOC(vp));
		vnode_put(vp);
		if (error)
			return (error);

		/* Caller must also be allowed to read/write the raw device. */
		error = vnode_authorize(devvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, context);
		if (error)
			return (error);
	}
	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sectorsize, 0, context)) {
		return (ENXIO);
	}
	/* The device's sector size must still match what we mounted with. */
	if (sectorsize != hfsmp->hfs_logical_block_size) {
		return (ENXIO);
	}
	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sectorcnt, 0, context)) {
		return (ENXIO);
	}
	/* The device itself must be big enough to hold the new size. */
	if ((sectorsize * sectorcnt) < newsize) {
		printf("hfs_extendfs: not enough space on device\n");
		return (ENOSPC);
	}
	error = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_sectorsize, 0, context);
	if (error) {
		if ((error != ENOTSUP) && (error != ENOTTY)) {
			return (ENXIO);
		}
		/* If ioctl is not supported, force physical and logical sector size to be same */
		phys_sectorsize = sectorsize;
	}
	oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;

	/*
	 * Validate new size.
	 */
	if ((newsize <= oldsize) || (newsize % sectorsize) || (newsize % phys_sectorsize)) {
		printf("hfs_extendfs: invalid size\n");
		return (EINVAL);
	}
	newblkcnt = newsize / vcb->blockSize;
	/* totalBlocks is a 32-bit field on disk; refuse sizes that overflow it. */
	if (newblkcnt > (u_int64_t)0xFFFFFFFF)
		return (EOVERFLOW);

	addblks = newblkcnt - vcb->totalBlocks;

	if (hfs_resize_debug) {
		printf ("hfs_extendfs: old: size=%qu, blkcnt=%u\n", oldsize, hfsmp->totalBlocks);
		printf ("hfs_extendfs: new: size=%qu, blkcnt=%u, addblks=%u\n", newsize, (u_int32_t)newblkcnt, addblks);
	}
	printf("hfs_extendfs: will extend \"%s\" by %d blocks\n", vcb->vcbVN, addblks);

	/* Only one resize may be in flight at a time. */
	HFS_MOUNT_LOCK(hfsmp, TRUE);
	if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
		error = EALREADY;
		goto out;
	}
	hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
	HFS_MOUNT_UNLOCK(hfsmp, TRUE);

	/* Start with a clean journal. */
	hfs_journal_flush(hfsmp, TRUE);

	/*
	 * Enclose changes inside a transaction.
	 */
	if (hfs_start_transaction(hfsmp) != 0) {
		error = EINVAL;
		goto out;
	}
	transaction_begun = 1;

	/*
	 * Note: we take the attributes lock in case we have an attribute data vnode
	 * which needs to change size.
	 */
	lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
	vp = vcb->allocationsRefNum;
	fp = VTOF(vp);
	/* Save the allocation-bitmap fork so we can restore it on failure. */
	bcopy(&fp->ff_data, &forkdata, sizeof(forkdata));

	/*
	 * Calculate additional space required (if any) by allocation bitmap.
	 */
	oldBitmapSize = fp->ff_size;
	bitmapblks = roundup((newblkcnt+7) / 8, vcb->vcbVBMIOSize) / vcb->blockSize;
	if (bitmapblks > (daddr_t)fp->ff_blocks)
		bitmapblks -= fp->ff_blocks;
	else
		bitmapblks = 0;

	/*
	 * The allocation bitmap can contain unused bits that are beyond end of
	 * current volume's allocation blocks.  Usually they are supposed to be
	 * zero'ed out but there can be cases where they might be marked as used.
	 * After extending the file system, those bits can represent valid
	 * allocation blocks, so we mark all the bits from the end of current
	 * volume to end of allocation bitmap as "free".
	 */
	BlockMarkFreeUnused(vcb, vcb->totalBlocks,
			(fp->ff_blocks * vcb->blockSize * 8) - vcb->totalBlocks);

	if (bitmapblks > 0) {
		daddr64_t blkno;
		daddr_t blkcnt;
		off_t bytesAdded;

		/*
		 * Get the bitmap's current size (in allocation blocks) so we know
		 * where to start zero filling once the new space is added.  We've
		 * got to do this before the bitmap is grown.
		 */
		blkno  = (daddr64_t)fp->ff_blocks;

		/*
		 * Try to grow the allocation file in the normal way, using allocation
		 * blocks already existing in the file system.  This way, we might be
		 * able to grow the bitmap contiguously, or at least in the metadata
		 * zone.
		 */
		error = ExtendFileC(vcb, fp, bitmapblks * vcb->blockSize, 0,
				kEFAllMask | kEFNoClumpMask | kEFReserveMask
				| kEFMetadataMask | kEFContigMask, &bytesAdded);

		if (error == 0) {
			usedExtendFileC = true;
		} else {
			/*
			 * If the above allocation failed, fall back to allocating the new
			 * extent of the bitmap from the space we're going to add.  Since those
			 * blocks don't yet belong to the file system, we have to update the
			 * extent list directly, and manually adjust the file size.
			 */
			bytesAdded = 0;
			error = AddFileExtent(vcb, fp, vcb->totalBlocks, bitmapblks);
			if (error) {
				printf("hfs_extendfs: error %d adding extents\n", error);
				goto out;
			}
			fp->ff_blocks += bitmapblks;
			VTOC(vp)->c_blocks = fp->ff_blocks;
			VTOC(vp)->c_flag |= C_MODIFIED;
		}

		/*
		 * Update the allocation file's size to include the newly allocated
		 * blocks.  Note that ExtendFileC doesn't do this, which is why this
		 * statement is outside the above "if" statement.
		 */
		fp->ff_size += (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize;

		/*
		 * Zero out the new bitmap blocks.
		 */
		{

			bp = NULL;
			blkcnt = bitmapblks;
			while (blkcnt > 0) {
				error = (int)buf_meta_bread(vp, blkno, vcb->blockSize, NOCRED, &bp);
				if (error) {
					if (bp) {
						buf_brelse(bp);
					}
					break;
				}
				bzero((char *)buf_dataptr(bp), vcb->blockSize);
				buf_markaged(bp);
				error = (int)buf_bwrite(bp);
				if (error)
					break;
				--blkcnt;
				++blkno;
			}
		}
		if (error) {
			printf("hfs_extendfs: error %d clearing blocks\n", error);
			goto out;
		}
		/*
		 * Mark the new bitmap space as allocated.
		 *
		 * Note that ExtendFileC will have marked any blocks it allocated, so
		 * this is only needed if we used AddFileExtent.  Also note that this
		 * has to come *after* the zero filling of new blocks in the case where
		 * we used AddFileExtent (since the part of the bitmap we're touching
		 * is in those newly allocated blocks).
		 */
		if (!usedExtendFileC) {
			error = BlockMarkAllocated(vcb, vcb->totalBlocks, bitmapblks);
			if (error) {
				printf("hfs_extendfs: error %d setting bitmap\n", error);
				goto out;
			}
			vcb->freeBlocks -= bitmapblks;
		}
	}
	/*
	 * Mark the new alternate VH as allocated.
	 * (With 512-byte blocks the alternate volume header spans two blocks.)
	 */
	if (vcb->blockSize == 512)
		error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 2, 2);
	else
		error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 1, 1);
	if (error) {
		printf("hfs_extendfs: error %d setting bitmap (VH)\n", error);
		goto out;
	}
	/*
	 * Mark the old alternate VH as free.
	 */
	if (vcb->blockSize == 512)
		(void) BlockMarkFree(vcb, vcb->totalBlocks - 2, 2);
	else
		(void) BlockMarkFree(vcb, vcb->totalBlocks - 1, 1);
	/*
	 * Adjust file system variables for new space.
	 */
	prev_phys_block_count = hfsmp->hfs_logical_block_count;
	prev_alt_sector = hfsmp->hfs_alt_id_sector;

	vcb->totalBlocks += addblks;
	vcb->freeBlocks += addblks;
	hfsmp->hfs_logical_block_count = newsize / sectorsize;
	hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / sectorsize) +
	                          HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_logical_block_count);
	MarkVCBDirty(vcb);
	error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
	if (error) {
		printf("hfs_extendfs: couldn't flush volume headers (%d)", error);
		/*
		 * Restore to old state.
		 */
		if (usedExtendFileC) {
			(void) TruncateFileC(vcb, fp, oldBitmapSize, 0, FORK_IS_RSRC(fp),
								 FTOC(fp)->c_fileid, false);
		} else {
			fp->ff_blocks -= bitmapblks;
			fp->ff_size -= (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize;
			/*
			 * No need to mark the excess blocks free since those bitmap blocks
			 * are no longer part of the bitmap.  But we do need to undo the
			 * effect of the "vcb->freeBlocks -= bitmapblks" above.
			 */
			vcb->freeBlocks += bitmapblks;
		}
		vcb->totalBlocks -= addblks;
		vcb->freeBlocks -= addblks;
		hfsmp->hfs_logical_block_count = prev_phys_block_count;
		hfsmp->hfs_alt_id_sector = prev_alt_sector;
		MarkVCBDirty(vcb);
		/* Re-mark the (restored) alternate VH blocks as allocated. */
		if (vcb->blockSize == 512) {
			if (BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2)) {
				hfs_mark_volume_inconsistent(hfsmp);
			}
		} else {
			if (BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1)) {
				hfs_mark_volume_inconsistent(hfsmp);
			}
		}
		goto out;
	}
	/*
	 * Invalidate the old alternate volume header.
	 */
	bp = NULL;
	if (prev_alt_sector) {
		if (buf_meta_bread(hfsmp->hfs_devvp,
				HFS_PHYSBLK_ROUNDDOWN(prev_alt_sector, hfsmp->hfs_log_per_phys),
				hfsmp->hfs_physical_block_size, NOCRED, &bp) == 0) {
			journal_modify_block_start(hfsmp->jnl, bp);

			bzero((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), kMDBSize);

			journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL);
		} else if (bp) {
			buf_brelse(bp);
		}
	}

	/*
	 * Update the metadata zone size based on current volume size
	 */
	hfs_metadatazone_init(hfsmp, false);

	/*
	 * Adjust the size of hfsmp->hfs_attrdata_vp
	 */
	if (hfsmp->hfs_attrdata_vp) {
		struct cnode *attr_cp;
		struct filefork *attr_fp;

		if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) {
			attr_cp = VTOC(hfsmp->hfs_attrdata_vp);
			attr_fp = VTOF(hfsmp->hfs_attrdata_vp);

			attr_cp->c_blocks = newblkcnt;
			attr_fp->ff_blocks = newblkcnt;
			attr_fp->ff_extents[0].blockCount = newblkcnt;
			attr_fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize;
			ubc_setsize(hfsmp->hfs_attrdata_vp, attr_fp->ff_size);
			vnode_put(hfsmp->hfs_attrdata_vp);
		}
	}

	/*
	 * Update the R/B Tree if necessary.  Since we don't have to drop the systemfile
	 * locks in the middle of these operations like we do in the truncate case
	 * where we have to relocate files, we can only update the red-black tree
	 * if there were actual changes made to the bitmap.  Also, we can't really scan the
	 * new portion of the bitmap before it has been allocated. The BlockMarkAllocated
	 * routines are smart enough to avoid the r/b tree if the portion they are manipulating is
	 * not currently controlled by the tree.
	 *
	 * We only update hfsmp->allocLimit if totalBlocks actually increased.
	 */

	if (error == 0) {
		UpdateAllocLimit(hfsmp, hfsmp->totalBlocks);
	}

	/* Log successful extending */
	printf("hfs_extendfs: extended \"%s\" to %d blocks (was %d blocks)\n",
	       hfsmp->vcbVN, hfsmp->totalBlocks, (u_int32_t)(oldsize/hfsmp->blockSize));

out:
	if (error && fp) {
		/* Restore allocation fork. */
		bcopy(&forkdata, &fp->ff_data, sizeof(forkdata));
		VTOC(vp)->c_blocks = fp->ff_blocks;

	}

	HFS_MOUNT_LOCK(hfsmp, TRUE);
	hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS;
	HFS_MOUNT_UNLOCK(hfsmp, TRUE);
	if (lockflags) {
		hfs_systemfile_unlock(hfsmp, lockflags);
	}
	if (transaction_begun) {
		hfs_end_transaction(hfsmp);
		hfs_journal_flush(hfsmp, FALSE);
		/* Just to be sure, sync all data to the disk */
		(void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
	}

	return MacToVFSError(error);
}
4258
/* Smallest volume size (32 MB) that hfs_truncatefs will shrink to. */
#define HFS_MIN_SIZE (32LL * 1024LL * 1024LL)
4260
4261 /*
4262 * Truncate a file system (while still mounted).
4263 */
4264 int
4265 hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
4266 {
4267 struct buf *bp = NULL;
4268 u_int64_t oldsize;
4269 u_int32_t newblkcnt;
4270 u_int32_t reclaimblks = 0;
4271 int lockflags = 0;
4272 int transaction_begun = 0;
4273 Boolean updateFreeBlocks = false;
4274 Boolean disable_sparse = false;
4275 int error = 0;
4276
4277 lck_mtx_lock(&hfsmp->hfs_mutex);
4278 if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {
4279 lck_mtx_unlock(&hfsmp->hfs_mutex);
4280 return (EALREADY);
4281 }
4282 hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
4283 hfsmp->hfs_resize_blocksmoved = 0;
4284 hfsmp->hfs_resize_totalblocks = 0;
4285 hfsmp->hfs_resize_progress = 0;
4286 lck_mtx_unlock(&hfsmp->hfs_mutex);
4287
4288 /*
4289 * - Journaled HFS Plus volumes only.
4290 * - No embedded volumes.
4291 */
4292 if ((hfsmp->jnl == NULL) ||
4293 (hfsmp->hfsPlusIOPosOffset != 0)) {
4294 error = EPERM;
4295 goto out;
4296 }
4297 oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
4298 newblkcnt = newsize / hfsmp->blockSize;
4299 reclaimblks = hfsmp->totalBlocks - newblkcnt;
4300
4301 if (hfs_resize_debug) {
4302 printf ("hfs_truncatefs: old: size=%qu, blkcnt=%u, freeblks=%u\n", oldsize, hfsmp->totalBlocks, hfs_freeblks(hfsmp, 1));
4303 printf ("hfs_truncatefs: new: size=%qu, blkcnt=%u, reclaimblks=%u\n", newsize, newblkcnt, reclaimblks);
4304 }
4305
4306 /* Make sure new size is valid. */
4307 if ((newsize < HFS_MIN_SIZE) ||
4308 (newsize >= oldsize) ||
4309 (newsize % hfsmp->hfs_logical_block_size) ||
4310 (newsize % hfsmp->hfs_physical_block_size)) {
4311 printf ("hfs_truncatefs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize);
4312 error = EINVAL;
4313 goto out;
4314 }
4315
4316 /*
4317 * Make sure that the file system has enough free blocks reclaim.
4318 *
4319 * Before resize, the disk is divided into four zones -
4320 * A. Allocated_Stationary - These are allocated blocks that exist
4321 * before the new end of disk. These blocks will not be
4322 * relocated or modified during resize.
4323 * B. Free_Stationary - These are free blocks that exist before the
4324 * new end of disk. These blocks can be used for any new
4325 * allocations during resize, including allocation for relocating
4326 * data from the area of disk being reclaimed.
4327 * C. Allocated_To-Reclaim - These are allocated blocks that exist
4328 * beyond the new end of disk. These blocks need to be reclaimed
4329 * during resize by allocating equal number of blocks in Free
4330 * Stationary zone and copying the data.
4331 * D. Free_To-Reclaim - These are free blocks that exist beyond the
4332 * new end of disk. Nothing special needs to be done to reclaim
4333 * them.
4334 *
4335 * Total number of blocks on the disk before resize:
4336 * ------------------------------------------------
4337 * Total Blocks = Allocated_Stationary + Free_Stationary +
4338 * Allocated_To-Reclaim + Free_To-Reclaim
4339 *
4340 * Total number of blocks that need to be reclaimed:
4341 * ------------------------------------------------
4342 * Blocks to Reclaim = Allocated_To-Reclaim + Free_To-Reclaim
4343 *
4344 * Note that the check below also makes sure that we have enough space
4345 * to relocate data from Allocated_To-Reclaim to Free_Stationary.
4346 * Therefore we do not need to check total number of blocks to relocate
4347 * later in the code.
4348 *
4349 * The condition below gets converted to:
4350 *
4351 * Allocated To-Reclaim + Free To-Reclaim >= Free Stationary + Free To-Reclaim
4352 *
4353 * which is equivalent to:
4354 *
4355 * Allocated To-Reclaim >= Free Stationary
4356 */
4357 if (reclaimblks >= hfs_freeblks(hfsmp, 1)) {
4358 printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1));
4359 error = ENOSPC;
4360 goto out;
4361 }
4362
4363 /* Start with a clean journal. */
4364 hfs_journal_flush(hfsmp, TRUE);
4365
4366 if (hfs_start_transaction(hfsmp) != 0) {
4367 error = EINVAL;
4368 goto out;
4369 }
4370 transaction_begun = 1;
4371
4372 /* Take the bitmap lock to update the alloc limit field */
4373 lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4374
4375 /*
4376 * Prevent new allocations from using the part we're trying to truncate.
4377 *
4378 * NOTE: allocLimit is set to the allocation block number where the new
4379 * alternate volume header will be. That way there will be no files to
4380 * interfere with allocating the new alternate volume header, and no files
4381 * in the allocation blocks beyond (i.e. the blocks we're trying to
4382 * truncate away.
4383 *
4384 * Also shrink the red-black tree if needed.
4385 */
4386 if (hfsmp->blockSize == 512) {
4387 error = UpdateAllocLimit (hfsmp, newblkcnt - 2);
4388 }
4389 else {
4390 error = UpdateAllocLimit (hfsmp, newblkcnt - 1);
4391 }
4392
4393 /* Sparse devices use first fit allocation which is not ideal
4394 * for volume resize which requires best fit allocation. If a
4395 * sparse device is being truncated, disable the sparse device
4396 * property temporarily for the duration of resize. Also reset
4397 * the free extent cache so that it is rebuilt as sorted by
4398 * totalBlocks instead of startBlock.
4399 *
4400 * Note that this will affect all allocations on the volume and
4401 * ideal fix would be just to modify resize-related allocations,
4402 * but it will result in complexity like handling of two free
4403 * extent caches sorted differently, etc. So we stick to this
4404 * solution for now.
4405 */
4406 HFS_MOUNT_LOCK(hfsmp, TRUE);
4407 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
4408 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
4409 ResetVCBFreeExtCache(hfsmp);
4410 disable_sparse = true;
4411 }
4412
4413 /*
4414 * Update the volume free block count to reflect the total number
4415 * of free blocks that will exist after a successful resize.
4416 * Relocation of extents will result in no net change in the total
4417 * free space on the disk. Therefore the code that allocates
4418 * space for new extent and deallocates the old extent explicitly
4419 * prevents updating the volume free block count. It will also
4420 * prevent false disk full error when the number of blocks in
4421 * an extent being relocated is more than the free blocks that
4422 * will exist after the volume is resized.
4423 */
4424 hfsmp->freeBlocks -= reclaimblks;
4425 updateFreeBlocks = true;
4426 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
4427
4428 if (lockflags) {
4429 hfs_systemfile_unlock(hfsmp, lockflags);
4430 lockflags = 0;
4431 }
4432
4433 /*
4434 * Update the metadata zone size to match the new volume size,
4435 * and if it too less, metadata zone might be disabled.
4436 */
4437 hfs_metadatazone_init(hfsmp, false);
4438
4439 /*
4440 * If some files have blocks at or beyond the location of the
4441 * new alternate volume header, recalculate free blocks and
4442 * reclaim blocks. Otherwise just update free blocks count.
4443 *
4444 * The current allocLimit is set to the location of new alternate
4445 * volume header, and reclaimblks are the total number of blocks
4446 * that need to be reclaimed. So the check below is really
4447 * ignoring the blocks allocated for old alternate volume header.
4448 */
4449 if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) {
4450 /*
4451 * hfs_reclaimspace will use separate transactions when
4452 * relocating files (so we don't overwhelm the journal).
4453 */
4454 hfs_end_transaction(hfsmp);
4455 transaction_begun = 0;
4456
4457 /* Attempt to reclaim some space. */
4458 error = hfs_reclaimspace(hfsmp, hfsmp->allocLimit, reclaimblks, context);
4459 if (error != 0) {
4460 printf("hfs_truncatefs: couldn't reclaim space on %s (error=%d)\n", hfsmp->vcbVN, error);
4461 error = ENOSPC;
4462 goto out;
4463 }
4464 if (hfs_start_transaction(hfsmp) != 0) {
4465 error = EINVAL;
4466 goto out;
4467 }
4468 transaction_begun = 1;
4469
4470 /* Check if we're clear now. */
4471 error = hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks);
4472 if (error != 0) {
4473 printf("hfs_truncatefs: didn't reclaim enough space on %s (error=%d)\n", hfsmp->vcbVN, error);
4474 error = EAGAIN; /* tell client to try again */
4475 goto out;
4476 }
4477 }
4478
4479 /*
4480 * Note: we take the attributes lock in case we have an attribute data vnode
4481 * which needs to change size.
4482 */
4483 lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4484
4485 /*
4486 * Allocate last 1KB for alternate volume header.
4487 */
4488 error = BlockMarkAllocated(hfsmp, hfsmp->allocLimit, (hfsmp->blockSize == 512) ? 2 : 1);
4489 if (error) {
4490 printf("hfs_truncatefs: Error %d allocating new alternate volume header\n", error);
4491 goto out;
4492 }
4493
4494 /*
4495 * Mark the old alternate volume header as free.
4496 * We don't bother shrinking allocation bitmap file.
4497 */
4498 if (hfsmp->blockSize == 512)
4499 (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2);
4500 else
4501 (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1);
4502
4503 /*
4504 * Invalidate the existing alternate volume header.
4505 *
4506 * Don't include this in a transaction (don't call journal_modify_block)
4507 * since this block will be outside of the truncated file system!
4508 */
4509 if (hfsmp->hfs_alt_id_sector) {
4510 error = buf_meta_bread(hfsmp->hfs_devvp,
4511 HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
4512 hfsmp->hfs_physical_block_size, NOCRED, &bp);
4513 if (error == 0) {
4514 bzero((void*)((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size)), kMDBSize);
4515 (void) VNOP_BWRITE(bp);
4516 } else {
4517 if (bp) {
4518 buf_brelse(bp);
4519 }
4520 }
4521 bp = NULL;
4522 }
4523
4524 /* Log successful shrinking. */
4525 printf("hfs_truncatefs: shrank \"%s\" to %d blocks (was %d blocks)\n",
4526 hfsmp->vcbVN, newblkcnt, hfsmp->totalBlocks);
4527
4528 /*
4529 * Adjust file system variables and flush them to disk.
4530 */
4531 hfsmp->totalBlocks = newblkcnt;
4532 hfsmp->hfs_logical_block_count = newsize / hfsmp->hfs_logical_block_size;
4533 hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count);
4534 MarkVCBDirty(hfsmp);
4535 error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
4536 if (error)
4537 panic("hfs_truncatefs: unexpected error flushing volume header (%d)\n", error);
4538
4539 /*
4540 * Adjust the size of hfsmp->hfs_attrdata_vp
4541 */
4542 if (hfsmp->hfs_attrdata_vp) {
4543 struct cnode *cp;
4544 struct filefork *fp;
4545
4546 if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) {
4547 cp = VTOC(hfsmp->hfs_attrdata_vp);
4548 fp = VTOF(hfsmp->hfs_attrdata_vp);
4549
4550 cp->c_blocks = newblkcnt;
4551 fp->ff_blocks = newblkcnt;
4552 fp->ff_extents[0].blockCount = newblkcnt;
4553 fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize;
4554 ubc_setsize(hfsmp->hfs_attrdata_vp, fp->ff_size);
4555 vnode_put(hfsmp->hfs_attrdata_vp);
4556 }
4557 }
4558
4559 out:
4560 /*
4561 * Update the allocLimit to acknowledge the last one or two blocks now.
4562 * Add it to the tree as well if necessary.
4563 */
4564 UpdateAllocLimit (hfsmp, hfsmp->totalBlocks);
4565
4566 HFS_MOUNT_LOCK(hfsmp, TRUE);
4567 if (disable_sparse == true) {
4568 /* Now that resize is completed, set the volume to be sparse
4569 * device again so that all further allocations will be first
4570 * fit instead of best fit. Reset free extent cache so that
4571 * it is rebuilt.
4572 */
4573 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
4574 ResetVCBFreeExtCache(hfsmp);
4575 }
4576
4577 if (error && (updateFreeBlocks == true)) {
4578 hfsmp->freeBlocks += reclaimblks;
4579 }
4580
4581 if (hfsmp->nextAllocation >= hfsmp->allocLimit) {
4582 hfsmp->nextAllocation = hfsmp->hfs_metazone_end + 1;
4583 }
4584 hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS;
4585 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
4586
4587 /* On error, reset the metadata zone for original volume size */
4588 if (error && (updateFreeBlocks == true)) {
4589 hfs_metadatazone_init(hfsmp, false);
4590 }
4591
4592 if (lockflags) {
4593 hfs_systemfile_unlock(hfsmp, lockflags);
4594 }
4595 if (transaction_begun) {
4596 hfs_end_transaction(hfsmp);
4597 hfs_journal_flush(hfsmp, FALSE);
4598 /* Just to be sure, sync all data to the disk */
4599 (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
4600 }
4601
4602 return MacToVFSError(error);
4603 }
4604
4605
4606 /*
4607 * Invalidate the physical block numbers associated with buffer cache blocks
4608 * in the given extent of the given vnode.
4609 */
4610 struct hfs_inval_blk_no {
4611 daddr64_t sectorStart;
4612 daddr64_t sectorCount;
4613 };
4614 static int
4615 hfs_invalidate_block_numbers_callback(buf_t bp, void *args_in)
4616 {
4617 daddr64_t blkno;
4618 struct hfs_inval_blk_no *args;
4619
4620 blkno = buf_blkno(bp);
4621 args = args_in;
4622
4623 if (blkno >= args->sectorStart && blkno < args->sectorStart+args->sectorCount)
4624 buf_setblkno(bp, buf_lblkno(bp));
4625
4626 return BUF_RETURNED;
4627 }
4628 static void
4629 hfs_invalidate_sectors(struct vnode *vp, daddr64_t sectorStart, daddr64_t sectorCount)
4630 {
4631 struct hfs_inval_blk_no args;
4632 args.sectorStart = sectorStart;
4633 args.sectorCount = sectorCount;
4634
4635 buf_iterate(vp, hfs_invalidate_block_numbers_callback, BUF_SCAN_DIRTY|BUF_SCAN_CLEAN, &args);
4636 }
4637
4638
4639 /*
4640 * Copy the contents of an extent to a new location. Also invalidates the
4641 * physical block number of any buffer cache block in the copied extent
4642 * (so that if the block is written, it will go through VNOP_BLOCKMAP to
4643 * determine the new physical block number).
4644 */
4645 static int
4646 hfs_copy_extent(
4647 struct hfsmount *hfsmp,
4648 struct vnode *vp, /* The file whose extent is being copied. */
4649 u_int32_t oldStart, /* The start of the source extent. */
4650 u_int32_t newStart, /* The start of the destination extent. */
4651 u_int32_t blockCount, /* The number of allocation blocks to copy. */
4652 vfs_context_t context)
4653 {
4654 int err = 0;
4655 size_t bufferSize;
4656 void *buffer = NULL;
4657 struct vfsioattr ioattr;
4658 buf_t bp = NULL;
4659 off_t resid;
4660 size_t ioSize;
4661 u_int32_t ioSizeSectors; /* Device sectors in this I/O */
4662 daddr64_t srcSector, destSector;
4663 u_int32_t sectorsPerBlock = hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4664 #if CONFIG_PROTECT
4665 int cpenabled = 0;
4666 #endif
4667
4668 /*
4669 * Sanity check that we have locked the vnode of the file we're copying.
4670 *
4671 * But since hfs_systemfile_lock() doesn't actually take the lock on
4672 * the allocation file if a journal is active, ignore the check if the
4673 * file being copied is the allocation file.
4674 */
4675 struct cnode *cp = VTOC(vp);
4676 if (cp != hfsmp->hfs_allocation_cp && cp->c_lockowner != current_thread())
4677 panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp);
4678
4679 #if CONFIG_PROTECT
4680 /* Prepare the CP blob and get it ready for use */
4681 if (!vnode_issystem (vp) && vnode_isreg(vp) &&
4682 cp_fs_protected (hfsmp->hfs_mp)) {
4683 int cp_err = 0;
4684 cp_err = cp_handle_relocate (cp);
4685 if (cp_err) {
4686 /*
4687 * can't copy the file because we couldn't set up keys.
4688 * bail out
4689 */
4690 return cp_err;
4691 }
4692 else {
4693 cpenabled = 1;
4694 }
4695 }
4696 #endif
4697
4698 /*
4699 * Determine the I/O size to use
4700 *
4701 * NOTE: Many external drives will result in an ioSize of 128KB.
4702 * TODO: Should we use a larger buffer, doing several consecutive
4703 * reads, then several consecutive writes?
4704 */
4705 vfs_ioattr(hfsmp->hfs_mp, &ioattr);
4706 bufferSize = MIN(ioattr.io_maxreadcnt, ioattr.io_maxwritecnt);
4707 if (kmem_alloc(kernel_map, (vm_offset_t*) &buffer, bufferSize))
4708 return ENOMEM;
4709
4710 /* Get a buffer for doing the I/O */
4711 bp = buf_alloc(hfsmp->hfs_devvp);
4712 buf_setdataptr(bp, (uintptr_t)buffer);
4713
4714 resid = (off_t) blockCount * (off_t) hfsmp->blockSize;
4715 srcSector = (daddr64_t) oldStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4716 destSector = (daddr64_t) newStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4717 while (resid > 0) {
4718 ioSize = MIN(bufferSize, (size_t) resid);
4719 ioSizeSectors = ioSize / hfsmp->hfs_logical_block_size;
4720
4721 /* Prepare the buffer for reading */
4722 buf_reset(bp, B_READ);
4723 buf_setsize(bp, ioSize);
4724 buf_setcount(bp, ioSize);
4725 buf_setblkno(bp, srcSector);
4726 buf_setlblkno(bp, srcSector);
4727
4728 /* Attach the CP to the buffer */
4729 #if CONFIG_PROTECT
4730 if (cpenabled) {
4731 buf_setcpaddr (bp, cp->c_cpentry);
4732 }
4733 #endif
4734
4735 /* Do the read */
4736 err = VNOP_STRATEGY(bp);
4737 if (!err)
4738 err = buf_biowait(bp);
4739 if (err) {
4740 printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (read)\n", err);
4741 break;
4742 }
4743
4744 /* Prepare the buffer for writing */
4745 buf_reset(bp, B_WRITE);
4746 buf_setsize(bp, ioSize);
4747 buf_setcount(bp, ioSize);
4748 buf_setblkno(bp, destSector);
4749 buf_setlblkno(bp, destSector);
4750 if (vnode_issystem(vp) && journal_uses_fua(hfsmp->jnl))
4751 buf_markfua(bp);
4752
4753 #if CONFIG_PROTECT
4754 /* Attach the CP to the buffer */
4755 if (cpenabled) {
4756 buf_setcpaddr (bp, cp->c_cpentry);
4757 }
4758 #endif
4759
4760 /* Do the write */
4761 vnode_startwrite(hfsmp->hfs_devvp);
4762 err = VNOP_STRATEGY(bp);
4763 if (!err)
4764 err = buf_biowait(bp);
4765 if (err) {
4766 printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (write)\n", err);
4767 break;
4768 }
4769
4770 resid -= ioSize;
4771 srcSector += ioSizeSectors;
4772 destSector += ioSizeSectors;
4773 }
4774 if (bp)
4775 buf_free(bp);
4776 if (buffer)
4777 kmem_free(kernel_map, (vm_offset_t)buffer, bufferSize);
4778
4779 /* Make sure all writes have been flushed to disk. */
4780 if (vnode_issystem(vp) && !journal_uses_fua(hfsmp->jnl)) {
4781 err = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
4782 if (err) {
4783 printf("hfs_copy_extent: DKIOCSYNCHRONIZECACHE failed (%d)\n", err);
4784 err = 0; /* Don't fail the copy. */
4785 }
4786 }
4787
4788 if (!err)
4789 hfs_invalidate_sectors(vp, (daddr64_t)oldStart*sectorsPerBlock, (daddr64_t)blockCount*sectorsPerBlock);
4790
4791 return err;
4792 }
4793
4794
/* Structure to store state of reclaiming extents from a
 * given file.  hfs_reclaim_file()/hfs_reclaim_xattr()
 * initializes the values in this structure which are then
 * used by code that reclaims and splits the extents.
 */
struct hfs_reclaim_extent_info {
	struct vnode *vp;		/* Vnode of the file whose extents are being reclaimed */
	u_int32_t fileID;		/* Catalog node ID of the file */
	u_int8_t forkType;		/* Data or resource fork */
	u_int8_t is_dirlink;             /* Extent belongs to directory hard link */
	u_int8_t is_sysfile;             /* Extent belongs to system file */
	u_int8_t is_xattr;               /* Extent belongs to extent-based xattr */
	u_int8_t extent_index;		/* Index of current extent within 'extents' below */
	int lockflags;                   /* Locks that reclaim and split code should grab before modifying the extent record */
	u_int32_t blocks_relocated;      /* Total blocks relocated for this file till now */
	u_int32_t recStartBlock;         /* File allocation block number (FABN) for current extent record */
	u_int32_t cur_blockCount;        /* Number of allocation blocks that have been checked for reclaim */
	struct filefork *catalog_fp;     /* If non-NULL, extent is from catalog record */
	union record {
		HFSPlusExtentRecord overflow;/* Extent record from overflow extents btree */
		HFSPlusAttrRecord xattr;     /* Attribute record for large EAs */
	} record;
	HFSPlusExtentDescriptor *extents;    /* Pointer to current extent record being processed.
	                                      * For catalog extent record, points to the correct
	                                      * extent information in filefork.  For overflow extent
	                                      * record, or xattr record, points to extent record
	                                      * in the structure above
	                                      */
	struct cat_desc *dirlink_desc;	/* Catalog descriptor for directory hard link */
	struct cat_attr *dirlink_attr;	/* Catalog attributes for directory hard link */
	struct filefork *dirlink_fork;	     /* For directory hard links, fp points actually to this */
	struct BTreeIterator *iterator;      /* Shared read/write iterator, hfs_reclaim_file/xattr()
	                                      * use it for reading and hfs_reclaim_extent()/hfs_split_extent()
	                                      * use it for writing updated extent record
	                                      */
	struct FSBufferDescriptor btdata;    /* Shared btdata for reading/writing extent record, same as iterator above */
	u_int16_t recordlen;		/* Length (bytes) of the record read into 'record' above */
	int overflow_count;                  /* For debugging, counter for overflow extent record */
	FCB *fcb;                            /* Pointer to the current btree being traversed */
};
4835
4836 /*
4837 * Split the current extent into two extents, with first extent
4838 * to contain given number of allocation blocks. Splitting of
4839 * extent creates one new extent entry which can result in
4840 * shifting of many entries through all the extent records of a
4841 * file, and/or creating a new extent record in the overflow
4842 * extent btree.
4843 *
4844 * Example:
4845 * The diagram below represents two consecutive extent records,
4846 * for simplicity, lets call them record X and X+1 respectively.
4847 * Interesting extent entries have been denoted by letters.
4848 * If the letter is unchanged before and after split, it means
4849 * that the extent entry was not modified during the split.
4850 * A '.' means that the entry remains unchanged after the split
4851 * and is not relevant for our example. A '0' means that the
4852 * extent entry is empty.
4853 *
4854 * If there isn't sufficient contiguous free space to relocate
4855 * an extent (extent "C" below), we will have to break the one
4856 * extent into multiple smaller extents, and relocate each of
4857 * the smaller extents individually. The way we do this is by
4858 * finding the largest contiguous free space that is currently
4859 * available (N allocation blocks), and then convert extent "C"
4860 * into two extents, C1 and C2, that occupy exactly the same
4861 * allocation blocks as extent C. Extent C1 is the first
4862 * N allocation blocks of extent C, and extent C2 is the remainder
4863 * of extent C. Then we can relocate extent C1 since we know
4864 * we have enough contiguous free space to relocate it in its
4865 * entirety. We then repeat the process starting with extent C2.
4866 *
4867 * In record X, only the entries following entry C are shifted, and
4868 * the original entry C is replaced with two entries C1 and C2 which
4869 * are actually two extent entries for contiguous allocation blocks.
4870 *
4871 * Note that the entry E from record X is shifted into record X+1 as
4872 * the new first entry. Since the first entry of record X+1 is updated,
4873 * the FABN will also get updated with the blockCount of entry E.
4874 * This also results in shifting of all extent entries in record X+1.
4875 * Note that the number of empty entries after the split has been
4876 * changed from 3 to 2.
4877 *
4878 * Before:
4879 * record X record X+1
4880 * ---------------------===--------- ---------------------------------
4881 * | A | . | . | . | B | C | D | E | | F | . | . | . | G | 0 | 0 | 0 |
4882 * ---------------------===--------- ---------------------------------
4883 *
4884 * After:
4885 * ---------------------=======----- ---------------------------------
4886 * | A | . | . | . | B | C1| C2| D | | E | F | . | . | . | G | 0 | 0 |
4887 * ---------------------=======----- ---------------------------------
4888 *
4889 * C1.startBlock = C.startBlock
4890 * C1.blockCount = N
4891 *
4892 * C2.startBlock = C.startBlock + N
4893 * C2.blockCount = C.blockCount - N
4894 *
4895 * FABN = old FABN - E.blockCount
4896 *
4897 * Inputs:
4898 * extent_info - This is the structure that contains state about
4899 * the current file, extent, and extent record that
4900 * is being relocated. This structure is shared
4901 * among code that traverses through all the extents
4902 * of the file, code that relocates extents, and
4903 * code that splits the extent.
4904 * Output:
4905 * Zero on success, non-zero on failure.
4906 */
static int
hfs_split_extent(struct hfs_reclaim_extent_info *extent_info, uint32_t newBlockCount)
{
	int error = 0;
	int index = extent_info->extent_index;
	int i;
	HFSPlusExtentDescriptor shift_extent; /* Extent entry that should be shifted into next extent record */
	HFSPlusExtentDescriptor last_extent;
	HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being manipulated */
	HFSPlusExtentRecord *extents_rec = NULL;
	HFSPlusExtentKey *extents_key = NULL;
	HFSPlusAttrRecord *xattr_rec = NULL;
	HFSPlusAttrKey *xattr_key = NULL;
	/* NOTE(review): BTreeIterator is a sizable structure and lives on the
	 * kernel stack here — confirm stack headroom on this call path.
	 */
	struct BTreeIterator iterator;
	struct FSBufferDescriptor btdata;
	uint16_t reclen;
	uint32_t read_recStartBlock;	/* Starting allocation block number to read old extent record */
	uint32_t write_recStartBlock;	/* Starting allocation block number to insert newly updated extent record */
	Boolean create_record = false;
	Boolean is_xattr;
	struct cnode *cp;

	/* Cache frequently-used state from the shared reclaim structure.
	 * 'extents' initially points at the record containing the extent to
	 * split; later it is re-pointed at a locally allocated buffer used
	 * to walk the following overflow records.
	 */
	is_xattr = extent_info->is_xattr;
	extents = extent_info->extents;
	cp = VTOC(extent_info->vp);

	if (hfs_resize_debug) {
		printf ("hfs_split_extent: Split record:%u recStartBlock=%u %u:(%u,%u) for %u blocks\n", extent_info->overflow_count, extent_info->recStartBlock, index, extents[index].startBlock, extents[index].blockCount, newBlockCount);
	}

	/* Extents overflow btree can not have more than 8 extents.
	 * No split allowed if the 8th extent is already used.
	 */
	if ((extent_info->fileID == kHFSExtentsFileID) && (extents[kHFSPlusExtentDensity - 1].blockCount != 0)) {
		printf ("hfs_split_extent: Maximum 8 extents allowed for extents overflow btree, cannot split further.\n");
		error = ENOSPC;
		goto out;
	}

	/* Determine the starting allocation block number for the following
	 * overflow extent record, if any, before the current record
	 * gets modified.
	 */
	read_recStartBlock = extent_info->recStartBlock;
	for (i = 0; i < kHFSPlusExtentDensity; i++) {
		if (extents[i].blockCount == 0) {
			break;
		}
		read_recStartBlock += extents[i].blockCount;
	}

	/* Shift and split */
	if (index == kHFSPlusExtentDensity-1) {
		/* The new extent created after split will go into following overflow extent record */
		shift_extent.startBlock = extents[index].startBlock + newBlockCount;
		shift_extent.blockCount = extents[index].blockCount - newBlockCount;

		/* Last extent in the record will be split, so nothing to shift */
	} else {
		/* Splitting of extents can result in at most of one
		 * extent entry to be shifted into following overflow extent
		 * record.  So, store the last extent entry for later.
		 */
		shift_extent = extents[kHFSPlusExtentDensity-1];
		if ((hfs_resize_debug) && (shift_extent.blockCount != 0)) {
			printf ("hfs_split_extent: Save 7:(%u,%u) to shift into overflow record\n", shift_extent.startBlock, shift_extent.blockCount);
		}

		/* Start shifting extent information from the end of the extent
		 * record to the index where we want to insert the new extent.
		 * Note that kHFSPlusExtentDensity-1 is already saved above, and
		 * does not need to be shifted.  The extent entry that is being
		 * split does not get shifted.
		 */
		for (i = kHFSPlusExtentDensity-2; i > index; i--) {
			if (hfs_resize_debug) {
				if (extents[i].blockCount) {
					printf ("hfs_split_extent: Shift %u:(%u,%u) to %u:(%u,%u)\n", i, extents[i].startBlock, extents[i].blockCount, i+1, extents[i].startBlock, extents[i].blockCount);
				}
			}
			extents[i+1] = extents[i];
		}
	}

	if (index == kHFSPlusExtentDensity-1) {
		/* The second half of the extent being split will be the overflow
		 * entry that will go into following overflow extent record.  The
		 * value has been stored in 'shift_extent' above, so there is
		 * nothing to be done here.
		 */
	} else {
		/* Update the values in the second half of the extent being split
		 * before updating the first half of the split.  Note that the
		 * extent to split or first half of the split is at index 'index'
		 * and a new extent or second half of the split will be inserted at
		 * 'index+1' or into following overflow extent record.
		 */
		extents[index+1].startBlock = extents[index].startBlock + newBlockCount;
		extents[index+1].blockCount = extents[index].blockCount - newBlockCount;
	}
	/* Update the extent being split, only the block count will change */
	extents[index].blockCount = newBlockCount;

	if (hfs_resize_debug) {
		printf ("hfs_split_extent: Split %u:(%u,%u) and ", index, extents[index].startBlock, extents[index].blockCount);
		if (index != kHFSPlusExtentDensity-1) {
			printf ("%u:(%u,%u)\n", index+1, extents[index+1].startBlock, extents[index+1].blockCount);
		} else {
			printf ("overflow:(%u,%u)\n", shift_extent.startBlock, shift_extent.blockCount);
		}
	}

	/* Write out information about the newly split extent to the disk */
	if (extent_info->catalog_fp) {
		/* (extent_info->catalog_fp != NULL) means the newly split
		 * extent exists in the catalog record.  This means that
		 * the cnode was updated.  Therefore, to write out the changes,
		 * mark the cnode as modified.   We cannot call hfs_update()
		 * in this function because the caller hfs_reclaim_extent()
		 * is holding the catalog lock currently.
		 */
		cp->c_flag |= C_MODIFIED;
	} else {
		/* The newly split extent is for large EAs or is in overflow
		 * extent record, so update it directly in the btree using the
		 * iterator information from the shared extent_info structure
		 */
		error = BTReplaceRecord(extent_info->fcb, extent_info->iterator,
				&(extent_info->btdata), extent_info->recordlen);
		if (error) {
			printf ("hfs_split_extent: fileID=%u BTReplaceRecord returned error=%d\n", extent_info->fileID, error);
			goto out;
		}
	}

	/* No extent entry to be shifted into another extent overflow record */
	if (shift_extent.blockCount == 0) {
		if (hfs_resize_debug) {
			printf ("hfs_split_extent: No extent entry to be shifted into overflow records\n");
		}
		error = 0;
		goto out;
	}

	/* The overflow extent entry has to be shifted into an extent
	 * overflow record.  This means that we might have to shift
	 * extent entries from all subsequent overflow records by one.
	 * We start iteration from the first record to the last record,
	 * and shift the extent entry from one record to another.
	 * We might have to create a new extent record for the last
	 * extent entry for the file.
	 */

	/* Initialize iterator to search the next record */
	bzero(&iterator, sizeof(iterator));
	if (is_xattr) {
		/* Copy the key from the iterator that was used to update the modified attribute record. */
		xattr_key = (HFSPlusAttrKey *)&(iterator.key);
		bcopy((HFSPlusAttrKey *)&(extent_info->iterator->key), xattr_key, sizeof(HFSPlusAttrKey));
		/* Note: xattr_key->startBlock will be initialized later in the iteration loop */

		MALLOC(xattr_rec, HFSPlusAttrRecord *,
				sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK);
		if (xattr_rec == NULL) {
			error = ENOMEM;
			goto out;
		}
		btdata.bufferAddress = xattr_rec;
		btdata.itemSize = sizeof(HFSPlusAttrRecord);
		btdata.itemCount = 1;
		extents = xattr_rec->overflowExtents.extents;
	} else {
		/* Initialize the extent key for the current file */
		extents_key = (HFSPlusExtentKey *) &(iterator.key);
		extents_key->keyLength = kHFSPlusExtentKeyMaximumLength;
		extents_key->forkType = extent_info->forkType;
		extents_key->fileID = extent_info->fileID;
		/* Note: extents_key->startBlock will be initialized later in the iteration loop */

		MALLOC(extents_rec, HFSPlusExtentRecord *,
				sizeof(HFSPlusExtentRecord), M_TEMP, M_WAITOK);
		if (extents_rec == NULL) {
			error = ENOMEM;
			goto out;
		}
		btdata.bufferAddress = extents_rec;
		btdata.itemSize = sizeof(HFSPlusExtentRecord);
		btdata.itemCount = 1;
		extents = extents_rec[0];
	}

	/* The overflow extent entry has to be shifted into an extent
	 * overflow record.  This means that we might have to shift
	 * extent entries from all subsequent overflow records by one.
	 * We start iteration from the first record to the last record,
	 * examine one extent record in each iteration and shift one
	 * extent entry from one record to another.  We might have to
	 * create a new extent record for the last extent entry for the
	 * file.
	 *
	 * If shift_extent.blockCount is non-zero, it means that there is
	 * an extent entry that needs to be shifted into the next
	 * overflow extent record.  We keep on going till there are no such
	 * entries left to be shifted.  This will also change the starting
	 * allocation block number of the extent record which is part of
	 * the key for the extent record in each iteration.  Note that
	 * because the extent record key is changing while we are searching,
	 * the record can not be updated directly, instead it has to be
	 * deleted and inserted again.
	 */
	while (shift_extent.blockCount) {
		if (hfs_resize_debug) {
			printf ("hfs_split_extent: Will shift (%u,%u) into overflow record with startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, read_recStartBlock);
		}

		/* Search if there is any existing overflow extent record
		 * that matches the current file and the logical start block
		 * number.
		 *
		 * For this, the logical start block number in the key is
		 * the value calculated based on the logical start block
		 * number of the current extent record and the total number
		 * of blocks existing in the current extent record.
		 */
		if (is_xattr) {
			xattr_key->startBlock = read_recStartBlock;
		} else {
			extents_key->startBlock = read_recStartBlock;
		}
		error = BTSearchRecord(extent_info->fcb, &iterator, &btdata, &reclen, &iterator);
		if (error) {
			if (error != btNotFound) {
				printf ("hfs_split_extent: fileID=%u startBlock=%u BTSearchRecord error=%d\n", extent_info->fileID, read_recStartBlock, error);
				goto out;
			}
			/* No matching record was found, so create a new extent record.
			 * Note: Since no record was found, we can't rely on the
			 * btree key in the iterator any longer.  This will be initialized
			 * later before we insert the record.
			 */
			create_record = true;
		}

		/* The extra extent entry from the previous record is being inserted
		 * as the first entry in the current extent record.  This will change
		 * the file allocation block number (FABN) of the current extent
		 * record, which is the startBlock value from the extent record key.
		 * Since one extra entry is being inserted in the record, the new
		 * FABN for the record will less than old FABN by the number of blocks
		 * in the new extent entry being inserted at the start.  We have to
		 * do this before we update read_recStartBlock to point at the
		 * startBlock of the following record.
		 */
		write_recStartBlock = read_recStartBlock - shift_extent.blockCount;
		if (hfs_resize_debug) {
			if (create_record) {
				printf ("hfs_split_extent: No records found for startBlock=%u, will create new with startBlock=%u\n", read_recStartBlock, write_recStartBlock);
			}
		}

		/* Now update the read_recStartBlock to account for total number
		 * of blocks in this extent record.  It will now point to the
		 * starting allocation block number for the next extent record.
		 *
		 * NOTE(review): when BTSearchRecord returned btNotFound above,
		 * the record buffer was not filled, so this loop walks
		 * uninitialized data; it appears harmless only because the
		 * create_record branch below zeroes shift_extent and terminates
		 * the loop, leaving read_recStartBlock unused — confirm.
		 */
		for (i = 0; i < kHFSPlusExtentDensity; i++) {
			if (extents[i].blockCount == 0) {
				break;
			}
			read_recStartBlock += extents[i].blockCount;
		}

		if (create_record == true) {
			/* Initialize new record content with only one extent entry */
			bzero(extents, sizeof(HFSPlusExtentRecord));
			/* The new record will contain only one extent entry */
			extents[0] = shift_extent;
			/* There are no more overflow extents to be shifted */
			shift_extent.startBlock = shift_extent.blockCount = 0;

			if (is_xattr) {
				/* BTSearchRecord above returned btNotFound,
				 * but since the attribute btree is never empty
				 * if we are trying to insert new overflow
				 * record for the xattrs, the extents_key will
				 * contain correct data.  So we don't need to
				 * re-initialize it again like below.
				 */

				/* Initialize the new xattr record */
				xattr_rec->recordType = kHFSPlusAttrExtents;
				xattr_rec->overflowExtents.reserved = 0;
				reclen = sizeof(HFSPlusAttrExtents);
			} else {
				/* BTSearchRecord above returned btNotFound,
				 * which means that extents_key content might
				 * not correspond to the record that we are
				 * trying to create, especially when the extents
				 * overflow btree is empty.  So we reinitialize
				 * the extents_key again always.
				 */
				extents_key->keyLength = kHFSPlusExtentKeyMaximumLength;
				extents_key->forkType = extent_info->forkType;
				extents_key->fileID = extent_info->fileID;

				/* Initialize the new extent record */
				reclen = sizeof(HFSPlusExtentRecord);
			}
		} else {
			/* The overflow extent entry from previous record will be
			 * the first entry in this extent record.  If the last
			 * extent entry in this record is valid, it will be shifted
			 * into the following extent record as its first entry.  So
			 * save the last entry before shifting entries in current
			 * record.
			 */
			last_extent = extents[kHFSPlusExtentDensity-1];

			/* Shift all entries by one index towards the end */
			for (i = kHFSPlusExtentDensity-2; i >= 0; i--) {
				extents[i+1] = extents[i];
			}

			/* Overflow extent entry saved from previous record
			 * is now the first entry in the current record.
			 */
			extents[0] = shift_extent;

			if (hfs_resize_debug) {
				printf ("hfs_split_extent: Shift overflow=(%u,%u) to record with updated startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, write_recStartBlock);
			}

			/* The last entry from current record will be the
			 * overflow entry which will be the first entry for
			 * the following extent record.
			 */
			shift_extent = last_extent;

			/* Since the key->startBlock is being changed for this record,
			 * it should be deleted and inserted with the new key.
			 */
			error = BTDeleteRecord(extent_info->fcb, &iterator);
			if (error) {
				printf ("hfs_split_extent: fileID=%u startBlock=%u BTDeleteRecord error=%d\n", extent_info->fileID, read_recStartBlock, error);
				goto out;
			}
			if (hfs_resize_debug) {
				printf ("hfs_split_extent: Deleted record with startBlock=%u\n", (is_xattr ? xattr_key->startBlock : extents_key->startBlock));
			}
		}

		/* Insert the newly created or modified extent record */
		bzero(&iterator.hint, sizeof(iterator.hint));
		if (is_xattr) {
			xattr_key->startBlock = write_recStartBlock;
		} else {
			extents_key->startBlock = write_recStartBlock;
		}
		error = BTInsertRecord(extent_info->fcb, &iterator, &btdata, reclen);
		if (error) {
			printf ("hfs_split_extent: fileID=%u, startBlock=%u BTInsertRecord error=%d\n", extent_info->fileID, write_recStartBlock, error);
			goto out;
		}
		if (hfs_resize_debug) {
			printf ("hfs_split_extent: Inserted extent record with startBlock=%u\n", write_recStartBlock);
		}
	}
	BTFlushPath(extent_info->fcb);
out:
	if (extents_rec) {
		FREE (extents_rec, M_TEMP);
	}
	if (xattr_rec) {
		FREE (xattr_rec, M_TEMP);
	}
	return error;
}
5283
5284
5285 /*
5286 * Relocate an extent if it lies beyond the expected end of volume.
5287 *
5288 * This function is called for every extent of the file being relocated.
 * It allocates space for relocation, copies the data, deallocates
 * the old extent, and updates the corresponding on-disk extent.  If the
 * function does not find contiguous space to relocate an extent, it splits
 * the extent into smaller extents so that it can be relocated out of the
 * area of the disk being reclaimed.  As an optimization, if an extent lies partially
5294 * in the area of the disk being reclaimed, it is split so that we only
5295 * have to relocate the area that was overlapping with the area of disk
5296 * being reclaimed.
5297 *
5298 * Note that every extent is relocated in its own transaction so that
5299 * they do not overwhelm the journal. This function handles the extent
5300 * record that exists in the catalog record, extent record from overflow
5301 * extents btree, and extents for large EAs.
5302 *
5303 * Inputs:
5304 * extent_info - This is the structure that contains state about
5305 * the current file, extent, and extent record that
5306 * is being relocated. This structure is shared
5307 * among code that traverses through all the extents
5308 * of the file, code that relocates extents, and
5309 * code that splits the extent.
5310 */
5311 static int
5312 hfs_reclaim_extent(struct hfsmount *hfsmp, const u_long allocLimit, struct hfs_reclaim_extent_info *extent_info, vfs_context_t context)
5313 {
5314 int error = 0;
5315 int index;
5316 struct cnode *cp;
5317 u_int32_t oldStartBlock;
5318 u_int32_t oldBlockCount;
5319 u_int32_t newStartBlock;
5320 u_int32_t newBlockCount;
5321 u_int32_t roundedBlockCount;
5322 uint16_t node_size;
5323 uint32_t remainder_blocks;
5324 u_int32_t alloc_flags;
5325 int blocks_allocated = false;
5326
5327 index = extent_info->extent_index;
5328 cp = VTOC(extent_info->vp);
5329
5330 oldStartBlock = extent_info->extents[index].startBlock;
5331 oldBlockCount = extent_info->extents[index].blockCount;
5332
5333 if (0 && hfs_resize_debug) {
5334 printf ("hfs_reclaim_extent: Examine record:%u recStartBlock=%u, %u:(%u,%u)\n", extent_info->overflow_count, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount);
5335 }
5336
5337 /* If the current extent lies completely within allocLimit,
5338 * it does not require any relocation.
5339 */
5340 if ((oldStartBlock + oldBlockCount) <= allocLimit) {
5341 extent_info->cur_blockCount += oldBlockCount;
5342 return error;
5343 }
5344
5345 /* Every extent should be relocated in its own transaction
5346 * to make sure that we don't overflow the journal buffer.
5347 */
5348 error = hfs_start_transaction(hfsmp);
5349 if (error) {
5350 return error;
5351 }
5352 extent_info->lockflags = hfs_systemfile_lock(hfsmp, extent_info->lockflags, HFS_EXCLUSIVE_LOCK);
5353
5354 /* Check if the extent lies partially in the area to reclaim,
5355 * i.e. it starts before allocLimit and ends beyond allocLimit.
5356 * We have already skipped extents that lie completely within
5357 * allocLimit in the check above, so we only check for the
5358 * startBlock. If it lies partially, split it so that we
5359 * only relocate part of the extent.
5360 */
5361 if (oldStartBlock < allocLimit) {
5362 newBlockCount = allocLimit - oldStartBlock;
5363
5364 /* If the extent belongs to a btree, check and trim
5365 * it to be multiple of the node size.
5366 */
5367 if (extent_info->is_sysfile) {
5368 node_size = get_btree_nodesize(extent_info->vp);
5369 /* If the btree node size is less than the block size,
5370 * splitting this extent will not split a node across
5371 * different extents. So we only check and trim if
5372 * node size is more than the allocation block size.
5373 */
5374 if (node_size > hfsmp->blockSize) {
5375 remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize);
5376 if (remainder_blocks) {
5377 newBlockCount -= remainder_blocks;
5378 if (hfs_resize_debug) {
5379 printf ("hfs_reclaim_extent: Fixing extent block count, node_blks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount);
5380 }
5381 }
5382 }
5383 }
5384
5385 if (hfs_resize_debug) {
5386 int idx = extent_info->extent_index;
5387 printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount);
5388 }
5389
5390 /* Split the extents into two parts --- the first extent lies
5391 * completely within allocLimit and therefore does not require
5392 * relocation. The second extent will require relocation which
5393 * will be handled when the caller calls this function again
5394 * for the next extent.
5395 */
5396 error = hfs_split_extent(extent_info, newBlockCount);
5397 if (error == 0) {
5398 /* Split success, no relocation required */
5399 goto out;
5400 }
5401 /* Split failed, so try to relocate entire extent */
5402 if (hfs_resize_debug) {
5403 printf ("hfs_reclaim_extent: Split straddling extent failed, reclocate full extent\n");
5404 }
5405 }
5406
5407 /* At this point, the current extent requires relocation.
5408 * We will try to allocate space equal to the size of the extent
5409 * being relocated first to try to relocate it without splitting.
5410 * If the allocation fails, we will try to allocate contiguous
5411 * blocks out of metadata zone. If that allocation also fails,
5412 * then we will take a whatever contiguous block run is returned
5413 * by the allocation, split the extent into two parts, and then
5414 * relocate the first splitted extent.
5415 */
5416 alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS;
5417 if (extent_info->is_sysfile) {
5418 alloc_flags |= HFS_ALLOC_METAZONE;
5419 }
5420
5421 error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags,
5422 &newStartBlock, &newBlockCount);
5423 if ((extent_info->is_sysfile == false) &&
5424 ((error == dskFulErr) || (error == ENOSPC))) {
5425 /* For non-system files, try reallocating space in metadata zone */
5426 alloc_flags |= HFS_ALLOC_METAZONE;
5427 error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
5428 alloc_flags, &newStartBlock, &newBlockCount);
5429 }
5430 if ((error == dskFulErr) || (error == ENOSPC)) {
5431 /* We did not find desired contiguous space for this extent.
5432 * So try to allocate the maximum contiguous space available.
5433 */
5434 alloc_flags &= ~HFS_ALLOC_FORCECONTIG;
5435
5436 error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
5437 alloc_flags, &newStartBlock, &newBlockCount);
5438 if (error) {
5439 printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5440 goto out;
5441 }
5442 blocks_allocated = true;
5443
5444 /* The number of blocks allocated is less than the requested
5445 * number of blocks. For btree extents, check and trim the
5446 * extent to be multiple of the node size.
5447 */
5448 if (extent_info->is_sysfile) {
5449 node_size = get_btree_nodesize(extent_info->vp);
5450 if (node_size > hfsmp->blockSize) {
5451 remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize);
5452 if (remainder_blocks) {
5453 roundedBlockCount = newBlockCount - remainder_blocks;
5454 /* Free tail-end blocks of the newly allocated extent */
5455 BlockDeallocate(hfsmp, newStartBlock + roundedBlockCount,
5456 newBlockCount - roundedBlockCount,
5457 HFS_ALLOC_SKIPFREEBLKS);
5458 newBlockCount = roundedBlockCount;
5459 if (hfs_resize_debug) {
5460 printf ("hfs_reclaim_extent: Fixing extent block count, node_blks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount);
5461 }
5462 if (newBlockCount == 0) {
5463 printf ("hfs_reclaim_extent: Not enough contiguous blocks available to relocate fileID=%d\n", extent_info->fileID);
5464 error = ENOSPC;
5465 goto out;
5466 }
5467 }
5468 }
5469 }
5470
5471 /* The number of blocks allocated is less than the number of
5472 * blocks requested, so split this extent --- the first extent
5473 * will be relocated as part of this function call and the caller
5474 * will handle relocating the second extent by calling this
5475 * function again for the second extent.
5476 */
5477 error = hfs_split_extent(extent_info, newBlockCount);
5478 if (error) {
5479 printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) split error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5480 goto out;
5481 }
5482 oldBlockCount = newBlockCount;
5483 }
5484 if (error) {
5485 printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) contig BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5486 goto out;
5487 }
5488 blocks_allocated = true;
5489
5490 /* Copy data from old location to new location */
5491 error = hfs_copy_extent(hfsmp, extent_info->vp, oldStartBlock,
5492 newStartBlock, newBlockCount, context);
5493 if (error) {
5494 printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u)=>(%u,%u) hfs_copy_extent error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount, error);
5495 goto out;
5496 }
5497
5498 /* Update the extent record with the new start block information */
5499 extent_info->extents[index].startBlock = newStartBlock;
5500
5501 /* Sync the content back to the disk */
5502 if (extent_info->catalog_fp) {
5503 /* Update the extents in catalog record */
5504 if (extent_info->is_dirlink) {
5505 error = cat_update_dirlink(hfsmp, extent_info->forkType,
5506 extent_info->dirlink_desc, extent_info->dirlink_attr,
5507 &(extent_info->dirlink_fork->ff_data));
5508 } else {
5509 cp->c_flag |= C_MODIFIED;
5510 /* If this is a system file, sync volume headers on disk */
5511 if (extent_info->is_sysfile) {
5512 error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
5513 }
5514 }
5515 } else {
5516 /* Replace record for extents overflow or extents-based xattrs */
5517 error = BTReplaceRecord(extent_info->fcb, extent_info->iterator,
5518 &(extent_info->btdata), extent_info->recordlen);
5519 }
5520 if (error) {
5521 printf ("hfs_reclaim_extent: fileID=%u, update record error=%u\n", extent_info->fileID, error);
5522 goto out;
5523 }
5524
5525 /* Deallocate the old extent */
5526 error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS);
5527 if (error) {
5528 printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockDeallocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5529 goto out;
5530 }
5531 extent_info->blocks_relocated += newBlockCount;
5532
5533 if (hfs_resize_debug) {
5534 printf ("hfs_reclaim_extent: Relocated record:%u %u:(%u,%u) to (%u,%u)\n", extent_info->overflow_count, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
5535 }
5536
5537 out:
5538 if (error != 0) {
5539 if (blocks_allocated == true) {
5540 BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
5541 }
5542 } else {
5543 /* On success, increment the total allocation blocks processed */
5544 extent_info->cur_blockCount += newBlockCount;
5545 }
5546
5547 hfs_systemfile_unlock(hfsmp, extent_info->lockflags);
5548
5549 /* For a non-system file, if an extent entry from catalog record
5550 * was modified, sync the in-memory changes to the catalog record
5551 * on disk before ending the transaction.
5552 */
5553 if ((extent_info->catalog_fp) &&
5554 (extent_info->is_sysfile == false)) {
5555 (void) hfs_update(extent_info->vp, MNT_WAIT);
5556 }
5557
5558 hfs_end_transaction(hfsmp);
5559
5560 return error;
5561 }
5562
5563 /* Report intermediate progress during volume resize */
5564 static void
5565 hfs_truncatefs_progress(struct hfsmount *hfsmp)
5566 {
5567 u_int32_t cur_progress;
5568
5569 hfs_resize_progress(hfsmp, &cur_progress);
5570 if (cur_progress > (hfsmp->hfs_resize_progress + 9)) {
5571 printf("hfs_truncatefs: %d%% done...\n", cur_progress);
5572 hfsmp->hfs_resize_progress = cur_progress;
5573 }
5574 return;
5575 }
5576
5577 /*
5578 * Reclaim space at the end of a volume for given file and forktype.
5579 *
5580 * This routine attempts to move any extent which contains allocation blocks
5581 * at or after "allocLimit." A separate transaction is used for every extent
5582 * that needs to be moved. If there is not contiguous space available for
5583 * moving an extent, it can be split into smaller extents. The contents of
5584 * any moved extents are read and written via the volume's device vnode --
5585 * NOT via "vp." During the move, moved blocks which are part of a transaction
5586 * have their physical block numbers invalidated so they will eventually be
5587 * written to their new locations.
5588 *
 * This function is also called for directory hard links. Directory hard links
 * are regular files with no data fork, and a resource fork that contains alias
 * information for backward compatibility with pre-Leopard systems. However,
 * non-Mac OS X implementations can add/modify data fork or resource fork
 * information to directory hard links, so we check, and if required, relocate
 * both the data fork and the resource fork.
5595 *
5596 * Inputs:
5597 * hfsmp The volume being resized.
5598 * vp The vnode for the system file.
5599 * fileID ID of the catalog record that needs to be relocated
 * forktype	The type of fork that needs to be relocated,
5601 * kHFSResourceForkType for resource fork,
5602 * kHFSDataForkType for data fork
5603 * allocLimit Allocation limit for the new volume size,
5604 * do not use this block or beyond. All extents
5605 * that use this block or any blocks beyond this limit
5606 * will be relocated.
5607 *
5608 * Side Effects:
5609 * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
5610 * blocks that were relocated.
5611 */
static int
hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID,
		u_int8_t forktype, u_long allocLimit, vfs_context_t context)
{
	int error = 0;
	struct hfs_reclaim_extent_info *extent_info;
	int i;
	int lockflags = 0;
	struct cnode *cp;
	struct filefork *fp;
	int took_truncate_lock = false;
	int release_desc = false;
	HFSPlusExtentKey *key;

	/* If there is no vnode for this file, then there's nothing to do. */
	if (vp == NULL) {
		return 0;
	}

	cp = VTOC(vp);

	/* All per-file relocation state lives in this heap-allocated context,
	 * which hfs_reclaim_extent() reads and updates as extents are moved.
	 */
	MALLOC(extent_info, struct hfs_reclaim_extent_info *,
	       sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
	if (extent_info == NULL) {
		return ENOMEM;
	}
	bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
	extent_info->vp = vp;
	extent_info->fileID = fileID;
	extent_info->forkType = forktype;
	extent_info->is_sysfile = vnode_issystem(vp);
	/* A directory vnode with C_HARDLINK set is a directory hard link;
	 * its fork data must be looked up from the catalog (see below).
	 */
	if (vnode_isdir(vp) && (cp->c_flag & C_HARDLINK)) {
		extent_info->is_dirlink = true;
	}
	/* We always need allocation bitmap and extent btree lock */
	lockflags = SFL_BITMAP | SFL_EXTENTS;
	if ((fileID == kHFSCatalogFileID) || (extent_info->is_dirlink == true)) {
		lockflags |= SFL_CATALOG;
	} else if (fileID == kHFSAttributesFileID) {
		lockflags |= SFL_ATTRIBUTE;
	} else if (fileID == kHFSStartupFileID) {
		lockflags |= SFL_STARTUP;
	}
	extent_info->lockflags = lockflags;
	extent_info->fcb = VTOF(hfsmp->hfs_extents_vp);

	/* Flush data associated with current file on disk.
	 *
	 * If the current vnode is directory hard link, no flushing of
	 * journal or vnode is required. The current kernel does not
	 * modify data/resource fork of directory hard links, so nothing
	 * will be in the cache. If a directory hard link is newly created,
	 * the resource fork data is written directly using devvp and
	 * the code that actually relocates data (hfs_copy_extent()) also
	 * uses devvp for its I/O --- so they will see a consistent copy.
	 */
	if (extent_info->is_sysfile) {
		/* If the current vnode is system vnode, flush journal
		 * to make sure that all data is written to the disk.
		 */
		error = hfs_journal_flush(hfsmp, TRUE);
		if (error) {
			printf ("hfs_reclaim_file: journal_flush returned %d\n", error);
			goto out;
		}
	} else if (extent_info->is_dirlink == false) {
		/* Flush all blocks associated with this regular file vnode.
		 * Normally there should not be buffer cache blocks for regular
		 * files, but for objects like symlinks, we can have buffer cache
		 * blocks associated with the vnode. Therefore we call
		 * buf_flushdirtyblks() also.
		 */
		buf_flushdirtyblks(vp, 0, BUF_SKIP_LOCKED, "hfs_reclaim_file");

		/* Drop the cnode lock before taking the truncate lock to honor
		 * the lock ordering (truncate lock before cnode lock), then
		 * re-acquire the cnode lock afterwards.
		 */
		hfs_unlock(cp);
		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
		took_truncate_lock = true;
		(void) cluster_push(vp, 0);
		error = hfs_lock(cp, HFS_FORCE_LOCK);
		if (error) {
			goto out;
		}

		/* If the file no longer exists, nothing left to do */
		if (cp->c_flag & C_NOEXISTS) {
			error = 0;
			goto out;
		}

		/* Wait for any in-progress writes to this vnode to complete, so that we'll
		 * be copying consistent bits. (Otherwise, it's possible that an async
		 * write will complete to the old extent after we read from it. That
		 * could lead to corruption.)
		 */
		error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file");
		if (error) {
			goto out;
		}
	}

	if (hfs_resize_debug) {
		printf("hfs_reclaim_file: === Start reclaiming %sfork for %sid=%u ===\n", (forktype ? "rsrc" : "data"), (extent_info->is_dirlink ? "dirlink" : "file"), fileID);
	}

	if (extent_info->is_dirlink) {
		MALLOC(extent_info->dirlink_desc, struct cat_desc *,
		       sizeof(struct cat_desc), M_TEMP, M_WAITOK);
		MALLOC(extent_info->dirlink_attr, struct cat_attr *,
		       sizeof(struct cat_attr), M_TEMP, M_WAITOK);
		MALLOC(extent_info->dirlink_fork, struct filefork *,
		       sizeof(struct filefork), M_TEMP, M_WAITOK);
		if ((extent_info->dirlink_desc == NULL) ||
		    (extent_info->dirlink_attr == NULL) ||
		    (extent_info->dirlink_fork == NULL)) {
			error = ENOMEM;
			goto out;
		}

		/* Lookup catalog record for directory hard link and
		 * create a fake filefork for the value looked up from
		 * the disk.
		 */
		fp = extent_info->dirlink_fork;
		bzero(extent_info->dirlink_fork, sizeof(struct filefork));
		extent_info->dirlink_fork->ff_cp = cp;
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
		error = cat_lookup_dirlink(hfsmp, fileID, forktype,
				extent_info->dirlink_desc, extent_info->dirlink_attr,
				&(extent_info->dirlink_fork->ff_data));
		hfs_systemfile_unlock(hfsmp, lockflags);
		if (error) {
			printf ("hfs_reclaim_file: cat_lookup_dirlink for fileID=%u returned error=%u\n", fileID, error);
			goto out;
		}
		release_desc = true;
	} else {
		fp = VTOF(vp);
	}

	extent_info->catalog_fp = fp;
	extent_info->recStartBlock = 0;
	extent_info->extents = extent_info->catalog_fp->ff_extents;
	/* Relocate extents from the catalog record (first eight extents) */
	for (i = 0; i < kHFSPlusExtentDensity; ++i) {
		if (fp->ff_extents[i].blockCount == 0) {
			break;
		}
		extent_info->extent_index = i;
		error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
		if (error) {
			printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, error);
			goto out;
		}
	}

	/* If the number of allocation blocks processed for reclaiming
	 * are less than total number of blocks for the file, continue
	 * working on overflow extents records.
	 */
	if (fp->ff_blocks <= extent_info->cur_blockCount) {
		/* NOTE(review): "0 &&" deliberately disables this debug print;
		 * left as-is to preserve behavior.
		 */
		if (0 && hfs_resize_debug) {
			printf ("hfs_reclaim_file: Nothing more to relocate, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
		}
		goto out;
	}

	if (hfs_resize_debug) {
		printf ("hfs_reclaim_file: Will check overflow records, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
	}

	MALLOC(extent_info->iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
	if (extent_info->iterator == NULL) {
		error = ENOMEM;
		goto out;
	}
	bzero(extent_info->iterator, sizeof(struct BTreeIterator));
	key = (HFSPlusExtentKey *) &(extent_info->iterator->key);
	key->keyLength = kHFSPlusExtentKeyMaximumLength;
	key->forkType = forktype;
	key->fileID = fileID;
	key->startBlock = extent_info->cur_blockCount;

	extent_info->btdata.bufferAddress = extent_info->record.overflow;
	extent_info->btdata.itemSize = sizeof(HFSPlusExtentRecord);
	extent_info->btdata.itemCount = 1;

	/* From here on we are working on overflow records, not the catalog copy */
	extent_info->catalog_fp = NULL;

	/* Search the first overflow extent with expected startBlock as 'cur_blockCount' */
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
	error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
			&(extent_info->btdata), &(extent_info->recordlen),
			extent_info->iterator);
	hfs_systemfile_unlock(hfsmp, lockflags);
	while (error == 0) {
		extent_info->overflow_count++;
		extent_info->recStartBlock = key->startBlock;
		extent_info->extents = extent_info->record.overflow;
		for (i = 0; i < kHFSPlusExtentDensity; i++) {
			/* A zero-length extent terminates the record */
			if (extent_info->record.overflow[i].blockCount == 0) {
				goto out;
			}
			extent_info->extent_index = i;
			error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
			if (error) {
				printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, extent_info->record.overflow[i].startBlock, extent_info->record.overflow[i].blockCount, error);
				goto out;
			}
		}

		/* Look for more overflow records */
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
		error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
				extent_info->iterator, &(extent_info->btdata),
				&(extent_info->recordlen));
		hfs_systemfile_unlock(hfsmp, lockflags);
		if (error) {
			break;
		}
		/* Stop when we encounter a different file or fork. */
		if ((key->fileID != fileID) || (key->forkType != forktype)) {
			break;
		}
	}
	/* Running off the end of the btree / past the last record is expected */
	if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
		error = 0;
	}

out:
	/* If any blocks were relocated, account them and report progress */
	if (extent_info->blocks_relocated) {
		hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated;
		hfs_truncatefs_progress(hfsmp);
		/* Only log system/metadata files unconditionally */
		if (fileID < kHFSFirstUserCatalogNodeID) {
			printf ("hfs_reclaim_file: Relocated %u blocks from fileID=%u on \"%s\"\n",
					extent_info->blocks_relocated, fileID, hfsmp->vcbVN);
		}
	}
	if (extent_info->iterator) {
		FREE(extent_info->iterator, M_TEMP);
	}
	/* Release the catalog descriptor contents before freeing its memory */
	if (release_desc == true) {
		cat_releasedesc(extent_info->dirlink_desc);
	}
	if (extent_info->dirlink_desc) {
		FREE(extent_info->dirlink_desc, M_TEMP);
	}
	if (extent_info->dirlink_attr) {
		FREE(extent_info->dirlink_attr, M_TEMP);
	}
	if (extent_info->dirlink_fork) {
		FREE(extent_info->dirlink_fork, M_TEMP);
	}
	/* Sync the updated extent locations for regular files to disk */
	if ((extent_info->blocks_relocated != 0) && (extent_info->is_sysfile == false)) {
		(void) hfs_update(vp, MNT_WAIT);
	}
	if (took_truncate_lock) {
		hfs_unlock_truncate(cp, 0);
	}
	if (extent_info) {
		FREE(extent_info, M_TEMP);
	}
	if (hfs_resize_debug) {
		printf("hfs_reclaim_file: === Finished relocating %sfork for fileid=%u (error=%d) ===\n", (forktype ? "rsrc" : "data"), fileID, error);
	}

	return error;
}
5880
5881
5882 /*
5883 * This journal_relocate callback updates the journal info block to point
5884 * at the new journal location. This write must NOT be done using the
5885 * transaction. We must write the block immediately. We must also force
5886 * it to get to the media so that the new journal location will be seen by
5887 * the replay code before we can safely let journaled blocks be written
5888 * to their normal locations.
5889 *
5890 * The tests for journal_uses_fua below are mildly hacky. Since the journal
5891 * and the file system are both on the same device, I'm leveraging what
5892 * the journal has decided about FUA.
5893 */
/*
 * Argument bundle handed to hfs_journal_relocate_callback() through
 * journal_relocate(); carries everything the callback needs to rewrite
 * the journal info block at its (unchanged) location on disk.
 */
struct hfs_journal_relocate_args {
	struct hfsmount *hfsmp;		/* volume being resized */
	vfs_context_t context;		/* caller's VFS context (credentials for I/O) */
	u_int32_t newStartBlock;	/* new first allocation block of the journal */
};
5899
5900 static errno_t
5901 hfs_journal_relocate_callback(void *_args)
5902 {
5903 int error;
5904 struct hfs_journal_relocate_args *args = _args;
5905 struct hfsmount *hfsmp = args->hfsmp;
5906 buf_t bp;
5907 JournalInfoBlock *jibp;
5908
5909 error = buf_meta_bread(hfsmp->hfs_devvp,
5910 hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
5911 hfsmp->blockSize, vfs_context_ucred(args->context), &bp);
5912 if (error) {
5913 printf("hfs_reclaim_journal_file: failed to read JIB (%d)\n", error);
5914 if (bp) {
5915 buf_brelse(bp);
5916 }
5917 return error;
5918 }
5919 jibp = (JournalInfoBlock*) buf_dataptr(bp);
5920 jibp->offset = SWAP_BE64((u_int64_t)args->newStartBlock * hfsmp->blockSize);
5921 jibp->size = SWAP_BE64(hfsmp->jnl_size);
5922 if (journal_uses_fua(hfsmp->jnl))
5923 buf_markfua(bp);
5924 error = buf_bwrite(bp);
5925 if (error) {
5926 printf("hfs_reclaim_journal_file: failed to write JIB (%d)\n", error);
5927 return error;
5928 }
5929 if (!journal_uses_fua(hfsmp->jnl)) {
5930 error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, args->context);
5931 if (error) {
5932 printf("hfs_reclaim_journal_file: DKIOCSYNCHRONIZECACHE failed (%d)\n", error);
5933 error = 0; /* Don't fail the operation. */
5934 }
5935 }
5936
5937 return error;
5938 }
5939
5940
5941 static int
5942 hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
5943 {
5944 int error;
5945 int journal_err;
5946 int lockflags;
5947 u_int32_t oldStartBlock;
5948 u_int32_t newStartBlock;
5949 u_int32_t oldBlockCount;
5950 u_int32_t newBlockCount;
5951 struct cat_desc journal_desc;
5952 struct cat_attr journal_attr;
5953 struct cat_fork journal_fork;
5954 struct hfs_journal_relocate_args callback_args;
5955
5956 if (hfsmp->jnl_start + (hfsmp->jnl_size / hfsmp->blockSize) <= allocLimit) {
5957 /* The journal does not require relocation */
5958 return 0;
5959 }
5960
5961 error = hfs_start_transaction(hfsmp);
5962 if (error) {
5963 printf("hfs_reclaim_journal_file: hfs_start_transaction returned %d\n", error);
5964 return error;
5965 }
5966 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
5967
5968 oldBlockCount = hfsmp->jnl_size / hfsmp->blockSize;
5969
5970 /* TODO: Allow the journal to change size based on the new volume size. */
5971 error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
5972 HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS,
5973 &newStartBlock, &newBlockCount);
5974 if (error) {
5975 printf("hfs_reclaim_journal_file: BlockAllocate returned %d\n", error);
5976 goto fail;
5977 }
5978 if (newBlockCount != oldBlockCount) {
5979 printf("hfs_reclaim_journal_file: newBlockCount != oldBlockCount (%u, %u)\n", newBlockCount, oldBlockCount);
5980 goto free_fail;
5981 }
5982
5983 error = BlockDeallocate(hfsmp, hfsmp->jnl_start, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS);
5984 if (error) {
5985 printf("hfs_reclaim_journal_file: BlockDeallocate returned %d\n", error);
5986 goto free_fail;
5987 }
5988
5989 /* Update the catalog record for .journal */
5990 error = cat_idlookup(hfsmp, hfsmp->hfs_jnlfileid, 1, &journal_desc, &journal_attr, &journal_fork);
5991 if (error) {
5992 printf("hfs_reclaim_journal_file: cat_idlookup returned %d\n", error);
5993 goto free_fail;
5994 }
5995 oldStartBlock = journal_fork.cf_extents[0].startBlock;
5996 journal_fork.cf_size = newBlockCount * hfsmp->blockSize;
5997 journal_fork.cf_extents[0].startBlock = newStartBlock;
5998 journal_fork.cf_extents[0].blockCount = newBlockCount;
5999 journal_fork.cf_blocks = newBlockCount;
6000 error = cat_update(hfsmp, &journal_desc, &journal_attr, &journal_fork, NULL);
6001 cat_releasedesc(&journal_desc); /* all done with cat descriptor */
6002 if (error) {
6003 printf("hfs_reclaim_journal_file: cat_update returned %d\n", error);
6004 goto free_fail;
6005 }
6006 callback_args.hfsmp = hfsmp;
6007 callback_args.context = context;
6008 callback_args.newStartBlock = newStartBlock;
6009
6010 error = journal_relocate(hfsmp->jnl, (off_t)newStartBlock*hfsmp->blockSize,
6011 (off_t)newBlockCount*hfsmp->blockSize, 0,
6012 hfs_journal_relocate_callback, &callback_args);
6013 if (error) {
6014 /* NOTE: journal_relocate will mark the journal invalid. */
6015 printf("hfs_reclaim_journal_file: journal_relocate returned %d\n", error);
6016 goto fail;
6017 }
6018 hfsmp->jnl_start = newStartBlock;
6019 hfsmp->jnl_size = (off_t)newBlockCount * hfsmp->blockSize;
6020
6021 hfs_systemfile_unlock(hfsmp, lockflags);
6022 error = hfs_end_transaction(hfsmp);
6023 if (error) {
6024 printf("hfs_reclaim_journal_file: hfs_end_transaction returned %d\n", error);
6025 }
6026
6027 /* Account for the blocks relocated and print progress */
6028 hfsmp->hfs_resize_blocksmoved += oldBlockCount;
6029 hfs_truncatefs_progress(hfsmp);
6030 if (!error) {
6031 printf ("hfs_reclaim_journal_file: Relocated %u blocks from journal on \"%s\"\n",
6032 oldBlockCount, hfsmp->vcbVN);
6033 if (hfs_resize_debug) {
6034 printf ("hfs_reclaim_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
6035 }
6036 }
6037 return error;
6038
6039 free_fail:
6040 journal_err = BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
6041 if (journal_err) {
6042 printf("hfs_reclaim_journal_file: BlockDeallocate returned %d\n", error);
6043 hfs_mark_volume_inconsistent(hfsmp);
6044 }
6045 fail:
6046 hfs_systemfile_unlock(hfsmp, lockflags);
6047 (void) hfs_end_transaction(hfsmp);
6048 if (hfs_resize_debug) {
6049 printf ("hfs_reclaim_journal_file: Error relocating journal file (error=%d)\n", error);
6050 }
6051 return error;
6052 }
6053
6054
6055 /*
6056 * Move the journal info block to a new location. We have to make sure the
6057 * new copy of the journal info block gets to the media first, then change
6058 * the field in the volume header and the catalog record.
6059 */
6060 static int
6061 hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6062 {
6063 int error;
6064 int journal_err;
6065 int lockflags;
6066 u_int32_t oldBlock;
6067 u_int32_t newBlock;
6068 u_int32_t blockCount;
6069 struct cat_desc jib_desc;
6070 struct cat_attr jib_attr;
6071 struct cat_fork jib_fork;
6072 buf_t old_bp, new_bp;
6073
6074 if (hfsmp->vcbJinfoBlock <= allocLimit) {
6075 /* The journal info block does not require relocation */
6076 return 0;
6077 }
6078
6079 error = hfs_start_transaction(hfsmp);
6080 if (error) {
6081 printf("hfs_reclaim_journal_info_block: hfs_start_transaction returned %d\n", error);
6082 return error;
6083 }
6084 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
6085
6086 error = BlockAllocate(hfsmp, 1, 1, 1,
6087 HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS,
6088 &newBlock, &blockCount);
6089 if (error) {
6090 printf("hfs_reclaim_journal_info_block: BlockAllocate returned %d\n", error);
6091 goto fail;
6092 }
6093 if (blockCount != 1) {
6094 printf("hfs_reclaim_journal_info_block: blockCount != 1 (%u)\n", blockCount);
6095 goto free_fail;
6096 }
6097 error = BlockDeallocate(hfsmp, hfsmp->vcbJinfoBlock, 1, HFS_ALLOC_SKIPFREEBLKS);
6098 if (error) {
6099 printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error);
6100 goto free_fail;
6101 }
6102
6103 /* Copy the old journal info block content to the new location */
6104 error = buf_meta_bread(hfsmp->hfs_devvp,
6105 hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6106 hfsmp->blockSize, vfs_context_ucred(context), &old_bp);
6107 if (error) {
6108 printf("hfs_reclaim_journal_info_block: failed to read JIB (%d)\n", error);
6109 if (old_bp) {
6110 buf_brelse(old_bp);
6111 }
6112 goto free_fail;
6113 }
6114 new_bp = buf_getblk(hfsmp->hfs_devvp,
6115 newBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6116 hfsmp->blockSize, 0, 0, BLK_META);
6117 bcopy((char*)buf_dataptr(old_bp), (char*)buf_dataptr(new_bp), hfsmp->blockSize);
6118 buf_brelse(old_bp);
6119 if (journal_uses_fua(hfsmp->jnl))
6120 buf_markfua(new_bp);
6121 error = buf_bwrite(new_bp);
6122 if (error) {
6123 printf("hfs_reclaim_journal_info_block: failed to write new JIB (%d)\n", error);
6124 goto free_fail;
6125 }
6126 if (!journal_uses_fua(hfsmp->jnl)) {
6127 error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
6128 if (error) {
6129 printf("hfs_reclaim_journal_info_block: DKIOCSYNCHRONIZECACHE failed (%d)\n", error);
6130 /* Don't fail the operation. */
6131 }
6132 }
6133
6134 /* Update the catalog record for .journal_info_block */
6135 error = cat_idlookup(hfsmp, hfsmp->hfs_jnlinfoblkid, 1, &jib_desc, &jib_attr, &jib_fork);
6136 if (error) {
6137 printf("hfs_reclaim_journal_file: cat_idlookup returned %d\n", error);
6138 goto fail;
6139 }
6140 oldBlock = jib_fork.cf_extents[0].startBlock;
6141 jib_fork.cf_size = hfsmp->blockSize;
6142 jib_fork.cf_extents[0].startBlock = newBlock;
6143 jib_fork.cf_extents[0].blockCount = 1;
6144 jib_fork.cf_blocks = 1;
6145 error = cat_update(hfsmp, &jib_desc, &jib_attr, &jib_fork, NULL);
6146 cat_releasedesc(&jib_desc); /* all done with cat descriptor */
6147 if (error) {
6148 printf("hfs_reclaim_journal_info_block: cat_update returned %d\n", error);
6149 goto fail;
6150 }
6151
6152 /* Update the pointer to the journal info block in the volume header. */
6153 hfsmp->vcbJinfoBlock = newBlock;
6154 error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
6155 if (error) {
6156 printf("hfs_reclaim_journal_info_block: hfs_flushvolumeheader returned %d\n", error);
6157 goto fail;
6158 }
6159 hfs_systemfile_unlock(hfsmp, lockflags);
6160 error = hfs_end_transaction(hfsmp);
6161 if (error) {
6162 printf("hfs_reclaim_journal_info_block: hfs_end_transaction returned %d\n", error);
6163 }
6164 error = hfs_journal_flush(hfsmp, FALSE);
6165 if (error) {
6166 printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error);
6167 }
6168
6169 /* Account for the block relocated and print progress */
6170 hfsmp->hfs_resize_blocksmoved += 1;
6171 hfs_truncatefs_progress(hfsmp);
6172 if (!error) {
6173 printf ("hfs_reclaim_journal_info: Relocated 1 block from journal info on \"%s\"\n",
6174 hfsmp->vcbVN);
6175 if (hfs_resize_debug) {
6176 printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount);
6177 }
6178 }
6179 return error;
6180
6181 free_fail:
6182 journal_err = BlockDeallocate(hfsmp, newBlock, blockCount, HFS_ALLOC_SKIPFREEBLKS);
6183 if (journal_err) {
6184 printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error);
6185 hfs_mark_volume_inconsistent(hfsmp);
6186 }
6187
6188 fail:
6189 hfs_systemfile_unlock(hfsmp, lockflags);
6190 (void) hfs_end_transaction(hfsmp);
6191 if (hfs_resize_debug) {
6192 printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error);
6193 }
6194 return error;
6195 }
6196
6197
6198 /*
6199 * This function traverses through all extended attribute records for a given
6200 * fileID, and calls function that reclaims data blocks that exist in the
6201 * area of the disk being reclaimed which in turn is responsible for allocating
6202 * new space, copying extent data, deallocating new space, and if required,
6203 * splitting the extent.
6204 *
6205 * Note: The caller has already acquired the cnode lock on the file. Therefore
6206 * we are assured that no other thread would be creating/deleting/modifying
6207 * extended attributes for this file.
6208 *
6209 * Side Effects:
6210 * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
6211 * blocks that were relocated.
6212 *
6213 * Returns:
6214 * 0 on success, non-zero on failure.
6215 */
static int
hfs_reclaim_xattr(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, u_int32_t allocLimit, vfs_context_t context)
{
	int error = 0;
	struct hfs_reclaim_extent_info *extent_info;
	int i;
	HFSPlusAttrKey *key;
	int *lockflags;

	if (hfs_resize_debug) {
		printf("hfs_reclaim_xattr: === Start reclaiming xattr for id=%u ===\n", fileID);
	}

	/* Per-file relocation state shared with hfs_reclaim_extent() */
	MALLOC(extent_info, struct hfs_reclaim_extent_info *,
	       sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
	if (extent_info == NULL) {
		return ENOMEM;
	}
	bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
	extent_info->vp = vp;
	extent_info->fileID = fileID;
	extent_info->is_xattr = true;
	extent_info->is_sysfile = vnode_issystem(vp);
	extent_info->fcb = VTOF(hfsmp->hfs_attribute_vp);
	/* Alias the lockflags field so hfs_reclaim_extent() sees updates too */
	lockflags = &(extent_info->lockflags);
	*lockflags = SFL_ATTRIBUTE | SFL_BITMAP;

	/* Initialize iterator from the extent_info structure */
	MALLOC(extent_info->iterator, struct BTreeIterator *,
	       sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
	if (extent_info->iterator == NULL) {
		error = ENOMEM;
		goto out;
	}
	bzero(extent_info->iterator, sizeof(struct BTreeIterator));

	/* Build attribute key (NULL name => positions before this file's xattrs) */
	key = (HFSPlusAttrKey *)&(extent_info->iterator->key);
	error = hfs_buildattrkey(fileID, NULL, key);
	if (error) {
		goto out;
	}

	/* Initialize btdata from extent_info structure. Note that the
	 * buffer pointer actually points to the xattr record from the
	 * extent_info structure itself.
	 */
	extent_info->btdata.bufferAddress = &(extent_info->record.xattr);
	extent_info->btdata.itemSize = sizeof(HFSPlusAttrRecord);
	extent_info->btdata.itemCount = 1;

	/*
	 * Sync all extent-based attribute data to the disk.
	 *
	 * All extent-based attribute data I/O is performed via cluster
	 * I/O using a virtual file that spans across entire file system
	 * space.
	 */
	hfs_lock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_EXCLUSIVE_LOCK);
	(void)cluster_push(hfsmp->hfs_attrdata_vp, 0);
	error = vnode_waitforwrites(hfsmp->hfs_attrdata_vp, 0, 0, 0, "hfs_reclaim_xattr");
	hfs_unlock_truncate(VTOC(hfsmp->hfs_attrdata_vp), 0);
	if (error) {
		goto out;
	}

	/* Search for extended attribute for current file. This
	 * will place the iterator before the first matching record.
	 */
	*lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
	error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
			&(extent_info->btdata), &(extent_info->recordlen),
			extent_info->iterator);
	hfs_systemfile_unlock(hfsmp, *lockflags);
	if (error) {
		if (error != btNotFound) {
			goto out;
		}
		/* btNotFound is expected here, so just mask it */
		error = 0;
	}

	while (1) {
		/* Iterate to the next record; the first iteration yields the
		 * first record for this fileID because the search above left
		 * the iterator positioned just before it.
		 */
		*lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
		error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
				extent_info->iterator, &(extent_info->btdata),
				&(extent_info->recordlen));
		hfs_systemfile_unlock(hfsmp, *lockflags);

		/* Stop the iteration if we encounter end of btree or xattr with different fileID */
		if (error || key->fileID != fileID) {
			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
				error = 0;
			}
			break;
		}

		/* We only care about extent-based EAs */
		if ((extent_info->record.xattr.recordType != kHFSPlusAttrForkData) &&
		    (extent_info->record.xattr.recordType != kHFSPlusAttrExtents)) {
			continue;
		}

		/* Point extent_info at whichever extent array this record carries */
		if (extent_info->record.xattr.recordType == kHFSPlusAttrForkData) {
			extent_info->overflow_count = 0;
			extent_info->extents = extent_info->record.xattr.forkData.theFork.extents;
		} else if (extent_info->record.xattr.recordType == kHFSPlusAttrExtents) {
			extent_info->overflow_count++;
			extent_info->extents = extent_info->record.xattr.overflowExtents.extents;
		}

		extent_info->recStartBlock = key->startBlock;
		for (i = 0; i < kHFSPlusExtentDensity; i++) {
			/* A zero-length extent terminates the record */
			if (extent_info->extents[i].blockCount == 0) {
				break;
			}
			extent_info->extent_index = i;
			error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
			if (error) {
				printf ("hfs_reclaim_xattr: fileID=%u hfs_reclaim_extent error=%d\n", fileID, error);
				goto out;
			}
		}
	}

out:
	/* If any blocks were relocated, account them and report progress */
	if (extent_info->blocks_relocated) {
		hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated;
		hfs_truncatefs_progress(hfsmp);
	}
	if (extent_info->iterator) {
		FREE(extent_info->iterator, M_TEMP);
	}
	if (extent_info) {
		FREE(extent_info, M_TEMP);
	}
	if (hfs_resize_debug) {
		printf("hfs_reclaim_xattr: === Finished relocating xattr for fileid=%u (error=%d) ===\n", fileID, error);
	}
	return error;
}
6359
6360 /*
6361 * Reclaim any extent-based extended attributes allocation blocks from
6362 * the area of the disk that is being truncated.
6363 *
6364 * The function traverses the attribute btree to find out the fileIDs
6365 * of the extended attributes that need to be relocated. For every
6366 * file whose large EA requires relocation, it looks up the cnode and
6367 * calls hfs_reclaim_xattr() to do all the work for allocating
6368 * new space, copying data, deallocating old space, and if required,
6369 * splitting the extents.
6370 *
6371 * Inputs:
6372 * allocLimit - starting block of the area being reclaimed
6373 *
6374 * Returns:
6375 * returns 0 on success, non-zero on failure.
6376 */
static int
hfs_reclaim_xattrspace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
{
	int error = 0;
	FCB *fcb;
	struct BTreeIterator *iterator = NULL;
	struct FSBufferDescriptor btdata;
	HFSPlusAttrKey *key;
	HFSPlusAttrRecord rec;
	int lockflags = 0;
	cnid_t prev_fileid = 0;
	struct vnode *vp;
	int need_relocate;
	int btree_operation;
	u_int32_t files_moved = 0;
	u_int32_t prev_blocksmoved;
	int i;

	fcb = VTOF(hfsmp->hfs_attribute_vp);
	/* Store the value to print total blocks moved by this function in end */
	prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;

	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
		return ENOMEM;
	}
	bzero(iterator, sizeof(*iterator));
	key = (HFSPlusAttrKey *)&iterator->key;
	btdata.bufferAddress = &rec;
	btdata.itemSize = sizeof(rec);
	btdata.itemCount = 1;

	need_relocate = false;
	btree_operation = kBTreeFirstRecord;
	/* Traverse the attribute btree to find extent-based EAs to reclaim.
	 * The shared lock is dropped between records; the per-file cnode lock
	 * taken below is what keeps each file's xattrs stable while moving.
	 */
	while (1) {
		lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK);
		error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
		hfs_systemfile_unlock(hfsmp, lockflags);
		if (error) {
			/* Exhausting the btree is the normal termination, not a failure */
			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
				error = 0;
			}
			break;
		}
		btree_operation = kBTreeNextRecord;

		/* If the extents of current fileID were already relocated, skip it */
		if (prev_fileid == key->fileID) {
			continue;
		}

		/* Check if any of the extents in the current record need to be relocated */
		need_relocate = false;
		switch(rec.recordType) {
		case kHFSPlusAttrForkData:
			for (i = 0; i < kHFSPlusExtentDensity; i++) {
				if (rec.forkData.theFork.extents[i].blockCount == 0) {
					break;
				}
				/* Extent reaches into the region being truncated away */
				if ((rec.forkData.theFork.extents[i].startBlock +
				     rec.forkData.theFork.extents[i].blockCount) > allocLimit) {
					need_relocate = true;
					break;
				}
			}
			break;

		case kHFSPlusAttrExtents:
			for (i = 0; i < kHFSPlusExtentDensity; i++) {
				if (rec.overflowExtents.extents[i].blockCount == 0) {
					break;
				}
				if ((rec.overflowExtents.extents[i].startBlock +
				     rec.overflowExtents.extents[i].blockCount) > allocLimit) {
					need_relocate = true;
					break;
				}
			}
			break;
		};

		/* Continue iterating to next attribute record */
		if (need_relocate == false) {
			continue;
		}

		/* Look up the vnode for corresponding file. The cnode
		 * will be locked which will ensure that no one modifies
		 * the xattrs when we are relocating them.
		 *
		 * We want to allow open-unlinked files to be moved,
		 * so provide allow_deleted == 1 for hfs_vget().
		 */
		if (hfs_vget(hfsmp, key->fileID, &vp, 0, 1) != 0) {
			continue;
		}

		error = hfs_reclaim_xattr(hfsmp, vp, key->fileID, allocLimit, context);
		hfs_unlock(VTOC(vp));
		vnode_put(vp);
		if (error) {
			printf ("hfs_reclaim_xattrspace: Error relocating xattrs for fileid=%u (error=%d)\n", key->fileID, error);
			break;
		}
		/* Remember this file so later records for it are skipped above */
		prev_fileid = key->fileID;
		files_moved++;
	}

	if (files_moved) {
		printf("hfs_reclaim_xattrspace: Relocated %u xattr blocks from %u files on \"%s\"\n",
			(hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
			files_moved, hfsmp->vcbVN);
	}

	kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
	return error;
}
6494
6495 /*
6496 * Reclaim blocks from regular files.
6497 *
6498 * This function iterates over all the record in catalog btree looking
6499 * for files with extents that overlap into the space we're trying to
6500 * free up. If a file extent requires relocation, it looks up the vnode
6501 * and calls function to relocate the data.
6502 *
6503 * Returns:
6504 * Zero on success, non-zero on failure.
6505 */
static int
hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
{
	int error;
	FCB *fcb;
	struct BTreeIterator *iterator = NULL;	/* heap allocated: too large for the kernel stack */
	struct FSBufferDescriptor btdata;
	int btree_operation;
	int lockflags;
	struct HFSPlusCatalogFile filerec;
	struct vnode *vp;
	struct vnode *rvp;
	struct filefork *datafork;
	u_int32_t files_moved = 0;
	u_int32_t prev_blocksmoved;

	fcb = VTOF(hfsmp->hfs_catalog_vp);
	/* Store the value to print total blocks moved by this function at the end */
	prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;

	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
		return ENOMEM;
	}
	bzero(iterator, sizeof(*iterator));

	btdata.bufferAddress = &filerec;
	btdata.itemSize = sizeof(filerec);
	btdata.itemCount = 1;

	btree_operation = kBTreeFirstRecord;
	while (1) {
		/* Hold the catalog lock only for the lookup itself; it is dropped
		 * before the cnode lock is taken via hfs_vget() below.
		 */
		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
		error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
		hfs_systemfile_unlock(hfsmp, lockflags);
		if (error) {
			/* Record-not-found / end-of-iteration simply terminate the scan. */
			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
				error = 0;
			}
			break;
		}
		btree_operation = kBTreeNextRecord;

		/* Only file records carry fork extents; skip everything else. */
		if (filerec.recordType != kHFSPlusFileRecord) {
			continue;
		}

		/* Check if any of the extents require relocation */
		if (hfs_file_extent_overlaps(hfsmp, allocLimit, &filerec) == false) {
			continue;
		}

		/* We want to allow open-unlinked files to be moved, so allow_deleted == 1 */
		if (hfs_vget(hfsmp, filerec.fileID, &vp, 0, 1) != 0) {
			/* Lookup failure is non-fatal; move on to the next record. */
			continue;
		}

		/* If data fork exists or item is a directory hard link, relocate blocks */
		datafork = VTOF(vp);
		if ((datafork && datafork->ff_blocks > 0) || vnode_isdir(vp)) {
			error = hfs_reclaim_file(hfsmp, vp, filerec.fileID,
					kHFSDataForkType, allocLimit, context);
			if (error)  {
				printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
				hfs_unlock(VTOC(vp));
				vnode_put(vp);
				break;
			}
		}

		/* If resource fork exists or item is a directory hard link, relocate blocks.
		 * c_blocks minus the data fork's blocks is what the resource fork holds.
		 */
		if (((VTOC(vp)->c_blocks - (datafork ? datafork->ff_blocks : 0)) > 0) || vnode_isdir(vp)) {
			if (vnode_isdir(vp)) {
				/* Resource fork vnode lookup is invalid for directory hard link.
				 * So we fake data fork vnode as resource fork vnode.
				 */
				rvp = vp;
			} else {
				error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE);
				if (error) {
					printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", filerec.fileID, error);
					hfs_unlock(VTOC(vp));
					vnode_put(vp);
					break;
				}
				/* NOTE(review): rvp's iocount is not dropped in this loop;
				 * C_NEED_RVNODE_PUT appears to defer that to cnode
				 * teardown — TODO confirm against cnode reclaim code.
				 */
				VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT;
			}

			error = hfs_reclaim_file(hfsmp, rvp, filerec.fileID,
					kHFSResourceForkType, allocLimit, context);
			if (error) {
				printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
				hfs_unlock(VTOC(vp));
				vnode_put(vp);
				break;
			}
		}

		/* The file forks were relocated successfully, now drop the
		 * cnode lock and vnode reference, and continue iterating to
		 * next catalog record.
		 */
		hfs_unlock(VTOC(vp));
		vnode_put(vp);
		files_moved++;
	}

	if (files_moved) {
		printf("hfs_reclaim_filespace: Relocated %u blocks from %u files on \"%s\"\n",
			(hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
			files_moved, hfsmp->vcbVN);
	}

	kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
	return error;
}
6621
6622 /*
6623 * Reclaim space at the end of a file system.
6624 *
6625 * Inputs -
6626 * allocLimit - start block of the space being reclaimed
6627 * reclaimblks - number of allocation blocks to reclaim
6628 */
6629 static int
6630 hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context)
6631 {
6632 int error = 0;
6633
6634 /*
6635 * Preflight the bitmap to find out total number of blocks that need
6636 * relocation.
6637 *
6638 * Note: Since allocLimit is set to the location of new alternate volume
6639 * header, the check below does not account for blocks allocated for old
6640 * alternate volume header.
6641 */
6642 error = hfs_count_allocated(hfsmp, allocLimit, reclaimblks, &(hfsmp->hfs_resize_totalblocks));
6643 if (error) {
6644 printf ("hfs_reclaimspace: Unable to determine total blocks to reclaim error=%d\n", error);
6645 return error;
6646 }
6647 if (hfs_resize_debug) {
6648 printf ("hfs_reclaimspace: Total number of blocks to reclaim = %u\n", hfsmp->hfs_resize_totalblocks);
6649 }
6650
6651 /* Just to be safe, sync the content of the journal to the disk before we proceed */
6652 hfs_journal_flush(hfsmp, TRUE);
6653
6654 /* First, relocate journal file blocks if they're in the way.
6655 * Doing this first will make sure that journal relocate code
6656 * gets access to contiguous blocks on disk first. The journal
6657 * file has to be contiguous on the disk, otherwise resize will
6658 * fail.
6659 */
6660 error = hfs_reclaim_journal_file(hfsmp, allocLimit, context);
6661 if (error) {
6662 printf("hfs_reclaimspace: hfs_reclaim_journal_file failed (%d)\n", error);
6663 return error;
6664 }
6665
6666 /* Relocate journal info block blocks if they're in the way. */
6667 error = hfs_reclaim_journal_info_block(hfsmp, allocLimit, context);
6668 if (error) {
6669 printf("hfs_reclaimspace: hfs_reclaim_journal_info_block failed (%d)\n", error);
6670 return error;
6671 }
6672
6673 /* Relocate extents of the Extents B-tree if they're in the way.
6674 * Relocating extents btree before other btrees is important as
6675 * this will provide access to largest contiguous block range on
6676 * the disk for relocating extents btree. Note that extents btree
6677 * can only have maximum of 8 extents.
6678 */
6679 error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, kHFSExtentsFileID,
6680 kHFSDataForkType, allocLimit, context);
6681 if (error) {
6682 printf("hfs_reclaimspace: reclaim extents b-tree returned %d\n", error);
6683 return error;
6684 }
6685
6686 /* Relocate extents of the Allocation file if they're in the way. */
6687 error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, kHFSAllocationFileID,
6688 kHFSDataForkType, allocLimit, context);
6689 if (error) {
6690 printf("hfs_reclaimspace: reclaim allocation file returned %d\n", error);
6691 return error;
6692 }
6693
6694 /* Relocate extents of the Catalog B-tree if they're in the way. */
6695 error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, kHFSCatalogFileID,
6696 kHFSDataForkType, allocLimit, context);
6697 if (error) {
6698 printf("hfs_reclaimspace: reclaim catalog b-tree returned %d\n", error);
6699 return error;
6700 }
6701
6702 /* Relocate extents of the Attributes B-tree if they're in the way. */
6703 error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, kHFSAttributesFileID,
6704 kHFSDataForkType, allocLimit, context);
6705 if (error) {
6706 printf("hfs_reclaimspace: reclaim attribute b-tree returned %d\n", error);
6707 return error;
6708 }
6709
6710 /* Relocate extents of the Startup File if there is one and they're in the way. */
6711 error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, kHFSStartupFileID,
6712 kHFSDataForkType, allocLimit, context);
6713 if (error) {
6714 printf("hfs_reclaimspace: reclaim startup file returned %d\n", error);
6715 return error;
6716 }
6717
6718 /*
6719 * We need to make sure the alternate volume header gets flushed if we moved
6720 * any extents in the volume header. But we need to do that before
6721 * shrinking the size of the volume, or else the journal code will panic
6722 * with an invalid (too large) block number.
6723 *
6724 * Note that blks_moved will be set if ANY extent was moved, even
6725 * if it was just an overflow extent. In this case, the journal_flush isn't
6726 * strictly required, but shouldn't hurt.
6727 */
6728 if (hfsmp->hfs_resize_blocksmoved) {
6729 hfs_journal_flush(hfsmp, TRUE);
6730 }
6731
6732 /* Reclaim extents from catalog file records */
6733 error = hfs_reclaim_filespace(hfsmp, allocLimit, context);
6734 if (error) {
6735 printf ("hfs_reclaimspace: hfs_reclaim_filespace returned error=%d\n", error);
6736 return error;
6737 }
6738
6739 /* Reclaim extents from extent-based extended attributes, if any */
6740 error = hfs_reclaim_xattrspace(hfsmp, allocLimit, context);
6741 if (error) {
6742 printf ("hfs_reclaimspace: hfs_reclaim_xattrspace returned error=%d\n", error);
6743 return error;
6744 }
6745
6746 return error;
6747 }
6748
6749
6750 /*
6751 * Check if there are any extents (including overflow extents) that overlap
6752 * into the disk space that is being reclaimed.
6753 *
6754 * Output -
6755 * true - One of the extents need to be relocated
6756 * false - No overflow extents need to be relocated, or there was an error
6757 */
static int
hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec)
{
	struct BTreeIterator * iterator = NULL;
	struct FSBufferDescriptor btdata;
	HFSPlusExtentRecord extrec;
	HFSPlusExtentKey *extkeyptr;
	FCB *fcb;
	int overlapped = false;
	int i, j;
	int error;
	int lockflags = 0;
	u_int32_t endblock;

	/* Check if data fork overlaps the target space */
	for (i = 0; i < kHFSPlusExtentDensity; ++i) {
		/* A zero blockCount terminates the in-record extent list. */
		if (filerec->dataFork.extents[i].blockCount == 0) {
			break;
		}
		endblock = filerec->dataFork.extents[i].startBlock +
			filerec->dataFork.extents[i].blockCount;
		if (endblock > allocLimit) {
			overlapped = true;
			goto out;
		}
	}

	/* Check if resource fork overlaps the target space */
	for (j = 0; j < kHFSPlusExtentDensity; ++j) {
		if (filerec->resourceFork.extents[j].blockCount == 0) {
			break;
		}
		endblock = filerec->resourceFork.extents[j].startBlock +
			filerec->resourceFork.extents[j].blockCount;
		if (endblock > allocLimit) {
			overlapped = true;
			goto out;
		}
	}

	/* Return back if there are no overflow extents for this file:
	 * neither fork filled all eight in-record extent slots, so nothing
	 * can have spilled into the extents overflow b-tree.
	 */
	if ((i < kHFSPlusExtentDensity) && (j < kHFSPlusExtentDensity)) {
		goto out;
	}

	/* Allocation failure reports "no overlap", per the header comment's
	 * "false ... or there was an error" contract.  No lock is held yet,
	 * so a direct return is safe here.
	 */
	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
		return 0;
	}
	bzero(iterator, sizeof(*iterator));
	extkeyptr = (HFSPlusExtentKey *)&iterator->key;
	extkeyptr->keyLength = kHFSPlusExtentKeyMaximumLength;
	extkeyptr->forkType = 0;
	extkeyptr->fileID = filerec->fileID;
	extkeyptr->startBlock = 0;

	btdata.bufferAddress = &extrec;
	btdata.itemSize = sizeof(extrec);
	btdata.itemCount = 1;

	fcb = VTOF(hfsmp->hfs_extents_vp);

	lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);

	/* This will position the iterator just before the first overflow
	 * extent record for given fileID.  It will always return btNotFound,
	 * so we special case the error code.
	 */
	error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator);
	if (error && (error != btNotFound)) {
		/* Any real search error also falls out as "no overlap". */
		goto out;
	}

	/* BTIterateRecord() might return error if the btree is empty, and
	 * therefore we return that the extent does not overflow to the caller
	 */
	error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
	while (error == 0) {
		/* Stop when we encounter a different file. */
		if (extkeyptr->fileID != filerec->fileID) {
			break;
		}
		/* Check if any of the forks exist in the target space. */
		for (i = 0; i < kHFSPlusExtentDensity; ++i) {
			if (extrec[i].blockCount == 0) {
				break;
			}
			endblock = extrec[i].startBlock + extrec[i].blockCount;
			if (endblock > allocLimit) {
				overlapped = true;
				goto out;
			}
		}
		/* Look for more records. */
		error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
	}

out:
	/* lockflags / iterator are only set on the overflow path; the early
	 * in-record exits arrive here with both still zero/NULL. */
	if (lockflags) {
		hfs_systemfile_unlock(hfsmp, lockflags);
	}
	if (iterator) {
		kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
	}
	return overlapped;
}
6863
6864
6865 /*
6866 * Calculate the progress of a file system resize operation.
6867 */
6868 __private_extern__
6869 int
6870 hfs_resize_progress(struct hfsmount *hfsmp, u_int32_t *progress)
6871 {
6872 if ((hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) == 0) {
6873 return (ENXIO);
6874 }
6875
6876 if (hfsmp->hfs_resize_totalblocks > 0) {
6877 *progress = (u_int32_t)((hfsmp->hfs_resize_blocksmoved * 100ULL) / hfsmp->hfs_resize_totalblocks);
6878 } else {
6879 *progress = 0;
6880 }
6881
6882 return (0);
6883 }
6884
6885
6886 /*
6887 * Creates a UUID from a unique "name" in the HFS UUID Name space.
6888 * See version 3 UUID.
6889 */
6890 static void
6891 hfs_getvoluuid(struct hfsmount *hfsmp, uuid_t result)
6892 {
6893 MD5_CTX md5c;
6894 uint8_t rawUUID[8];
6895
6896 ((uint32_t *)rawUUID)[0] = hfsmp->vcbFndrInfo[6];
6897 ((uint32_t *)rawUUID)[1] = hfsmp->vcbFndrInfo[7];
6898
6899 MD5Init( &md5c );
6900 MD5Update( &md5c, HFS_UUID_NAMESPACE_ID, sizeof( uuid_t ) );
6901 MD5Update( &md5c, rawUUID, sizeof (rawUUID) );
6902 MD5Final( result, &md5c );
6903
6904 result[6] = 0x30 | ( result[6] & 0x0F );
6905 result[8] = 0x80 | ( result[8] & 0x3F );
6906 }
6907
6908 /*
6909 * Get file system attributes.
6910 */
static int
hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context)
{
	/* Attribute masks trimmed to what HFS actually supports. */
#define HFS_ATTR_CMN_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST))
#define HFS_ATTR_FILE_VALIDMASK (ATTR_FILE_VALIDMASK & ~(ATTR_FILE_FILETYPE | ATTR_FILE_FORKCOUNT | ATTR_FILE_FORKLIST))
#define HFS_ATTR_CMN_VOL_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST | ATTR_CMN_ACCTIME))

	ExtendedVCB *vcb = VFSTOVCB(mp);
	struct hfsmount *hfsmp = VFSTOHFS(mp);
	u_int32_t freeCNIDs;

	/* CNIDs remaining before the 32-bit catalog-node-id space runs out. */
	freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)hfsmp->vcbNxtCNID;

	VFSATTR_RETURN(fsap, f_objcount, (u_int64_t)hfsmp->vcbFilCnt + (u_int64_t)hfsmp->vcbDirCnt);
	VFSATTR_RETURN(fsap, f_filecount, (u_int64_t)hfsmp->vcbFilCnt);
	VFSATTR_RETURN(fsap, f_dircount, (u_int64_t)hfsmp->vcbDirCnt);
	VFSATTR_RETURN(fsap, f_maxobjcount, (u_int64_t)0xFFFFFFFF);
	VFSATTR_RETURN(fsap, f_iosize, (size_t)cluster_max_io_size(mp, 0));
	VFSATTR_RETURN(fsap, f_blocks, (u_int64_t)hfsmp->totalBlocks);
	/* f_bfree ignores reserves (flag 0); f_bavail honors them (flag 1). */
	VFSATTR_RETURN(fsap, f_bfree, (u_int64_t)hfs_freeblks(hfsmp, 0));
	VFSATTR_RETURN(fsap, f_bavail, (u_int64_t)hfs_freeblks(hfsmp, 1));
	VFSATTR_RETURN(fsap, f_bsize, (u_int32_t)vcb->blockSize);
	/* XXX needs clarification */
	VFSATTR_RETURN(fsap, f_bused, hfsmp->totalBlocks - hfs_freeblks(hfsmp, 1));
	/* Maximum files is constrained by total blocks. */
	VFSATTR_RETURN(fsap, f_files, (u_int64_t)(hfsmp->totalBlocks - 2));
	/* Free-file count is bounded by both free CNIDs and free blocks. */
	VFSATTR_RETURN(fsap, f_ffree, MIN((u_int64_t)freeCNIDs, (u_int64_t)hfs_freeblks(hfsmp, 1)));

	fsap->f_fsid.val[0] = hfsmp->hfs_raw_dev;
	fsap->f_fsid.val[1] = vfs_typenum(mp);
	VFSATTR_SET_SUPPORTED(fsap, f_fsid);

	VFSATTR_RETURN(fsap, f_signature, vcb->vcbSigWord);
	VFSATTR_RETURN(fsap, f_carbon_fsid, 0);

	if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) {
		vol_capabilities_attr_t *cap;
	
		cap = &fsap->f_capabilities;

		/* HFS Standard advertises far fewer format features than HFS Plus. */
		if (hfsmp->hfs_flags & HFS_STANDARD) {
			cap->capabilities[VOL_CAPABILITIES_FORMAT] =
				VOL_CAP_FMT_PERSISTENTOBJECTIDS |
				VOL_CAP_FMT_CASE_PRESERVING |
				VOL_CAP_FMT_FAST_STATFS |
				VOL_CAP_FMT_HIDDEN_FILES |
				VOL_CAP_FMT_PATH_FROM_ID;
		} else {
			/* HFS Plus: journal-active and case-sensitivity bits
			 * reflect this particular mount's state. */
			cap->capabilities[VOL_CAPABILITIES_FORMAT] =
				VOL_CAP_FMT_PERSISTENTOBJECTIDS |
				VOL_CAP_FMT_SYMBOLICLINKS |
				VOL_CAP_FMT_HARDLINKS |
				VOL_CAP_FMT_JOURNAL |
				VOL_CAP_FMT_ZERO_RUNS |
				(hfsmp->jnl ? VOL_CAP_FMT_JOURNAL_ACTIVE : 0) |
				(hfsmp->hfs_flags & HFS_CASE_SENSITIVE ? VOL_CAP_FMT_CASE_SENSITIVE : 0) |
				VOL_CAP_FMT_CASE_PRESERVING |
				VOL_CAP_FMT_FAST_STATFS |
				VOL_CAP_FMT_2TB_FILESIZE |
				VOL_CAP_FMT_HIDDEN_FILES |
#if HFS_COMPRESSION
				VOL_CAP_FMT_PATH_FROM_ID |
				VOL_CAP_FMT_DECMPFS_COMPRESSION;
#else
				VOL_CAP_FMT_PATH_FROM_ID;
#endif
		}
		cap->capabilities[VOL_CAPABILITIES_INTERFACES] =
			VOL_CAP_INT_SEARCHFS |
			VOL_CAP_INT_ATTRLIST |
			VOL_CAP_INT_NFSEXPORT |
			VOL_CAP_INT_READDIRATTR |
			VOL_CAP_INT_EXCHANGEDATA |
			VOL_CAP_INT_ALLOCATE |
			VOL_CAP_INT_VOL_RENAME |
			VOL_CAP_INT_ADVLOCK |
			VOL_CAP_INT_FLOCK |
#if NAMEDSTREAMS
			VOL_CAP_INT_EXTENDED_ATTR |
			VOL_CAP_INT_NAMEDSTREAMS;
#else
			VOL_CAP_INT_EXTENDED_ATTR;
#endif
		cap->capabilities[VOL_CAPABILITIES_RESERVED1] = 0;
		cap->capabilities[VOL_CAPABILITIES_RESERVED2] = 0;

		/* The valid[] masks list every bit whose state capabilities[]
		 * reports.  Note VOL_CAP_INT_COPYFILE and VOL_CAP_INT_MANLOCK
		 * appear in valid[] but not capabilities[] above, i.e. they
		 * are recognized but reported as unsupported. */
		cap->valid[VOL_CAPABILITIES_FORMAT] =
			VOL_CAP_FMT_PERSISTENTOBJECTIDS |
			VOL_CAP_FMT_SYMBOLICLINKS |
			VOL_CAP_FMT_HARDLINKS |
			VOL_CAP_FMT_JOURNAL |
			VOL_CAP_FMT_JOURNAL_ACTIVE |
			VOL_CAP_FMT_NO_ROOT_TIMES |
			VOL_CAP_FMT_SPARSE_FILES |
			VOL_CAP_FMT_ZERO_RUNS |
			VOL_CAP_FMT_CASE_SENSITIVE |
			VOL_CAP_FMT_CASE_PRESERVING |
			VOL_CAP_FMT_FAST_STATFS |
			VOL_CAP_FMT_2TB_FILESIZE |
			VOL_CAP_FMT_OPENDENYMODES |
			VOL_CAP_FMT_HIDDEN_FILES |
#if HFS_COMPRESSION
			VOL_CAP_FMT_PATH_FROM_ID |
			VOL_CAP_FMT_DECMPFS_COMPRESSION;
#else
			VOL_CAP_FMT_PATH_FROM_ID;
#endif
		cap->valid[VOL_CAPABILITIES_INTERFACES] =
			VOL_CAP_INT_SEARCHFS |
			VOL_CAP_INT_ATTRLIST |
			VOL_CAP_INT_NFSEXPORT |
			VOL_CAP_INT_READDIRATTR |
			VOL_CAP_INT_EXCHANGEDATA |
			VOL_CAP_INT_COPYFILE |
			VOL_CAP_INT_ALLOCATE |
			VOL_CAP_INT_VOL_RENAME |
			VOL_CAP_INT_ADVLOCK |
			VOL_CAP_INT_FLOCK |
			VOL_CAP_INT_MANLOCK |
#if NAMEDSTREAMS
			VOL_CAP_INT_EXTENDED_ATTR |
			VOL_CAP_INT_NAMEDSTREAMS;
#else
			VOL_CAP_INT_EXTENDED_ATTR;
#endif
		cap->valid[VOL_CAPABILITIES_RESERVED1] = 0;
		cap->valid[VOL_CAPABILITIES_RESERVED2] = 0;
		VFSATTR_SET_SUPPORTED(fsap, f_capabilities);
	}
	if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) {
		vol_attributes_attr_t *attrp = &fsap->f_attributes;

		attrp->validattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK;
		attrp->validattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO;
		attrp->validattr.dirattr = ATTR_DIR_VALIDMASK;
		attrp->validattr.fileattr = HFS_ATTR_FILE_VALIDMASK;
		attrp->validattr.forkattr = 0;

		attrp->nativeattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK;
		attrp->nativeattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO;
		attrp->nativeattr.dirattr = ATTR_DIR_VALIDMASK;
		attrp->nativeattr.fileattr = HFS_ATTR_FILE_VALIDMASK;
		attrp->nativeattr.forkattr = 0;
		VFSATTR_SET_SUPPORTED(fsap, f_attributes);
	}
	fsap->f_create_time.tv_sec = hfsmp->hfs_itime;
	fsap->f_create_time.tv_nsec = 0;
	VFSATTR_SET_SUPPORTED(fsap, f_create_time);
	fsap->f_modify_time.tv_sec = hfsmp->vcbLsMod;
	fsap->f_modify_time.tv_nsec = 0;
	VFSATTR_SET_SUPPORTED(fsap, f_modify_time);

	fsap->f_backup_time.tv_sec = hfsmp->vcbVolBkUp;
	fsap->f_backup_time.tv_nsec = 0;
	VFSATTR_SET_SUPPORTED(fsap, f_backup_time);
	if (VFSATTR_IS_ACTIVE(fsap, f_fssubtype)) {
		u_int16_t subtype = 0;

		/*
		 * Subtypes (flavors) for HFS
		 *	0:	Mac OS Extended
		 *	1:	Mac OS Extended (Journaled)
		 *	2:	Mac OS Extended (Case Sensitive)
		 *	3:	Mac OS Extended (Case Sensitive, Journaled)
		 *	4 - 127:	Reserved
		 *	128:	Mac OS Standard
		 *
		 */
		if (hfsmp->hfs_flags & HFS_STANDARD) {
			subtype = HFS_SUBTYPE_STANDARDHFS;
		} else /* HFS Plus */ {
			if (hfsmp->jnl)
				subtype |= HFS_SUBTYPE_JOURNALED;
			if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE)
				subtype |= HFS_SUBTYPE_CASESENSITIVE;
		}
		fsap->f_fssubtype = subtype;
		VFSATTR_SET_SUPPORTED(fsap, f_fssubtype);
	}

	if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
		strlcpy(fsap->f_vol_name, (char *) hfsmp->vcbVN, MAXPATHLEN);
		VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
	}
	if (VFSATTR_IS_ACTIVE(fsap, f_uuid)) {
		hfs_getvoluuid(hfsmp, fsap->f_uuid);
		VFSATTR_SET_SUPPORTED(fsap, f_uuid);
	}
	return (0);
}
7101
7102 /*
7103 * Perform a volume rename. Requires the FS' root vp.
7104 */
static int
hfs_rename_volume(struct vnode *vp, const char *name, proc_t p)
{
	ExtendedVCB *vcb = VTOVCB(vp);
	struct cnode *cp = VTOC(vp);
	struct hfsmount *hfsmp = VTOHFS(vp);
	struct cat_desc to_desc;
	struct cat_desc todir_desc;
	struct cat_desc new_desc;
	cat_cookie_t cookie;
	int lockflags;
	int error = 0;
	char converted_volname[256];
	size_t volname_length = 0;
	size_t conv_volname_length = 0;

	
	/*
	 * Ignore attempts to rename a volume to a zero-length name.
	 */
	if (name[0] == 0)
		return(0);

	bzero(&to_desc, sizeof(to_desc));
	bzero(&todir_desc, sizeof(todir_desc));
	bzero(&new_desc, sizeof(new_desc));
	bzero(&cookie, sizeof(cookie));

	/* Destination directory: the root's parent, where the volume's
	 * name lives as the root folder's catalog entry. */
	todir_desc.cd_parentcnid = kHFSRootParentID;
	todir_desc.cd_cnid = kHFSRootFolderID;
	todir_desc.cd_flags = CD_ISDIR;

	/* Target descriptor: same location, new name. */
	to_desc.cd_nameptr = (const u_int8_t *)name;
	to_desc.cd_namelen = strlen(name);
	to_desc.cd_parentcnid = kHFSRootParentID;
	to_desc.cd_cnid = cp->c_cnid;
	to_desc.cd_flags = CD_ISDIR;

	/* Order: cnode lock -> transaction -> preflight -> catalog lock. */
	if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK)) == 0) {
		if ((error = hfs_start_transaction(hfsmp)) == 0) {
			if ((error = cat_preflight(hfsmp, CAT_RENAME, &cookie, p)) == 0) {
				lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);

				error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, &new_desc);

				/*
				 * If successful, update the name in the VCB, ensure it's terminated.
				 */
				if (!error) {
					strlcpy((char *)vcb->vcbVN, name, sizeof(vcb->vcbVN));
					volname_length = strlen ((const char*)vcb->vcbVN);
#define DKIOCCSSETLVNAME _IOW('d', 198, char[1024])
					/* Send the volume name down to CoreStorage if necessary */
					error = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED);
					if (error == 0) {
						(void) VNOP_IOCTL (hfsmp->hfs_devvp, DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current());
					}
					/* The CoreStorage notification is best-effort:
					 * its failure must not fail the rename. */
					error = 0;
				}

				hfs_systemfile_unlock(hfsmp, lockflags);
				cat_postflight(hfsmp, &cookie, p);

				/* NOTE(review): the VCB is marked dirty only on the
				 * error path here, which looks inverted; preserved
				 * as-is.  The flush below runs in both cases —
				 * TODO confirm intent. */
				if (error)
					MarkVCBDirty(vcb);
				(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
			}
			hfs_end_transaction(hfsmp);
		}			
		if (!error) {
			/* Release old allocated name buffer */
			if (cp->c_desc.cd_flags & CD_HASBUF) {
				const char *tmp_name = (const char *)cp->c_desc.cd_nameptr;
		
				cp->c_desc.cd_nameptr = 0;
				cp->c_desc.cd_namelen = 0;
				cp->c_desc.cd_flags &= ~CD_HASBUF;
				vfs_removename(tmp_name);
			}			
			/* Update cnode's catalog descriptor */
			replace_desc(cp, &new_desc);
			vcb->volumeNameEncodingHint = new_desc.cd_encoding;
			cp->c_touch_chgtime = TRUE;
		}

		hfs_unlock(cp);
	}
	
	return(error);
}
7195
7196 /*
7197 * Get file system attributes.
7198 */
7199 static int
7200 hfs_vfs_setattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context)
7201 {
7202 kauth_cred_t cred = vfs_context_ucred(context);
7203 int error = 0;
7204
7205 /*
7206 * Must be superuser or owner of filesystem to change volume attributes
7207 */
7208 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(mp)->f_owner))
7209 return(EACCES);
7210
7211 if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
7212 vnode_t root_vp;
7213
7214 error = hfs_vfs_root(mp, &root_vp, context);
7215 if (error)
7216 goto out;
7217
7218 error = hfs_rename_volume(root_vp, fsap->f_vol_name, vfs_context_proc(context));
7219 (void) vnode_put(root_vp);
7220 if (error)
7221 goto out;
7222
7223 VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
7224 }
7225
7226 out:
7227 return error;
7228 }
7229
7230 /* If a runtime corruption is detected, set the volume inconsistent
7231 * bit in the volume attributes. The volume inconsistent bit is a persistent
7232 * bit which represents that the volume is corrupt and needs repair.
7233 * The volume inconsistent bit can be set from the kernel when it detects
7234 * runtime corruption or from file system repair utilities like fsck_hfs when
7235 * a repair operation fails. The bit should be cleared only from file system
7236 * verify/repair utility like fsck_hfs when a verify/repair succeeds.
7237 */
7238 void hfs_mark_volume_inconsistent(struct hfsmount *hfsmp)
7239 {
7240 HFS_MOUNT_LOCK(hfsmp, TRUE);
7241 if ((hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) == 0) {
7242 hfsmp->vcbAtrb |= kHFSVolumeInconsistentMask;
7243 MarkVCBDirty(hfsmp);
7244 }
7245 if ((hfsmp->hfs_flags & HFS_READ_ONLY)==0) {
7246 /* Log information to ASL log */
7247 fslog_fs_corrupt(hfsmp->hfs_mp);
7248 printf("hfs: Runtime corruption detected on %s, fsck will be forced on next mount.\n", hfsmp->vcbVN);
7249 }
7250 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
7251 }
7252
7253 /* Replay the journal on the device node provided. Returns zero if
7254 * journal replay succeeded or no journal was supposed to be replayed.
7255 */
7256 static int hfs_journal_replay(vnode_t devvp, vfs_context_t context)
7257 {
7258 int retval = 0;
7259 struct mount *mp = NULL;
7260 struct hfs_mount_args *args = NULL;
7261
7262 /* Replay allowed only on raw devices */
7263 if (!vnode_ischr(devvp) && !vnode_isblk(devvp)) {
7264 retval = EINVAL;
7265 goto out;
7266 }
7267
7268 /* Create dummy mount structures */
7269 MALLOC(mp, struct mount *, sizeof(struct mount), M_TEMP, M_WAITOK);
7270 if (mp == NULL) {
7271 retval = ENOMEM;
7272 goto out;
7273 }
7274 bzero(mp, sizeof(struct mount));
7275 mount_lock_init(mp);
7276
7277 MALLOC(args, struct hfs_mount_args *, sizeof(struct hfs_mount_args), M_TEMP, M_WAITOK);
7278 if (args == NULL) {
7279 retval = ENOMEM;
7280 goto out;
7281 }
7282 bzero(args, sizeof(struct hfs_mount_args));
7283
7284 retval = hfs_mountfs(devvp, mp, args, 1, context);
7285 buf_flushdirtyblks(devvp, TRUE, 0, "hfs_journal_replay");
7286
7287 /* FSYNC the devnode to be sure all data has been flushed */
7288 retval = VNOP_FSYNC(devvp, MNT_WAIT, context);
7289
7290 out:
7291 if (mp) {
7292 mount_lock_destroy(mp);
7293 FREE(mp, M_TEMP);
7294 }
7295 if (args) {
7296 FREE(args, M_TEMP);
7297 }
7298 return retval;
7299 }
7300
7301 /*
7302 * hfs vfs operations.
7303 */
/* Positional initializer: entries must remain in the order the
 * struct vfsops declaration defines. */
struct vfsops hfs_vfsops = {
	hfs_mount,		/* mount */
	hfs_start,		/* start */
	hfs_unmount,		/* unmount */
	hfs_vfs_root,		/* root */
	hfs_quotactl,		/* quotactl */
	hfs_vfs_getattr,	/* was hfs_statfs */
	hfs_sync,		/* sync */
	hfs_vfs_vget,		/* vget */
	hfs_fhtovp,		/* fhtovp: NFS file handle -> vnode */
	hfs_vptofh,		/* vptofh: vnode -> NFS file handle */
	hfs_init,		/* init */
	hfs_sysctl,		/* sysctl */
	hfs_vfs_setattr,	/* setattr */
	{NULL}			/* reserved slots */
};