bsd/hfs/hfs_vfsops.c

   1 /*
   2  * Copyright (c) 1999-2013 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1991, 1993, 1994
  30  *      The Regents of the University of California.  All rights reserved.
  31  * (c) UNIX System Laboratories, Inc.
  32  * All or some portions of this file are derived from material licensed
  33  * to the University of California by American Telephone and Telegraph
  34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  35  * the permission of UNIX System Laboratories, Inc.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. All advertising materials mentioning features or use of this software
  46  *    must display the following acknowledgement:
  47  *      This product includes software developed by the University of
  48  *      California, Berkeley and its contributors.
  49  * 4. Neither the name of the University nor the names of its contributors
  50  *    may be used to endorse or promote products derived from this software
  51  *    without specific prior written permission.
  52  *
  53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  63  * SUCH DAMAGE.
  64  *
  65  *      hfs_vfsops.c
  66  *  derived from        @(#)ufs_vfsops.c        8.8 (Berkeley) 5/20/95
  67  *
  68  *      (c) Copyright 1997-2002 Apple Computer, Inc. All rights reserved.
  69  *
  70  *      hfs_vfsops.c -- VFS layer for loadable HFS file system.
  71  *
  72  */
  73 #include <sys/param.h>
  74 #include <sys/systm.h>
  75 #include <sys/kauth.h>
  76
  77 #include <sys/ubc.h>
  78 #include <sys/ubc_internal.h>
  79 #include <sys/vnode_internal.h>
  80 #include <sys/mount_internal.h>
  81 #include <sys/sysctl.h>
  82 #include <sys/malloc.h>
  83 #include <sys/stat.h>
  84 #include <sys/quota.h>
  85 #include <sys/disk.h>
  86 #include <sys/paths.h>
  87 #include <sys/utfconv.h>
  88 #include <sys/kdebug.h>
  89 #include <sys/fslog.h>
  90 #include <sys/ubc.h>
  91 #include <sys/buf_internal.h>
  92
  93 /* for parsing boot-args */
  94 #include <pexpert/pexpert.h>
  95
  96
  97 #include <kern/locks.h>
  98
  99 #include <vfs/vfs_journal.h>
 100
 101 #include <miscfs/specfs/specdev.h>
 102 #include <hfs/hfs_mount.h>
 103
 104 #include <libkern/crypto/md5.h>
 105 #include <uuid/uuid.h>
 106
 107 #include "hfs.h"
 108 #include "hfs_catalog.h"
 109 #include "hfs_cnode.h"
 110 #include "hfs_dbg.h"
 111 #include "hfs_endian.h"
 112 #include "hfs_hotfiles.h"
 113 #include "hfs_quota.h"
 114 #include "hfs_btreeio.h"
 115 #include "hfs_kdebug.h"
 116
 117 #include "hfscommon/headers/FileMgrInternal.h"
 118 #include "hfscommon/headers/BTreesInternal.h"
 119
 120 #if CONFIG_PROTECT
 121 #include <sys/cprotect.h>
 122 #endif
 123
 124 #define HFS_MOUNT_DEBUG 1	/* nonzero: the "if (HFS_MOUNT_DEBUG) printf(...)" diagnostics in the mount path are active */
 125 /* Globals built only when HFS_DIAGNOSTIC is set; they are not referenced in this part of the file. */
 126 #if     HFS_DIAGNOSTIC
 127 int hfs_dbg_all = 0;
 128 int hfs_dbg_err = 0;
 129 #endif
 130
 131 /* Enable/disable debugging code for live volume resizing */
 132 int hfs_resize_debug = 0;
 133 /* Lock attributes and lock groups, defined here without initializers (presumably set up in hfs_init -- confirm). */
 134 lck_grp_attr_t *  hfs_group_attr;
 135 lck_attr_t *  hfs_lock_attr;
 136 lck_grp_t *  hfs_mutex_group;
 137 lck_grp_t *  hfs_rwlock_group;
 138 lck_grp_t *  hfs_spinlock_group;
 139 /* Vnode operation vector descriptors, defined in another file. */
 140 extern struct vnodeopv_desc hfs_vnodeop_opv_desc;
 141
 142 #if CONFIG_HFS_STD
 143 extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc;
 144 static int hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush);
 145 #endif
 146
 147 /* not static so we can re-use in hfs_readwrite.c for build_path calls */
 148 int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
 149 /* File-local (static) VFS operations and helpers. */
 150 static int hfs_changefs(struct mount *mp, struct hfs_mount_args *args);
 151 static int hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, vfs_context_t context);
 152 static int hfs_flushfiles(struct mount *, int, struct proc *);
 153 static int hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp);
 154 static int hfs_init(struct vfsconf *vfsp);
 155 static void hfs_locks_destroy(struct hfsmount *hfsmp);
 156 static int hfs_vfs_root(struct mount *mp, struct vnode **vpp, vfs_context_t context);
 157 static int hfs_quotactl(struct mount *, int, uid_t, caddr_t, vfs_context_t context);
 158 static int hfs_start(struct mount *mp, int flags, vfs_context_t context);
 159 static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context);
 160 static int hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec);
 161 static int hfs_journal_replay(vnode_t devvp, vfs_context_t context);
 162 static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context);
 163 static int hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context);
 164
 165 void hfs_initialize_allocator (struct hfsmount *hfsmp);
 166 int hfs_teardown_allocator (struct hfsmount *hfsmp);
 167 /* Non-static mount, reload, statfs, sync, sysctl and unmount entry points. */
 168 int hfs_mount(struct mount *mp, vnode_t  devvp, user_addr_t data, vfs_context_t context);
 169 int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context);
 170 int hfs_reload(struct mount *mp);
 171 int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context);
 172 int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context);
 173 int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
 174                       user_addr_t newp, size_t newlen, vfs_context_t context);
 175 int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context);
 176
 177 /*
 178  * hfs_mountroot - mount an HFS Plus volume as the root file system.
 179  * Called by vfs_mountroot.  Mounts rvp through hfs_mountfs with no mount
 180  * arguments, then installs the default owner, permission masks and free
 181  * block reserve.  Returns 0, or the error reported by hfs_mountfs.
 182  */
 183 int
 184 hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context)
 185 {
 186         const int default_mask = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; /* 0755 */
 187         struct hfsmount *mount_data;
 188         ExtendedVCB *volume;
 189         int status;
 190
 191         status = hfs_mountfs(rvp, mp, NULL, 0, context);
 192         if (status != 0) {
 193                 if (HFS_MOUNT_DEBUG) {
 194                         printf("hfs_mountroot: hfs_mountfs returned %d, rvp (%p) name (%s) \n",
 195                                         status, rvp, (rvp->v_name ? rvp->v_name : "unknown device"));
 196                 }
 197                 return (status);
 198         }
 199
 200         /* No mount arguments were supplied, so install the default owner and masks. */
 201         mount_data = VFSTOHFS(mp);
 202         mount_data->hfs_uid = UNKNOWNUID;
 203         mount_data->hfs_gid = UNKNOWNGID;
 204         mount_data->hfs_dir_mask = default_mask;
 205         mount_data->hfs_file_mask = default_mask;
 206
 207         /* Reserve HFS_MINFREE percent of the blocks, capped at HFS_MAXRESERVE / blockSize blocks. */
 208         volume = HFSTOVCB(mount_data);
 209         volume->reserveBlocks = ((u_int64_t)volume->totalBlocks * HFS_MINFREE) / 100;
 210         if (volume->reserveBlocks >= HFS_MAXRESERVE / volume->blockSize)
 211                 volume->reserveBlocks = HFS_MAXRESERVE / volume->blockSize;
 212         (void)hfs_statfs(mp, vfs_statfs(mp), NULL);
 213         return (0);
 214 }
 215
 216
 217 /*
 218  * VFS Operations.
 219  * hfs_mount - mount system call.  Copies the hfs_mount_args in from 'data'.  For MNT_UPDATE it handles MNT_RELOAD
 220  * (returning directly), the read-only downgrade, the write upgrade and hfs_changefs; otherwise it mounts devvp.
 221  */
 222 /* Returns 0 on success or an error value; paths that reach 'out' also refresh the statfs data on success. */
 223 int
 224 hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context)
 225 {
 226         struct proc *p = vfs_context_proc(context);
 227         struct hfsmount *hfsmp = NULL;
 228         struct hfs_mount_args args;
 229         int retval = E_NONE;
 230         u_int32_t cmdflags;	/* vfs_flags(mp) masked with MNT_CMDFLAGS */
 231         /* Copy the caller's mount arguments in from the user address 'data'. */
 232         if ((retval = copyin(data, (caddr_t)&args, sizeof(args)))) {
 233                 if (HFS_MOUNT_DEBUG) {
 234                         printf("hfs_mount: copyin returned %d for fs\n", retval);
 235                 }
 236                 return (retval);
 237         }
 238         cmdflags = (u_int32_t)vfs_flags(mp) & MNT_CMDFLAGS;
 239         if (cmdflags & MNT_UPDATE) {
 240                 hfsmp = VFSTOHFS(mp);
 241
 242                 /* Reload incore data after an fsck. */
 243                 if (cmdflags & MNT_RELOAD) {
 244                         if (vfs_isrdonly(mp)) {
 245                                 int error = hfs_reload(mp);
 246                                 if (error && HFS_MOUNT_DEBUG) {
 247                                         printf("hfs_mount: hfs_reload returned %d on %s \n", error, hfsmp->vcbVN);
 248                                 }
 249                                 return error;
 250                         }
 251                         else {
 252                                 if (HFS_MOUNT_DEBUG) {
 253                                         printf("hfs_mount: MNT_RELOAD not supported on rdwr filesystem %s\n", hfsmp->vcbVN);
 254                                 }
 255                                 return (EINVAL);
 256                         }
 257                 }
 258
 259                 /* Change to a read-only file system. */
 260                 if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) &&
 261                     vfs_isrdonly(mp)) {
 262                         int flags;
 263
 264                         /* Set flag to indicate that a downgrade to read-only
 265                          * is in progress and therefore block any further
 266                          * modifications to the file system.
 267                          */
 268                         hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
 269                         hfsmp->hfs_flags |= HFS_RDONLY_DOWNGRADE;
 270                         hfsmp->hfs_downgrading_proc = current_thread();
 271                         hfs_unlock_global (hfsmp);
 272
 273                         /* use VFS_SYNC to push out System (btree) files */
 274                         retval = VFS_SYNC(mp, MNT_WAIT, context);
 275                         if (retval && ((cmdflags & MNT_FORCE) == 0)) {
 276                                 hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
 277                                 hfsmp->hfs_downgrading_proc = NULL;
 278                                 if (HFS_MOUNT_DEBUG) {
 279                                         printf("hfs_mount: VFS_SYNC returned %d during b-tree sync of %s \n", retval, hfsmp->vcbVN);
 280                                 }
 281                                 goto out;
 282                         }
 283                         /* Flush open files; FORCECLOSE is added only when MNT_FORCE was given. */
 284                         flags = WRITECLOSE;
 285                         if (cmdflags & MNT_FORCE)
 286                                 flags |= FORCECLOSE;
 287
 288                         if ((retval = hfs_flushfiles(mp, flags, p))) {
 289                                 hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
 290                                 hfsmp->hfs_downgrading_proc = NULL;
 291                                 if (HFS_MOUNT_DEBUG) {
 292                                         printf("hfs_mount: hfs_flushfiles returned %d on %s \n", retval, hfsmp->vcbVN);
 293                                 }
 294                                 goto out;
 295                         }
 296
 297                         /* mark the volume cleanly unmounted */
 298                         hfsmp->vcbAtrb |= kHFSVolumeUnmountedMask;
 299                         retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
 300                         hfsmp->hfs_flags |= HFS_READ_ONLY;
 301
 302                         /*
 303                          * Close down the journal.
 304                          *
 305                          * NOTE: It is critically important to close down the journal
 306                          * and have it issue all pending I/O prior to calling VNOP_FSYNC below.
 307                          * In a journaled environment it is expected that the journal be
 308                          * the only actor permitted to issue I/O for metadata blocks in HFS.
 309                          * If we were to call VNOP_FSYNC prior to closing down the journal,
 310                          * we would inadvertently issue (and wait for) the I/O we just
 311                          * initiated above as part of the flushvolumeheader call.
 312                          *
 313                          * To avoid this, we follow the same order of operations as in
 314                          * unmount and issue the journal_close prior to calling VNOP_FSYNC.
 315                          */
 316
 317                         if (hfsmp->jnl) {
 318                                 hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
 319
 320                             journal_close(hfsmp->jnl);
 321                             hfsmp->jnl = NULL;
 322
 323                             // Note: we explicitly don't want to shutdown
 324                             //       access to the jvp because we may need
 325                             //       it later if we go back to being read-write.
 326
 327                                 hfs_unlock_global (hfsmp);
 328                         }
 329
 330
 331                         /*
 332                          * Write out any pending I/O still outstanding against the device node
 333                          * now that the journal has been closed.
 334                          */
 335                         if (retval == 0) {
 336                                 vnode_get(hfsmp->hfs_devvp);
 337                                 retval = VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context);
 338                                 vnode_put(hfsmp->hfs_devvp);
 339                         }
 340                         /* retval may still hold a failure from hfs_flushvolumeheader above, not only from the FSYNC. */
 341                         if (retval) {
 342                                 if (HFS_MOUNT_DEBUG) {
 343                                         printf("hfs_mount: FSYNC on devvp returned %d for fs %s\n", retval, hfsmp->vcbVN);
 344                                 }
 345                                 hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
 346                                 hfsmp->hfs_downgrading_proc = NULL;
 347                                 hfsmp->hfs_flags &= ~HFS_READ_ONLY;	/* NOTE(review): the journal closed above is not reopened on this path */
 348                                 goto out;
 349                         }
 350                         /* If a summary table is allocated, free it and clear HFS_SUMMARY_TABLE. */
 351                         if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) {
 352                                 if (hfsmp->hfs_summary_table) {
 353                                         int err = 0;
 354                                         /*
 355                                          * Take the bitmap lock to serialize against a concurrent bitmap scan still in progress
 356                                          */
 357                                         if (hfsmp->hfs_allocation_vp) {
 358                                                 err = hfs_lock (VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
 359                                         }
 360                                         FREE (hfsmp->hfs_summary_table, M_TEMP);
 361                                         hfsmp->hfs_summary_table = NULL;
 362                                         hfsmp->hfs_flags &= ~HFS_SUMMARY_TABLE;
 363                                         if (err == 0 && hfsmp->hfs_allocation_vp){
 364                                                 hfs_unlock (VTOC(hfsmp->hfs_allocation_vp));
 365                                         }
 366                                 }
 367                         }
 368                         /* Downgrade complete.  HFS_RDONLY_DOWNGRADE is left set here; the write-upgrade path below clears it. */
 369                         hfsmp->hfs_downgrading_proc = NULL;
 370                 }
 371
 372                 /* Change to a writable file system. */
 373                 if (vfs_iswriteupgrade(mp)) {
 374                         /*
 375                          * On inconsistent disks, do not allow read-write mount
 376                          * unless it is the boot volume being mounted.
 377                          */
 378                         if (!(vfs_flags(mp) & MNT_ROOTFS) &&
 379                                         (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask)) {
 380                                 if (HFS_MOUNT_DEBUG) {
 381                                         printf("hfs_mount: attempting to mount inconsistent non-root volume %s\n",  (hfsmp->vcbVN));
 382                                 }
 383                                 retval = EINVAL;
 384                                 goto out;
 385                         }
 386
 387                         // If the journal was shut-down previously because we were
 388                         // asked to be read-only, let's start it back up again now
 389
 390                         if (   (HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask)
 391                             && hfsmp->jnl == NULL
 392                             && hfsmp->jvp != NULL) {
 393                             int jflags;
 394
 395                             if (hfsmp->hfs_flags & HFS_NEED_JNL_RESET) {
 396                                         jflags = JOURNAL_RESET;
 397                                 } else {
 398                                         jflags = 0;
 399                                 }
 400
 401                                 hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
 402
 403                                 /* We provide the mount point twice here: The first is used as
 404                                  * an opaque argument to be passed back when hfs_sync_metadata
 405                                  * is called.  The second is provided to the throttling code to
 406                                  * indicate which mount's device should be used when accounting
 407                                  * for metadata writes.
 408                                  */
 409                                 hfsmp->jnl = journal_open(hfsmp->jvp,
 410                                                 (hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset,
 411                                                 hfsmp->jnl_size,
 412                                                 hfsmp->hfs_devvp,
 413                                                 hfsmp->hfs_logical_block_size,
 414                                                 jflags,
 415                                                 0,
 416                                                 hfs_sync_metadata, hfsmp->hfs_mp,
 417                                                 hfsmp->hfs_mp);
 418
 419                                 /*
 420                                  * Set up the trim callback function so that we can add
 421                                  * recently freed extents to the free extent cache once
 422                                  * the transaction that freed them is written to the
 423                                  * journal on disk.
 424                                  */
 425                                 if (hfsmp->jnl)
 426                                         journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp);
 427
 428                                 hfs_unlock_global (hfsmp);
 429
 430                                 if (hfsmp->jnl == NULL) {
 431                                         if (HFS_MOUNT_DEBUG) {
 432                                                 printf("hfs_mount: journal_open == NULL; couldn't be opened on %s \n", (hfsmp->vcbVN));
 433                                         }
 434                                         retval = EINVAL;
 435                                         goto out;
 436                                 } else {
 437                                         hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET;
 438                                 }
 439
 440                         }
 441
 442                         /* See if we need to erase unused Catalog nodes due to <rdar://problem/6947811>. */
 443                         retval = hfs_erase_unused_nodes(hfsmp);
 444                         if (retval != E_NONE) {
 445                                 if (HFS_MOUNT_DEBUG) {
 446                                         printf("hfs_mount: hfs_erase_unused_nodes returned %d for fs %s\n", retval, hfsmp->vcbVN);
 447                                 }
 448                                 goto out;
 449                         }
 450
 451                         /* If this mount point was downgraded from read-write
 452                          * to read-only, clear that information as we are now
 453                          * moving back to read-write.
 454                          */
 455                         hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
 456                         hfsmp->hfs_downgrading_proc = NULL;
 457
 458                         /* mark the volume dirty (clear clean unmount bit) */
 459                         hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask;
 460
 461                         retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
 462                         if (retval != E_NONE) {
 463                                 if (HFS_MOUNT_DEBUG) {
 464                                         printf("hfs_mount: hfs_flushvolumeheader returned %d for fs %s\n", retval, hfsmp->vcbVN);
 465                                 }
 466                                 goto out;
 467                         }
 468
 469                         /* Only clear HFS_READ_ONLY after a successful write */
 470                         hfsmp->hfs_flags &= ~HFS_READ_ONLY;
 471
 472                         /* HFS_READ_ONLY was just cleared, so this test effectively excludes only HFS_STANDARD volumes. */
 473                         if (!(hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_STANDARD))) {
 474                                 /* Setup private/hidden directories for hardlinks. */
 475                                 hfs_privatedir_init(hfsmp, FILE_HARDLINKS);
 476                                 hfs_privatedir_init(hfsmp, DIR_HARDLINKS);
 477
 478                                 hfs_remove_orphans(hfsmp);
 479
 480                                 /*
 481                                  * Allow hot file clustering if conditions allow.
 482                                  */
 483                                 if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) &&
 484                                            ((hfsmp->hfs_mp->mnt_kern_flag & MNTK_SSD) == 0))    {
 485                                         (void) hfs_recording_init(hfsmp);
 486                                 }
 487                                 /* Force ACLs on HFS+ file systems. */
 488                                 if (vfs_extendedsecurity(HFSTOVFS(hfsmp)) == 0) {
 489                                         vfs_setextendedsecurity(HFSTOVFS(hfsmp));
 490                                 }
 491                         }
 492                 }
 493
 494                 /* Update file system parameters. */
 495                 retval = hfs_changefs(mp, &args);
 496                 if (retval &&  HFS_MOUNT_DEBUG) {
 497                         printf("hfs_mount: hfs_changefs returned %d for %s\n", retval, hfsmp->vcbVN);
 498                 }
 499
 500         } else /* not an update request */ {
 501
 502                 /* Set the mount flag to indicate that we support volfs  */
 503                 vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_DOVOLFS));
 504                 /* Fourth argument 0: not a journal-replay-only mount (see the hfs_mountfs prototype). */
 505                 retval = hfs_mountfs(devvp, mp, &args, 0, context);
 506                 if (retval) {
 507                         const char *name = vnode_getname(devvp);
 508                         printf("hfs_mount: hfs_mountfs returned error=%d for device %s\n", retval, (name ? name : "unknown-dev"));
 509                         if (name) {
 510                                 vnode_putname(name);
 511                         }
 512                         goto out;
 513                 }
 514
 515                 /* After hfs_mountfs succeeds, we should have valid hfsmp */
 516                 hfsmp = VFSTOHFS(mp);
 517
 518                 /*
 519                  * Check to see if the file system exists on CoreStorage.
 520                  *
 521                  * This must be done after examining the root folder's CP EA since
 522                  * hfs_vfs_root will create a vnode (which must not occur until after
 523                  * we've established the CP level of the FS).
 524                  */
 525                 if (retval == 0) {	/* always true here: a failed hfs_mountfs jumped to 'out' above */
 526                         errno_t err;
 527                         vnode_t root_vnode;
 528                         err = hfs_vfs_root(mp, &root_vnode, context);
 529                         if (err == 0) {
 530                                 if (VNOP_IOCTL(devvp, _DKIOCCSSETFSVNODE,
 531                                                         (caddr_t)&root_vnode, 0, context) == 0) {
 532                                         err = vnode_ref(root_vnode);
 533                                         if (err == 0) {
 534                                                 hfsmp->hfs_flags |= HFS_CS;
 535                                         }
 536                                 }
 537                                 /* Release the root vnode obtained from hfs_vfs_root; this also overwrites any vnode_ref error. */
 538                                 err = vnode_put(root_vnode);
 539                                 if (err) {
 540                                         printf("hfs: could not release io count on root vnode with error: %d\n",
 541                                                         err);
 542                                 }
 543                         } else {
 544                                 printf("hfs: could not get root vnode with error: %d\n",
 545                                                 err);
 546                         }
 547                 }
 548         }
 549         /* Common exit: on success, have hfs_statfs fill in the mount's vfsstatfs. */
 550 out:
 551         if (retval == 0) {
 552                 (void)hfs_statfs(mp, vfs_statfs(mp), context);
 553         }
 554         return (retval);
 555 }
 556
 557 /* Argument block that hfs_changefs_callback receives through its void *cargs parameter. */
 558 struct hfs_changefs_cargs {
 559         struct hfsmount *hfsmp;	/* mount whose catalog the callback looks the cnode up in */
 560         int             namefix;	/* nonzero: purge the name cache and replace the cnode's descriptor */
 561         int             permfix;	/* nonzero: reload c_uid, c_gid and c_mode from the catalog attributes */
 562         int             permswitch;	/* nonzero: same reload as permfix */
 563 };
 564 /* hfs_changefs_callback - per-vnode worker for hfs_changefs: resync one cnode with its catalog record. */
 565 static int
 566 hfs_changefs_callback(struct vnode *vp, void *cargs)
 567 {
 568         struct hfs_changefs_cargs *req = (struct hfs_changefs_cargs *)cargs;
 569         struct hfsmount *hfsmp = req->hfsmp;
 570         struct cnode *node = VTOC(vp);
 571         struct cat_desc fresh_desc;
 572         struct cat_attr fresh_attr;
 573         int held_locks;
 574         int lookup_err;
 575
 576         /* Fetch the cnode's current catalog record under a shared catalog lock. */
 577         held_locks = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
 578         lookup_err = cat_lookup(hfsmp, &node->c_desc, 0, 0, &fresh_desc, &fresh_attr, NULL, NULL);
 579         hfs_systemfile_unlock(hfsmp, held_locks);
 580
 581         if (lookup_err != 0) {
 582                 /* Lookup failed: skip this vnode, purging its cached names if names are changing. */
 583                 if (req->namefix) {
 584                         cache_purge(vp);
 585                 }
 586                 return (VNODE_RETURNED);
 587         }
 588
 589         /* Permission changes: reload the owner, group and mode from the catalog attributes. */
 590         if (req->permfix || req->permswitch) {
 591                 node->c_uid = fresh_attr.ca_uid;
 592                 node->c_gid = fresh_attr.ca_gid;
 593                 node->c_mode = fresh_attr.ca_mode;
 594         }
 595
 596         if (!req->namefix) {
 597                 /* Names are not changing: the looked-up descriptor is not kept, so release it. */
 598                 cat_releasedesc(&fresh_desc);
 599                 return (VNODE_RETURNED);
 600         }
 601
 602         /*
 603          * Switching name converters: remove the existing name cache entry and
 604          * adopt the descriptor (and name) produced by the new encoder.
 605          */
 606         cache_purge(vp);
 607         replace_desc(node, &fresh_desc);
 608
 609         /* For the root folder, also copy the new name into vcbVN and record the mount's encoding. */
 610         if (fresh_desc.cd_cnid == kHFSRootFolderID) {
 611                 ExtendedVCB *volume = HFSTOVCB(hfsmp);
 612
 613                 strlcpy((char *)volume->vcbVN, (const char *)node->c_desc.cd_nameptr, NAME_MAX+1);
 614                 node->c_desc.cd_encoding = hfsmp->hfs_encoding;
 615         }
 616
 617         return (VNODE_RETURNED);
 618 }
 619
 620 /* Change fs mount parameters */
 621 static int
 622 hfs_changefs(struct mount *mp, struct hfs_mount_args *args)
 623 {
 624         int retval = 0;
 625         int namefix, permfix, permswitch;
 626         struct hfsmount *hfsmp;
 627         ExtendedVCB *vcb;
 628         struct hfs_changefs_cargs cargs;
 629         u_int32_t mount_flags;
 630
 631 #if CONFIG_HFS_STD
 632         u_int32_t old_encoding = 0;
 633         hfs_to_unicode_func_t   get_unicode_func;
 634         unicode_to_hfs_func_t   get_hfsname_func;
 635 #endif
 636
 637         hfsmp = VFSTOHFS(mp);
 638         vcb = HFSTOVCB(hfsmp);
 639         mount_flags = (unsigned int)vfs_flags(mp);
 640
 641         hfsmp->hfs_flags |= HFS_IN_CHANGEFS;
 642
 643         permswitch = (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) &&
 644                        ((mount_flags & MNT_UNKNOWNPERMISSIONS) == 0)) ||
 645                       (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) == 0) &&
 646                        (mount_flags & MNT_UNKNOWNPERMISSIONS)));
 647
 648         /* The root filesystem must operate with actual permissions: */
 649         if (permswitch && (mount_flags & MNT_ROOTFS) && (mount_flags & MNT_UNKNOWNPERMISSIONS)) {
 650                 vfs_clearflags(mp, (u_int64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS));  /* Just say "No". */
 651                 retval = EINVAL;
 652                 goto exit;
 653         }
 654         if (mount_flags & MNT_UNKNOWNPERMISSIONS)
 655                 hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS;
 656         else
 657                 hfsmp->hfs_flags &= ~HFS_UNKNOWN_PERMS;
 658
 659         namefix = permfix = 0;
 660
 661         /*
 662          * Tracking of hot files requires up-to-date access times.  So if
 663          * access time updates are disabled, we must also disable hot files.
 664          */
 665         if (mount_flags & MNT_NOATIME) {
 666                 (void) hfs_recording_suspend(hfsmp);
 667         }
 668
 669         /* Change the timezone (Note: this affects all hfs volumes and hfs+ volume create dates) */
 670         if (args->hfs_timezone.tz_minuteswest != VNOVAL) {
 671                 gTimeZone = args->hfs_timezone;
 672         }
 673
 674         /* Change the default uid, gid and/or mask */
 675         if ((args->hfs_uid != (uid_t)VNOVAL) && (hfsmp->hfs_uid != args->hfs_uid)) {
 676                 hfsmp->hfs_uid = args->hfs_uid;
 677                 if (vcb->vcbSigWord == kHFSPlusSigWord)
 678                         ++permfix;
 679         }
 680         if ((args->hfs_gid != (gid_t)VNOVAL) && (hfsmp->hfs_gid != args->hfs_gid)) {
 681                 hfsmp->hfs_gid = args->hfs_gid;
 682                 if (vcb->vcbSigWord == kHFSPlusSigWord)
 683                         ++permfix;
 684         }
 685         if (args->hfs_mask != (mode_t)VNOVAL) {
 686                 if (hfsmp->hfs_dir_mask != (args->hfs_mask & ALLPERMS)) {
 687                         hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS;
 688                         hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS;
 689                         if ((args->flags != VNOVAL) && (args->flags & HFSFSMNT_NOXONFILES))
 690                                 hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE);
 691                         if (vcb->vcbSigWord == kHFSPlusSigWord)
 692                                 ++permfix;
 693                 }
 694         }
 695
 696 #if CONFIG_HFS_STD
 697         /* Change the hfs encoding value (hfs only) */
 698         if ((vcb->vcbSigWord == kHFSSigWord)    &&
 699             (args->hfs_encoding != (u_int32_t)VNOVAL)              &&
 700             (hfsmp->hfs_encoding != args->hfs_encoding)) {
 701
 702                 retval = hfs_getconverter(args->hfs_encoding, &get_unicode_func, &get_hfsname_func);
 703                 if (retval)
 704                         goto exit;
 705
 706                 /*
 707                  * Connect the new hfs_get_unicode converter but leave
 708                  * the old hfs_get_hfsname converter in place so that
 709                  * we can lookup existing vnodes to get their correctly
 710                  * encoded names.
 711                  *
 712                  * When we're all finished, we can then connect the new
 713                  * hfs_get_hfsname converter and release our interest
 714                  * in the old converters.
 715                  */
 716                 hfsmp->hfs_get_unicode = get_unicode_func;
 717                 old_encoding = hfsmp->hfs_encoding;
 718                 hfsmp->hfs_encoding = args->hfs_encoding;
 719                 ++namefix;
 720         }
 721 #endif
 722
 723         if (!(namefix || permfix || permswitch))
 724                 goto exit;
 725
 726         /* XXX 3762912 hack to support HFS filesystem 'owner' */
 727         if (permfix)
 728                 vfs_setowner(mp,
 729                     hfsmp->hfs_uid == UNKNOWNUID ? KAUTH_UID_NONE : hfsmp->hfs_uid,
 730                     hfsmp->hfs_gid == UNKNOWNGID ? KAUTH_GID_NONE : hfsmp->hfs_gid);
 731
 732         /*
 733          * For each active vnode fix things that changed
 734          *
 735          * Note that we can visit a vnode more than once
 736          * and we can race with fsync.
 737          *
 738          * hfs_changefs_callback will be called for each vnode
 739          * hung off of this mount point
 740          *
 741          * The vnode will be properly referenced and unreferenced
 742          * around the callback
 743          */
 744         cargs.hfsmp = hfsmp;
 745         cargs.namefix = namefix;
 746         cargs.permfix = permfix;
 747         cargs.permswitch = permswitch;
 748
 749         vnode_iterate(mp, 0, hfs_changefs_callback, (void *)&cargs);
 750
 751 #if CONFIG_HFS_STD
 752         /*
 753          * If we're switching name converters we can now
 754          * connect the new hfs_get_hfsname converter and
 755          * release our interest in the old converters.
 756          */
 757         if (namefix) {
 758                 /* HFS standard only */
 759                 hfsmp->hfs_get_hfsname = get_hfsname_func;
 760                 vcb->volumeNameEncodingHint = args->hfs_encoding;
 761                 (void) hfs_relconverter(old_encoding);
 762         }
 763 #endif
 764
 765 exit:
 766         hfsmp->hfs_flags &= ~HFS_IN_CHANGEFS;
 767         return (retval);
 768 }
 769
 770
 771 struct hfs_reload_cargs {      /* context that hfs_reload hands to hfs_reload_callback via vnode_iterate */
 772         struct hfsmount *hfsmp; /* mount whose in-core cnode data is being re-read */
 773         int             error;  /* result of the callback's last cat_idlookup; hfs_reload zeroes it first and returns it if non-zero */
 774 };
 775
 776 static int
 777 hfs_reload_callback(struct vnode *vp, void *cargs)
 778 {
 779         struct cnode *cp;
 780         struct hfs_reload_cargs *args;
 781         int lockflags;
 782
 783         args = (struct hfs_reload_cargs *)cargs;
 784         /*
 785          * flush all the buffers associated with this node
 786          */
 787         (void) buf_invalidateblks(vp, 0, 0, 0);
 788
 789         cp = VTOC(vp);
 790         /*
 791          * Remove any directory hints
 792          */
 793         if (vnode_isdir(vp))
 794                 hfs_reldirhints(cp, 0);
 795
 796         /*
 797          * Re-read cnode data for all active vnodes (non-metadata files).
 798          */
 799         if (!vnode_issystem(vp) && !VNODE_IS_RSRC(vp) && (cp->c_fileid >= kHFSFirstUserCatalogNodeID)) {
 800                 struct cat_fork *datafork;
 801                 struct cat_desc desc;
 802
 803                 datafork = cp->c_datafork ? &cp->c_datafork->ff_data : NULL;
 804
 805                 /* lookup by fileID since name could have changed */
 806                 lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
 807                 args->error = cat_idlookup(args->hfsmp, cp->c_fileid, 0, 0, &desc, &cp->c_attr, datafork);
 808                 hfs_systemfile_unlock(args->hfsmp, lockflags);
 809                 if (args->error) {
 810                         return (VNODE_RETURNED_DONE);
 811                 }
 812
 813                 /* update cnode's catalog descriptor */
 814                 (void) replace_desc(cp, &desc);
 815         }
 816         return (VNODE_RETURNED);
 817 }
 818
 819 /*
 820  * Reload all incore data for a filesystem (used after running fsck on
 821  * the root filesystem and finding things to fix). The filesystem must
 822  * be mounted read-only.
 823  *
 824  * Things to do to update the mount:
 825  *      invalidate all cached meta-data.
 826  *      invalidate all inactive vnodes.
 827  *      invalidate all cached file data.
 828  *      re-read volume header from disk.
 829  *      re-load meta-file info (extents, file size).
 830  *      re-load B-tree header data.
 831  *      re-read cnode data for all active vnodes.
 832  */
 833 int
 834 hfs_reload(struct mount *mountp)
 835 {
 836         register struct vnode *devvp;
 837         struct buf *bp;
 838         int error, i;
 839         struct hfsmount *hfsmp;
 840         struct HFSPlusVolumeHeader *vhp;
 841         ExtendedVCB *vcb;
 842         struct filefork *forkp;
 843         struct cat_desc cndesc;
 844         struct hfs_reload_cargs args;
 845         daddr64_t priIDSector;
 846
 847         hfsmp = VFSTOHFS(mountp);
 848         vcb = HFSTOVCB(hfsmp);
 849
 850         if (vcb->vcbSigWord == kHFSSigWord)
 851                 return (EINVAL);        /* rooting from HFS is not supported! */
 852
 853         /*
 854          * Invalidate all cached meta-data.
 855          */
 856         devvp = hfsmp->hfs_devvp;
 857         if (buf_invalidateblks(devvp, 0, 0, 0))
 858                 panic("hfs_reload: dirty1");
 859
 860         args.hfsmp = hfsmp;
 861         args.error = 0;
 862         /*
 863          * hfs_reload_callback will be called for each vnode
 864          * hung off of this mount point that can't be recycled...
 865          * vnode_iterate will recycle those that it can (the VNODE_RELOAD option)
 866          * the vnode will be in an 'unbusy' state (VNODE_WAIT) and
 867          * properly referenced and unreferenced around the callback
 868          */
 869         vnode_iterate(mountp, VNODE_RELOAD | VNODE_WAIT, hfs_reload_callback, (void *)&args);
 870
 871         if (args.error)
 872                 return (args.error);
 873
 874         /*
 875          * Re-read VolumeHeader from disk.
 876          */
 877         priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
 878                         HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));
 879
 880         error = (int)buf_meta_bread(hfsmp->hfs_devvp,
 881                         HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
 882                         hfsmp->hfs_physical_block_size, NOCRED, &bp);
 883         if (error) {
 884                 if (bp != NULL)
 885                         buf_brelse(bp);
 886                 return (error);
 887         }
 888
 889         vhp = (HFSPlusVolumeHeader *) (buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
 890
 891         /* Do a quick sanity check */
 892         if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord &&
 893              SWAP_BE16(vhp->signature) != kHFSXSigWord) ||
 894             (SWAP_BE16(vhp->version) != kHFSPlusVersion &&
 895              SWAP_BE16(vhp->version) != kHFSXVersion) ||
 896             SWAP_BE32(vhp->blockSize) != vcb->blockSize) {
 897                 buf_brelse(bp);
 898                 return (EIO);
 899         }
 900
 901         vcb->vcbLsMod           = to_bsd_time(SWAP_BE32(vhp->modifyDate));
 902         vcb->vcbAtrb            = SWAP_BE32 (vhp->attributes);
 903         vcb->vcbJinfoBlock  = SWAP_BE32(vhp->journalInfoBlock);
 904         vcb->vcbClpSiz          = SWAP_BE32 (vhp->rsrcClumpSize);
 905         vcb->vcbNxtCNID         = SWAP_BE32 (vhp->nextCatalogID);
 906         vcb->vcbVolBkUp         = to_bsd_time(SWAP_BE32(vhp->backupDate));
 907         vcb->vcbWrCnt           = SWAP_BE32 (vhp->writeCount);
 908         vcb->vcbFilCnt          = SWAP_BE32 (vhp->fileCount);
 909         vcb->vcbDirCnt          = SWAP_BE32 (vhp->folderCount);
 910         HFS_UPDATE_NEXT_ALLOCATION(vcb, SWAP_BE32 (vhp->nextAllocation));
 911         vcb->totalBlocks        = SWAP_BE32 (vhp->totalBlocks);
 912         vcb->freeBlocks         = SWAP_BE32 (vhp->freeBlocks);
 913         vcb->encodingsBitmap    = SWAP_BE64 (vhp->encodingsBitmap);
 914         bcopy(vhp->finderInfo, vcb->vcbFndrInfo, sizeof(vhp->finderInfo));
 915         vcb->localCreateDate    = SWAP_BE32 (vhp->createDate); /* hfs+ create date is in local time */
 916
 917         /*
 918          * Re-load meta-file vnode data (extent info, file size, etc).
 919          */
 920         forkp = VTOF((struct vnode *)vcb->extentsRefNum);
 921         for (i = 0; i < kHFSPlusExtentDensity; i++) {
 922                 forkp->ff_extents[i].startBlock =
 923                         SWAP_BE32 (vhp->extentsFile.extents[i].startBlock);
 924                 forkp->ff_extents[i].blockCount =
 925                         SWAP_BE32 (vhp->extentsFile.extents[i].blockCount);
 926         }
 927         forkp->ff_size      = SWAP_BE64 (vhp->extentsFile.logicalSize);
 928         forkp->ff_blocks    = SWAP_BE32 (vhp->extentsFile.totalBlocks);
 929         forkp->ff_clumpsize = SWAP_BE32 (vhp->extentsFile.clumpSize);
 930
 931
 932         forkp = VTOF((struct vnode *)vcb->catalogRefNum);
 933         for (i = 0; i < kHFSPlusExtentDensity; i++) {
 934                 forkp->ff_extents[i].startBlock =
 935                         SWAP_BE32 (vhp->catalogFile.extents[i].startBlock);
 936                 forkp->ff_extents[i].blockCount =
 937                         SWAP_BE32 (vhp->catalogFile.extents[i].blockCount);
 938         }
 939         forkp->ff_size      = SWAP_BE64 (vhp->catalogFile.logicalSize);
 940         forkp->ff_blocks    = SWAP_BE32 (vhp->catalogFile.totalBlocks);
 941         forkp->ff_clumpsize = SWAP_BE32 (vhp->catalogFile.clumpSize);
 942
 943         if (hfsmp->hfs_attribute_vp) {
 944                 forkp = VTOF(hfsmp->hfs_attribute_vp);
 945                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
 946                         forkp->ff_extents[i].startBlock =
 947                                 SWAP_BE32 (vhp->attributesFile.extents[i].startBlock);
 948                         forkp->ff_extents[i].blockCount =
 949                                 SWAP_BE32 (vhp->attributesFile.extents[i].blockCount);
 950                 }
 951                 forkp->ff_size      = SWAP_BE64 (vhp->attributesFile.logicalSize);
 952                 forkp->ff_blocks    = SWAP_BE32 (vhp->attributesFile.totalBlocks);
 953                 forkp->ff_clumpsize = SWAP_BE32 (vhp->attributesFile.clumpSize);
 954         }
 955
 956         forkp = VTOF((struct vnode *)vcb->allocationsRefNum);
 957         for (i = 0; i < kHFSPlusExtentDensity; i++) {
 958                 forkp->ff_extents[i].startBlock =
 959                         SWAP_BE32 (vhp->allocationFile.extents[i].startBlock);
 960                 forkp->ff_extents[i].blockCount =
 961                         SWAP_BE32 (vhp->allocationFile.extents[i].blockCount);
 962         }
 963         forkp->ff_size      = SWAP_BE64 (vhp->allocationFile.logicalSize);
 964         forkp->ff_blocks    = SWAP_BE32 (vhp->allocationFile.totalBlocks);
 965         forkp->ff_clumpsize = SWAP_BE32 (vhp->allocationFile.clumpSize);
 966
 967         buf_brelse(bp);
 968         vhp = NULL;
 969
 970         /*
 971          * Re-load B-tree header data
 972          */
 973         forkp = VTOF((struct vnode *)vcb->extentsRefNum);
 974         if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
 975                 return (error);
 976
 977         forkp = VTOF((struct vnode *)vcb->catalogRefNum);
 978         if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
 979                 return (error);
 980
 981         if (hfsmp->hfs_attribute_vp) {
 982                 forkp = VTOF(hfsmp->hfs_attribute_vp);
 983                 if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
 984                         return (error);
 985         }
 986
 987         /* Reload the volume name */
 988         if ((error = cat_idlookup(hfsmp, kHFSRootFolderID, 0, 0, &cndesc, NULL, NULL)))
 989                 return (error);
 990         vcb->volumeNameEncodingHint = cndesc.cd_encoding;
 991         bcopy(cndesc.cd_nameptr, vcb->vcbVN, min(255, cndesc.cd_namelen));
 992         cat_releasedesc(&cndesc);
 993
 994         /* Re-establish private/hidden directories. */
 995         hfs_privatedir_init(hfsmp, FILE_HARDLINKS);
 996         hfs_privatedir_init(hfsmp, DIR_HARDLINKS);
 997
 998         /* In case any volume information changed to trigger a notification */
 999         hfs_generate_volume_notifications(hfsmp);
1000
1001         return (0);
1002 }
1003
1004
/*
 * Collapse a struct timeval into a single microsecond count:
 * tv_sec scaled by one million, plus tv_usec.
 */
static uint64_t timeval_to_microseconds(struct timeval *tv)
{
        uint64_t total_usec = (uint64_t)tv->tv_sec;

        total_usec *= 1000000ULL;
        total_usec += (uint64_t)tv->tv_usec;

        return total_usec;
}
1009
1010 static void
1011 hfs_syncer(void *arg0, void *unused)
1012 {
1013 #pragma unused(unused)
1014
1015     struct hfsmount *hfsmp = arg0;
1016     clock_sec_t secs;
1017     clock_usec_t usecs;
1018     uint64_t deadline = 0;
1019     uint64_t now;
1020
1021     clock_get_system_microtime(&secs, &usecs);
1022     now = ((uint64_t)secs * USEC_PER_SEC) + (uint64_t)usecs;
1023     KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER | DBG_FUNC_START, hfsmp, now, timeval_to_microseconds(&hfsmp->hfs_mp->mnt_last_write_completed_timestamp), hfsmp->hfs_mp->mnt_pending_write_size, 0);
1024
1025     /*
1026      * Flush the journal if there have been no writes (or outstanding writes) for 0.1 seconds.
1027      *
1028      * WARNING!  last_write_completed >= last_write_issued isn't sufficient to test whether
1029      * there are still outstanding writes.  We could have issued a whole bunch of writes,
1030      * and then stopped issuing new writes, then one or more of those writes complete.
1031      *
1032      * NOTE: This routine uses clock_get_system_microtime (i.e. uptime) instead of
1033      * clock_get_calendar_microtime (i.e. wall time) because mnt_last_write_completed_timestamp
1034      * and mnt_last_write_issued_timestamp are also stored as system (uptime) times.
1035      * Trying to compute durations from a mix of system and calendar times is meaningless
1036      * since they are relative to different points in time.
1037      */
1038     hfs_start_transaction(hfsmp);   // so we hold off any new writes
1039     uint64_t last_write_completed = timeval_to_microseconds(&hfsmp->hfs_mp->mnt_last_write_completed_timestamp);
1040     if (hfsmp->hfs_mp->mnt_pending_write_size == 0 && (now - last_write_completed) >= HFS_META_DELAY) {
1041         /*
1042          * Time to flush the journal.
1043          */
1044         KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER_TIMED | DBG_FUNC_START, now, last_write_completed, timeval_to_microseconds(&hfsmp->hfs_mp->mnt_last_write_issued_timestamp), hfsmp->hfs_mp->mnt_pending_write_size, 0);
1045
1046         /*
1047          * We intentionally do a synchronous flush (of the journal or entire volume) here.
1048          * For journaled volumes, this means we wait until the metadata blocks are written
1049          * to both the journal and their final locations (in the B-trees, etc.).
1050          *
1051          * This tends to avoid interleaving the metadata writes with other writes (for
1052          * example, user data, or to the journal when a later transaction notices that
1053          * an earlier transaction has finished its async writes, and then updates the
1054          * journal start in the journal header).  Avoiding interleaving of writes is
1055          * very good for performance on simple flash devices like SD cards, thumb drives;
1056          * and on devices like floppies.  Since removable devices tend to be this kind of
1057          * simple device, doing a synchronous flush actually improves performance in
1058          * practice.
1059          *
1060          * NOTE: For non-journaled volumes, the call to hfs_sync will also cause dirty
1061          * user data to be written.
1062          */
1063         if (hfsmp->jnl) {
1064             hfs_journal_flush(hfsmp, TRUE);
1065         } else {
1066             hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel());
1067         }
1068
1069         clock_get_system_microtime(&secs, &usecs);
1070         now = ((uint64_t)secs * USEC_PER_SEC) + (uint64_t)usecs;
1071
1072         KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER_TIMED | DBG_FUNC_END, now, timeval_to_microseconds(&hfsmp->hfs_mp->mnt_last_write_completed_timestamp), timeval_to_microseconds(&hfsmp->hfs_mp->mnt_last_write_issued_timestamp), hfsmp->hfs_mp->mnt_pending_write_size, 0);
1073         hfs_end_transaction(hfsmp);
1074
1075         //
1076         // NOTE: we decrement these *after* we've done the journal_flush() since
1077         // it can take a significant amount of time and so we don't want more
1078         // callbacks scheduled until we've done this one.
1079         //
1080         OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled);
1081         OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete);
1082         wakeup((caddr_t)&hfsmp->hfs_sync_incomplete);
1083     } else {
1084         /*
1085          * Defer the journal flush by rescheduling the timer.
1086          */
1087
1088         clock_interval_to_deadline(HFS_META_DELAY, NSEC_PER_USEC, &deadline);
1089         thread_call_enter_delayed(hfsmp->hfs_syncer, deadline);
1090
1091         // note: we intentionally return early here and do not
1092         // decrement the sync_scheduled and sync_incomplete
1093         // variables because we rescheduled the timer.
1094
1095         hfs_end_transaction(hfsmp);
1096     }
1097     KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER| DBG_FUNC_END, deadline ? EAGAIN : 0, deadline, 0, 0, 0);
1098 }
1099
1100
1101 extern int IOBSDIsMediaEjectable( const char *cdev_name );  /* defined outside this file; takes a character-device name and presumably reports whether its media is ejectable - confirm against the definition */
1102
1103 /*
1104  * Call into the allocator code and perform a full scan of the bitmap file.
1105  *
1106  * This allows us to TRIM unallocated ranges if needed, and also to build up
1107  * an in-memory summary table of the state of the allocated blocks.
1108  */
1109 void hfs_scan_blocks (struct hfsmount *hfsmp) {
1110         /*
1111          * Take the allocation file lock.  Journal transactions will block until
1112          * we're done here.
1113          */
1114
1115         int flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
1116
1117         /*
1118          * We serialize here with the HFS mount lock as we're mounting.
1119          *
1120          * The mount can only proceed once this thread has acquired the bitmap
1121          * lock, since we absolutely do not want someone else racing in and
1122          * getting the bitmap lock, doing a read/write of the bitmap file,
1123          * then us getting the bitmap lock.
1124          *
1125          * To prevent this, the mount thread takes the HFS mount mutex, starts us
1126          * up, then immediately msleeps on the scan_var variable in the mount
1127          * point as a condition variable.  This serialization is safe since
1128          * if we race in and try to proceed while they're still holding the lock,
1129          * we'll block trying to acquire the global lock.  Since the mount thread
1130          * acquires the HFS mutex before starting this function in a new thread,
1131          * any lock acquisition on our part must be linearizably AFTER the mount thread's.
1132          *
1133          * Note that the HFS mount mutex is always taken last, and always for only
1134          * a short time.  In this case, we just take it long enough to mark the
1135          * scan-in-flight bit.
1136          */
1137         (void) hfs_lock_mount (hfsmp);
1138         hfsmp->scan_var |= HFS_ALLOCATOR_SCAN_INFLIGHT;
1139         wakeup((caddr_t) &hfsmp->scan_var);
1140         hfs_unlock_mount (hfsmp);
1141
1142         /* Initialize the summary table */
1143         if (hfs_init_summary (hfsmp)) {
1144                 printf("hfs: could not initialize summary table for %s\n", hfsmp->vcbVN);
1145         }
1146
1147         /*
1148          * ScanUnmapBlocks assumes that the bitmap lock is held when you
1149          * call the function. We don't care if there were any errors issuing unmaps.
1150          *
1151          * It will also attempt to build up the summary table for subsequent
1152          * allocator use, as configured.
1153          */
1154         (void) ScanUnmapBlocks(hfsmp);
1155
1156         hfs_systemfile_unlock(hfsmp, flags);
1157 }
1158
1159 static int hfs_root_unmounted_cleanly = 0;  /* starts at 0; presumably set while mounting the root volume - confirm where it is written */
1160 /* Export the flag read-only (CTLFLAG_RD) as "root_unmounted_cleanly" under the _vfs_generic sysctl parent. */
1161 SYSCTL_DECL(_vfs_generic);
1162 SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &hfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
1163
1164 /*
1165  * Common code for mount and mountroot
1166  */
1167 int
1168 hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args,
1169             int journal_replay_only, vfs_context_t context)
1170 {
1171         struct proc *p = vfs_context_proc(context);
1172         int retval = E_NONE;
1173         struct hfsmount *hfsmp = NULL;
1174         struct buf *bp;
1175         dev_t dev;
1176         HFSMasterDirectoryBlock *mdbp = NULL;
1177         int ronly;
1178 #if QUOTA
1179         int i;
1180 #endif
1181         int mntwrapper;
1182         kauth_cred_t cred;
1183         u_int64_t disksize;
1184         daddr64_t log_blkcnt;
1185         u_int32_t log_blksize;
1186         u_int32_t phys_blksize;
1187         u_int32_t minblksize;
1188         u_int32_t iswritable;
1189         daddr64_t mdb_offset;
1190         int isvirtual = 0;
1191         int isroot = 0;
1192         u_int32_t device_features = 0;
1193         int isssd;
1194
1195         if (args == NULL) {
1196                 /* only hfs_mountroot passes us NULL as the 'args' argument */
1197                 isroot = 1;
1198         }
1199
1200         ronly = vfs_isrdonly(mp);
1201         dev = vnode_specrdev(devvp);
1202         cred = p ? vfs_context_ucred(context) : NOCRED;
1203         mntwrapper = 0;
1204
1205         bp = NULL;
1206         hfsmp = NULL;
1207         mdbp = NULL;
1208         minblksize = kHFSBlockSize;
1209
1210         /* Advisory locking should be handled at the VFS layer */
1211         vfs_setlocklocal(mp);
1212
1213         /* Get the logical block size (treated as physical block size everywhere) */
1214         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&log_blksize, 0, context)) {
1215                 if (HFS_MOUNT_DEBUG) {
1216                         printf("hfs_mountfs: DKIOCGETBLOCKSIZE failed\n");
1217                 }
1218                 retval = ENXIO;
1219                 goto error_exit;
1220         }
1221         if (log_blksize == 0 || log_blksize > 1024*1024*1024) {
1222                 printf("hfs: logical block size 0x%x looks bad.  Not mounting.\n", log_blksize);
1223                 retval = ENXIO;
1224                 goto error_exit;
1225         }
1226
1227         /* Get the physical block size. */
1228         retval = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_blksize, 0, context);
1229         if (retval) {
1230                 if ((retval != ENOTSUP) && (retval != ENOTTY)) {
1231                         if (HFS_MOUNT_DEBUG) {
1232                                 printf("hfs_mountfs: DKIOCGETPHYSICALBLOCKSIZE failed\n");
1233                         }
1234                         retval = ENXIO;
1235                         goto error_exit;
1236                 }
1237                 /* If device does not support this ioctl, assume that physical
1238                  * block size is same as logical block size
1239                  */
1240                 phys_blksize = log_blksize;
1241         }
1242         if (phys_blksize == 0 || phys_blksize > MAXBSIZE) {
1243                 printf("hfs: physical block size 0x%x looks bad.  Not mounting.\n", phys_blksize);
1244                 retval = ENXIO;
1245                 goto error_exit;
1246         }
1247
1248         /* Switch to 512 byte sectors (temporarily) */
1249         if (log_blksize > 512) {
1250                 u_int32_t size512 = 512;
1251
1252                 if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, context)) {
1253                         if (HFS_MOUNT_DEBUG) {
1254                                 printf("hfs_mountfs: DKIOCSETBLOCKSIZE failed \n");
1255                         }
1256                         retval = ENXIO;
1257                         goto error_exit;
1258                 }
1259         }
1260         /* Get the number of 512 byte physical blocks. */
1261         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1262                 /* resetting block size may fail if getting block count did */
1263                 (void)VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context);
1264                 if (HFS_MOUNT_DEBUG) {
1265                         printf("hfs_mountfs: DKIOCGETBLOCKCOUNT failed\n");
1266                 }
1267                 retval = ENXIO;
1268                 goto error_exit;
1269         }
1270         /* Compute an accurate disk size (i.e. within 512 bytes) */
1271         disksize = (u_int64_t)log_blkcnt * (u_int64_t)512;
1272
1273         /*
1274          * On Tiger it is not necessary to switch the device
1275          * block size to be 4k if there are more than 31-bits
1276          * worth of blocks but to insure compatibility with
1277          * pre-Tiger systems we have to do it.
1278          *
1279          * If the device size is not a multiple of 4K (8 * 512), then
1280          * switching the logical block size isn't going to help because
1281          * we will be unable to write the alternate volume header.
1282          * In this case, just leave the logical block size unchanged.
1283          */
1284         if (log_blkcnt > 0x000000007fffffff && (log_blkcnt & 7) == 0) {
1285                 minblksize = log_blksize = 4096;
1286                 if (phys_blksize < log_blksize)
1287                         phys_blksize = log_blksize;
1288         }
1289
1290         /*
1291          * The cluster layer is not currently prepared to deal with a logical
1292          * block size larger than the system's page size.  (It can handle
1293          * blocks per page, but not multiple pages per block.)  So limit the
1294          * logical block size to the page size.
1295          */
1296         if (log_blksize > PAGE_SIZE) {
1297                 log_blksize = PAGE_SIZE;
1298         }
1299
1300         /* Now switch to our preferred physical block size. */
1301         if (log_blksize > 512) {
1302                 if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1303                         if (HFS_MOUNT_DEBUG) {
1304                                 printf("hfs_mountfs: DKIOCSETBLOCKSIZE (2) failed\n");
1305                         }
1306                         retval = ENXIO;
1307                         goto error_exit;
1308                 }
1309                 /* Get the count of physical blocks. */
1310                 if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1311                         if (HFS_MOUNT_DEBUG) {
1312                                 printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (2) failed\n");
1313                         }
1314                         retval = ENXIO;
1315                         goto error_exit;
1316                 }
1317         }
1318         /*
1319          * At this point:
1320          *   minblksize is the minimum physical block size
1321          *   log_blksize has our preferred physical block size
1322          *   log_blkcnt has the total number of physical blocks
1323          */
1324
1325         mdb_offset = (daddr64_t)HFS_PRI_SECTOR(log_blksize);
1326         if ((retval = (int)buf_meta_bread(devvp,
1327                                 HFS_PHYSBLK_ROUNDDOWN(mdb_offset, (phys_blksize/log_blksize)),
1328                                 phys_blksize, cred, &bp))) {
1329                 if (HFS_MOUNT_DEBUG) {
1330                         printf("hfs_mountfs: buf_meta_bread failed with %d\n", retval);
1331                 }
1332                 goto error_exit;
1333         }
1334         MALLOC(mdbp, HFSMasterDirectoryBlock *, kMDBSize, M_TEMP, M_WAITOK);
1335         if (mdbp == NULL) {
1336                 retval = ENOMEM;
1337                 if (HFS_MOUNT_DEBUG) {
1338                         printf("hfs_mountfs: MALLOC failed\n");
1339                 }
1340                 goto error_exit;
1341         }
1342         bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, kMDBSize);
1343         buf_brelse(bp);
1344         bp = NULL;
1345
1346         MALLOC(hfsmp, struct hfsmount *, sizeof(struct hfsmount), M_HFSMNT, M_WAITOK);
1347         if (hfsmp == NULL) {
1348                 if (HFS_MOUNT_DEBUG) {
1349                         printf("hfs_mountfs: MALLOC (2) failed\n");
1350                 }
1351                 retval = ENOMEM;
1352                 goto error_exit;
1353         }
1354         bzero(hfsmp, sizeof(struct hfsmount));
1355
1356         hfs_chashinit_finish(hfsmp);
1357
1358         /* Init the ID lookup hashtable */
1359         hfs_idhash_init (hfsmp);
1360
1361         /*
1362          * See if the disk supports unmap (trim).
1363          *
1364          * NOTE: vfs_init_io_attributes has not been called yet, so we can't use the io_flags field
1365          * returned by vfs_ioattr.  We need to call VNOP_IOCTL ourselves.
1366          */
1367         if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&device_features, 0, context) == 0) {
1368                 if (device_features & DK_FEATURE_UNMAP) {
1369                         hfsmp->hfs_flags |= HFS_UNMAP;
1370                 }
1371         }
1372
1373         /*
1374          * See if the disk is a solid state device, too.  We need this to decide what to do about
1375          * hotfiles.
1376          */
1377         if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, context) == 0) {
1378                 if (isssd) {
1379                         hfsmp->hfs_flags |= HFS_SSD;
1380                 }
1381         }
1382
1383
1384         /*
1385          *  Init the volume information structure
1386          */
1387
1388         lck_mtx_init(&hfsmp->hfs_mutex, hfs_mutex_group, hfs_lock_attr);
1389         lck_mtx_init(&hfsmp->hfc_mutex, hfs_mutex_group, hfs_lock_attr);
1390         lck_rw_init(&hfsmp->hfs_global_lock, hfs_rwlock_group, hfs_lock_attr);
1391         lck_rw_init(&hfsmp->hfs_insync, hfs_rwlock_group, hfs_lock_attr);
1392         lck_spin_init(&hfsmp->vcbFreeExtLock, hfs_spinlock_group, hfs_lock_attr);
1393
1394         vfs_setfsprivate(mp, hfsmp);
1395         hfsmp->hfs_mp = mp;                     /* Make VFSTOHFS work */
1396         hfsmp->hfs_raw_dev = vnode_specrdev(devvp);
1397         hfsmp->hfs_devvp = devvp;
1398         vnode_ref(devvp);  /* Hold a ref on the device, dropped when hfsmp is freed. */
1399         hfsmp->hfs_logical_block_size = log_blksize;
1400         hfsmp->hfs_logical_block_count = log_blkcnt;
1401         hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt;
1402         hfsmp->hfs_physical_block_size = phys_blksize;
1403         hfsmp->hfs_log_per_phys = (phys_blksize / log_blksize);
1404         hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA;
1405         if (ronly)
1406                 hfsmp->hfs_flags |= HFS_READ_ONLY;
1407         if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS)
1408                 hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS;
1409
1410 #if QUOTA
1411         for (i = 0; i < MAXQUOTAS; i++)
1412                 dqfileinit(&hfsmp->hfs_qfiles[i]);
1413 #endif
1414
1415         if (args) {
1416                 hfsmp->hfs_uid = (args->hfs_uid == (uid_t)VNOVAL) ? UNKNOWNUID : args->hfs_uid;
1417                 if (hfsmp->hfs_uid == 0xfffffffd) hfsmp->hfs_uid = UNKNOWNUID;
1418                 hfsmp->hfs_gid = (args->hfs_gid == (gid_t)VNOVAL) ? UNKNOWNGID : args->hfs_gid;
1419                 if (hfsmp->hfs_gid == 0xfffffffd) hfsmp->hfs_gid = UNKNOWNGID;
1420                 vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid);                               /* tell the VFS */
1421                 if (args->hfs_mask != (mode_t)VNOVAL) {
1422                         hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS;
1423                         if (args->flags & HFSFSMNT_NOXONFILES) {
1424                                 hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE);
1425                         } else {
1426                                 hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS;
1427                         }
1428                 } else {
1429                         hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS;            /* 0777: rwx---rwx */
1430                         hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE;        /* 0666: no --x by default? */
1431                 }
1432                 if ((args->flags != (int)VNOVAL) && (args->flags & HFSFSMNT_WRAPPER))
1433                         mntwrapper = 1;
1434         } else {
1435                 /* Even w/o explicit mount arguments, MNT_UNKNOWNPERMISSIONS requires setting up uid, gid, and mask: */
1436                 if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS) {
1437                         hfsmp->hfs_uid = UNKNOWNUID;
1438                         hfsmp->hfs_gid = UNKNOWNGID;
1439                         vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid);                       /* tell the VFS */
1440                         hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS;            /* 0777: rwx---rwx */
1441                         hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE;        /* 0666: no --x by default? */
1442                 }
1443         }
1444
1445         /* Find out if disk media is writable. */
1446         if (VNOP_IOCTL(devvp, DKIOCISWRITABLE, (caddr_t)&iswritable, 0, context) == 0) {
1447                 if (iswritable)
1448                         hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA;
1449                 else
1450                         hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA;
1451         }
1452
1453         // record the current time at which we're mounting this volume
1454         struct timeval tv;
1455         microtime(&tv);
1456         hfsmp->hfs_mount_time = tv.tv_sec;
1457
1458         /* Mount a standard HFS disk */
1459         if ((SWAP_BE16(mdbp->drSigWord) == kHFSSigWord) &&
1460             (mntwrapper || (SWAP_BE16(mdbp->drEmbedSigWord) != kHFSPlusSigWord))) {
1461 #if CONFIG_HFS_STD
1462                 /* On 10.6 and beyond, non read-only mounts for HFS standard vols get rejected */
1463                 if (vfs_isrdwr(mp)) {
1464                         retval = EROFS;
1465                         goto error_exit;
1466                 }
1467
1468                 printf("hfs_mountfs: Mounting HFS Standard volumes was deprecated in Mac OS 10.7 \n");
1469
1470                 /* Treat it as if it's read-only and not writeable */
1471                 hfsmp->hfs_flags |= HFS_READ_ONLY;
1472                 hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA;
1473
1474                 /* If only journal replay is requested, exit immediately */
1475                 if (journal_replay_only) {
1476                         retval = 0;
1477                         goto error_exit;
1478                 }
1479
1480                 if ((vfs_flags(mp) & MNT_ROOTFS)) {
1481                         retval = EINVAL;  /* Cannot root from HFS standard disks */
1482                         goto error_exit;
1483                 }
1484                 /* HFS disks can only use 512 byte physical blocks */
1485                 if (log_blksize > kHFSBlockSize) {
1486                         log_blksize = kHFSBlockSize;
1487                         if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1488                                 retval = ENXIO;
1489                                 goto error_exit;
1490                         }
1491                         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1492                                 retval = ENXIO;
1493                                 goto error_exit;
1494                         }
1495                         hfsmp->hfs_logical_block_size = log_blksize;
1496                         hfsmp->hfs_logical_block_count = log_blkcnt;
1497                         hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt;
1498                         hfsmp->hfs_physical_block_size = log_blksize;
1499                         hfsmp->hfs_log_per_phys = 1;
1500                 }
1501                 if (args) {
1502                         hfsmp->hfs_encoding = args->hfs_encoding;
1503                         HFSTOVCB(hfsmp)->volumeNameEncodingHint = args->hfs_encoding;
1504
1505                         /* establish the timezone */
1506                         gTimeZone = args->hfs_timezone;
1507                 }
1508
1509                 retval = hfs_getconverter(hfsmp->hfs_encoding, &hfsmp->hfs_get_unicode,
1510                                         &hfsmp->hfs_get_hfsname);
1511                 if (retval)
1512                         goto error_exit;
1513
1514                 retval = hfs_MountHFSVolume(hfsmp, mdbp, p);
1515                 if (retval)
1516                         (void) hfs_relconverter(hfsmp->hfs_encoding);
1517 #else
1518                 /* On platforms where HFS Standard is not supported, deny the mount altogether */
1519                 retval = EINVAL;
1520                 goto error_exit;
1521 #endif
1522
1523         }
1524         else { /* Mount an HFS Plus disk */
1525                 HFSPlusVolumeHeader *vhp;
1526                 off_t embeddedOffset;
1527                 int   jnl_disable = 0;
1528
1529                 /* Get the embedded Volume Header */
1530                 if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) {
1531                         embeddedOffset = SWAP_BE16(mdbp->drAlBlSt) * kHFSBlockSize;
1532                         embeddedOffset += (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.startBlock) *
1533                                           (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz);
1534
1535                         /*
1536                          * If the embedded volume doesn't start on a block
1537                          * boundary, then switch the device to a 512-byte
1538                          * block size so everything will line up on a block
1539                          * boundary.
1540                          */
1541                         if ((embeddedOffset % log_blksize) != 0) {
1542                                 printf("hfs_mountfs: embedded volume offset not"
1543                                     " a multiple of physical block size (%d);"
1544                                     " switching to 512\n", log_blksize);
1545                                 log_blksize = 512;
1546                                 if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE,
1547                                     (caddr_t)&log_blksize, FWRITE, context)) {
1548
1549                                         if (HFS_MOUNT_DEBUG) {
1550                                                 printf("hfs_mountfs: DKIOCSETBLOCKSIZE (3) failed\n");
1551                                         }
1552                                         retval = ENXIO;
1553                                         goto error_exit;
1554                                 }
1555                                 if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT,
1556                                     (caddr_t)&log_blkcnt, 0, context)) {
1557                                         if (HFS_MOUNT_DEBUG) {
1558                                                 printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (3) failed\n");
1559                                         }
1560                                         retval = ENXIO;
1561                                         goto error_exit;
1562                                 }
1563                                 /* Note: relative block count adjustment */
1564                                 hfsmp->hfs_logical_block_count *=
1565                                     hfsmp->hfs_logical_block_size / log_blksize;
1566
1567                                 /* Update logical /physical block size */
1568                                 hfsmp->hfs_logical_block_size = log_blksize;
1569                                 hfsmp->hfs_physical_block_size = log_blksize;
1570
1571                                 phys_blksize = log_blksize;
1572                                 hfsmp->hfs_log_per_phys = 1;
1573                         }
1574
1575                         disksize = (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.blockCount) *
1576                                    (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz);
1577
1578                         hfsmp->hfs_logical_block_count = disksize / log_blksize;
1579
1580                         hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
1581
1582                         mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1583                         retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1584                                         phys_blksize, cred, &bp);
1585                         if (retval) {
1586                                 if (HFS_MOUNT_DEBUG) {
1587                                         printf("hfs_mountfs: buf_meta_bread (2) failed with %d\n", retval);
1588                                 }
1589                                 goto error_exit;
1590                         }
1591                         bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, 512);
1592                         buf_brelse(bp);
1593                         bp = NULL;
1594                         vhp = (HFSPlusVolumeHeader*) mdbp;
1595
1596                 }
1597                 else { /* pure HFS+ */
1598                         embeddedOffset = 0;
1599                         vhp = (HFSPlusVolumeHeader*) mdbp;
1600                 }
1601
1602                 if (isroot) {
1603                         hfs_root_unmounted_cleanly = ((SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) != 0);
1604                 }
1605
1606                 /*
1607                  * On inconsistent disks, do not allow read-write mount
1608                  * unless it is the boot volume being mounted.  We also
1609                  * always want to replay the journal if the journal_replay_only
1610                  * flag is set because that will (most likely) get the
1611                  * disk into a consistent state before fsck_hfs starts
1612                  * looking at it.
1613                  */
1614                 if (  !(vfs_flags(mp) & MNT_ROOTFS)
1615                    && (SWAP_BE32(vhp->attributes) & kHFSVolumeInconsistentMask)
1616                    && !journal_replay_only
1617                    && !(hfsmp->hfs_flags & HFS_READ_ONLY)) {
1618
1619                         if (HFS_MOUNT_DEBUG) {
1620                                 printf("hfs_mountfs: failed to mount non-root inconsistent disk\n");
1621                         }
1622                         retval = EINVAL;
1623                         goto error_exit;
1624                 }
1625
1626
1627                 // XXXdbg
1628                 //
1629                 hfsmp->jnl = NULL;
1630                 hfsmp->jvp = NULL;
1631                 if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) &&
1632                     args->journal_disable) {
1633                     jnl_disable = 1;
1634                 }
1635
1636                 //
1637                 // We only initialize the journal here if the last person
1638                 // to mount this volume was journaling aware.  Otherwise
1639                 // we delay journal initialization until later at the end
1640                 // of hfs_MountHFSPlusVolume() because the last person who
1641                 // mounted it could have messed things up behind our back
1642                 // (so we need to go find the .journal file, make sure it's
1643                 // the right size, re-sync up if it was moved, etc).
1644                 //
1645                 if (   (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion)
1646                         && (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask)
1647                         && !jnl_disable) {
1648
1649                         // if we're able to init the journal, mark the mount
1650                         // point as journaled.
1651                         //
1652                         if ((retval = hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred)) == 0) {
1653                                 vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
1654                         } else {
1655                                 if (retval == EROFS) {
1656                                         // EROFS is a special error code that means the volume has an external
1657                                         // journal which we couldn't find.  in that case we do not want to
1658                                         // rewrite the volume header - we'll just refuse to mount the volume.
1659                                         if (HFS_MOUNT_DEBUG) {
1660                                                 printf("hfs_mountfs: hfs_early_journal_init indicated external jnl \n");
1661                                         }
1662                                         retval = EINVAL;
1663                                         goto error_exit;
1664                                 }
1665
1666                                 // if the journal failed to open, then set the lastMountedVersion
1667                                 // to be "FSK!" which fsck_hfs will see and force the fsck instead
1668                                 // of just bailing out because the volume is journaled.
1669                                 if (!ronly) {
1670                                         if (HFS_MOUNT_DEBUG) {
1671                                                 printf("hfs_mountfs: hfs_early_journal_init failed, setting to FSK \n");
1672                                         }
1673
1674                                         HFSPlusVolumeHeader *jvhp;
1675
1676                                     hfsmp->hfs_flags |= HFS_NEED_JNL_RESET;
1677
1678                                     if (mdb_offset == 0) {
1679                                         mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1680                                     }
1681
1682                                     bp = NULL;
1683                                     retval = (int)buf_meta_bread(devvp,
1684                                                     HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1685                                                     phys_blksize, cred, &bp);
1686                                     if (retval == 0) {
1687                                         jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize));
1688
1689                                         if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) {
1690                                                 printf ("hfs(1): Journal replay fail.  Writing lastMountVersion as FSK!\n");
1691                                             jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion);
1692                                             buf_bwrite(bp);
1693                                         } else {
1694                                             buf_brelse(bp);
1695                                         }
1696                                         bp = NULL;
1697                                     } else if (bp) {
1698                                         buf_brelse(bp);
1699                                         // clear this so the error exit path won't try to use it
1700                                         bp = NULL;
1701                                     }
1702                                 }
1703
1704                                 // if this isn't the root device just bail out.
1705                                 // If it is the root device we just continue on
1706                                 // in the hopes that fsck_hfs will be able to
1707                                 // fix any damage that exists on the volume.
1708                                 if ( !(vfs_flags(mp) & MNT_ROOTFS)) {
1709                                         if (HFS_MOUNT_DEBUG) {
1710                                                 printf("hfs_mountfs: hfs_early_journal_init failed, erroring out \n");
1711                                         }
1712                                     retval = EINVAL;
1713                                     goto error_exit;
1714                                 }
1715                         }
1716                 }
1717                 // XXXdbg
1718
1719                 /* Either the journal is replayed successfully, or there
1720                  * was nothing to replay, or no journal exists.  In any case,
1721                  * return success.
1722                  */
1723                 if (journal_replay_only) {
1724                         retval = 0;
1725                         goto error_exit;
1726                 }
1727
1728                 (void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname);
1729
1730                 retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred);
1731                 /*
1732                  * If the backend didn't like our physical blocksize
1733                  * then retry with physical blocksize of 512.
1734                  */
1735                 if ((retval == ENXIO) && (log_blksize > 512) && (log_blksize != minblksize)) {
1736                         printf("hfs_mountfs: could not use physical block size "
1737                                         "(%d) switching to 512\n", log_blksize);
1738                         log_blksize = 512;
1739                         if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1740                                 if (HFS_MOUNT_DEBUG) {
1741                                         printf("hfs_mountfs: DKIOCSETBLOCKSIZE (4) failed \n");
1742                                 }
1743                                 retval = ENXIO;
1744                                 goto error_exit;
1745                         }
1746                         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1747                                 if (HFS_MOUNT_DEBUG) {
1748                                         printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (4) failed \n");
1749                                 }
1750                                 retval = ENXIO;
1751                                 goto error_exit;
1752                         }
1753                         devvp->v_specsize = log_blksize;
1754                         /* Note: relative block count adjustment (in case this is an embedded volume). */
1755                         hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize;
1756                         hfsmp->hfs_logical_block_size = log_blksize;
1757                         hfsmp->hfs_log_per_phys = hfsmp->hfs_physical_block_size / log_blksize;
1758
1759                         hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
1760
1761                         if (hfsmp->jnl && hfsmp->jvp == devvp) {
1762                             // close and re-open this with the new block size
1763                             journal_close(hfsmp->jnl);
1764                             hfsmp->jnl = NULL;
1765                             if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) {
1766                                         vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
1767                                 } else {
1768                                         // if the journal failed to open, then set the lastMountedVersion
1769                                         // to be "FSK!" which fsck_hfs will see and force the fsck instead
1770                                         // of just bailing out because the volume is journaled.
1771                                         if (!ronly) {
1772                                                 if (HFS_MOUNT_DEBUG) {
1773                                                         printf("hfs_mountfs: hfs_early_journal_init (2) resetting.. \n");
1774                                                 }
1775                                         HFSPlusVolumeHeader *jvhp;
1776
1777                                         hfsmp->hfs_flags |= HFS_NEED_JNL_RESET;
1778
1779                                         if (mdb_offset == 0) {
1780                                                         mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1781                                         }
1782
1783                                                 bp = NULL;
1784                                         retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1785                                                         phys_blksize, cred, &bp);
1786                                         if (retval == 0) {
1787                                                         jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize));
1788
1789                                                         if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) {
1790                                                                 printf ("hfs(2): Journal replay fail.  Writing lastMountVersion as FSK!\n");
1791                                                         jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion);
1792                                                         buf_bwrite(bp);
1793                                                         } else {
1794                                                         buf_brelse(bp);
1795                                                         }
1796                                                         bp = NULL;
1797                                         } else if (bp) {
1798                                                         buf_brelse(bp);
1799                                                         // clear this so the error exit path won't try to use it
1800                                                         bp = NULL;
1801                                         }
1802                                         }
1803
1804                                         // if this isn't the root device just bail out.
1805                                         // If it is the root device we just continue on
1806                                         // in the hopes that fsck_hfs will be able to
1807                                         // fix any damage that exists on the volume.
1808                                         if ( !(vfs_flags(mp) & MNT_ROOTFS)) {
1809                                                 if (HFS_MOUNT_DEBUG) {
1810                                                         printf("hfs_mountfs: hfs_early_journal_init (2) failed \n");
1811                                                 }
1812                                         retval = EINVAL;
1813                                         goto error_exit;
1814                                         }
1815                                 }
1816                         }
1817
1818                         /* Try again with a smaller block size... */
1819                         retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred);
1820                         if (retval && HFS_MOUNT_DEBUG) {
1821                                 printf("hfs_MountHFSPlusVolume (late) returned %d\n",retval);
1822                         }
1823                 }
1824                 if (retval)
1825                         (void) hfs_relconverter(0);
1826         }
1827
1828         // save off a snapshot of the mtime from the previous mount
1829         // (for matador).
1830         hfsmp->hfs_last_mounted_mtime = hfsmp->hfs_mtime;
1831
1832         if ( retval ) {
1833                 if (HFS_MOUNT_DEBUG) {
1834                         printf("hfs_mountfs: encountered failure %d \n", retval);
1835                 }
1836                 goto error_exit;
1837         }
1838
1839         mp->mnt_vfsstat.f_fsid.val[0] = dev;
1840         mp->mnt_vfsstat.f_fsid.val[1] = vfs_typenum(mp);
1841         vfs_setmaxsymlen(mp, 0);
1842
1843         mp->mnt_vtable->vfc_vfsflags |= VFC_VFSNATIVEXATTR;
1844 #if NAMEDSTREAMS
1845         mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1846 #endif
1847         if ((hfsmp->hfs_flags & HFS_STANDARD) == 0 ) {
1848                 /* Tell VFS that we support directory hard links. */
1849                 mp->mnt_vtable->vfc_vfsflags |= VFC_VFSDIRLINKS;
1850         }
1851 #if CONFIG_HFS_STD
1852         else {
1853                 /* HFS standard doesn't support extended readdir! */
1854                 mount_set_noreaddirext (mp);
1855         }
1856 #endif
1857
1858         if (args) {
1859                 /*
1860                  * Set the free space warning levels for a non-root volume:
1861                  *
1862                  * Set the "danger" limit to 1% of the volume size or 100MB, whichever
1863                  * is less.  Set the "warning" limit to 2% of the volume size or 150MB,
1864                  * whichever is less.  And last, set the "desired" freespace level to
1865                  * 3% of the volume size or 200MB, whichever is less.
1866                  */
1867                 hfsmp->hfs_freespace_notify_dangerlimit =
1868                         MIN(HFS_VERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1869                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_VERYLOWDISKTRIGGERFRACTION);
1870                 hfsmp->hfs_freespace_notify_warninglimit =
1871                         MIN(HFS_LOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1872                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKTRIGGERFRACTION);
1873                 hfsmp->hfs_freespace_notify_desiredlevel =
1874                         MIN(HFS_LOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize,
1875                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKSHUTOFFFRACTION);
1876         } else {
1877                 /*
1878                  * Set the free space warning levels for the root volume:
1879                  *
1880                  * Set the "danger" limit to 5% of the volume size or 512MB, whichever
1881                  * is less.  Set the "warning" limit to 10% of the volume size or 1GB,
1882                  * whichever is less.  And last, set the "desired" freespace level to
1883                  * 11% of the volume size or 1.25GB, whichever is less.
1884                  */
1885                 hfsmp->hfs_freespace_notify_dangerlimit =
1886                         MIN(HFS_ROOTVERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1887                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTVERYLOWDISKTRIGGERFRACTION);
1888                 hfsmp->hfs_freespace_notify_warninglimit =
1889                         MIN(HFS_ROOTLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1890                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKTRIGGERFRACTION);
1891                 hfsmp->hfs_freespace_notify_desiredlevel =
1892                         MIN(HFS_ROOTLOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize,
1893                                 (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKSHUTOFFFRACTION);
1894         };
1895
1896         /* Check if the file system exists on virtual device, like disk image */
1897         if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, context) == 0) {
1898                 if (isvirtual) {
1899                         hfsmp->hfs_flags |= HFS_VIRTUAL_DEVICE;
1900                 }
1901         }
1902
1903         /* do not allow ejectability checks on the root device */
1904         if (isroot == 0) {
1905                 if ((hfsmp->hfs_flags & HFS_VIRTUAL_DEVICE) == 0 &&
1906                                 IOBSDIsMediaEjectable(mp->mnt_vfsstat.f_mntfromname)) {
1907                         hfsmp->hfs_max_pending_io = 4096*1024;   // a reasonable value to start with.
1908                         hfsmp->hfs_syncer = thread_call_allocate(hfs_syncer, hfsmp);
1909                         if (hfsmp->hfs_syncer == NULL) {
1910                                 printf("hfs: failed to allocate syncer thread callback for %s (%s)\n",
1911                                                 mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname);
1912                         }
1913                 }
1914         }
1915
1916         printf("hfs: mounted %s on device %s\n", (hfsmp->vcbVN ? (const char*) hfsmp->vcbVN : "unknown"),
1917             (devvp->v_name ? devvp->v_name : (isroot ? "root_device": "unknown device")));
1918
1919         /*
1920          * Start looking for free space to drop below this level and generate a
1921          * warning immediately if needed:
1922          */
1923         hfsmp->hfs_notification_conditions = 0;
1924         hfs_generate_volume_notifications(hfsmp);
1925
1926         if (ronly == 0) {
1927                 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
1928         }
1929         FREE(mdbp, M_TEMP);
1930         return (0);
1931
1932 error_exit:
1933         if (bp)
1934                 buf_brelse(bp);
1935         if (mdbp)
1936                 FREE(mdbp, M_TEMP);
1937
1938         if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
1939                 vnode_clearmountedon(hfsmp->jvp);
1940                 (void)VNOP_CLOSE(hfsmp->jvp, ronly ? FREAD : FREAD|FWRITE, vfs_context_kernel());
1941                 hfsmp->jvp = NULL;
1942         }
1943         if (hfsmp) {
1944                 if (hfsmp->hfs_devvp) {
1945                         vnode_rele(hfsmp->hfs_devvp);
1946                 }
1947                 hfs_locks_destroy(hfsmp);
1948                 hfs_delete_chash(hfsmp);
1949                 hfs_idhash_destroy (hfsmp);
1950
1951                 FREE(hfsmp, M_HFSMNT);
1952                 vfs_setfsprivate(mp, NULL);
1953         }
1954         return (retval);
1955 }
1956
1957
1958 /*
1959  * Make a filesystem operational.
1960  * Nothing to do at the moment.
1961  */
1962 /* ARGSUSED */
1963 static int
1964 hfs_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context)
1965 {
1966         return (0);
1967 }
1968
1969
1970 /*
1971  * unmount system call
      *
      * Flushes the volume's files and metadata, stops the syncer thread call,
      * closes the journal, releases the device vnode and finally frees the
      * hfsmount itself.
      *
      * mp       - mount being torn down; its hfsmount is obtained via VFSTOHFS()
      * mntflags - when MNT_FORCE is set, FORCECLOSE is passed to hfs_flushfiles()
      *            and failures below no longer abort the unmount
      * context  - caller's VFS context (supplies the proc and is used for the
      *            final device fsync)
      *
      * Returns 0 once the teardown has run to completion.  A non-zero error is
      * only returned for a non-forced unmount, either straight from
      * hfs_flushfiles() or through err_exit from the metadata flush section.
1972  */
1973 int
1974 hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context)
1975 {
1976         struct proc *p = vfs_context_proc(context);
1977         struct hfsmount *hfsmp = VFSTOHFS(mp);
1978         int retval = E_NONE;
1979         int flags;
1980         int force;
1981         int started_tr = 0;
1982
1983         flags = 0;
1984         force = 0;
1985         if (mntflags & MNT_FORCE) {
1986                 flags |= FORCECLOSE;
1987                 force = 1;
1988         }
1989
1990         printf("hfs: unmount initiated on %s on device %s\n",
1991                         (hfsmp->vcbVN ? (const char*) hfsmp->vcbVN : "unknown"),
1992                         (hfsmp->hfs_devvp ? ((hfsmp->hfs_devvp->v_name ? hfsmp->hfs_devvp->v_name : "unknown device")) : "unknown device"));
1993
             /* A failure to flush the files aborts the unmount unless it is forced. */
1994         if ((retval = hfs_flushfiles(mp, flags, p)) && !force)
1995                 return (retval);
1996
1997         if (hfsmp->hfs_flags & HFS_METADATA_ZONE)
1998                 (void) hfs_recording_suspend(hfsmp);
1999
2000         /*
2001          * Cancel any pending timers for this volume.  Then wait for any timers
2002          * which have fired, but whose callbacks have not yet completed.
2003          */
2004         if (hfsmp->hfs_syncer)
2005         {
2006                 struct timespec ts = {0, 100000000};    /* 0.1 seconds */
2007
2008                 /*
2009                  * Cancel any timers that have been scheduled, but have not
2010                  * fired yet.  NOTE: The kernel considers a timer complete as
2011                  * soon as it starts your callback, so the kernel does not
2012                  * keep track of the number of callbacks in progress.
2013                  */
2014                 if (thread_call_cancel(hfsmp->hfs_syncer))
2015                         OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete);
2016                 thread_call_free(hfsmp->hfs_syncer);
2017                 hfsmp->hfs_syncer = NULL;
2018
2019                 /*
2020                  * This waits for all of the callbacks that were entered before
2021                  * we did thread_call_cancel above, but have not completed yet.
2022                  */
2023                 while(hfsmp->hfs_sync_incomplete > 0)
2024                 {
2025                         msleep((caddr_t)&hfsmp->hfs_sync_incomplete, NULL, PWAIT, "hfs_unmount", &ts);
2026                 }
2027
2028                 if (hfsmp->hfs_sync_incomplete < 0)
2029                         panic("hfs_unmount: pm_sync_incomplete underflow!\n");
2030         }
2031
2032         if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) {
2033                 if (hfsmp->hfs_summary_table) {
2034                         int err = 0;
2035                         /*
2036                          * Take the bitmap lock to serialize against a concurrent bitmap scan still in progress
2037                          */
2038                         if (hfsmp->hfs_allocation_vp) {
2039                                 err = hfs_lock (VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2040                         }
                             /*
                              * The table is freed whether or not the lock was obtained;
                              * err only decides whether there is a lock to drop below.
                              */
2041                         FREE (hfsmp->hfs_summary_table, M_TEMP);
2042                         hfsmp->hfs_summary_table = NULL;
2043                         hfsmp->hfs_flags &= ~HFS_SUMMARY_TABLE;
2044
2045                         if (err == 0 && hfsmp->hfs_allocation_vp){
2046                                 hfs_unlock (VTOC(hfsmp->hfs_allocation_vp));
2047                         }
2048
2049                 }
2050         }
2051
2052         /*
2053          * Flush out the b-trees, volume bitmap and Volume Header
2054          */
2055         if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
                     /*
                      * started_tr records whether a transaction must be ended later.
                      * A forced unmount carries on even if none could be started.
                      */
2056                 retval = hfs_start_transaction(hfsmp);
2057                 if (retval == 0) {
2058                     started_tr = 1;
2059                 } else if (!force) {
2060                     goto err_exit;
2061                 }
2062
                     /*
                      * Each system file below is fsync'ed under its own exclusive
                      * cnode lock; an fsync error aborts only a non-forced unmount.
                      */
2063                 if (hfsmp->hfs_startup_vp) {
2064                         (void) hfs_lock(VTOC(hfsmp->hfs_startup_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2065                         retval = hfs_fsync(hfsmp->hfs_startup_vp, MNT_WAIT, 0, p);
2066                         hfs_unlock(VTOC(hfsmp->hfs_startup_vp));
2067                         if (retval && !force)
2068                                 goto err_exit;
2069                 }
2070
2071                 if (hfsmp->hfs_attribute_vp) {
2072                         (void) hfs_lock(VTOC(hfsmp->hfs_attribute_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2073                         retval = hfs_fsync(hfsmp->hfs_attribute_vp, MNT_WAIT, 0, p);
2074                         hfs_unlock(VTOC(hfsmp->hfs_attribute_vp));
2075                         if (retval && !force)
2076                                 goto err_exit;
2077                 }
2078
2079                 (void) hfs_lock(VTOC(hfsmp->hfs_catalog_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2080                 retval = hfs_fsync(hfsmp->hfs_catalog_vp, MNT_WAIT, 0, p);
2081                 hfs_unlock(VTOC(hfsmp->hfs_catalog_vp));
2082                 if (retval && !force)
2083                         goto err_exit;
2084
2085                 (void) hfs_lock(VTOC(hfsmp->hfs_extents_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2086                 retval = hfs_fsync(hfsmp->hfs_extents_vp, MNT_WAIT, 0, p);
2087                 hfs_unlock(VTOC(hfsmp->hfs_extents_vp));
2088                 if (retval && !force)
2089                         goto err_exit;
2090
2091                 if (hfsmp->hfs_allocation_vp) {
2092                         (void) hfs_lock(VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2093                         retval = hfs_fsync(hfsmp->hfs_allocation_vp, MNT_WAIT, 0, p);
2094                         hfs_unlock(VTOC(hfsmp->hfs_allocation_vp));
2095                         if (retval && !force)
2096                                 goto err_exit;
2097                 }
2098
                     /* Unlike the files above, hfc_filevp is fsync'ed without taking a cnode lock here. */
2099                 if (hfsmp->hfc_filevp && vnode_issystem(hfsmp->hfc_filevp)) {
2100                         retval = hfs_fsync(hfsmp->hfc_filevp, MNT_WAIT, 0, p);
2101                         if (retval && !force)
2102                                 goto err_exit;
2103                 }
2104
2105                 /* If runtime corruption was detected, indicate that the volume
2106                  * was not unmounted cleanly.
2107                  */
2108                 if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
2109                         HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
2110                 } else {
2111                         HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask;
2112                 }
2113
2114                 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
2115                         int i;
2116                         u_int32_t min_start = hfsmp->totalBlocks;
2117
2118                         // set the nextAllocation pointer to the smallest free block number
2119                         // we've seen so on the next mount we won't rescan unnecessarily
2120                         lck_spin_lock(&hfsmp->vcbFreeExtLock);
2121                         for(i=0; i < (int)hfsmp->vcbFreeExtCnt; i++) {
2122                                 if (hfsmp->vcbFreeExt[i].startBlock < min_start) {
2123                                         min_start = hfsmp->vcbFreeExt[i].startBlock;
2124                                 }
2125                         }
2126                         lck_spin_unlock(&hfsmp->vcbFreeExtLock);
                             /* nextAllocation is only ever moved down here, never up. */
2127                         if (min_start < hfsmp->nextAllocation) {
2128                                 hfsmp->nextAllocation = min_start;
2129                         }
2130                 }
2131
2132                 retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
2133                 if (retval) {
                             /* The header flush failed, so take the "unmounted" bit back out of the in-memory attributes. */
2134                         HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
2135                         if (!force)
2136                                 goto err_exit;  /* could not flush everything */
2137                 }
2138
2139                 if (started_tr) {
2140                     hfs_end_transaction(hfsmp);
2141                     started_tr = 0;
2142                 }
2143         }
2144
2145         if (hfsmp->jnl) {
2146                 hfs_journal_flush(hfsmp, FALSE);
2147         }
2148
2149         /*
2150          *      Invalidate our caches and release metadata vnodes
2151          */
2152         (void) hfsUnmount(hfsmp, p);
2153
2154 #if CONFIG_HFS_STD
2155         if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) {
2156                 (void) hfs_relconverter(hfsmp->hfs_encoding);
2157         }
2158 #endif
2159
2160         // XXXdbg
2161         if (hfsmp->jnl) {
2162             journal_close(hfsmp->jnl);
2163             hfsmp->jnl = NULL;
2164         }
2165
2166         VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context);
2167
             /* Only a journal vnode distinct from the device vnode is closed and released here. */
2168         if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
2169             vnode_clearmountedon(hfsmp->jvp);
                 /* NOTE(review): this retval is never returned; the function returns 0 below. */
2170             retval = VNOP_CLOSE(hfsmp->jvp,
2171                                hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE,
2172                                vfs_context_kernel());
2173             vnode_put(hfsmp->jvp);
2174             hfsmp->jvp = NULL;
2175         }
2176         // XXXdbg
2177
2178         /*
2179          * Last chance to dump unreferenced system files.
2180          */
2181         (void) vflush(mp, NULLVP, FORCECLOSE);
2182
2183 #if HFS_SPARSE_DEV
2184         /* Drop our reference on the backing fs (if any). */
2185         if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) {
2186                 struct vnode * tmpvp;
2187
2188                 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
2189                 tmpvp = hfsmp->hfs_backingfs_rootvp;
2190                 hfsmp->hfs_backingfs_rootvp = NULLVP;
2191                 vnode_rele(tmpvp);
2192         }
2193 #endif /* HFS_SPARSE_DEV */
2194
2195         vnode_rele(hfsmp->hfs_devvp);
2196
2197         hfs_locks_destroy(hfsmp);
2198         hfs_delete_chash(hfsmp);
2199         hfs_idhash_destroy(hfsmp);
             /* hfsmp is freed here; nothing after this line may reference it. */
2200         FREE(hfsmp, M_HFSMNT);
2201
2202         return (0);
2203
             /*
              * Reached only from the metadata flush section of a non-forced unmount.
              * Just the open transaction is ended; a syncer thread call or summary
              * table that was released earlier in this function is not re-created.
              */
2204   err_exit:
2205         if (started_tr) {
2206                 hfs_end_transaction(hfsmp);
2207         }
2208         return retval;
2209 }
2210
2211
2212 /*
2213  * Return the root of a filesystem.
2214  */
2215 static int
2216 hfs_vfs_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context)
2217 {
2218         return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1, 0);
2219 }
2220
2221
2222 /*
2223  * Do operations associated with quotas
2224  */
2225 #if !QUOTA
2226 static int
2227 hfs_quotactl(__unused struct mount *mp, __unused int cmds, __unused uid_t uid, __unused caddr_t datap, __unused vfs_context_t context)
2228 {
2229         return (ENOTSUP);
2230 }
2231 #else
2232 static int
2233 hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t context)
2234 {
2235         struct proc *p = vfs_context_proc(context);
2236         int cmd, type, error;
2237
2238         if (uid == ~0U)
2239                 uid = kauth_cred_getuid(vfs_context_ucred(context));
2240         cmd = cmds >> SUBCMDSHIFT;
2241
2242         switch (cmd) {
2243         case Q_SYNC:
2244         case Q_QUOTASTAT:
2245                 break;
2246         case Q_GETQUOTA:
2247                 if (uid == kauth_cred_getuid(vfs_context_ucred(context)))
2248                         break;
2249                 /* fall through */
2250         default:
2251                 if ( (error = vfs_context_suser(context)) )
2252                         return (error);
2253         }
2254
2255         type = cmds & SUBCMDMASK;
2256         if ((u_int)type >= MAXQUOTAS)
2257                 return (EINVAL);
2258         if (vfs_busy(mp, LK_NOWAIT))
2259                 return (0);
2260
2261         switch (cmd) {
2262
2263         case Q_QUOTAON:
2264                 error = hfs_quotaon(p, mp, type, datap);
2265                 break;
2266
2267         case Q_QUOTAOFF:
2268                 error = hfs_quotaoff(p, mp, type);
2269                 break;
2270
2271         case Q_SETQUOTA:
2272                 error = hfs_setquota(mp, uid, type, datap);
2273                 break;
2274
2275         case Q_SETUSE:
2276                 error = hfs_setuse(mp, uid, type, datap);
2277                 break;
2278
2279         case Q_GETQUOTA:
2280                 error = hfs_getquota(mp, uid, type, datap);
2281                 break;
2282
2283         case Q_SYNC:
2284                 error = hfs_qsync(mp);
2285                 break;
2286
2287         case Q_QUOTASTAT:
2288                 error = hfs_quotastat(mp, type, datap);
2289                 break;
2290
2291         default:
2292                 error = EINVAL;
2293                 break;
2294         }
2295         vfs_unbusy(mp);
2296
2297         return (error);
2298 }
2299 #endif /* QUOTA */
2300
2301 /* Subtype is composite of bits */
2302 #define HFS_SUBTYPE_JOURNALED      0x01
2303 #define HFS_SUBTYPE_CASESENSITIVE  0x02
2304 /* bits 2 - 6 reserved */
2305 #define HFS_SUBTYPE_STANDARDHFS    0x80
2306
2307 /*
2308  * Get file system statistics.
2309  */
2310 int
2311 hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, __unused vfs_context_t context)
2312 {
2313         ExtendedVCB *vcb = VFSTOVCB(mp);
2314         struct hfsmount *hfsmp = VFSTOHFS(mp);
2315         u_int32_t freeCNIDs;
2316         u_int16_t subtype = 0;
2317
2318         freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)vcb->vcbNxtCNID;
2319
2320         sbp->f_bsize = (u_int32_t)vcb->blockSize;
2321         sbp->f_iosize = (size_t)cluster_max_io_size(mp, 0);
2322         sbp->f_blocks = (u_int64_t)((u_int32_t)vcb->totalBlocks);
2323         sbp->f_bfree = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 0));
2324         sbp->f_bavail = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 1));
2325         sbp->f_files = (u_int64_t)((u_int32_t )(vcb->totalBlocks - 2));  /* max files is constrained by total blocks */
2326         sbp->f_ffree = (u_int64_t)((u_int32_t )(MIN(freeCNIDs, sbp->f_bavail)));
2327
2328         /*
2329          * Subtypes (flavors) for HFS
2330          *   0:   Mac OS Extended
2331          *   1:   Mac OS Extended (Journaled)
2332          *   2:   Mac OS Extended (Case Sensitive)
2333          *   3:   Mac OS Extended (Case Sensitive, Journaled)
2334          *   4 - 127:   Reserved
2335          * 128:   Mac OS Standard
2336          *
2337          */
2338         if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) {
2339                 /* HFS+ & variants */
2340                 if (hfsmp->jnl) {
2341                         subtype |= HFS_SUBTYPE_JOURNALED;
2342                 }
2343                 if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) {
2344                         subtype |= HFS_SUBTYPE_CASESENSITIVE;
2345                 }
2346         }
2347 #if CONFIG_HFS_STD
2348         else {
2349                 /* HFS standard */
2350                 subtype = HFS_SUBTYPE_STANDARDHFS;
2351         }
2352 #endif
2353         sbp->f_fssubtype = subtype;
2354
2355         return (0);
2356 }
2357
2358
2359 //
2360 // XXXdbg -- this is a callback to be used by the journal to
2361 //           get meta data blocks flushed out to disk.
2362 //
2363 // XXXdbg -- be smarter and don't flush *every* block on each
2364 //           call.  try to only flush some so we don't wind up
2365 //           being too synchronous.
2366 //
2367 __private_extern__
2368 void
2369 hfs_sync_metadata(void *arg)
2370 {
2371         struct mount *mp = (struct mount *)arg;
2372         struct hfsmount *hfsmp;
2373         ExtendedVCB *vcb;
2374         buf_t   bp;
2375         int  retval;
2376         daddr64_t priIDSector;
2377         hfsmp = VFSTOHFS(mp);
2378         vcb = HFSTOVCB(hfsmp);
2379
2380         // now make sure the super block is flushed
2381         priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
2382                                   HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));
2383
2384         retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
2385                         HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
2386                         hfsmp->hfs_physical_block_size, NOCRED, &bp);
2387         if ((retval != 0 ) && (retval != ENXIO)) {
2388                 printf("hfs_sync_metadata: can't read volume header at %d! (retval 0x%x)\n",
2389                        (int)priIDSector, retval);
2390         }
2391
2392         if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) {
2393             buf_bwrite(bp);
2394         } else if (bp) {
2395             buf_brelse(bp);
2396         }
2397
2398         // the alternate super block...
2399         // XXXdbg - we probably don't need to do this each and every time.
2400         //          hfs_btreeio.c:FlushAlternate() should flag when it was
2401         //          written...
2402         if (hfsmp->hfs_alt_id_sector) {
2403                 retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
2404                                 HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
2405                                 hfsmp->hfs_physical_block_size, NOCRED, &bp);
2406                 if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) {
2407                     buf_bwrite(bp);
2408                 } else if (bp) {
2409                     buf_brelse(bp);
2410                 }
2411         }
2412 }
2413
2414
2415 struct hfs_sync_cargs {
2416         kauth_cred_t cred;
2417         struct proc  *p;
2418         int    waitfor;
2419         int    error;
2420 };
2421
2422
2423 static int
2424 hfs_sync_callback(struct vnode *vp, void *cargs)
2425 {
2426         struct cnode *cp;
2427         struct hfs_sync_cargs *args;
2428         int error;
2429
2430         args = (struct hfs_sync_cargs *)cargs;
2431
2432         if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) {
2433                 return (VNODE_RETURNED);
2434         }
2435         cp = VTOC(vp);
2436
2437         if ((cp->c_flag & C_MODIFIED) ||
2438             (cp->c_touch_acctime | cp->c_touch_chgtime | cp->c_touch_modtime) ||
2439             vnode_hasdirtyblks(vp)) {
2440                 error = hfs_fsync(vp, args->waitfor, 0, args->p);
2441
2442                 if (error)
2443                         args->error = error;
2444         }
2445         hfs_unlock(cp);
2446         return (VNODE_RETURNED);
2447 }
2448
2449
2450
2451 /*
2452  * Go through the disk queues to initiate sandbagged IO;
2453  * go through the inodes to write those that have been modified;
2454  * initiate the writing of the super block if it has been modified.
2455  *
2456  * Note: we are always called with the filesystem marked `MPBUSY'.
2457  */
2458 int
2459 hfs_sync(struct mount *mp, int waitfor, vfs_context_t context)
2460 {
2461         struct proc *p = vfs_context_proc(context);
2462         struct cnode *cp;
2463         struct hfsmount *hfsmp;
2464         ExtendedVCB *vcb;
2465         struct vnode *meta_vp[4];
2466         int i;
2467         int error, allerror = 0;
2468         struct hfs_sync_cargs args;
2469
2470         hfsmp = VFSTOHFS(mp);
2471
2472         /*
2473          * hfs_changefs might be manipulating vnodes so back off
2474          */
2475         if (hfsmp->hfs_flags & HFS_IN_CHANGEFS)
2476                 return (0);
2477
2478         if (hfsmp->hfs_flags & HFS_READ_ONLY)
2479                 return (EROFS);
2480
2481         /* skip over frozen volumes */
2482         if (!lck_rw_try_lock_shared(&hfsmp->hfs_insync))
2483                 return 0;
2484
2485         args.cred = kauth_cred_get();
2486         args.waitfor = waitfor;
2487         args.p = p;
2488         args.error = 0;
2489         /*
2490          * hfs_sync_callback will be called for each vnode
2491          * hung off of this mount point... the vnode will be
2492          * properly referenced and unreferenced around the callback
2493          */
2494         vnode_iterate(mp, 0, hfs_sync_callback, (void *)&args);
2495
2496         if (args.error)
2497                 allerror = args.error;
2498
2499         vcb = HFSTOVCB(hfsmp);
2500
2501         meta_vp[0] = vcb->extentsRefNum;
2502         meta_vp[1] = vcb->catalogRefNum;
2503         meta_vp[2] = vcb->allocationsRefNum;  /* This is NULL for standard HFS */
2504         meta_vp[3] = hfsmp->hfs_attribute_vp; /* Optional file */
2505
2506         /* Now sync our three metadata files */
2507         for (i = 0; i < 4; ++i) {
2508                 struct vnode *btvp;
2509
2510                 btvp = meta_vp[i];;
2511                 if ((btvp==0) || (vnode_mount(btvp) != mp))
2512                         continue;
2513
2514                 /* XXX use hfs_systemfile_lock instead ? */
2515                 (void) hfs_lock(VTOC(btvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2516                 cp = VTOC(btvp);
2517
2518                 if (((cp->c_flag &  C_MODIFIED) == 0) &&
2519                     (cp->c_touch_acctime == 0) &&
2520                     (cp->c_touch_chgtime == 0) &&
2521                     (cp->c_touch_modtime == 0) &&
2522                     vnode_hasdirtyblks(btvp) == 0) {
2523                         hfs_unlock(VTOC(btvp));
2524                         continue;
2525                 }
2526                 error = vnode_get(btvp);
2527                 if (error) {
2528                         hfs_unlock(VTOC(btvp));
2529                         continue;
2530                 }
2531                 if ((error = hfs_fsync(btvp, waitfor, 0, p)))
2532                         allerror = error;
2533
2534                 hfs_unlock(cp);
2535                 vnode_put(btvp);
2536         };
2537
2538
2539 #if CONFIG_HFS_STD
2540         /*
2541          * Force stale file system control information to be flushed.
2542          */
2543         if (vcb->vcbSigWord == kHFSSigWord) {
2544                 if ((error = VNOP_FSYNC(hfsmp->hfs_devvp, waitfor, context))) {
2545                         allerror = error;
2546                 }
2547         }
2548 #endif
2549
2550 #if QUOTA
2551         hfs_qsync(mp);
2552 #endif /* QUOTA */
2553
2554         hfs_hotfilesync(hfsmp, vfs_context_kernel());
2555
2556         /*
2557          * Write back modified superblock.
2558          */
2559         if (IsVCBDirty(vcb)) {
2560                 error = hfs_flushvolumeheader(hfsmp, waitfor, 0);
2561                 if (error)
2562                         allerror = error;
2563         }
2564
2565         if (hfsmp->jnl) {
2566             hfs_journal_flush(hfsmp, FALSE);
2567         }
2568
2569         lck_rw_unlock_shared(&hfsmp->hfs_insync);
2570         return (allerror);
2571 }
2572
2573
2574 /*
2575  * File handle to vnode
2576  *
2577  * Have to be really careful about stale file handles:
2578  * - check that the cnode id is valid
2579  * - call hfs_vget() to get the locked cnode
2580  * - check for an unallocated cnode (i_mode == 0)
2581  * - check that the given client host has export rights and return
2582  *   those rights via. exflagsp and credanonp
2583  */
2584 static int
2585 hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, __unused vfs_context_t context)
2586 {
2587         struct hfsfid *hfsfhp;
2588         struct vnode *nvp;
2589         int result;
2590
2591         *vpp = NULL;
2592         hfsfhp = (struct hfsfid *)fhp;
2593
2594         if (fhlen < (int)sizeof(struct hfsfid))
2595                 return (EINVAL);
2596
2597         result = hfs_vget(VFSTOHFS(mp), ntohl(hfsfhp->hfsfid_cnid), &nvp, 0, 0);
2598         if (result) {
2599                 if (result == ENOENT)
2600                         result = ESTALE;
2601                 return result;
2602         }
2603
2604         /*
2605          * We used to use the create time as the gen id of the file handle,
2606          * but it is not static enough because it can change at any point
2607          * via system calls.  We still don't have another volume ID or other
2608          * unique identifier to use for a generation ID across reboots that
2609          * persists until the file is removed.  Using only the CNID exposes
2610          * us to the potential wrap-around case, but as of 2/2008, it would take
2611          * over 2 months to wrap around if the machine did nothing but allocate
2612          * CNIDs.  Using some kind of wrap counter would only be effective if
2613          * each file had the wrap counter associated with it.  For now,
2614          * we use only the CNID to identify the file as it's good enough.
2615          */
2616
2617         *vpp = nvp;
2618
2619         hfs_unlock(VTOC(nvp));
2620         return (0);
2621 }
2622
2623
2624 /*
2625  * Vnode pointer to File handle
2626  */
2627 /* ARGSUSED */
2628 static int
2629 hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, __unused vfs_context_t context)
2630 {
2631         struct cnode *cp;
2632         struct hfsfid *hfsfhp;
2633
2634         if (ISHFS(VTOVCB(vp)))
2635                 return (ENOTSUP);       /* hfs standard is not exportable */
2636
2637         if (*fhlenp < (int)sizeof(struct hfsfid))
2638                 return (EOVERFLOW);
2639
2640         cp = VTOC(vp);
2641         hfsfhp = (struct hfsfid *)fhp;
2642         /* only the CNID is used to identify the file now */
2643         hfsfhp->hfsfid_cnid = htonl(cp->c_fileid);
2644         hfsfhp->hfsfid_gen = htonl(cp->c_fileid);
2645         *fhlenp = sizeof(struct hfsfid);
2646
2647         return (0);
2648 }
2649
2650
2651 /*
2652  * Initialize HFS filesystems, done only once per boot.
2653  *
2654  * HFS is not a kext-based file system.  This makes it difficult to find
2655  * out when the last HFS file system was unmounted and call hfs_uninit()
2656  * to deallocate data structures allocated in hfs_init().  Therefore we
2657  * never deallocate memory allocated by lock attribute and group initializations
2658  * in this function.
2659  */
2660 static int
2661 hfs_init(__unused struct vfsconf *vfsp)
2662 {
2663         static int done = 0;
2664
2665         if (done)
2666                 return (0);
2667         done = 1;
2668         hfs_chashinit();
2669         hfs_converterinit();
2670
2671         BTReserveSetup();
2672
2673         hfs_lock_attr    = lck_attr_alloc_init();
2674         hfs_group_attr   = lck_grp_attr_alloc_init();
2675         hfs_mutex_group  = lck_grp_alloc_init("hfs-mutex", hfs_group_attr);
2676         hfs_rwlock_group = lck_grp_alloc_init("hfs-rwlock", hfs_group_attr);
2677         hfs_spinlock_group = lck_grp_alloc_init("hfs-spinlock", hfs_group_attr);
2678
2679 #if HFS_COMPRESSION
2680         decmpfs_init();
2681 #endif
2682
2683         return (0);
2684 }
2685
2686
2687 /*
2688  * Destroy all locks, mutexes and spinlocks in hfsmp on unmount or failed mount
2689  */
2690 static void
2691 hfs_locks_destroy(struct hfsmount *hfsmp)
2692 {
2693
2694         lck_mtx_destroy(&hfsmp->hfs_mutex, hfs_mutex_group);
2695         lck_mtx_destroy(&hfsmp->hfc_mutex, hfs_mutex_group);
2696         lck_rw_destroy(&hfsmp->hfs_global_lock, hfs_rwlock_group);
2697         lck_rw_destroy(&hfsmp->hfs_insync, hfs_rwlock_group);
2698         lck_spin_destroy(&hfsmp->vcbFreeExtLock, hfs_spinlock_group);
2699
2700         return;
2701 }
2702
2703
2704 static int
2705 hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp)
2706 {
2707         struct hfsmount * hfsmp;
2708         char fstypename[MFSNAMELEN];
2709
2710         if (vp == NULL)
2711                 return (EINVAL);
2712
2713         if (!vnode_isvroot(vp))
2714                 return (EINVAL);
2715
2716         vnode_vfsname(vp, fstypename);
2717         if (strncmp(fstypename, "hfs", sizeof(fstypename)) != 0)
2718                 return (EINVAL);
2719
2720         hfsmp = VTOHFS(vp);
2721
2722         if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
2723                 return (EINVAL);
2724
2725         *hfsmpp = hfsmp;
2726
2727         return (0);
2728 }
2729
2730 // XXXdbg
2731 #include <sys/filedesc.h>
2732
2733 /*
2734  * HFS filesystem related variables.
2735  */
2736 int
2737 hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp,
2738                         user_addr_t newp, size_t newlen, vfs_context_t context)
2739 {
2740         struct proc *p = vfs_context_proc(context);
2741         int error;
2742         struct hfsmount *hfsmp;
2743
2744         /* all sysctl names at this level are terminal */
2745
2746         if (name[0] == HFS_ENCODINGBIAS) {
2747                 int bias;
2748
2749                 bias = hfs_getencodingbias();
2750                 error = sysctl_int(oldp, oldlenp, newp, newlen, &bias);
2751                 if (error == 0 && newp)
2752                         hfs_setencodingbias(bias);
2753                 return (error);
2754
2755         } else if (name[0] == HFS_EXTEND_FS) {
2756                 u_int64_t  newsize;
2757                 vnode_t vp = vfs_context_cwd(context);
2758
2759                 if (newp == USER_ADDR_NULL || vp == NULLVP)
2760                         return (EINVAL);
2761                 if ((error = hfs_getmountpoint(vp, &hfsmp)))
2762                         return (error);
2763                 error = sysctl_quad(oldp, oldlenp, newp, newlen, (quad_t *)&newsize);
2764                 if (error)
2765                         return (error);
2766
2767                 error = hfs_extendfs(hfsmp, newsize, context);
2768                 return (error);
2769
2770         } else if (name[0] == HFS_ENCODINGHINT) {
2771                 size_t bufsize;
2772                 size_t bytes;
2773                 u_int32_t hint;
2774                 u_int16_t *unicode_name = NULL;
2775                 char *filename = NULL;
2776
2777                 if ((newlen <= 0) || (newlen > MAXPATHLEN))
2778                         return (EINVAL);
2779
2780                 bufsize = MAX(newlen * 3, MAXPATHLEN);
2781                 MALLOC(filename, char *, newlen, M_TEMP, M_WAITOK);
2782                 if (filename == NULL) {
2783                         error = ENOMEM;
2784                         goto encodinghint_exit;
2785                 }
2786                 MALLOC(unicode_name, u_int16_t *, bufsize, M_TEMP, M_WAITOK);
2787                 if (filename == NULL) {
2788                         error = ENOMEM;
2789                         goto encodinghint_exit;
2790                 }
2791
2792                 error = copyin(newp, (caddr_t)filename, newlen);
2793                 if (error == 0) {
2794                         error = utf8_decodestr((u_int8_t *)filename, newlen - 1, unicode_name,
2795                                                &bytes, bufsize, 0, UTF_DECOMPOSED);
2796                         if (error == 0) {
2797                                 hint = hfs_pickencoding(unicode_name, bytes / 2);
2798                                 error = sysctl_int(oldp, oldlenp, USER_ADDR_NULL, 0, (int32_t *)&hint);
2799                         }
2800                 }
2801
2802 encodinghint_exit:
2803                 if (unicode_name)
2804                         FREE(unicode_name, M_TEMP);
2805                 if (filename)
2806                         FREE(filename, M_TEMP);
2807                 return (error);
2808
2809         } else if (name[0] == HFS_ENABLE_JOURNALING) {
2810                 // make the file system journaled...
2811                 vnode_t vp = vfs_context_cwd(context);
2812                 vnode_t jvp;
2813                 ExtendedVCB *vcb;
2814                 struct cat_attr jnl_attr;
2815             struct cat_attr     jinfo_attr;
2816                 struct cat_fork jnl_fork;
2817                 struct cat_fork jinfo_fork;
2818                 buf_t jib_buf;
2819                 uint64_t jib_blkno;
2820                 uint32_t tmpblkno;
2821                 uint64_t journal_byte_offset;
2822                 uint64_t journal_size;
2823                 vnode_t jib_vp = NULLVP;
2824                 struct JournalInfoBlock local_jib;
2825                 int err = 0;
2826                 void *jnl = NULL;
2827                 int lockflags;
2828
2829                 /* Only root can enable journaling */
2830                 if (!kauth_cred_issuser(kauth_cred_get())) {
2831                         return (EPERM);
2832                 }
2833                 if (vp == NULLVP)
2834                         return EINVAL;
2835
2836                 hfsmp = VTOHFS(vp);
2837                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2838                         return EROFS;
2839                 }
2840                 if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) {
2841                         printf("hfs: can't make a plain hfs volume journaled.\n");
2842                         return EINVAL;
2843                 }
2844
2845                 if (hfsmp->jnl) {
2846                     printf("hfs: volume @ mp %p is already journaled!\n", vnode_mount(vp));
2847                     return EAGAIN;
2848                 }
2849                 vcb = HFSTOVCB(hfsmp);
2850
2851                 /* Set up local copies of the initialization info */
2852                 tmpblkno = (uint32_t) name[1];
2853                 jib_blkno = (uint64_t) tmpblkno;
2854                 journal_byte_offset = (uint64_t) name[2];
2855                 journal_byte_offset *= hfsmp->blockSize;
2856                 journal_byte_offset += hfsmp->hfsPlusIOPosOffset;
2857                 journal_size = (uint64_t)((unsigned)name[3]);
2858
2859                 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2860                 if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 ||
2861                         BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) {
2862
2863                         printf("hfs: volume has a btree w/non-contiguous nodes.  can not enable journaling.\n");
2864                         hfs_systemfile_unlock(hfsmp, lockflags);
2865                         return EINVAL;
2866                 }
2867                 hfs_systemfile_unlock(hfsmp, lockflags);
2868
2869                 // make sure these both exist!
2870                 if (   GetFileInfo(vcb, kHFSRootFolderID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0
2871                         || GetFileInfo(vcb, kHFSRootFolderID, ".journal", &jnl_attr, &jnl_fork) == 0) {
2872
2873                         return EINVAL;
2874                 }
2875
2876                 /*
2877                  * At this point, we have a copy of the metadata that lives in the catalog for the
2878                  * journal info block.  Compare that the journal info block's single extent matches
2879                  * that which was passed into this sysctl.
2880                  *
2881                  * If it is different, deny the journal enable call.
2882                  */
2883                 if (jinfo_fork.cf_blocks > 1) {
2884                         /* too many blocks */
2885                         return EINVAL;
2886                 }
2887
2888                 if (jinfo_fork.cf_extents[0].startBlock != jib_blkno) {
2889                         /* Wrong block */
2890                         return EINVAL;
2891                 }
2892
2893                 /*
2894                  * We want to immediately purge the vnode for the JIB.
2895                  *
2896                  * Because it was written to from userland, there's probably
2897                  * a vnode somewhere in the vnode cache (possibly with UBC backed blocks).
2898                  * So we bring the vnode into core, then immediately do whatever
2899                  * we can to flush/vclean it out.  This is because those blocks will be
2900                  * interpreted as user data, which may be treated separately on some platforms
2901                  * than metadata.  If the vnode is gone, then there cannot be backing blocks
2902                  * in the UBC.
2903                  */
2904                 if (hfs_vget (hfsmp, jinfo_attr.ca_fileid, &jib_vp, 1, 0)) {
2905                         return EINVAL;
2906                 }
2907                 /*
2908                  * Now we have a vnode for the JIB. recycle it. Because we hold an iocount
2909                  * on the vnode, we'll just mark it for termination when the last iocount
2910                  * (hopefully ours), is dropped.
2911                  */
2912                 vnode_recycle (jib_vp);
2913                 err = vnode_put (jib_vp);
2914                 if (err) {
2915                         return EINVAL;
2916                 }
2917
2918                 /* Initialize the local copy of the JIB (just like hfs.util) */
2919                 memset (&local_jib, 'Z', sizeof(struct JournalInfoBlock));
2920                 local_jib.flags = SWAP_BE32(kJIJournalInFSMask);
2921                 /* Note that the JIB's offset is in bytes */
2922                 local_jib.offset = SWAP_BE64(journal_byte_offset);
2923                 local_jib.size = SWAP_BE64(journal_size);
2924
2925                 /*
2926                  * Now write out the local JIB.  This essentially overwrites the userland
2927                  * copy of the JIB.  Read it as BLK_META to treat it as a metadata read/write.
2928                  */
2929                 jib_buf = buf_getblk (hfsmp->hfs_devvp,
2930                                 jib_blkno * (hfsmp->blockSize / hfsmp->hfs_logical_block_size),
2931                                 hfsmp->blockSize, 0, 0, BLK_META);
2932                 char* buf_ptr = (char*) buf_dataptr (jib_buf);
2933
2934                 /* Zero out the portion of the block that won't contain JIB data */
2935                 memset (buf_ptr, 0, hfsmp->blockSize);
2936
2937                 bcopy(&local_jib, buf_ptr, sizeof(local_jib));
2938                 if (buf_bwrite (jib_buf)) {
2939                         return EIO;
2940                 }
2941
2942                 /* Force a flush track cache */
2943                 (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
2944
2945
2946                 /* Now proceed with full volume sync */
2947                 hfs_sync(hfsmp->hfs_mp, MNT_WAIT, context);
2948
2949                 printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
2950                            (off_t)name[2], (off_t)name[3]);
2951
2952                 //
2953                 // XXXdbg - note that currently (Sept, 08) hfs_util does not support
2954                 //          enabling the journal on a separate device so it is safe
2955                 //          to just copy hfs_devvp here.  If hfs_util gets the ability
2956                 //          to dynamically enable the journal on a separate device then
2957                 //          we will have to do the same thing as hfs_early_journal_init()
2958                 //          to locate and open the journal device.
2959                 //
2960                 jvp = hfsmp->hfs_devvp;
2961                 jnl = journal_create(jvp, journal_byte_offset, journal_size,
2962                                                          hfsmp->hfs_devvp,
2963                                                          hfsmp->hfs_logical_block_size,
2964                                                          0,
2965                                                          0,
2966                                                          hfs_sync_metadata, hfsmp->hfs_mp,
2967                                                          hfsmp->hfs_mp);
2968
2969                 /*
2970                  * Set up the trim callback function so that we can add
2971                  * recently freed extents to the free extent cache once
2972                  * the transaction that freed them is written to the
2973                  * journal on disk.
2974                  */
2975                 if (jnl)
2976                         journal_trim_set_callback(jnl, hfs_trim_callback, hfsmp);
2977
2978                 if (jnl == NULL) {
2979                         printf("hfs: FAILED to create the journal!\n");
2980                         if (jvp && jvp != hfsmp->hfs_devvp) {
2981                                 vnode_clearmountedon(jvp);
2982                                 VNOP_CLOSE(jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel());
2983                         }
2984                         jvp = NULL;
2985
2986                         return EINVAL;
2987                 }
2988
2989                 hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
2990
2991                 /*
2992                  * Flush all dirty metadata buffers.
2993                  */
2994                 buf_flushdirtyblks(hfsmp->hfs_devvp, TRUE, 0, "hfs_sysctl");
2995                 buf_flushdirtyblks(hfsmp->hfs_extents_vp, TRUE, 0, "hfs_sysctl");
2996                 buf_flushdirtyblks(hfsmp->hfs_catalog_vp, TRUE, 0, "hfs_sysctl");
2997                 buf_flushdirtyblks(hfsmp->hfs_allocation_vp, TRUE, 0, "hfs_sysctl");
2998                 if (hfsmp->hfs_attribute_vp)
2999                         buf_flushdirtyblks(hfsmp->hfs_attribute_vp, TRUE, 0, "hfs_sysctl");
3000
3001                 HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1];
3002                 HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask;
3003                 hfsmp->jvp = jvp;
3004                 hfsmp->jnl = jnl;
3005
3006                 // save this off for the hack-y check in hfs_remove()
3007                 hfsmp->jnl_start        = (u_int32_t)name[2];
3008                 hfsmp->jnl_size         = (off_t)((unsigned)name[3]);
3009                 hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid;
3010                 hfsmp->hfs_jnlfileid    = jnl_attr.ca_fileid;
3011
3012                 vfs_setflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
3013
3014                 hfs_unlock_global (hfsmp);
3015                 hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
3016
3017                 {
3018                         fsid_t fsid;
3019
3020                         fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev;
3021                         fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp));
3022                         vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL);
3023                 }
3024                 return 0;
3025         } else if (name[0] == HFS_DISABLE_JOURNALING) {
3026                 // clear the journaling bit
3027                 vnode_t vp = vfs_context_cwd(context);
3028
3029                 /* Only root can disable journaling */
3030                 if (!kauth_cred_issuser(kauth_cred_get())) {
3031                         return (EPERM);
3032                 }
3033                 if (vp == NULLVP)
3034                         return EINVAL;
3035
3036                 hfsmp = VTOHFS(vp);
3037
3038                 /*
3039                  * Disabling journaling is disallowed on volumes with directory hard links
3040                  * because we have not tested the relevant code path.
3041                  */
3042                 if (hfsmp->hfs_private_attr[DIR_HARDLINKS].ca_entries != 0){
3043                         printf("hfs: cannot disable journaling on volumes with directory hardlinks\n");
3044                         return EPERM;
3045                 }
3046
3047                 printf("hfs: disabling journaling for mount @ %p\n", vnode_mount(vp));
3048
3049                 hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
3050
3051                 // Lights out for you buddy!
3052                 journal_close(hfsmp->jnl);
3053                 hfsmp->jnl = NULL;
3054
3055                 if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
3056                         vnode_clearmountedon(hfsmp->jvp);
3057                         VNOP_CLOSE(hfsmp->jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel());
3058                         vnode_put(hfsmp->jvp);
3059                 }
3060                 hfsmp->jvp = NULL;
3061                 vfs_clearflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
3062                 hfsmp->jnl_start        = 0;
3063                 hfsmp->hfs_jnlinfoblkid = 0;
3064                 hfsmp->hfs_jnlfileid    = 0;
3065
3066                 HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask;
3067
3068                 hfs_unlock_global (hfsmp);
3069
3070                 hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
3071
3072                 {
3073                         fsid_t fsid;
3074
3075                         fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev;
3076                         fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp));
3077                         vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL);
3078                 }
3079                 return 0;
3080         } else if (name[0] == HFS_GET_JOURNAL_INFO) {
3081                 vnode_t vp = vfs_context_cwd(context);
3082                 off_t jnl_start, jnl_size;
3083
3084                 if (vp == NULLVP)
3085                         return EINVAL;
3086
3087                 /* 64-bit processes won't work with this sysctl -- can't fit a pointer into an int! */
3088                 if (proc_is64bit(current_proc()))
3089                         return EINVAL;
3090
3091                 hfsmp = VTOHFS(vp);
3092             if (hfsmp->jnl == NULL) {
3093                         jnl_start = 0;
3094                         jnl_size  = 0;
3095             } else {
3096                         jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
3097                         jnl_size  = (off_t)hfsmp->jnl_size;
3098             }
3099
3100             if ((error = copyout((caddr_t)&jnl_start, CAST_USER_ADDR_T(name[1]), sizeof(off_t))) != 0) {
3101                         return error;
3102                 }
3103             if ((error = copyout((caddr_t)&jnl_size, CAST_USER_ADDR_T(name[2]), sizeof(off_t))) != 0) {
3104                         return error;
3105                 }
3106
3107                 return 0;
3108         } else if (name[0] == HFS_SET_PKG_EXTENSIONS) {
3109
3110             return set_package_extensions_table((user_addr_t)((unsigned)name[1]), name[2], name[3]);
3111
3112         } else if (name[0] == VFS_CTL_QUERY) {
3113         struct sysctl_req *req;
3114         union union_vfsidctl vc;
3115         struct mount *mp;
3116             struct vfsquery vq;
3117
3118                 req = CAST_DOWN(struct sysctl_req *, oldp);     /* we're new style vfs sysctl. */
3119
3120         error = SYSCTL_IN(req, &vc, proc_is64bit(p)? sizeof(vc.vc64):sizeof(vc.vc32));
3121                 if (error) return (error);
3122
3123                 mp = vfs_getvfs(&vc.vc32.vc_fsid); /* works for 32 and 64 */
3124         if (mp == NULL) return (ENOENT);
3125
3126                 hfsmp = VFSTOHFS(mp);
3127                 bzero(&vq, sizeof(vq));
3128                 vq.vq_flags = hfsmp->hfs_notification_conditions;
3129                 return SYSCTL_OUT(req, &vq, sizeof(vq));;
3130         } else if (name[0] == HFS_REPLAY_JOURNAL) {
3131                 vnode_t devvp = NULL;
3132                 int device_fd;
3133                 if (namelen != 2) {
3134                         return (EINVAL);
3135                 }
3136                 device_fd = name[1];
3137                 error = file_vnode(device_fd, &devvp);
3138                 if (error) {
3139                         return error;
3140                 }
3141                 error = vnode_getwithref(devvp);
3142                 if (error) {
3143                         file_drop(device_fd);
3144                         return error;
3145                 }
3146                 error = hfs_journal_replay(devvp, context);
3147                 file_drop(device_fd);
3148                 vnode_put(devvp);
3149                 return error;
3150         } else if (name[0] == HFS_ENABLE_RESIZE_DEBUG) {
3151                 hfs_resize_debug = 1;
3152                 printf ("hfs_sysctl: Enabled volume resize debugging.\n");
3153                 return 0;
3154         }
3155
3156         return (ENOTSUP);
3157 }
3158
3159 /*
3160  * hfs_vfs_vget is not static since it is used in hfs_readwrite.c to support
3161  * the build_path ioctl.  We use it to leverage the code below that updates
3162  * the origin list cache if necessary
3163  */
3164
3165 int
3166 hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_context_t context)
3167 {
3168         int error;
3169         int lockflags;
3170         struct hfsmount *hfsmp;
3171
3172         hfsmp = VFSTOHFS(mp);
3173
3174         error = hfs_vget(hfsmp, (cnid_t)ino, vpp, 1, 0);
3175         if (error)
3176                 return (error);
3177
3178         /*
3179          * ADLs may need to have their origin state updated
3180          * since build_path needs a valid parent.  The same is true
3181          * for hardlinked files as well.  There isn't a race window here
3182          * in re-acquiring the cnode lock since we aren't pulling any data
3183          * out of the cnode; instead, we're going to the catalog.
3184          */
3185         if ((VTOC(*vpp)->c_flag & C_HARDLINK) &&
3186             (hfs_lock(VTOC(*vpp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) == 0)) {
3187                 cnode_t *cp = VTOC(*vpp);
3188                 struct cat_desc cdesc;
3189
3190                 if (!hfs_haslinkorigin(cp)) {
3191                         lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
3192                         error = cat_findname(hfsmp, (cnid_t)ino, &cdesc);
3193                         hfs_systemfile_unlock(hfsmp, lockflags);
3194                         if (error == 0) {
3195                                 if ((cdesc.cd_parentcnid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) &&
3196                                         (cdesc.cd_parentcnid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid)) {
3197                                         hfs_savelinkorigin(cp, cdesc.cd_parentcnid);
3198                                 }
3199                                 cat_releasedesc(&cdesc);
3200                         }
3201                 }
3202                 hfs_unlock(cp);
3203         }
3204         return (0);
3205 }
3206
3207
3208 /*
3209  * Look up an HFS object by ID.
3210  *
3211  * The object is returned with an iocount reference and the cnode locked.
3212  *
3213  * If the object is a file then it will represent the data fork.
3214  */
3215 int
3216 hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock, int allow_deleted)
3217 {
3218         struct vnode *vp = NULLVP;
3219         struct cat_desc cndesc;
3220         struct cat_attr cnattr;
3221         struct cat_fork cnfork;
3222         u_int32_t linkref = 0;
3223         int error;
3224
3225         /* Check for cnids that should't be exported. */
3226         if ((cnid < kHFSFirstUserCatalogNodeID) &&
3227             (cnid != kHFSRootFolderID && cnid != kHFSRootParentID)) {
3228                 return (ENOENT);
3229         }
3230         /* Don't export our private directories. */
3231         if (cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid ||
3232             cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) {
3233                 return (ENOENT);
3234         }
3235         /*
3236          * Check the hash first
3237          */
3238         vp = hfs_chash_getvnode(hfsmp, cnid, 0, skiplock, allow_deleted);
3239         if (vp) {
3240                 *vpp = vp;
3241                 return(0);
3242         }
3243
3244         bzero(&cndesc, sizeof(cndesc));
3245         bzero(&cnattr, sizeof(cnattr));
3246         bzero(&cnfork, sizeof(cnfork));
3247
3248         /*
3249          * Not in hash, lookup in catalog
3250          */
3251         if (cnid == kHFSRootParentID) {
3252                 static char hfs_rootname[] = "/";
3253
3254                 cndesc.cd_nameptr = (const u_int8_t *)&hfs_rootname[0];
3255                 cndesc.cd_namelen = 1;
3256                 cndesc.cd_parentcnid = kHFSRootParentID;
3257                 cndesc.cd_cnid = kHFSRootFolderID;
3258                 cndesc.cd_flags = CD_ISDIR;
3259
3260                 cnattr.ca_fileid = kHFSRootFolderID;
3261                 cnattr.ca_linkcount = 1;
3262                 cnattr.ca_entries = 1;
3263                 cnattr.ca_dircount = 1;
3264                 cnattr.ca_mode = (S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO);
3265         } else {
3266                 int lockflags;
3267                 cnid_t pid;
3268                 const char *nameptr;
3269
3270                 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
3271                 error = cat_idlookup(hfsmp, cnid, 0, 0, &cndesc, &cnattr, &cnfork);
3272                 hfs_systemfile_unlock(hfsmp, lockflags);
3273
3274                 if (error) {
3275                         *vpp = NULL;
3276                         return (error);
3277                 }
3278
3279                 /*
3280                  * Check for a raw hardlink inode and save its linkref.
3281                  */
3282                 pid = cndesc.cd_parentcnid;
3283                 nameptr = (const char *)cndesc.cd_nameptr;
3284
3285                 if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) &&
3286                     (bcmp(nameptr, HFS_INODE_PREFIX, HFS_INODE_PREFIX_LEN) == 0)) {
3287                         linkref = strtoul(&nameptr[HFS_INODE_PREFIX_LEN], NULL, 10);
3288
3289                 } else if ((pid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) &&
3290                            (bcmp(nameptr, HFS_DIRINODE_PREFIX, HFS_DIRINODE_PREFIX_LEN) == 0)) {
3291                         linkref = strtoul(&nameptr[HFS_DIRINODE_PREFIX_LEN], NULL, 10);
3292
3293                 } else if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) &&
3294                            (bcmp(nameptr, HFS_DELETE_PREFIX, HFS_DELETE_PREFIX_LEN) == 0)) {
3295                         *vpp = NULL;
3296                         cat_releasedesc(&cndesc);
3297                         return (ENOENT);  /* open unlinked file */
3298                 }
3299         }
3300
3301         /*
3302          * Finish initializing cnode descriptor for hardlinks.
3303          *
3304          * We need a valid name and parent for reverse lookups.
3305          */
3306         if (linkref) {
3307                 cnid_t lastid;
3308                 struct cat_desc linkdesc;
3309                 int linkerr = 0;
3310
3311                 cnattr.ca_linkref = linkref;
3312                 bzero (&linkdesc, sizeof (linkdesc));
3313
3314                 /*
3315                  * If the caller supplied the raw inode value, then we don't know exactly
3316                  * which hardlink they wanted. It's likely that they acquired the raw inode
3317                  * value BEFORE the item became a hardlink, in which case, they probably
3318                  * want the oldest link.  So request the oldest link from the catalog.
3319                  *
3320                  * Unfortunately, this requires that we iterate through all N hardlinks. On the plus
3321                  * side, since we know that we want the last linkID, we can also have this one
3322                  * call give us back the name of the last ID, since it's going to have it in-hand...
3323                  */
3324                 linkerr = hfs_lookup_lastlink (hfsmp, linkref, &lastid, &linkdesc);
3325                 if ((linkerr == 0) && (lastid != 0)) {
3326                         /*
3327                          * Release any lingering buffers attached to our local descriptor.
3328                          * Then copy the name and other business into the cndesc
3329                          */
3330                         cat_releasedesc (&cndesc);
3331                         bcopy (&linkdesc, &cndesc, sizeof(linkdesc));
3332                 }
3333                 /* If it failed, the linkref code will just use whatever it had in-hand below. */
3334         }
3335
3336         if (linkref) {
3337                 int newvnode_flags = 0;
3338
3339                 error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr,
3340                                                                 &cnfork, &vp, &newvnode_flags);
3341                 if (error == 0) {
3342                         VTOC(vp)->c_flag |= C_HARDLINK;
3343                         vnode_setmultipath(vp);
3344                 }
3345         } else {
3346                 struct componentname cn;
3347                 int newvnode_flags = 0;
3348
3349                 /* Supply hfs_getnewvnode with a component name. */
3350                 MALLOC_ZONE(cn.cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
3351                 cn.cn_nameiop = LOOKUP;
3352                 cn.cn_flags = ISLASTCN | HASBUF;
3353                 cn.cn_context = NULL;
3354                 cn.cn_pnlen = MAXPATHLEN;
3355                 cn.cn_nameptr = cn.cn_pnbuf;
3356                 cn.cn_namelen = cndesc.cd_namelen;
3357                 cn.cn_hash = 0;
3358                 cn.cn_consume = 0;
3359                 bcopy(cndesc.cd_nameptr, cn.cn_nameptr, cndesc.cd_namelen + 1);
3360
3361                 error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr,
3362                                                                 &cnfork, &vp, &newvnode_flags);
3363
3364                 if (error == 0 && (VTOC(vp)->c_flag & C_HARDLINK)) {
3365                         hfs_savelinkorigin(VTOC(vp), cndesc.cd_parentcnid);
3366                 }
3367                 FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI);
3368         }
3369         cat_releasedesc(&cndesc);
3370
3371         *vpp = vp;
3372         if (vp && skiplock) {
3373                 hfs_unlock(VTOC(vp));
3374         }
3375         return (error);
3376 }
3377
3378
3379 /*
3380  * Flush out all the files in a filesystem.
3381  */
3382 static int
3383 #if QUOTA
3384 hfs_flushfiles(struct mount *mp, int flags, struct proc *p)
3385 #else
3386 hfs_flushfiles(struct mount *mp, int flags, __unused struct proc *p)
3387 #endif /* QUOTA */
3388 {
3389         struct hfsmount *hfsmp;
3390         struct vnode *skipvp = NULLVP;
3391         int error;
3392         int accounted_root_usecounts;
3393 #if QUOTA
3394         int i;
3395 #endif
3396
3397         hfsmp = VFSTOHFS(mp);
3398
3399         accounted_root_usecounts = 0;
3400 #if QUOTA
3401         /*
3402          * The open quota files have an indirect reference on
3403          * the root directory vnode.  We must account for this
3404          * extra reference when doing the intial vflush.
3405          */
3406         if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) {
3407                 /* Find out how many quota files we have open. */
3408                 for (i = 0; i < MAXQUOTAS; i++) {
3409                         if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP)
3410                                 ++accounted_root_usecounts;
3411                 }
3412         }
3413 #endif /* QUOTA */
3414         if (hfsmp->hfs_flags & HFS_CS) {
3415                 ++accounted_root_usecounts;
3416         }
3417
3418         if (accounted_root_usecounts > 0) {
3419                 /* Obtain the root vnode so we can skip over it. */
3420                 skipvp = hfs_chash_getvnode(hfsmp, kHFSRootFolderID, 0, 0, 0);
3421         }
3422
3423         error = vflush(mp, skipvp, SKIPSYSTEM | SKIPSWAP | flags);
3424         if (error != 0)
3425                 return(error);
3426
3427         error = vflush(mp, skipvp, SKIPSYSTEM | flags);
3428
3429         if (skipvp) {
3430                 /*
3431                  * See if there are additional references on the
3432                  * root vp besides the ones obtained from the open
3433                  * quota files and CoreStorage.
3434                  */
3435                 if ((error == 0) &&
3436                     (vnode_isinuse(skipvp,  accounted_root_usecounts))) {
3437                         error = EBUSY;  /* root directory is still open */
3438                 }
3439                 hfs_unlock(VTOC(skipvp));
3440                 /* release the iocount from the hfs_chash_getvnode call above. */
3441                 vnode_put(skipvp);
3442         }
3443         if (error && (flags & FORCECLOSE) == 0)
3444                 return (error);
3445
3446 #if QUOTA
3447         if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) {
3448                 for (i = 0; i < MAXQUOTAS; i++) {
3449                         if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP)
3450                                 continue;
3451                         hfs_quotaoff(p, mp, i);
3452                 }
3453         }
3454 #endif /* QUOTA */
3455         if (hfsmp->hfs_flags & HFS_CS) {
3456                 error = VNOP_IOCTL(hfsmp->hfs_devvp, _DKIOCCSSETFSVNODE,
3457                     (caddr_t)NULL, 0, vfs_context_kernel());
3458                 vnode_rele(skipvp);
3459                 printf("hfs_flushfiles: VNOP_IOCTL(_DKIOCCSSETFSVNODE) failed with error code %d\n",
3460                     error);
3461
3462                 /* ignore the CS error and proceed with the unmount. */
3463                 error = 0;
3464         }
3465         if (skipvp) {
3466                 error = vflush(mp, NULLVP, SKIPSYSTEM | flags);
3467         }
3468
3469         return (error);
3470 }
3471
3472 /*
3473  * Update volume encoding bitmap (HFS Plus only)
3474  *
3475  * Mark a legacy text encoding as in-use (as needed)
3476  * in the volume header of this HFS+ filesystem.
3477  */
3478 __private_extern__
3479 void
3480 hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding)
3481 {
3482 #define  kIndexMacUkrainian     48  /* MacUkrainian encoding is 152 */
3483 #define  kIndexMacFarsi         49  /* MacFarsi encoding is 140 */
3484
3485         u_int32_t       index;
3486
3487         switch (encoding) {
3488         case kTextEncodingMacUkrainian:
3489                 index = kIndexMacUkrainian;
3490                 break;
3491         case kTextEncodingMacFarsi:
3492                 index = kIndexMacFarsi;
3493                 break;
3494         default:
3495                 index = encoding;
3496                 break;
3497         }
3498
3499         /* Only mark the encoding as in-use if it wasn't already set */
3500         if (index < 64 && (hfsmp->encodingsBitmap & (u_int64_t)(1ULL << index)) == 0) {
3501                 hfs_lock_mount (hfsmp);
3502                 hfsmp->encodingsBitmap |= (u_int64_t)(1ULL << index);
3503                 MarkVCBDirty(hfsmp);
3504                 hfs_unlock_mount(hfsmp);
3505         }
3506 }
3507
3508 /*
3509  * Update volume stats
3510  *
3511  * On journal volumes this will cause a volume header flush
3512  */
3513 int
3514 hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot)
3515 {
3516         struct timeval tv;
3517
3518         microtime(&tv);
3519
3520         hfs_lock_mount (hfsmp);
3521
3522         MarkVCBDirty(hfsmp);
3523         hfsmp->hfs_mtime = tv.tv_sec;
3524
3525         switch (op) {
3526         case VOL_UPDATE:
3527                 break;
3528         case VOL_MKDIR:
3529                 if (hfsmp->hfs_dircount != 0xFFFFFFFF)
3530                         ++hfsmp->hfs_dircount;
3531                 if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF)
3532                         ++hfsmp->vcbNmRtDirs;
3533                 break;
3534         case VOL_RMDIR:
3535                 if (hfsmp->hfs_dircount != 0)
3536                         --hfsmp->hfs_dircount;
3537                 if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF)
3538                         --hfsmp->vcbNmRtDirs;
3539                 break;
3540         case VOL_MKFILE:
3541                 if (hfsmp->hfs_filecount != 0xFFFFFFFF)
3542                         ++hfsmp->hfs_filecount;
3543                 if (inroot && hfsmp->vcbNmFls != 0xFFFF)
3544                         ++hfsmp->vcbNmFls;
3545                 break;
3546         case VOL_RMFILE:
3547                 if (hfsmp->hfs_filecount != 0)
3548                         --hfsmp->hfs_filecount;
3549                 if (inroot && hfsmp->vcbNmFls != 0xFFFF)
3550                         --hfsmp->vcbNmFls;
3551                 break;
3552         }
3553
3554         hfs_unlock_mount (hfsmp);
3555
3556         if (hfsmp->jnl) {
3557                 hfs_flushvolumeheader(hfsmp, 0, 0);
3558         }
3559
3560         return (0);
3561 }
3562
3563
#if CONFIG_HFS_STD
/*
 * Write the in-memory volume state back to the on-disk Master Directory
 * Block (MDB).  Called from hfs_flushvolumeheader() for mounts flagged
 * HFS_STANDARD.
 *
 * hfsmp    - mount whose MDB is rewritten.
 * waitfor  - MNT_WAIT writes the primary MDB with VNOP_BWRITE and returns
 *            its result; any other value queues it with buf_bawrite().
 * altflush - when non-zero, the updated MDB is also copied to the
 *            alternate MDB sector and written out (errors there are
 *            ignored).
 *
 * Returns the buf_bread() error if the primary MDB cannot be read.
 * NOTE(review): when waitfor is not MNT_WAIT the value returned is the
 * result of the volume-name conversion, since nothing overwrites it --
 * confirm that callers expect this.
 */
static int
hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush)
{
	ExtendedVCB *vcb = HFSTOVCB(hfsmp);
	HFSMasterDirectoryBlock *mdb;
	struct filefork *fork;
	struct buf *bp = NULL;
	ByteCount vn_len;
	int sector_size = hfsmp->hfs_logical_block_size;
	int err;
	int i;

	err = (int)buf_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sector_size), sector_size, NOCRED, &bp);
	if (err != 0) {
		if (bp != NULL) {
			buf_brelse(bp);
		}
		return err;
	}

	hfs_lock_mount(hfsmp);

	mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sector_size));

	/* Scalar volume fields (dates are converted to local HFS time). */
	mdb->drCrDate   = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->hfs_itime)));
	mdb->drLsMod    = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbLsMod)));
	mdb->drAtrb     = SWAP_BE16 (vcb->vcbAtrb);
	mdb->drNmFls    = SWAP_BE16 (vcb->vcbNmFls);
	mdb->drAllocPtr = SWAP_BE16 (vcb->nextAllocation);
	mdb->drClpSiz   = SWAP_BE32 (vcb->vcbClpSiz);
	mdb->drNxtCNID  = SWAP_BE32 (vcb->vcbNxtCNID);
	mdb->drFreeBks  = SWAP_BE16 (vcb->freeBlocks);

	/* Volume name: try the volume's encoding first, then MacRoman. */
	vn_len = strlen((char *)vcb->vcbVN);
	err = utf8_to_hfs(vcb, vn_len, vcb->vcbVN, mdb->drVN);
	if (err != 0) {
		err = utf8_to_mac_roman(vn_len, vcb->vcbVN, mdb->drVN);
	}

	mdb->drVolBkUp  = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbVolBkUp)));
	mdb->drWrCnt    = SWAP_BE32 (vcb->vcbWrCnt);
	mdb->drNmRtDirs = SWAP_BE16 (vcb->vcbNmRtDirs);
	mdb->drFilCnt   = SWAP_BE32 (vcb->vcbFilCnt);
	mdb->drDirCnt   = SWAP_BE32 (vcb->vcbDirCnt);

	bcopy(vcb->vcbFndrInfo, mdb->drFndrInfo, sizeof(mdb->drFndrInfo));

	/* Extents overflow file: first three extents, size and clump size. */
	fork = VTOF(vcb->extentsRefNum);
	for (i = 0; i < 3; i++) {
		mdb->drXTExtRec[i].startBlock = SWAP_BE16 (fork->ff_extents[i].startBlock);
		mdb->drXTExtRec[i].blockCount = SWAP_BE16 (fork->ff_extents[i].blockCount);
	}
	mdb->drXTFlSize = SWAP_BE32 (fork->ff_blocks * vcb->blockSize);
	mdb->drXTClpSiz = SWAP_BE32 (fork->ff_clumpsize);
	FTOC(fork)->c_flag &= ~C_MODIFIED;

	/* Catalog file: same layout as above. */
	fork = VTOF(vcb->catalogRefNum);
	for (i = 0; i < 3; i++) {
		mdb->drCTExtRec[i].startBlock = SWAP_BE16 (fork->ff_extents[i].startBlock);
		mdb->drCTExtRec[i].blockCount = SWAP_BE16 (fork->ff_extents[i].blockCount);
	}
	mdb->drCTFlSize = SWAP_BE32 (fork->ff_blocks * vcb->blockSize);
	mdb->drCTClpSiz = SWAP_BE32 (fork->ff_clumpsize);
	FTOC(fork)->c_flag &= ~C_MODIFIED;

	MarkVCBClean( vcb );

	hfs_unlock_mount(hfsmp);

	/* If requested, flush out the alternate MDB */
	if (altflush) {
		struct buf *altbuf = NULL;

		if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sector_size, NOCRED, &altbuf) == 0) {
			bcopy(mdb, (char *)buf_dataptr(altbuf) + HFS_ALT_OFFSET(sector_size), kMDBSize);

			(void) VNOP_BWRITE(altbuf);
		} else if (altbuf != NULL) {
			buf_brelse(altbuf);
		}
	}

	/* Synchronous write only when the caller asked to wait. */
	if (waitfor == MNT_WAIT) {
		err = VNOP_BWRITE(bp);
	} else {
		buf_bawrite(bp);
	}

	return (err);
}
#endif
3657
3658 /*
3659  *  Flush any dirty in-memory mount data to the on-disk
3660  *  volume header.
3661  *
3662  *  Note: the on-disk volume signature is intentionally
3663  *  not flushed since the on-disk "H+" and "HX" signatures
3664  *  are always stored in-memory as "H+".
3665  */
3666 int
3667 hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush)
3668 {
3669         ExtendedVCB *vcb = HFSTOVCB(hfsmp);
3670         struct filefork *fp;
3671         HFSPlusVolumeHeader *volumeHeader, *altVH;
3672         int retval;
3673         struct buf *bp, *alt_bp;
3674         int i;
3675         daddr64_t priIDSector;
3676         int critical;
3677         u_int16_t  signature;
3678         u_int16_t  hfsversion;
3679
3680         if (hfsmp->hfs_flags & HFS_READ_ONLY) {
3681                 return(0);
3682         }
3683 #if CONFIG_HFS_STD
3684         if (hfsmp->hfs_flags & HFS_STANDARD) {
3685                 return hfs_flushMDB(hfsmp, waitfor, altflush);
3686         }
3687 #endif
3688         critical = altflush;
3689         priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
3690                                   HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));
3691
3692         if (hfs_start_transaction(hfsmp) != 0) {
3693             return EINVAL;
3694         }
3695
3696         bp = NULL;
3697         alt_bp = NULL;
3698
3699         retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
3700                         HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
3701                         hfsmp->hfs_physical_block_size, NOCRED, &bp);
3702         if (retval) {
3703                 printf("hfs: err %d reading VH blk (vol=%s)\n", retval, vcb->vcbVN);
3704                 goto err_exit;
3705         }
3706
3707         volumeHeader = (HFSPlusVolumeHeader *)((char *)buf_dataptr(bp) +
3708                         HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
3709
3710         /*
3711          * Sanity check what we just read.  If it's bad, try the alternate
3712          * instead.
3713          */
3714         signature = SWAP_BE16 (volumeHeader->signature);
3715         hfsversion   = SWAP_BE16 (volumeHeader->version);
3716         if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) ||
3717             (hfsversion < kHFSPlusVersion) || (hfsversion > 100) ||
3718             (SWAP_BE32 (volumeHeader->blockSize) != vcb->blockSize)) {
3719                 printf("hfs: corrupt VH on %s, sig 0x%04x, ver %d, blksize %d%s\n",
3720                       vcb->vcbVN, signature, hfsversion,
3721                       SWAP_BE32 (volumeHeader->blockSize),
3722                       hfsmp->hfs_alt_id_sector ? "; trying alternate" : "");
3723                 hfs_mark_volume_inconsistent(hfsmp);
3724
3725                 if (hfsmp->hfs_alt_id_sector) {
3726                         retval = buf_meta_bread(hfsmp->hfs_devvp,
3727                             HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
3728                             hfsmp->hfs_physical_block_size, NOCRED, &alt_bp);
3729                         if (retval) {
3730                                 printf("hfs: err %d reading alternate VH (%s)\n", retval, vcb->vcbVN);
3731                                 goto err_exit;
3732                         }
3733
3734                         altVH = (HFSPlusVolumeHeader *)((char *)buf_dataptr(alt_bp) +
3735                                 HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size));
3736                         signature = SWAP_BE16(altVH->signature);
3737                         hfsversion = SWAP_BE16(altVH->version);
3738
3739                         if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) ||
3740                             (hfsversion < kHFSPlusVersion) || (kHFSPlusVersion > 100) ||
3741                             (SWAP_BE32(altVH->blockSize) != vcb->blockSize)) {
3742                                 printf("hfs: corrupt alternate VH on %s, sig 0x%04x, ver %d, blksize %d\n",
3743                                     vcb->vcbVN, signature, hfsversion,
3744                                     SWAP_BE32(altVH->blockSize));
3745                                 retval = EIO;
3746                                 goto err_exit;
3747                         }
3748
3749                         /* The alternate is plausible, so use it. */
3750                         bcopy(altVH, volumeHeader, kMDBSize);
3751                         buf_brelse(alt_bp);
3752                         alt_bp = NULL;
3753                 } else {
3754                         /* No alternate VH, nothing more we can do. */
3755                         retval = EIO;
3756                         goto err_exit;
3757                 }
3758         }
3759
3760         if (hfsmp->jnl) {
3761                 journal_modify_block_start(hfsmp->jnl, bp);
3762         }
3763
3764         /*
3765          * For embedded HFS+ volumes, update create date if it changed
3766          * (ie from a setattrlist call)
3767          */
3768         if ((vcb->hfsPlusIOPosOffset != 0) &&
3769             (SWAP_BE32 (volumeHeader->createDate) != vcb->localCreateDate)) {
3770                 struct buf *bp2;
3771                 HFSMasterDirectoryBlock *mdb;
3772
3773                 retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
3774                                 HFS_PHYSBLK_ROUNDDOWN(HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size), hfsmp->hfs_log_per_phys),
3775                                 hfsmp->hfs_physical_block_size, NOCRED, &bp2);
3776                 if (retval) {
3777                         if (bp2)
3778                                 buf_brelse(bp2);
3779                         retval = 0;
3780                 } else {
3781                         mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp2) +
3782                                 HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
3783
3784                         if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate )
3785                           {
3786                                 if (hfsmp->jnl) {
3787                                     journal_modify_block_start(hfsmp->jnl, bp2);
3788                                 }
3789
3790                                 mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate);       /* pick up the new create date */
3791
3792                                 if (hfsmp->jnl) {
3793                                         journal_modify_block_end(hfsmp->jnl, bp2, NULL, NULL);
3794                                 } else {
3795                                         (void) VNOP_BWRITE(bp2);                /* write out the changes */
3796                                 }
3797                           }
3798                         else
3799                           {
3800                                 buf_brelse(bp2);                                                /* just release it */
3801                           }
3802                   }
3803         }
3804
3805         hfs_lock_mount (hfsmp);
3806
3807         /* Note: only update the lower 16 bits worth of attributes */
3808         volumeHeader->attributes       = SWAP_BE32 (vcb->vcbAtrb);
3809         volumeHeader->journalInfoBlock = SWAP_BE32 (vcb->vcbJinfoBlock);
3810         if (hfsmp->jnl) {
3811                 volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion);
3812         } else {
3813                 volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion);
3814         }
3815         volumeHeader->createDate        = SWAP_BE32 (vcb->localCreateDate);  /* volume create date is in local time */
3816         volumeHeader->modifyDate        = SWAP_BE32 (to_hfs_time(vcb->vcbLsMod));
3817         volumeHeader->backupDate        = SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp));
3818         volumeHeader->fileCount         = SWAP_BE32 (vcb->vcbFilCnt);
3819         volumeHeader->folderCount       = SWAP_BE32 (vcb->vcbDirCnt);
3820         volumeHeader->totalBlocks       = SWAP_BE32 (vcb->totalBlocks);
3821         volumeHeader->freeBlocks        = SWAP_BE32 (vcb->freeBlocks);
3822         volumeHeader->nextAllocation    = SWAP_BE32 (vcb->nextAllocation);
3823         volumeHeader->rsrcClumpSize     = SWAP_BE32 (vcb->vcbClpSiz);
3824         volumeHeader->dataClumpSize     = SWAP_BE32 (vcb->vcbClpSiz);
3825         volumeHeader->nextCatalogID     = SWAP_BE32 (vcb->vcbNxtCNID);
3826         volumeHeader->writeCount        = SWAP_BE32 (vcb->vcbWrCnt);
3827         volumeHeader->encodingsBitmap   = SWAP_BE64 (vcb->encodingsBitmap);
3828
3829         if (bcmp(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)) != 0) {
3830                 bcopy(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo));
3831                 critical = 1;
3832         }
3833
3834         /*
3835          * System files are only dirty when altflush is set.
3836          */
3837         if (altflush == 0) {
3838                 goto done;
3839         }
3840
3841         /* Sync Extents over-flow file meta data */
3842         fp = VTOF(vcb->extentsRefNum);
3843         if (FTOC(fp)->c_flag & C_MODIFIED) {
3844                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3845                         volumeHeader->extentsFile.extents[i].startBlock =
3846                                 SWAP_BE32 (fp->ff_extents[i].startBlock);
3847                         volumeHeader->extentsFile.extents[i].blockCount =
3848                                 SWAP_BE32 (fp->ff_extents[i].blockCount);
3849                 }
3850                 volumeHeader->extentsFile.logicalSize = SWAP_BE64 (fp->ff_size);
3851                 volumeHeader->extentsFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3852                 volumeHeader->extentsFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3853                 FTOC(fp)->c_flag &= ~C_MODIFIED;
3854         }
3855
3856         /* Sync Catalog file meta data */
3857         fp = VTOF(vcb->catalogRefNum);
3858         if (FTOC(fp)->c_flag & C_MODIFIED) {
3859                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3860                         volumeHeader->catalogFile.extents[i].startBlock =
3861                                 SWAP_BE32 (fp->ff_extents[i].startBlock);
3862                         volumeHeader->catalogFile.extents[i].blockCount =
3863                                 SWAP_BE32 (fp->ff_extents[i].blockCount);
3864                 }
3865                 volumeHeader->catalogFile.logicalSize = SWAP_BE64 (fp->ff_size);
3866                 volumeHeader->catalogFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3867                 volumeHeader->catalogFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3868                 FTOC(fp)->c_flag &= ~C_MODIFIED;
3869         }
3870
3871         /* Sync Allocation file meta data */
3872         fp = VTOF(vcb->allocationsRefNum);
3873         if (FTOC(fp)->c_flag & C_MODIFIED) {
3874                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3875                         volumeHeader->allocationFile.extents[i].startBlock =
3876                                 SWAP_BE32 (fp->ff_extents[i].startBlock);
3877                         volumeHeader->allocationFile.extents[i].blockCount =
3878                                 SWAP_BE32 (fp->ff_extents[i].blockCount);
3879                 }
3880                 volumeHeader->allocationFile.logicalSize = SWAP_BE64 (fp->ff_size);
3881                 volumeHeader->allocationFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3882                 volumeHeader->allocationFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3883                 FTOC(fp)->c_flag &= ~C_MODIFIED;
3884         }
3885
3886         /* Sync Attribute file meta data */
3887         if (hfsmp->hfs_attribute_vp) {
3888                 fp = VTOF(hfsmp->hfs_attribute_vp);
3889                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
3890                         volumeHeader->attributesFile.extents[i].startBlock =
3891                                 SWAP_BE32 (fp->ff_extents[i].startBlock);
3892                         volumeHeader->attributesFile.extents[i].blockCount =
3893                                 SWAP_BE32 (fp->ff_extents[i].blockCount);
3894                 }
3895                 FTOC(fp)->c_flag &= ~C_MODIFIED;
3896                 volumeHeader->attributesFile.logicalSize = SWAP_BE64 (fp->ff_size);
3897                 volumeHeader->attributesFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3898                 volumeHeader->attributesFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3899         }
3900
3901         /* Sync Startup file meta data */
3902         if (hfsmp->hfs_startup_vp) {
3903                 fp = VTOF(hfsmp->hfs_startup_vp);
3904                 if (FTOC(fp)->c_flag & C_MODIFIED) {
3905                         for (i = 0; i < kHFSPlusExtentDensity; i++) {
3906                                 volumeHeader->startupFile.extents[i].startBlock =
3907                                         SWAP_BE32 (fp->ff_extents[i].startBlock);
3908                                 volumeHeader->startupFile.extents[i].blockCount =
3909                                         SWAP_BE32 (fp->ff_extents[i].blockCount);
3910                         }
3911                         volumeHeader->startupFile.logicalSize = SWAP_BE64 (fp->ff_size);
3912                         volumeHeader->startupFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3913                         volumeHeader->startupFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3914                         FTOC(fp)->c_flag &= ~C_MODIFIED;
3915                 }
3916         }
3917
3918 done:
3919         MarkVCBClean(hfsmp);
3920         hfs_unlock_mount (hfsmp);
3921
3922         /* If requested, flush out the alternate volume header */
3923         if (altflush && hfsmp->hfs_alt_id_sector) {
3924                 if (buf_meta_bread(hfsmp->hfs_devvp,
3925                                 HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
3926                                 hfsmp->hfs_physical_block_size, NOCRED, &alt_bp) == 0) {
3927                         if (hfsmp->jnl) {
3928                                 journal_modify_block_start(hfsmp->jnl, alt_bp);
3929                         }
3930
3931                         bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) +
3932                                         HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size),
3933                                         kMDBSize);
3934
3935                         if (hfsmp->jnl) {
3936                                 journal_modify_block_end(hfsmp->jnl, alt_bp, NULL, NULL);
3937                         } else {
3938                                 (void) VNOP_BWRITE(alt_bp);
3939                         }
3940                 } else if (alt_bp)
3941                         buf_brelse(alt_bp);
3942         }
3943
3944         if (hfsmp->jnl) {
3945                 journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL);
3946         } else {
3947                 if (waitfor != MNT_WAIT)
3948                         buf_bawrite(bp);
3949                 else {
3950                     retval = VNOP_BWRITE(bp);
3951                     /* When critical data changes, flush the device cache */
3952                     if (critical && (retval == 0)) {
3953                         (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE,
3954                                          NULL, FWRITE, NULL);
3955                     }
3956                 }
3957         }
3958         hfs_end_transaction(hfsmp);
3959
3960         return (retval);
3961
3962 err_exit:
3963         if (alt_bp)
3964                 buf_brelse(alt_bp);
3965         if (bp)
3966                 buf_brelse(bp);
3967         hfs_end_transaction(hfsmp);
3968         return retval;
3969 }
3970
3971
3972 /*
3973  * Extend a file system.
3974  */
3975 int
3976 hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
3977 {
3978         struct proc *p = vfs_context_proc(context);
3979         kauth_cred_t cred = vfs_context_ucred(context);
3980         struct  vnode *vp;
3981         struct  vnode *devvp;
3982         struct  buf *bp;
3983         struct  filefork *fp = NULL;
3984         ExtendedVCB  *vcb;
3985         struct  cat_fork forkdata;
3986         u_int64_t  oldsize;
3987         u_int64_t  newblkcnt;
3988         u_int64_t  prev_phys_block_count;
3989         u_int32_t  addblks;
3990         u_int64_t  sector_count;
3991         u_int32_t  sector_size;
3992         u_int32_t  phys_sector_size;
3993         u_int32_t  overage_blocks;
3994         daddr64_t  prev_alt_sector;
3995         daddr_t    bitmapblks;
3996         int  lockflags = 0;
3997         int  error;
3998         int64_t oldBitmapSize;
3999         Boolean  usedExtendFileC = false;
4000         int transaction_begun = 0;
4001
4002         devvp = hfsmp->hfs_devvp;
4003         vcb = HFSTOVCB(hfsmp);
4004
4005         /*
4006          * - HFS Plus file systems only.
4007          * - Journaling must be enabled.
4008          * - No embedded volumes.
4009          */
4010         if ((vcb->vcbSigWord == kHFSSigWord) ||
4011              (hfsmp->jnl == NULL) ||
4012              (vcb->hfsPlusIOPosOffset != 0)) {
4013                 return (EPERM);
4014         }
4015         /*
4016          * If extending file system by non-root, then verify
4017          * ownership and check permissions.
4018          */
4019         if (suser(cred, NULL)) {
4020                 error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0, 0);
4021
4022                 if (error)
4023                         return (error);
4024                 error = hfs_owner_rights(hfsmp, VTOC(vp)->c_uid, cred, p, 0);
4025                 if (error == 0) {
4026                         error = hfs_write_access(vp, cred, p, false);
4027                 }
4028                 hfs_unlock(VTOC(vp));
4029                 vnode_put(vp);
4030                 if (error)
4031                         return (error);
4032
4033                 error = vnode_authorize(devvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, context);
4034                 if (error)
4035                         return (error);
4036         }
4037         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sector_size, 0, context)) {
4038                 return (ENXIO);
4039         }
4040         if (sector_size != hfsmp->hfs_logical_block_size) {
4041                 return (ENXIO);
4042         }
4043         if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sector_count, 0, context)) {
4044                 return (ENXIO);
4045         }
4046         if ((sector_size * sector_count) < newsize) {
4047                 printf("hfs_extendfs: not enough space on device (vol=%s)\n", hfsmp->vcbVN);
4048                 return (ENOSPC);
4049         }
4050         error = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_sector_size, 0, context);
4051         if (error) {
4052                 if ((error != ENOTSUP) && (error != ENOTTY)) {
4053                         return (ENXIO);
4054                 }
4055                 /* If ioctl is not supported, force physical and logical sector size to be same */
4056                 phys_sector_size = sector_size;
4057         }
4058         oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
4059
4060         /*
4061          * Validate new size.
4062          */
4063         if ((newsize <= oldsize) || (newsize % sector_size) || (newsize % phys_sector_size)) {
4064                 printf("hfs_extendfs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize);
4065                 return (EINVAL);
4066         }
4067         newblkcnt = newsize / vcb->blockSize;
4068         if (newblkcnt > (u_int64_t)0xFFFFFFFF) {
4069                 printf ("hfs_extendfs: current blockSize=%u too small for newsize=%qu\n", hfsmp->blockSize, newsize);
4070                 return (EOVERFLOW);
4071         }
4072
4073         addblks = newblkcnt - vcb->totalBlocks;
4074
4075         if (hfs_resize_debug) {
4076                 printf ("hfs_extendfs: old: size=%qu, blkcnt=%u\n", oldsize, hfsmp->totalBlocks);
4077                 printf ("hfs_extendfs: new: size=%qu, blkcnt=%u, addblks=%u\n", newsize, (u_int32_t)newblkcnt, addblks);
4078         }
4079         printf("hfs_extendfs: will extend \"%s\" by %d blocks\n", vcb->vcbVN, addblks);
4080
4081         hfs_lock_mount (hfsmp);
4082         if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {
4083                 hfs_unlock_mount(hfsmp);
4084                 error = EALREADY;
4085                 goto out;
4086         }
4087         hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
4088         hfs_unlock_mount (hfsmp);
4089
4090         /* Start with a clean journal. */
4091         hfs_journal_flush(hfsmp, TRUE);
4092
4093         /*
4094          * Enclose changes inside a transaction.
4095          */
4096         if (hfs_start_transaction(hfsmp) != 0) {
4097                 error = EINVAL;
4098                 goto out;
4099         }
4100         transaction_begun = 1;
4101
4102
4103         /* Update the hfsmp fields for the physical information about the device */
4104         prev_phys_block_count = hfsmp->hfs_logical_block_count;
4105         prev_alt_sector = hfsmp->hfs_alt_id_sector;
4106
4107         hfsmp->hfs_logical_block_count = sector_count;
4108         /*
4109          * Note that the new AltVH location must be based on the device's EOF rather than the new
4110          * filesystem's EOF, so we use logical_block_count here rather than newsize.
4111          */
4112         hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / sector_size) +
4113                                   HFS_ALT_SECTOR(sector_size, hfsmp->hfs_logical_block_count);
4114         hfsmp->hfs_logical_bytes = (uint64_t) sector_count * (uint64_t) sector_size;
4115
4116
4117         /*
4118          * Note: we take the attributes lock in case we have an attribute data vnode
4119          * which needs to change size.
4120          */
4121         lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4122         vp = vcb->allocationsRefNum;
4123         fp = VTOF(vp);
4124         bcopy(&fp->ff_data, &forkdata, sizeof(forkdata));
4125
4126         /*
4127          * Calculate additional space required (if any) by allocation bitmap.
4128          */
4129         oldBitmapSize = fp->ff_size;
4130         bitmapblks = roundup((newblkcnt+7) / 8, vcb->vcbVBMIOSize) / vcb->blockSize;
4131         if (bitmapblks > (daddr_t)fp->ff_blocks)
4132                 bitmapblks -= fp->ff_blocks;
4133         else
4134                 bitmapblks = 0;
4135
4136         /*
4137          * The allocation bitmap can contain unused bits that are beyond end of
4138          * current volume's allocation blocks.  Usually they are supposed to be
4139          * zero'ed out but there can be cases where they might be marked as used.
4140          * After extending the file system, those bits can represent valid
4141          * allocation blocks, so we mark all the bits from the end of current
4142          * volume to end of allocation bitmap as "free".
4143          *
4144          * Figure out the number of overage blocks before proceeding though,
4145          * so we don't add more bytes to our I/O than necessary.
4146          * First figure out the total number of blocks representable by the
4147          * end of the bitmap file vs. the total number of blocks in the new FS.
4148          * Then subtract away the number of blocks in the current FS.  This is how much
4149          * we can mark as free right now without having to grow the bitmap file.
4150          */
4151         overage_blocks = fp->ff_blocks * vcb->blockSize * 8;
4152         overage_blocks = MIN (overage_blocks, newblkcnt);
4153         overage_blocks -= vcb->totalBlocks;
4154
4155         BlockMarkFreeUnused(vcb, vcb->totalBlocks, overage_blocks);
4156
4157         if (bitmapblks > 0) {
4158                 daddr64_t blkno;
4159                 daddr_t blkcnt;
4160                 off_t bytesAdded;
4161
4162                 /*
4163                  * Get the bitmap's current size (in allocation blocks) so we know
4164                  * where to start zero filling once the new space is added.  We've
4165                  * got to do this before the bitmap is grown.
4166                  */
4167                 blkno  = (daddr64_t)fp->ff_blocks;
4168
4169                 /*
4170                  * Try to grow the allocation file in the normal way, using allocation
4171                  * blocks already existing in the file system.  This way, we might be
4172                  * able to grow the bitmap contiguously, or at least in the metadata
4173                  * zone.
4174                  */
4175                 error = ExtendFileC(vcb, fp, bitmapblks * vcb->blockSize, 0,
4176                                 kEFAllMask | kEFNoClumpMask | kEFReserveMask
4177                                 | kEFMetadataMask | kEFContigMask, &bytesAdded);
4178
4179                 if (error == 0) {
4180                         usedExtendFileC = true;
4181                 } else {
4182                         /*
4183                          * If the above allocation failed, fall back to allocating the new
4184                          * extent of the bitmap from the space we're going to add.  Since those
4185                          * blocks don't yet belong to the file system, we have to update the
4186                          * extent list directly, and manually adjust the file size.
4187                          */
4188                         bytesAdded = 0;
4189                         error = AddFileExtent(vcb, fp, vcb->totalBlocks, bitmapblks);
4190                         if (error) {
4191                                 printf("hfs_extendfs: error %d adding extents\n", error);
4192                                 goto out;
4193                         }
4194                         fp->ff_blocks += bitmapblks;
4195                         VTOC(vp)->c_blocks = fp->ff_blocks;
4196                         VTOC(vp)->c_flag |= C_MODIFIED;
4197                 }
4198
4199                 /*
4200                  * Update the allocation file's size to include the newly allocated
4201                  * blocks.  Note that ExtendFileC doesn't do this, which is why this
4202                  * statement is outside the above "if" statement.
4203                  */
4204                 fp->ff_size += (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize;
4205
4206                 /*
4207                  * Zero out the new bitmap blocks.
4208                  */
4209                 {
4210
4211                         bp = NULL;
4212                         blkcnt = bitmapblks;
4213                         while (blkcnt > 0) {
4214                                 error = (int)buf_meta_bread(vp, blkno, vcb->blockSize, NOCRED, &bp);
4215                                 if (error) {
4216                                         if (bp) {
4217                                                 buf_brelse(bp);
4218                                         }
4219                                         break;
4220                                 }
4221                                 bzero((char *)buf_dataptr(bp), vcb->blockSize);
4222                                 buf_markaged(bp);
4223                                 error = (int)buf_bwrite(bp);
4224                                 if (error)
4225                                         break;
4226                                 --blkcnt;
4227                                 ++blkno;
4228                         }
4229                 }
4230                 if (error) {
4231                         printf("hfs_extendfs: error %d clearing blocks\n", error);
4232                         goto out;
4233                 }
4234                 /*
4235                  * Mark the new bitmap space as allocated.
4236                  *
4237                  * Note that ExtendFileC will have marked any blocks it allocated, so
4238                  * this is only needed if we used AddFileExtent.  Also note that this
4239                  * has to come *after* the zero filling of new blocks in the case where
4240                  * we used AddFileExtent (since the part of the bitmap we're touching
4241                  * is in those newly allocated blocks).
4242                  */
4243                 if (!usedExtendFileC) {
4244                         error = BlockMarkAllocated(vcb, vcb->totalBlocks, bitmapblks);
4245                         if (error) {
4246                                 printf("hfs_extendfs: error %d setting bitmap\n", error);
4247                                 goto out;
4248                         }
4249                         vcb->freeBlocks -= bitmapblks;
4250                 }
4251         }
4252         /*
4253          * Mark the new alternate VH as allocated.
4254          */
4255         if (vcb->blockSize == 512)
4256                 error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 2, 2);
4257         else
4258                 error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 1, 1);
4259         if (error) {
4260                 printf("hfs_extendfs: error %d setting bitmap (VH)\n", error);
4261                 goto out;
4262         }
4263         /*
4264          * Mark the old alternate VH as free.
4265          */
4266         if (vcb->blockSize == 512)
4267                 (void) BlockMarkFree(vcb, vcb->totalBlocks - 2, 2);
4268         else
4269                 (void) BlockMarkFree(vcb, vcb->totalBlocks - 1, 1);
4270         /*
4271          * Adjust file system variables for new space.
4272          */
4273         vcb->totalBlocks += addblks;
4274         vcb->freeBlocks += addblks;
4275         MarkVCBDirty(vcb);
4276         error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
4277         if (error) {
4278                 printf("hfs_extendfs: couldn't flush volume headers (%d)", error);
4279                 /*
4280                  * Restore to old state.
4281                  */
4282                 if (usedExtendFileC) {
4283                         (void) TruncateFileC(vcb, fp, oldBitmapSize, 0, FORK_IS_RSRC(fp),
4284                                                                  FTOC(fp)->c_fileid, false);
4285                 } else {
4286                         fp->ff_blocks -= bitmapblks;
4287                         fp->ff_size -= (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize;
4288                         /*
4289                          * No need to mark the excess blocks free since those bitmap blocks
4290                          * are no longer part of the bitmap.  But we do need to undo the
4291                          * effect of the "vcb->freeBlocks -= bitmapblks" above.
4292                          */
4293                         vcb->freeBlocks += bitmapblks;
4294                 }
4295                 vcb->totalBlocks -= addblks;
4296                 vcb->freeBlocks -= addblks;
4297                 hfsmp->hfs_logical_block_count = prev_phys_block_count;
4298                 hfsmp->hfs_alt_id_sector = prev_alt_sector;
4299                 MarkVCBDirty(vcb);
4300                 if (vcb->blockSize == 512) {
4301                         if (BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2)) {
4302                                 hfs_mark_volume_inconsistent(hfsmp);
4303                         }
4304                 } else {
4305                         if (BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1)) {
4306                                 hfs_mark_volume_inconsistent(hfsmp);
4307                         }
4308                 }
4309                 goto out;
4310         }
4311         /*
4312          * Invalidate the old alternate volume header.
4313          */
4314         bp = NULL;
4315         if (prev_alt_sector) {
4316                 if (buf_meta_bread(hfsmp->hfs_devvp,
4317                                 HFS_PHYSBLK_ROUNDDOWN(prev_alt_sector, hfsmp->hfs_log_per_phys),
4318                                 hfsmp->hfs_physical_block_size, NOCRED, &bp) == 0) {
4319                         journal_modify_block_start(hfsmp->jnl, bp);
4320
4321                         bzero((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), kMDBSize);
4322
4323                         journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL);
4324                 } else if (bp) {
4325                         buf_brelse(bp);
4326                 }
4327         }
4328
4329         /*
4330          * Update the metadata zone size based on current volume size
4331          */
4332         hfs_metadatazone_init(hfsmp, false);
4333
4334         /*
4335          * Adjust the size of hfsmp->hfs_attrdata_vp
4336          */
4337         if (hfsmp->hfs_attrdata_vp) {
4338                 struct cnode *attr_cp;
4339                 struct filefork *attr_fp;
4340
4341                 if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) {
4342                         attr_cp = VTOC(hfsmp->hfs_attrdata_vp);
4343                         attr_fp = VTOF(hfsmp->hfs_attrdata_vp);
4344
4345                         attr_cp->c_blocks = newblkcnt;
4346                         attr_fp->ff_blocks = newblkcnt;
4347                         attr_fp->ff_extents[0].blockCount = newblkcnt;
4348                         attr_fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize;
4349                         ubc_setsize(hfsmp->hfs_attrdata_vp, attr_fp->ff_size);
4350                         vnode_put(hfsmp->hfs_attrdata_vp);
4351                 }
4352         }
4353
4354         /*
4355          * Update the R/B Tree if necessary.  Since we don't have to drop the systemfile
4356          * locks in the middle of these operations like we do in the truncate case
4357          * where we have to relocate files, we can only update the red-black tree
4358          * if there were actual changes made to the bitmap.  Also, we can't really scan the
4359          * new portion of the bitmap before it has been allocated. The BlockMarkAllocated
4360          * routines are smart enough to avoid the r/b tree if the portion they are manipulating is
4361          * not currently controlled by the tree.
4362          *
4363          * We only update hfsmp->allocLimit if totalBlocks actually increased.
4364          */
4365         if (error == 0) {
4366                 UpdateAllocLimit(hfsmp, hfsmp->totalBlocks);
4367         }
4368
4369         /* Release all locks and sync up journal content before
4370          * checking and extending, if required, the journal
4371          */
4372         if (lockflags) {
4373                 hfs_systemfile_unlock(hfsmp, lockflags);
4374                 lockflags = 0;
4375         }
4376         if (transaction_begun) {
4377                 hfs_end_transaction(hfsmp);
4378                 hfs_journal_flush(hfsmp, TRUE);
4379                 transaction_begun = 0;
4380         }
4381
4382         /* Increase the journal size, if required. */
4383         error = hfs_extend_journal(hfsmp, sector_size, sector_count, context);
4384         if (error) {
4385                 printf ("hfs_extendfs: Could not extend journal size\n");
4386                 goto out_noalloc;
4387         }
4388
4389         /* Log successful extending */
4390         printf("hfs_extendfs: extended \"%s\" to %d blocks (was %d blocks)\n",
4391                hfsmp->vcbVN, hfsmp->totalBlocks, (u_int32_t)(oldsize/hfsmp->blockSize));
4392
4393 out:
4394         if (error && fp) {
4395                 /* Restore allocation fork. */
4396                 bcopy(&forkdata, &fp->ff_data, sizeof(forkdata));
4397                 VTOC(vp)->c_blocks = fp->ff_blocks;
4398
4399         }
4400
4401 out_noalloc:
4402         hfs_lock_mount (hfsmp);
4403         hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS;
4404         hfs_unlock_mount (hfsmp);
4405         if (lockflags) {
4406                 hfs_systemfile_unlock(hfsmp, lockflags);
4407         }
4408         if (transaction_begun) {
4409                 hfs_end_transaction(hfsmp);
4410                 hfs_journal_flush(hfsmp, FALSE);
4411                 /* Just to be sure, sync all data to the disk */
4412                 (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
4413         }
4414         if (error) {
4415                 printf ("hfs_extentfs: failed error=%d on vol=%s\n", MacToVFSError(error), hfsmp->vcbVN);
4416         }
4417
4418         return MacToVFSError(error);
4419 }
4420
/*
 * Smallest volume size, in bytes, that hfs_truncatefs() accepts as a
 * shrink target: 32 MiB.  Kept as a signed 64-bit constant.
 */
#define HFS_MIN_SIZE  (32LL << 20)
4422
4423 /*
4424  * Truncate a file system (while still mounted).
4425  */
4426 int
4427 hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
4428 {
4429         struct  buf *bp = NULL;
4430         u_int64_t oldsize;
4431         u_int32_t newblkcnt;
4432         u_int32_t reclaimblks = 0;
4433         int lockflags = 0;
4434         int transaction_begun = 0;
4435         Boolean updateFreeBlocks = false;
4436         Boolean disable_sparse = false;
4437         int error = 0;
4438
4439         hfs_lock_mount (hfsmp);
4440         if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {
4441                 hfs_unlock_mount (hfsmp);
4442                 return (EALREADY);
4443         }
4444         hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
4445         hfsmp->hfs_resize_blocksmoved = 0;
4446         hfsmp->hfs_resize_totalblocks = 0;
4447         hfsmp->hfs_resize_progress = 0;
4448         hfs_unlock_mount (hfsmp);
4449
4450         /*
4451          * - Journaled HFS Plus volumes only.
4452          * - No embedded volumes.
4453          */
4454         if ((hfsmp->jnl == NULL) ||
4455             (hfsmp->hfsPlusIOPosOffset != 0)) {
4456                 error = EPERM;
4457                 goto out;
4458         }
4459         oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
4460         newblkcnt = newsize / hfsmp->blockSize;
4461         reclaimblks = hfsmp->totalBlocks - newblkcnt;
4462
4463         if (hfs_resize_debug) {
4464                 printf ("hfs_truncatefs: old: size=%qu, blkcnt=%u, freeblks=%u\n", oldsize, hfsmp->totalBlocks, hfs_freeblks(hfsmp, 1));
4465                 printf ("hfs_truncatefs: new: size=%qu, blkcnt=%u, reclaimblks=%u\n", newsize, newblkcnt, reclaimblks);
4466         }
4467
4468         /* Make sure new size is valid. */
4469         if ((newsize < HFS_MIN_SIZE) ||
4470             (newsize >= oldsize) ||
4471             (newsize % hfsmp->hfs_logical_block_size) ||
4472             (newsize % hfsmp->hfs_physical_block_size)) {
4473                 printf ("hfs_truncatefs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize);
4474                 error = EINVAL;
4475                 goto out;
4476         }
4477
4478         /*
4479          * Make sure that the file system has enough free blocks reclaim.
4480          *
4481          * Before resize, the disk is divided into four zones -
4482          *      A. Allocated_Stationary - These are allocated blocks that exist
4483          *         before the new end of disk.  These blocks will not be
4484          *         relocated or modified during resize.
4485          *      B. Free_Stationary - These are free blocks that exist before the
4486          *         new end of disk.  These blocks can be used for any new
4487          *         allocations during resize, including allocation for relocating
4488          *         data from the area of disk being reclaimed.
4489          *      C. Allocated_To-Reclaim - These are allocated blocks that exist
4490          *         beyond the new end of disk.  These blocks need to be reclaimed
4491          *         during resize by allocating equal number of blocks in Free
4492          *         Stationary zone and copying the data.
4493          *      D. Free_To-Reclaim - These are free blocks that exist beyond the
4494          *         new end of disk.  Nothing special needs to be done to reclaim
4495          *         them.
4496          *
4497          * Total number of blocks on the disk before resize:
4498          * ------------------------------------------------
4499          *      Total Blocks = Allocated_Stationary + Free_Stationary +
4500          *                     Allocated_To-Reclaim + Free_To-Reclaim
4501          *
4502          * Total number of blocks that need to be reclaimed:
4503          * ------------------------------------------------
4504          *      Blocks to Reclaim = Allocated_To-Reclaim + Free_To-Reclaim
4505          *
4506          * Note that the check below also makes sure that we have enough space
4507          * to relocate data from Allocated_To-Reclaim to Free_Stationary.
4508          * Therefore we do not need to check total number of blocks to relocate
4509          * later in the code.
4510          *
4511          * The condition below gets converted to:
4512          *
4513          * Allocated To-Reclaim + Free To-Reclaim >= Free Stationary + Free To-Reclaim
4514          *
4515          * which is equivalent to:
4516          *
4517          *              Allocated To-Reclaim >= Free Stationary
4518          */
4519         if (reclaimblks >= hfs_freeblks(hfsmp, 1)) {
4520                 printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1));
4521                 error = ENOSPC;
4522                 goto out;
4523         }
4524
4525         /* Start with a clean journal. */
4526         hfs_journal_flush(hfsmp, TRUE);
4527
4528         if (hfs_start_transaction(hfsmp) != 0) {
4529                 error = EINVAL;
4530                 goto out;
4531         }
4532         transaction_begun = 1;
4533
4534         /* Take the bitmap lock to update the alloc limit field */
4535         lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4536
4537         /*
4538          * Prevent new allocations from using the part we're trying to truncate.
4539          *
4540          * NOTE: allocLimit is set to the allocation block number where the new
4541          * alternate volume header will be.  That way there will be no files to
4542          * interfere with allocating the new alternate volume header, and no files
4543          * in the allocation blocks beyond (i.e. the blocks we're trying to
4544          * truncate away.
4545          *
4546          * Also shrink the red-black tree if needed.
4547          */
4548         if (hfsmp->blockSize == 512) {
4549                 error = UpdateAllocLimit (hfsmp, newblkcnt - 2);
4550         }
4551         else {
4552                 error = UpdateAllocLimit (hfsmp, newblkcnt - 1);
4553         }
4554
4555         /* Sparse devices use first fit allocation which is not ideal
4556          * for volume resize which requires best fit allocation.  If a
4557          * sparse device is being truncated, disable the sparse device
4558          * property temporarily for the duration of resize.  Also reset
4559          * the free extent cache so that it is rebuilt as sorted by
4560          * totalBlocks instead of startBlock.
4561          *
4562          * Note that this will affect all allocations on the volume and
4563          * ideal fix would be just to modify resize-related allocations,
4564          * but it will result in complexity like handling of two free
4565          * extent caches sorted differently, etc.  So we stick to this
4566          * solution for now.
4567          */
4568         hfs_lock_mount (hfsmp);
4569         if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
4570                 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
4571                 ResetVCBFreeExtCache(hfsmp);
4572                 disable_sparse = true;
4573         }
4574
4575         /*
4576          * Update the volume free block count to reflect the total number
4577          * of free blocks that will exist after a successful resize.
4578          * Relocation of extents will result in no net change in the total
4579          * free space on the disk.  Therefore the code that allocates
4580          * space for new extent and deallocates the old extent explicitly
4581          * prevents updating the volume free block count.  It will also
4582          * prevent false disk full error when the number of blocks in
4583          * an extent being relocated is more than the free blocks that
4584          * will exist after the volume is resized.
4585          */
4586         hfsmp->freeBlocks -= reclaimblks;
4587         updateFreeBlocks = true;
4588         hfs_unlock_mount(hfsmp);
4589
4590         if (lockflags) {
4591                 hfs_systemfile_unlock(hfsmp, lockflags);
4592                 lockflags = 0;
4593         }
4594
4595         /*
4596          * Update the metadata zone size to match the new volume size,
4597          * and if it too less, metadata zone might be disabled.
4598          */
4599         hfs_metadatazone_init(hfsmp, false);
4600
4601         /*
4602          * If some files have blocks at or beyond the location of the
4603          * new alternate volume header, recalculate free blocks and
4604          * reclaim blocks.  Otherwise just update free blocks count.
4605          *
4606          * The current allocLimit is set to the location of new alternate
4607          * volume header, and reclaimblks are the total number of blocks
4608          * that need to be reclaimed.  So the check below is really
4609          * ignoring the blocks allocated for old alternate volume header.
4610          */
4611         if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) {
4612                 /*
4613                  * hfs_reclaimspace will use separate transactions when
4614                  * relocating files (so we don't overwhelm the journal).
4615                  */
4616                 hfs_end_transaction(hfsmp);
4617                 transaction_begun = 0;
4618
4619                 /* Attempt to reclaim some space. */
4620                 error = hfs_reclaimspace(hfsmp, hfsmp->allocLimit, reclaimblks, context);
4621                 if (error != 0) {
4622                         printf("hfs_truncatefs: couldn't reclaim space on %s (error=%d)\n", hfsmp->vcbVN, error);
4623                         error = ENOSPC;
4624                         goto out;
4625                 }
4626                 if (hfs_start_transaction(hfsmp) != 0) {
4627                         error = EINVAL;
4628                         goto out;
4629                 }
4630                 transaction_begun = 1;
4631
4632                 /* Check if we're clear now. */
4633                 error = hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks);
4634                 if (error != 0) {
4635                         printf("hfs_truncatefs: didn't reclaim enough space on %s (error=%d)\n", hfsmp->vcbVN, error);
4636                         error = EAGAIN;  /* tell client to try again */
4637                         goto out;
4638                 }
4639         }
4640
4641         /*
4642          * Note: we take the attributes lock in case we have an attribute data vnode
4643          * which needs to change size.
4644          */
4645         lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4646
4647         /*
4648          * Allocate last 1KB for alternate volume header.
4649          */
4650         error = BlockMarkAllocated(hfsmp, hfsmp->allocLimit, (hfsmp->blockSize == 512) ? 2 : 1);
4651         if (error) {
4652                 printf("hfs_truncatefs: Error %d allocating new alternate volume header\n", error);
4653                 goto out;
4654         }
4655
4656         /*
4657          * Mark the old alternate volume header as free.
4658          * We don't bother shrinking allocation bitmap file.
4659          */
4660         if (hfsmp->blockSize == 512)
4661                 (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2);
4662         else
4663                 (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1);
4664
4665         /*
4666          * Invalidate the existing alternate volume header.
4667          *
4668          * Don't include this in a transaction (don't call journal_modify_block)
4669          * since this block will be outside of the truncated file system!
4670          */
4671         if (hfsmp->hfs_alt_id_sector) {
4672                 error = buf_meta_bread(hfsmp->hfs_devvp,
4673                                 HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
4674                                 hfsmp->hfs_physical_block_size, NOCRED, &bp);
4675                 if (error == 0) {
4676                         bzero((void*)((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size)), kMDBSize);
4677                         (void) VNOP_BWRITE(bp);
4678                 } else {
4679                         if (bp) {
4680                                 buf_brelse(bp);
4681                         }
4682                 }
4683                 bp = NULL;
4684         }
4685
4686         /* Log successful shrinking. */
4687         printf("hfs_truncatefs: shrank \"%s\" to %d blocks (was %d blocks)\n",
4688                hfsmp->vcbVN, newblkcnt, hfsmp->totalBlocks);
4689
4690         /*
4691          * Adjust file system variables and flush them to disk.
4692          */
4693         hfsmp->totalBlocks = newblkcnt;
4694         hfsmp->hfs_logical_block_count = newsize / hfsmp->hfs_logical_block_size;
4695         hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
4696
4697         /*
4698          * Note that although the logical block size is updated here, it is only done for
4699          * the benefit of the partition management software.  The logical block count change
4700          * has not yet actually been propagated to the disk device yet.
4701          */
4702
4703         hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count);
4704         MarkVCBDirty(hfsmp);
4705         error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
4706         if (error)
4707                 panic("hfs_truncatefs: unexpected error flushing volume header (%d)\n", error);
4708
4709         /*
4710          * Adjust the size of hfsmp->hfs_attrdata_vp
4711          */
4712         if (hfsmp->hfs_attrdata_vp) {
4713                 struct cnode *cp;
4714                 struct filefork *fp;
4715
4716                 if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) {
4717                         cp = VTOC(hfsmp->hfs_attrdata_vp);
4718                         fp = VTOF(hfsmp->hfs_attrdata_vp);
4719
4720                         cp->c_blocks = newblkcnt;
4721                         fp->ff_blocks = newblkcnt;
4722                         fp->ff_extents[0].blockCount = newblkcnt;
4723                         fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize;
4724                         ubc_setsize(hfsmp->hfs_attrdata_vp, fp->ff_size);
4725                         vnode_put(hfsmp->hfs_attrdata_vp);
4726                 }
4727         }
4728
4729 out:
4730         /*
4731          * Update the allocLimit to acknowledge the last one or two blocks now.
4732          * Add it to the tree as well if necessary.
4733          */
4734         UpdateAllocLimit (hfsmp, hfsmp->totalBlocks);
4735
4736         hfs_lock_mount (hfsmp);
4737         if (disable_sparse == true) {
4738                 /* Now that resize is completed, set the volume to be sparse
4739                  * device again so that all further allocations will be first
4740                  * fit instead of best fit.  Reset free extent cache so that
4741                  * it is rebuilt.
4742                  */
4743                 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
4744                 ResetVCBFreeExtCache(hfsmp);
4745         }
4746
4747         if (error && (updateFreeBlocks == true)) {
4748                 hfsmp->freeBlocks += reclaimblks;
4749         }
4750
4751         if (hfsmp->nextAllocation >= hfsmp->allocLimit) {
4752                 hfsmp->nextAllocation = hfsmp->hfs_metazone_end + 1;
4753         }
4754         hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS;
4755         hfs_unlock_mount (hfsmp);
4756
4757         /* On error, reset the metadata zone for original volume size */
4758         if (error && (updateFreeBlocks == true)) {
4759                 hfs_metadatazone_init(hfsmp, false);
4760         }
4761
4762         if (lockflags) {
4763                 hfs_systemfile_unlock(hfsmp, lockflags);
4764         }
4765         if (transaction_begun) {
4766                 hfs_end_transaction(hfsmp);
4767                 hfs_journal_flush(hfsmp, FALSE);
4768                 /* Just to be sure, sync all data to the disk */
4769                 (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
4770         }
4771
4772         if (error) {
4773                 printf ("hfs_truncatefs: failed error=%d on vol=%s\n", MacToVFSError(error), hfsmp->vcbVN);
4774         }
4775
4776         return MacToVFSError(error);
4777 }
4778
4779
4780 /*
4781  * Invalidate the physical block numbers associated with buffer cache blocks
4782  * in the given extent of the given vnode.
4783  */
4784 struct hfs_inval_blk_no {
4785         daddr64_t sectorStart;
4786         daddr64_t sectorCount;
4787 };
4788 static int
4789 hfs_invalidate_block_numbers_callback(buf_t bp, void *args_in)
4790 {
4791         daddr64_t blkno;
4792         struct hfs_inval_blk_no *args;
4793
4794         blkno = buf_blkno(bp);
4795         args = args_in;
4796
4797         if (blkno >= args->sectorStart && blkno < args->sectorStart+args->sectorCount)
4798                 buf_setblkno(bp, buf_lblkno(bp));
4799
4800         return BUF_RETURNED;
4801 }
4802 static void
4803 hfs_invalidate_sectors(struct vnode *vp, daddr64_t sectorStart, daddr64_t sectorCount)
4804 {
4805         struct hfs_inval_blk_no args;
4806         args.sectorStart = sectorStart;
4807         args.sectorCount = sectorCount;
4808
4809         buf_iterate(vp, hfs_invalidate_block_numbers_callback, BUF_SCAN_DIRTY|BUF_SCAN_CLEAN, &args);
4810 }
4811
4812
4813 /*
4814  * Copy the contents of an extent to a new location.  Also invalidates the
4815  * physical block number of any buffer cache block in the copied extent
4816  * (so that if the block is written, it will go through VNOP_BLOCKMAP to
4817  * determine the new physical block number).
4818  *
4819  * At this point, for regular files, we hold the truncate lock exclusive
4820  * and the cnode lock exclusive.
4821  */
4822 static int
4823 hfs_copy_extent(
4824         struct hfsmount *hfsmp,
4825         struct vnode *vp,               /* The file whose extent is being copied. */
4826         u_int32_t oldStart,             /* The start of the source extent. */
4827         u_int32_t newStart,             /* The start of the destination extent. */
4828         u_int32_t blockCount,   /* The number of allocation blocks to copy. */
4829         vfs_context_t context)
4830 {
4831         int err = 0;
4832         size_t bufferSize;
4833         void *buffer = NULL;
4834         struct vfsioattr ioattr;
4835         buf_t bp = NULL;
4836         off_t resid;
4837         size_t ioSize;
4838         u_int32_t ioSizeSectors;        /* Device sectors in this I/O */
4839         daddr64_t srcSector, destSector;
4840         u_int32_t sectorsPerBlock = hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4841 #if CONFIG_PROTECT
4842         int cpenabled = 0;
4843 #endif
4844
4845         /*
4846          * Sanity check that we have locked the vnode of the file we're copying.
4847          *
4848          * But since hfs_systemfile_lock() doesn't actually take the lock on
4849          * the allocation file if a journal is active, ignore the check if the
4850          * file being copied is the allocation file.
4851          */
4852         struct cnode *cp = VTOC(vp);
4853         if (cp != hfsmp->hfs_allocation_cp && cp->c_lockowner != current_thread())
4854                 panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp);
4855
4856 #if CONFIG_PROTECT
4857         /*
4858          * Prepare the CP blob and get it ready for use, if necessary.
4859          *
4860          * Note that we specifically *exclude* system vnodes (catalog, bitmap, extents, EAs),
4861          * because they are implicitly protected via the media key on iOS.  As such, they
4862          * must not be relocated except with the media key.  So it is OK to not pass down
4863          * a special cpentry to the IOMedia/LwVM code for handling.
4864          */
4865         if (!vnode_issystem (vp) && vnode_isreg(vp) && cp_fs_protected (hfsmp->hfs_mp)) {
4866                 int cp_err = 0;
4867                 /*
4868                  * Ideally, the file whose extents we are about to manipulate is using the
4869                  * newer offset-based IVs so that we can manipulate it regardless of the
4870                  * current lock state.  However, we must maintain support for older-style
4871                  * EAs.
4872                  *
4873                  * For the older EA case, the IV was tied to the device LBA for file content.
4874                  * This means that encrypted data cannot be moved from one location to another
4875                  * in the filesystem without garbling the IV data.  As a result, we need to
4876                  * access the file's plaintext because we cannot do our AES-symmetry trick
4877                  * here.  This requires that we attempt a key-unwrap here (via cp_handle_relocate)
4878                  * to make forward progress.  If the keys are unavailable then we will
4879                  * simply stop the resize in its tracks here since we cannot move
4880                  * this extent at this time.
4881                  */
4882                 if ((cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) == 0) {
4883                         cp_err = cp_handle_relocate(cp, hfsmp);
4884                 }
4885
4886                 if (cp_err) {
4887                         printf ("hfs_copy_extent: cp_handle_relocate failed (%d) \n", cp_err);
4888                         return cp_err;
4889                 }
4890
4891                 cpenabled = 1;
4892         }
4893 #endif
4894
4895
4896         /*
4897          * Determine the I/O size to use
4898          *
4899          * NOTE: Many external drives will result in an ioSize of 128KB.
4900          * TODO: Should we use a larger buffer, doing several consecutive
4901          * reads, then several consecutive writes?
4902          */
4903         vfs_ioattr(hfsmp->hfs_mp, &ioattr);
4904         bufferSize = MIN(ioattr.io_maxreadcnt, ioattr.io_maxwritecnt);
4905         if (kmem_alloc(kernel_map, (vm_offset_t*) &buffer, bufferSize))
4906                 return ENOMEM;
4907
4908         /* Get a buffer for doing the I/O */
4909         bp = buf_alloc(hfsmp->hfs_devvp);
4910         buf_setdataptr(bp, (uintptr_t)buffer);
4911
4912         resid = (off_t) blockCount * (off_t) hfsmp->blockSize;
4913         srcSector = (daddr64_t) oldStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4914         destSector = (daddr64_t) newStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4915         while (resid > 0) {
4916                 ioSize = MIN(bufferSize, (size_t) resid);
4917                 ioSizeSectors = ioSize / hfsmp->hfs_logical_block_size;
4918
4919                 /* Prepare the buffer for reading */
4920                 buf_reset(bp, B_READ);
4921                 buf_setsize(bp, ioSize);
4922                 buf_setcount(bp, ioSize);
4923                 buf_setblkno(bp, srcSector);
4924                 buf_setlblkno(bp, srcSector);
4925
4926                 /*
4927                  * Note that because this is an I/O to the device vp
4928                  * it is correct to have lblkno and blkno both point to the
4929                  * start sector being read from.  If it were being issued against the
4930                  * underlying file then that would be different.
4931                  */
4932
4933                 /* Attach the new CP blob  to the buffer if needed */
4934 #if CONFIG_PROTECT
4935                 if (cpenabled) {
4936                         if (cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
4937                                 /* attach the RELOCATION_INFLIGHT flag for the underlying call to VNOP_STRATEGY */
4938                                 cp->c_cpentry->cp_flags |= CP_RELOCATION_INFLIGHT;
4939                                 buf_setcpaddr(bp, hfsmp->hfs_resize_cpentry);
4940                         }
4941                         else {
4942                                 /*
4943                                  * Use the cnode's cp key.  This file is tied to the
4944                                  * LBAs of the physical blocks that it occupies.
4945                                  */
4946                                 buf_setcpaddr (bp, cp->c_cpentry);
4947                         }
4948
4949                         /* Initialize the content protection file offset to start at 0 */
4950                         buf_setcpoff (bp, 0);
4951                 }
4952 #endif
4953
4954                 /* Do the read */
4955                 err = VNOP_STRATEGY(bp);
4956                 if (!err)
4957                         err = buf_biowait(bp);
4958                 if (err) {
4959 #if CONFIG_PROTECT
4960                         /* Turn the flag off in error cases. */
4961                         if (cpenabled) {
4962                                 cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT;
4963                         }
4964 #endif
4965                         printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (read)\n", err);
4966                         break;
4967                 }
4968
4969                 /* Prepare the buffer for writing */
4970                 buf_reset(bp, B_WRITE);
4971                 buf_setsize(bp, ioSize);
4972                 buf_setcount(bp, ioSize);
4973                 buf_setblkno(bp, destSector);
4974                 buf_setlblkno(bp, destSector);
4975                 if (vnode_issystem(vp) && journal_uses_fua(hfsmp->jnl))
4976                         buf_markfua(bp);
4977
4978 #if CONFIG_PROTECT
4979                 /* Attach the CP to the buffer if needed */
4980                 if (cpenabled) {
4981                         if (cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
4982                                 buf_setcpaddr(bp, hfsmp->hfs_resize_cpentry);
4983                         }
4984                         else {
4985                                 /*
4986                                  * Use the cnode's CP key.  This file is still tied
4987                                  * to the LBAs of the physical blocks that it occupies.
4988                                  */
4989                                 buf_setcpaddr (bp, cp->c_cpentry);
4990                         }
4991                         /*
4992                          * The last STRATEGY call may have updated the cp file offset behind our
4993                          * back, so we cannot trust it.  Re-initialize the content protection
4994                          * file offset back to 0 before initiating the write portion of this I/O.
4995                          */
4996                         buf_setcpoff (bp, 0);
4997                 }
4998 #endif
4999
5000                 /* Do the write */
5001                 vnode_startwrite(hfsmp->hfs_devvp);
5002                 err = VNOP_STRATEGY(bp);
5003                 if (!err) {
5004                         err = buf_biowait(bp);
5005                 }
5006 #if CONFIG_PROTECT
5007                 /* Turn the flag off regardless once the strategy call finishes. */
5008                 if (cpenabled) {
5009                         cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT;
5010                 }
5011 #endif
5012                 if (err) {
5013                         printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (write)\n", err);
5014                         break;
5015                 }
5016
5017                 resid -= ioSize;
5018                 srcSector += ioSizeSectors;
5019                 destSector += ioSizeSectors;
5020         }
5021         if (bp)
5022                 buf_free(bp);
5023         if (buffer)
5024                 kmem_free(kernel_map, (vm_offset_t)buffer, bufferSize);
5025
5026         /* Make sure all writes have been flushed to disk. */
5027         if (vnode_issystem(vp) && !journal_uses_fua(hfsmp->jnl)) {
5028                 err = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
5029                 if (err) {
5030                         printf("hfs_copy_extent: DKIOCSYNCHRONIZECACHE failed (%d)\n", err);
5031                         err = 0;        /* Don't fail the copy. */
5032                 }
5033         }
5034
5035         if (!err)
5036                 hfs_invalidate_sectors(vp, (daddr64_t)oldStart*sectorsPerBlock, (daddr64_t)blockCount*sectorsPerBlock);
5037
5038         return err;
5039 }
5040
5041
5042 /* Structure to store state of reclaiming extents from a
5043  * given file.  hfs_reclaim_file()/hfs_reclaim_xattr()
5044  * initializes the values in this structure which are then
5045  * used by code that reclaims and splits the extents.
5046  */
5047 struct hfs_reclaim_extent_info {
5048         struct vnode *vp;
5049         u_int32_t fileID;
5050         u_int8_t forkType;
5051         u_int8_t is_dirlink;                 /* Extent belongs to directory hard link */
5052         u_int8_t is_sysfile;                 /* Extent belongs to system file */
5053         u_int8_t is_xattr;                   /* Extent belongs to extent-based xattr */
5054         u_int8_t extent_index;
5055         int lockflags;                       /* Locks that reclaim and split code should grab before modifying the extent record */
5056         u_int32_t blocks_relocated;          /* Total blocks relocated for this file till now */
5057         u_int32_t recStartBlock;             /* File allocation block number (FABN) for current extent record */
5058         u_int32_t cur_blockCount;            /* Number of allocation blocks that have been checked for reclaim */
5059         struct filefork *catalog_fp;         /* If non-NULL, extent is from catalog record */
5060         union record {
5061                 HFSPlusExtentRecord overflow;/* Extent record from overflow extents btree */
5062                 HFSPlusAttrRecord xattr;     /* Attribute record for large EAs */
5063         } record;
5064         HFSPlusExtentDescriptor *extents;    /* Pointer to current extent record being processed.
5065                                               * For catalog extent record, points to the correct
5066                                               * extent information in filefork.  For overflow extent
5067                                               * record, or xattr record, points to extent record
5068                                               * in the structure above
5069                                               */
5070         struct cat_desc *dirlink_desc;
5071         struct cat_attr *dirlink_attr;
5072         struct filefork *dirlink_fork;        /* For directory hard links, fp points actually to this */
5073         struct BTreeIterator *iterator;       /* Shared read/write iterator, hfs_reclaim_file/xattr()
5074                                                * use it for reading and hfs_reclaim_extent()/hfs_split_extent()
5075                                                * use it for writing updated extent record
5076                                                */
5077         struct FSBufferDescriptor btdata;     /* Shared btdata for reading/writing extent record, same as iterator above */
5078         u_int16_t recordlen;
5079         int overflow_count;                   /* For debugging, counter for overflow extent record */
5080         FCB *fcb;                             /* Pointer to the current btree being traversed */
5081 };
5082
5083 /*
5084  * Split the current extent into two extents, with first extent
5085  * to contain given number of allocation blocks.  Splitting of
5086  * extent creates one new extent entry which can result in
5087  * shifting of many entries through all the extent records of a
5088  * file, and/or creating a new extent record in the overflow
5089  * extent btree.
5090  *
5091  * Example:
5092  * The diagram below represents two consecutive extent records,
5093  * for simplicity, lets call them record X and X+1 respectively.
5094  * Interesting extent entries have been denoted by letters.
5095  * If the letter is unchanged before and after split, it means
5096  * that the extent entry was not modified during the split.
5097  * A '.' means that the entry remains unchanged after the split
5098  * and is not relevant for our example.  A '0' means that the
5099  * extent entry is empty.
5100  *
5101  * If there isn't sufficient contiguous free space to relocate
5102  * an extent (extent "C" below), we will have to break the one
5103  * extent into multiple smaller extents, and relocate each of
5104  * the smaller extents individually.  The way we do this is by
5105  * finding the largest contiguous free space that is currently
5106  * available (N allocation blocks), and then convert extent "C"
5107  * into two extents, C1 and C2, that occupy exactly the same
5108  * allocation blocks as extent C.  Extent C1 is the first
5109  * N allocation blocks of extent C, and extent C2 is the remainder
5110  * of extent C.  Then we can relocate extent C1 since we know
5111  * we have enough contiguous free space to relocate it in its
5112  * entirety.  We then repeat the process starting with extent C2.
5113  *
5114  * In record X, only the entries following entry C are shifted, and
5115  * the original entry C is replaced with two entries C1 and C2 which
5116  * are actually two extent entries for contiguous allocation blocks.
5117  *
5118  * Note that the entry E from record X is shifted into record X+1 as
5119  * the new first entry.  Since the first entry of record X+1 is updated,
5120  * the FABN will also get updated with the blockCount of entry E.
5121  * This also results in shifting of all extent entries in record X+1.
5122  * Note that the number of empty entries after the split has been
5123  * changed from 3 to 2.
5124  *
5125  * Before:
5126  *               record X                           record X+1
5127  *  ---------------------===---------     ---------------------------------
5128  *  | A | . | . | . | B | C | D | E |     | F | . | . | . | G | 0 | 0 | 0 |
5129  *  ---------------------===---------     ---------------------------------
5130  *
5131  * After:
5132  *  ---------------------=======-----     ---------------------------------
5133  *  | A | . | . | . | B | C1| C2| D |     | E | F | . | . | . | G | 0 | 0 |
5134  *  ---------------------=======-----     ---------------------------------
5135  *
5136  *  C1.startBlock = C.startBlock
5137  *  C1.blockCount = N
5138  *
5139  *  C2.startBlock = C.startBlock + N
5140  *  C2.blockCount = C.blockCount - N
5141  *
5142  *                                        FABN = old FABN - E.blockCount
5143  *
5144  * Inputs:
5145  *      extent_info -   This is the structure that contains state about
5146  *                      the current file, extent, and extent record that
5147  *                      is being relocated.  This structure is shared
5148  *                      among code that traverses through all the extents
5149  *                      of the file, code that relocates extents, and
5150  *                      code that splits the extent.
5151  *      newBlockCount - The blockCount that the extent being split will
5152  *                      have after a successful split operation.
5153  * Output:
5154  *      Zero on success, non-zero on failure.
5155  */
5156 static int
5157 hfs_split_extent(struct hfs_reclaim_extent_info *extent_info, uint32_t newBlockCount)
5158 {
5159         int error = 0;
5160         int index = extent_info->extent_index;
5161         int i;
5162         HFSPlusExtentDescriptor shift_extent; /* Extent entry that should be shifted into next extent record */
5163         HFSPlusExtentDescriptor last_extent;
5164         HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being manipulated */
5165         HFSPlusExtentRecord *extents_rec = NULL;
5166         HFSPlusExtentKey *extents_key = NULL;
5167         HFSPlusAttrRecord *xattr_rec = NULL;
5168         HFSPlusAttrKey *xattr_key = NULL;
5169         struct BTreeIterator iterator;
5170         struct FSBufferDescriptor btdata;
5171         uint16_t reclen;
5172         uint32_t read_recStartBlock;    /* Starting allocation block number to read old extent record */
5173         uint32_t write_recStartBlock;   /* Starting allocation block number to insert newly updated extent record */
5174         Boolean create_record = false;
5175         Boolean is_xattr;
5176         struct cnode *cp;
5177
5178         is_xattr = extent_info->is_xattr;
5179         extents = extent_info->extents;
5180         cp = VTOC(extent_info->vp);
5181
5182         if (newBlockCount == 0) {
5183                 if (hfs_resize_debug) {
5184                         printf ("hfs_split_extent: No splitting required for newBlockCount=0\n");
5185                 }
5186                 return error;
5187         }
5188
5189         if (hfs_resize_debug) {
5190                 printf ("hfs_split_extent: Split record:%u recStartBlock=%u %u:(%u,%u) for %u blocks\n", extent_info->overflow_count, extent_info->recStartBlock, index, extents[index].startBlock, extents[index].blockCount, newBlockCount);
5191         }
5192
5193         /* Extents overflow btree can not have more than 8 extents.
5194          * No split allowed if the 8th extent is already used.
5195          */
5196         if ((extent_info->fileID == kHFSExtentsFileID) && (extents[kHFSPlusExtentDensity - 1].blockCount != 0)) {
5197                 printf ("hfs_split_extent: Maximum 8 extents allowed for extents overflow btree, cannot split further.\n");
5198                 error = ENOSPC;
5199                 goto out;
5200         }
5201
5202         /* Determine the starting allocation block number for the following
5203          * overflow extent record, if any, before the current record
5204          * gets modified.
5205          */
5206         read_recStartBlock = extent_info->recStartBlock;
5207         for (i = 0; i < kHFSPlusExtentDensity; i++) {
5208                 if (extents[i].blockCount == 0) {
5209                         break;
5210                 }
5211                 read_recStartBlock += extents[i].blockCount;
5212         }
5213
5214         /* Shift and split */
5215         if (index == kHFSPlusExtentDensity-1) {
5216                 /* The new extent created after split will go into following overflow extent record */
5217                 shift_extent.startBlock = extents[index].startBlock + newBlockCount;
5218                 shift_extent.blockCount = extents[index].blockCount - newBlockCount;
5219
5220                 /* Last extent in the record will be split, so nothing to shift */
5221         } else {
5222                 /* Splitting of extents can result in at most of one
5223                  * extent entry to be shifted into following overflow extent
5224                  * record.  So, store the last extent entry for later.
5225                  */
5226                 shift_extent = extents[kHFSPlusExtentDensity-1];
5227                 if ((hfs_resize_debug) && (shift_extent.blockCount != 0)) {
5228                         printf ("hfs_split_extent: Save 7:(%u,%u) to shift into overflow record\n", shift_extent.startBlock, shift_extent.blockCount);
5229                 }
5230
5231                 /* Start shifting extent information from the end of the extent
5232                  * record to the index where we want to insert the new extent.
5233                  * Note that kHFSPlusExtentDensity-1 is already saved above, and
5234                  * does not need to be shifted.  The extent entry that is being
5235                  * split does not get shifted.
5236                  */
5237                 for (i = kHFSPlusExtentDensity-2; i > index; i--) {
5238                         if (hfs_resize_debug) {
5239                                 if (extents[i].blockCount) {
5240                                         printf ("hfs_split_extent: Shift %u:(%u,%u) to %u:(%u,%u)\n", i, extents[i].startBlock, extents[i].blockCount, i+1, extents[i].startBlock, extents[i].blockCount);
5241                                 }
5242                         }
5243                         extents[i+1] = extents[i];
5244                 }
5245         }
5246
5247         if (index == kHFSPlusExtentDensity-1) {
5248                 /* The second half of the extent being split will be the overflow
5249                  * entry that will go into following overflow extent record.  The
5250                  * value has been stored in 'shift_extent' above, so there is
5251                  * nothing to be done here.
5252                  */
5253         } else {
5254                 /* Update the values in the second half of the extent being split
5255                  * before updating the first half of the split.  Note that the
5256                  * extent to split or first half of the split is at index 'index'
5257                  * and a new extent or second half of the split will be inserted at
5258                  * 'index+1' or into following overflow extent record.
5259                  */
5260                 extents[index+1].startBlock = extents[index].startBlock + newBlockCount;
5261                 extents[index+1].blockCount = extents[index].blockCount - newBlockCount;
5262         }
5263         /* Update the extent being split, only the block count will change */
5264         extents[index].blockCount = newBlockCount;
5265
5266         if (hfs_resize_debug) {
5267                 printf ("hfs_split_extent: Split %u:(%u,%u) and ", index, extents[index].startBlock, extents[index].blockCount);
5268                 if (index != kHFSPlusExtentDensity-1) {
5269                         printf ("%u:(%u,%u)\n", index+1, extents[index+1].startBlock, extents[index+1].blockCount);
5270                 } else {
5271                         printf ("overflow:(%u,%u)\n", shift_extent.startBlock, shift_extent.blockCount);
5272                 }
5273         }
5274
5275         /* Write out information about the newly split extent to the disk */
5276         if (extent_info->catalog_fp) {
5277                 /* (extent_info->catalog_fp != NULL) means the newly split
5278                  * extent exists in the catalog record.  This means that
5279                  * the cnode was updated.  Therefore, to write out the changes,
5280                  * mark the cnode as modified.   We cannot call hfs_update()
5281                  * in this function because the caller hfs_reclaim_extent()
5282                  * is holding the catalog lock currently.
5283                  */
5284                 cp->c_flag |= C_MODIFIED;
5285         } else {
5286                 /* The newly split extent is for large EAs or is in overflow
5287                  * extent record, so update it directly in the btree using the
5288                  * iterator information from the shared extent_info structure
5289                  */
5290                 error = BTReplaceRecord(extent_info->fcb, extent_info->iterator,
5291                                 &(extent_info->btdata), extent_info->recordlen);
5292                 if (error) {
5293                         printf ("hfs_split_extent: fileID=%u BTReplaceRecord returned error=%d\n", extent_info->fileID, error);
5294                         goto out;
5295                 }
5296         }
5297
5298         /* No extent entry to be shifted into another extent overflow record */
5299         if (shift_extent.blockCount == 0) {
5300                 if (hfs_resize_debug) {
5301                         printf ("hfs_split_extent: No extent entry to be shifted into overflow records\n");
5302                 }
5303                 error = 0;
5304                 goto out;
5305         }
5306
5307         /* The overflow extent entry has to be shifted into an extent
5308          * overflow record.  This means that we might have to shift
5309          * extent entries from all subsequent overflow records by one.
5310          * We start iteration from the first record to the last record,
5311          * and shift the extent entry from one record to another.
5312          * We might have to create a new extent record for the last
5313          * extent entry for the file.
5314          */
5315
5316         /* Initialize iterator to search the next record */
5317         bzero(&iterator, sizeof(iterator));
5318         if (is_xattr) {
5319                 /* Copy the key from the iterator that was used to update the modified attribute record. */
5320                 xattr_key = (HFSPlusAttrKey *)&(iterator.key);
5321                 bcopy((HFSPlusAttrKey *)&(extent_info->iterator->key), xattr_key, sizeof(HFSPlusAttrKey));
5322                 /* Note: xattr_key->startBlock will be initialized later in the iteration loop */
5323
5324                 MALLOC(xattr_rec, HFSPlusAttrRecord *,
5325                                 sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK);
5326                 if (xattr_rec == NULL) {
5327                         error = ENOMEM;
5328                         goto out;
5329                 }
5330                 btdata.bufferAddress = xattr_rec;
5331                 btdata.itemSize = sizeof(HFSPlusAttrRecord);
5332                 btdata.itemCount = 1;
5333                 extents = xattr_rec->overflowExtents.extents;
5334         } else {
5335                 /* Initialize the extent key for the current file */
5336                 extents_key = (HFSPlusExtentKey *) &(iterator.key);
5337                 extents_key->keyLength = kHFSPlusExtentKeyMaximumLength;
5338                 extents_key->forkType = extent_info->forkType;
5339                 extents_key->fileID = extent_info->fileID;
5340                 /* Note: extents_key->startBlock will be initialized later in the iteration loop */
5341
5342                 MALLOC(extents_rec, HFSPlusExtentRecord *,
5343                                 sizeof(HFSPlusExtentRecord), M_TEMP, M_WAITOK);
5344                 if (extents_rec == NULL) {
5345                         error = ENOMEM;
5346                         goto out;
5347                 }
5348                 btdata.bufferAddress = extents_rec;
5349                 btdata.itemSize = sizeof(HFSPlusExtentRecord);
5350                 btdata.itemCount = 1;
5351                 extents = extents_rec[0];
5352         }
5353
5354         /* The overflow extent entry has to be shifted into an extent
5355          * overflow record.  This means that we might have to shift
5356          * extent entries from all subsequent overflow records by one.
5357          * We start iteration from the first record to the last record,
5358          * examine one extent record in each iteration and shift one
5359          * extent entry from one record to another.  We might have to
5360          * create a new extent record for the last extent entry for the
5361          * file.
5362          *
5363          * If shift_extent.blockCount is non-zero, it means that there is
5364          * an extent entry that needs to be shifted into the next
5365          * overflow extent record.  We keep on going till there are no such
5366          * entries left to be shifted.  This will also change the starting
5367          * allocation block number of the extent record which is part of
5368          * the key for the extent record in each iteration.  Note that
5369          * because the extent record key is changing while we are searching,
5370          * the record can not be updated directly, instead it has to be
5371          * deleted and inserted again.
5372          */
5373         while (shift_extent.blockCount) {
5374                 if (hfs_resize_debug) {
5375                         printf ("hfs_split_extent: Will shift (%u,%u) into overflow record with startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, read_recStartBlock);
5376                 }
5377
5378                 /* Search if there is any existing overflow extent record
5379                  * that matches the current file and the logical start block
5380                  * number.
5381                  *
5382                  * For this, the logical start block number in the key is
5383                  * the value calculated based on the logical start block
5384                  * number of the current extent record and the total number
5385                  * of blocks existing in the current extent record.
5386                  */
5387                 if (is_xattr) {
5388                         xattr_key->startBlock = read_recStartBlock;
5389                 } else {
5390                         extents_key->startBlock = read_recStartBlock;
5391                 }
5392                 error = BTSearchRecord(extent_info->fcb, &iterator, &btdata, &reclen, &iterator);
5393                 if (error) {
5394                         if (error != btNotFound) {
5395                                 printf ("hfs_split_extent: fileID=%u startBlock=%u BTSearchRecord error=%d\n", extent_info->fileID, read_recStartBlock, error);
5396                                 goto out;
5397                         }
5398                         /* No matching record was found, so create a new extent record.
5399                          * Note:  Since no record was found, we can't rely on the
5400                          * btree key in the iterator any longer.  This will be initialized
5401                          * later before we insert the record.
5402                          */
5403                         create_record = true;
5404                 }
5405
5406                 /* The extra extent entry from the previous record is being inserted
5407                  * as the first entry in the current extent record.  This will change
5408                  * the file allocation block number (FABN) of the current extent
5409                  * record, which is the startBlock value from the extent record key.
5410                  * Since one extra entry is being inserted in the record, the new
5411                  * FABN for the record will less than old FABN by the number of blocks
5412                  * in the new extent entry being inserted at the start.  We have to
5413                  * do this before we update read_recStartBlock to point at the
5414                  * startBlock of the following record.
5415                  */
5416                 write_recStartBlock = read_recStartBlock - shift_extent.blockCount;
5417                 if (hfs_resize_debug) {
5418                         if (create_record) {
5419                                 printf ("hfs_split_extent: No records found for startBlock=%u, will create new with startBlock=%u\n", read_recStartBlock, write_recStartBlock);
5420                         }
5421                 }
5422
5423                 /* Now update the read_recStartBlock to account for total number
5424                  * of blocks in this extent record.  It will now point to the
5425                  * starting allocation block number for the next extent record.
5426                  */
5427                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
5428                         if (extents[i].blockCount == 0) {
5429                                 break;
5430                         }
5431                         read_recStartBlock += extents[i].blockCount;
5432                 }
5433
5434                 if (create_record == true) {
5435                         /* Initialize new record content with only one extent entry */
5436                         bzero(extents, sizeof(HFSPlusExtentRecord));
5437                         /* The new record will contain only one extent entry */
5438                         extents[0] = shift_extent;
5439                         /* There are no more overflow extents to be shifted */
5440                         shift_extent.startBlock = shift_extent.blockCount = 0;
5441
5442                         if (is_xattr) {
5443                                 /* BTSearchRecord above returned btNotFound,
5444                                  * but since the attribute btree is never empty
5445                                  * if we are trying to insert new overflow
5446                                  * record for the xattrs, the extents_key will
5447                                  * contain correct data.  So we don't need to
5448                                  * re-initialize it again like below.
5449                                  */
5450
5451                                 /* Initialize the new xattr record */
5452                                 xattr_rec->recordType = kHFSPlusAttrExtents;
5453                                 xattr_rec->overflowExtents.reserved = 0;
5454                                 reclen = sizeof(HFSPlusAttrExtents);
5455                         } else {
5456                                 /* BTSearchRecord above returned btNotFound,
5457                                  * which means that extents_key content might
5458                                  * not correspond to the record that we are
5459                                  * trying to create, especially when the extents
5460                                  * overflow btree is empty.  So we reinitialize
5461                                  * the extents_key again always.
5462                                  */
5463                                 extents_key->keyLength = kHFSPlusExtentKeyMaximumLength;
5464                                 extents_key->forkType = extent_info->forkType;
5465                                 extents_key->fileID = extent_info->fileID;
5466
5467                                 /* Initialize the new extent record */
5468                                 reclen = sizeof(HFSPlusExtentRecord);
5469                         }
5470                 } else {
5471                         /* The overflow extent entry from previous record will be
5472                          * the first entry in this extent record.  If the last
5473                          * extent entry in this record is valid, it will be shifted
5474                          * into the following extent record as its first entry.  So
5475                          * save the last entry before shifting entries in current
5476                          * record.
5477                          */
5478                         last_extent = extents[kHFSPlusExtentDensity-1];
5479
5480                         /* Shift all entries by one index towards the end */
5481                         for (i = kHFSPlusExtentDensity-2; i >= 0; i--) {
5482                                 extents[i+1] = extents[i];
5483                         }
5484
5485                         /* Overflow extent entry saved from previous record
5486                          * is now the first entry in the current record.
5487                          */
5488                         extents[0] = shift_extent;
5489
5490                         if (hfs_resize_debug) {
5491                                 printf ("hfs_split_extent: Shift overflow=(%u,%u) to record with updated startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, write_recStartBlock);
5492                         }
5493
5494                         /* The last entry from current record will be the
5495                          * overflow entry which will be the first entry for
5496                          * the following extent record.
5497                          */
5498                         shift_extent = last_extent;
5499
5500                         /* Since the key->startBlock is being changed for this record,
5501                          * it should be deleted and inserted with the new key.
5502                          */
5503                         error = BTDeleteRecord(extent_info->fcb, &iterator);
5504                         if (error) {
5505                                 printf ("hfs_split_extent: fileID=%u startBlock=%u BTDeleteRecord error=%d\n", extent_info->fileID, read_recStartBlock, error);
5506                                 goto out;
5507                         }
5508                         if (hfs_resize_debug) {
5509                                 printf ("hfs_split_extent: Deleted extent record with startBlock=%u\n", (is_xattr ? xattr_key->startBlock : extents_key->startBlock));
5510                         }
5511                 }
5512
5513                 /* Insert the newly created or modified extent record */
5514                 bzero(&iterator.hint, sizeof(iterator.hint));
5515                 if (is_xattr) {
5516                         xattr_key->startBlock = write_recStartBlock;
5517                 } else {
5518                         extents_key->startBlock = write_recStartBlock;
5519                 }
5520                 error = BTInsertRecord(extent_info->fcb, &iterator, &btdata, reclen);
5521                 if (error) {
5522                         printf ("hfs_split_extent: fileID=%u, startBlock=%u BTInsertRecord error=%d\n", extent_info->fileID, write_recStartBlock, error);
5523                         goto out;
5524                 }
5525                 if (hfs_resize_debug) {
5526                         printf ("hfs_split_extent: Inserted extent record with startBlock=%u\n", write_recStartBlock);
5527                 }
5528         }
5529
5530 out:
5531         /*
5532          * Extents overflow btree or attributes btree headers might have
5533          * been modified during the split/shift operation, so flush the
5534          * changes to the disk while we are inside journal transaction.
5535          * We should only be able to generate I/O that modifies the B-Tree
5536          * header nodes while we're in the middle of a journal transaction.
5537          * Otherwise it might result in panic during unmount.
5538          */
5539         BTFlushPath(extent_info->fcb);
5540
5541         if (extents_rec) {
5542                 FREE (extents_rec, M_TEMP);
5543         }
5544         if (xattr_rec) {
5545                 FREE (xattr_rec, M_TEMP);
5546         }
5547         return error;
5548 }
5549
5550
5551 /*
5552  * Relocate an extent if it lies beyond the expected end of volume.
5553  *
5554  * This function is called for every extent of the file being relocated.
5555  * It allocates space for relocation, copies the data, deallocates
5556  * the old extent, and updates the corresponding on-disk extent.  If the
5557  * function does not find contiguous space to relocate an extent, it splits
5558  * the extent into smaller pieces so it can be relocated out of the area of
5559  * disk being reclaimed.  As an optimization, if an extent lies partially
5560  * in the area of the disk being reclaimed, it is split so that we only
5561  * have to relocate the area that was overlapping with the area of disk
5562  * being reclaimed.
5563  *
5564  * Note that every extent is relocated in its own transaction so that
5565  * they do not overwhelm the journal.  This function handles the extent
5566  * record that exists in the catalog record, extent record from overflow
5567  * extents btree, and extents for large EAs.
5568  *
5569  * Inputs:
      *      hfsmp       - The mount whose transaction, system-file locks and
      *                    block allocator are used for the relocation.
      *      allocLimit  - An extent whose last block ends at or before this
      *                    allocation block number is left in place; anything
      *                    reaching past it is split and/or relocated.
5570  *      extent_info - This is the structure that contains state about
5571  *                    the current file, extent, and extent record that
5572  *                    is being relocated.  This structure is shared
5573  *                    among code that traverses through all the extents
5574  *                    of the file, code that relocates extents, and
5575  *                    code that splits the extent.
      *      context     - Passed through unchanged to hfs_copy_extent().
      *
      * Returns:
      *      0 when the extent needed no relocation, when a straddling extent
      *      was successfully split (relocation of the second part is left to
      *      the next call), or when the extent was relocated.  Otherwise the
      *      error from the step that failed (hfs_start_transaction,
      *      BlockAllocate, hfs_split_extent, hfs_copy_extent, the record
      *      update, or BlockDeallocate), or ENOSPC when a btree extent cannot
      *      be relocated in whole-node units.
      *
      * Side effects (all on extent_info unless noted):
      *      - cur_blockCount is advanced by the number of blocks processed
      *        on every successful return.
      *      - blocks_relocated is advanced by the number of blocks moved.
      *      - lockflags is overwritten with the value returned by
      *        hfs_systemfile_lock().
      *      - extents[extent_index].startBlock is rewritten to the new
      *        location once the data has been copied.
5576  */
5577 static int
5578 hfs_reclaim_extent(struct hfsmount *hfsmp, const u_long allocLimit, struct hfs_reclaim_extent_info *extent_info, vfs_context_t context)
5579 {
5580         int error = 0;
5581         int index;
5582         struct cnode *cp;
5583         u_int32_t oldStartBlock;
5584         u_int32_t oldBlockCount;
5585         u_int32_t newStartBlock;
5586         u_int32_t newBlockCount;
5587         u_int32_t roundedBlockCount;
5588         uint16_t node_size;
5589         uint32_t remainder_blocks;
5590         u_int32_t alloc_flags;
             /* True once (newStartBlock, newBlockCount) describe blocks obtained
              * from BlockAllocate(); the error path at 'out' frees them.
              */
5591         int blocks_allocated = false;
5592
5593         index = extent_info->extent_index;
5594         cp = VTOC(extent_info->vp);
5595
5596         oldStartBlock = extent_info->extents[index].startBlock;
5597         oldBlockCount = extent_info->extents[index].blockCount;
5598
             /* This trace is permanently disabled by the constant 0 */
5599         if (0 && hfs_resize_debug) {
5600                 printf ("hfs_reclaim_extent: Examine record:%u recStartBlock=%u, %u:(%u,%u)\n", extent_info->overflow_count, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount);
5601         }
5602
5603         /* If the current extent lies completely within allocLimit,
5604          * it does not require any relocation.  This path returns before
              * any transaction is started or system-file lock is taken.
5605          */
5606         if ((oldStartBlock + oldBlockCount) <= allocLimit) {
5607                 extent_info->cur_blockCount += oldBlockCount;
5608                 return error;
5609         }
5610
5611         /* Every extent should be relocated in its own transaction
5612          * to make sure that we don't overflow the journal buffer.
5613          */
5614         error = hfs_start_transaction(hfsmp);
5615         if (error) {
5616                 return error;
5617         }
             /* From here on every exit goes through 'out', which drops these
              * locks and ends the transaction.
              */
5618         extent_info->lockflags = hfs_systemfile_lock(hfsmp, extent_info->lockflags, HFS_EXCLUSIVE_LOCK);
5619
5620         /* Check if the extent lies partially in the area to reclaim,
5621          * i.e. it starts before allocLimit and ends beyond allocLimit.
5622          * We have already skipped extents that lie completely within
5623          * allocLimit in the check above, so we only check for the
5624          * startBlock.  If it lies partially, split it so that we
5625          * only relocate part of the extent.
5626          */
5627         if (oldStartBlock < allocLimit) {
                     /* Size of the leading part that stays below allocLimit */
5628                 newBlockCount = allocLimit - oldStartBlock;
5629
5630                 if (hfs_resize_debug) {
5631                         int idx = extent_info->extent_index;
5632                         printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount);
5633                 }
5634
5635                 /* If the extent belongs to a btree, check and trim
5636                  * it to be multiple of the node size.
5637                  */
5638                 if (extent_info->is_sysfile) {
5639                         node_size = get_btree_nodesize(extent_info->vp);
5640                         /* If the btree node size is less than the block size,
5641                          * splitting this extent will not split a node across
5642                          * different extents.  So we only check and trim if
5643                          * node size is more than the allocation block size.
5644                          */
5645                         if (node_size > hfsmp->blockSize) {
5646                                 remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize);
5647                                 if (remainder_blocks) {
5648                                         newBlockCount -= remainder_blocks;
5649                                         if (hfs_resize_debug) {
5650                                                 printf ("hfs_reclaim_extent: Round-down newBlockCount to be multiple of nodeSize, node_allocblks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount);
5651                                         }
5652                                 }
5653                         }
5654                         /* The newBlockCount is zero because of rounding-down so that
5655                          * btree nodes are not split across extents.  Therefore this
5656                          * straddling extent across resize-boundary does not require
5657                          * splitting.  Skip over to relocating of complete extent.
5658                          */
5659                         if (newBlockCount == 0) {
5660                                 if (hfs_resize_debug) {
5661                                         printf ("hfs_reclaim_extent: After round-down newBlockCount=0, skip split, relocate full extent\n");
5662                                 }
5663                                 goto relocate_full_extent;
5664                         }
5665                 }
5666
5667                 /* Split the extents into two parts --- the first extent lies
5668                  * completely within allocLimit and therefore does not require
5669                  * relocation.  The second extent will require relocation which
5670                  * will be handled when the caller calls this function again
5671                  * for the next extent.
5672                  */
5673                 error = hfs_split_extent(extent_info, newBlockCount);
5674                 if (error == 0) {
5675                         /* Split success, no relocation required.  The 'out'
                              * path adds newBlockCount (the part kept in place)
                              * to cur_blockCount.
                              */
5676                         goto out;
5677                 }
5678                 /* Split failed, so try to relocate entire extent.  The split
                      * error is not reported; 'error' is overwritten by the
                      * BlockAllocate() call below.
                      */
5679                 if (hfs_resize_debug) {
5680                         int idx = extent_info->extent_index;
5681                         printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks failed, relocate full extent\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount);
5682                 }
5683         }
5684
5685 relocate_full_extent:
5686         /* At this point, the current extent requires relocation.
5687          * We will try to allocate space equal to the size of the extent
5688          * being relocated first to try to relocate it without splitting.
5689          * If the allocation fails, we will try to allocate contiguous
5690          * blocks out of metadata zone.  If that allocation also fails,
5691          * then we will take whatever contiguous block run is returned
5692          * by the allocation, split the extent into two parts, and then
5693          * relocate the first part of the split extent.
5694          */
5695         alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS;
5696         if (extent_info->is_sysfile) {
5697                 alloc_flags |= HFS_ALLOC_METAZONE;
5698         }
5699
5700         error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags,
5701                         &newStartBlock, &newBlockCount);
5702         if ((extent_info->is_sysfile == false) &&
5703             ((error == dskFulErr) || (error == ENOSPC))) {
5704                 /* For non-system files, try reallocating space in metadata zone */
5705                 alloc_flags |= HFS_ALLOC_METAZONE;
5706                 error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
5707                                 alloc_flags, &newStartBlock, &newBlockCount);
5708         }
5709         if ((error == dskFulErr) || (error == ENOSPC)) {
5710                 /* We did not find desired contiguous space for this extent.
5711                  * So don't worry about getting contiguity anymore.  Also, allow using
5712                  * blocks that were recently deallocated.
5713                  */
5714                 alloc_flags &= ~HFS_ALLOC_FORCECONTIG;
5715                 alloc_flags |= HFS_ALLOC_FLUSHTXN;
5716
5717                 error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
5718                                 alloc_flags, &newStartBlock, &newBlockCount);
5719                 if (error) {
5720                         printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5721                         goto out;
5722                 }
5723                 blocks_allocated = true;
5724
5725                 /* The number of blocks allocated is less than the requested
5726                  * number of blocks.  For btree extents, check and trim the
5727                  * extent to be multiple of the node size.
5728                  */
5729                 if (extent_info->is_sysfile) {
5730                         node_size = get_btree_nodesize(extent_info->vp);
5731                         if (node_size > hfsmp->blockSize) {
5732                                 remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize);
5733                                 if (remainder_blocks) {
5734                                         roundedBlockCount = newBlockCount - remainder_blocks;
5735                                         /* Free tail-end blocks of the newly allocated extent.
                                              * The return value of this call is not checked.
                                              */
5736                                         BlockDeallocate(hfsmp, newStartBlock + roundedBlockCount,
5737                                                                newBlockCount - roundedBlockCount,
5738                                                                HFS_ALLOC_SKIPFREEBLKS);
5739                                         newBlockCount = roundedBlockCount;
5740                                         if (hfs_resize_debug) {
5741                                                 printf ("hfs_reclaim_extent: Fixing extent block count, node_blks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount);
5742                                         }
                                             /* The whole allocation was smaller than one
                                              * btree node and has just been freed above;
                                              * newBlockCount is 0, so the cleanup at 'out'
                                              * calls BlockDeallocate() with a zero count.
                                              * NOTE(review): fileID is printed with %d here
                                              * but with %u elsewhere in this function.
                                              */
5743                                         if (newBlockCount == 0) {
5744                                                 printf ("hfs_reclaim_extent: Not enough contiguous blocks available to relocate fileID=%d\n", extent_info->fileID);
5745                                                 error = ENOSPC;
5746                                                 goto out;
5747                                         }
5748                                 }
5749                         }
5750                 }
5751
5752                 /* The number of blocks allocated is less than the number of
5753                  * blocks requested, so split this extent --- the first extent
5754                  * will be relocated as part of this function call and the caller
5755                  * will handle relocating the second extent by calling this
5756                  * function again for the second extent.
5757                  */
5758                 error = hfs_split_extent(extent_info, newBlockCount);
5759                 if (error) {
5760                         printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) split error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5761                         goto out;
5762                 }
                     /* Only the first part of the split extent is relocated now */
5763                 oldBlockCount = newBlockCount;
5764         }
             /* Any remaining error here comes from one of the contiguous
              * BlockAllocate() attempts and is neither dskFulErr nor ENOSPC.
              */
5765         if (error) {
5766                 printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) contig BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5767                 goto out;
5768         }
5769         blocks_allocated = true;
5770
5771         /* Copy data from old location to new location */
5772         error = hfs_copy_extent(hfsmp, extent_info->vp, oldStartBlock,
5773                         newStartBlock, newBlockCount, context);
5774         if (error) {
5775                 printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u)=>(%u,%u) hfs_copy_extent error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount, error);
5776                 goto out;
5777         }
5778
5779         /* Update the extent record with the new start block information */
5780         extent_info->extents[index].startBlock = newStartBlock;
5781
5782         /* Sync the content back to the disk */
5783         if (extent_info->catalog_fp) {
5784                 /* Update the extents in catalog record */
5785                 if (extent_info->is_dirlink) {
5786                         error = cat_update_dirlink(hfsmp, extent_info->forkType,
5787                                         extent_info->dirlink_desc, extent_info->dirlink_attr,
5788                                         &(extent_info->dirlink_fork->ff_data));
5789                 } else {
                             /* For a regular file the catalog record itself is
                              * written by the hfs_update() call near the end of
                              * this function; here the cnode is only marked dirty.
                              */
5790                         cp->c_flag |= C_MODIFIED;
5791                         /* If this is a system file, sync volume headers on disk */
5792                         if (extent_info->is_sysfile) {
5793                                 error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
5794                         }
5795                 }
5796         } else {
5797                 /* Replace record for extents overflow or extents-based xattrs */
5798                 error = BTReplaceRecord(extent_info->fcb, extent_info->iterator,
5799                                 &(extent_info->btdata), extent_info->recordlen);
5800         }
             /* NOTE(review): on this failure extents[index].startBlock still
              * holds newStartBlock, while the 'out' path frees the new blocks.
              * Confirm whether the in-memory start block should be restored to
              * oldStartBlock.  Also, 'error' is an int but is printed with %u.
              */
5801         if (error) {
5802                 printf ("hfs_reclaim_extent: fileID=%u, update record error=%u\n", extent_info->fileID, error);
5803                 goto out;
5804         }
5805
5806         /* Deallocate the old extent */
5807         error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS);
             /* NOTE(review): the record update above has already succeeded, so
              * the extent now references the new blocks; on this failure the
              * 'out' path still frees them because blocks_allocated is true.
              * Confirm that this is intended.
              */
5808         if (error) {
5809                 printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockDeallocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5810                 goto out;
5811         }
5812         extent_info->blocks_relocated += newBlockCount;
5813
5814         if (hfs_resize_debug) {
5815                 printf ("hfs_reclaim_extent: Relocated record:%u %u:(%u,%u) to (%u,%u)\n", extent_info->overflow_count, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
5816         }
5817
5818 out:
             /* Common exit for every path that started a transaction.  On
              * error, give back any blocks obtained from BlockAllocate() (the
              * result of that deallocation is ignored).
              */
5819         if (error != 0) {
5820                 if (blocks_allocated == true) {
5821                         BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
5822                 }
5823         } else {
5824                 /* On success, increment the total allocation blocks processed.
                      * newBlockCount is either the relocated block count or, when
                      * a straddling extent was only split, the size of the part
                      * that stays in place.
                      */
5825                 extent_info->cur_blockCount += newBlockCount;
5826         }
5827
5828         hfs_systemfile_unlock(hfsmp, extent_info->lockflags);
5829
5830         /* For a non-system file, if an extent entry from catalog record
5831          * was modified, sync the in-memory changes to the catalog record
5832          * on disk before ending the transaction.  This runs on the error
              * paths as well, and the result of hfs_update() is ignored.
5833          */
5834          if ((extent_info->catalog_fp) &&
5835              (extent_info->is_sysfile == false)) {
5836                 (void) hfs_update(extent_info->vp, MNT_WAIT);
5837         }
5838
5839         hfs_end_transaction(hfsmp);
5840
5841         return error;
5842 }
5843
5844 /* Log resize progress once it is at least 10 above the last reported value */
5845 static void
5846 hfs_truncatefs_progress(struct hfsmount *hfsmp)
5847 {
5848         u_int32_t percent_done = 0;
5849
5850         hfs_resize_progress(hfsmp, &percent_done);
5851         if (percent_done <= (hfsmp->hfs_resize_progress + 9)) {
5852                 return;         /* Too little change since last report */
5853         }
5854         printf("hfs_truncatefs: %d%% done...\n", percent_done);
5855         hfsmp->hfs_resize_progress = percent_done; /* Remember what was reported */
5856 }
5857
5858 /*
5859  * Reclaim space at the end of a volume for given file and forktype.
5860  *
5861  * This routine attempts to move any extent which contains allocation blocks
5862  * at or after "allocLimit."  A separate transaction is used for every extent
5863  * that needs to be moved.  If there is not contiguous space available for
5864  * moving an extent, it can be split into smaller extents.  The contents of
5865  * any moved extents are read and written via the volume's device vnode --
5866  * NOT via "vp."  During the move, moved blocks which are part of a transaction
5867  * have their physical block numbers invalidated so they will eventually be
5868  * written to their new locations.
5869  *
5870  * This function is also called for directory hard links.  Directory hard links
5871  * are regular files with no data fork and resource fork that contains alias
5872  * information for backward compatibility with pre-Leopard systems.  However
5873  * non-Mac OS X implementations can add/modify data fork or resource fork
5874  * information to directory hard links, so we check, and if required, relocate
5875  * both data fork and resource fork.
5876  *
5877  * Inputs:
5878  *    hfsmp       The volume being resized.
5879  *    vp          The vnode for the system file.
5880  *    fileID      ID of the catalog record that needs to be relocated
5881  *    forktype    The type of fork that needs to be relocated,
5882  *                      kHFSResourceForkType for resource fork,
5883  *                      kHFSDataForkType for data fork
5884  *    allocLimit  Allocation limit for the new volume size,
5885  *                do not use this block or beyond.  All extents
5886  *                that use this block or any blocks beyond this limit
5887  *                will be relocated.
5888  *
5889  * Side Effects:
5890  * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
5891  * blocks that were relocated.
5892  */
5893 static int
5894 hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID,
5895                 u_int8_t forktype, u_long allocLimit, vfs_context_t context)
5896 {
5897         int error = 0;
5898         struct hfs_reclaim_extent_info *extent_info;
5899         int i;
5900         int lockflags = 0;
5901         struct cnode *cp;
5902         struct filefork *fp;
5903         int took_truncate_lock = false;
5904         int release_desc = false;
5905         HFSPlusExtentKey *key;
5906
5907         /* If there is no vnode for this file, then there's nothing to do. */
5908         if (vp == NULL) {
5909                 return 0;
5910         }
5911
5912         cp = VTOC(vp);
5913
5914         if (hfs_resize_debug) {
5915                 const char *filename = (const char *) cp->c_desc.cd_nameptr;
5916                 int namelen = cp->c_desc.cd_namelen;
5917
5918                 if (filename == NULL) {
5919                         filename = "";
5920                         namelen = 0;
5921                 }
5922                 printf("hfs_reclaim_file: reclaiming '%.*s'\n", namelen, filename);
5923         }
5924
5925         MALLOC(extent_info, struct hfs_reclaim_extent_info *,
5926                sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
5927         if (extent_info == NULL) {
5928                 return ENOMEM;
5929         }
5930         bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
5931         extent_info->vp = vp;
5932         extent_info->fileID = fileID;
5933         extent_info->forkType = forktype;
5934         extent_info->is_sysfile = vnode_issystem(vp);
5935         if (vnode_isdir(vp) && (cp->c_flag & C_HARDLINK)) {
5936                 extent_info->is_dirlink = true;
5937         }
5938         /* We always need allocation bitmap and extent btree lock */
5939         lockflags = SFL_BITMAP | SFL_EXTENTS;
5940         if ((fileID == kHFSCatalogFileID) || (extent_info->is_dirlink == true)) {
5941                 lockflags |= SFL_CATALOG;
5942         } else if (fileID == kHFSAttributesFileID) {
5943                 lockflags |= SFL_ATTRIBUTE;
5944         } else if (fileID == kHFSStartupFileID) {
5945                 lockflags |= SFL_STARTUP;
5946         }
5947         extent_info->lockflags = lockflags;
5948         extent_info->fcb = VTOF(hfsmp->hfs_extents_vp);
5949
5950         /* Flush data associated with current file on disk.
5951          *
5952          * If the current vnode is directory hard link, no flushing of
5953          * journal or vnode is required.  The current kernel does not
5954          * modify data/resource fork of directory hard links, so nothing
5955          * will be in the cache.  If a directory hard link is newly created,
5956          * the resource fork data is written directly using devvp and
5957          * the code that actually relocates data (hfs_copy_extent()) also
5958          * uses devvp for its I/O --- so they will see a consistent copy.
5959          */
5960         if (extent_info->is_sysfile) {
5961                 /* If the current vnode is system vnode, flush journal
5962                  * to make sure that all data is written to the disk.
5963                  */
5964                 error = hfs_journal_flush(hfsmp, TRUE);
5965                 if (error) {
5966                         printf ("hfs_reclaim_file: journal_flush returned %d\n", error);
5967                         goto out;
5968                 }
5969         } else if (extent_info->is_dirlink == false) {
5970                 /* Flush all blocks associated with this regular file vnode.
5971                  * Normally there should not be buffer cache blocks for regular
5972                  * files, but for objects like symlinks, we can have buffer cache
5973                  * blocks associated with the vnode.  Therefore we call
5974                  * buf_flushdirtyblks() also.
5975                  */
5976                 buf_flushdirtyblks(vp, 0, BUF_SKIP_LOCKED, "hfs_reclaim_file");
5977
5978                 hfs_unlock(cp);
5979                 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
5980                 took_truncate_lock = true;
5981                 (void) cluster_push(vp, 0);
5982                 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5983                 if (error) {
5984                         goto out;
5985                 }
5986
5987                 /* If the file no longer exists, nothing left to do */
5988                 if (cp->c_flag & C_NOEXISTS) {
5989                         error = 0;
5990                         goto out;
5991                 }
5992
5993                 /* Wait for any in-progress writes to this vnode to complete, so that we'll
5994                  * be copying consistent bits.  (Otherwise, it's possible that an async
5995                  * write will complete to the old extent after we read from it.  That
5996                  * could lead to corruption.)
5997                  */
5998                 error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file");
5999                 if (error) {
6000                         goto out;
6001                 }
6002         }
6003
6004         if (hfs_resize_debug) {
6005                 printf("hfs_reclaim_file: === Start reclaiming %sfork for %sid=%u ===\n", (forktype ? "rsrc" : "data"), (extent_info->is_dirlink ? "dirlink" : "file"), fileID);
6006         }
6007
6008         if (extent_info->is_dirlink) {
6009                 MALLOC(extent_info->dirlink_desc, struct cat_desc *,
6010                                 sizeof(struct cat_desc), M_TEMP, M_WAITOK);
6011                 MALLOC(extent_info->dirlink_attr, struct cat_attr *,
6012                                 sizeof(struct cat_attr), M_TEMP, M_WAITOK);
6013                 MALLOC(extent_info->dirlink_fork, struct filefork *,
6014                                 sizeof(struct filefork), M_TEMP, M_WAITOK);
6015                 if ((extent_info->dirlink_desc == NULL) ||
6016                     (extent_info->dirlink_attr == NULL) ||
6017                     (extent_info->dirlink_fork == NULL)) {
6018                         error = ENOMEM;
6019                         goto out;
6020                 }
6021
6022                 /* Lookup catalog record for directory hard link and
6023                  * create a fake filefork for the value looked up from
6024                  * the disk.
6025                  */
6026                 fp = extent_info->dirlink_fork;
6027                 bzero(extent_info->dirlink_fork, sizeof(struct filefork));
6028                 extent_info->dirlink_fork->ff_cp = cp;
6029                 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
6030                 error = cat_lookup_dirlink(hfsmp, fileID, forktype,
6031                                 extent_info->dirlink_desc, extent_info->dirlink_attr,
6032                                 &(extent_info->dirlink_fork->ff_data));
6033                 hfs_systemfile_unlock(hfsmp, lockflags);
6034                 if (error) {
6035                         printf ("hfs_reclaim_file: cat_lookup_dirlink for fileID=%u returned error=%u\n", fileID, error);
6036                         goto out;
6037                 }
6038                 release_desc = true;
6039         } else {
6040                 fp = VTOF(vp);
6041         }
6042
6043         extent_info->catalog_fp = fp;
6044         extent_info->recStartBlock = 0;
6045         extent_info->extents = extent_info->catalog_fp->ff_extents;
6046         /* Relocate extents from the catalog record */
6047         for (i = 0; i < kHFSPlusExtentDensity; ++i) {
6048                 if (fp->ff_extents[i].blockCount == 0) {
6049                         break;
6050                 }
6051                 extent_info->extent_index = i;
6052                 error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
6053                 if (error) {
6054                         printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, error);
6055                         goto out;
6056                 }
6057         }
6058
6059         /* If the number of allocation blocks processed for reclaiming
6060          * are less than total number of blocks for the file, continuing
6061          * working on overflow extents record.
6062          */
6063         if (fp->ff_blocks <= extent_info->cur_blockCount) {
6064                 if (0 && hfs_resize_debug) {
6065                         printf ("hfs_reclaim_file: Nothing more to relocate, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
6066                 }
6067                 goto out;
6068         }
6069
6070         if (hfs_resize_debug) {
6071                 printf ("hfs_reclaim_file: Will check overflow records, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
6072         }
6073
6074         MALLOC(extent_info->iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
6075         if (extent_info->iterator == NULL) {
6076                 error = ENOMEM;
6077                 goto out;
6078         }
6079         bzero(extent_info->iterator, sizeof(struct BTreeIterator));
6080         key = (HFSPlusExtentKey *) &(extent_info->iterator->key);
6081         key->keyLength = kHFSPlusExtentKeyMaximumLength;
6082         key->forkType = forktype;
6083         key->fileID = fileID;
6084         key->startBlock = extent_info->cur_blockCount;
6085
6086         extent_info->btdata.bufferAddress = extent_info->record.overflow;
6087         extent_info->btdata.itemSize = sizeof(HFSPlusExtentRecord);
6088         extent_info->btdata.itemCount = 1;
6089
6090         extent_info->catalog_fp = NULL;
6091
6092         /* Search the first overflow extent with expected startBlock as 'cur_blockCount' */
6093         lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
6094         error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
6095                         &(extent_info->btdata), &(extent_info->recordlen),
6096                         extent_info->iterator);
6097         hfs_systemfile_unlock(hfsmp, lockflags);
6098         while (error == 0) {
6099                 extent_info->overflow_count++;
6100                 extent_info->recStartBlock = key->startBlock;
6101                 extent_info->extents = extent_info->record.overflow;
6102                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
6103                         if (extent_info->record.overflow[i].blockCount == 0) {
6104                                 goto out;
6105                         }
6106                         extent_info->extent_index = i;
6107                         error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
6108                         if (error) {
6109                                 printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, extent_info->record.overflow[i].startBlock, extent_info->record.overflow[i].blockCount, error);
6110                                 goto out;
6111                         }
6112                 }
6113
6114                 /* Look for more overflow records */
6115                 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
6116                 error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
6117                                 extent_info->iterator, &(extent_info->btdata),
6118                                 &(extent_info->recordlen));
6119                 hfs_systemfile_unlock(hfsmp, lockflags);
6120                 if (error) {
6121                         break;
6122                 }
6123                 /* Stop when we encounter a different file or fork. */
6124                 if ((key->fileID != fileID) || (key->forkType != forktype)) {
6125                         break;
6126                 }
6127         }
6128         if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
6129                 error = 0;
6130         }
6131
6132 out:
6133         /* If any blocks were relocated, account them and report progress */
6134         if (extent_info->blocks_relocated) {
6135                 hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated;
6136                 hfs_truncatefs_progress(hfsmp);
6137                 if (fileID < kHFSFirstUserCatalogNodeID) {
6138                         printf ("hfs_reclaim_file: Relocated %u blocks from fileID=%u on \"%s\"\n",
6139                                         extent_info->blocks_relocated, fileID, hfsmp->vcbVN);
6140                 }
6141         }
6142         if (extent_info->iterator) {
6143                 FREE(extent_info->iterator, M_TEMP);
6144         }
6145         if (release_desc == true) {
6146                 cat_releasedesc(extent_info->dirlink_desc);
6147         }
6148         if (extent_info->dirlink_desc) {
6149                 FREE(extent_info->dirlink_desc, M_TEMP);
6150         }
6151         if (extent_info->dirlink_attr) {
6152                 FREE(extent_info->dirlink_attr, M_TEMP);
6153         }
6154         if (extent_info->dirlink_fork) {
6155                 FREE(extent_info->dirlink_fork, M_TEMP);
6156         }
6157         if ((extent_info->blocks_relocated != 0) && (extent_info->is_sysfile == false)) {
6158                 (void) hfs_update(vp, MNT_WAIT);
6159         }
6160         if (took_truncate_lock) {
6161                 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
6162         }
6163         if (extent_info) {
6164                 FREE(extent_info, M_TEMP);
6165         }
6166         if (hfs_resize_debug) {
6167                 printf("hfs_reclaim_file: === Finished relocating %sfork for fileid=%u (error=%d) ===\n", (forktype ? "rsrc" : "data"), fileID, error);
6168         }
6169
6170         return error;
6171 }
6172
6173
6174 /*
6175  * This journal_relocate callback updates the journal info block to point
6176  * at the new journal location.  This write must NOT be done using the
6177  * transaction.  We must write the block immediately.  We must also force
6178  * it to get to the media so that the new journal location will be seen by
6179  * the replay code before we can safely let journaled blocks be written
6180  * to their normal locations.
6181  *
6182  * The tests for journal_uses_fua below are mildly hacky.  Since the journal
6183  * and the file system are both on the same device, I'm leveraging what
6184  * the journal has decided about FUA.
6185  */
6186 struct hfs_journal_relocate_args {
6187         struct hfsmount *hfsmp;
6188         vfs_context_t context;
6189         u_int32_t newStartBlock;
6190         u_int32_t newBlockCount;
6191 };
6192
6193 static errno_t
6194 hfs_journal_relocate_callback(void *_args)
6195 {
6196         int error;
6197         struct hfs_journal_relocate_args *args = _args;
6198         struct hfsmount *hfsmp = args->hfsmp;
6199         buf_t bp;
6200         JournalInfoBlock *jibp;
6201
6202         error = buf_meta_bread(hfsmp->hfs_devvp,
6203                 hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6204                 hfsmp->blockSize, vfs_context_ucred(args->context), &bp);
6205         if (error) {
6206                 printf("hfs_journal_relocate_callback: failed to read JIB (%d)\n", error);
6207                 if (bp) {
6208                         buf_brelse(bp);
6209                 }
6210                 return error;
6211         }
6212         jibp = (JournalInfoBlock*) buf_dataptr(bp);
6213         jibp->offset = SWAP_BE64((u_int64_t)args->newStartBlock * hfsmp->blockSize);
6214         jibp->size = SWAP_BE64((u_int64_t)args->newBlockCount * hfsmp->blockSize);
6215         if (journal_uses_fua(hfsmp->jnl))
6216                 buf_markfua(bp);
6217         error = buf_bwrite(bp);
6218         if (error) {
6219                 printf("hfs_journal_relocate_callback: failed to write JIB (%d)\n", error);
6220                 return error;
6221         }
6222         if (!journal_uses_fua(hfsmp->jnl)) {
6223                 error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, args->context);
6224                 if (error) {
6225                         printf("hfs_journal_relocate_callback: DKIOCSYNCHRONIZECACHE failed (%d)\n", error);
6226                         error = 0;              /* Don't fail the operation. */
6227                 }
6228         }
6229
6230         return error;
6231 }
6232
6233
6234 /* Type of resize operation in progress */
6235 #define HFS_RESIZE_TRUNCATE     1
6236 #define HFS_RESIZE_EXTEND       2
6237
6238 /*
6239  * Core function to relocate the journal file.  This function takes the
6240  * journal size of the newly relocated journal --- the caller can
6241  * provide a new journal size if they want to change the size of
6242  * the journal.  The function takes care of updating the journal info
6243  * block and all other data structures correctly.
6244  *
6245  * Note: This function starts a transaction and grabs the btree locks.
6246  */
6247 static int
6248 hfs_relocate_journal_file(struct hfsmount *hfsmp, u_int32_t jnl_size, int resize_type, vfs_context_t context)
6249 {
6250         int error;
6251         int journal_err;
6252         int lockflags;
6253         u_int32_t oldStartBlock;
6254         u_int32_t newStartBlock;
6255         u_int32_t oldBlockCount;
6256         u_int32_t newBlockCount;
6257         u_int32_t jnlBlockCount;
6258         u_int32_t alloc_skipfreeblks;
6259         struct cat_desc journal_desc;
6260         struct cat_attr journal_attr;
6261         struct cat_fork journal_fork;
6262         struct hfs_journal_relocate_args callback_args;
6263
6264         /* Calculate the number of allocation blocks required for the journal */
6265         jnlBlockCount = howmany(jnl_size, hfsmp->blockSize);
6266
6267         /*
6268          * During truncatefs(), the volume free block count is updated
6269          * before relocating data and reflects the total number of free
6270          * blocks that will exist on volume after the resize is successful.
6271          * This means that the allocation blocks required for relocation
6272          * have already been reserved and accounted for in the free block
6273          * count.  Therefore, block allocation and deallocation routines
6274          * can skip the free block check by passing HFS_ALLOC_SKIPFREEBLKS
6275          * flag.
6276          *
6277          * This special handling is not required when the file system
6278          * is being extended as we want all the allocated and deallocated
6279          * blocks to be accounted for correctly.
6280          */
6281         if (resize_type == HFS_RESIZE_TRUNCATE) {
6282                 alloc_skipfreeblks = HFS_ALLOC_SKIPFREEBLKS;
6283         } else {
6284                 alloc_skipfreeblks = 0;
6285         }
6286
6287         error = hfs_start_transaction(hfsmp);
6288         if (error) {
6289                 printf("hfs_relocate_journal_file: hfs_start_transaction returned %d\n", error);
6290                 return error;
6291         }
6292         lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
6293
6294         error = BlockAllocate(hfsmp, 1, jnlBlockCount, jnlBlockCount,
6295                         HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_FLUSHTXN | alloc_skipfreeblks,
6296                          &newStartBlock, &newBlockCount);
6297         if (error) {
6298                 printf("hfs_relocate_journal_file: BlockAllocate returned %d\n", error);
6299                 goto fail;
6300         }
6301         if (newBlockCount != jnlBlockCount) {
6302                 printf("hfs_relocate_journal_file: newBlockCount != jnlBlockCount (%u, %u)\n", newBlockCount, jnlBlockCount);
6303                 goto free_fail;
6304         }
6305
6306         error = cat_idlookup(hfsmp, hfsmp->hfs_jnlfileid, 1, 0, &journal_desc, &journal_attr, &journal_fork);
6307         if (error) {
6308                 printf("hfs_relocate_journal_file: cat_idlookup returned %d\n", error);
6309                 goto free_fail;
6310         }
6311
6312         oldStartBlock = journal_fork.cf_extents[0].startBlock;
6313         oldBlockCount = journal_fork.cf_extents[0].blockCount;
6314         error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, alloc_skipfreeblks);
6315         if (error) {
6316                 printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error);
6317                 goto free_fail;
6318         }
6319
6320         /* Update the catalog record for .journal */
6321         journal_fork.cf_size = newBlockCount * hfsmp->blockSize;
6322         journal_fork.cf_extents[0].startBlock = newStartBlock;
6323         journal_fork.cf_extents[0].blockCount = newBlockCount;
6324         journal_fork.cf_blocks = newBlockCount;
6325         error = cat_update(hfsmp, &journal_desc, &journal_attr, &journal_fork, NULL);
6326         cat_releasedesc(&journal_desc);  /* all done with cat descriptor */
6327         if (error) {
6328                 printf("hfs_relocate_journal_file: cat_update returned %d\n", error);
6329                 goto free_fail;
6330         }
6331
6332         /*
6333          * If the journal is part of the file system, then tell the journal
6334          * code about the new location.  If the journal is on an external
6335          * device, then just keep using it as-is.
6336          */
6337         if (hfsmp->jvp == hfsmp->hfs_devvp) {
6338                 callback_args.hfsmp = hfsmp;
6339                 callback_args.context = context;
6340                 callback_args.newStartBlock = newStartBlock;
6341                 callback_args.newBlockCount = newBlockCount;
6342
6343                 error = journal_relocate(hfsmp->jnl, (off_t)newStartBlock*hfsmp->blockSize,
6344                         (off_t)newBlockCount*hfsmp->blockSize, 0,
6345                         hfs_journal_relocate_callback, &callback_args);
6346                 if (error) {
6347                         /* NOTE: journal_relocate will mark the journal invalid. */
6348                         printf("hfs_relocate_journal_file: journal_relocate returned %d\n", error);
6349                         goto fail;
6350                 }
6351                 if (hfs_resize_debug) {
6352                         printf ("hfs_relocate_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
6353                 }
6354                 hfsmp->jnl_start = newStartBlock;
6355                 hfsmp->jnl_size = (off_t)newBlockCount * hfsmp->blockSize;
6356         }
6357
6358         hfs_systemfile_unlock(hfsmp, lockflags);
6359         error = hfs_end_transaction(hfsmp);
6360         if (error) {
6361                 printf("hfs_relocate_journal_file: hfs_end_transaction returned %d\n", error);
6362         }
6363
6364         return error;
6365
6366 free_fail:
6367         journal_err = BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
6368         if (journal_err) {
6369                 printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error);
6370                 hfs_mark_volume_inconsistent(hfsmp);
6371         }
6372 fail:
6373         hfs_systemfile_unlock(hfsmp, lockflags);
6374         (void) hfs_end_transaction(hfsmp);
6375         if (hfs_resize_debug) {
6376                 printf ("hfs_relocate_journal_file: Error relocating journal file (error=%d)\n", error);
6377         }
6378         return error;
6379 }
6380
6381
6382 /*
6383  * Relocate the journal file when the file system is being truncated.
6384  * We do not down-size the journal when the file system size is
6385  * reduced, so we always provide the current journal size to the
6386  * relocate code.
6387  */
6388 static int
6389 hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6390 {
6391         int error = 0;
6392         u_int32_t startBlock;
6393         u_int32_t blockCount = hfsmp->jnl_size / hfsmp->blockSize;
6394
6395         /*
6396          * Figure out the location of the .journal file.  When the journal
6397          * is on an external device, we need to look up the .journal file.
6398          */
6399         if (hfsmp->jvp == hfsmp->hfs_devvp) {
6400                 startBlock = hfsmp->jnl_start;
6401                 blockCount = hfsmp->jnl_size / hfsmp->blockSize;
6402         } else {
6403                 u_int32_t fileid;
6404                 u_int32_t old_jnlfileid;
6405                 struct cat_attr attr;
6406                 struct cat_fork fork;
6407
6408                 /*
6409                  * The cat_lookup inside GetFileInfo will fail because hfs_jnlfileid
6410                  * is set, and it is trying to hide the .journal file.  So temporarily
6411                  * unset the field while calling GetFileInfo.
6412                  */
6413                 old_jnlfileid = hfsmp->hfs_jnlfileid;
6414                 hfsmp->hfs_jnlfileid = 0;
6415                 fileid = GetFileInfo(hfsmp, kHFSRootFolderID, ".journal", &attr, &fork);
6416                 hfsmp->hfs_jnlfileid = old_jnlfileid;
6417                 if (fileid != old_jnlfileid) {
6418                         printf("hfs_reclaim_journal_file: cannot find .journal file!\n");
6419                         return EIO;
6420                 }
6421
6422                 startBlock = fork.cf_extents[0].startBlock;
6423                 blockCount = fork.cf_extents[0].blockCount;
6424         }
6425
6426         if (startBlock + blockCount <= allocLimit) {
6427                 /* The journal file does not require relocation */
6428                 return 0;
6429         }
6430
6431         error = hfs_relocate_journal_file(hfsmp, blockCount * hfsmp->blockSize, HFS_RESIZE_TRUNCATE, context);
6432         if (error == 0) {
6433                 hfsmp->hfs_resize_blocksmoved += blockCount;
6434                 hfs_truncatefs_progress(hfsmp);
6435                 printf ("hfs_reclaim_journal_file: Relocated %u blocks from journal on \"%s\"\n",
6436                                 blockCount, hfsmp->vcbVN);
6437         }
6438
6439         return error;
6440 }
6441
6442
6443 /*
6444  * Move the journal info block to a new location.  We have to make sure the
6445  * new copy of the journal info block gets to the media first, then change
6446  * the field in the volume header and the catalog record.
6447  */
6448 static int
6449 hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6450 {
6451         int error;
6452         int journal_err;
6453         int lockflags;
6454         u_int32_t oldBlock;
6455         u_int32_t newBlock;
6456         u_int32_t blockCount;
6457         struct cat_desc jib_desc;
6458         struct cat_attr jib_attr;
6459         struct cat_fork jib_fork;
6460         buf_t old_bp, new_bp;
6461
6462         if (hfsmp->vcbJinfoBlock <= allocLimit) {
6463                 /* The journal info block does not require relocation */
6464                 return 0;
6465         }
6466
6467         error = hfs_start_transaction(hfsmp);
6468         if (error) {
6469                 printf("hfs_reclaim_journal_info_block: hfs_start_transaction returned %d\n", error);
6470                 return error;
6471         }
6472         lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
6473
6474         error = BlockAllocate(hfsmp, 1, 1, 1,
6475                         HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS | HFS_ALLOC_FLUSHTXN,
6476                         &newBlock, &blockCount);
6477         if (error) {
6478                 printf("hfs_reclaim_journal_info_block: BlockAllocate returned %d\n", error);
6479                 goto fail;
6480         }
6481         if (blockCount != 1) {
6482                 printf("hfs_reclaim_journal_info_block: blockCount != 1 (%u)\n", blockCount);
6483                 goto free_fail;
6484         }
6485
6486         /* Copy the old journal info block content to the new location */
6487         error = buf_meta_bread(hfsmp->hfs_devvp,
6488                 hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6489                 hfsmp->blockSize, vfs_context_ucred(context), &old_bp);
6490         if (error) {
6491                 printf("hfs_reclaim_journal_info_block: failed to read JIB (%d)\n", error);
6492                 if (old_bp) {
6493                         buf_brelse(old_bp);
6494                 }
6495                 goto free_fail;
6496         }
6497         new_bp = buf_getblk(hfsmp->hfs_devvp,
6498                 newBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6499                 hfsmp->blockSize, 0, 0, BLK_META);
6500         bcopy((char*)buf_dataptr(old_bp), (char*)buf_dataptr(new_bp), hfsmp->blockSize);
6501         buf_brelse(old_bp);
6502         if (journal_uses_fua(hfsmp->jnl))
6503                 buf_markfua(new_bp);
6504         error = buf_bwrite(new_bp);
6505         if (error) {
6506                 printf("hfs_reclaim_journal_info_block: failed to write new JIB (%d)\n", error);
6507                 goto free_fail;
6508         }
6509         if (!journal_uses_fua(hfsmp->jnl)) {
6510                 error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
6511                 if (error) {
6512                         printf("hfs_reclaim_journal_info_block: DKIOCSYNCHRONIZECACHE failed (%d)\n", error);
6513                         /* Don't fail the operation. */
6514                 }
6515         }
6516
6517         /* Deallocate the old block once the new one has the new valid content */
6518         error = BlockDeallocate(hfsmp, hfsmp->vcbJinfoBlock, 1, HFS_ALLOC_SKIPFREEBLKS);
6519         if (error) {
6520                 printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error);
6521                 goto free_fail;
6522         }
6523
6524
6525         /* Update the catalog record for .journal_info_block */
6526         error = cat_idlookup(hfsmp, hfsmp->hfs_jnlinfoblkid, 1, 0, &jib_desc, &jib_attr, &jib_fork);
6527         if (error) {
6528                 printf("hfs_reclaim_journal_info_block: cat_idlookup returned %d\n", error);
6529                 goto fail;
6530         }
6531         oldBlock = jib_fork.cf_extents[0].startBlock;
6532         jib_fork.cf_size = hfsmp->blockSize;
6533         jib_fork.cf_extents[0].startBlock = newBlock;
6534         jib_fork.cf_extents[0].blockCount = 1;
6535         jib_fork.cf_blocks = 1;
6536         error = cat_update(hfsmp, &jib_desc, &jib_attr, &jib_fork, NULL);
6537         cat_releasedesc(&jib_desc);  /* all done with cat descriptor */
6538         if (error) {
6539                 printf("hfs_reclaim_journal_info_block: cat_update returned %d\n", error);
6540                 goto fail;
6541         }
6542
6543         /* Update the pointer to the journal info block in the volume header. */
6544         hfsmp->vcbJinfoBlock = newBlock;
6545         error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
6546         if (error) {
6547                 printf("hfs_reclaim_journal_info_block: hfs_flushvolumeheader returned %d\n", error);
6548                 goto fail;
6549         }
6550         hfs_systemfile_unlock(hfsmp, lockflags);
6551         error = hfs_end_transaction(hfsmp);
6552         if (error) {
6553                 printf("hfs_reclaim_journal_info_block: hfs_end_transaction returned %d\n", error);
6554         }
6555         error = hfs_journal_flush(hfsmp, FALSE);
6556         if (error) {
6557                 printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error);
6558         }
6559
6560         /* Account for the block relocated and print progress */
6561         hfsmp->hfs_resize_blocksmoved += 1;
6562         hfs_truncatefs_progress(hfsmp);
6563         if (!error) {
6564                 printf ("hfs_reclaim_journal_info: Relocated 1 block from journal info on \"%s\"\n",
6565                                 hfsmp->vcbVN);
6566                 if (hfs_resize_debug) {
6567                         printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount);
6568                 }
6569         }
6570         return error;
6571
6572 free_fail:
6573         journal_err = BlockDeallocate(hfsmp, newBlock, blockCount, HFS_ALLOC_SKIPFREEBLKS);
6574         if (journal_err) {
6575                 printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error);
6576                 hfs_mark_volume_inconsistent(hfsmp);
6577         }
6578
6579 fail:
6580         hfs_systemfile_unlock(hfsmp, lockflags);
6581         (void) hfs_end_transaction(hfsmp);
6582         if (hfs_resize_debug) {
6583                 printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error);
6584         }
6585         return error;
6586 }
6587
6588
6589 static u_int64_t
6590 calculate_journal_size(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count)
6591 {
6592         u_int64_t journal_size;
6593         u_int32_t journal_scale;
6594
6595 #define DEFAULT_JOURNAL_SIZE (8*1024*1024)
6596 #define MAX_JOURNAL_SIZE     (512*1024*1024)
6597
6598         /* Calculate the journal size for this volume.   We want
6599          * at least 8 MB of journal for each 100 GB of disk space.
6600          * We cap the size at 512 MB, unless the allocation block
6601          * size is larger, in which case, we use one allocation
6602          * block.
6603          */
6604         journal_scale = (sector_size * sector_count) / ((u_int64_t)100 * 1024 * 1024 * 1024);
6605         journal_size = DEFAULT_JOURNAL_SIZE * (journal_scale + 1);
6606         if (journal_size > MAX_JOURNAL_SIZE) {
6607                 journal_size = MAX_JOURNAL_SIZE;
6608         }
6609         if (journal_size < hfsmp->blockSize) {
6610                 journal_size = hfsmp->blockSize;
6611         }
6612         return journal_size;
6613 }
6614
6615
6616 /*
6617  * Calculate the expected journal size based on current partition size.
6618  * If the size of the current journal is less than the calculated size,
6619  * force journal relocation with the new journal size.
6620  */
6621 static int
6622 hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context)
6623 {
6624         int error = 0;
6625         u_int64_t calc_journal_size;
6626
6627         if (hfsmp->jvp != hfsmp->hfs_devvp) {
6628                 if (hfs_resize_debug) {
6629                         printf("hfs_extend_journal: not resizing the journal because it is on an external device.\n");
6630                 }
6631                 return 0;
6632         }
6633
6634         calc_journal_size = calculate_journal_size(hfsmp, sector_size, sector_count);
6635         if (calc_journal_size <= hfsmp->jnl_size) {
6636                 /* The journal size requires no modification */
6637                 goto out;
6638         }
6639
6640         if (hfs_resize_debug) {
6641                 printf ("hfs_extend_journal: journal old=%u, new=%qd\n", hfsmp->jnl_size, calc_journal_size);
6642         }
6643
6644         /* Extend the journal to the new calculated size */
6645         error = hfs_relocate_journal_file(hfsmp, calc_journal_size, HFS_RESIZE_EXTEND, context);
6646         if (error == 0) {
6647                 printf ("hfs_extend_journal: Extended journal size to %u bytes on \"%s\"\n",
6648                                 hfsmp->jnl_size, hfsmp->vcbVN);
6649         }
6650 out:
6651         return error;
6652 }
6653
6654
6655 /*
6656  * This function traverses through all extended attribute records for a given
6657  * fileID, and calls function that reclaims data blocks that exist in the
6658  * area of the disk being reclaimed which in turn is responsible for allocating
6659  * new space, copying extent data, deallocating old space, and if required,
6660  * splitting the extent.
6661  *
6662  * Note: The caller has already acquired the cnode lock on the file.  Therefore
6663  * we are assured that no other thread would be creating/deleting/modifying
6664  * extended attributes for this file.
6665  *
6666  * Side Effects:
6667  * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
6668  * blocks that were relocated.
6669  *
6670  * Returns:
6671  *      0 on success, non-zero on failure.
6672  */
6673 static int
6674 hfs_reclaim_xattr(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, u_int32_t allocLimit, vfs_context_t context)
6675 {
6676         int error = 0;
6677         struct hfs_reclaim_extent_info *extent_info;
6678         int i;
6679         HFSPlusAttrKey *key;
6680         int *lockflags;
6681
6682         if (hfs_resize_debug) {
6683                 printf("hfs_reclaim_xattr: === Start reclaiming xattr for id=%u ===\n", fileID);
6684         }
6685
6686         MALLOC(extent_info, struct hfs_reclaim_extent_info *,
6687                sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
6688         if (extent_info == NULL) {
6689                 return ENOMEM;
6690         }
6691         bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
6692         extent_info->vp = vp;
6693         extent_info->fileID = fileID;
6694         extent_info->is_xattr = true;
6695         extent_info->is_sysfile = vnode_issystem(vp);
6696         extent_info->fcb = VTOF(hfsmp->hfs_attribute_vp);
6697         lockflags = &(extent_info->lockflags);
6698         *lockflags = SFL_ATTRIBUTE | SFL_BITMAP;
6699
6700         /* Initialize iterator from the extent_info structure */
6701         MALLOC(extent_info->iterator, struct BTreeIterator *,
6702                sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
6703         if (extent_info->iterator == NULL) {
6704                 error = ENOMEM;
6705                 goto out;
6706         }
6707         bzero(extent_info->iterator, sizeof(struct BTreeIterator));
6708
6709         /* Build attribute key */
6710         key = (HFSPlusAttrKey *)&(extent_info->iterator->key);
6711         error = hfs_buildattrkey(fileID, NULL, key);
6712         if (error) {
6713                 goto out;
6714         }
6715
6716         /* Initialize btdata from extent_info structure.  Note that the
6717          * buffer pointer actually points to the xattr record from the
6718          * extent_info structure itself.
6719          */
6720         extent_info->btdata.bufferAddress = &(extent_info->record.xattr);
6721         extent_info->btdata.itemSize = sizeof(HFSPlusAttrRecord);
6722         extent_info->btdata.itemCount = 1;
6723
6724         /*
6725          * Sync all extent-based attribute data to the disk.
6726          *
6727          * All extent-based attribute data I/O is performed via cluster
6728          * I/O using a virtual file that spans across entire file system
6729          * space.
6730          */
6731         hfs_lock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
6732         (void)cluster_push(hfsmp->hfs_attrdata_vp, 0);
6733         error = vnode_waitforwrites(hfsmp->hfs_attrdata_vp, 0, 0, 0, "hfs_reclaim_xattr");
6734         hfs_unlock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_LOCK_DEFAULT);
6735         if (error) {
6736                 goto out;
6737         }
6738
6739         /* Search for extended attribute for current file.  This
6740          * will place the iterator before the first matching record.
6741          */
6742         *lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
6743         error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
6744                         &(extent_info->btdata), &(extent_info->recordlen),
6745                         extent_info->iterator);
6746         hfs_systemfile_unlock(hfsmp, *lockflags);
6747         if (error) {
6748                 if (error != btNotFound) {
6749                         goto out;
6750                 }
6751                 /* btNotFound is expected here, so just mask it */
6752                 error = 0;
6753         }
6754
6755         while (1) {
6756                 /* Iterate to the next record */
6757                 *lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
6758                 error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
6759                                 extent_info->iterator, &(extent_info->btdata),
6760                                 &(extent_info->recordlen));
6761                 hfs_systemfile_unlock(hfsmp, *lockflags);
6762
6763                 /* Stop the iteration if we encounter end of btree or xattr with different fileID */
6764                 if (error || key->fileID != fileID) {
6765                         if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
6766                                 error = 0;
6767                         }
6768                         break;
6769                 }
6770
6771                 /* We only care about extent-based EAs */
6772                 if ((extent_info->record.xattr.recordType != kHFSPlusAttrForkData) &&
6773                     (extent_info->record.xattr.recordType != kHFSPlusAttrExtents)) {
6774                         continue;
6775                 }
6776
6777                 if (extent_info->record.xattr.recordType == kHFSPlusAttrForkData) {
6778                         extent_info->overflow_count = 0;
6779                         extent_info->extents = extent_info->record.xattr.forkData.theFork.extents;
6780                 } else if (extent_info->record.xattr.recordType == kHFSPlusAttrExtents) {
6781                         extent_info->overflow_count++;
6782                         extent_info->extents = extent_info->record.xattr.overflowExtents.extents;
6783                 }
6784
6785                 extent_info->recStartBlock = key->startBlock;
6786                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
6787                         if (extent_info->extents[i].blockCount == 0) {
6788                                 break;
6789                         }
6790                         extent_info->extent_index = i;
6791                         error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
6792                         if (error) {
6793                                 printf ("hfs_reclaim_xattr: fileID=%u hfs_reclaim_extent error=%d\n", fileID, error);
6794                                 goto out;
6795                         }
6796                 }
6797         }
6798
6799 out:
6800         /* If any blocks were relocated, account them and report progress */
6801         if (extent_info->blocks_relocated) {
6802                 hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated;
6803                 hfs_truncatefs_progress(hfsmp);
6804         }
6805         if (extent_info->iterator) {
6806                 FREE(extent_info->iterator, M_TEMP);
6807         }
6808         if (extent_info) {
6809                 FREE(extent_info, M_TEMP);
6810         }
6811         if (hfs_resize_debug) {
6812                 printf("hfs_reclaim_xattr: === Finished relocating xattr for fileid=%u (error=%d) ===\n", fileID, error);
6813         }
6814         return error;
6815 }
6816
6817 /*
6818  * Reclaim any extent-based extended attributes allocation blocks from
6819  * the area of the disk that is being truncated.
6820  *
6821  * The function traverses the attribute btree to find out the fileIDs
6822  * of the extended attributes that need to be relocated.  For every
6823  * file whose large EA requires relocation, it looks up the cnode and
6824  * calls hfs_reclaim_xattr() to do all the work for allocating
6825  * new space, copying data, deallocating old space, and if required,
6826  * splitting the extents.
6827  *
6828  * Inputs:
6829  *      allocLimit    - starting block of the area being reclaimed
6830  *
6831  * Returns:
6832  *      returns 0 on success, non-zero on failure.
6833  */
6834 static int
6835 hfs_reclaim_xattrspace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6836 {
6837         int error = 0;
6838         FCB *fcb;
6839         struct BTreeIterator *iterator = NULL;
6840         struct FSBufferDescriptor btdata;
6841         HFSPlusAttrKey *key;
6842         HFSPlusAttrRecord rec;
6843         int lockflags = 0;
6844         cnid_t prev_fileid = 0;
6845         struct vnode *vp;
6846         int need_relocate;
6847         int btree_operation;
6848         u_int32_t files_moved = 0;
6849         u_int32_t prev_blocksmoved;
6850         int i;
6851
6852         fcb = VTOF(hfsmp->hfs_attribute_vp);
6853         /* Store the value to print total blocks moved by this function in end */
6854         prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;
6855
6856         if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
6857                 return ENOMEM;
6858         }
6859         bzero(iterator, sizeof(*iterator));
6860         key = (HFSPlusAttrKey *)&iterator->key;
6861         btdata.bufferAddress = &rec;
6862         btdata.itemSize = sizeof(rec);
6863         btdata.itemCount = 1;
6864
6865         need_relocate = false;
6866         btree_operation = kBTreeFirstRecord;
6867         /* Traverse the attribute btree to find extent-based EAs to reclaim */
6868         while (1) {
6869                 lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK);
6870                 error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
6871                 hfs_systemfile_unlock(hfsmp, lockflags);
6872                 if (error) {
6873                         if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
6874                                 error = 0;
6875                         }
6876                         break;
6877                 }
6878                 btree_operation = kBTreeNextRecord;
6879
6880                 /* If the extents of current fileID were already relocated, skip it */
6881                 if (prev_fileid == key->fileID) {
6882                         continue;
6883                 }
6884
6885                 /* Check if any of the extents in the current record need to be relocated */
6886                 need_relocate = false;
6887                 switch(rec.recordType) {
6888                         case kHFSPlusAttrForkData:
6889                                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
6890                                         if (rec.forkData.theFork.extents[i].blockCount == 0) {
6891                                                 break;
6892                                         }
6893                                         if ((rec.forkData.theFork.extents[i].startBlock +
6894                                              rec.forkData.theFork.extents[i].blockCount) > allocLimit) {
6895                                                 need_relocate = true;
6896                                                 break;
6897                                         }
6898                                 }
6899                                 break;
6900
6901                         case kHFSPlusAttrExtents:
6902                                 for (i = 0; i < kHFSPlusExtentDensity; i++) {
6903                                         if (rec.overflowExtents.extents[i].blockCount == 0) {
6904                                                 break;
6905                                         }
6906                                         if ((rec.overflowExtents.extents[i].startBlock +
6907                                              rec.overflowExtents.extents[i].blockCount) > allocLimit) {
6908                                                 need_relocate = true;
6909                                                 break;
6910                                         }
6911                                 }
6912                                 break;
6913                 };
6914
6915                 /* Continue iterating to next attribute record */
6916                 if (need_relocate == false) {
6917                         continue;
6918                 }
6919
6920                 /* Look up the vnode for corresponding file.  The cnode
6921                  * will be locked which will ensure that no one modifies
6922                  * the xattrs when we are relocating them.
6923                  *
6924                  * We want to allow open-unlinked files to be moved,
6925                  * so provide allow_deleted == 1 for hfs_vget().
6926                  */
6927                 if (hfs_vget(hfsmp, key->fileID, &vp, 0, 1) != 0) {
6928                         continue;
6929                 }
6930
6931                 error = hfs_reclaim_xattr(hfsmp, vp, key->fileID, allocLimit, context);
6932                 hfs_unlock(VTOC(vp));
6933                 vnode_put(vp);
6934                 if (error) {
6935                         printf ("hfs_reclaim_xattrspace: Error relocating xattrs for fileid=%u (error=%d)\n", key->fileID, error);
6936                         break;
6937                 }
6938                 prev_fileid = key->fileID;
6939                 files_moved++;
6940         }
6941
6942         if (files_moved) {
6943                 printf("hfs_reclaim_xattrspace: Relocated %u xattr blocks from %u files on \"%s\"\n",
6944                                 (hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
6945                                 files_moved, hfsmp->vcbVN);
6946         }
6947
6948         kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
6949         return error;
6950 }
6951
6952 /*
6953  * Reclaim blocks from regular files.
6954  *
6955  * This function iterates over all the record in catalog btree looking
6956  * for files with extents that overlap into the space we're trying to
6957  * free up.  If a file extent requires relocation, it looks up the vnode
6958  * and calls function to relocate the data.
6959  *
6960  * Returns:
6961  *      Zero on success, non-zero on failure.
6962  */
6963 static int
6964 hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6965 {
6966         int error;
6967         FCB *fcb;
6968         struct BTreeIterator *iterator = NULL;
6969         struct FSBufferDescriptor btdata;
6970         int btree_operation;
6971         int lockflags;
6972         struct HFSPlusCatalogFile filerec;
6973         struct vnode *vp;
6974         struct vnode *rvp;
6975         struct filefork *datafork;
6976         u_int32_t files_moved = 0;
6977         u_int32_t prev_blocksmoved;
6978
6979 #if CONFIG_PROTECT
6980         int keys_generated = 0;
6981 #endif
6982
6983         fcb = VTOF(hfsmp->hfs_catalog_vp);
6984         /* Store the value to print total blocks moved by this function at the end */
6985         prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;
6986
6987         if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
6988                 error = ENOMEM;
6989                 goto reclaim_filespace_done;
6990         }
6991
6992 #if CONFIG_PROTECT
6993         /*
6994          * For content-protected filesystems, we may need to relocate files that
6995          * are encrypted.  If they use the new-style offset-based IVs, then
6996          * we can move them regardless of the lock state.  We create a temporary
6997          * key here that we use to read/write the data, then we discard it at the
6998          * end of the function.
6999          */
7000         if (cp_fs_protected (hfsmp->hfs_mp)) {
7001                 int needs = 0;
7002                 error = cp_needs_tempkeys(hfsmp, &needs);
7003
7004                 if ((error == 0) && (needs)) {
7005                         error = cp_entry_gentempkeys(&hfsmp->hfs_resize_cpentry, hfsmp);
7006                         if (error == 0) {
7007                                 keys_generated = 1;
7008                         }
7009                 }
7010
7011                 if (error) {
7012                         printf("hfs_reclaimspace: Error generating temporary keys for resize (%d)\n", error);
7013                         goto reclaim_filespace_done;
7014                 }
7015         }
7016
7017 #endif
7018
7019         bzero(iterator, sizeof(*iterator));
7020
7021         btdata.bufferAddress = &filerec;
7022         btdata.itemSize = sizeof(filerec);
7023         btdata.itemCount = 1;
7024
7025         btree_operation = kBTreeFirstRecord;
7026         while (1) {
7027                 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
7028                 error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
7029                 hfs_systemfile_unlock(hfsmp, lockflags);
7030                 if (error) {
7031                         if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
7032                                 error = 0;
7033                         }
7034                         break;
7035                 }
7036                 btree_operation = kBTreeNextRecord;
7037
7038                 if (filerec.recordType != kHFSPlusFileRecord) {
7039                         continue;
7040                 }
7041
7042                 /* Check if any of the extents require relocation */
7043                 if (hfs_file_extent_overlaps(hfsmp, allocLimit, &filerec) == false) {
7044                         continue;
7045                 }
7046
7047                 /* We want to allow open-unlinked files to be moved, so allow_deleted == 1 */
7048                 if (hfs_vget(hfsmp, filerec.fileID, &vp, 0, 1) != 0) {
7049                         if (hfs_resize_debug) {
7050                                 printf("hfs_reclaim_filespace: hfs_vget(%u) failed.\n", filerec.fileID);
7051                         }
7052                         continue;
7053                 }
7054
7055                 /* If data fork exists or item is a directory hard link, relocate blocks */
7056                 datafork = VTOF(vp);
7057                 if ((datafork && datafork->ff_blocks > 0) || vnode_isdir(vp)) {
7058                         error = hfs_reclaim_file(hfsmp, vp, filerec.fileID,
7059                                         kHFSDataForkType, allocLimit, context);
7060                         if (error)  {
7061                                 printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
7062                                 hfs_unlock(VTOC(vp));
7063                                 vnode_put(vp);
7064                                 break;
7065                         }
7066                 }
7067
7068                 /* If resource fork exists or item is a directory hard link, relocate blocks */
7069                 if (((VTOC(vp)->c_blocks - (datafork ? datafork->ff_blocks : 0)) > 0) || vnode_isdir(vp)) {
7070                         if (vnode_isdir(vp)) {
7071                                 /* Resource fork vnode lookup is invalid for directory hard link.
7072                                  * So we fake data fork vnode as resource fork vnode.
7073                                  */
7074                                 rvp = vp;
7075                         } else {
7076                                 error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE);
7077                                 if (error) {
7078                                         printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", filerec.fileID, error);
7079                                         hfs_unlock(VTOC(vp));
7080                                         vnode_put(vp);
7081                                         break;
7082                                 }
7083                                 VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT;
7084                         }
7085
7086                         error = hfs_reclaim_file(hfsmp, rvp, filerec.fileID,
7087                                         kHFSResourceForkType, allocLimit, context);
7088                         if (error) {
7089                                 printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
7090                                 hfs_unlock(VTOC(vp));
7091                                 vnode_put(vp);
7092                                 break;
7093                         }
7094                 }
7095
7096                 /* The file forks were relocated successfully, now drop the
7097                  * cnode lock and vnode reference, and continue iterating to
7098                  * next catalog record.
7099                  */
7100                 hfs_unlock(VTOC(vp));
7101                 vnode_put(vp);
7102                 files_moved++;
7103         }
7104
7105         if (files_moved) {
7106                 printf("hfs_reclaim_filespace: Relocated %u blocks from %u files on \"%s\"\n",
7107                                 (hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
7108                                 files_moved, hfsmp->vcbVN);
7109         }
7110
7111 reclaim_filespace_done:
7112         if (iterator) {
7113                 kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
7114         }
7115
7116 #if CONFIG_PROTECT
7117         if (keys_generated) {
7118                 cp_entry_destroy(hfsmp->hfs_resize_cpentry);
7119                 hfsmp->hfs_resize_cpentry = NULL;
7120         }
7121 #endif
7122         return error;
7123 }
7124
7125 /*
7126  * Reclaim space at the end of a file system.
7127  *
7128  * Inputs -
7129  *      allocLimit      - start block of the space being reclaimed
7130  *      reclaimblks     - number of allocation blocks to reclaim
7131  */
7132 static int
7133 hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context)
7134 {
7135         int error = 0;
7136
7137         /*
7138          * Preflight the bitmap to find out total number of blocks that need
7139          * relocation.
7140          *
7141          * Note: Since allocLimit is set to the location of new alternate volume
7142          * header, the check below does not account for blocks allocated for old
7143          * alternate volume header.
7144          */
7145         error = hfs_count_allocated(hfsmp, allocLimit, reclaimblks, &(hfsmp->hfs_resize_totalblocks));
7146         if (error) {
7147                 printf ("hfs_reclaimspace: Unable to determine total blocks to reclaim error=%d\n", error);
7148                 return error;
7149         }
7150         if (hfs_resize_debug) {
7151                 printf ("hfs_reclaimspace: Total number of blocks to reclaim = %u\n", hfsmp->hfs_resize_totalblocks);
7152         }
7153
7154         /* Just to be safe, sync the content of the journal to the disk before we proceed */
7155         hfs_journal_flush(hfsmp, TRUE);
7156
7157         /* First, relocate journal file blocks if they're in the way.
7158          * Doing this first will make sure that journal relocate code
7159          * gets access to contiguous blocks on disk first.  The journal
7160          * file has to be contiguous on the disk, otherwise resize will
7161          * fail.
7162          */
7163         error = hfs_reclaim_journal_file(hfsmp, allocLimit, context);
7164         if (error) {
7165                 printf("hfs_reclaimspace: hfs_reclaim_journal_file failed (%d)\n", error);
7166                 return error;
7167         }
7168
7169         /* Relocate journal info block blocks if they're in the way. */
7170         error = hfs_reclaim_journal_info_block(hfsmp, allocLimit, context);
7171         if (error) {
7172                 printf("hfs_reclaimspace: hfs_reclaim_journal_info_block failed (%d)\n", error);
7173                 return error;
7174         }
7175
7176         /* Relocate extents of the Extents B-tree if they're in the way.
7177          * Relocating extents btree before other btrees is important as
7178          * this will provide access to largest contiguous block range on
7179          * the disk for relocating extents btree.  Note that extents btree
7180          * can only have maximum of 8 extents.
7181          */
7182         error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, kHFSExtentsFileID,
7183                         kHFSDataForkType, allocLimit, context);
7184         if (error) {
7185                 printf("hfs_reclaimspace: reclaim extents b-tree returned %d\n", error);
7186                 return error;
7187         }
7188
7189         /* Relocate extents of the Allocation file if they're in the way. */
7190         error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, kHFSAllocationFileID,
7191                         kHFSDataForkType, allocLimit, context);
7192         if (error) {
7193                 printf("hfs_reclaimspace: reclaim allocation file returned %d\n", error);
7194                 return error;
7195         }
7196
7197         /* Relocate extents of the Catalog B-tree if they're in the way. */
7198         error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, kHFSCatalogFileID,
7199                         kHFSDataForkType, allocLimit, context);
7200         if (error) {
7201                 printf("hfs_reclaimspace: reclaim catalog b-tree returned %d\n", error);
7202                 return error;
7203         }
7204
7205         /* Relocate extents of the Attributes B-tree if they're in the way. */
7206         error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, kHFSAttributesFileID,
7207                         kHFSDataForkType, allocLimit, context);
7208         if (error) {
7209                 printf("hfs_reclaimspace: reclaim attribute b-tree returned %d\n", error);
7210                 return error;
7211         }
7212
7213         /* Relocate extents of the Startup File if there is one and they're in the way. */
7214         error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, kHFSStartupFileID,
7215                         kHFSDataForkType, allocLimit, context);
7216         if (error) {
7217                 printf("hfs_reclaimspace: reclaim startup file returned %d\n", error);
7218                 return error;
7219         }
7220
7221         /*
7222          * We need to make sure the alternate volume header gets flushed if we moved
7223          * any extents in the volume header.  But we need to do that before
7224          * shrinking the size of the volume, or else the journal code will panic
7225          * with an invalid (too large) block number.
7226          *
7227          * Note that blks_moved will be set if ANY extent was moved, even
7228          * if it was just an overflow extent.  In this case, the journal_flush isn't
7229          * strictly required, but shouldn't hurt.
7230          */
7231         if (hfsmp->hfs_resize_blocksmoved) {
7232                 hfs_journal_flush(hfsmp, TRUE);
7233         }
7234
7235         /* Reclaim extents from catalog file records */
7236         error = hfs_reclaim_filespace(hfsmp, allocLimit, context);
7237         if (error) {
7238                 printf ("hfs_reclaimspace: hfs_reclaim_filespace returned error=%d\n", error);
7239                 return error;
7240         }
7241
7242         /* Reclaim extents from extent-based extended attributes, if any */
7243         error = hfs_reclaim_xattrspace(hfsmp, allocLimit, context);
7244         if (error) {
7245                 printf ("hfs_reclaimspace: hfs_reclaim_xattrspace returned error=%d\n", error);
7246                 return error;
7247         }
7248
7249         return error;
7250 }
7251
7252
7253 /*
7254  * Check if there are any extents (including overflow extents) that overlap
7255  * into the disk space that is being reclaimed.
7256  *
7257  * Output -
7258  *      true  - One of the extents need to be relocated
7259  *      false - No overflow extents need to be relocated, or there was an error
7260  */
7261 static int
7262 hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec)
7263 {
7264         struct BTreeIterator * iterator = NULL;
7265         struct FSBufferDescriptor btdata;
7266         HFSPlusExtentRecord extrec;
7267         HFSPlusExtentKey *extkeyptr;
7268         FCB *fcb;
7269         int overlapped = false;
7270         int i, j;
7271         int error;
7272         int lockflags = 0;
7273         u_int32_t endblock;
7274
7275         /* Check if data fork overlaps the target space */
7276         for (i = 0; i < kHFSPlusExtentDensity; ++i) {
7277                 if (filerec->dataFork.extents[i].blockCount == 0) {
7278                         break;
7279                 }
7280                 endblock = filerec->dataFork.extents[i].startBlock +
7281                         filerec->dataFork.extents[i].blockCount;
7282                 if (endblock > allocLimit) {
7283                         overlapped = true;
7284                         goto out;
7285                 }
7286         }
7287
7288         /* Check if resource fork overlaps the target space */
7289         for (j = 0; j < kHFSPlusExtentDensity; ++j) {
7290                 if (filerec->resourceFork.extents[j].blockCount == 0) {
7291                         break;
7292                 }
7293                 endblock = filerec->resourceFork.extents[j].startBlock +
7294                         filerec->resourceFork.extents[j].blockCount;
7295                 if (endblock > allocLimit) {
7296                         overlapped = true;
7297                         goto out;
7298                 }
7299         }
7300
7301         /* Return back if there are no overflow extents for this file */
7302         if ((i < kHFSPlusExtentDensity) && (j < kHFSPlusExtentDensity)) {
7303                 goto out;
7304         }
7305
7306         if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
7307                 return 0;
7308         }
7309         bzero(iterator, sizeof(*iterator));
7310         extkeyptr = (HFSPlusExtentKey *)&iterator->key;
7311         extkeyptr->keyLength = kHFSPlusExtentKeyMaximumLength;
7312         extkeyptr->forkType = 0;
7313         extkeyptr->fileID = filerec->fileID;
7314         extkeyptr->startBlock = 0;
7315
7316         btdata.bufferAddress = &extrec;
7317         btdata.itemSize = sizeof(extrec);
7318         btdata.itemCount = 1;
7319
7320         fcb = VTOF(hfsmp->hfs_extents_vp);
7321
7322         lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
7323
7324         /* This will position the iterator just before the first overflow
7325          * extent record for given fileID.  It will always return btNotFound,
7326          * so we special case the error code.
7327          */
7328         error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator);
7329         if (error && (error != btNotFound)) {
7330                 goto out;
7331         }
7332
7333         /* BTIterateRecord() might return error if the btree is empty, and
7334          * therefore we return that the extent does not overflow to the caller
7335          */
7336         error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
7337         while (error == 0) {
7338                 /* Stop when we encounter a different file. */
7339                 if (extkeyptr->fileID != filerec->fileID) {
7340                         break;
7341                 }
7342                 /* Check if any of the forks exist in the target space. */
7343                 for (i = 0; i < kHFSPlusExtentDensity; ++i) {
7344                         if (extrec[i].blockCount == 0) {
7345                                 break;
7346                         }
7347                         endblock = extrec[i].startBlock + extrec[i].blockCount;
7348                         if (endblock > allocLimit) {
7349                                 overlapped = true;
7350                                 goto out;
7351                         }
7352                 }
7353                 /* Look for more records. */
7354                 error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
7355         }
7356
7357 out:
7358         if (lockflags) {
7359                 hfs_systemfile_unlock(hfsmp, lockflags);
7360         }
7361         if (iterator) {
7362                 kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
7363         }
7364         return overlapped;
7365 }
7366
7367
7368 /*
7369  * Calculate the progress of a file system resize operation.
7370  */
7371 __private_extern__
7372 int
7373 hfs_resize_progress(struct hfsmount *hfsmp, u_int32_t *progress)
7374 {
7375         if ((hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) == 0) {
7376                 return (ENXIO);
7377         }
7378
7379         if (hfsmp->hfs_resize_totalblocks > 0) {
7380                 *progress = (u_int32_t)((hfsmp->hfs_resize_blocksmoved * 100ULL) / hfsmp->hfs_resize_totalblocks);
7381         } else {
7382                 *progress = 0;
7383         }
7384
7385         return (0);
7386 }
7387
7388
7389 /*
7390  * Creates a UUID from a unique "name" in the HFS UUID Name space.
7391  * See version 3 UUID.
7392  */
7393 static void
7394 hfs_getvoluuid(struct hfsmount *hfsmp, uuid_t result)
7395 {
7396         MD5_CTX  md5c;
7397         uint8_t  rawUUID[8];
7398
7399         ((uint32_t *)rawUUID)[0] = hfsmp->vcbFndrInfo[6];
7400         ((uint32_t *)rawUUID)[1] = hfsmp->vcbFndrInfo[7];
7401
7402         MD5Init( &md5c );
7403         MD5Update( &md5c, HFS_UUID_NAMESPACE_ID, sizeof( uuid_t ) );
7404         MD5Update( &md5c, rawUUID, sizeof (rawUUID) );
7405         MD5Final( result, &md5c );
7406
7407         result[6] = 0x30 | ( result[6] & 0x0F );
7408         result[8] = 0x80 | ( result[8] & 0x3F );
7409 }
7410
7411 /*
7412  * Get file system attributes.
7413  */
7414 static int
7415 hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context)
7416 {
7417 #define HFS_ATTR_CMN_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST))
7418 #define HFS_ATTR_FILE_VALIDMASK (ATTR_FILE_VALIDMASK & ~(ATTR_FILE_FILETYPE | ATTR_FILE_FORKCOUNT | ATTR_FILE_FORKLIST))
7419 #define HFS_ATTR_CMN_VOL_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST | ATTR_CMN_ACCTIME))
7420
7421         ExtendedVCB *vcb = VFSTOVCB(mp);
7422         struct hfsmount *hfsmp = VFSTOHFS(mp);
7423         u_int32_t freeCNIDs;
7424
7425         int searchfs_on = 0;
7426         int exchangedata_on = 1;
7427
7428 #if CONFIG_SEARCHFS
7429         searchfs_on = 1;
7430 #endif
7431
7432 #if CONFIG_PROTECT
7433         if (cp_fs_protected(mp)) {
7434                 exchangedata_on = 0;
7435         }
7436 #endif
7437
7438         freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)hfsmp->vcbNxtCNID;
7439
7440         VFSATTR_RETURN(fsap, f_objcount, (u_int64_t)hfsmp->vcbFilCnt + (u_int64_t)hfsmp->vcbDirCnt);
7441         VFSATTR_RETURN(fsap, f_filecount, (u_int64_t)hfsmp->vcbFilCnt);
7442         VFSATTR_RETURN(fsap, f_dircount, (u_int64_t)hfsmp->vcbDirCnt);
7443         VFSATTR_RETURN(fsap, f_maxobjcount, (u_int64_t)0xFFFFFFFF);
7444         VFSATTR_RETURN(fsap, f_iosize, (size_t)cluster_max_io_size(mp, 0));
7445         VFSATTR_RETURN(fsap, f_blocks, (u_int64_t)hfsmp->totalBlocks);
7446         VFSATTR_RETURN(fsap, f_bfree, (u_int64_t)hfs_freeblks(hfsmp, 0));
7447         VFSATTR_RETURN(fsap, f_bavail, (u_int64_t)hfs_freeblks(hfsmp, 1));
7448         VFSATTR_RETURN(fsap, f_bsize, (u_int32_t)vcb->blockSize);
7449         /* XXX needs clarification */
7450         VFSATTR_RETURN(fsap, f_bused, hfsmp->totalBlocks - hfs_freeblks(hfsmp, 1));
7451         /* Maximum files is constrained by total blocks. */
7452         VFSATTR_RETURN(fsap, f_files, (u_int64_t)(hfsmp->totalBlocks - 2));
7453         VFSATTR_RETURN(fsap, f_ffree, MIN((u_int64_t)freeCNIDs, (u_int64_t)hfs_freeblks(hfsmp, 1)));
7454
7455         fsap->f_fsid.val[0] = hfsmp->hfs_raw_dev;
7456         fsap->f_fsid.val[1] = vfs_typenum(mp);
7457         VFSATTR_SET_SUPPORTED(fsap, f_fsid);
7458
7459         VFSATTR_RETURN(fsap, f_signature, vcb->vcbSigWord);
7460         VFSATTR_RETURN(fsap, f_carbon_fsid, 0);
7461
7462         if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) {
7463                 vol_capabilities_attr_t *cap;
7464
7465                 cap = &fsap->f_capabilities;
7466
7467                 if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) {
7468                         /* HFS+ & variants */
7469                         cap->capabilities[VOL_CAPABILITIES_FORMAT] =
7470                                 VOL_CAP_FMT_PERSISTENTOBJECTIDS |
7471                                 VOL_CAP_FMT_SYMBOLICLINKS |
7472                                 VOL_CAP_FMT_HARDLINKS |
7473                                 VOL_CAP_FMT_JOURNAL |
7474                                 VOL_CAP_FMT_ZERO_RUNS |
7475                                 (hfsmp->jnl ? VOL_CAP_FMT_JOURNAL_ACTIVE : 0) |
7476                                 (hfsmp->hfs_flags & HFS_CASE_SENSITIVE ? VOL_CAP_FMT_CASE_SENSITIVE : 0) |
7477                                 VOL_CAP_FMT_CASE_PRESERVING |
7478                                 VOL_CAP_FMT_FAST_STATFS |
7479                                 VOL_CAP_FMT_2TB_FILESIZE |
7480                                 VOL_CAP_FMT_HIDDEN_FILES |
7481 #if HFS_COMPRESSION
7482                                 VOL_CAP_FMT_PATH_FROM_ID |
7483                                 VOL_CAP_FMT_DECMPFS_COMPRESSION;
7484 #else
7485                                 VOL_CAP_FMT_PATH_FROM_ID;
7486 #endif
7487                 }
7488 #if CONFIG_HFS_STD
7489                 else {
7490                         /* HFS standard */
7491                         cap->capabilities[VOL_CAPABILITIES_FORMAT] =
7492                                 VOL_CAP_FMT_PERSISTENTOBJECTIDS |
7493                                 VOL_CAP_FMT_CASE_PRESERVING |
7494                                 VOL_CAP_FMT_FAST_STATFS |
7495                                 VOL_CAP_FMT_HIDDEN_FILES |
7496                                 VOL_CAP_FMT_PATH_FROM_ID;
7497                 }
7498 #endif
7499
7500                 /*
7501                  * The capabilities word in 'cap' tell you whether or not
7502                  * this particular filesystem instance has feature X enabled.
7503                  */
7504
7505                 cap->capabilities[VOL_CAPABILITIES_INTERFACES] =
7506                         VOL_CAP_INT_ATTRLIST |
7507                         VOL_CAP_INT_NFSEXPORT |
7508                         VOL_CAP_INT_READDIRATTR |
7509                         VOL_CAP_INT_ALLOCATE |
7510                         VOL_CAP_INT_VOL_RENAME |
7511                         VOL_CAP_INT_ADVLOCK |
7512                         VOL_CAP_INT_FLOCK |
7513 #if NAMEDSTREAMS
7514                         VOL_CAP_INT_EXTENDED_ATTR |
7515                         VOL_CAP_INT_NAMEDSTREAMS;
7516 #else
7517                         VOL_CAP_INT_EXTENDED_ATTR;
7518 #endif
7519
7520                 /* HFS may conditionally support searchfs and exchangedata depending on the runtime */
7521
7522                 if (searchfs_on) {
7523                         cap->capabilities[VOL_CAPABILITIES_INTERFACES] |= VOL_CAP_INT_SEARCHFS;
7524                 }
7525                 if (exchangedata_on) {
7526                         cap->capabilities[VOL_CAPABILITIES_INTERFACES] |= VOL_CAP_INT_EXCHANGEDATA;
7527                 }
7528
7529                 cap->capabilities[VOL_CAPABILITIES_RESERVED1] = 0;
7530                 cap->capabilities[VOL_CAPABILITIES_RESERVED2] = 0;
7531
7532                 cap->valid[VOL_CAPABILITIES_FORMAT] =
7533                         VOL_CAP_FMT_PERSISTENTOBJECTIDS |
7534                         VOL_CAP_FMT_SYMBOLICLINKS |
7535                         VOL_CAP_FMT_HARDLINKS |
7536                         VOL_CAP_FMT_JOURNAL |
7537                         VOL_CAP_FMT_JOURNAL_ACTIVE |
7538                         VOL_CAP_FMT_NO_ROOT_TIMES |
7539                         VOL_CAP_FMT_SPARSE_FILES |
7540                         VOL_CAP_FMT_ZERO_RUNS |
7541                         VOL_CAP_FMT_CASE_SENSITIVE |
7542                         VOL_CAP_FMT_CASE_PRESERVING |
7543                         VOL_CAP_FMT_FAST_STATFS |
7544                         VOL_CAP_FMT_2TB_FILESIZE |
7545                         VOL_CAP_FMT_OPENDENYMODES |
7546                         VOL_CAP_FMT_HIDDEN_FILES |
7547 #if HFS_COMPRESSION
7548                         VOL_CAP_FMT_PATH_FROM_ID |
7549                         VOL_CAP_FMT_DECMPFS_COMPRESSION;
7550 #else
7551                         VOL_CAP_FMT_PATH_FROM_ID;
7552 #endif
7553
7554                 /*
7555                  * Bits in the "valid" field tell you whether or not the on-disk
7556                  * format supports feature X.
7557                  */
7558
7559                 cap->valid[VOL_CAPABILITIES_INTERFACES] =
7560                         VOL_CAP_INT_ATTRLIST |
7561                         VOL_CAP_INT_NFSEXPORT |
7562                         VOL_CAP_INT_READDIRATTR |
7563                         VOL_CAP_INT_COPYFILE |
7564                         VOL_CAP_INT_ALLOCATE |
7565                         VOL_CAP_INT_VOL_RENAME |
7566                         VOL_CAP_INT_ADVLOCK |
7567                         VOL_CAP_INT_FLOCK |
7568                         VOL_CAP_INT_MANLOCK |
7569 #if NAMEDSTREAMS
7570                         VOL_CAP_INT_EXTENDED_ATTR |
7571                         VOL_CAP_INT_NAMEDSTREAMS;
7572 #else
7573                         VOL_CAP_INT_EXTENDED_ATTR;
7574 #endif
7575
7576                 /* HFS always supports exchangedata and searchfs in the on-disk format natively */
7577                 cap->valid[VOL_CAPABILITIES_INTERFACES] |= (VOL_CAP_INT_SEARCHFS | VOL_CAP_INT_EXCHANGEDATA);
7578
7579
7580                 cap->valid[VOL_CAPABILITIES_RESERVED1] = 0;
7581                 cap->valid[VOL_CAPABILITIES_RESERVED2] = 0;
7582                 VFSATTR_SET_SUPPORTED(fsap, f_capabilities);
7583         }
7584         if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) {
7585                 vol_attributes_attr_t *attrp = &fsap->f_attributes;
7586
7587                 attrp->validattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK;
7588                 attrp->validattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO;
7589                 attrp->validattr.dirattr = ATTR_DIR_VALIDMASK;
7590                 attrp->validattr.fileattr = HFS_ATTR_FILE_VALIDMASK;
7591                 attrp->validattr.forkattr = 0;
7592
7593                 attrp->nativeattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK;
7594                 attrp->nativeattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO;
7595                 attrp->nativeattr.dirattr = ATTR_DIR_VALIDMASK;
7596                 attrp->nativeattr.fileattr = HFS_ATTR_FILE_VALIDMASK;
7597                 attrp->nativeattr.forkattr = 0;
7598                 VFSATTR_SET_SUPPORTED(fsap, f_attributes);
7599         }
7600         fsap->f_create_time.tv_sec = hfsmp->hfs_itime;
7601         fsap->f_create_time.tv_nsec = 0;
7602         VFSATTR_SET_SUPPORTED(fsap, f_create_time);
7603         fsap->f_modify_time.tv_sec = hfsmp->vcbLsMod;
7604         fsap->f_modify_time.tv_nsec = 0;
7605         VFSATTR_SET_SUPPORTED(fsap, f_modify_time);
7606
7607         fsap->f_backup_time.tv_sec = hfsmp->vcbVolBkUp;
7608         fsap->f_backup_time.tv_nsec = 0;
7609         VFSATTR_SET_SUPPORTED(fsap, f_backup_time);
7610         if (VFSATTR_IS_ACTIVE(fsap, f_fssubtype)) {
7611                 u_int16_t subtype = 0;
7612
7613                 /*
7614                  * Subtypes (flavors) for HFS
7615                  *   0:   Mac OS Extended
7616                  *   1:   Mac OS Extended (Journaled)
7617                  *   2:   Mac OS Extended (Case Sensitive)
7618                  *   3:   Mac OS Extended (Case Sensitive, Journaled)
7619                  *   4 - 127:   Reserved
7620                  * 128:   Mac OS Standard
7621                  *
7622                  */
7623                 if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) {
7624                         if (hfsmp->jnl) {
7625                                 subtype |= HFS_SUBTYPE_JOURNALED;
7626                         }
7627                         if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) {
7628                                 subtype |= HFS_SUBTYPE_CASESENSITIVE;
7629                         }
7630                 }
7631 #if CONFIG_HFS_STD
7632                 else {
7633                         subtype = HFS_SUBTYPE_STANDARDHFS;
7634                 }
7635 #endif
7636                 fsap->f_fssubtype = subtype;
7637                 VFSATTR_SET_SUPPORTED(fsap, f_fssubtype);
7638         }
7639
7640         if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
7641                 strlcpy(fsap->f_vol_name, (char *) hfsmp->vcbVN, MAXPATHLEN);
7642                 VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
7643         }
7644         if (VFSATTR_IS_ACTIVE(fsap, f_uuid)) {
7645                 hfs_getvoluuid(hfsmp, fsap->f_uuid);
7646                 VFSATTR_SET_SUPPORTED(fsap, f_uuid);
7647         }
7648         return (0);
7649 }
7650
7651 /*
7652  * Perform a volume rename.  Requires the FS' root vp.
7653  */
7654 static int
7655 hfs_rename_volume(struct vnode *vp, const char *name, proc_t p)
7656 {
7657         ExtendedVCB *vcb = VTOVCB(vp);
7658         struct cnode *cp = VTOC(vp);
7659         struct hfsmount *hfsmp = VTOHFS(vp);
7660         struct cat_desc to_desc;
7661         struct cat_desc todir_desc;
7662         struct cat_desc new_desc;
7663         cat_cookie_t cookie;
7664         int lockflags;
7665         int error = 0;
7666         char converted_volname[256];
7667         size_t volname_length = 0;
7668         size_t conv_volname_length = 0;
7669
7670
7671         /*
7672          * Ignore attempts to rename a volume to a zero-length name.
7673          */
7674         if (name[0] == 0)
7675                 return(0);
7676
7677         bzero(&to_desc, sizeof(to_desc));
7678         bzero(&todir_desc, sizeof(todir_desc));
7679         bzero(&new_desc, sizeof(new_desc));
7680         bzero(&cookie, sizeof(cookie));
7681
7682         todir_desc.cd_parentcnid = kHFSRootParentID;
7683         todir_desc.cd_cnid = kHFSRootFolderID;
7684         todir_desc.cd_flags = CD_ISDIR;
7685
7686         to_desc.cd_nameptr = (const u_int8_t *)name;
7687         to_desc.cd_namelen = strlen(name);
7688         to_desc.cd_parentcnid = kHFSRootParentID;
7689         to_desc.cd_cnid = cp->c_cnid;
7690         to_desc.cd_flags = CD_ISDIR;
7691
7692         if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) == 0) {
7693                 if ((error = hfs_start_transaction(hfsmp)) == 0) {
7694                         if ((error = cat_preflight(hfsmp, CAT_RENAME, &cookie, p)) == 0) {
7695                                 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
7696
7697                                 error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, &new_desc);
7698
7699                                 /*
7700                                  * If successful, update the name in the VCB, ensure it's terminated.
7701                                  */
7702                                 if (error == 0) {
7703                                         strlcpy((char *)vcb->vcbVN, name, sizeof(vcb->vcbVN));
7704
7705                                         volname_length = strlen ((const char*)vcb->vcbVN);
7706 #define DKIOCCSSETLVNAME _IOW('d', 198, char[256])
7707                                         /* Send the volume name down to CoreStorage if necessary */
7708                                         error = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED);
7709                                         if (error == 0) {
7710                                                 (void) VNOP_IOCTL (hfsmp->hfs_devvp, DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current());
7711                                         }
7712                                         error = 0;
7713                                 }
7714
7715                                 hfs_systemfile_unlock(hfsmp, lockflags);
7716                                 cat_postflight(hfsmp, &cookie, p);
7717
7718                                 if (error)
7719                                         MarkVCBDirty(vcb);
7720                                 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
7721                         }
7722                         hfs_end_transaction(hfsmp);
7723                 }
7724                 if (!error) {
7725                         /* Release old allocated name buffer */
7726                         if (cp->c_desc.cd_flags & CD_HASBUF) {
7727                                 const char *tmp_name = (const char *)cp->c_desc.cd_nameptr;
7728
7729                                 cp->c_desc.cd_nameptr = 0;
7730                                 cp->c_desc.cd_namelen = 0;
7731                                 cp->c_desc.cd_flags &= ~CD_HASBUF;
7732                                 vfs_removename(tmp_name);
7733                         }
7734                         /* Update cnode's catalog descriptor */
7735                         replace_desc(cp, &new_desc);
7736                         vcb->volumeNameEncodingHint = new_desc.cd_encoding;
7737                         cp->c_touch_chgtime = TRUE;
7738                 }
7739
7740                 hfs_unlock(cp);
7741         }
7742
7743         return(error);
7744 }
7745
7746 /*
7747  * Get file system attributes.
7748  */
7749 static int
7750 hfs_vfs_setattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context)
7751 {
7752         kauth_cred_t cred = vfs_context_ucred(context);
7753         int error = 0;
7754
7755         /*
7756          * Must be superuser or owner of filesystem to change volume attributes
7757          */
7758         if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(mp)->f_owner))
7759                 return(EACCES);
7760
7761         if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
7762                 vnode_t root_vp;
7763
7764                 error = hfs_vfs_root(mp, &root_vp, context);
7765                 if (error)
7766                         goto out;
7767
7768                 error = hfs_rename_volume(root_vp, fsap->f_vol_name, vfs_context_proc(context));
7769                 (void) vnode_put(root_vp);
7770                 if (error)
7771                         goto out;
7772
7773                 VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
7774         }
7775
7776 out:
7777         return error;
7778 }
7779
7780 /* If a runtime corruption is detected, set the volume inconsistent
7781  * bit in the volume attributes.  The volume inconsistent bit is a persistent
7782  * bit which represents that the volume is corrupt and needs repair.
7783  * The volume inconsistent bit can be set from the kernel when it detects
7784  * runtime corruption or from file system repair utilities like fsck_hfs when
7785  * a repair operation fails.  The bit should be cleared only from file system
7786  * verify/repair utility like fsck_hfs when a verify/repair succeeds.
7787  */
7788 void hfs_mark_volume_inconsistent(struct hfsmount *hfsmp)
7789 {
7790         hfs_lock_mount (hfsmp);
7791         if ((hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) == 0) {
7792                 hfsmp->vcbAtrb |= kHFSVolumeInconsistentMask;
7793                 MarkVCBDirty(hfsmp);
7794         }
7795         if ((hfsmp->hfs_flags & HFS_READ_ONLY)==0) {
7796                 /* Log information to ASL log */
7797                 fslog_fs_corrupt(hfsmp->hfs_mp);
7798                 printf("hfs: Runtime corruption detected on %s, fsck will be forced on next mount.\n", hfsmp->vcbVN);
7799         }
7800         hfs_unlock_mount (hfsmp);
7801 }
7802
7803 /* Replay the journal on the device node provided.  Returns zero if
7804  * journal replay succeeded or no journal was supposed to be replayed.
7805  */
7806 static int hfs_journal_replay(vnode_t devvp, vfs_context_t context)
7807 {
7808         int retval = 0;
7809         int error = 0;
7810         struct mount *mp = NULL;
7811         struct hfs_mount_args *args = NULL;
7812
7813         /* Replay allowed only on raw devices */
7814         if (!vnode_ischr(devvp) && !vnode_isblk(devvp)) {
7815                 retval = EINVAL;
7816                 goto out;
7817         }
7818
7819         /* Create dummy mount structures */
7820         MALLOC(mp, struct mount *, sizeof(struct mount), M_TEMP, M_WAITOK);
7821         if (mp == NULL) {
7822                 retval = ENOMEM;
7823                 goto out;
7824         }
7825         bzero(mp, sizeof(struct mount));
7826         mount_lock_init(mp);
7827
7828         MALLOC(args, struct hfs_mount_args *, sizeof(struct hfs_mount_args), M_TEMP, M_WAITOK);
7829         if (args == NULL) {
7830                 retval = ENOMEM;
7831                 goto out;
7832         }
7833         bzero(args, sizeof(struct hfs_mount_args));
7834
7835         retval = hfs_mountfs(devvp, mp, args, 1, context);
7836         buf_flushdirtyblks(devvp, TRUE, 0, "hfs_journal_replay");
7837
7838         /* FSYNC the devnode to be sure all data has been flushed */
7839         error = VNOP_FSYNC(devvp, MNT_WAIT, context);
7840         if (error) {
7841                 retval = error;
7842         }
7843
7844 out:
7845         if (mp) {
7846                 mount_lock_destroy(mp);
7847                 FREE(mp, M_TEMP);
7848         }
7849         if (args) {
7850                 FREE(args, M_TEMP);
7851         }
7852         return retval;
7853 }
7854
7855 /*
7856  * hfs vfs operations.
7857  */
7858 struct vfsops hfs_vfsops = {
7859         hfs_mount,
7860         hfs_start,
7861         hfs_unmount,
7862         hfs_vfs_root,
7863         hfs_quotactl,
7864         hfs_vfs_getattr,        /* was hfs_statfs */
7865         hfs_sync,
7866         hfs_vfs_vget,
7867         hfs_fhtovp,
7868         hfs_vptofh,
7869         hfs_init,
7870         hfs_sysctl,
7871         hfs_vfs_setattr,
7872         {NULL}
7873 };