livefiles_hfs_plugin/lf_hfs_journal.c

   1 /*
   2  * Copyright (c) 2002-2015 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 //
  29 // This file implements a simple write-ahead journaling layer.
  30 // In theory any file system can make use of it by calling these
  31 // functions when the fs wants to modify meta-data blocks.  See
  32 // hfs_journal.h for a more detailed description of the api and
  33 // data structures.
  34 //
  35 // Dominic Giampaolo (dbg@apple.com)
  36 // Port to Live-Files: Oded Shoshani (oshoshani@apple.com)
  37 //
  38
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <string.h>
  42 #include <limits.h>
  43 #include <errno.h>
  44 #include <fcntl.h>
  45 #include <unistd.h>
  46 #include <stdarg.h>
  47 #include <assert.h>
  48 #include <sys/sysctl.h>
  49 #include <sys/types.h>
  50 #include <mach/mach.h>
  51 #include <sys/disk.h>
  52 #include <sys/kdebug.h>
  53 #include "lf_hfs_locks.h"
  54 #include "lf_hfs_journal.h"
  55 #include "lf_hfs_vfsutils.h"
  56 #include "lf_hfs_raw_read_write.h"
  57 #include "lf_hfs_generic_buf.h"
  58 #include "lf_hfs_logger.h"
  59 #include "lf_hfs_vfsops.h"
  60
  61 // ************************** Function Definitions ***********************
// BLHDR_CHECKSUM_SIZE (defined further down) is the number of bytes to
// checksum in a block_list_header.
// NOTE: this should be enough to cover the header
//       fields as well as the first entry of binfo[]
  65
  66 #define CHECK_JOURNAL(jnl)                                                   \
  67     do {                                                                     \
  68         if (jnl == NULL) {                                                   \
  69             printf("%s:%d: null journal ptr?\n", __FILE__, __LINE__);        \
  70             panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);         \
  71         }                                                                    \
  72         if (jnl->jdev == NULL) {                                             \
  73             printf("%s:%d: jdev is null!\n", __FILE__, __LINE__);            \
  74             panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);             \
  75         }                                                                    \
  76         if (jnl->fsdev == NULL) {                                            \
  77             printf("%s:%d: fsdev is null!\n", __FILE__, __LINE__);           \
  78             panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);            \
  79         }                                                                    \
  80         if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {                      \
  81             printf("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",           \
  82                 __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \
  83             panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",            \
  84                 __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \
  85         }                                                                    \
  86         if (jnl->jhdr->start <= 0 || jnl->jhdr->start > jnl->jhdr->size) {   \
  87             printf("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
  88                 __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);      \
  89             panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n",  \
  90                 __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);      \
  91         }                                                                    \
  92         if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {       \
  93             printf("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n",   \
  94                 __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);        \
  95             panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n",    \
  96                 __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);        \
  97         }                                                                    \
  98     } while(0)
  99
 100 #define CHECK_TRANSACTION(tr)        \
 101     do {                             \
 102         if (tr == NULL) {            \
 103             printf("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \
 104             panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);  \
 105         }                            \
 106         if (tr->jnl == NULL) {       \
 107             printf("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \
 108             panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);  \
 109         }                            \
 110         if (tr->blhdr != (block_list_header *)tr->tbuffer) {        \
 111             printf("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \
 112             panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);  \
 113         }                            \
 114         if (tr->total_bytes < 0) {   \
 115             printf("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \
 116             panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);  \
 117         }                            \
 118         if (tr->journal_start < 0) { \
 119             printf("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \
 120             panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);  \
 121         }                            \
 122         if (tr->journal_end < 0) {   \
 123             printf("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \
 124             panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);  \
 125         }                            \
 126         if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) {          \
 127             printf("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);    \
 128             panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);     \
 129         }                            \
 130     } while(0)
 131
// Byte-swap helpers: thin aliases for the OSSwapInt16/32/64 routines.
#define SWAP16(x) OSSwapInt16(x)
#define SWAP32(x) OSSwapInt32(x)
#define SWAP64(x) OSSwapInt64(x)

// Journal I/O flag bits.
// NOTE(review): presumably the values accepted by the 'direction' argument
// of do_journal_io(); the consumer is not visible here -- confirm.
#define JNL_WRITE    0x0001
#define JNL_READ     0x0002
#define JNL_HEADER   0x8000

// Number of bytes of a block_list_header that are checksummed.
#define BLHDR_CHECKSUM_SIZE 32
// Upper bound (2GB) on the journal size accepted by journal_create().
#define MAX_JOURNAL_SIZE 0x80000000U
 142
// NOTE(review): presumably the initial number of 'bucket' entries allocated
// by the code that uses the struct below; that code is not visible here.
#define STARTING_BUCKETS 256
// Per-block bookkeeping record.
// NOTE(review): field meanings are inferred from their names only (block
// number, offset of the block's data inside the journal, block size in
// bytes, checksum) -- confirm against the code that fills them in.
typedef struct bucket {
    off_t     block_num;
    uint32_t  jnl_offset;
    uint32_t  block_size;
    int32_t   cksum;
} bucket;
 150
// Prototypes for file-local (static) helpers, declared up front so that the
// functions in this file can call them regardless of definition order.
static int     replay_journal(journal *jnl);
static void    free_old_stuff(journal *jnl);
static errno_t journal_allocate_transaction(journal *jnl);
static void    get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl);
static size_t  read_journal_header(journal *jnl, void *data, size_t len);
static size_t  do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction);
static unsigned int calc_checksum(const char *ptr, int len);
static void    swap_journal_header(journal *jnl);
static int     end_transaction(transaction *tr,
                           int force_it,
                           errno_t (*callback)(void*),
                           void *callback_arg,
                           boolean_t drop_lock);
static void   abort_transaction(journal *jnl, transaction *tr);
static void   size_up_tbuffer(journal *jnl, uint32_t tbuffer_size, uint32_t phys_blksz);
static void   lock_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name);
static void   wait_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name);
static void   unlock_condition(journal *jnl, ConditionalFlag_S *psCondFlag);
static int    write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num);
static size_t read_journal_data(journal *jnl, off_t *offset, void *data, size_t len);
static size_t write_journal_data(journal *jnl, off_t *offset, void *data, size_t len);
 172
 173
 174 static __inline__ void lock_oldstart(journal *jnl) {
 175     lf_lck_mtx_lock(&jnl->old_start_lock);
 176 }
 177
 178 static __inline__ void unlock_oldstart(journal *jnl) {
 179     lf_lck_mtx_unlock(&jnl->old_start_lock);
 180 }
 181
 182 __inline__ void journal_lock(journal *jnl) {
 183     lf_lck_mtx_lock(&jnl->jlock);
 184     if (jnl->owner) {
 185         panic ("jnl: owner is %p, expected NULL\n", jnl->owner);
 186     }
 187     jnl->owner = pthread_self();
 188 }
 189
 190 __inline__ void journal_unlock(journal *jnl) {
 191     jnl->owner = NULL;
 192     lf_lck_mtx_unlock(&jnl->jlock);
 193 }
 194
 195 static __inline__ void lock_flush(journal *jnl) {
 196     lf_lck_mtx_lock(&jnl->flock);
 197 }
 198
 199 static __inline__ void unlock_flush(journal *jnl) {
 200     lf_lck_mtx_unlock(&jnl->flock);
 201 }
 202
 203 // ************************** Global Variables ***********************
 204 // Journal Locking
// Lock attribute / group objects shared by all journals; they start out
// NULL and are filled in by journal_init().
lck_grp_attr_t *jnl_group_attr  = NULL;
lck_attr_t     *jnl_lock_attr   = NULL;
lck_grp_t      *jnl_mutex_group = NULL;

// By default, we grow the list of extents to trim by 4K at a time.
// We'll opt to flush a transaction if it contains at least
// JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number
// of modified blocks is small).
enum {
    JOURNAL_DEFAULT_TRIM_BYTES   = 4096,
    // how many dk_extent_t entries fit in JOURNAL_DEFAULT_TRIM_BYTES
    JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t),
    // 15/16 of the default extent count
    JOURNAL_FLUSH_TRIM_EXTENTS   = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16
};

// Run-time copy of the trim flush threshold, initialised from the enum above.
unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS;

// tbuffer
// Default (128KB) and maximum (3MB) transaction buffer sizes, in bytes.
#define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
#define MAX_TRANSACTION_BUFFER_SIZE      (3072*1024)
uint32_t def_tbuffer_size = 0; // XXXdbg - so I can change it in the debugger
 225
 226 // ************************** Global Functions ***********************
 227 void journal_init(void) {
 228
 229     jnl_lock_attr    = lf_lck_attr_alloc_init();
 230     jnl_group_attr   = lf_lck_grp_attr_alloc_init();
 231     jnl_mutex_group  = lf_lck_grp_alloc_init();
 232 }
 233
 234 journal *journal_open(struct vnode *jvp,
 235              off_t         offset,
 236              off_t         journal_size,
 237              struct vnode *fsvp,
 238              size_t        min_fs_blksz,
 239              int32_t       flags,
 240              int32_t       tbuffer_size,
 241              void        (*flush)(void *arg),
 242              void         *arg,
 243              struct mount *fsmount) {
 244     journal        *jnl;
 245     uint32_t     orig_blksz=0;
 246     uint32_t     phys_blksz;
 247     u_int32_t    min_size = 0;
 248     int          orig_checksum, checksum;
 249
 250     /* Get the real physical block size. */
 251     if (ioctl(jvp->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
 252         goto cleanup_jdev_name;
 253     }
 254
 255     if (phys_blksz > min_fs_blksz) {
 256         LFHFS_LOG(LEVEL_ERROR, "jnl: open: error: phys blksize %u bigger than min fs blksize %zd\n",
 257                phys_blksz, min_fs_blksz);
 258         goto cleanup_jdev_name;
 259     }
 260
 261     if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) {
 262         LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal size %lld looks bogus.\n", journal_size);
 263         goto cleanup_jdev_name;
 264     }
 265
 266     min_size = phys_blksz * (phys_blksz / sizeof(block_info));
 267     /* Reject journals that are too small given the sector size of the device */
 268     if (journal_size < min_size) {
 269         LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal size (%lld) too small given sector size of (%u)\n",
 270                journal_size, phys_blksz);
 271         goto cleanup_jdev_name;
 272     }
 273
 274     if ((journal_size % phys_blksz) != 0) {
 275         LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
 276                journal_size, phys_blksz);
 277         goto cleanup_jdev_name;
 278     }
 279
 280     jnl = hfs_mallocz(sizeof(struct journal));
 281
 282     jnl->jdev         = jvp;
 283     jnl->jdev_offset  = offset;
 284     jnl->jdev_blknum  = (uint32_t)(offset / min_fs_blksz);
 285     jnl->fsdev        = fsvp;
 286     jnl->flush        = flush;
 287     jnl->flush_arg    = arg;
 288     jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
 289     lf_lck_mtx_init(&jnl->old_start_lock);
 290     lf_cond_init(&jnl->flushing.sCond);
 291     lf_cond_init(&jnl->asyncIO.sCond);
 292     lf_cond_init(&jnl->writing_header.sCond);
 293
 294     /* We hold the mount to later pass to the throttling code for IO
 295      * accounting.
 296      */
 297     jnl->fsmount      = fsmount;
 298
 299     get_io_info(jvp, phys_blksz, jnl);
 300
 301     jnl->header_buf = hfs_malloc(phys_blksz);
 302     jnl->header_buf_size = phys_blksz;
 303
 304     jnl->jhdr = (journal_header *)jnl->header_buf;
 305     memset(jnl->jhdr, 0, sizeof(journal_header));
 306
 307     // we have to set this up here so that do_journal_io() will work
 308     jnl->jhdr->jhdr_size = phys_blksz;
 309
 310     if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
 311         LFHFS_LOG(LEVEL_ERROR, "jnl: open: could not read %u bytes for the journal header.\n",
 312                phys_blksz);
 313         goto bad_journal;
 314     }
 315
 316     /*
 317      * Check for a bad jhdr size after reading in the journal header.
 318      * The journal header length cannot be zero
 319      */
 320     if (jnl->jhdr->jhdr_size == 0) {
 321         LFHFS_LOG(LEVEL_ERROR, "jnl: open: bad jhdr size (%d) \n", jnl->jhdr->jhdr_size);
 322         goto bad_journal;
 323     }
 324
 325     orig_checksum = jnl->jhdr->checksum;
 326     jnl->jhdr->checksum = 0;
 327
 328     if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
 329
 330         // do this before the swap since it's done byte-at-a-time
 331         orig_checksum = SWAP32(orig_checksum);
 332         checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
 333         swap_journal_header(jnl);
 334         jnl->flags |= JOURNAL_NEED_SWAP;
 335
 336     } else {
 337
 338         checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
 339     }
 340
 341     if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
 342         LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal magic is bad (0x%x != 0x%x)\n",
 343                jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
 344         goto bad_journal;
 345     }
 346
 347     // only check if we're the current journal header magic value
 348     if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
 349
 350         if (orig_checksum != checksum) {
 351             LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal checksum is bad (0x%x != 0x%x)\n",
 352                    orig_checksum, checksum);
 353
 354             //goto bad_journal;
 355         }
 356     }
 357
 358     // XXXdbg - convert old style magic numbers to the new one
 359     if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
 360         jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
 361     }
 362
 363     if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
 364         /*
 365          * The volume has probably been resized (such that we had to adjust the
 366          * logical sector size), or copied to media with a different logical
 367          * sector size.
 368          *
 369          * For us, though, no big deal because we are giving byte offsets to
 370          * pread() and pwrite() to do our I/O, and as long as we use self-
 371          * consistent units, we are all good.
 372          */
 373         LFHFS_LOG(LEVEL_ERROR,
 374                   "jnl: block size mismatch: phys_blksz=%llu, jhdr->jhdr_size=%llu -- COMPENSATING\n",
 375                   (unsigned long long)phys_blksz, (unsigned long long)jnl->jhdr->jhdr_size);
 376         orig_blksz = phys_blksz;
 377     }
 378
 379     if (   jnl->jhdr->start <= 0
 380         || jnl->jhdr->start > jnl->jhdr->size
 381         || jnl->jhdr->start > 1024*1024*1024) {
 382         LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
 383                jnl->jhdr->start, jnl->jhdr->size);
 384         goto bad_journal;
 385     }
 386
 387     if (   jnl->jhdr->end <= 0
 388         || jnl->jhdr->end > jnl->jhdr->size
 389         || jnl->jhdr->end > 1024*1024*1024) {
 390         LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
 391                jnl->jhdr->end, jnl->jhdr->size);
 392         goto bad_journal;
 393     }
 394
 395     if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) {
 396         LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr size looks bad (0x%llx)\n", jnl->jhdr->size);
 397         goto bad_journal;
 398     }
 399
 400     // XXXdbg - can't do these checks because hfs writes all kinds of
 401     //          non-uniform sized blocks even on devices that have a block size
 402     //          that is larger than 512 bytes (i.e. optical media w/2k blocks).
 403     //          therefore these checks will fail and so we just have to punt and
 404     //          do more relaxed checking...
 405     // XXXdbg    if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
 406     if ((jnl->jhdr->start % 512) != 0) {
 407         LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal start (0x%llx) not a multiple of 512?\n",
 408                jnl->jhdr->start);
 409         goto bad_journal;
 410     }
 411
 412     //XXXdbg    if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
 413     if ((jnl->jhdr->end % 512) != 0) {
 414         LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
 415                jnl->jhdr->end, jnl->jhdr->jhdr_size);
 416         goto bad_journal;
 417     }
 418
 419     if (jnl->jhdr->blhdr_size < 0) {
 420         //throw out invalid sizes
 421         LFHFS_LOG(LEVEL_ERROR, "jnl: open: blhdr size looks bogus! (%d) \n",
 422                jnl->jhdr->blhdr_size);
 423         goto bad_journal;
 424     }
 425
 426     // take care of replaying the journal if necessary
 427     if (flags & JOURNAL_RESET) {
 428         LFHFS_LOG(LEVEL_ERROR, "jnl: journal start/end pointers reset! (s 0x%llx e 0x%llx)\n",
 429                jnl->jhdr->start, jnl->jhdr->end);
 430         jnl->jhdr->start = jnl->jhdr->end;
 431     } else if (replay_journal(jnl) != 0) {
 432         LFHFS_LOG(LEVEL_ERROR, "jnl: journal_open: Error replaying the journal!\n");
 433         goto bad_journal;
 434     }
 435
 436     /*
 437      * When we get here, we know that the journal is empty (jnl->jhdr->start ==
 438      * jnl->jhdr->end).  If the device's logical block size was different from
 439      * the journal's header size, then we can now restore the device's logical
 440      * block size and update the journal's header size to match.
 441      *
 442      * Note that we also adjust the journal's start and end so that they will
 443      * be aligned on the new block size.  We pick a new sequence number to
 444      * avoid any problems if a replay found previous transactions using the old
 445      * journal header size.  (See the comments in journal_create(), above.)
 446      */
 447
 448     if (orig_blksz != 0) {
 449         LFHFS_LOG(LEVEL_ERROR, "jnl: updating journal header with block size %llu\n",
 450                   (unsigned long long)phys_blksz);
 451
 452         jnl->jhdr->jhdr_size = phys_blksz;
 453         jnl->jhdr->start = phys_blksz;
 454         jnl->jhdr->end = phys_blksz;
 455         jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
 456                                    (journal_size / phys_blksz) +
 457                                    (random() % 16384)) & 0x00ffffff;
 458
 459         if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
 460             LFHFS_LOG(LEVEL_ERROR, "jnl: open: failed to update journal header size\n");
 461             goto bad_journal;
 462         }
 463     }
 464
 465     // make sure this is in sync!
 466     jnl->active_start = jnl->jhdr->start;
 467     jnl->sequence_num = jnl->jhdr->sequence_num;
 468
 469     // set this now, after we've replayed the journal
 470     size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
 471
 472     // TODO: Does this need to change if the device's logical block size changed?
 473     if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
 474         LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jnl->jhdr->size,
 475                jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
 476         goto bad_journal;
 477     }
 478
 479     lf_lck_mtx_init(&jnl->jlock);
 480     lf_lck_mtx_init(&jnl->flock);
 481     lf_lck_rw_init(&jnl->trim_lock);
 482
 483     goto journal_open_complete;
 484
 485 bad_journal:
 486     hfs_free(jnl->header_buf);
 487     hfs_free(jnl);
 488 cleanup_jdev_name:
 489     jnl = NULL;
 490 journal_open_complete:
 491     return jnl;
 492 }
 493
 494 journal *journal_create(struct vnode *jvp,
 495                off_t         offset,
 496                off_t         journal_size,
 497                struct vnode *fsvp,
 498                size_t        min_fs_blksz,
 499                int32_t       flags,
 500                int32_t       tbuffer_size,
 501                void          (*flush)(void *arg),
 502                void          *arg,
 503                struct mount  *fsmount) {
 504
 505     journal     *jnl;
 506     uint32_t    phys_blksz, new_txn_base;
 507     u_int32_t   min_size;
 508
 509     /*
 510      * Cap the journal max size to 2GB.  On HFS, it will attempt to occupy
 511      * a full allocation block if the current size is smaller than the allocation
 512      * block on which it resides.  Once we hit the exabyte filesystem range, then
 513      * it will use 2GB allocation blocks.  As a result, make the cap 2GB.
 514      */
 515
 516     /* Get the real physical block size. */
 517     if (ioctl(jvp->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
 518         goto cleanup_jdev_name;
 519     }
 520
 521     if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
 522         LFHFS_LOG(LEVEL_ERROR, "jnl: create: journal size %lld looks bogus.\n", journal_size);
 523         goto cleanup_jdev_name;
 524     }
 525
 526     min_size = phys_blksz * (phys_blksz / sizeof(block_info));
 527     /* Reject journals that are too small given the sector size of the device */
 528     if (journal_size < min_size) {
 529         LFHFS_LOG(LEVEL_ERROR, "jnl: create: journal size (%lld) too small given sector size of (%u)\n",
 530                journal_size, phys_blksz);
 531         goto cleanup_jdev_name;
 532     }
 533
 534     if (phys_blksz > min_fs_blksz) {
 535         LFHFS_LOG(LEVEL_ERROR, "jnl: create: error: phys blksize %u bigger than min fs blksize %zd\n",
 536                 phys_blksz, min_fs_blksz);
 537         goto cleanup_jdev_name;
 538     }
 539
 540     if ((journal_size % phys_blksz) != 0) {
 541         LFHFS_LOG(LEVEL_ERROR, "jnl: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
 542                journal_size, phys_blksz);
 543         goto cleanup_jdev_name;
 544     }
 545
 546
 547     jnl = hfs_mallocz(sizeof(struct journal));
 548
 549     jnl->jdev         = jvp;
 550     jnl->jdev_offset  = offset;
 551     jnl->jdev_blknum  = (uint32_t)(offset / min_fs_blksz);
 552     jnl->fsdev        = fsvp;
 553     jnl->flush        = flush;
 554     jnl->flush_arg    = arg;
 555     jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
 556     lf_lck_mtx_init(&jnl->old_start_lock);
 557
 558     // Keep a point to the mount around for use in IO throttling.
 559     jnl->fsmount      = fsmount;
 560
 561     get_io_info(jvp, phys_blksz, jnl);
 562
 563     jnl->header_buf = hfs_malloc(phys_blksz);
 564     jnl->header_buf_size = phys_blksz;
 565
 566     jnl->jhdr = (journal_header *)jnl->header_buf;
 567     memset(jnl->jhdr, 0, sizeof(journal_header));
 568
 569     // we have to set this up here so that do_journal_io() will work
 570     jnl->jhdr->jhdr_size = phys_blksz;
 571
 572     //
 573     // We try and read the journal header to see if there is already one
 574     // out there.  If there is, it's possible that it has transactions
 575     // in it that we might replay if we happen to pick a sequence number
 576     // that is a little less than the old one, there is a crash and the
 577     // last txn written ends right at the start of a txn from the previous
 578     // incarnation of this file system.  If all that happens we would
 579     // replay the transactions from the old file system and that would
 580     // destroy your disk.  Although it is extremely unlikely for all those
 581     // conditions to happen, the probability is non-zero and the result is
 582     // severe - you lose your file system.  Therefore if we find a valid
 583     // journal header and the sequence number is non-zero we write junk
 584     // over the entire journal so that there is no way we will encounter
 585     // any old transactions.  This is slow but should be a rare event
 586     // since most tools erase the journal.
 587     //
 588     if (   read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz
 589         && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC
 590         && jnl->jhdr->sequence_num != 0) {
 591
 592         new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff;
 593         LFHFS_LOG(LEVEL_ERROR, "jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base);
 594
 595     } else {
 596         new_txn_base = random() & 0x00ffffff;
 597     }
 598
 599     memset(jnl->header_buf, 0, phys_blksz);
 600
 601     jnl->jhdr->magic      = JOURNAL_HEADER_MAGIC;
 602     jnl->jhdr->endian     = ENDIAN_MAGIC;
 603     jnl->jhdr->start      = phys_blksz;    // start at block #1, block #0 is for the jhdr itself
 604     jnl->jhdr->end        = phys_blksz;
 605     jnl->jhdr->size       = journal_size;
 606     jnl->jhdr->jhdr_size  = phys_blksz;
 607     size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
 608
 609     jnl->active_start     = jnl->jhdr->start;
 610
 611     jnl->jhdr->sequence_num = new_txn_base;
 612
 613     lf_lck_mtx_init(&jnl->jlock);
 614     lf_lck_mtx_init(&jnl->flock);
 615     lf_lck_rw_init(&jnl->trim_lock);
 616
 617     lf_cond_init(&jnl->flushing.sCond);
 618     lf_cond_init(&jnl->asyncIO.sCond);
 619     lf_cond_init(&jnl->writing_header.sCond);
 620     jnl->flush_aborted = FALSE;
 621     jnl->async_trim = NULL;
 622     jnl->sequence_num = jnl->jhdr->sequence_num;
 623
 624     if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
 625         LFHFS_LOG(LEVEL_ERROR, "jnl: journal_create: failed to write journal header.\n");
 626         goto bad_write;
 627     }
 628
 629     goto journal_create_complete;
 630
 631
 632 bad_write:
 633     hfs_free(jnl->header_buf);
 634     jnl->jhdr = NULL;
 635     hfs_free(jnl);
 636 cleanup_jdev_name:
 637     jnl = NULL;
 638 journal_create_complete:
 639     return jnl;
 640 }
 641
 642
 643
 644 void *journal_owner(journal *jnl) {
 645     return jnl->owner;
 646 }
 647
 648 /* Is the given cnode either the .journal or .journal_info_block file on
 649  * a volume with an active journal?  Many VNOPs use this to deny access
 650  * to those files.
 651  *
 652  * Note: the .journal file on a volume with an external journal still
 653  * returns true here, even though it does not actually hold the contents
 654  * of the volume's journal.
 655  */
 656 _Bool hfs_is_journal_file(struct hfsmount *hfsmp, struct cnode *cp) {
 657     if (hfsmp->jnl != NULL &&
 658         (cp->c_fileid == hfsmp->hfs_jnlinfoblkid ||
 659          cp->c_fileid == hfsmp->hfs_jnlfileid)) {
 660         return true;
 661     } else {
 662         return false;
 663     }
 664 }
 665
 666 bool is_journaled(UVFSFileNode *psRootNode) {
 667
 668     struct vnode *psRootVnode = *psRootNode;
 669
 670     if (!psRootNode) {
 671         LFHFS_LOG(LEVEL_DEBUG, "is_journaled: psRootNode is NULL");
 672         return false;
 673     }
 674
 675     if (!psRootVnode->sFSParams.vnfs_mp) {
 676         LFHFS_LOG(LEVEL_DEBUG, "is_journaled: psRootVnode->sFSParams.vnfs_mp is NULL");
 677         return false;
 678     }
 679
 680     if (psRootVnode->sFSParams.vnfs_mp->psHfsmount->jnl)
 681         return true;
 682
 683     return false;
 684 }
 685
 686
 687 // Media no longer available, clear all memory occupied by the journal
 688 void journal_release(journal *jnl) {
 689     if (jnl->owner != pthread_self()) {
 690         journal_lock(jnl);
 691     }
 692
 693     if (jnl->active_tr) {
 694         abort_transaction(jnl, jnl->active_tr);
 695     }
 696
 697     if (jnl->cur_tr) {
 698         abort_transaction(jnl, jnl->cur_tr);
 699     }
 700
 701     free_old_stuff(jnl);
 702
 703     hfs_free(jnl->header_buf);
 704     jnl->jhdr = (void *)0xbeefbabe;
 705
 706     journal_unlock(jnl);
 707     lf_lck_mtx_destroy(&jnl->old_start_lock);
 708     lf_lck_mtx_destroy(&jnl->jlock);
 709     lf_lck_mtx_destroy(&jnl->flock);
 710     hfs_free(jnl);
 711 }
 712
 713
// journal_close
// Shuts the journal down and frees it. The journal object itself is freed
// at the end, so jnl must not be used after this function returns.
//
// If the journal is still valid, the active transaction is ended, any
// buffered transaction is flushed synchronously, the FS flush callback is
// invoked (bounded retry) until active_start catches up with jhdr->end, and
// the journal header is written out.
// If the journal has been marked JOURNAL_INVALID, the outstanding
// transaction is aborted instead and nothing is written.
void journal_close(journal *jnl) {
    volatile off_t *start, *end;
    int             counter=0;

    CHECK_JOURNAL(jnl);

    // set this before doing anything that would block so that
    // we start tearing things down properly.
    //
    jnl->flags |= JOURNAL_CLOSE_PENDING;

    // take the journal lock unless this thread already owns the journal
    if (jnl->owner != pthread_self()) {
        journal_lock(jnl);
    }

    // let any flush that is already in progress finish first
    wait_condition(jnl, &jnl->flushing, "journal_close");

    //
    // only write stuff to disk if the journal is still valid
    //
    if ((jnl->flags & JOURNAL_INVALID) == 0) {

        if (jnl->active_tr) {
            /*
             * "journal_end_transaction" will fire the flush asynchronously
             */
            journal_end_transaction(jnl);
        }

        // flush any buffered transactions
        if (jnl->cur_tr) {
            transaction *tr = jnl->cur_tr;

            jnl->cur_tr = NULL;
            /*
             * "end_transaction" will wait for any in-progress flush to complete
             * before flushing "cur_tr" synchronously("must_wait" == TRUE)
             */
            end_transaction(tr, 1, NULL, NULL, FALSE);
        }
        /*
         * if there was an "active_tr", make sure we wait for
         * it to flush if there was no "cur_tr" to process
         */
        wait_condition(jnl, &jnl->flushing, "journal_close");

        //start = &jnl->jhdr->start;
        start = &jnl->active_start;
        end   = &jnl->jhdr->end;

        // Keep calling the FS flush callback until the journal is drained
        // (start == end). Bounded: at most 5000 rounds with a 10ms sleep each.
        while (*start != *end && counter++ < 5000) {
            //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
            if (jnl->flush) {
                jnl->flush(jnl->flush_arg);
            }
            usleep(10000);
        }

        // still not drained after the retry budget: log it and carry on
        if (*start != *end) {
            LFHFS_LOG(LEVEL_ERROR, "jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
                   *start, *end);
        }

        // make sure this is in sync when we close the journal
        jnl->jhdr->start = jnl->active_start;

        // if this fails there's not much we can do at this point...
        write_journal_header(jnl, 1, jnl->sequence_num);
    } else {
        // if we're here the journal isn't valid any more.
        // so make sure we don't leave any locked blocks lying around
        LFHFS_LOG(LEVEL_ERROR, "jnl: close: journal is invalid.  aborting outstanding transactions\n");
        if (jnl->active_tr || jnl->cur_tr) {
            transaction *tr;

            // detach whichever transaction exists (active_tr takes
            // precedence) before aborting it
            if (jnl->active_tr) {
                tr = jnl->active_tr;
                jnl->active_tr = NULL;
            } else {
                tr = jnl->cur_tr;
                jnl->cur_tr = NULL;
            }
            abort_transaction(jnl, tr);

            // only one of the two is expected to be set at this point
            if (jnl->active_tr || jnl->cur_tr) {
                panic("jnl: close: jnl @ %p had both an active and cur tr\n", jnl);
            }
        }
    }
    // wait for outstanding async I/O before releasing any memory
    wait_condition(jnl, &jnl->asyncIO, "journal_close");

    free_old_stuff(jnl);

    // free the header buffer and poison the header pointer
    hfs_free(jnl->header_buf);
    jnl->jhdr = (void *)0xbeefbabe;

    // unlock, then destroy the journal's mutexes and free the journal itself
    journal_unlock(jnl);
    lf_lck_mtx_destroy(&jnl->old_start_lock);
    lf_lck_mtx_destroy(&jnl->jlock);
    lf_lck_mtx_destroy(&jnl->flock);
    hfs_free(jnl);
}
 816
 817 // This function performs the following:
 818 // 1) Checks that we have a valid journal
 819 // 2) locks the journal
 820 // 3) Allocates roon in the journal
 821 int journal_start_transaction(journal *jnl) {
 822
 823     int ret;
 824
 825     #if JOURNAL_DEBUG
 826         printf("journal_start_transaction (%u).\n", jnl->nested_count);
 827     #endif
 828
 829     CHECK_JOURNAL(jnl);
 830
 831     free_old_stuff(jnl);
 832
 833     if (jnl->flags & JOURNAL_INVALID) {
 834         return EINVAL;
 835     }
 836
 837     if (jnl->owner == pthread_self()) {
 838         if (jnl->active_tr == NULL) {
 839             panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
 840                   jnl, jnl->owner, pthread_self());
 841         }
 842         jnl->nested_count++;
 843         return 0;
 844     }
 845
 846     journal_lock(jnl);
 847
 848     if (jnl->nested_count != 0 || jnl->active_tr != NULL) {
 849         panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
 850               jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
 851     }
 852
 853     jnl->nested_count = 1;
 854
 855     // if there's a buffered transaction, use it.
 856     if (jnl->cur_tr) {
 857         jnl->active_tr = jnl->cur_tr;
 858         jnl->cur_tr    = NULL;
 859
 860         return 0;
 861     }
 862
 863     ret = journal_allocate_transaction(jnl);
 864     if (ret) {
 865         goto bad_start;
 866     }
 867
 868     // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);
 869
 870     return 0;
 871
 872 bad_start:
 873     jnl->nested_count = 0;
 874     journal_unlock(jnl);
 875
 876     return ret;
 877 }
 878 // journal_end_transaction
 879 // This function does the following:
 880 // 1) Validates journal status/state
 881 // 2)
 882 int journal_end_transaction(journal *jnl) {
 883     int ret;
 884     transaction *tr;
 885
 886 #if JOURNAL_DEBUG
 887     printf("journal_end_transaction   (%u).\n", jnl->nested_count-1);
 888 #endif
 889
 890     CHECK_JOURNAL(jnl);
 891
 892     free_old_stuff(jnl);
 893
 894     if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
 895         return 0;
 896     }
 897
 898     if (jnl->owner != pthread_self()) {
 899         panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
 900               jnl, jnl->owner, pthread_self());
 901     }
 902     jnl->nested_count--;
 903
 904     if (jnl->nested_count > 0) {
 905         return 0;
 906     } else if (jnl->nested_count < 0) {
 907         panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
 908     }
 909
 910     if (jnl->flags & JOURNAL_INVALID) {
 911         if (jnl->active_tr) {
 912             if (jnl->cur_tr != NULL) {
 913                 panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
 914                       jnl, jnl->active_tr, jnl->cur_tr);
 915             }
 916             tr             = jnl->active_tr;
 917             jnl->active_tr = NULL;
 918
 919             abort_transaction(jnl, tr);
 920         }
 921         journal_unlock(jnl);
 922
 923         return EINVAL;
 924     }
 925
 926     tr = jnl->active_tr;
 927     CHECK_TRANSACTION(tr);
 928
 929     // clear this out here so that when check_free_space() calls
 930     // the FS flush function, we don't panic in journal_flush()
 931     // if the FS were to call that.  note: check_free_space() is
 932     // called from end_transaction().
 933     jnl->active_tr = NULL;
 934
 935     /* Examine the force-journal-flush state in the active txn */
 936     if (tr->flush_on_completion == TRUE) {
 937         /*
 938          * If the FS requested it, disallow group commit and force the
 939          * transaction out to disk immediately.
 940          */
 941         ret = end_transaction(tr, 1, NULL, NULL, TRUE);
 942     }
 943     else {
 944         /* in the common path we can simply use the double-buffered journal */
 945         ret = end_transaction(tr, 0, NULL, NULL, TRUE);
 946     }
 947
 948     return ret;
 949 }
 950
 951 // journal_modify_block_start
 952 // This function does the following:
 953 // 1) Makes sure the journal file is on and valid
 954 // 2) Clean up (free previous transactions)
 955 // 3) Validate that the phy-block-size has not changed.
 956 // 4) Locks the buffer.
 957 // Buffer life cycle with journal:
 958 // 1) Client code (ie btrees_io.c) allocates a buffer (ie gains ownership). Other threads will pend on using this buffer until it is released.
 959 // 2) Client code calls journal_modify_block_start which sets the GEN_BUF_WRITE_LOCK uCacheFlag.
 960 // 3) Client code modifies the buffer.
 961 // 4) Client code calls journal_modify_block_end which released the buffer. The GEN_BUF_WRITE_LOCK flag remains set.
 962 //  It this point other threads are welcomed to modify the buffer (after executing steps 1 and 2 above). The buffer content will not be written to media before transaction_end, thus only the accumulative change of both threads after transaction_end will be committed.
 963 // 5) transaction-end (called from within client-code or async Sync) obtains ownership on in transaction buffers. By doing that it makes sure no buffer is currently being modified by any Client code. It then prepares the buffer for commiting (ie realigns endianizm), and commits (writes to the t-buffer, write the t-buffer to media, updates journal-info, clears the GEN_BUF_WRITE_LOCK flags and writes the buffers to media).
 964 int journal_modify_block_start(journal *jnl, GenericLFBuf *psGenBuf) {
 965
 966     transaction *tr;
 967
 968 #if JOURNAL_DEBUG
 969     printf("journal_modify_block_start: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uCacheFlags 0x%llx, uPhyCluster %llu, uLockCnt %u\n",
 970            psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize, psGenBuf->uCacheFlags ,psGenBuf->uPhyCluster, psGenBuf->uLockCnt);
 971 #endif
 972
 973     CHECK_JOURNAL(jnl);
 974
 975     free_old_stuff(jnl);
 976
 977     if (jnl->flags & JOURNAL_INVALID) {
 978         return EINVAL;
 979     }
 980
 981     tr = jnl->active_tr;
 982     CHECK_TRANSACTION(tr);
 983
 984     if (jnl->owner != pthread_self()) {
 985         panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
 986               jnl, jnl->owner, pthread_self());
 987     }
 988
 989     //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
 990     //   bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
 991
 992     // can't allow blocks that aren't an even multiple of the
 993     // underlying block size.
 994     if ((psGenBuf->uDataSize % jnl->jhdr->jhdr_size) != 0) {
 995         uint32_t bad=0;
 996         uint32_t phys_blksz;
 997
 998         if (ioctl(jnl->jdev->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
 999             bad = 1;
1000         } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) {
1001             if (phys_blksz < 512) {
1002                 panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n",
1003                       phys_blksz, psGenBuf->uDataSize, jnl->jhdr->jhdr_size);
1004             }
1005
1006             if ((psGenBuf->uDataSize % phys_blksz) != 0) {
1007                 bad = 1;
1008             } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) {
1009                 jnl->jhdr->jhdr_size = phys_blksz;
1010             } else {
1011                 // the phys_blksz is now larger... need to realloc the jhdr
1012                 char *new_header_buf;
1013
1014                 LFHFS_LOG(LEVEL_ERROR, "jnl: phys blksz got bigger (was: %d/%d now %d)\n",
1015                        jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz);
1016                 new_header_buf = hfs_malloc(phys_blksz);
1017                 memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size);
1018                 memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size));
1019                 hfs_free(jnl->header_buf);
1020                 jnl->header_buf = new_header_buf;
1021                 jnl->header_buf_size = phys_blksz;
1022
1023                 jnl->jhdr = (journal_header *)jnl->header_buf;
1024                 jnl->jhdr->jhdr_size = phys_blksz;
1025             }
1026         } else {
1027             bad = 1;
1028         }
1029
1030         if (bad) {
1031             panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
1032                   psGenBuf->uDataSize, jnl->jhdr->jhdr_size);
1033
1034             return -1;
1035         }
1036     }
1037
1038     // make sure that this transaction isn't bigger than the whole journal
1039     if ((tr->total_bytes+psGenBuf->uDataSize) >= (size_t)(jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
1040         panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
1041               tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), psGenBuf->uDataSize, tr, psGenBuf->pvData);
1042
1043         return -1;
1044     }
1045
1046     lf_hfs_generic_buf_set_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
1047
1048     return 0;
1049 }
1050 // journal_modify_block_end
1051 // This function does the following:
1052 // 1) Makes sure the journal file is on and valid
1053 // 2) Clean up (free previous transactions)
1054 // 3) Check if this block already exists in transaction
1055 // 4) Add block number to transcation. We dont add the block data, nor we release the buffer at this point.
1056 //    This will be done later on, at the transaction-end.
1057 int journal_modify_block_end(journal *jnl, GenericLFBuf *psGenBuf,
1058                             void (*func)(GenericLFBuf *bp, void *arg), void *arg) {
1059     int                i = 1;
1060     size_t             tbuffer_offset=0;
1061     block_list_header *blhdr, *prev=NULL;
1062     transaction       *tr = NULL;
1063
1064     #if JOURNAL_DEBUG
1065         printf("journal_modify_block_end:   psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n",
1066                psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize, psGenBuf->uPhyCluster, psGenBuf->uLockCnt);
1067     #endif
1068
1069     CHECK_JOURNAL(jnl);
1070
1071     free_old_stuff(jnl);
1072
1073     if (func) {
1074         psGenBuf->pfFunc         = func;
1075         psGenBuf->pvCallbackArgs = arg;
1076     }
1077
1078     if (jnl->flags & JOURNAL_INVALID) {
1079         /* Still need to buf_brelse(). Callers assume we consume the bp. */
1080         lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
1081         lf_hfs_generic_buf_release(psGenBuf);
1082         return EINVAL;
1083     }
1084
1085     tr = jnl->active_tr;
1086     CHECK_TRANSACTION(tr);
1087
1088     if (jnl->owner != pthread_self()) {
1089         panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
1090               jnl, jnl->owner, pthread_self());
1091     }
1092
1093     if ((psGenBuf->uCacheFlags & GEN_BUF_WRITE_LOCK) == 0) {
1094         panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", psGenBuf, jnl);
1095     }
1096
1097     // first check if this block is already part of this transaction
1098     for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
1099         tbuffer_offset = jnl->jhdr->blhdr_size;
1100
1101         for (i = 1; i < blhdr->num_blocks; i++) {
1102             GenericLFBuf *bp = (void*)blhdr->binfo[i].u.bp;
1103             if (psGenBuf == bp) {
1104                 // Block found in transaction
1105                 #if JOURNAL_DEBUG
1106                     printf("block_end, already in journal:   psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n",
1107                        psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize, psGenBuf->uPhyCluster, psGenBuf->uLockCnt);
1108                 #endif
1109                 break;
1110             }
1111             if (blhdr->binfo[i].bnum != (off_t)-1) {
1112                 off_t uSizeOfBuf = ((GenericLFBuf*)(blhdr->binfo[i].u.bp))->uDataSize;
1113                 tbuffer_offset  += uSizeOfBuf;
1114             } else {
1115                 tbuffer_offset  += blhdr->binfo[i].u.bi.bsize;
1116             }
1117         }
1118
1119         if (i < blhdr->num_blocks) {
1120             break;
1121         }
1122     }
1123
1124     if (blhdr == NULL
1125         && prev
1126         && (prev->num_blocks+1) <= prev->max_blocks
1127         && (prev->bytes_used+psGenBuf->uDataSize) <= (uint32_t)tr->tbuffer_size) {
1128         // Block not found, add to last list
1129         blhdr = prev;
1130
1131     } else if (blhdr == NULL) {
1132         block_list_header *nblhdr;
1133         if (prev == NULL) {
1134             panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, psGenBuf %p\n", jnl, psGenBuf);
1135         }
1136         // Add another tbuffer:
1137
1138         // we got to the end of the list, didn't find the block and there's
1139         // no room in the block_list_header pointed to by prev
1140
1141         // we allocate another tbuffer and link it in at the end of the list
1142         // through prev->binfo[0].bnum.  that's a skanky way to do things but
1143         // avoids having yet another linked list of small data structures to manage.
1144
1145         nblhdr = hfs_malloc(tr->tbuffer_size);
1146
1147         // journal replay code checksum check depends on this.
1148         memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
1149         // Fill up the rest of the block with unimportant bytes
1150         memset(nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
1151
1152         // initialize the new guy
1153         nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
1154         nblhdr->num_blocks = 1;      // accounts for this header block
1155         nblhdr->bytes_used = (uint32_t)jnl->jhdr->blhdr_size;
1156         nblhdr->flags = BLHDR_CHECK_CHECKSUMS;
1157
1158         tr->num_blhdrs++;
1159         tr->total_bytes += jnl->jhdr->blhdr_size;
1160
1161         // then link him in at the end
1162         prev->binfo[0].bnum = (off_t)((long)nblhdr);
1163
1164         // and finally switch to using the new guy
1165         blhdr          = nblhdr;
1166         i              = 1;
1167     }
1168
1169     if ((i+1) > blhdr->max_blocks) {
1170         panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
1171     }
1172
1173     // if this is true then this is a new block we haven't seen before
1174     if (i >= blhdr->num_blocks) {
1175         off_t    bsize;
1176         bsize = psGenBuf->uDataSize;
1177
1178         // Add block to list
1179         blhdr->binfo[i].bnum = (off_t)(psGenBuf->uBlockN);
1180         blhdr->binfo[i].u.bp = (void*)psGenBuf;
1181
1182         blhdr->bytes_used += bsize;
1183         tr->total_bytes   += bsize;
1184
1185         blhdr->num_blocks++;
1186     }
1187
1188     // We can release the block here to allow other threads to perform operations on it until the next transaction-end.
1189     // The buffer will not be removed from cache since it is write-locked.
1190     lf_hfs_generic_buf_release(psGenBuf);
1191
1192     return 0;
1193 }
1194
1195 // This function validates if a block is already registered to a transaction
1196 /*
1197  * Flush the contents of the journal to the disk.
1198  *
1199  *  Input:
1200  *      wait_for_IO -
1201  *      If TRUE, wait to write in-memory journal to the disk
1202  *      consistently, and also wait to write all asynchronous
1203  *      metadata blocks to its corresponding locations
1204  *      consistently on the disk.  This means that the journal
1205  *      is empty at this point and does not contain any
1206  *      transactions.  This is overkill in normal scenarios
1207  *      but is useful whenever the metadata blocks are required
1208  *      to be consistent on-disk instead of just the journal
1209  *      being consistent; like before live verification
1210  *      and live volume resizing.
1211  *
1212  *      If FALSE, only wait to write in-memory journal to the
1213  *      disk consistently.  This means that the journal still
1214  *      contains uncommitted transactions and the file system
1215  *      metadata blocks in the journal transactions might be
1216  *      written asynchronously to the disk.  But there is no
1217  *      guarantee that they are written to the disk before
1218  *      returning to the caller.  Note that this option is
1219  *      sufficient for file system data integrity as it
1220  *      guarantees consistent journal content on the disk.
1221  */
1222 int journal_flush(journal *jnl, journal_flush_options_t options) {
1223     boolean_t drop_lock   = FALSE;
1224     errno_t   error       = 0;
1225     uint32_t  flush_count = 0;
1226
1227     CHECK_JOURNAL(jnl);
1228
1229     free_old_stuff(jnl);
1230
1231     if (jnl->flags & JOURNAL_INVALID) {
1232         return EINVAL;
1233     }
1234
1235     if (jnl->owner != pthread_self()) {
1236         journal_lock(jnl);
1237         drop_lock = TRUE;
1238     }
1239
1240     if (ISSET(options, JOURNAL_FLUSH_FULL))
1241         flush_count = jnl->flush_counter;
1242
1243     // if we're not active, flush any buffered transactions
1244     if (jnl->active_tr == NULL && jnl->cur_tr) {
1245         transaction *tr = jnl->cur_tr;
1246
1247         jnl->cur_tr = NULL;
1248
1249         if (ISSET(options, JOURNAL_WAIT_FOR_IO)) {
1250             wait_condition(jnl, &jnl->flushing, "journal_flush");
1251             wait_condition(jnl, &jnl->asyncIO,  "journal_flush");
1252         }
1253
1254         // As the journal flush changes the MetaData content (update Endianizm), we need to lock the system times.
1255         int lockflags = hfs_systemfile_lock(jnl->fsmount->psHfsmount, SFL_CATALOG | SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
1256
1257         /*
1258          * "end_transction" will wait for any current async flush
1259          * to complete, before flushing "cur_tr"... because we've
1260          * specified the 'must_wait' arg as TRUE, it will then
1261          * synchronously flush the "cur_tr"
1262          */
1263         end_transaction(tr, 1, NULL, NULL, drop_lock);   // force it to get flushed
1264
1265         hfs_systemfile_unlock(jnl->fsmount->psHfsmount, lockflags);
1266
1267     } else  {
1268         if (drop_lock == TRUE) {
1269             journal_unlock(jnl);
1270         }
1271
1272         /* Because of pipelined journal, the journal transactions
1273          * might be in process of being flushed on another thread.
1274          * If there is nothing to flush currently, we should
1275          * synchronize ourselves with the pipelined journal thread
1276          * to ensure that all inflight transactions, if any, are
1277          * flushed before we return success to caller.
1278          */
1279         wait_condition(jnl, &jnl->flushing, "journal_flush");
1280     }
1281     if (ISSET(options, JOURNAL_WAIT_FOR_IO)) {
1282         wait_condition(jnl, &jnl->asyncIO, "journal_flush");
1283     }
1284
1285     if (ISSET(options, JOURNAL_FLUSH_FULL)) {
1286
1287         dk_synchronize_t sync_request = {
1288             .options                        = 0,
1289         };
1290
1291         // We need a full cache flush. If it has not been done, do it here.
1292         if (flush_count == jnl->flush_counter)
1293             error = ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request);
1294
1295         // If external journal partition is enabled, flush filesystem data partition.
1296         if (jnl->jdev != jnl->fsdev)
1297             error = ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request);
1298
1299     }
1300
1301     return error;
1302 }
1303
1304
1305 // ************************** Local Functions ***********************
1306 static int update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) {
1307
1308     int            iRet    = 0;
1309     GenericLFBuf *psGenBuf = NULL;
1310
1311     // first read the block we want.
1312     psGenBuf = lf_hfs_generic_buf_allocate(jnl->fsmount->psHfsmount->hfs_devvp,
1313                                            fs_block,
1314                                            (uint32_t)bsize,
1315                                            GEN_BUF_PHY_BLOCK | GEN_BUF_NON_CACHED);
1316     if (!psGenBuf) {
1317         LFHFS_LOG(LEVEL_ERROR, "jnl: update_fs_block: error allocating fs block # %lld!\n", fs_block);
1318         iRet = -1;
1319         goto exit;
1320     }
1321
1322     iRet = lf_hfs_generic_buf_read(psGenBuf);
1323     if (iRet) {
1324         LFHFS_LOG(LEVEL_ERROR, "jnl: update_fs_block: error reading fs block # %lld!\n", fs_block);
1325         goto exit;
1326     }
1327
1328     // copy the journal data over top of it
1329     memcpy(psGenBuf->pvData, block_ptr, bsize);
1330
1331     iRet = lf_hfs_generic_buf_write(psGenBuf);
1332     if (iRet) {
1333         LFHFS_LOG(LEVEL_ERROR, "jnl: update_fs_block: failed to write block %lld (ret %d)\n", fs_block, iRet);
1334         goto exit;
1335     }
1336
1337 exit:
1338     if (psGenBuf) {
1339        lf_hfs_generic_buf_release(psGenBuf);
1340     }
1341
1342     return iRet;
1343 }
1344
1345
1346 static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size) {
1347     struct bucket *newBuf;
1348     int current_size = num_buckets, i;
1349
1350     // return if newsize is less than the current size
1351     if (new_size < num_buckets) {
1352         return current_size;
1353     }
1354
1355     newBuf = hfs_malloc(new_size*sizeof(struct bucket));
1356
1357     //  printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
1358
1359     // copy existing elements
1360     bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));
1361
1362     // initialize the new ones
1363     for(i = num_buckets; i < new_size; i++) {
1364         newBuf[i].block_num = (off_t)-1;
1365     }
1366
1367     // free the old container
1368     hfs_free(*buf_ptr);
1369
1370     // reset the buf_ptr
1371     *buf_ptr = newBuf;
1372
1373     return new_size;
1374 }
1375
1376
1377 static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting) {
1378
1379     if (!overwriting) {
1380         // grow the table if we're out of space - we may index the table
1381         // with *num_full_ptr (lookup_bucket() can return a maximum value ==
1382         // *num_full_ptr), so we need to grow when we hit (*num_buckets_ptr - 1)
1383         // to prevent out-of-bounds indexing
1384         if (*num_full_ptr >= (*num_buckets_ptr - 1)) {
1385             int new_size = *num_buckets_ptr * 2;
1386             int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);
1387
1388             if (grow_size < new_size) {
1389                 LFHFS_LOG(LEVEL_ERROR, "jnl: add_block: grow_table returned an error!\n");
1390                 return -1;
1391             }
1392
1393             *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
1394         }
1395
1396         // if we're not inserting at the end, we need to bcopy
1397         if (blk_index != *num_full_ptr) {
1398             bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
1399         }
1400
1401         (*num_full_ptr)++; // increment only if we're not overwriting
1402     }
1403
1404     // sanity check the values we're about to add
1405     if ((off_t)offset >= jnl->jhdr->size) {
1406         offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
1407     }
1408     if (size <= 0) {
1409         panic("jnl: insert_block: bad size in insert_block (%zd)\n", size);
1410     }
1411
1412     (*buf_ptr)[blk_index].block_num = num;
1413     (*buf_ptr)[blk_index].block_size = (uint32_t)size;
1414     (*buf_ptr)[blk_index].jnl_offset = (uint32_t)offset;
1415     (*buf_ptr)[blk_index].cksum = cksum;
1416
1417     return blk_index;
1418 }
1419
// do_overlap
// Resolves overlaps between a new extent (block_num, size) that is about to
// be placed at table position blk_index and the entries already in the
// bucket table. Block numbers are converted to byte ranges using the journal
// header's jhdr_size.
//
// - The previous entry (blk_index-1) is truncated where it runs into the new
//   extent; if it extends past the new extent's end, its remainder is
//   re-inserted as a separate entry.
// - Following entries that are completely covered are removed; a partially
//   covered one has its start, journal offset and size adjusted.
// - Any entry that gets cut has its cksum reset to 0.
//
// Returns 1 when the new extent simply overwrites the entry at blk_index
// (only that entry's cksum is updated here), or 0 when the caller still has
// to insert the new extent into the table.
static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) {

    int     num_to_remove, index, i, overwrite, err;
    size_t  jhdr_size = jnl->jhdr->jhdr_size, new_offset;
    off_t   overlap, block_start, block_end;

    // byte range [block_start, block_end) covered by the new extent
    block_start = block_num*jhdr_size;
    block_end = block_start + size;
    // same starting block and at least as large as the existing entry
    overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);

    // first, eliminate any overlap with the previous entry
    if (blk_index != 0 && !overwrite) {
        off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
        off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
        overlap = prev_block_end - block_start;
        if (overlap > 0) {
            if (overlap % jhdr_size != 0) {
                panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size);
            }

            // if the previous entry completely overlaps this one, we need to break it into two pieces.
            if (prev_block_end > block_end) {
                off_t new_num = block_end / jhdr_size;
                size_t new_size = prev_block_end - block_end;

                // journal offset of the part of the previous entry that
                // lies beyond the end of the new extent
                new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);

                err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0);
                if (err < 0) {
                    panic("jnl: do_overlap: error inserting during pre-overlap\n");
                }
            }

            // Regardless, we need to truncate the previous entry to the beginning of the overlap
            (*buf_ptr)[blk_index-1].block_size = (uint32_t)(block_start - prev_block_start);
            (*buf_ptr)[blk_index-1].cksum = 0;   // have to blow it away because there's no way to check it
        }
    }

    // then, bail out fast if there's no overlap with the entries that follow
    if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) {
        return 0; // no overlap, no overwrite
    } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) {

        (*buf_ptr)[blk_index].cksum = cksum;   // update this
        return 1; // simple overwrite
    }

    // Otherwise, find all cases of total and partial overlap. We use the special
    // block_num of -2 to designate entries that are completely overlapped and must
    // be eliminated. The block_num, size, and jnl_offset of partially overlapped
    // entries must be adjusted to keep the array consistent.
    index = blk_index;
    num_to_remove = 0;
    while (index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) {
        if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) {
            (*buf_ptr)[index].block_num = -2; // mark this for deletion
            num_to_remove++;
        } else {
            overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
            if (overlap > 0) {
                if (overlap % jhdr_size != 0) {
                    panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size);
                }

                // if we partially overlap this entry, adjust its block number, jnl offset, and size
                (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
                (*buf_ptr)[index].cksum = 0;

                new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
                if ((off_t)new_offset >= jnl->jhdr->size) {
                    new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
                }
                (*buf_ptr)[index].jnl_offset = (uint32_t)new_offset;

                (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
                if ((*buf_ptr)[index].block_size <= 0) {
                    panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size);
                    // return -1; // if above panic is removed, return -1 for error
                }
            }

        }

        index++;
    }

    // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
    index--; // start with the last index used within the above loop
    while (index >= blk_index) {
        if ((*buf_ptr)[index].block_num == -2) {
            if (index == *num_full_ptr-1) {
                (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
            } else {
                // close the gap by sliding the remaining entries down one slot
                bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
            }
            (*num_full_ptr)--;
        }
        index--;
    }

    // eliminate any stale entries at the end of the table
    for(i = *num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
        (*buf_ptr)[i].block_num = -1;
    }

    return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
}
1528
1529
1530 static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full) {
1531     int lo, hi, index, matches, i;
1532
1533     if (num_full == 0) {
1534         return 0; // table is empty, so insert at index=0
1535     }
1536
1537     lo = 0;
1538     hi = num_full - 1;
1539     index = -1;
1540
1541     // perform binary search for block_num
1542     do {
1543         int mid = (hi - lo)/2 + lo;
1544         off_t this_num = (*buf_ptr)[mid].block_num;
1545
1546         if (block_num == this_num) {
1547             index = mid;
1548             break;
1549         }
1550
1551         if (block_num < this_num) {
1552             hi = mid;
1553             continue;
1554         }
1555
1556         if (block_num > this_num) {
1557             lo = mid + 1;
1558             continue;
1559         }
1560     } while (lo < hi);
1561
1562     // check if lo and hi converged on the match
1563     if (block_num == (*buf_ptr)[hi].block_num) {
1564         index = hi;
1565     }
1566
1567     // if no existing entry found, find index for new one
1568     if (index == -1) {
1569         index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
1570     } else {
1571         // make sure that we return the right-most index in the case of multiple matches
1572         matches = 0;
1573         i = index + 1;
1574         while (i < num_full && block_num == (*buf_ptr)[i].block_num) {
1575             matches++;
1576             i++;
1577         }
1578
1579         index += matches;
1580     }
1581
1582     return index;
1583 }
1584
1585 // PR-3105942: Coalesce writes to the same block in journal replay
1586 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
1587 // to be replayed and the corresponding location in the journal which contains
1588 // the most recent data for those blocks. The array is "played" once the all the
1589 // blocks in the journal have been coalesced. The code for the case of conflicting/
1590 // overlapping writes to a single block is the most dense. Because coalescing can
1591 // disrupt the existing time-ordering of blocks in the journal playback, care
1592 // is taken to catch any overlaps and keep the array consistent.
1593 static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) {
1594     int    blk_index, overwriting;
1595
1596     // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
1597     // inserted (or the index of the elem to overwrite).
1598     blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);
1599
1600     // check if the index is within bounds (if we're adding this block to the end of
1601     // the table, blk_index will be equal to num_full)
1602     if (blk_index < 0 || blk_index > *num_full_ptr) {
1603         //printf("jnl: add_block: trouble adding block to co_buf\n");
1604         return -1;
1605     } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
1606
1607     // Determine whether we're overwriting an existing entry by checking for overlap
1608     overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr);
1609     if (overwriting < 0) {
1610         return -1; // if we got an error, pass it along
1611     }
1612
1613     // returns the index, or -1 on error
1614     blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting);
1615
1616     return blk_index;
1617 }
1618
1619 static void swap_block_list_header(journal *jnl, block_list_header *blhdr) {
1620     int i;
1621
1622     blhdr->max_blocks = SWAP16(blhdr->max_blocks);
1623     blhdr->num_blocks = SWAP16(blhdr->num_blocks);
1624     blhdr->bytes_used = SWAP32(blhdr->bytes_used);
1625     blhdr->checksum   = SWAP32(blhdr->checksum);
1626     blhdr->flags      = SWAP32(blhdr->flags);
1627
1628     if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) {
1629         LFHFS_LOG(LEVEL_ERROR, "jnl: blhdr num blocks looks suspicious (%d / blhdr size %d).  not swapping.\n", blhdr->num_blocks, jnl->jhdr->blhdr_size);
1630         return;
1631     }
1632
1633     for(i = 0; i < blhdr->num_blocks; i++) {
1634         blhdr->binfo[i].bnum    = SWAP64(blhdr->binfo[i].bnum);
1635         blhdr->binfo[i].u.bi.bsize   = SWAP32(blhdr->binfo[i].u.bi.bsize);
1636         blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum);
1637     }
1638 }
1639
1640 static int replay_journal(journal *jnl) {
1641     int          i, bad_blocks=0;
1642     unsigned int   orig_checksum, checksum, check_block_checksums = 0;
1643     size_t         ret;
1644     size_t         max_bsize = 0;        /* protected by block_ptr */
1645     block_list_header *blhdr;
1646     off_t          offset, txn_start_offset=0, blhdr_offset, orig_jnl_start;
1647     char          *buff, *block_ptr=NULL;
1648     struct bucket *co_buf;
1649     int           num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory = 0;
1650     uint32_t      last_sequence_num = 0;
1651     int           replay_retry_count = 0;
1652
1653     LFHFS_LOG(LEVEL_DEFAULT, "replay_journal: start.\n");
1654
1655
1656     // wrap the start ptr if it points to the very end of the journal
1657     if (jnl->jhdr->start == jnl->jhdr->size) {
1658         jnl->jhdr->start = jnl->jhdr->jhdr_size;
1659     }
1660     if (jnl->jhdr->end == jnl->jhdr->size) {
1661         jnl->jhdr->end = jnl->jhdr->jhdr_size;
1662     }
1663
1664     if (jnl->jhdr->start == jnl->jhdr->end) {
1665         LFHFS_LOG(LEVEL_DEFAULT, "replay_journal: journal empty.\n");
1666         goto success;
1667     }
1668
1669     orig_jnl_start = jnl->jhdr->start;
1670
1671     // allocate memory for the header_block.  we'll read each blhdr into this
1672     buff = hfs_malloc(jnl->jhdr->blhdr_size);
1673
1674     // allocate memory for the coalesce buffer
1675     co_buf = hfs_malloc(num_buckets*sizeof(struct bucket));
1676
1677 restart_replay:
1678
1679     // initialize entries
1680     for(i = 0; i < num_buckets; i++) {
1681         co_buf[i].block_num = -1;
1682     }
1683     num_full = 0; // empty at first
1684
1685
1686     while (check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) {
1687         offset = blhdr_offset = jnl->jhdr->start;
1688         ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
1689         if (ret != (size_t)jnl->jhdr->blhdr_size) {
1690             LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset);
1691             goto bad_txn_handling;
1692         }
1693
1694         blhdr = (block_list_header *)buff;
1695
1696         orig_checksum = blhdr->checksum;
1697         blhdr->checksum = 0;
1698         if (jnl->flags & JOURNAL_NEED_SWAP) {
1699             // calculate the checksum based on the unswapped data
1700             // because it is done byte-at-a-time.
1701             orig_checksum = (unsigned int)SWAP32(orig_checksum);
1702             checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1703             swap_block_list_header(jnl, blhdr);
1704         } else {
1705             checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1706         }
1707
1708
1709         //
1710         // XXXdbg - if these checks fail, we should replay as much
1711         //          we can in the hopes that it will still leave the
1712         //          drive in a better state than if we didn't replay
1713         //          anything
1714         //
1715         if (checksum != orig_checksum) {
1716             if (check_past_jnl_end && in_uncharted_territory) {
1717
1718                 if (blhdr_offset != jnl->jhdr->end) {
1719                     LFHFS_LOG(LEVEL_ERROR, "jnl: Extra txn replay stopped @ %lld / 0x%llx\n", blhdr_offset, blhdr_offset);
1720                 }
1721
1722                 check_past_jnl_end = 0;
1723                 jnl->jhdr->end = blhdr_offset;
1724                 continue;
1725             }
1726
1727             LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
1728                    blhdr_offset, orig_checksum, checksum);
1729
1730             if (blhdr_offset == orig_jnl_start) {
1731                 // if there's nothing in the journal at all, just bail out altogether.
1732                 goto bad_replay;
1733             }
1734
1735             goto bad_txn_handling;
1736         }
1737
1738         if (   (last_sequence_num != 0)
1739             && (blhdr->binfo[0].u.bi.b.sequence_num != 0)
1740             && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num)
1741             && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) {
1742
1743             txn_start_offset = jnl->jhdr->end = blhdr_offset;
1744
1745             if (check_past_jnl_end) {
1746                 check_past_jnl_end = 0;
1747                 LFHFS_LOG(LEVEL_ERROR, "jnl: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
1748                        blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
1749                 continue;
1750             }
1751
1752             LFHFS_LOG(LEVEL_ERROR, "jnl: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
1753                    blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
1754             goto bad_txn_handling;
1755         }
1756         last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num;
1757
1758         if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) {
1759             if (last_sequence_num == 0) {
1760                 check_past_jnl_end = 0;
1761                 LFHFS_LOG(LEVEL_ERROR, "jnl: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n",
1762                        jnl->jhdr->start, jnl->jhdr->end);
1763                 if (jnl->jhdr->start != jnl->jhdr->end) {
1764                     jnl->jhdr->start = jnl->jhdr->end;
1765                 }
1766                 continue;
1767             }
1768             LFHFS_LOG(LEVEL_ERROR, "jnl: examining extra transactions starting @ %lld / 0x%llx\n", blhdr_offset, blhdr_offset);
1769         }
1770
1771         if (   blhdr->max_blocks <= 0 || blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size)
1772             || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
1773             LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
1774                    blhdr->max_blocks, blhdr->num_blocks);
1775             goto bad_txn_handling;
1776         }
1777
1778         max_bsize = 0;
1779         for (i = 1; i < blhdr->num_blocks; i++) {
1780             if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
1781                 LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: bogus block number 0x%llx\n", blhdr->binfo[i].bnum);
1782                 goto bad_txn_handling;
1783             }
1784
1785             if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) {
1786                 max_bsize = blhdr->binfo[i].u.bi.bsize;
1787             }
1788         }
1789
1790         if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) {
1791             check_block_checksums = 1;
1792             block_ptr = hfs_malloc(max_bsize);
1793         } else {
1794             block_ptr = NULL;
1795         }
1796
1797         if (blhdr->flags & BLHDR_FIRST_HEADER) {
1798             txn_start_offset = blhdr_offset;
1799         }
1800
1801         //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1802         //       blhdr->num_blocks-1, jnl->jhdr->start);
1803         bad_blocks = 0;
1804         for (i = 1; i < blhdr->num_blocks; i++) {
1805             int size, ret_val;
1806             off_t number;
1807
1808             size = blhdr->binfo[i].u.bi.bsize;
1809             number = blhdr->binfo[i].bnum;
1810
1811             // don't add "killed" blocks
1812             if (number == (off_t)-1) {
1813                 //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1814             } else {
1815
1816                 if (check_block_checksums) {
1817                     int32_t disk_cksum;
1818                     off_t block_offset;
1819
1820                     block_offset = offset;
1821
1822                     // read the block so we can check the checksum
1823                     ret = read_journal_data(jnl, &block_offset, block_ptr, size);
1824                     if (ret != (size_t)size) {
1825                         LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset);
1826                         goto bad_txn_handling;
1827                     }
1828
1829                     disk_cksum = calc_checksum(block_ptr, size);
1830
1831                     // there is no need to swap the checksum from disk because
1832                     // it got swapped when the blhdr was read in.
1833                     if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) {
1834                         LFHFS_LOG(LEVEL_ERROR, "jnl: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
1835                                txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum);
1836                         LFHFS_LOG(LEVEL_ERROR, "jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x  0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
1837                                *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)],
1838                                *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]);
1839
1840                         goto bad_txn_handling;
1841                     }
1842                 }
1843
1844
1845                 // add this bucket to co_buf, coalescing where possible
1846                 // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1847                 ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full);
1848
1849                 if (ret_val == -1) {
1850                     LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: trouble adding block to co_buf\n");
1851                     goto bad_replay;
1852                 } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1853             }
1854
1855             // increment offset
1856             offset += size;
1857
1858             // check if the last block added puts us off the end of the jnl.
1859             // if so, we need to wrap to the beginning and take any remainder
1860             // into account
1861             //
1862             if (offset >= jnl->jhdr->size) {
1863                 offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
1864             }
1865         }
1866
1867         if (block_ptr) {
1868             hfs_free(block_ptr);
1869             block_ptr = NULL;
1870         }
1871
1872         if (bad_blocks) {
1873         bad_txn_handling:
1874             /* Journal replay got error before it found any valid
1875              *  transations, abort replay */
1876             if (txn_start_offset == 0) {
1877                 LFHFS_LOG(LEVEL_ERROR, "jnl: no known good txn start offset! aborting journal replay.\n");
1878                 goto bad_replay;
1879             }
1880
1881             /* Repeated error during journal replay, abort replay */
1882             if (replay_retry_count == 3) {
1883                 LFHFS_LOG(LEVEL_ERROR, "jnl: repeated errors replaying journal! aborting journal replay.\n");
1884                 goto bad_replay;
1885             }
1886             replay_retry_count++;
1887
1888             /* There was an error replaying the journal (possibly
1889              * EIO/ENXIO from the device).  So retry replaying all
1890              * the good transactions that we found before getting
1891              * the error.
1892              */
1893             jnl->jhdr->start = orig_jnl_start;
1894             jnl->jhdr->end = txn_start_offset;
1895             check_past_jnl_end = 0;
1896             last_sequence_num = 0;
1897             LFHFS_LOG(LEVEL_ERROR, "jnl: restarting journal replay (%lld - %lld)!\n", jnl->jhdr->start, jnl->jhdr->end);
1898             goto restart_replay;
1899         }
1900
1901         jnl->jhdr->start += blhdr->bytes_used;
1902         if (jnl->jhdr->start >= jnl->jhdr->size) {
1903             // wrap around and skip the journal header block
1904             jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
1905         }
1906
1907         if (jnl->jhdr->start == jnl->jhdr->end) {
1908             in_uncharted_territory = 1;
1909         }
1910     }
1911
1912     if (jnl->jhdr->start != jnl->jhdr->end) {
1913         LFHFS_LOG(LEVEL_ERROR, "jnl: start %lld != end %lld.  resetting end.\n", jnl->jhdr->start, jnl->jhdr->end);
1914         jnl->jhdr->end = jnl->jhdr->start;
1915     }
1916
1917     //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1918
1919     /*
1920      * make sure it's at least one page in size, so
1921      * start max_bsize at PAGE_SIZE
1922      */
1923     for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {
1924
1925         if (co_buf[i].block_num == (off_t)-1)
1926             continue;
1927
1928         if (co_buf[i].block_size > max_bsize)
1929             max_bsize = co_buf[i].block_size;
1930     }
1931     /*
1932      * round max_bsize up to the nearest PAGE_SIZE multiple
1933      */
1934     if (max_bsize & (PAGE_SIZE - 1)) {
1935         max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1936     }
1937
1938     block_ptr = hfs_malloc(max_bsize);
1939
1940     // Replay the coalesced entries in the co-buf
1941     for(i = 0; i < num_full; i++) {
1942         size_t size = co_buf[i].block_size;
1943         off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
1944         off_t number = co_buf[i].block_num;
1945
1946
1947         // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1948         //      co_buf[i].block_size, co_buf[i].jnl_offset);
1949
1950         if (number == (off_t)-1) {
1951             // printf("jnl: replay_journal: skipping killed fs block\n");
1952         } else {
1953
1954             // do journal read, and set the phys. block
1955             ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
1956             if (ret != size) {
1957                 LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl_offset);
1958                 goto bad_replay;
1959             }
1960
1961             if (update_fs_block(jnl, block_ptr, number, size) != 0) {
1962                 goto bad_replay;
1963             }
1964         }
1965     }
1966
1967
1968     // done replaying; update jnl header
1969     if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
1970         goto bad_replay;
1971     }
1972
1973     // free block_ptr
1974     if (block_ptr) {
1975         hfs_free(block_ptr);
1976         block_ptr = NULL;
1977     }
1978
1979     // free the coalesce buffer
1980     hfs_free(co_buf);
1981     co_buf = NULL;
1982
1983     hfs_free(buff);
1984
1985 success:
1986     LFHFS_LOG(LEVEL_DEFAULT, "replay_journal: success.\n");
1987     return 0;
1988
1989 bad_replay:
1990     hfs_free(block_ptr);
1991     hfs_free(co_buf);
1992     hfs_free(buff);
1993
1994     LFHFS_LOG(LEVEL_ERROR, "replay_journal: error.\n");
1995     return -1;
1996 }
1997
1998 // buffer_written:
1999 // This function get executed after a buffer has been written to its
2000 // final destination.
2001 // This function lets us know when a buffer has been
2002 // flushed to disk.  Originally (kext), it was called from deep
2003 // within the driver stack and thus is quite limited in what it could do.
2004 // Notably, it could not initiate any new i/o's or allocate/free memory.
2005 static void buffer_written(transaction *tr, GenericLFBuf *bp) {
2006
2007     journal      *jnl;
2008     transaction  *ctr, *prev=NULL, *next;
2009     size_t        i;
2010     size_t        bufsize, amt_flushed, total_bytes;
2011
2012
2013     // snarf out the bits we want
2014     bufsize = bp->uDataSize;
2015
2016     // then we've already seen it
2017     if (tr == NULL) {
2018         return;
2019     }
2020
2021     CHECK_TRANSACTION(tr);
2022
2023     jnl = tr->jnl;
2024
2025     CHECK_JOURNAL(jnl);
2026
2027     amt_flushed = tr->num_killed;
2028     total_bytes = tr->total_bytes;
2029
2030     // update the number of blocks that have been flushed.
2031     // this buf may represent more than one block so take
2032     // that into account.
2033     amt_flushed     += tr->num_flushed;
2034     tr->num_flushed += bufsize;
2035
2036     // if this transaction isn't done yet, just return as
2037     // there is nothing to do.
2038     //
2039     // NOTE: we are careful to not reference anything through
2040     //       the tr pointer after doing the OSAddAtomic().  if
2041     //       this if statement fails then we are the last one
2042     //       and then it's ok to dereference "tr".
2043     //
2044     if ((amt_flushed + bufsize) < total_bytes) {
2045         return;
2046     }
2047
2048     // this will single thread checking the transaction
2049     lock_oldstart(jnl);
2050
2051     if (tr->total_bytes == (int)0xfbadc0de) {
2052         // then someone beat us to it...
2053         unlock_oldstart(jnl);
2054         return;
2055     }
2056
2057     // mark this so that we're the owner of dealing with the
2058     // cleanup for this transaction
2059     tr->total_bytes = 0xfbadc0de;
2060
2061     if (jnl->flags & JOURNAL_INVALID)
2062         goto transaction_done;
2063
2064     //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
2065     //   tr, tr->journal_start, tr->journal_end, jnl);
2066
2067     // find this entry in the old_start[] index and mark it completed
2068     for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
2069
2070         if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) {
2071             jnl->old_start[i] &= ~(0x8000000000000000ULL);
2072             break;
2073         }
2074     }
2075
2076     if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
2077         panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
2078               tr->journal_start, tr, jnl);
2079     }
2080
2081
2082     // if we are here then we need to update the journal header
2083     // to reflect that this transaction is complete
2084     if (tr->journal_start == jnl->active_start) {
2085         jnl->active_start = tr->journal_end;
2086         tr->journal_start = tr->journal_end = (off_t)0;
2087     }
2088
2089     // go through the completed_trs list and try to coalesce
2090     // entries, restarting back at the beginning if we have to.
2091     for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) {
2092         if (ctr->journal_start == jnl->active_start) {
2093             jnl->active_start = ctr->journal_end;
2094             if (prev) {
2095                 prev->next = ctr->next;
2096             }
2097             if (ctr == jnl->completed_trs) {
2098                 jnl->completed_trs = ctr->next;
2099             }
2100
2101             next           = jnl->completed_trs;   // this starts us over again
2102             ctr->next      = jnl->tr_freeme;
2103             jnl->tr_freeme = ctr;
2104             ctr            = NULL;
2105
2106         } else if (tr->journal_end == ctr->journal_start) {
2107             ctr->journal_start = tr->journal_start;
2108             next               = jnl->completed_trs;  // this starts us over again
2109             ctr                = NULL;
2110             tr->journal_start  = tr->journal_end = (off_t)0;
2111
2112         } else if (tr->journal_start == ctr->journal_end) {
2113             ctr->journal_end  = tr->journal_end;
2114             next              = ctr->next;
2115             tr->journal_start = tr->journal_end = (off_t)0;
2116         } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) {
2117             // coalesce the next entry with this one and link the next
2118             // entry in at the head of the tr_freeme list
2119             next              = ctr->next;           // temporarily use the "next" variable
2120             ctr->journal_end  = next->journal_end;
2121             ctr->next         = next->next;
2122             next->next        = jnl->tr_freeme;      // link in the next guy at the head of the tr_freeme list
2123             jnl->tr_freeme    = next;
2124
2125             next              = jnl->completed_trs;  // this starts us over again
2126             ctr               = NULL;
2127
2128         } else {
2129             next = ctr->next;
2130         }
2131     }
2132
2133     // if this is true then we didn't merge with anyone
2134     // so link ourselves in at the head of the completed
2135     // transaction list.
2136     if (tr->journal_start != 0) {
2137         // put this entry into the correct sorted place
2138         // in the list instead of just at the head.
2139
2140         prev = NULL;
2141         for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
2142             // just keep looping
2143         }
2144
2145         if (ctr == NULL && prev == NULL) {
2146             jnl->completed_trs = tr;
2147             tr->next = NULL;
2148
2149         } else if (ctr == jnl->completed_trs) {
2150             tr->next = jnl->completed_trs;
2151             jnl->completed_trs = tr;
2152
2153         } else {
2154             tr->next = prev->next;
2155             prev->next = tr;
2156         }
2157
2158     } else {
2159         // if we're here this tr got merged with someone else so
2160         // put it on the list to be free'd
2161         tr->next       = jnl->tr_freeme;
2162         jnl->tr_freeme = tr;
2163     }
2164 transaction_done:
2165     unlock_oldstart(jnl);
2166
2167     unlock_condition(jnl, &jnl->asyncIO);
2168 }
2169
2170 static size_t write_journal_data(journal *jnl, off_t *offset, void *data, size_t len) {
2171     return do_journal_io(jnl, offset, data, len, JNL_WRITE);
2172 }
2173
2174 static size_t read_journal_data(journal *jnl, off_t *offset, void *data, size_t len) {
2175     return do_journal_io(jnl, offset, data, len, JNL_READ);
2176 }
2177
2178
2179 // This function sets the size of the tbuffer and the
2180 // size of the blhdr.  It assumes that jnl->jhdr->size
2181 // and jnl->jhdr->jhdr_size are already valid.
2182 static void size_up_tbuffer(journal *jnl, uint32_t tbuffer_size, uint32_t phys_blksz) {
2183     //
2184     // one-time initialization based on how much memory
2185     // there is in the machine.
2186     //
2187     if (def_tbuffer_size == 0) {
2188         uint64_t memsize = 0;
2189         size_t l = sizeof(memsize);
2190         sysctlbyname("hw.memsize", &memsize, &l, NULL, 0);
2191
2192         if (memsize < (256*1024*1024)) {
2193             def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
2194         } else if (memsize < (512*1024*1024)) {
2195             def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
2196         } else if (memsize < (1024*1024*1024)) {
2197             def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
2198         } else {
2199             def_tbuffer_size = (uint32_t)(DEFAULT_TRANSACTION_BUFFER_SIZE * (memsize / (256*1024*1024)));
2200         }
2201     }
2202
2203     // For analyzer
2204     if (!(jnl->jhdr->jhdr_size > 0)) {
2205         panic("jnl->jhdr->jhdr_size is %d", jnl->jhdr->jhdr_size);
2206     }
2207
2208     // size up the transaction buffer... can't be larger than the number
2209     // of blocks that can fit in a block_list_header block.
2210     if (tbuffer_size == 0) {
2211         jnl->tbuffer_size = def_tbuffer_size;
2212     } else {
2213         // make sure that the specified tbuffer_size isn't too small
2214         if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
2215             tbuffer_size = jnl->jhdr->blhdr_size * 2;
2216         }
2217         // and make sure it's an even multiple of the block size
2218         if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
2219             tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
2220         }
2221
2222         jnl->tbuffer_size = tbuffer_size;
2223     }
2224
2225     if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
2226         jnl->tbuffer_size = (uint32_t)(jnl->jhdr->size / 2);
2227     }
2228
2229     if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
2230         jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
2231     }
2232
2233     jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
2234     if (jnl->jhdr->blhdr_size < phys_blksz) {
2235         jnl->jhdr->blhdr_size = phys_blksz;
2236     } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
2237         // have to round up so we're an even multiple of the physical block size
2238         jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
2239     }
2240 }
2241
2242
2243 static int write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num) {
2244     static int num_err_prints = 0;
2245     int ret=0;
2246     off_t jhdr_offset = 0;
2247
2248     // Flush the track cache if we're not doing force-unit-access
2249     // writes.
2250     if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
2251
2252         dk_synchronize_t sync_request = {
2253             .options            = DK_SYNCHRONIZE_OPTION_BARRIER,
2254         };
2255
2256         /*
2257          * If device doesn't support barrier-only flush, or
2258          * the journal is on a different device, use full flush.
2259          */
2260         if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) {
2261             sync_request.options = 0;
2262             jnl->flush_counter++;
2263         }
2264
2265         ret = ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request);
2266     }
2267     if (ret != 0) {
2268         //
2269         // Only print this error if it's a different error than the
2270         // previous one, or if it's the first time for this device
2271         // or if the total number of printfs is less than 25.  We
2272         // allow for up to 25 printfs to insure that some make it
2273         // into the on-disk syslog.  Otherwise if we only printed
2274         // one, it's possible it would never make it to the syslog
2275         // for the root volume and that makes debugging hard.
2276         //
2277         if (   ret != jnl->last_flush_err
2278             || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
2279             || num_err_prints++ < 25) {
2280
2281             LFHFS_LOG(LEVEL_ERROR, "jnl: flushing fs disk buffer returned 0x%x\n", ret);
2282
2283             jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
2284             jnl->last_flush_err = ret;
2285         }
2286     }
2287
2288     jnl->jhdr->sequence_num = sequence_num;
2289     jnl->jhdr->checksum = 0;
2290     jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
2291
2292     if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) {
2293         LFHFS_LOG(LEVEL_ERROR, "jnl: write_journal_header: error writing the journal header!\n");
2294         jnl->flags |= JOURNAL_INVALID;
2295         return -1;
2296     }
2297
2298     // If we're not doing force-unit-access writes, then we
2299     // have to flush after writing the journal header so that
2300     // a future transaction doesn't sneak out to disk before
2301     // the header does and thus overwrite data that the old
2302     // journal header refers to.  Saw this exact case happen
2303     // on an IDE bus analyzer with Larry Barras so while it
2304     // may seem obscure, it's not.
2305     //
2306     if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
2307
2308         dk_synchronize_t sync_request = {
2309             .options            = DK_SYNCHRONIZE_OPTION_BARRIER,
2310         };
2311
2312         /*
2313          * If device doesn't support barrier-only flush, or
2314          * the journal is on a different device, use full flush.
2315          */
2316         if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) {
2317             sync_request.options = 0;
2318             jnl->flush_counter++;
2319         }
2320
2321         ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request);
2322     }
2323     return 0;
2324 }
2325
2326 static int journal_binfo_cmp(const void *a, const void *b) {
2327
2328     const block_info *bi_a = (const struct block_info *)a;
2329     const block_info *bi_b = (const struct block_info *)b;
2330     daddr64_t res;
2331
2332     if (bi_a->bnum == (off_t)-1) {
2333         return 1;
2334     }
2335     if (bi_b->bnum == (off_t)-1) {
2336         return -1;
2337     }
2338
2339     // don't have to worry about negative block
2340     // numbers so this is ok to do.
2341     GenericLFBuf *psGenBufA, *psGenBufB;
2342     psGenBufA = (void*)bi_a->u.bp;
2343     psGenBufB = (void*)bi_b->u.bp;
2344     res = psGenBufA->uBlockN - psGenBufB->uBlockN;
2345
2346     return (int)res;
2347 }
2348
2349 // finish_end_transaction:
2350
2351 static int finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg) {
2352     int                i;
2353     size_t             amt;
2354     size_t             ret = 0;
2355     off_t              end;
2356     journal           *jnl = tr->jnl;
2357     GenericLFBuf       *bp = NULL, **bparray = NULL;
2358     block_list_header *blhdr=NULL, *next=NULL;
2359     size_t             tbuffer_offset;
2360     int                bufs_written = 0;
2361     int                ret_val = 0;
2362
2363     end  = jnl->jhdr->end;
2364
2365     for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
2366
2367         amt = blhdr->bytes_used;
2368
2369         blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num;
2370
2371         blhdr->checksum = 0;
2372         blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
2373
2374         bparray = hfs_malloc(blhdr->num_blocks * sizeof(buf_t));
2375         tbuffer_offset = jnl->jhdr->blhdr_size;
2376
2377         // for each block in the block-header,
2378         for (i = 1; i < blhdr->num_blocks; i++) {
2379             size_t   bsize;
2380
2381             /*
2382              * finish preparing the shadow buf_t before
2383              * calculating the individual block checksums
2384              */
2385             if (blhdr->binfo[i].bnum != (off_t)-1) {
2386                 daddr64_t blkno;
2387
2388                 bp = (void*)blhdr->binfo[i].u.bp;
2389                 blkno  = bp->uPhyCluster;
2390                 // update this so we write out the correct physical block number!
2391                 blhdr->binfo[i].bnum = (off_t)(blkno);
2392
2393                 bparray[i] = bp;
2394                 bsize = bp->uDataSize;
2395                 blhdr->binfo[i].u.bi.bsize   = (uint32_t)bsize;
2396                 blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], (uint32_t)bsize);
2397             } else {
2398                 bparray[i] = NULL;
2399                 bsize = blhdr->binfo[i].u.bi.bsize;
2400                 blhdr->binfo[i].u.bi.b.cksum = 0;
2401             }
2402             tbuffer_offset += bsize;
2403         }
2404
2405         /*
2406          * if we fired off the journal_write_header asynchronously in
2407          * 'end_transaction', we need to wait for its completion
2408          * before writing the actual journal data
2409          */
2410         wait_condition(jnl, &jnl->writing_header, "finish_end_transaction");
2411
2412         if (jnl->write_header_failed == FALSE)
2413             ret = write_journal_data(jnl, &end, blhdr, amt);
2414         else
2415             ret_val = -1;
2416
2417         #if HFS_CRASH_TEST
2418             CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_JOURNAL_DATA, jnl->fsmount->psHfsmount, NULL);
2419         #endif
2420
2421         /*
2422          * put the bp pointers back so that we can
2423          * make the final pass on them
2424          */
2425         for (i = 1; i < blhdr->num_blocks; i++)
2426             blhdr->binfo[i].u.bp = (void*)bparray[i];
2427
2428         hfs_free(bparray);
2429
2430         if (ret_val == -1)
2431             goto bad_journal;
2432
2433         if (ret != amt) {
2434             LFHFS_LOG(LEVEL_ERROR, "jnl: end_transaction: only wrote %zu of %zu bytes to the journal!\n",
2435                    ret, amt);
2436
2437             ret_val = -1;
2438             goto bad_journal;
2439         }
2440     }
2441     jnl->jhdr->end  = end;    // update where the journal now ends
2442     tr->journal_end = end;    // the transaction ends here too
2443
2444     if (tr->journal_start == 0 || tr->journal_end == 0) {
2445         panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
2446               tr->journal_start, tr->journal_end);
2447     }
2448
2449     if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) {
2450         ret_val = -1;
2451         goto bad_journal;
2452     }
2453
2454     #if HFS_CRASH_TEST
2455         CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_JOURNAL_HEADER, jnl->fsmount->psHfsmount, NULL);
2456     #endif
2457
2458     /*
2459      * If the caller supplied a callback, call it now that the blocks have been
2460      * written to the journal.  This is used by journal_relocate so, for example,
2461      * the file system can change its pointer to the new journal.
2462      */
2463     if (callback != NULL && callback(callback_arg) != 0) {
2464         ret_val = -1;
2465         goto bad_journal;
2466     }
2467
2468     // the buffer_flushed_callback will only be called for the
2469     // real blocks that get flushed so we have to account for
2470     // the block_list_headers here.
2471     //
2472     tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
2473
2474     lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction");
2475
2476     //
2477     // setup for looping through all the blhdr's.
2478     //
2479     for (blhdr = tr->blhdr; blhdr; blhdr = next) {
2480         uint16_t    num_blocks;
2481
2482         /*
2483          * grab this info ahead of issuing the buf_bawrites...
2484          * once the last one goes out, its possible for blhdr
2485          * to be freed (especially if we get preempted) before
2486          * we do the last check of num_blocks or
2487          * grab the next blhdr pointer...
2488          */
2489         next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2490         num_blocks = blhdr->num_blocks;
2491
2492         /*
2493          * we can re-order the buf ptrs because everything is written out already
2494          */
2495         qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp);
2496
2497         /*
2498          * need to make sure that the loop issuing the buf_bawrite's
2499          * does not touch blhdr once the last buf_bawrite has been
2500          * issued... at that point, we no longer have a legitmate
2501          * reference on the associated storage since it will be
2502          * released upon the completion of that last buf_bawrite
2503          */
2504         for (i = num_blocks-1; i >= 1; i--) {
2505             if (blhdr->binfo[i].bnum != (off_t)-1)
2506                 break;
2507             num_blocks--;
2508         }
2509         for (i = 1; i < num_blocks; i++) {
2510
2511             if ((bp = (void*)blhdr->binfo[i].u.bp)) {
2512
2513                 errno_t ret_val = 0;
2514
2515                 #if JOURNAL_DEBUG
2516                     printf("journal write physical: bp %p, psVnode %p, uBlockN %llu, uPhyCluster %llu uLockCnt %u\n",
2517                            bp, bp->psVnode, bp->uBlockN, bp->uPhyCluster, bp->uLockCnt);
2518                 #endif
2519
2520                 lf_hfs_generic_buf_clear_cache_flag(bp, GEN_BUF_WRITE_LOCK);
2521                 ret_val = lf_hfs_generic_buf_write(bp);
2522
2523                 #if HFS_CRASH_TEST
2524                     CRASH_ABORT(CRASH_ABORT_JOURNAL_IN_BLOCK_DATA, jnl->fsmount->psHfsmount, NULL);
2525                 #endif
2526
2527                 if (ret_val) {
2528                     LFHFS_LOG(LEVEL_ERROR, "jnl: raw_readwrite_write_mount inside finish_end_transaction returned %d.\n", ret_val);
2529                 }
2530
2531                 buffer_written(tr, bp);
2532
2533                 lf_hfs_generic_buf_unlock(bp);
2534                 lf_hfs_generic_buf_release(bp);
2535
2536                 bufs_written++;
2537             }
2538         }
2539     }
2540     #if HFS_CRASH_TEST
2541         CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_BLOCK_DATA, jnl->fsmount->psHfsmount, NULL);
2542     #endif
2543     if (bufs_written == 0) {
2544         /*
2545          * since we didn't issue any buf_bawrite's, there is no
2546          * async trigger to cause the memory associated with this
2547          * transaction to be freed... so, move it to the garbage
2548          * list now
2549          */
2550         lock_oldstart(jnl);
2551
2552         tr->next       = jnl->tr_freeme;
2553         jnl->tr_freeme = tr;
2554
2555         unlock_oldstart(jnl);
2556
2557         unlock_condition(jnl, &jnl->asyncIO);
2558     }
2559
2560     //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
2561     //   tr, tr->journal_start, tr->journal_end);
2562
2563 bad_journal:
2564     if (ret_val == -1) {
2565         abort_transaction(jnl, tr);        // cleans up list of extents to be trimmed
2566
2567         /*
2568          * 'flush_aborted' is protected by the flushing condition... we need to
2569          * set it before dropping the condition so that it will be
2570          * noticed in 'end_transaction'... we add this additional
2571          * aborted condition so that we can drop the 'flushing' condition
2572          * before grabbing the journal lock... this avoids a deadlock
2573          * in 'end_transaction' which is holding the journal lock while
2574          * waiting for the 'flushing' condition to clear...
2575          * everyone else will notice the JOURNAL_INVALID flag
2576          */
2577         jnl->flush_aborted = TRUE;
2578
2579         unlock_condition(jnl, &jnl->flushing);
2580         journal_lock(jnl);
2581
2582         jnl->flags |= JOURNAL_INVALID;
2583         jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
2584
2585         journal_unlock(jnl);
2586     } else
2587         unlock_condition(jnl, &jnl->flushing);
2588
2589     return (ret_val);
2590 }
2591 static off_t free_space(journal *jnl) {
2592     off_t free_space_offset;
2593
2594     if (jnl->jhdr->start < jnl->jhdr->end) {
2595         free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
2596     } else if (jnl->jhdr->start > jnl->jhdr->end) {
2597         free_space_offset = jnl->jhdr->start - jnl->jhdr->end;
2598     } else {
2599         // journal is completely empty
2600         free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size;
2601     }
2602
2603     return free_space_offset;
2604 }
2605
2606 static void dump_journal(journal *jnl) {
2607     transaction *ctr;
2608
2609     printf("  jdev_offset %.8llx\n", jnl->jdev_offset);
2610     printf("  magic: 0x%.8x\n", jnl->jhdr->magic);
2611     printf("  start: 0x%.8llx\n", jnl->jhdr->start);
2612     printf("  end:   0x%.8llx\n", jnl->jhdr->end);
2613     printf("  size:  0x%.8llx\n", jnl->jhdr->size);
2614     printf("  blhdr size: %d\n", jnl->jhdr->blhdr_size);
2615     printf("  jhdr size: %d\n", jnl->jhdr->jhdr_size);
2616     printf("  chksum: 0x%.8x\n", jnl->jhdr->checksum);
2617
2618     printf("  completed transactions:\n");
2619     for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) {
2620         printf("    0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
2621     }
2622 }
2623
2624 // The journal must be locked on entry to this function.
2625 // The "desired_size" is in bytes.
2626 static int check_free_space( journal *jnl,
2627                              int desired_size,
2628                              boolean_t *delayed_header_write,
2629                              uint32_t sequence_num) {
2630
2631     size_t    i;
2632     int    counter=0;
2633
2634     //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
2635     //       desired_size, free_space(jnl));
2636
2637     if (delayed_header_write)
2638         *delayed_header_write = FALSE;
2639
2640     while (1) {
2641         int old_start_empty;
2642
2643         // make sure there's space in the journal to hold this transaction
2644         if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) {
2645             break;
2646         }
2647         if (counter++ == 5000) {
2648             dump_journal(jnl);
2649             panic("jnl: check_free_space: buffer flushing isn't working "
2650                   "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl,
2651                   jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
2652         }
2653         if (counter > 7500) {
2654             return ENOSPC;
2655         }
2656
2657         // here's where we lazily bump up jnl->jhdr->start.  we'll consume
2658         // entries until there is enough space for the next transaction.
2659         old_start_empty = 1;
2660         lock_oldstart(jnl);
2661
2662         for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
2663             int   lcl_counter;
2664
2665             lcl_counter = 0;
2666             while (jnl->old_start[i] & 0x8000000000000000LL) {
2667                 if (lcl_counter++ > 10000) {
2668                     panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
2669                           jnl->old_start[i], jnl);
2670                 }
2671
2672                 unlock_oldstart(jnl);
2673                 if (jnl->flush) {
2674                     jnl->flush(jnl->flush_arg);
2675                 }
2676                 usleep(10000);
2677                 lock_oldstart(jnl);
2678             }
2679
2680             if (jnl->old_start[i] == 0) {
2681                 continue;
2682             }
2683
2684             old_start_empty   = 0;
2685             jnl->jhdr->start  = jnl->old_start[i];
2686             jnl->old_start[i] = 0;
2687
2688             if (free_space(jnl) > desired_size) {
2689
2690                 if (delayed_header_write)
2691                     *delayed_header_write = TRUE;
2692                 else {
2693                     unlock_oldstart(jnl);
2694                     write_journal_header(jnl, 1, sequence_num);
2695                     lock_oldstart(jnl);
2696                 }
2697                 break;
2698             }
2699         }
2700         unlock_oldstart(jnl);
2701
2702         // if we bumped the start, loop and try again
2703         if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
2704             continue;
2705         } else if (old_start_empty) {
2706             //
2707             // if there is nothing in old_start anymore then we can
2708             // bump the jhdr->start to be the same as active_start
2709             // since it is possible there was only one very large
2710             // transaction in the old_start array.  if we didn't do
2711             // this then jhdr->start would never get updated and we
2712             // would wind up looping until we hit the panic at the
2713             // start of the loop.
2714             //
2715             jnl->jhdr->start = jnl->active_start;
2716
2717             if (delayed_header_write)
2718                 *delayed_header_write = TRUE;
2719             else
2720                 write_journal_header(jnl, 1, sequence_num);
2721             continue;
2722         }
2723
2724
2725         // if the file system gave us a flush function, call it to so that
2726         // it can flush some blocks which hopefully will cause some transactions
2727         // to complete and thus free up space in the journal.
2728         if (jnl->flush) {
2729             jnl->flush(jnl->flush_arg);
2730         }
2731
2732         // wait for a while to avoid being cpu-bound (this will
2733         // put us to sleep for 10 milliseconds)
2734         usleep(10000);
2735     }
2736
2737     return 0;
2738 }
2739
2740 static void lock_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name) {
2741
2742     lock_flush(jnl);
2743
2744     while (psCondFlag->uFlag) {
2745         pthread_cond_wait(&psCondFlag->sCond, &jnl->flock);
2746     }
2747
2748     psCondFlag->uFlag = TRUE;
2749     unlock_flush(jnl);
2750 }
2751
2752 static void wait_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name) {
2753
2754     if (!psCondFlag->uFlag)
2755         return;
2756
2757     lock_flush(jnl);
2758
2759     while (psCondFlag->uFlag) {
2760         pthread_cond_wait(&psCondFlag->sCond, &jnl->flock);
2761     }
2762
2763     unlock_flush(jnl);
2764 }
2765
2766 static void unlock_condition(journal *jnl, ConditionalFlag_S *psCondFlag) {
2767     lock_flush(jnl);
2768
2769     psCondFlag->uFlag = FALSE;
2770     pthread_cond_broadcast(&psCondFlag->sCond);
2771
2772     unlock_flush(jnl);
2773 }
2774
2775 /*
2776  * End a transaction:
2777  * 1) Determine if it is time to commit the transaction or not:
2778  * If the transaction is small enough, and we're not forcing
2779  * a write to disk, the "active" transaction becomes the "current" transaction,
2780  * and will be reused for the next transaction that is started (group commit).
2781  *
2782  * 2) Commit:
2783  * If the transaction gets written to disk (because force_it is true, or no
2784  * group commit, or the transaction is sufficiently full), the blocks get
2785  * written into the journal first, then they are written to their final location
2786  * asynchronously. When those async writes complete, the transaction can be freed
2787  * and removed from the journal.
2788  *
2789  * 3) Callback:
2790  * An optional callback can be supplied.  If given, it is called after the
2791  * the blocks have been written to the journal, but before the async writes
2792  * of those blocks to their normal on-disk locations.  This is used by
2793  * journal_relocate so that the location of the journal can be changed and
2794  * flushed to disk before the blocks get written to their normal locations.
2795  * Note that the callback is only called if the transaction gets written to
2796  * the journal during this end_transaction call; you probably want to set the
2797  * force_it flag.
2798  *
2799  * 4) Free blocks' Generic Buff.
2800  *
2801  * Inputs:
2802  *    tr           Transaction to add to the journal
2803  *    force_it     If true, force this transaction to the on-disk journal immediately.
2804  *    callback     See description above.  Pass NULL for no callback.
2805  *    callback_arg Argument passed to callback routine.
2806  *
2807  * Result
2808  *         0        No errors
2809  *        -1        An error occurred.  The journal is marked invalid.
2810  */
2811 static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock) {
2812
2813     block_list_header  *blhdr=NULL, *next=NULL;
2814     int           i, ret_val = 0;
2815     journal      *jnl = tr->jnl;
2816     GenericLFBuf *bp;
2817     size_t        tbuffer_offset;
2818
2819     if (jnl->cur_tr) {
2820         panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
2821               jnl, jnl->cur_tr, tr);
2822     }
2823
2824     // if there weren't any modified blocks in the transaction
2825     // just save off the transaction pointer and return.
2826     if (tr->total_bytes == (int)jnl->jhdr->blhdr_size) {
2827         jnl->cur_tr = tr;
2828         goto done;
2829     }
2830
2831     // if our transaction buffer isn't very full, just hang
2832     // on to it and don't actually flush anything.  this is
2833     // what is known as "group commit".  we will flush the
2834     // transaction buffer if it's full or if we have more than
2835     // one of them so we don't start hogging too much memory.
2836     //
2837     // We also check the device supports UNMAP/TRIM, and if so,
2838     // the number of extents waiting to be trimmed.  If it is
2839     // small enough, then keep accumulating more (so we can
2840     // reduce the overhead of trimming).  If there was a prior
2841     // trim error, then we stop issuing trims for this
2842     // volume, so we can also coalesce transactions.
2843     //
2844     if (   force_it == 0
2845         && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
2846         && tr->num_blhdrs < 3
2847         && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))
2848         && (!(jnl->flags & JOURNAL_USE_UNMAP) || (tr->trim.extent_count < jnl_trim_flush_limit))) {
2849
2850         jnl->cur_tr = tr;
2851         goto done;
2852     }
2853
2854     lock_condition(jnl, &jnl->flushing, "end_transaction");
2855
2856     /*
2857      * if the previous 'finish_end_transaction' was being run
2858      * asynchronously, it could have encountered a condition
2859      * that caused it to mark the journal invalid... if that
2860      * occurred while we were waiting for it to finish, we
2861      * need to notice and abort the current transaction
2862      */
2863     if ((jnl->flags & JOURNAL_INVALID) || jnl->flush_aborted == TRUE) {
2864         unlock_condition(jnl, &jnl->flushing);
2865
2866         abort_transaction(jnl, tr);
2867         ret_val = -1;
2868         goto done;
2869     }
2870
2871     /*
2872      * Store a pointer to this transaction's trim list so that
2873      * future transactions can find it.
2874      *
2875      * Note: if there are no extents in the trim list, then don't
2876      * bother saving the pointer since nothing can add new extents
2877      * to the list (and other threads/transactions only care if
2878      * there is a trim pending).
2879      */
2880     lf_lck_rw_lock_exclusive(&jnl->trim_lock);
2881     if (jnl->async_trim != NULL)
2882         panic("jnl: end_transaction: async_trim already non-NULL!");
2883     if (tr->trim.extent_count > 0)
2884         jnl->async_trim = &tr->trim;
2885     lf_lck_rw_unlock_exclusive(&jnl->trim_lock);
2886
2887     /*
2888      * snapshot the transaction sequence number while we are still behind
2889      * the journal lock since it will be bumped upon the start of the
2890      * next transaction group which may overlap the current journal flush...
2891      * we pass the snapshot into write_journal_header during the journal
2892      * flush so that it can write the correct version in the header...
2893      * because we hold the 'flushing' condition variable for the duration
2894      * of the journal flush, 'saved_sequence_num' remains stable
2895      */
2896     jnl->saved_sequence_num = jnl->sequence_num;
2897
2898     /*
2899      * if we're here we're going to flush the transaction buffer to disk.
2900      * 'check_free_space' will not return untl there is enough free
2901      * space for this transaction in the journal and jnl->old_start[0]
2902      * is avaiable for use
2903      */
2904     check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num);
2905
2906     // range check the end index
2907     if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
2908         panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
2909               jnl->jhdr->end, jnl->jhdr->size);
2910     }
2911
2912     // this transaction starts where the current journal ends
2913     tr->journal_start = jnl->jhdr->end;
2914
2915     lock_oldstart(jnl);
2916     /*
2917      * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memmove.
2918      * slide everyone else down and put our latest guy in the last
2919      * entry in the old_start array
2920      */
2921     memmove(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void *), sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
2922     jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
2923
2924     unlock_oldstart(jnl);
2925
2926     // go over the blocks in the transaction.
2927     // for each block, call the fpCallback and copy the content into the journal buffer
2928     for (blhdr = tr->blhdr; blhdr; blhdr = next) {
2929         char         *blkptr;
2930         size_t       bsize;
2931
2932         tbuffer_offset = jnl->jhdr->blhdr_size;
2933
2934         for (i = 1; i < blhdr->num_blocks; i++) {
2935
2936             if (blhdr->binfo[i].bnum != (off_t)-1) {
2937
2938                 bp = (GenericLFBuf*)blhdr->binfo[i].u.bp;
2939
2940                 if (bp == NULL) {
2941                     panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
2942                           blhdr->binfo[i].bnum, jnl, tr);
2943                 }
2944
2945                 bsize = bp->uDataSize;
2946
2947                 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
2948
2949                 int iRet;
2950             retry:
2951                 iRet = lf_hfs_generic_buf_take_ownership(bp, NULL);
2952                 if (iRet == EAGAIN) {
2953                     goto retry;
2954                 } else if (iRet) {
2955                     LFHFS_LOG(LEVEL_ERROR, "jnl: end_transaction: lf_hfs_generic_buf_take_ownership returned %d.\n", iRet);
2956                     ret_val = -1;
2957                     goto done;
2958                 }
2959
2960                 if (!(bp->uCacheFlags & GEN_BUF_WRITE_LOCK)) {
2961                     panic("GEN_BUF_WRITE_LOCK should be set!");
2962                 }
2963
2964                 // Call the buffer callback
2965                 if (bp->pfFunc) {
2966                     bp->pfFunc(bp, bp->pvCallbackArgs);
2967                     bp->pfFunc = NULL;
2968                 }
2969
2970                 if (bp->uCacheFlags & GEN_BUF_LITTLE_ENDIAN) {
2971                     panic("We do not want to write a GEN_BUF_LITTLE_ENDIAN buffer to media!");
2972                 }
2973
2974                 // copy the data into the transaction buffer...
2975                 memcpy(blkptr, bp->pvData, bsize);
2976
2977                 blhdr->binfo[i].u.bp = (void*)bp;
2978
2979             } else {
2980                 // bnum == -1, only true if a block was "killed"
2981                 bsize = blhdr->binfo[i].u.bi.bsize;
2982             }
2983             tbuffer_offset += bsize;
2984         }
2985         next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2986     }
2987
2988     #if HFS_CRASH_TEST
2989         CRASH_ABORT(CRASH_ABORT_JOURNAL_BEFORE_FINISH, jnl->fsmount->psHfsmount, NULL);
2990     #endif
2991
2992     ret_val = finish_end_transaction(tr, callback, callback_arg);
2993
2994 done:
2995     if (drop_lock == TRUE) {
2996         journal_unlock(jnl);
2997     }
2998     return (ret_val);
2999 }
3000
3001 static void abort_transaction(journal *jnl, transaction *tr) {
3002
3003     block_list_header *blhdr, *next;
3004     // for each block list header, iterate over the blocks then
3005     // free up the memory associated with the block list.
3006     for (blhdr = tr->blhdr; blhdr; blhdr = next) {
3007         int    i;
3008
3009         for (i = 1; i < blhdr->num_blocks; i++) {
3010             GenericLFBufPtr bp;
3011
3012             if (blhdr->binfo[i].bnum == (off_t)-1)
3013                 continue;
3014
3015             bp = (void*)blhdr->binfo[i].u.bp;
3016
3017             // Release the buffers
3018             lf_hfs_generic_buf_clear_cache_flag(bp, GEN_BUF_WRITE_LOCK);
3019             if (lf_hfs_generic_buf_validate_owner(bp)) { // abort_transaction can be called before or after we take ownership
3020                 lf_hfs_generic_buf_release(bp);
3021             }
3022
3023         }
3024         next = (block_list_header *)((long)blhdr->binfo[0].bnum);
3025
3026         // we can free blhdr here since we won't need it any more
3027         blhdr->binfo[0].bnum = 0xdeadc0de;
3028         hfs_free(blhdr);
3029     }
3030
3031     /*
3032      * If the transaction we're aborting was the async transaction, then
3033      * tell the current transaction that there is no pending trim
3034      * any more.
3035      */
3036     lf_lck_rw_lock_exclusive(&jnl->trim_lock);
3037     if (jnl->async_trim == &tr->trim)
3038         jnl->async_trim = NULL;
3039     lf_lck_rw_unlock_exclusive(&jnl->trim_lock);
3040
3041
3042     if (tr->trim.extents) {
3043         hfs_free(tr->trim.extents);
3044     }
3045     tr->trim.allocated_count = 0;
3046     tr->trim.extent_count = 0;
3047     tr->trim.extents = NULL;
3048     tr->tbuffer     = NULL;
3049     tr->blhdr       = NULL;
3050     tr->total_bytes = 0xdbadc0de;
3051     hfs_free(tr);
3052 }
3053
3054 static void swap_journal_header(journal *jnl) {
3055     jnl->jhdr->magic      = SWAP32(jnl->jhdr->magic);
3056     jnl->jhdr->endian     = SWAP32(jnl->jhdr->endian);
3057     jnl->jhdr->start      = SWAP64(jnl->jhdr->start);
3058     jnl->jhdr->end        = SWAP64(jnl->jhdr->end);
3059     jnl->jhdr->size       = SWAP64(jnl->jhdr->size);
3060     jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
3061     jnl->jhdr->checksum   = SWAP32(jnl->jhdr->checksum);
3062     jnl->jhdr->jhdr_size  = SWAP32(jnl->jhdr->jhdr_size);
3063     jnl->jhdr->sequence_num  = SWAP32(jnl->jhdr->sequence_num);
3064 }
3065
// Simple (and admittedly weak) checksum used for the journal header and
// for the block list headers at the start of each transaction.
//
// Each byte, treated as unsigned, is folded into the running value as
//     sum = (sum << 8) ^ (sum + byte)
// and the bitwise complement of the final value is returned.  A length
// of zero (or less) therefore yields ~0.
static unsigned int calc_checksum(const char *ptr, int len) {
    const unsigned char *bytes     = (const unsigned char *)ptr;
    unsigned int         sum       = 0;
    int                  remaining = len;

    while (remaining > 0) {
        sum = (sum << 8) ^ (sum + *bytes);
        bytes++;
        remaining--;
    }

    return ~sum;
}
3080
3081
3082 static size_t do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction) {
3083     off_t     curlen = len;
3084     size_t    io_sz = 0;
3085     off_t     max_iosize;
3086 #if 0 // TBD
3087     int       err;
3088     buf_t     bp;
3089     off_t     accumulated_offset = 0;
3090     ExtendedVCB *vcb = HFSTOVCB(jnl->fsmount->psHfsmount);
3091 #endif
3092
3093     if (*offset < 0 || *offset > jnl->jhdr->size) {
3094         panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
3095     }
3096
3097     if (direction & JNL_WRITE)
3098         max_iosize = jnl->max_write_size;
3099     else if (direction & JNL_READ)
3100         max_iosize = jnl->max_read_size;
3101     else
3102         max_iosize = 128 * 1024;
3103
3104 again:
3105
3106     // Determine the Current R/W Length, taking cyclic wrap around into account
3107     if (*offset + curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
3108         if (*offset == jnl->jhdr->size) {
3109             *offset = jnl->jhdr->jhdr_size;
3110         } else {
3111             curlen = jnl->jhdr->size - *offset;
3112         }
3113     }
3114
3115     if (curlen > max_iosize) {
3116         curlen = max_iosize;
3117     }
3118
3119     if (curlen <= 0) {
3120         panic("jnl: do_jnl_io: curlen == %lld, offset 0x%llx len %zd\n", curlen, *offset, len);
3121     }
3122
3123     if (*offset == 0 && (direction & JNL_HEADER) == 0) {
3124         panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %lld, data %p)\n", curlen, data);
3125     }
3126
3127
3128     // Perform the I/O
3129     uint64_t phyblksize = jnl->fsmount->psHfsmount->hfs_physical_block_size;
3130     uint64_t uBlkNum    = jnl->jdev_blknum+(*offset)/phyblksize;
3131
3132     if (direction & JNL_READ) {
3133         raw_readwrite_read_mount(jnl->jdev, uBlkNum, phyblksize, data, curlen, NULL, NULL);
3134
3135     } else if (direction & JNL_WRITE) {
3136         raw_readwrite_write_mount(jnl->jdev, uBlkNum, phyblksize, data, curlen, NULL, NULL);
3137     }
3138
3139     // Move to the next section
3140     *offset += curlen;
3141     io_sz   += curlen;
3142
3143     if (io_sz != len) {
3144         // handle wrap-around
3145         data    = (char *)data + curlen;
3146         curlen  = len - io_sz;
3147         if (*offset >= jnl->jhdr->size) {
3148             *offset = jnl->jhdr->jhdr_size;
3149         }
3150         goto again;
3151     }
3152
3153     return io_sz;
3154 }
3155
3156 static size_t read_journal_header(journal *jnl, void *data, size_t len) {
3157     off_t hdr_offset = 0;
3158
3159     return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER);
3160 }
3161
3162 static void get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl) {
3163     off_t    readblockcnt;
3164     off_t    writeblockcnt;
3165     off_t    readmaxcnt=0, tmp_readmaxcnt;
3166     off_t    writemaxcnt=0, tmp_writemaxcnt;
3167     off_t    readsegcnt, writesegcnt;
3168
3169     // First check the max read size via several different mechanisms...
3170     ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt);
3171
3172     if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt) == 0) {
3173         tmp_readmaxcnt = readblockcnt * phys_blksz;
3174         if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) {
3175             readmaxcnt = tmp_readmaxcnt;
3176         }
3177     }
3178
3179     if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt)) {
3180         readsegcnt = 0;
3181     }
3182
3183     if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) {
3184         readmaxcnt = readsegcnt * PAGE_SIZE;
3185     }
3186
3187     if (readmaxcnt == 0) {
3188         readmaxcnt = 128 * 1024;
3189     } else if (readmaxcnt > UINT32_MAX) {
3190         readmaxcnt = UINT32_MAX;
3191     }
3192
3193
3194     // Now check the max writes size via several different mechanisms...
3195     ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt);
3196
3197     if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt) == 0) {
3198         tmp_writemaxcnt = writeblockcnt * phys_blksz;
3199         if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) {
3200             writemaxcnt = tmp_writemaxcnt;
3201         }
3202     }
3203
3204     if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt)) {
3205         writesegcnt = 0;
3206     }
3207
3208     if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) {
3209         writemaxcnt = writesegcnt * PAGE_SIZE;
3210     }
3211
3212     if (writemaxcnt == 0) {
3213         writemaxcnt = 128 * 1024;
3214     } else if (writemaxcnt > UINT32_MAX) {
3215         writemaxcnt = UINT32_MAX;
3216     }
3217
3218     jnl->max_read_size  = readmaxcnt;
3219     jnl->max_write_size = writemaxcnt;
3220 }
3221
3222 // this is a work function used to free up transactions that
3223 // completed. they can't be free'd from buffer_flushed_callback
3224 // because it is called from deep with the disk driver stack
3225 // and thus can't do something that would potentially cause
3226 // paging.  it gets called by each of the journal api entry
3227 // points so stuff shouldn't hang around for too long.
3228 static void free_old_stuff(journal *jnl) {
3229     transaction *tr, *next;
3230     block_list_header  *blhdr=NULL, *next_blhdr=NULL;
3231
3232     if (jnl->tr_freeme == NULL)
3233         return;
3234
3235     lock_oldstart(jnl);
3236     tr = jnl->tr_freeme;
3237     jnl->tr_freeme = NULL;
3238     unlock_oldstart(jnl);
3239
3240     for(; tr; tr=next) {
3241         for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) {
3242             next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum);
3243             blhdr->binfo[0].bnum = 0xdeadc0de;
3244
3245             hfs_free(blhdr);
3246
3247             KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0);
3248         }
3249         next = tr->next;
3250         hfs_free(tr);
3251     }
3252 }
3253
3254 // Allocate a new active transaction.
3255 // The function does the following:
3256 // 1) mallocs memory for a transaction structure and a buffer
3257 // 2) initializes the transaction structure and the buffer (invalid CRC + 0x5a)
3258 static errno_t journal_allocate_transaction(journal *jnl) {
3259     transaction *tr;
3260
3261     tr = hfs_mallocz(sizeof(transaction));
3262
3263     tr->tbuffer_size = jnl->tbuffer_size;
3264
3265     tr->tbuffer = hfs_malloc(tr->tbuffer_size);
3266
3267     // journal replay code checksum check depends on this.
3268     memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
3269     // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
3270     memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
3271
3272     tr->blhdr = (block_list_header *)tr->tbuffer;
3273     tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
3274     tr->blhdr->num_blocks = 1;      // accounts for this header block
3275     tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
3276     tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER;
3277
3278     tr->sequence_num = ++jnl->sequence_num;
3279     tr->num_blhdrs  = 1;
3280     tr->total_bytes = jnl->jhdr->blhdr_size;
3281     tr->jnl         = jnl;
3282
3283     jnl->active_tr  = tr;
3284
3285     return 0;
3286 }
3287
3288 int journal_kill_block(journal *jnl, GenericLFBuf *psGenBuf) {
3289     int                i;
3290     uint64_t           uflags;
3291     block_list_header *blhdr;
3292     transaction       *tr;
3293
3294     #if JOURNAL_DEBUG
3295         printf("journal_kill_block: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n",
3296            psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize ,psGenBuf->uPhyCluster, psGenBuf->uLockCnt);
3297     #endif
3298
3299     CHECK_JOURNAL(jnl);
3300     free_old_stuff(jnl);
3301
3302     if (jnl->flags & JOURNAL_INVALID) {
3303         lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
3304         lf_hfs_generic_buf_release(psGenBuf);
3305         return 0;
3306     }
3307
3308     tr = jnl->active_tr;
3309     CHECK_TRANSACTION(tr);
3310
3311     if (jnl->owner != pthread_self()) {
3312         panic("jnl: journal_kill_block: called w/out a transaction! jnl %p, owner %p, curact %p\n",
3313               jnl, jnl->owner, pthread_self());
3314     }
3315
3316     uflags = psGenBuf->uCacheFlags;
3317
3318     if ( !(uflags & GEN_BUF_WRITE_LOCK))
3319         panic("jnl: journal_kill_block: called with bp not B_LOCKED");
3320
3321     /*
3322      * bp must be BL_BUSY and B_LOCKED
3323      * first check if it's already part of this transaction
3324      */
3325     for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
3326
3327         for (i = 1; i < blhdr->num_blocks; i++) {
3328             if (psGenBuf == (void*)blhdr->binfo[i].u.bp) {
3329
3330                 // if the block has the DELWRI and FILTER bits sets, then
3331                 // things are seriously weird.  if it was part of another
3332                 // transaction then journal_modify_block_start() should
3333                 // have force it to be written.
3334                 //
3335                 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
3336                 //    panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
3337                 //} else {
3338                 tr->num_killed += psGenBuf->uDataSize;
3339                 //}
3340                 blhdr->binfo[i].bnum = (off_t)-1;
3341                 blhdr->binfo[i].u.bp = NULL;
3342                 blhdr->binfo[i].u.bi.bsize = psGenBuf->uDataSize;
3343
3344                 lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
3345                 lf_hfs_generic_buf_release(psGenBuf);
3346
3347                 return 0;
3348             }
3349         }
3350     }
3351
3352     /*
3353      * We did not find the block in any transaction buffer but we still
3354      * need to release it or else it will be left locked forever.
3355      */
3356     lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
3357     lf_hfs_generic_buf_release(psGenBuf);
3358
3359     return 0;
3360 }
3361
3362 int journal_is_clean(struct vnode *jvp,
3363                      off_t         offset,
3364                      off_t         journal_size,
3365                      struct vnode *fsvp,
3366                      size_t        min_fs_block_size,
3367                      struct mount  *fsmount) {
3368
3369     journal        jnl;
3370     uint32_t    phys_blksz;
3371     int        ret;
3372     int        orig_checksum, checksum;
3373
3374     /* Get the real physical block size. */
3375     if (ioctl(jvp->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
3376         LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: failed to get device block size.\n");
3377         ret = EINVAL;
3378         goto cleanup_jdev_name;
3379     }
3380
3381     if (phys_blksz > (uint32_t)min_fs_block_size) {
3382         LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
3383                phys_blksz, min_fs_block_size);
3384         ret = EINVAL;
3385         goto cleanup_jdev_name;
3386     }
3387
3388     if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
3389         LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal size %lld looks bogus.\n", journal_size);
3390         ret = EINVAL;
3391         goto cleanup_jdev_name;
3392     }
3393
3394     if ((journal_size % phys_blksz) != 0) {
3395         LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
3396                journal_size, phys_blksz);
3397         ret = EINVAL;
3398         goto cleanup_jdev_name;
3399     }
3400
3401     memset(&jnl, 0, sizeof(jnl));
3402
3403     jnl.header_buf = hfs_malloc(phys_blksz);
3404     jnl.header_buf_size = phys_blksz;
3405
3406     // Keep a point to the mount around for use in IO throttling.
3407     jnl.fsmount = fsmount;
3408
3409     get_io_info(jvp, phys_blksz, &jnl);
3410
3411     jnl.jhdr = (journal_header *)jnl.header_buf;
3412     memset(jnl.jhdr, 0, sizeof(journal_header));
3413
3414     jnl.jdev        = jvp;
3415     jnl.jdev_offset = offset;
3416     jnl.jdev_blknum = (uint32_t)(offset / phys_blksz);
3417     jnl.fsdev       = fsvp;
3418
3419     // we have to set this up here so that do_journal_io() will work
3420     jnl.jhdr->jhdr_size = phys_blksz;
3421
3422     if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
3423         LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: could not read %d bytes for the journal header.\n",
3424                phys_blksz);
3425         ret = EINVAL;
3426         goto get_out;
3427     }
3428
3429     orig_checksum = jnl.jhdr->checksum;
3430     jnl.jhdr->checksum = 0;
3431
3432     if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
3433         // do this before the swap since it's done byte-at-a-time
3434         orig_checksum = SWAP32(orig_checksum);
3435         checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
3436         swap_journal_header(&jnl);
3437         jnl.flags |= JOURNAL_NEED_SWAP;
3438     } else {
3439         checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
3440     }
3441
3442     if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
3443         LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal magic is bad (0x%x != 0x%x)\n",
3444                jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
3445         ret = EINVAL;
3446         goto get_out;
3447     }
3448
3449     if (orig_checksum != checksum) {
3450         LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal checksum is bad (0x%x != 0x%x)\n", orig_checksum, checksum);
3451         ret = EINVAL;
3452         goto get_out;
3453     }
3454
3455     //
3456     // if the start and end are equal then the journal is clean.
3457     // otherwise it's not clean and therefore an error.
3458     //
3459     if (jnl.jhdr->start == jnl.jhdr->end) {
3460         ret = 0;
3461     } else {
3462         ret = EBUSY;    // so the caller can differentiate an invalid journal from a "busy" one
3463     }
3464
3465 get_out:
3466     hfs_free(jnl.header_buf);
3467 cleanup_jdev_name:
3468     return ret;
3469 }
3470
3471 uint32_t journal_current_txn(journal *jnl) {
3472     return jnl->sequence_num + (jnl->active_tr || jnl->cur_tr ? 0 : 1);
3473 }
3474