/*
 * Source: Apple XNU (release xnu-1228.15.4), bsd/vfs/vfs_journal.c
 */
1 /*
2 * Copyright (c) 1995-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 //
29 // This file implements a simple write-ahead journaling layer.
30 // In theory any file system can make use of it by calling these
31 // functions when the fs wants to modify meta-data blocks. See
32 // vfs_journal.h for a more detailed description of the api and
33 // data structures.
34 //
35 // Dominic Giampaolo (dbg@apple.com)
36 //
37
38 #ifdef KERNEL
39
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/file_internal.h>
44 #include <sys/stat.h>
45 #include <sys/buf_internal.h>
46 #include <sys/proc_internal.h>
47 #include <sys/mount_internal.h>
48 #include <sys/namei.h>
49 #include <sys/vnode_internal.h>
50 #include <sys/ioctl.h>
51 #include <sys/tty.h>
52 #include <sys/ubc.h>
53 #include <sys/malloc.h>
54 #include <kern/thread.h>
55 #include <sys/disk.h>
56 #include <sys/kdebug.h>
57 #include <miscfs/specfs/specdev.h>
58 #include <libkern/OSAtomic.h> /* OSAddAtomic */
59
60 extern task_t kernel_task;
61
62 #define DBG_JOURNAL_FLUSH 1
63
64 #else
65
66 #include <stdio.h>
67 #include <stdlib.h>
68 #include <string.h>
69 #include <limits.h>
70 #include <errno.h>
71 #include <fcntl.h>
72 #include <unistd.h>
73 #include <stdarg.h>
74 #include <sys/types.h>
75 #include "compat.h"
76
77 #endif /* KERNEL */
78
79 #include "vfs_journal.h"
80
/* XXX next prototype should be from <libsa/stdlib.h> but conflicts libkern */
82 __private_extern__ void qsort(
83 void * array,
84 size_t nmembers,
85 size_t member_size,
86 int (*)(const void *, const void *));
87
88
89
90 // number of bytes to checksum in a block_list_header
91 // NOTE: this should be enough to clear out the header
92 // fields as well as the first entry of binfo[]
93 #define BLHDR_CHECKSUM_SIZE 32
94
95
96 static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg);
97 static void abort_transaction(journal *jnl, transaction *tr);
98 static void dump_journal(journal *jnl);
99
100 static __inline__ void lock_journal(journal *jnl);
101 static __inline__ void unlock_journal(journal *jnl);
102 static __inline__ void lock_oldstart(journal *jnl);
103 static __inline__ void unlock_oldstart(journal *jnl);
104
105
106
107
108 //
109 // 3105942 - Coalesce writes to the same block on journal replay
110 //
111
// One coalesced run of journaled data destined for the fs device.
typedef struct bucket {
    off_t   block_num;   // starting block number on the fs device (-1 = free slot, -2 = marked for deletion)
    size_t  jnl_offset;  // byte offset in the journal holding the newest data for this run
    size_t  block_size;  // length of the run in bytes
    int32_t cksum;       // checksum of the journaled data; 0 when unknown/invalidated
} bucket;
118
119 #define STARTING_BUCKETS 256
120
121 static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
122 static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size);
123 static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full);
124 static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
125 static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting);
126
//
// Validate a journal's core invariants and panic on any corruption.
// The hard-coded 1024*1024*1024 (1GB) comparisons are loose sanity
// bounds intended to catch garbage header fields, not real limits.
//
#define CHECK_JOURNAL(jnl) \
    do { \
    if (jnl == NULL) {\
    panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
    }\
    if (jnl->jdev == NULL) { \
    panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
    } \
    if (jnl->fsdev == NULL) { \
    panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
    } \
    if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
    panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
    __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\
    }\
    if (   jnl->jhdr->start <= 0 \
        || jnl->jhdr->start > jnl->jhdr->size\
        || jnl->jhdr->start > 1024*1024*1024) {\
    panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
    __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\
    }\
    if (   jnl->jhdr->end <= 0 \
        || jnl->jhdr->end > jnl->jhdr->size\
        || jnl->jhdr->end > 1024*1024*1024) {\
    panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
    __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\
    }\
    if (jnl->jhdr->size > 1024*1024*1024) {\
    panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
    __FILE__, __LINE__, jnl->jhdr->size);\
    } \
    } while(0)
159
//
// Validate a transaction's invariants and panic on corruption:
// non-null back-pointer to its journal, blhdr at the head of the
// transaction buffer, and byte counts / journal offsets within
// the same loose 1GB sanity bounds used by CHECK_JOURNAL.
//
#define CHECK_TRANSACTION(tr) \
    do {\
    if (tr == NULL) {\
    panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
    }\
    if (tr->jnl == NULL) {\
    panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
    }\
    if (tr->blhdr != (block_list_header *)tr->tbuffer) {\
    panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\
    }\
    if (tr->total_bytes < 0) {\
    panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\
    }\
    if (tr->journal_start < 0 || tr->journal_start > 1024*1024*1024) {\
    panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\
    }\
    if (tr->journal_end < 0 || tr->journal_end > 1024*1024*1024) {\
    panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\
    }\
    if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) {\
    panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\
    }\
    } while(0)
184
185
186
187 //
188 // this isn't a great checksum routine but it will do for now.
189 // we use it to checksum the journal header and the block list
190 // headers that are at the start of each transaction.
191 //
//
// Compute the simple rolling checksum used for the journal header
// and the block-list headers at the start of each transaction.
// Each byte folds in as: cksum = (cksum << 8) ^ (cksum + byte),
// and the final value is bitwise-inverted.  It is deliberately
// cheap -- it only needs to catch torn or garbage writes, not
// deliberate tampering.
//
static int
calc_checksum(char *ptr, int len)
{
    int chk = 0;
    unsigned char *p = (unsigned char *)ptr;
    unsigned char *end = p + len;

    while (p < end) {
        chk = (chk << 8) ^ (chk + *p);
        p++;
    }

    return (~chk);
}
204
//
// Journal Locking
//
lck_grp_attr_t * jnl_group_attr;   // lock-group attributes, allocated once at init
lck_attr_t *     jnl_lock_attr;    // lock attributes shared by all journal mutexes
lck_grp_t *      jnl_mutex_group;  // "jnl-mutex" lock group for all journal mutexes

// One-time setup of the lock group and attributes used by every
// journal's mutexes.  Must run before any journal is created/opened.
void
journal_init(void)
{
    jnl_lock_attr    = lck_attr_alloc_init();
    jnl_group_attr   = lck_grp_attr_alloc_init();
    jnl_mutex_group  = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
}
219
// Acquire the journal's main mutex (serializes transaction begin/end).
static __inline__ void
lock_journal(journal *jnl)
{
    lck_mtx_lock(&jnl->jlock);
}

// Release the journal's main mutex.
static __inline__ void
unlock_journal(journal *jnl)
{
    lck_mtx_unlock(&jnl->jlock);
}

// Acquire the mutex protecting old_start[], completed_trs and tr_freeme.
static __inline__ void
lock_oldstart(journal *jnl)
{
    lck_mtx_lock(&jnl->old_start_lock);
}

// Release the old-start mutex.
static __inline__ void
unlock_oldstart(journal *jnl)
{
    lck_mtx_unlock(&jnl->old_start_lock);
}
243
244
245
#define JNL_WRITE    0x0001    // do_journal_io(): write to the journal device
#define JNL_READ     0x0002    // do_journal_io(): read from the journal device
#define JNL_HEADER   0x8000    // i/o is allowed to touch the journal header (offset 0)
249
250 //
251 // This function sets up a fake buf and passes it directly to the
252 // journal device strategy routine (so that it won't get cached in
253 // the block cache.
254 //
255 // It also handles range checking the i/o so that we don't write
256 // outside the journal boundaries and it will wrap the i/o back
257 // to the beginning if necessary (skipping over the journal header)
258 //
//
// Perform raw i/o directly against the journal device.
//
// Builds a private buf and hands it straight to the device strategy
// routine (so journal traffic never lands in the block cache),
// range-checks the request against the journal bounds, splits it to
// honor the device's max transfer size, and wraps back to just past
// the journal header when the i/o runs off the end of the journal.
//
// *offset is advanced past the bytes transferred (and may wrap).
// Returns the number of bytes moved, or 0 on a device error.
//
static size_t
do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
{
    int err, curlen=len;    // NOTE(review): curlen is int while len is size_t -- assumes each i/o fits in an int; confirm callers never pass > INT_MAX
    size_t io_sz = 0;
    buf_t bp;
    off_t max_iosize;

    if (*offset < 0 || *offset > jnl->jhdr->size) {
        panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
    }

    // pick the per-request clamp for this direction
    if (direction & JNL_WRITE)
        max_iosize = jnl->max_write_size;
    else if (direction & JNL_READ)
        max_iosize = jnl->max_read_size;
    else
        max_iosize = 128 * 1024;

again:
    bp = alloc_io_buf(jnl->jdev, 1);

    // if this chunk would run past the end of the journal, either
    // wrap the offset (when we're exactly at the end) or shorten
    // the chunk to stop at the boundary; the remainder is issued
    // on the next pass through the loop
    if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
        if (*offset == jnl->jhdr->size) {
            // wrap to just past the journal header block
            *offset = jnl->jhdr->jhdr_size;
        } else {
            curlen = (off_t)jnl->jhdr->size - *offset;
        }
    }

    if (curlen > max_iosize) {
        curlen = max_iosize;
    }

    if (curlen <= 0) {
        panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %lu\n", curlen, *offset, len);
    }

    // offset 0 is the journal header; only header i/o may touch it
    if (*offset == 0 && (direction & JNL_HEADER) == 0) {
        panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
    }

    if (direction & JNL_READ)
        buf_setflags(bp, B_READ);
    else {
        /*
         * don't have to set any flags
         */
        vnode_startwrite(jnl->jdev);
    }
    buf_setsize(bp, curlen);
    buf_setcount(bp, curlen);
    buf_setdataptr(bp, (uintptr_t)data);
    // journal offsets are relative to jdev_offset; convert bytes to device blocks
    buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
    buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
    if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) {
        buf_markfua(bp);
    }

    // issue the i/o and wait for it synchronously
    err = VNOP_STRATEGY(bp);
    if (!err) {
        err = (int)buf_biowait(bp);
    }
    free_io_buf(bp);

    if (err) {
        printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err);
        return 0;
    }

    *offset += curlen;
    io_sz += curlen;
    if (io_sz != len) {
        // handle wrap-around
        data = (char *)data + curlen;
        curlen = len - io_sz;
        if (*offset >= jnl->jhdr->size) {
            *offset = jnl->jhdr->jhdr_size;
        }
        goto again;
    }

    return io_sz;
}
343
// Read len bytes of journal data at *offset (advances *offset, may wrap).
static size_t
read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
{
    return do_journal_io(jnl, offset, data, len, JNL_READ);
}
349
// Write len bytes of journal data at *offset (advances *offset, may wrap).
static size_t
write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
{
    return do_journal_io(jnl, offset, data, len, JNL_WRITE);
}
355
356
// Read the on-disk journal header (offset 0; JNL_HEADER permits that).
static size_t
read_journal_header(journal *jnl, void *data, size_t len)
{
    off_t hdr_offset = 0;

    return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER);
}
364
//
// Checksum and write the in-memory journal header to disk.
//
// Unless force-unit-access writes are in use, the write is bracketed
// by DKIOCSYNCHRONIZECACHE flushes: the leading flush pushes out the
// transaction data the header will point at, and the trailing flush
// keeps a later transaction from reaching the platter before this
// header does (which would let it overwrite data the old header
// still referenced).  Returns 0 on success; on a failed header
// write, marks the journal invalid and returns -1.
//
static int
write_journal_header(journal *jnl)
{
    static int num_err_prints = 0;   // cap on flush-error log spam across all journals
    int ret=0;
    off_t jhdr_offset = 0;
    struct vfs_context context;

    context.vc_thread = current_thread();
    context.vc_ucred = NOCRED;
    //
    // Flush the track cache if we're not doing force-unit-access
    // writes.
    //
    if ((jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
        ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
    }
    if (ret != 0) {
        //
        // Only print this error if it's a different error than the
        // previous one, or if it's the first time for this device
        // or if the total number of printfs is less than 25. We
        // allow for up to 25 printfs to insure that some make it
        // into the on-disk syslog. Otherwise if we only printed
        // one, it's possible it would never make it to the syslog
        // for the root volume and that makes debugging hard.
        //
        if (   ret != jnl->last_flush_err
            || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
            || num_err_prints++ < 25) {

            printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret);

            jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
            jnl->last_flush_err = ret;
        }
    }

    // checksum is computed with the checksum field itself zeroed
    jnl->jhdr->checksum = 0;
    jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
    if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) {
        printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name);
        jnl->flags |= JOURNAL_INVALID;
        return -1;
    }

    // If we're not doing force-unit-access writes, then we
    // have to flush after writing the journal header so that
    // a future transaction doesn't sneak out to disk before
    // the header does and thus overwrite data that the old
    // journal header refers to. Saw this exact case happen
    // on an IDE bus analyzer with Larry Barras so while it
    // may seem obscure, it's not.
    //
    if ((jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
        VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
    }

    return 0;
}
425
426
427
428 //
429 // this is a work function used to free up transactions that
430 // completed. they can't be free'd from buffer_flushed_callback
431 // because it is called from deep with the disk driver stack
432 // and thus can't do something that would potentially cause
433 // paging. it gets called by each of the journal api entry
434 // points so stuff shouldn't hang around for too long.
435 //
436 static void
437 free_old_stuff(journal *jnl)
438 {
439 transaction *tr, *next;
440
441 lock_oldstart(jnl);
442 tr = jnl->tr_freeme;
443 jnl->tr_freeme = NULL;
444 unlock_oldstart(jnl);
445
446 for(; tr; tr=next) {
447 next = tr->next;
448 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
449 }
450
451 }
452
453
454
455 //
456 // This is our callback that lets us know when a buffer has been
457 // flushed to disk. It's called from deep within the driver stack
458 // and thus is quite limited in what it can do. Notably, it can
459 // not initiate any new i/o's or allocate/free memory.
460 //
//
// Completion callback invoked as each journaled buffer reaches disk.
//
// Called from deep within the driver stack, so it may not start new
// i/o or allocate/free memory.  It atomically accumulates flushed
// bytes for the owning transaction; the thread that observes the
// transaction complete takes the old_start lock, marks the matching
// old_start[] slot done, advances jnl->active_start when possible,
// coalesces with neighboring completed transactions, and queues any
// fully-merged transactions on tr_freeme for free_old_stuff().
//
static void
buffer_flushed_callback(struct buf *bp, void *arg)
{
    transaction *tr;
    journal *jnl;
    transaction *ctr, *prev=NULL, *next;
    size_t i;
    int bufsize, amt_flushed, total_bytes;


    //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n",
    //  bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg);

    // snarf out the bits we want
    bufsize = buf_size(bp);
    tr = (transaction *)arg;

    // then we've already seen it
    if (tr == NULL) {
        return;
    }

    CHECK_TRANSACTION(tr);

    jnl = tr->jnl;
    if (jnl->flags & JOURNAL_INVALID) {
        return;
    }

    CHECK_JOURNAL(jnl);

    // blocks killed before flushing count as already done
    amt_flushed = tr->num_killed;
    total_bytes = tr->total_bytes;

    // update the number of blocks that have been flushed.
    // this buf may represent more than one block so take
    // that into account.
    //
    // OSAddAtomic() returns the value of tr->num_flushed before the add
    //
    amt_flushed += OSAddAtomic(bufsize, (SInt32 *)&tr->num_flushed);


    // if this transaction isn't done yet, just return as
    // there is nothing to do.
    //
    // NOTE: we are careful to not reference anything through
    //       the tr pointer after doing the OSAddAtomic().  if
    //       this if statement fails then we are the last one
    //       and then it's ok to dereference "tr".
    //
    if ((amt_flushed + bufsize) < total_bytes) {
        return;
    }

    // this will single thread checking the transaction
    lock_oldstart(jnl);

    if (tr->total_bytes == (int)0xfbadc0de) {
        // then someone beat us to it...
        unlock_oldstart(jnl);
        return;
    }

    // mark this so that we're the owner of dealing with the
    // cleanup for this transaction
    tr->total_bytes = 0xfbadc0de;

    //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
    //   tr, tr->journal_start, tr->journal_end, jnl);

    // find this entry in the old_start[] index and mark it completed
    // (the top bit of an old_start[] entry means "still in flight")
    for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {

        if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) {
            jnl->old_start[i] &= ~(0x8000000000000000ULL);
            break;
        }
    }

    if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
        panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
            tr->journal_start, tr, jnl);
    }


    // if we are here then we need to update the journal header
    // to reflect that this transaction is complete
    if (tr->journal_start == jnl->active_start) {
        jnl->active_start = tr->journal_end;
        tr->journal_start = tr->journal_end = (off_t)0;
    }

    // go through the completed_trs list and try to coalesce
    // entries, restarting back at the beginning if we have to.
    for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) {
        if (ctr->journal_start == jnl->active_start) {
            // ctr is the next contiguous region; consume it and
            // advance active_start past it
            jnl->active_start = ctr->journal_end;
            if (prev) {
                prev->next = ctr->next;
            }
            if (ctr == jnl->completed_trs) {
                jnl->completed_trs = ctr->next;
            }

            next = jnl->completed_trs;   // this starts us over again
            ctr->next = jnl->tr_freeme;
            jnl->tr_freeme = ctr;
            ctr = NULL;
        } else if (tr->journal_end == ctr->journal_start) {
            // tr abuts the front of ctr; extend ctr backwards
            ctr->journal_start = tr->journal_start;
            next = jnl->completed_trs;  // this starts us over again
            ctr = NULL;
            tr->journal_start = tr->journal_end = (off_t)0;
        } else if (tr->journal_start == ctr->journal_end) {
            // tr abuts the back of ctr; extend ctr forwards
            ctr->journal_end = tr->journal_end;
            next = ctr->next;
            tr->journal_start = tr->journal_end = (off_t)0;
        } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) {
            // coalesce the next entry with this one and link the next
            // entry in at the head of the tr_freeme list
            next = ctr->next;           // temporarily use the "next" variable
            ctr->journal_end = next->journal_end;
            ctr->next = next->next;
            next->next = jnl->tr_freeme;   // link in the next guy at the head of the tr_freeme list
            jnl->tr_freeme = next;

            next = jnl->completed_trs;  // this starts us over again
            ctr = NULL;
        } else {
            next = ctr->next;
        }
    }

    // if this is true then we didn't merge with anyone
    // so link ourselves in at the head of the completed
    // transaction list.
    if (tr->journal_start != 0) {
        // put this entry into the correct sorted place
        // in the list instead of just at the head.
        //

        prev = NULL;
        for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
            // just keep looping
        }

        if (ctr == NULL && prev == NULL) {
            jnl->completed_trs = tr;
            tr->next = NULL;
        } else if (ctr == jnl->completed_trs) {
            tr->next = jnl->completed_trs;
            jnl->completed_trs = tr;
        } else {
            tr->next = prev->next;
            prev->next = tr;
        }
    } else {
        // if we're here this tr got merged with someone else so
        // put it on the list to be free'd
        tr->next = jnl->tr_freeme;
        jnl->tr_freeme = tr;
    }
    unlock_oldstart(jnl);
}
626
627
628 #include <libkern/OSByteOrder.h>
629
// byte-swapping helpers for cross-endian journal replay
#define SWAP16(x) OSSwapInt16(x)
#define SWAP32(x) OSSwapInt32(x)
#define SWAP64(x) OSSwapInt64(x)
633
634
// Byte-swap every field of the journal header in place (used when
// replaying a journal written with the opposite endianness).
static void
swap_journal_header(journal *jnl)
{
    jnl->jhdr->magic      = SWAP32(jnl->jhdr->magic);
    jnl->jhdr->endian     = SWAP32(jnl->jhdr->endian);
    jnl->jhdr->start      = SWAP64(jnl->jhdr->start);
    jnl->jhdr->end        = SWAP64(jnl->jhdr->end);
    jnl->jhdr->size       = SWAP64(jnl->jhdr->size);
    jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
    jnl->jhdr->checksum   = SWAP32(jnl->jhdr->checksum);
    jnl->jhdr->jhdr_size  = SWAP32(jnl->jhdr->jhdr_size);
    jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num);
}
648
//
// Byte-swap a block-list header and its block_info entries in place.
// num_blocks has to be swapped before it can be read; if the swapped
// count can't fit in a blhdr_size-sized buffer it's treated as
// garbage and the binfo[] entries are left untouched so we don't
// walk off the end of the buffer.
//
static void
swap_block_list_header(journal *jnl, block_list_header *blhdr)
{
    int i;

    blhdr->max_blocks = SWAP16(blhdr->max_blocks);
    blhdr->num_blocks = SWAP16(blhdr->num_blocks);
    blhdr->bytes_used = SWAP32(blhdr->bytes_used);
    blhdr->checksum   = SWAP32(blhdr->checksum);
    blhdr->flags      = SWAP32(blhdr->flags);

    if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) {
        printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d).  not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size);
        return;
    }

    for(i=0; i < blhdr->num_blocks; i++) {
        blhdr->binfo[i].bnum    = SWAP64(blhdr->binfo[i].bnum);
        blhdr->binfo[i].bsize   = SWAP32(blhdr->binfo[i].bsize);
        blhdr->binfo[i].b.cksum = SWAP32(blhdr->binfo[i].b.cksum);
    }
}
671
672
//
// During replay: copy journaled data over the corresponding block
// on the fs device.
//
// Reads the current block (or, if the read fails, grabs an empty
// buffer for it anyway), overwrites it with block_ptr's bytes, and
// writes it back synchronously.  The block is then re-read and
// marked invalid so a later reader asking for a different block
// size isn't handed this cached buf.  Returns 0 on success,
// non-zero on failure.
//
static int
update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
{
    int ret;
    struct buf *oblock_bp=NULL;

    // first read the block we want.
    ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
    if (ret != 0) {
        printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret);

        if (oblock_bp) {
            buf_brelse(oblock_bp);
            oblock_bp = NULL;
        }

        // let's try to be aggressive here and just re-write the block
        oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META);
        if (oblock_bp == NULL) {
            printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block);
            return -1;
        }
    }

    // make sure it's the correct size.
    if (buf_size(oblock_bp) != bsize) {
        buf_brelse(oblock_bp);
        return -1;
    }

    // copy the journal data over top of it
    // ((char *)0 + x) just converts the uintptr_t dataptr to a char *
    memcpy((char *)0 + buf_dataptr(oblock_bp), block_ptr, bsize);

    // VNOP_BWRITE releases the buf regardless of outcome
    if ((ret = VNOP_BWRITE(oblock_bp)) != 0) {
        printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret);
        return ret;
    }

    // and now invalidate it so that if someone else wants to read
    // it in a different size they'll be able to do it.
    ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
    if (oblock_bp) {
        buf_markinvalid(oblock_bp);
        buf_brelse(oblock_bp);
    }

    return 0;
}
721
722 static int
723 grow_table(struct bucket **buf_ptr, int num_buckets, int new_size)
724 {
725 struct bucket *newBuf;
726 int current_size = num_buckets, i;
727
728 // return if newsize is less than the current size
729 if (new_size < num_buckets) {
730 return current_size;
731 }
732
733 if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
734 printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
735 return -1;
736 }
737
738 // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
739
740 // copy existing elements
741 bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));
742
743 // initialize the new ones
744 for(i=num_buckets; i < new_size; i++) {
745 newBuf[i].block_num = (off_t)-1;
746 }
747
748 // free the old container
749 FREE(*buf_ptr, M_TEMP);
750
751 // reset the buf_ptr
752 *buf_ptr = newBuf;
753
754 return new_size;
755 }
756
//
// Binary-search the sorted coalescing table for block_num.
//
// Returns the index of the right-most matching entry (so a later
// write to the same block replaces the earlier one), or, when no
// entry matches, the index at which a new entry should be inserted
// to keep the table sorted.  An empty table always yields 0.
//
static int
lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full)
{
    int lo, hi, index, matches, i;

    if (num_full == 0) {
        return 0; // table is empty, so insert at index=0
    }

    lo = 0;
    hi = num_full - 1;
    index = -1;

    // perform binary search for block_num
    do {
        int mid = (hi - lo)/2 + lo;
        off_t this_num = (*buf_ptr)[mid].block_num;

        if (block_num == this_num) {
            index = mid;
            break;
        }

        if (block_num < this_num) {
            hi = mid;
            continue;
        }

        if (block_num > this_num) {
            lo = mid + 1;
            continue;
        }
    } while(lo < hi);

    // check if lo and hi converged on the match
    if (block_num == (*buf_ptr)[hi].block_num) {
        index = hi;
    }

    // if no existing entry found, find index for new one
    if (index == -1) {
        index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
    } else {
        // make sure that we return the right-most index in the case of multiple matches
        matches = 0;
        i = index + 1;
        while(i < num_full && block_num == (*buf_ptr)[i].block_num) {
            matches++;
            i++;
        }

        index += matches;
    }

    return index;
}
813
//
// Place an entry into the coalescing table at blk_index.
//
// When overwriting is set the slot is simply replaced; otherwise
// the table is grown if full and entries at/after blk_index are
// shifted right to make room.  A journal offset past the end of
// the journal is wrapped to just after the journal header.
// Returns blk_index on success, -1 if the table could not grow.
//
static int
insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting)
{
    if (!overwriting) {
        // grow the table if we're out of space
        if (*num_full_ptr >= *num_buckets_ptr) {
            int new_size = *num_buckets_ptr * 2;
            int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);

            if (grow_size < new_size) {
                printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name);
                return -1;
            }

            *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
        }

        // if we're not inserting at the end, we need to bcopy
        if (blk_index != *num_full_ptr) {
            bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
        }

        (*num_full_ptr)++; // increment only if we're not overwriting
    }

    // sanity check the values we're about to add
    if (offset >= jnl->jhdr->size) {
        // wrap the journal offset past the journal header block
        offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
    }
    if (size <= 0) {
        // size_t is unsigned, so this only catches size == 0
        panic("jnl: insert_block: bad size in insert_block (%lu)\n", size);
    }

    (*buf_ptr)[blk_index].block_num = num;
    (*buf_ptr)[blk_index].block_size = size;
    (*buf_ptr)[blk_index].jnl_offset = offset;
    (*buf_ptr)[blk_index].cksum = cksum;

    return blk_index;
}
854
//
// Resolve any overlap between a new journal entry (block_num/size)
// and the existing sorted coalescing-table entries around blk_index.
//
// The previous entry may be truncated (or split in two when it
// completely contains the new write); following entries that the
// new write covers are deleted or trimmed, and the table is
// compacted.  Returns 1 when the new entry can simply overwrite
// the slot at blk_index, 0 when it must be inserted as a fresh
// entry -- the caller feeds this to insert_block() as its
// "overwriting" flag.  All sizes/offsets are in bytes; block
// numbers are in jhdr_size units.
//
static int
do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
{
    int num_to_remove, index, i, overwrite, err;
    size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset;
    off_t overlap, block_start, block_end;

    block_start = block_num*jhdr_size;
    block_end = block_start + size;
    overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);

    // first, eliminate any overlap with the previous entry
    if (blk_index != 0 && !overwrite) {
        off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
        off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
        overlap = prev_block_end - block_start;
        if (overlap > 0) {
            if (overlap % jhdr_size != 0) {
                panic("jnl: do_overlap: overlap with previous entry not a multiple of %lu\n", jhdr_size);
            }

            // if the previous entry completely overlaps this one, we need to break it into two pieces.
            if (prev_block_end > block_end) {
                off_t new_num = block_end / jhdr_size;
                size_t new_size = prev_block_end - block_end;

                // the tail piece keeps its data at the matching offset within the old entry's journal data
                new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);

                err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0);
                if (err < 0) {
                    panic("jnl: do_overlap: error inserting during pre-overlap\n");
                }
            }

            // Regardless, we need to truncate the previous entry to the beginning of the overlap
            (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start;
            (*buf_ptr)[blk_index-1].cksum = 0;   // have to blow it away because there's no way to check it
        }
    }

    // then, bail out fast if there's no overlap with the entries that follow
    if (!overwrite && block_end <= (*buf_ptr)[blk_index].block_num*jhdr_size) {
        return 0; // no overlap, no overwrite
    } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (*buf_ptr)[blk_index+1].block_num*jhdr_size)) {

        (*buf_ptr)[blk_index].cksum = cksum;   // update this
        return 1; // simple overwrite
    }

    // Otherwise, find all cases of total and partial overlap. We use the special
    // block_num of -2 to designate entries that are completely overlapped and must
    // be eliminated. The block_num, size, and jnl_offset of partially overlapped
    // entries must be adjusted to keep the array consistent.
    index = blk_index;
    num_to_remove = 0;
    while(index < *num_full_ptr && block_end > (*buf_ptr)[index].block_num*jhdr_size) {
        if (block_end >= ((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size)) {
            (*buf_ptr)[index].block_num = -2; // mark this for deletion
            num_to_remove++;
        } else {
            overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
            if (overlap > 0) {
                if (overlap % jhdr_size != 0) {
                    panic("jnl: do_overlap: overlap of %lld is not multiple of %lu\n", overlap, jhdr_size);
                }

                // if we partially overlap this entry, adjust its block number, jnl offset, and size
                (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
                (*buf_ptr)[index].cksum = 0;

                new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
                if (new_offset >= jnl->jhdr->size) {
                    new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
                }
                (*buf_ptr)[index].jnl_offset = new_offset;

                (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
                if ((*buf_ptr)[index].block_size <= 0) {
                    panic("jnl: do_overlap: after overlap, new block size is invalid (%lu)\n", (*buf_ptr)[index].block_size);
                    // return -1; // if above panic is removed, return -1 for error
                }
            }

        }

        index++;
    }

    // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
    index--; // start with the last index used within the above loop
    while(index >= blk_index) {
        if ((*buf_ptr)[index].block_num == -2) {
            if (index == *num_full_ptr-1) {
                (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
            } else {
                bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
            }
            (*num_full_ptr)--;
        }
        index--;
    }

    // eliminate any stale entries at the end of the table
    for(i=*num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
        (*buf_ptr)[i].block_num = -1;
    }

    return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
}
964
965 // PR-3105942: Coalesce writes to the same block in journal replay
966 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
967 // to be replayed and the corresponding location in the journal which contains
968 // the most recent data for those blocks. The array is "played" once the all the
969 // blocks in the journal have been coalesced. The code for the case of conflicting/
970 // overlapping writes to a single block is the most dense. Because coalescing can
971 // disrupt the existing time-ordering of blocks in the journal playback, care
972 // is taken to catch any overlaps and keep the array consistent.
973 static int
974 add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
975 {
976 int blk_index, overwriting;
977
978 // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
979 // inserted (or the index of the elem to overwrite).
980 blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);
981
982 // check if the index is within bounds (if we're adding this block to the end of
983 // the table, blk_index will be equal to num_full)
984 if (blk_index < 0 || blk_index > *num_full_ptr) {
985 //printf("jnl: add_block: trouble adding block to co_buf\n");
986 return -1;
987 } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
988
989 // Determine whether we're overwriting an existing entry by checking for overlap
990 overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr);
991 if (overwriting < 0) {
992 return -1; // if we got an error, pass it along
993 }
994
995 // returns the index, or -1 on error
996 blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting);
997
998 return blk_index;
999 }
1000
1001 static int
1002 replay_journal(journal *jnl)
1003 {
1004 int i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0;
1005 size_t ret;
1006 size_t max_bsize = 0; /* protected by block_ptr */
1007 block_list_header *blhdr;
1008 off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start;
1009 char *buff, *block_ptr=NULL;
1010 struct bucket *co_buf;
1011 int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
1012 uint32_t last_sequence_num = 0;
1013
1014 // wrap the start ptr if it points to the very end of the journal
1015 if (jnl->jhdr->start == jnl->jhdr->size) {
1016 jnl->jhdr->start = jnl->jhdr->jhdr_size;
1017 }
1018 if (jnl->jhdr->end == jnl->jhdr->size) {
1019 jnl->jhdr->end = jnl->jhdr->jhdr_size;
1020 }
1021
1022 if (jnl->jhdr->start == jnl->jhdr->end) {
1023 return 0;
1024 }
1025
1026 orig_jnl_start = jnl->jhdr->start;
1027
1028 // allocate memory for the header_block. we'll read each blhdr into this
1029 if (kmem_alloc(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) {
1030 printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n",
1031 jnl->jdev_name, jnl->jhdr->blhdr_size);
1032 return -1;
1033 }
1034
1035 // allocate memory for the coalesce buffer
1036 if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
1037 printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name);
1038 return -1;
1039 }
1040
1041 restart_replay:
1042
1043 // initialize entries
1044 for(i=0; i < num_buckets; i++) {
1045 co_buf[i].block_num = -1;
1046 }
1047 num_full = 0; // empty at first
1048
1049
1050 printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
1051 jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);
1052
1053 while(check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) {
1054 offset = blhdr_offset = jnl->jhdr->start;
1055 ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
1056 if (ret != (size_t)jnl->jhdr->blhdr_size) {
1057 printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset);
1058 bad_blocks = 1;
1059 goto bad_txn_handling;
1060 }
1061
1062 blhdr = (block_list_header *)buff;
1063
1064 orig_checksum = blhdr->checksum;
1065 blhdr->checksum = 0;
1066 if (jnl->flags & JOURNAL_NEED_SWAP) {
1067 // calculate the checksum based on the unswapped data
1068 // because it is done byte-at-a-time.
1069 orig_checksum = SWAP32(orig_checksum);
1070 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1071 swap_block_list_header(jnl, blhdr);
1072 } else {
1073 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1074 }
1075
1076
1077 //
1078 // XXXdbg - if these checks fail, we should replay as much
1079 // we can in the hopes that it will still leave the
1080 // drive in a better state than if we didn't replay
1081 // anything
1082 //
1083 if (checksum != orig_checksum) {
1084 if (check_past_jnl_end && in_uncharted_territory) {
1085
1086 if (blhdr_offset != jnl->jhdr->end) {
1087 printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
1088 }
1089
1090 check_past_jnl_end = 0;
1091 jnl->jhdr->end = blhdr_offset;
1092 continue;
1093 }
1094
1095 printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
1096 jnl->jdev_name, blhdr_offset, orig_checksum, checksum);
1097
1098 if (blhdr_offset == orig_jnl_start) {
1099 // if there's nothing in the journal at all, just bail out altogether.
1100 goto bad_replay;
1101 }
1102
1103 bad_blocks = 1;
1104 goto bad_txn_handling;
1105 }
1106
1107 if ( (last_sequence_num != 0)
1108 && (blhdr->binfo[0].b.sequence_num != 0)
1109 && (blhdr->binfo[0].b.sequence_num != last_sequence_num)
1110 && (blhdr->binfo[0].b.sequence_num != last_sequence_num+1)) {
1111
1112 txn_start_offset = jnl->jhdr->end = blhdr_offset;
1113
1114 if (check_past_jnl_end) {
1115 check_past_jnl_end = 0;
1116 printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
1117 jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].b.sequence_num, last_sequence_num);
1118 continue;
1119 }
1120
1121 printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
1122 jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].b.sequence_num, last_sequence_num);
1123 bad_blocks = 1;
1124 goto bad_txn_handling;
1125 }
1126 last_sequence_num = blhdr->binfo[0].b.sequence_num;
1127
1128 if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) {
1129 if (last_sequence_num == 0) {
1130 check_past_jnl_end = 0;
1131 printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n",
1132 jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1133 if (jnl->jhdr->start != jnl->jhdr->end) {
1134 jnl->jhdr->start = jnl->jhdr->end;
1135 }
1136 continue;
1137 }
1138 printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
1139 }
1140
1141 if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048
1142 || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
1143 printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n",
1144 jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks);
1145 bad_blocks = 1;
1146 goto bad_txn_handling;
1147 }
1148
1149 max_bsize = 0;
1150 for(i=1; i < blhdr->num_blocks; i++) {
1151 if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
1152 printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum);
1153 bad_blocks = 1;
1154 goto bad_txn_handling;
1155 }
1156
1157 if (blhdr->binfo[i].bsize > max_bsize) {
1158 max_bsize = blhdr->binfo[i].bsize;
1159 }
1160 }
1161
1162 if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) {
1163 check_block_checksums = 1;
1164 if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1165 goto bad_replay;
1166 }
1167 } else {
1168 block_ptr = NULL;
1169 }
1170
1171 if (blhdr->flags & BLHDR_FIRST_HEADER) {
1172 txn_start_offset = blhdr_offset;
1173 }
1174
1175 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1176 // blhdr->num_blocks-1, jnl->jhdr->start);
1177 bad_blocks = 0;
1178 for(i=1; i < blhdr->num_blocks; i++) {
1179 int size, ret_val;
1180 off_t number;
1181
1182 size = blhdr->binfo[i].bsize;
1183 number = blhdr->binfo[i].bnum;
1184
1185 // don't add "killed" blocks
1186 if (number == (off_t)-1) {
1187 //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1188 } else {
1189
1190 if (check_block_checksums) {
1191 int32_t disk_cksum;
1192 off_t block_offset;
1193
1194 block_offset = offset;
1195
1196 // read the block so we can check the checksum
1197 ret = read_journal_data(jnl, &block_offset, block_ptr, size);
1198 if (ret != (size_t)size) {
1199 printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
1200 bad_blocks = 1;
1201 goto bad_txn_handling;
1202 }
1203
1204 disk_cksum = calc_checksum(block_ptr, size);
1205
1206 // there is no need to swap the checksum from disk because
1207 // it got swapped when the blhdr was read in.
1208 if (blhdr->binfo[i].b.cksum != 0 && disk_cksum != blhdr->binfo[i].b.cksum) {
1209 printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
1210 jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].b.cksum);
1211 printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
1212 *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)],
1213 *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]);
1214
1215 bad_blocks = 1;
1216 goto bad_txn_handling;
1217 }
1218 }
1219
1220
1221 // add this bucket to co_buf, coalescing where possible
1222 // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1223 ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].b.cksum, &num_buckets, &num_full);
1224
1225 if (ret_val == -1) {
1226 printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name);
1227 goto bad_replay;
1228 } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1229 }
1230
1231 // increment offset
1232 offset += size;
1233
1234 // check if the last block added puts us off the end of the jnl.
1235 // if so, we need to wrap to the beginning and take any remainder
1236 // into account
1237 //
1238 if (offset >= jnl->jhdr->size) {
1239 offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
1240 }
1241 }
1242
1243 if (block_ptr) {
1244 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1245 block_ptr = NULL;
1246 }
1247
1248 bad_txn_handling:
1249 if (bad_blocks) {
1250 if (txn_start_offset == 0) {
1251 printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
1252 goto bad_replay;
1253 }
1254
1255 jnl->jhdr->start = orig_jnl_start;
1256 jnl->jhdr->end = txn_start_offset;
1257 check_past_jnl_end = 0;
1258 last_sequence_num = 0;
1259 printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1260 goto restart_replay;
1261 }
1262
1263 jnl->jhdr->start += blhdr->bytes_used;
1264 if (jnl->jhdr->start >= jnl->jhdr->size) {
1265 // wrap around and skip the journal header block
1266 jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
1267 }
1268
1269 if (jnl->jhdr->start == jnl->jhdr->end) {
1270 in_uncharted_territory = 1;
1271 }
1272 }
1273
1274 if (jnl->jhdr->start != jnl->jhdr->end) {
1275 printf("jnl: %s: start %lld != end %lld. resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1276 jnl->jhdr->end = jnl->jhdr->start;
1277 }
1278
1279 //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1280
1281 /*
1282 * make sure it's at least one page in size, so
1283 * start max_bsize at PAGE_SIZE
1284 */
1285 for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {
1286
1287 if (co_buf[i].block_num == (off_t)-1)
1288 continue;
1289
1290 if (co_buf[i].block_size > max_bsize)
1291 max_bsize = co_buf[i].block_size;
1292 }
1293 /*
1294 * round max_bsize up to the nearest PAGE_SIZE multiple
1295 */
1296 if (max_bsize & (PAGE_SIZE - 1)) {
1297 max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1298 }
1299
1300 if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1301 goto bad_replay;
1302 }
1303
1304 // Replay the coalesced entries in the co-buf
1305 for(i=0; i < num_full; i++) {
1306 size_t size = co_buf[i].block_size;
1307 off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
1308 off_t number = co_buf[i].block_num;
1309
1310
1311 // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1312 // co_buf[i].block_size, co_buf[i].jnl_offset);
1313
1314 if (number == (off_t)-1) {
1315 // printf("jnl: replay_journal: skipping killed fs block\n");
1316 } else {
1317
1318 // do journal read, and set the phys. block
1319 ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
1320 if (ret != size) {
1321 printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
1322 goto bad_replay;
1323 }
1324
1325 if (update_fs_block(jnl, block_ptr, number, size) != 0) {
1326 goto bad_replay;
1327 }
1328 }
1329 }
1330
1331
1332 // done replaying; update jnl header
1333 if (write_journal_header(jnl) != 0) {
1334 goto bad_replay;
1335 }
1336
1337 printf("jnl: %s: journal replay done.\n", jnl->jdev_name);
1338
1339 // free block_ptr
1340 if (block_ptr) {
1341 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1342 block_ptr = NULL;
1343 }
1344
1345 // free the coalesce buffer
1346 FREE(co_buf, M_TEMP);
1347 co_buf = NULL;
1348
1349 kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1350 return 0;
1351
1352 bad_replay:
1353 if (block_ptr) {
1354 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1355 }
1356 if (co_buf) {
1357 FREE(co_buf, M_TEMP);
1358 }
1359 kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1360
1361 return -1;
1362 }
1363
1364
// Default / maximum in-memory transaction buffer sizes; the actual size
// is chosen (and clamped) per-journal in size_up_tbuffer().
#define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
//#define DEFAULT_TRANSACTION_BUFFER_SIZE  (256*1024)  // better performance but uses more mem
#define MAX_TRANSACTION_BUFFER_SIZE      (512*1024)

// XXXdbg - so I can change it in the debugger
// 0 means "not yet computed"; lazily initialized from mem_size in
// size_up_tbuffer().
int def_tbuffer_size = 0;
1371
1372
1373 //
1374 // This function sets the size of the tbuffer and the
1375 // size of the blhdr. It assumes that jnl->jhdr->size
1376 // and jnl->jhdr->jhdr_size are already valid.
1377 //
1378 static void
1379 size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
1380 {
1381 //
1382 // one-time initialization based on how much memory
1383 // there is in the machine.
1384 //
1385 if (def_tbuffer_size == 0) {
1386 if (mem_size < (256*1024*1024)) {
1387 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
1388 } else if (mem_size < (512*1024*1024)) {
1389 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
1390 } else if (mem_size < (1024*1024*1024)) {
1391 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
1392 } else if (mem_size >= (1024*1024*1024)) {
1393 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 4;
1394 }
1395 }
1396
1397 // size up the transaction buffer... can't be larger than the number
1398 // of blocks that can fit in a block_list_header block.
1399 if (tbuffer_size == 0) {
1400 jnl->tbuffer_size = def_tbuffer_size;
1401 } else {
1402 // make sure that the specified tbuffer_size isn't too small
1403 if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
1404 tbuffer_size = jnl->jhdr->blhdr_size * 2;
1405 }
1406 // and make sure it's an even multiple of the block size
1407 if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
1408 tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
1409 }
1410
1411 jnl->tbuffer_size = tbuffer_size;
1412 }
1413
1414 if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
1415 jnl->tbuffer_size = (jnl->jhdr->size / 2);
1416 }
1417
1418 if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
1419 jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
1420 }
1421
1422 jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
1423 if (jnl->jhdr->blhdr_size < phys_blksz) {
1424 jnl->jhdr->blhdr_size = phys_blksz;
1425 } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
1426 // have to round up so we're an even multiple of the physical block size
1427 jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
1428 }
1429 }
1430
1431
1432
1433 static void
1434 get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context)
1435 {
1436 off_t readblockcnt;
1437 off_t writeblockcnt;
1438 off_t readmaxcnt;
1439 off_t writemaxcnt;
1440 int32_t features;
1441
1442 if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
1443 if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
1444 const char *name = vnode_name(devvp);
1445 jnl->flags |= JOURNAL_DO_FUA_WRITES;
1446 printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features);
1447 }
1448 }
1449
1450 if (VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context)) {
1451 readmaxcnt = 0;
1452 }
1453
1454 if (readmaxcnt == 0) {
1455 if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context)) {
1456 readmaxcnt = 128 * 1024;
1457 } else {
1458 readmaxcnt = readblockcnt * phys_blksz;
1459 }
1460 }
1461
1462
1463 if (VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context)) {
1464 writemaxcnt = 0;
1465 }
1466
1467 if (writemaxcnt == 0) {
1468 if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context)) {
1469 writemaxcnt = 128 * 1024;
1470 } else {
1471 writemaxcnt = writeblockcnt * phys_blksz;
1472 }
1473 }
1474
1475 jnl->max_read_size = readmaxcnt;
1476 jnl->max_write_size = writemaxcnt;
1477
1478 // just in case it's still zero...
1479 if (jnl->max_read_size == 0) {
1480 jnl->max_read_size = 128 * 1024;
1481 jnl->max_write_size = 128 * 1024;
1482 }
1483 }
1484
1485
1486 static const char *
1487 get_jdev_name(struct vnode *jvp)
1488 {
1489 const char *jdev_name;
1490
1491 jdev_name = vnode_name(jvp);
1492 if (jdev_name == NULL) {
1493 jdev_name = vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0);
1494 } else {
1495 // this just bumps the refcount on the name so we have our own copy
1496 jdev_name = vfs_addname(jdev_name, strlen(jdev_name), 0, 0);
1497 }
1498
1499 return jdev_name;
1500 }
1501
1502
1503 journal *
1504 journal_create(struct vnode *jvp,
1505 off_t offset,
1506 off_t journal_size,
1507 struct vnode *fsvp,
1508 size_t min_fs_blksz,
1509 int32_t flags,
1510 int32_t tbuffer_size,
1511 void (*flush)(void *arg),
1512 void *arg)
1513 {
1514 journal *jnl;
1515 size_t phys_blksz;
1516 struct vfs_context context;
1517 const char *jdev_name;
1518
1519 context.vc_thread = current_thread();
1520 context.vc_ucred = FSCRED;
1521
1522 jdev_name = get_jdev_name(jvp);
1523
1524 /* Get the real physical block size. */
1525 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1526 return NULL;
1527 }
1528
1529 if (phys_blksz > min_fs_blksz) {
1530 printf("jnl: %s: create: error: phys blksize %lu bigger than min fs blksize %lu\n",
1531 jdev_name, phys_blksz, min_fs_blksz);
1532 return NULL;
1533 }
1534
1535 if ((journal_size % phys_blksz) != 0) {
1536 printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%lx\n",
1537 jdev_name, journal_size, phys_blksz);
1538 return NULL;
1539 }
1540
1541
1542 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1543 memset(jnl, 0, sizeof(*jnl));
1544
1545 jnl->jdev = jvp;
1546 jnl->jdev_offset = offset;
1547 jnl->fsdev = fsvp;
1548 jnl->flush = flush;
1549 jnl->flush_arg = arg;
1550 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
1551 jnl->jdev_name = jdev_name;
1552 lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1553
1554 get_io_info(jvp, phys_blksz, jnl, &context);
1555
1556 if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1557 printf("jnl: %s: create: could not allocate space for header buffer (%lu bytes)\n", jdev_name, phys_blksz);
1558 goto bad_kmem_alloc;
1559 }
1560
1561 memset(jnl->header_buf, 0, phys_blksz);
1562
1563 jnl->jhdr = (journal_header *)jnl->header_buf;
1564 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
1565 jnl->jhdr->endian = ENDIAN_MAGIC;
1566 jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself
1567 jnl->jhdr->end = phys_blksz;
1568 jnl->jhdr->size = journal_size;
1569 jnl->jhdr->jhdr_size = phys_blksz;
1570 size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1571
1572 jnl->active_start = jnl->jhdr->start;
1573
1574 // XXXdbg - for testing you can force the journal to wrap around
1575 // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
1576 // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);
1577
1578 jnl->jhdr->sequence_num = random() & 0x00ffffff;
1579
1580 lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
1581
1582 if (write_journal_header(jnl) != 0) {
1583 printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name);
1584 goto bad_write;
1585 }
1586
1587 return jnl;
1588
1589
1590 bad_write:
1591 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1592 bad_kmem_alloc:
1593 if (jdev_name) {
1594 vfs_removename(jdev_name);
1595 }
1596 jnl->jhdr = NULL;
1597 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1598 return NULL;
1599 }
1600
1601
1602 journal *
1603 journal_open(struct vnode *jvp,
1604 off_t offset,
1605 off_t journal_size,
1606 struct vnode *fsvp,
1607 size_t min_fs_blksz,
1608 int32_t flags,
1609 int32_t tbuffer_size,
1610 void (*flush)(void *arg),
1611 void *arg)
1612 {
1613 journal *jnl;
1614 int orig_blksz=0;
1615 size_t phys_blksz;
1616 int orig_checksum, checksum;
1617 struct vfs_context context;
1618 const char *jdev_name = get_jdev_name(jvp);
1619
1620 context.vc_thread = current_thread();
1621 context.vc_ucred = FSCRED;
1622
1623 /* Get the real physical block size. */
1624 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1625 return NULL;
1626 }
1627
1628 if (phys_blksz > min_fs_blksz) {
1629 printf("jnl: %s: open: error: phys blksize %lu bigger than min fs blksize %lu\n",
1630 jdev_name, phys_blksz, min_fs_blksz);
1631 return NULL;
1632 }
1633
1634 if ((journal_size % phys_blksz) != 0) {
1635 printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%lx\n",
1636 jdev_name, journal_size, phys_blksz);
1637 return NULL;
1638 }
1639
1640 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1641 memset(jnl, 0, sizeof(*jnl));
1642
1643 jnl->jdev = jvp;
1644 jnl->jdev_offset = offset;
1645 jnl->fsdev = fsvp;
1646 jnl->flush = flush;
1647 jnl->flush_arg = arg;
1648 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
1649 jnl->jdev_name = jdev_name;
1650 lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1651
1652 get_io_info(jvp, phys_blksz, jnl, &context);
1653
1654 if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1655 printf("jnl: %s: create: could not allocate space for header buffer (%lu bytes)\n", jdev_name, phys_blksz);
1656 goto bad_kmem_alloc;
1657 }
1658
1659 jnl->jhdr = (journal_header *)jnl->header_buf;
1660 memset(jnl->jhdr, 0, sizeof(journal_header));
1661
1662 // we have to set this up here so that do_journal_io() will work
1663 jnl->jhdr->jhdr_size = phys_blksz;
1664
1665 if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
1666 printf("jnl: %s: open: could not read %lu bytes for the journal header.\n",
1667 jdev_name, phys_blksz);
1668 goto bad_journal;
1669 }
1670
1671 orig_checksum = jnl->jhdr->checksum;
1672 jnl->jhdr->checksum = 0;
1673
1674 if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1675 // do this before the swap since it's done byte-at-a-time
1676 orig_checksum = SWAP32(orig_checksum);
1677 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1678 swap_journal_header(jnl);
1679 jnl->flags |= JOURNAL_NEED_SWAP;
1680 } else {
1681 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1682 }
1683
1684 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1685 printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n",
1686 jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
1687 goto bad_journal;
1688 }
1689
1690 // only check if we're the current journal header magic value
1691 if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
1692
1693 if (orig_checksum != checksum) {
1694 printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n",
1695 jdev_name, orig_checksum, checksum);
1696
1697 //goto bad_journal;
1698 }
1699 }
1700
1701 // XXXdbg - convert old style magic numbers to the new one
1702 if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
1703 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
1704 }
1705
1706 if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
1707 /*
1708 * The volume has probably been resized (such that we had to adjust the
1709 * logical sector size), or copied to media with a different logical
1710 * sector size. If the journal is empty, then just switch to the
1711 * current logical sector size. If the journal is not empty, then
1712 * fail to open the journal.
1713 */
1714
1715 if (jnl->jhdr->start == jnl->jhdr->end) {
1716 int err;
1717 printf("jnl: %s: open: changing journal header size from %d to %lu\n",
1718 jdev_name, jnl->jhdr->jhdr_size, phys_blksz);
1719 jnl->jhdr->jhdr_size = phys_blksz;
1720 if (write_journal_header(jnl)) {
1721 printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
1722 goto bad_journal;
1723 }
1724 } else {
1725 printf("jnl: %s: open: phys_blksz %lu does not match journal header size %d, and journal is not empty!\n",
1726 jdev_name, phys_blksz, jnl->jhdr->jhdr_size);
1727 goto bad_journal;
1728 }
1729 }
1730
1731 if ( jnl->jhdr->start <= 0
1732 || jnl->jhdr->start > jnl->jhdr->size
1733 || jnl->jhdr->start > 1024*1024*1024) {
1734 printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
1735 jdev_name, jnl->jhdr->start, jnl->jhdr->size);
1736 goto bad_journal;
1737 }
1738
1739 if ( jnl->jhdr->end <= 0
1740 || jnl->jhdr->end > jnl->jhdr->size
1741 || jnl->jhdr->end > 1024*1024*1024) {
1742 printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
1743 jdev_name, jnl->jhdr->end, jnl->jhdr->size);
1744 goto bad_journal;
1745 }
1746
1747 if (jnl->jhdr->size > 1024*1024*1024) {
1748 printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size);
1749 goto bad_journal;
1750 }
1751
1752 // XXXdbg - can't do these checks because hfs writes all kinds of
1753 // non-uniform sized blocks even on devices that have a block size
1754 // that is larger than 512 bytes (i.e. optical media w/2k blocks).
1755 // therefore these checks will fail and so we just have to punt and
1756 // do more relaxed checking...
1757 // XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
1758 if ((jnl->jhdr->start % 512) != 0) {
1759 printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n",
1760 jdev_name, jnl->jhdr->start);
1761 goto bad_journal;
1762 }
1763
1764 //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
1765 if ((jnl->jhdr->end % 512) != 0) {
1766 printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
1767 jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size);
1768 goto bad_journal;
1769 }
1770
1771 // take care of replaying the journal if necessary
1772 if (flags & JOURNAL_RESET) {
1773 printf("jnl: %s: journal start/end pointers reset! (jnl %p; s 0x%llx e 0x%llx)\n",
1774 jdev_name, jnl, jnl->jhdr->start, jnl->jhdr->end);
1775 jnl->jhdr->start = jnl->jhdr->end;
1776 } else if (replay_journal(jnl) != 0) {
1777 printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
1778 goto bad_journal;
1779 }
1780
1781 if (orig_blksz != 0) {
1782 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
1783 phys_blksz = orig_blksz;
1784 if (orig_blksz < jnl->jhdr->jhdr_size) {
1785 printf("jnl: %s: open: jhdr_size is %d but orig phys blk size is %d. switching.\n",
1786 jdev_name, jnl->jhdr->jhdr_size, orig_blksz);
1787
1788 jnl->jhdr->jhdr_size = orig_blksz;
1789 }
1790 }
1791
1792 // make sure this is in sync!
1793 jnl->active_start = jnl->jhdr->start;
1794
1795 // set this now, after we've replayed the journal
1796 size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1797
1798 lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
1799
1800 return jnl;
1801
1802 bad_journal:
1803 if (orig_blksz != 0) {
1804 phys_blksz = orig_blksz;
1805 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
1806 }
1807 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1808 bad_kmem_alloc:
1809 if (jdev_name) {
1810 vfs_removename(jdev_name);
1811 }
1812 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1813 return NULL;
1814 }
1815
1816
1817 int
1818 journal_is_clean(struct vnode *jvp,
1819 off_t offset,
1820 off_t journal_size,
1821 struct vnode *fsvp,
1822 size_t min_fs_block_size)
1823 {
1824 journal jnl;
1825 int phys_blksz, ret;
1826 int orig_checksum, checksum;
1827 struct vfs_context context;
1828 const char *jdev_name = get_jdev_name(jvp);
1829
1830 context.vc_thread = current_thread();
1831 context.vc_ucred = FSCRED;
1832
1833 /* Get the real physical block size. */
1834 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1835 printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
1836 return EINVAL;
1837 }
1838
1839 if (phys_blksz > (int)min_fs_block_size) {
1840 printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %lu\n",
1841 jdev_name, phys_blksz, min_fs_block_size);
1842 return EINVAL;
1843 }
1844
1845 if ((journal_size % phys_blksz) != 0) {
1846 printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1847 jdev_name, journal_size, phys_blksz);
1848 return EINVAL;
1849 }
1850
1851 memset(&jnl, 0, sizeof(jnl));
1852
1853 if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
1854 printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
1855 return ENOMEM;
1856 }
1857
1858 get_io_info(jvp, phys_blksz, &jnl, &context);
1859
1860 jnl.jhdr = (journal_header *)jnl.header_buf;
1861 memset(jnl.jhdr, 0, sizeof(journal_header));
1862
1863 jnl.jdev = jvp;
1864 jnl.jdev_offset = offset;
1865 jnl.fsdev = fsvp;
1866
1867 // we have to set this up here so that do_journal_io() will work
1868 jnl.jhdr->jhdr_size = phys_blksz;
1869
1870 if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
1871 printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n",
1872 jdev_name, phys_blksz);
1873 ret = EINVAL;
1874 goto get_out;
1875 }
1876
1877 orig_checksum = jnl.jhdr->checksum;
1878 jnl.jhdr->checksum = 0;
1879
1880 if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1881 // do this before the swap since it's done byte-at-a-time
1882 orig_checksum = SWAP32(orig_checksum);
1883 checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1884 swap_journal_header(&jnl);
1885 jnl.flags |= JOURNAL_NEED_SWAP;
1886 } else {
1887 checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1888 }
1889
1890 if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1891 printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n",
1892 jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
1893 ret = EINVAL;
1894 goto get_out;
1895 }
1896
1897 if (orig_checksum != checksum) {
1898 printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum);
1899 ret = EINVAL;
1900 goto get_out;
1901 }
1902
1903 //
1904 // if the start and end are equal then the journal is clean.
1905 // otherwise it's not clean and therefore an error.
1906 //
1907 if (jnl.jhdr->start == jnl.jhdr->end) {
1908 ret = 0;
1909 } else {
1910 ret = EINVAL;
1911 }
1912
1913 get_out:
1914 kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
1915 if (jdev_name) {
1916 vfs_removename(jdev_name);
1917 }
1918
1919 return ret;
1920
1921
1922 }
1923
1924
/*
 * journal_close - flush and tear down a journal.
 *
 * If the journal is still valid: end any active transaction, force out
 * any buffered (group-commit) transaction, wait for the async writes to
 * drain so active_start catches up to the journal end, then write a
 * final header so the on-disk journal is marked clean (start == end).
 * If the journal has been marked invalid, outstanding transactions are
 * aborted instead of flushed.  In all cases the journal's memory is
 * released before returning; the caller must not use jnl afterwards.
 */
void
journal_close(journal *jnl)
{
    volatile off_t *start, *end;
    int counter=0;

    CHECK_JOURNAL(jnl);

    // set this before doing anything that would block so that
    // we start tearing things down properly.
    //
    jnl->flags |= JOURNAL_CLOSE_PENDING;

    if (jnl->owner != current_thread()) {
        lock_journal(jnl);
    }

    //
    // only write stuff to disk if the journal is still valid
    //
    if ((jnl->flags & JOURNAL_INVALID) == 0) {

        if (jnl->active_tr) {
            journal_end_transaction(jnl);
        }

        // flush any buffered transactions
        if (jnl->cur_tr) {
            transaction *tr = jnl->cur_tr;

            jnl->cur_tr = NULL;
            end_transaction(tr, 1, NULL, NULL); // force it to get flushed
        }

        //start = &jnl->jhdr->start;
        start = &jnl->active_start;
        end = &jnl->jhdr->end;

        // spin until the completed async writes advance active_start up
        // to the journal end, kicking the fs flush routine and napping
        // between checks; counter bounds the wait so we can't hang here
        // forever if something is stuck.
        while (*start != *end && counter++ < 5000) {
            //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
            if (jnl->flush) {
                jnl->flush(jnl->flush_arg);
            }
            tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 2);
        }

        if (*start != *end) {
            printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
                   jnl->jdev_name, *start, *end);
        }

        // make sure this is in sync when we close the journal
        jnl->jhdr->start = jnl->active_start;

        // if this fails there's not much we can do at this point...
        write_journal_header(jnl);
    } else {
        // if we're here the journal isn't valid any more.
        // so make sure we don't leave any locked blocks lying around
        printf("jnl: %s: close: journal %p, is invalid. aborting outstanding transactions\n", jnl->jdev_name, jnl);
        if (jnl->active_tr || jnl->cur_tr) {
            transaction *tr;
            if (jnl->active_tr) {
                tr = jnl->active_tr;
                jnl->active_tr = NULL;
            } else {
                tr = jnl->cur_tr;
                jnl->cur_tr = NULL;
            }

            abort_transaction(jnl, tr);
            // abort_transaction must have consumed the transaction;
            // there should be nothing left at this point.
            if (jnl->active_tr || jnl->cur_tr) {
                panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl);
            }
        }
    }

    free_old_stuff(jnl);

    // free the header buffer and poison the jhdr pointer so any
    // use-after-close faults recognizably (0xbeefbabe).
    kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size);
    jnl->jhdr = (void *)0xbeefbabe;

    if (jnl->jdev_name) {
        vfs_removename(jnl->jdev_name);
    }

    FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
}
2013
2014 static void
2015 dump_journal(journal *jnl)
2016 {
2017 transaction *ctr;
2018
2019 printf("journal for dev %s:", jnl->jdev_name);
2020 printf(" jdev_offset %.8llx\n", jnl->jdev_offset);
2021 printf(" magic: 0x%.8x\n", jnl->jhdr->magic);
2022 printf(" start: 0x%.8llx\n", jnl->jhdr->start);
2023 printf(" end: 0x%.8llx\n", jnl->jhdr->end);
2024 printf(" size: 0x%.8llx\n", jnl->jhdr->size);
2025 printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size);
2026 printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size);
2027 printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum);
2028
2029 printf(" completed transactions:\n");
2030 for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) {
2031 printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
2032 }
2033 }
2034
2035
2036
2037 static off_t
2038 free_space(journal *jnl)
2039 {
2040 off_t free_space_offset;
2041
2042 if (jnl->jhdr->start < jnl->jhdr->end) {
2043 free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
2044 } else if (jnl->jhdr->start > jnl->jhdr->end) {
2045 free_space_offset = jnl->jhdr->start - jnl->jhdr->end;
2046 } else {
2047 // journal is completely empty
2048 free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size;
2049 }
2050
2051 return free_space_offset;
2052 }
2053
2054
//
// The journal must be locked on entry to this function.
// The "desired_size" is in bytes.
//
// Blocks until at least desired_size bytes are free in the on-disk
// journal, lazily advancing jnl->jhdr->start past transactions whose
// async writes have completed (recorded in jnl->old_start[]).  Returns
// 0 on success, ENOSPC if we give up waiting.
//
static int
check_free_space(journal *jnl, int desired_size)
{
    size_t i;
    int counter=0;

    //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
    //     desired_size, free_space(jnl));

    while (1) {
        int old_start_empty;

        // after ~5000 iterations assume buffer flushing is wedged
        if (counter++ == 5000) {
            dump_journal(jnl);
            panic("jnl: check_free_space: buffer flushing isn't working "
                  "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl,
                  jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
        }
        // NOTE(review): this path is only reachable if panic() can
        // return (e.g. debug configurations) — confirm intent.
        if (counter > 7500) {
            printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name);
            return ENOSPC;
        }

        // make sure there's space in the journal to hold this transaction
        if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) {
            break;
        }
        //
        // here's where we lazily bump up jnl->jhdr->start. we'll consume
        // entries until there is enough space for the next transaction.
        //
        old_start_empty = 1;
        lock_oldstart(jnl);
        for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
            int lcl_counter;

            lcl_counter = 0;
            // the high bit marks a transaction whose async writes are
            // still in flight; drop the lock, kick the fs flush routine,
            // and wait for buffer_flushed_callback to clear the bit.
            while (jnl->old_start[i] & 0x8000000000000000LL) {
                if (lcl_counter++ > 1000) {
                    panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
                          jnl->old_start[i], jnl);
                }

                unlock_oldstart(jnl);
                if (jnl->flush) {
                    jnl->flush(jnl->flush_arg);
                }
                tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
                lock_oldstart(jnl);
            }

            if (jnl->old_start[i] == 0) {
                continue;
            }

            // this entry's writes are done: reclaim its space by
            // moving the on-disk start up to where it began.
            old_start_empty = 0;
            jnl->jhdr->start = jnl->old_start[i];
            jnl->old_start[i] = 0;
            if (free_space(jnl) > desired_size) {
                // enough space now; persist the new start
                // (header i/o is done without the oldstart lock held)
                unlock_oldstart(jnl);
                write_journal_header(jnl);
                lock_oldstart(jnl);
                break;
            }
        }
        unlock_oldstart(jnl);

        // if we bumped the start, loop and try again
        if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
            continue;
        } else if (old_start_empty) {
            //
            // if there is nothing in old_start anymore then we can
            // bump the jhdr->start to be the same as active_start
            // since it is possible there was only one very large
            // transaction in the old_start array. if we didn't do
            // this then jhdr->start would never get updated and we
            // would wind up looping until we hit the panic at the
            // start of the loop.
            //
            jnl->jhdr->start = jnl->active_start;
            write_journal_header(jnl);
            continue;
        }


        // if the file system gave us a flush function, call it to so that
        // it can flush some blocks which hopefully will cause some transactions
        // to complete and thus free up space in the journal.
        if (jnl->flush) {
            jnl->flush(jnl->flush_arg);
        }

        // wait for a while to avoid being cpu-bound (this will
        // put us to sleep for 10 milliseconds)
        tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
    }

    return 0;
}
2159
2160 /*
2161 * Allocate a new active transaction.
2162 */
2163 static errno_t
2164 journal_allocate_transaction(journal *jnl)
2165 {
2166 transaction *tr;
2167
2168 MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
2169 memset(tr, 0, sizeof(transaction));
2170
2171 tr->tbuffer_size = jnl->tbuffer_size;
2172
2173 if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
2174 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
2175 jnl->active_tr = NULL;
2176 return ENOMEM;
2177 }
2178
2179 // journal replay code checksum check depends on this.
2180 memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
2181 // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
2182 memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2183
2184 tr->blhdr = (block_list_header *)tr->tbuffer;
2185 tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2186 tr->blhdr->num_blocks = 1; // accounts for this header block
2187 tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
2188 tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER;
2189
2190 tr->sequence_num = ++jnl->jhdr->sequence_num;
2191 tr->num_blhdrs = 1;
2192 tr->total_bytes = jnl->jhdr->blhdr_size;
2193 tr->jnl = jnl;
2194
2195 jnl->active_tr = tr;
2196
2197 return 0;
2198 }
2199
/*
 * Begin a transaction.  If the calling thread already owns the journal
 * this just nests (bumps nested_count); otherwise it takes the journal
 * lock, ensures there is room in the on-disk journal, and installs an
 * active transaction (reusing a buffered group-commit transaction when
 * one exists).  On success the journal lock is held until the matching
 * journal_end_transaction().  Returns 0, EINVAL if the journal is
 * invalid, or ENOSPC if space could not be freed.
 */
int
journal_start_transaction(journal *jnl)
{
    int ret;

    CHECK_JOURNAL(jnl);

    if (jnl->flags & JOURNAL_INVALID) {
        return EINVAL;
    }

    // nested transaction: the current thread already holds the
    // journal lock, so just bump the nesting count.
    if (jnl->owner == current_thread()) {
        if (jnl->active_tr == NULL) {
            panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
                  jnl, jnl->owner, current_thread());
        }
        jnl->nested_count++;
        return 0;
    }

    lock_journal(jnl);

    // with the lock held there must be no owner and no active tr
    if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
        panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
              jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
    }

    jnl->owner = current_thread();
    jnl->nested_count = 1;

    free_old_stuff(jnl);

    // make sure there's room in the journal
    if (free_space(jnl) < jnl->tbuffer_size) {
        // this is the call that really waits for space to free up
        // as well as updating jnl->jhdr->start
        if (check_free_space(jnl, jnl->tbuffer_size) != 0) {
            printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name);
            ret = ENOSPC;
            goto bad_start;
        }
    }

    // if there's a buffered transaction, use it.
    if (jnl->cur_tr) {
        jnl->active_tr = jnl->cur_tr;
        jnl->cur_tr = NULL;

        return 0;   // lock intentionally stays held until journal_end_transaction()
    }

    ret = journal_allocate_transaction(jnl);
    if (ret) {
        goto bad_start;
    }

    // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);

    return 0;

bad_start:
    // undo the ownership we claimed above and release the lock
    jnl->owner = NULL;
    jnl->nested_count = 0;
    unlock_journal(jnl);
    return ret;
}
2266
2267
/*
 * Declare intent to modify the meta-data block bp within the current
 * transaction.  The caller must be the thread that owns the journal
 * (i.e. inside journal_start/end_transaction).  If the buffer is dirty
 * from a previous transaction it is written out first, then the buffer
 * is marked B_LOCKED to pin it in the cache until the transaction
 * completes.  Returns 0 on success, EINVAL if the journal is invalid.
 */
int
journal_modify_block_start(journal *jnl, struct buf *bp)
{
    transaction *tr;

    CHECK_JOURNAL(jnl);

    if (jnl->flags & JOURNAL_INVALID) {
        return EINVAL;
    }

    // XXXdbg - for debugging I want this to be true. later it may
    // not be necessary.
    if ((buf_flags(bp) & B_META) == 0) {
        panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl);
    }

    tr = jnl->active_tr;
    CHECK_TRANSACTION(tr);

    if (jnl->owner != current_thread()) {
        panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
              jnl, jnl->owner, current_thread());
    }

    free_old_stuff(jnl);

    //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
    // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);

    // can't allow blocks that aren't an even multiple of the
    // underlying block size.
    if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) {
        panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
              buf_size(bp), jnl->jhdr->jhdr_size);
        return -1;   // not reached when panic() is fatal
    }

    // make sure that this transaction isn't bigger than the whole journal
    if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
        panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
              tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp);
        return -1;   // not reached when panic() is fatal
    }

    // if the block is dirty and not already locked we have to write
    // it out before we muck with it because it has data that belongs
    // (presumably) to another transaction.
    //
    if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) {

        if (buf_flags(bp) & B_ASYNC) {
            panic("modify_block_start: bp @ %p has async flag set!\n", bp);
        }

        // this will cause it to not be buf_brelse()'d
        buf_setflags(bp, B_NORELSE);
        VNOP_BWRITE(bp);
    }
    // pin the buffer until the transaction ends
    buf_setflags(bp, B_LOCKED);

    return 0;
}
2331
/*
 * Abort the pending modification of bp: release the buffer without
 * adding it to the transaction.  If modify_block_end was never called
 * for bp in the current transaction, the B_LOCKED pin is also dropped;
 * otherwise the buffer stays locked because its contents are already
 * part of the transaction.  Returns 0, or EINVAL if the journal is
 * invalid.
 */
int
journal_modify_block_abort(journal *jnl, struct buf *bp)
{
    transaction *tr;
    block_list_header *blhdr;
    int i;

    CHECK_JOURNAL(jnl);

    tr = jnl->active_tr;

    //
    // if there's no active transaction then we just want to
    // call buf_brelse() and return since this is just a block
    // that happened to be modified as part of another tr.
    //
    if (tr == NULL) {
        buf_brelse(bp);
        return 0;
    }

    if (jnl->flags & JOURNAL_INVALID) {
        // NOTE(review): bp is not released on this path — confirm
        // callers account for the outstanding reference.
        return EINVAL;
    }

    CHECK_TRANSACTION(tr);

    if (jnl->owner != current_thread()) {
        panic("jnl: modify_block_abort: called w/out a transaction! jnl %p, owner %p, curact %p\n",
              jnl, jnl->owner, current_thread());
    }

    free_old_stuff(jnl);

    // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);

    // first check if it's already part of this transaction
    // (block-list headers are chained through binfo[0].bnum)
    for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
        for(i=1; i < blhdr->num_blocks; i++) {
            if (bp == blhdr->binfo[i].b.bp) {
                if (buf_size(bp) != blhdr->binfo[i].bsize) {
                    panic("jnl: bp @ %p changed size on me! (%d vs. %lu, jnl %p)\n",
                          bp, buf_size(bp), blhdr->binfo[i].bsize, jnl);
                }
                break;
            }
        }

        if (i < blhdr->num_blocks) {
            break;
        }
    }

    //
    // if blhdr is null, then this block has only had modify_block_start
    // called on it as part of the current transaction. that means that
    // it is ok to clear the LOCKED bit since it hasn't actually been
    // modified. if blhdr is non-null then modify_block_end was called
    // on it and so we need to keep it locked in memory.
    //
    if (blhdr == NULL) {
        buf_clearflags(bp, B_LOCKED);
    }

    buf_brelse(bp);
    return 0;
}
2399
2400
2401 int
2402 journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf *bp, void *arg), void *arg)
2403 {
2404 int i = 1;
2405 int tbuffer_offset=0;
2406 char *blkptr;
2407 block_list_header *blhdr, *prev=NULL;
2408 transaction *tr;
2409
2410 CHECK_JOURNAL(jnl);
2411
2412 if (jnl->flags & JOURNAL_INVALID) {
2413 return EINVAL;
2414 }
2415
2416 tr = jnl->active_tr;
2417 CHECK_TRANSACTION(tr);
2418
2419 if (jnl->owner != current_thread()) {
2420 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2421 jnl, jnl->owner, current_thread());
2422 }
2423
2424 free_old_stuff(jnl);
2425
2426 //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n",
2427 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2428
2429 if ((buf_flags(bp) & B_LOCKED) == 0) {
2430 panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp, jnl);
2431 }
2432
2433 // first check if it's already part of this transaction
2434 for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2435 tbuffer_offset = jnl->jhdr->blhdr_size;
2436
2437 for(i=1; i < blhdr->num_blocks; i++) {
2438 if (bp == blhdr->binfo[i].b.bp) {
2439 if (buf_size(bp) != blhdr->binfo[i].bsize) {
2440 panic("jnl: bp @ %p changed size on me! (%d vs. %lu, jnl %p)\n",
2441 bp, buf_size(bp), blhdr->binfo[i].bsize, jnl);
2442 }
2443 break;
2444 }
2445 tbuffer_offset += blhdr->binfo[i].bsize;
2446 }
2447
2448 if (i < blhdr->num_blocks) {
2449 break;
2450 }
2451 }
2452
2453 if (blhdr == NULL
2454 && prev
2455 && (prev->num_blocks+1) <= prev->max_blocks
2456 && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) {
2457 blhdr = prev;
2458 } else if (blhdr == NULL) {
2459 block_list_header *nblhdr;
2460
2461 if (prev == NULL) {
2462 panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp);
2463 }
2464
2465 // we got to the end of the list, didn't find the block and there's
2466 // no room in the block_list_header pointed to by prev
2467
2468 // we allocate another tbuffer and link it in at the end of the list
2469 // through prev->binfo[0].bnum. that's a skanky way to do things but
2470 // avoids having yet another linked list of small data structures to manage.
2471
2472 if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
2473 panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n",
2474 tr, tr->total_bytes);
2475 }
2476
2477 // journal replay code checksum check depends on this.
2478 memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
2479 // Fill up the rest of the block with unimportant bytes
2480 memset(nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2481
2482 // initialize the new guy
2483 nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2484 nblhdr->num_blocks = 1; // accounts for this header block
2485 nblhdr->bytes_used = jnl->jhdr->blhdr_size;
2486 nblhdr->flags = BLHDR_CHECK_CHECKSUMS;
2487
2488 tr->num_blhdrs++;
2489 tr->total_bytes += jnl->jhdr->blhdr_size;
2490
2491 // then link him in at the end
2492 prev->binfo[0].bnum = (off_t)((long)nblhdr);
2493
2494 // and finally switch to using the new guy
2495 blhdr = nblhdr;
2496 tbuffer_offset = jnl->jhdr->blhdr_size;
2497 i = 1;
2498 }
2499
2500
2501 if ((i+1) > blhdr->max_blocks) {
2502 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
2503 }
2504
2505 // if the function pointer is not set then copy the
2506 // block of data now. if the function pointer is set
2507 // the copy will happen after calling the callback in
2508 // end_transaction() just before it goes to disk.
2509 //
2510 if (func == NULL) {
2511 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
2512 memcpy(blkptr, (char *)0 + buf_dataptr(bp), buf_size(bp));
2513 }
2514
2515 // if this is true then this is a new block we haven't seen
2516 if (i >= blhdr->num_blocks) {
2517 int bsize;
2518 vnode_t vp;
2519
2520 vp = buf_vnode(bp);
2521 vnode_ref(vp);
2522 bsize = buf_size(bp);
2523
2524 blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp));
2525 blhdr->binfo[i].bsize = bsize;
2526 blhdr->binfo[i].b.bp = bp;
2527 if (func) {
2528 void *old_func=NULL, *old_arg=NULL;
2529
2530 buf_setfilter(bp, func, arg, &old_func, &old_arg);
2531 if (old_func != NULL) {
2532 panic("jnl: modify_block_end: old func %p / arg %p", old_func, old_arg);
2533 }
2534 }
2535
2536 blhdr->bytes_used += bsize;
2537 tr->total_bytes += bsize;
2538
2539 blhdr->num_blocks++;
2540 }
2541 buf_bdwrite(bp);
2542
2543 return 0;
2544 }
2545
2546 int
2547 journal_kill_block(journal *jnl, struct buf *bp)
2548 {
2549 int i;
2550 int bflags;
2551 block_list_header *blhdr;
2552 transaction *tr;
2553
2554 CHECK_JOURNAL(jnl);
2555
2556 if (jnl->flags & JOURNAL_INVALID) {
2557 return EINVAL;
2558 }
2559
2560 tr = jnl->active_tr;
2561 CHECK_TRANSACTION(tr);
2562
2563 if (jnl->owner != current_thread()) {
2564 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2565 jnl, jnl->owner, current_thread());
2566 }
2567
2568 free_old_stuff(jnl);
2569
2570 bflags = buf_flags(bp);
2571
2572 if ( !(bflags & B_LOCKED))
2573 panic("jnl: modify_block_end: called with bp not B_LOCKED");
2574
2575 /*
2576 * bp must be BL_BUSY and B_LOCKED
2577 */
2578 // first check if it's already part of this transaction
2579 for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2580
2581 for(i=1; i < blhdr->num_blocks; i++) {
2582 if (bp == blhdr->binfo[i].b.bp) {
2583 vnode_t vp;
2584
2585 buf_clearflags(bp, B_LOCKED);
2586
2587 // this undoes the vnode_ref() in journal_modify_block_end()
2588 vp = buf_vnode(bp);
2589 vnode_rele_ext(vp, 0, 1);
2590
2591 // if the block has the DELWRI and FILTER bits sets, then
2592 // things are seriously weird. if it was part of another
2593 // transaction then journal_modify_block_start() should
2594 // have force it to be written.
2595 //
2596 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
2597 // panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
2598 //} else {
2599 tr->num_killed += buf_size(bp);
2600 //}
2601 blhdr->binfo[i].b.bp = NULL;
2602 blhdr->binfo[i].bnum = (off_t)-1;
2603
2604 buf_markinvalid(bp);
2605 buf_brelse(bp);
2606
2607 break;
2608 }
2609 }
2610
2611 if (i < blhdr->num_blocks) {
2612 break;
2613 }
2614 }
2615
2616 return 0;
2617 }
2618
2619
2620 static int
2621 journal_binfo_cmp(const void *a, const void *b)
2622 {
2623 const block_info *bi_a = (const struct block_info *)a;
2624 const block_info *bi_b = (const struct block_info *)b;
2625 daddr64_t res;
2626
2627 if (bi_a->b.bp == NULL) {
2628 return 1;
2629 }
2630 if (bi_b->b.bp == NULL) {
2631 return -1;
2632 }
2633
2634 // don't have to worry about negative block
2635 // numbers so this is ok to do.
2636 //
2637 res = (buf_blkno(bi_a->b.bp) - buf_blkno(bi_b->b.bp));
2638
2639 return (int)res;
2640 }
2641
2642
2643 /*
2644 * End a transaction. If the transaction is small enough, and we're not forcing
2645 * a write to disk, the "active" transaction becomes the "current" transaction,
2646 * and will be reused for the next transaction that is started (group commit).
2647 *
2648 * If the transaction gets written to disk (because force_it is true, or no
2649 * group commit, or the transaction is sufficiently full), the blocks get
2650 * written into the journal first, then the are written asynchronously. When
2651 * those async writes complete, the transaction can be freed and removed from
2652 * the journal.
2653 *
2654 * An optional callback can be supplied. If given, it is called after the
 * blocks have been written to the journal, but before the async writes
2656 * of those blocks to their normal on-disk locations. This is used by
2657 * journal_relocate so that the location of the journal can be changed and
2658 * flushed to disk before the blocks get written to their normal locations.
2659 * Note that the callback is only called if the transaction gets written to
2660 * the journal during this end_transaction call; you probably want to set the
2661 * force_it flag.
2662 *
2663 * Inputs:
2664 * tr Transaction to add to the journal
2665 * force_it If true, force this transaction to the on-disk journal immediately.
2666 * callback See description above. Pass NULL for no callback.
2667 * callback_arg Argument passed to callback routine.
2668 *
2669 * Result
2670 * 0 No errors
2671 * -1 An error occurred. The journal is marked invalid.
2672 */
2673 static int
2674 end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg)
2675 {
2676 int i, ret, amt;
2677 errno_t errno;
2678 off_t end;
2679 journal *jnl = tr->jnl;
2680 struct buf *bp, **bparray;
2681 block_list_header *blhdr=NULL, *next=NULL;
2682 size_t tbuffer_offset;
2683
2684 if (jnl->cur_tr) {
2685 panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
2686 jnl, jnl->cur_tr, tr);
2687 }
2688
2689 // if there weren't any modified blocks in the transaction
2690 // just save off the transaction pointer and return.
2691 if (tr->total_bytes == jnl->jhdr->blhdr_size) {
2692 jnl->cur_tr = tr;
2693 return 0;
2694 }
2695
2696 // if our transaction buffer isn't very full, just hang
2697 // on to it and don't actually flush anything. this is
2698 // what is known as "group commit". we will flush the
2699 // transaction buffer if it's full or if we have more than
2700 // one of them so we don't start hogging too much memory.
2701 //
2702 if ( force_it == 0
2703 && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
2704 && tr->num_blhdrs < 3
2705 && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) {
2706
2707 jnl->cur_tr = tr;
2708 return 0;
2709 }
2710
2711
2712 // if we're here we're going to flush the transaction buffer to disk.
2713 // make sure there is room in the journal first.
2714 check_free_space(jnl, tr->total_bytes);
2715
2716 // range check the end index
2717 if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
2718 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
2719 jnl->jhdr->end, jnl->jhdr->size);
2720 }
2721
2722 // this transaction starts where the current journal ends
2723 tr->journal_start = jnl->jhdr->end;
2724 end = jnl->jhdr->end;
2725
2726 //
2727 // if the first entry in old_start[] isn't free yet, loop calling the
2728 // file system flush routine until it is (or we panic).
2729 //
2730 i = 0;
2731 lock_oldstart(jnl);
2732 while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
2733 if (jnl->flush) {
2734 unlock_oldstart(jnl);
2735
2736 if (jnl->flush) {
2737 jnl->flush(jnl->flush_arg);
2738 }
2739
2740 // yield the cpu so others can get in to clear the lock bit
2741 (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);
2742
2743 lock_oldstart(jnl);
2744 }
2745 if (i++ >= 500) {
2746 panic("jnl: transaction that started at 0x%llx is not completing! jnl %p\n",
2747 jnl->old_start[0] & (~0x8000000000000000LL), jnl);
2748 }
2749 }
2750
2751 //
2752 // slide everyone else down and put our latest guy in the last
2753 // entry in the old_start array
2754 //
2755 memcpy(&jnl->old_start[0], &jnl->old_start[1], sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
2756 jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
2757
2758 unlock_oldstart(jnl);
2759
2760
2761 // for each block, make sure that the physical block # is set
2762 for(blhdr=tr->blhdr; blhdr; blhdr=next) {
2763 char *blkptr;
2764
2765 tbuffer_offset = jnl->jhdr->blhdr_size;
2766 for(i=1; i < blhdr->num_blocks; i++) {
2767 daddr64_t blkno;
2768 daddr64_t lblkno;
2769 struct vnode *vp;
2770
2771 bp = blhdr->binfo[i].b.bp;
2772
2773 // if this block has a callback function set, call
2774 // it now and then copy the data from the bp into
2775 // the journal.
2776 if (bp) {
2777 void (*func)(struct buf *, void *);
2778 void *arg;
2779
2780 buf_setfilter(bp, NULL, NULL, (void **)&func, &arg);
2781
2782 if (func) {
2783 // acquire the bp here so that we can safely
2784 // mess around with its data. buf_acquire()
2785 // will return EAGAIN if the buffer was busy,
2786 // so loop trying again.
2787 do {
2788 errno = buf_acquire(bp, 0, 0, 0);
2789 } while (errno == EAGAIN);
2790
2791 if (errno == 0) {
2792
2793 // call the hook function and then copy the
2794 // data into the transaction buffer...
2795 func(bp, arg);
2796
2797 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
2798 memcpy(blkptr, (char *)buf_dataptr(bp), buf_size(bp));
2799
2800 buf_drop(bp);
2801 } else {
2802 panic("could not acquire bp %p (err %d)\n", bp, errno);
2803 }
2804 }
2805
2806 } else { // bp == NULL, only true if a block was "killed"
2807 if (blhdr->binfo[i].bnum != (off_t)-1) {
2808 panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
2809 blhdr->binfo[i].bnum, jnl, tr);
2810 }
2811
2812 tbuffer_offset += blhdr->binfo[i].bsize;
2813 continue;
2814 }
2815
2816 tbuffer_offset += blhdr->binfo[i].bsize;
2817
2818 vp = buf_vnode(bp);
2819 blkno = buf_blkno(bp);
2820 lblkno = buf_lblkno(bp);
2821
2822 if (vp == NULL && lblkno == blkno) {
2823 printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd. aborting the transaction (tr %p jnl %p).\n",
2824 jnl->jdev_name, bp, lblkno, blkno, tr, jnl);
2825 goto bad_journal;
2826 }
2827
2828 // if the lblkno is the same as blkno and this bp isn't
2829 // associated with the underlying file system device then
2830 // we need to call bmap() to get the actual physical block.
2831 //
2832 if ((lblkno == blkno) && (vp != jnl->fsdev)) {
2833 off_t f_offset;
2834 size_t contig_bytes;
2835
2836 if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) {
2837 printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2838 goto bad_journal;
2839 }
2840 if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) {
2841 printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2842 goto bad_journal;
2843 }
2844 if ((uint32_t)contig_bytes < buf_count(bp)) {
2845 printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2846 goto bad_journal;
2847 }
2848 buf_setblkno(bp, blkno);
2849 }
2850 // update this so we write out the correct physical block number!
2851 blhdr->binfo[i].bnum = (off_t)(blkno);
2852 }
2853
2854 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2855 }
2856
2857
2858
2859 for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2860 amt = blhdr->bytes_used;
2861
2862 blhdr->binfo[0].b.sequence_num = tr->sequence_num;
2863
2864 blhdr->checksum = 0;
2865 blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
2866
2867 if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) {
2868 panic("can't allocate %lu bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *));
2869 }
2870
2871 // calculate individual block checksums
2872 tbuffer_offset = jnl->jhdr->blhdr_size;
2873 for(i=1; i < blhdr->num_blocks; i++) {
2874 bparray[i] = blhdr->binfo[i].b.bp;
2875 if (bparray[i]) {
2876 blhdr->binfo[i].b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], blhdr->binfo[i].bsize);
2877 } else {
2878 blhdr->binfo[i].b.cksum = 0;
2879 }
2880
2881 tbuffer_offset += blhdr->binfo[i].bsize;
2882 }
2883
2884 ret = write_journal_data(jnl, &end, blhdr, amt);
2885
2886 // always put the bp pointers back
2887 for(i=1; i < blhdr->num_blocks; i++) {
2888 blhdr->binfo[i].b.bp = bparray[i];
2889 }
2890
2891 kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *));
2892
2893 if (ret != amt) {
2894 printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n",
2895 jnl->jdev_name, ret, amt);
2896
2897 goto bad_journal;
2898 }
2899 }
2900
2901 jnl->jhdr->end = end; // update where the journal now ends
2902 tr->journal_end = end; // the transaction ends here too
2903 if (tr->journal_start == 0 || tr->journal_end == 0) {
2904 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
2905 tr->journal_start, tr->journal_end);
2906 }
2907
2908 if (write_journal_header(jnl) != 0) {
2909 goto bad_journal;
2910 }
2911
2912 /*
2913 * If the caller supplied a callback, call it now that the blocks have been
2914 * written to the journal. This is used by journal_relocate so, for example,
2915 * the file system can change its pointer to the new journal.
2916 */
2917 if (callback != NULL && callback(callback_arg) != 0) {
2918 goto bad_journal;
2919 }
2920
2921 //
2922 // setup for looping through all the blhdr's. we null out the
2923 // tbuffer and blhdr fields so that they're not used any more.
2924 //
2925 blhdr = tr->blhdr;
2926 tr->tbuffer = NULL;
2927 tr->blhdr = NULL;
2928
2929 // the buffer_flushed_callback will only be called for the
2930 // real blocks that get flushed so we have to account for
2931 // the block_list_headers here.
2932 //
2933 tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
2934
2935 // for each block, set the iodone callback and unlock it
2936 for(; blhdr; blhdr=next) {
2937
2938 // we can re-order the buf ptrs because everything is written out already
2939 qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp);
2940
2941 for(i=1; i < blhdr->num_blocks; i++) {
2942 if (blhdr->binfo[i].b.bp == NULL) {
2943 continue;
2944 }
2945
2946 bp = blhdr->binfo[i].b.bp;
2947
2948 // have to pass BAC_REMOVE here because we're going to bawrite()
2949 // the buffer when we're done
2950 do {
2951 errno = buf_acquire(bp, BAC_REMOVE, 0, 0);
2952 } while (errno == EAGAIN);
2953
2954 if (errno == 0) {
2955 struct vnode *save_vp;
2956 void *cur_filter;
2957
2958 if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
2959 if (jnl->flags & JOURNAL_CLOSE_PENDING) {
2960 buf_clearflags(bp, B_LOCKED);
2961 buf_brelse(bp);
2962 continue;
2963 } else {
2964 panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp));
2965 }
2966 }
2967 save_vp = buf_vnode(bp);
2968
2969 buf_setfilter(bp, buffer_flushed_callback, tr, &cur_filter, NULL);
2970
2971 if (cur_filter) {
2972 panic("jnl: bp @ %p (blkno %qd, vp %p) has non-null iodone (%p) buffflushcb %p\n",
2973 bp, buf_blkno(bp), save_vp, cur_filter, buffer_flushed_callback);
2974 }
2975 buf_clearflags(bp, B_LOCKED);
2976
2977 // kicking off the write here helps performance
2978 buf_bawrite(bp);
2979 // XXXdbg this is good for testing: buf_bdwrite(bp);
2980 //buf_bdwrite(bp);
2981
2982 // this undoes the vnode_ref() in journal_modify_block_end()
2983 vnode_rele_ext(save_vp, 0, 1);
2984 } else {
2985 printf("jnl: %s: end_transaction: could not acquire block %p (errno %d)!\n",
2986 jnl->jdev_name,bp, errno);
2987 }
2988 }
2989
2990 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2991
2992 // we can free blhdr here since we won't need it any more
2993 blhdr->binfo[0].bnum = 0xdeadc0de;
2994 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
2995 }
2996
2997 //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
2998 // tr, tr->journal_start, tr->journal_end);
2999 return 0;
3000
3001
3002 bad_journal:
3003 jnl->flags |= JOURNAL_INVALID;
3004 jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
3005 abort_transaction(jnl, tr);
3006 return -1;
3007 }
3008
/*
 * Tear down a transaction that cannot be committed: release every buffer
 * it still holds and free all of its block-list memory.  Called on the
 * journal failure paths (e.g. end_transaction's bad_journal exit).
 *
 * NOTE(review): this frees 'tr' itself (FREE_ZONE at the bottom); the
 * caller must not touch 'tr' afterwards.
 */
static void
abort_transaction(journal *jnl, transaction *tr)
{
	int i;
	errno_t errno;
	block_list_header *blhdr, *next;
	struct buf *bp;
	struct vnode *save_vp;

	// for each block list header, iterate over the blocks then
	// free up the memory associated with the block list.
	//
	// for each block, clear the lock bit and release it.
	//
	for(blhdr=tr->blhdr; blhdr; blhdr=next) {

		// slot 0 of binfo[] is the header/link slot; real blocks start at 1
		for(i=1; i < blhdr->num_blocks; i++) {
			if (blhdr->binfo[i].b.bp == NULL) {
				continue;
			}
			// skip buffers with no vnode, or that we no longer hold locked
			if ( (buf_vnode(blhdr->binfo[i].b.bp) == NULL) ||
			     !(buf_flags(blhdr->binfo[i].b.bp) & B_LOCKED) ) {
				continue;
			}

			// re-look the buffer up through the cache; since the block is
			// resident this is expected to return the very same bp (the
			// panic below enforces that)
			errno = buf_meta_bread(buf_vnode(blhdr->binfo[i].b.bp),
			                       buf_lblkno(blhdr->binfo[i].b.bp),
			                       buf_size(blhdr->binfo[i].b.bp),
			                       NOCRED,
			                       &bp);
			if (errno == 0) {
				if (bp != blhdr->binfo[i].b.bp) {
					panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n",
					      bp, blhdr->binfo[i].b.bp, jnl);
				}

				// releasing a bp marked invalid
				// also clears the locked and delayed state
				buf_markinvalid(bp);
				save_vp = buf_vnode(bp);

				buf_brelse(bp);

				// drop the vnode reference taken when the block was added
				// to the transaction (see the matching vnode_rele_ext in
				// end_transaction's flush loop)
				vnode_rele_ext(save_vp, 0, 1);
			} else {
				printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n",
				       jnl->jdev_name, blhdr->binfo[i].bnum, blhdr->binfo[i].b.bp);
				if (bp) {
					buf_brelse(bp);
				}
			}
		}

		// binfo[0].bnum doubles as the link to the next block_list_header
		next = (block_list_header *)((long)blhdr->binfo[0].bnum);

		// we can free blhdr here since we won't need it any more
		blhdr->binfo[0].bnum = 0xdeadc0de;	// poison to catch stale use
		kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
	}

	tr->tbuffer = NULL;
	tr->blhdr = NULL;
	tr->total_bytes = 0xdbadc0de;	// poison to catch stale use
	FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
}
3074
3075
3076 int
3077 journal_end_transaction(journal *jnl)
3078 {
3079 int ret;
3080 transaction *tr;
3081
3082 CHECK_JOURNAL(jnl);
3083
3084 if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
3085 return 0;
3086 }
3087
3088 if (jnl->owner != current_thread()) {
3089 panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
3090 jnl, jnl->owner, current_thread());
3091 }
3092
3093 free_old_stuff(jnl);
3094
3095 jnl->nested_count--;
3096 if (jnl->nested_count > 0) {
3097 return 0;
3098 } else if (jnl->nested_count < 0) {
3099 panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
3100 }
3101
3102 if (jnl->flags & JOURNAL_INVALID) {
3103 if (jnl->active_tr) {
3104 if (jnl->cur_tr != NULL) {
3105 panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
3106 jnl, jnl->active_tr, jnl->cur_tr);
3107 }
3108
3109 tr = jnl->active_tr;
3110 jnl->active_tr = NULL;
3111 abort_transaction(jnl, tr);
3112 }
3113
3114 jnl->owner = NULL;
3115 unlock_journal(jnl);
3116
3117 return EINVAL;
3118 }
3119
3120 tr = jnl->active_tr;
3121 CHECK_TRANSACTION(tr);
3122
3123 // clear this out here so that when check_free_space() calls
3124 // the FS flush function, we don't panic in journal_flush()
3125 // if the FS were to call that. note: check_free_space() is
3126 // called from end_transaction().
3127 //
3128 jnl->active_tr = NULL;
3129 ret = end_transaction(tr, 0, NULL, NULL);
3130
3131 jnl->owner = NULL;
3132 unlock_journal(jnl);
3133
3134 return ret;
3135 }
3136
3137
3138 int
3139 journal_flush(journal *jnl)
3140 {
3141 int need_signal = 0;
3142
3143 CHECK_JOURNAL(jnl);
3144
3145 if (jnl->flags & JOURNAL_INVALID) {
3146 return -1;
3147 }
3148
3149 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH))
3150 | DBG_FUNC_START, 0, 0, 0, 0, 0);
3151
3152 if (jnl->owner != current_thread()) {
3153 lock_journal(jnl);
3154 need_signal = 1;
3155 }
3156
3157 free_old_stuff(jnl);
3158
3159 // if we're not active, flush any buffered transactions
3160 if (jnl->active_tr == NULL && jnl->cur_tr) {
3161 transaction *tr = jnl->cur_tr;
3162
3163 jnl->cur_tr = NULL;
3164 end_transaction(tr, 1, NULL, NULL); // force it to get flushed
3165 }
3166
3167 if (need_signal) {
3168 unlock_journal(jnl);
3169 }
3170
3171 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH))
3172 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3173
3174 return 0;
3175 }
3176
3177 int
3178 journal_active(journal *jnl)
3179 {
3180 if (jnl->flags & JOURNAL_INVALID) {
3181 return -1;
3182 }
3183
3184 return (jnl->active_tr == NULL) ? 0 : 1;
3185 }
3186
/*
 * Return the thread that currently owns (has locked) the journal,
 * or NULL if no thread owns it.
 */
void *
journal_owner(journal *jnl)
{
	return jnl->owner;
}
3192
3193 int journal_uses_fua(journal *jnl)
3194 {
3195 if (jnl->flags & JOURNAL_DO_FUA_WRITES)
3196 return 1;
3197 return 0;
3198 }
3199
3200 /*
3201 * Relocate the journal.
3202 *
3203 * You provide the new starting offset and size for the journal. You may
3204 * optionally provide a new tbuffer_size; passing zero defaults to not
3205 * changing the tbuffer size except as needed to fit within the new journal
3206 * size.
3207 *
3208 * You must have already started a transaction. The transaction may contain
3209 * modified blocks (such as those needed to deallocate the old journal,
3210 * allocate the new journal, and update the location and size of the journal
3211 * in filesystem-private structures). Any transactions prior to the active
3212 * transaction will be flushed to the old journal. The new journal will be
3213 * initialized, and the blocks from the active transaction will be written to
3214 * the new journal.
3215 *
3216 * The caller will need to update the structures that identify the location
3217 * and size of the journal. These updates should be made in the supplied
3218 * callback routine. These updates must NOT go into a transaction. You should
3219 * force these updates to the media before returning from the callback. In the
 * event of a crash, either the old journal will be found, with an empty journal,
3221 * or the new journal will be found with the contents of the active transaction.
3222 *
3223 * Upon return from the callback, the blocks from the active transaction are
3224 * written to their normal locations on disk.
3225 *
3226 * (Remember that we have to ensure that blocks get committed to the journal
3227 * before being committed to their normal locations. But the blocks don't count
3228 * as committed until the new journal is pointed at.)
3229 *
3230 * Upon return, there is still an active transaction: newly allocated, and
3231 * with no modified blocks. Call journal_end_transaction as normal. You may
 * modify additional blocks before calling journal_end_transaction, and those
3233 * blocks will (eventually) go to the relocated journal.
3234 *
3235 * Inputs:
3236 * jnl The (opened) journal to relocate.
3237 * offset The new journal byte offset (from start of the journal device).
3238 * journal_size The size, in bytes, of the new journal.
3239 * tbuffer_size The new desired transaction buffer size. Pass zero to keep
3240 * the same size as the current journal. The size will be
3241 * modified as needed to fit the new journal.
3242 * callback Routine called after the new journal has been initialized,
3243 * and the active transaction written to the new journal, but
3244 * before the blocks are written to their normal locations.
3245 * Pass NULL for no callback.
3246 * callback_arg An argument passed to the callback routine.
3247 *
3248 * Result:
3249 * 0 No errors
3250 * EINVAL The offset is not block aligned
3251 * EINVAL The journal_size is not a multiple of the block size
3252 * EINVAL The journal is invalid
3253 * (any) An error returned by journal_flush.
3254 *
3255 */
/*
 * Move the journal to a new on-device location/size.  See the block
 * comment above for the full contract.  The active transaction's blocks
 * are written to the NEW journal; 'callback' runs after that write but
 * before the blocks go to their normal locations.
 */
int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size,
	errno_t (*callback)(void *), void *callback_arg)
{
	int ret;
	transaction *tr;

	/*
	 * Sanity check inputs, and adjust the size of the transaction buffer.
	 */
	if ((offset % jnl->jhdr->jhdr_size) != 0) {
		printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n",
		       jnl->jdev_name, offset, jnl->jhdr->jhdr_size);
		return EINVAL;
	}
	if ((journal_size % jnl->jhdr->jhdr_size) != 0) {
		printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n",
		       jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size);
		return EINVAL;
	}

	CHECK_JOURNAL(jnl);

	/* Guarantee we own the active transaction. */
	if (jnl->flags & JOURNAL_INVALID) {
		return EINVAL;
	}
	if (jnl->owner != current_thread()) {
		panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n",
		      jnl, jnl->owner, current_thread());
	}

	// zero means "keep the current tbuffer size"; size_up_tbuffer clamps
	// it to fit the (new) journal
	if (tbuffer_size == 0)
		tbuffer_size = jnl->tbuffer_size;
	size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size);

	/*
	 * Flush any non-active transactions. We have to temporarily hide the
	 * active transaction to make journal_flush flush out non-active but
	 * current (unwritten) transactions.
	 */
	tr = jnl->active_tr;
	CHECK_TRANSACTION(tr);
	jnl->active_tr = NULL;
	ret = journal_flush(jnl);
	jnl->active_tr = tr;
	if (ret) {
		return ret;
	}

	/* Update the journal's offset and size in memory. */
	jnl->jdev_offset = offset;
	// an empty journal has start == end, just past the journal header
	jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
	jnl->jhdr->size = journal_size;
	jnl->active_start = jnl->jhdr->start;

	/*
	 * Force the active transaction to be written to the new journal. Call the
	 * supplied callback after the blocks have been written to the journal, but
	 * before they get written to their normal on-disk locations.
	 */
	jnl->active_tr = NULL;
	ret = end_transaction(tr, 1, callback, callback_arg);
	if (ret) {
		printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret);
		goto bad_journal;
	}

	/*
	 * Create a new, empty transaction to be the active transaction. This way
	 * our caller can use journal_end_transaction as usual.
	 */
	ret = journal_allocate_transaction(jnl);
	if (ret) {
		printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl->jdev_name, ret);
		goto bad_journal;
	}

	return 0;

bad_journal:
	// NOTE(review): end_transaction's visible failure path (its bad_journal
	// exit) already calls abort_transaction, which frees 'tr'; reaching here
	// via that path would abort/free 'tr' a second time.  Verify against
	// end_transaction's full set of error returns before relying on this.
	jnl->flags |= JOURNAL_INVALID;
	abort_transaction(jnl, tr);
	return ret;
}