bsd/vfs/vfs_journal.c

   1 /*
   2  * Copyright (c) 1995-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 //
  29 // This file implements a simple write-ahead journaling layer.
  30 // In theory any file system can make use of it by calling these
  31 // functions when the fs wants to modify meta-data blocks.  See
  32 // vfs_journal.h for a more detailed description of the api and
  33 // data structures.
  34 //
  35 // Dominic Giampaolo (dbg@apple.com)
  36 //
  37
  38 #ifdef KERNEL
  39
  40 #include <sys/param.h>
  41 #include <sys/systm.h>
  42 #include <sys/kernel.h>
  43 #include <sys/file_internal.h>
  44 #include <sys/stat.h>
  45 #include <sys/buf_internal.h>
  46 #include <sys/proc_internal.h>
  47 #include <sys/mount_internal.h>
  48 #include <sys/namei.h>
  49 #include <sys/vnode_internal.h>
  50 #include <sys/ioctl.h>
  51 #include <sys/tty.h>
  52 #include <sys/ubc.h>
  53 #include <sys/malloc.h>
  54 #include <kern/thread.h>
  55 #include <sys/disk.h>
  56 #include <sys/kdebug.h>
  57 #include <miscfs/specfs/specdev.h>
  58 #include <libkern/OSAtomic.h>   /* OSAddAtomic */
  59
  60 extern task_t kernel_task;
  61
  62 #define DBG_JOURNAL_FLUSH 1
  63
  64 #else
  65
  66 #include <stdio.h>
  67 #include <stdlib.h>
  68 #include <string.h>
  69 #include <limits.h>
  70 #include <errno.h>
  71 #include <fcntl.h>
  72 #include <unistd.h>
  73 #include <stdarg.h>
  74 #include <sys/types.h>
  75 #include "compat.h"
  76
  77 #endif   /* KERNEL */
  78
  79 #include "vfs_journal.h"
  80
  81 /* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */
  82 __private_extern__ void qsort(
  83     void * array,
  84     size_t nmembers,
  85     size_t member_size,
  86     int (*)(const void *, const void *));
  87
  88
  89
  90 // number of bytes to checksum in a block_list_header
  91 // NOTE: this should be enough to clear out the header
  92 //       fields as well as the first entry of binfo[]
  93 #define BLHDR_CHECKSUM_SIZE 32
  94
  95
  96 static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg);
  97 static void abort_transaction(journal *jnl, transaction *tr);
  98 static void dump_journal(journal *jnl);
  99
 100 static __inline__ void  lock_journal(journal *jnl);
 101 static __inline__ void  unlock_journal(journal *jnl);
 102 static __inline__ void  lock_oldstart(journal *jnl);
 103 static __inline__ void  unlock_oldstart(journal *jnl);
 104
 105
 106
 107
 108 //
 109 // 3105942 - Coalesce writes to the same block on journal replay
 110 //
 111
 112 typedef struct bucket {
 113     off_t   block_num;
 114     size_t  jnl_offset;
 115     size_t  block_size;
 116     int32_t cksum;
 117 } bucket;
 118
 119 #define STARTING_BUCKETS 256
 120
 121 static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
 122 static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size);
 123 static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full);
 124 static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
 125 static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting);
 126
 127 #define CHECK_JOURNAL(jnl) \
 128     do { \
 129     if (jnl == NULL) {\
 130         panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
 131     }\
 132     if (jnl->jdev == NULL) { \
 133         panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
 134     } \
 135     if (jnl->fsdev == NULL) { \
 136         panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
 137     } \
 138     if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
 139         panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
 140         __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\
 141     }\
 142     if (   jnl->jhdr->start <= 0 \
 143         || jnl->jhdr->start > jnl->jhdr->size\
 144         || jnl->jhdr->start > 1024*1024*1024) {\
 145         panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
 146         __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\
 147     }\
 148     if (   jnl->jhdr->end <= 0 \
 149         || jnl->jhdr->end > jnl->jhdr->size\
 150         || jnl->jhdr->end > 1024*1024*1024) {\
 151         panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
 152         __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\
 153     }\
 154     if (jnl->jhdr->size > 1024*1024*1024) {\
 155         panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
 156         __FILE__, __LINE__, jnl->jhdr->size);\
 157     } \
 158     } while(0)
 159
 160 #define CHECK_TRANSACTION(tr) \
 161     do {\
 162     if (tr == NULL) {\
 163         panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
 164     }\
 165     if (tr->jnl == NULL) {\
 166         panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
 167     }\
 168     if (tr->blhdr != (block_list_header *)tr->tbuffer) {\
 169         panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\
 170     }\
 171     if (tr->total_bytes < 0) {\
 172         panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\
 173     }\
 174     if (tr->journal_start < 0 || tr->journal_start > 1024*1024*1024) {\
 175         panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\
 176     }\
 177     if (tr->journal_end < 0 || tr->journal_end > 1024*1024*1024) {\
 178         panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\
 179     }\
 180     if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) {\
 181         panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\
 182     }\
 183     } while(0)
 184
 185
 186
 187 //
 188 // this isn't a great checksum routine but it will do for now.
 189 // we use it to checksum the journal header and the block list
 190 // headers that are at the start of each transaction.
 191 //
 192 static int
 193 calc_checksum(char *ptr, int len)
 194 {
 195     int i, cksum=0;
 196
 197     // this is a lame checksum but for now it'll do
 198     for(i=0; i < len; i++, ptr++) {
 199                 cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
 200     }
 201
 202     return (~cksum);
 203 }
 204
 205 //
 206 // Journal Locking
 207 //
 208 lck_grp_attr_t *  jnl_group_attr;
 209 lck_attr_t *      jnl_lock_attr;
 210 lck_grp_t *       jnl_mutex_group;
 211
 212 void
 213 journal_init(void)
 214 {
 215         jnl_lock_attr    = lck_attr_alloc_init();
 216         jnl_group_attr   = lck_grp_attr_alloc_init();
 217         jnl_mutex_group  = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
 218 }
 219
 220 static __inline__ void
 221 lock_journal(journal *jnl)
 222 {
 223         lck_mtx_lock(&jnl->jlock);
 224 }
 225
 226 static __inline__ void
 227 unlock_journal(journal *jnl)
 228 {
 229         lck_mtx_unlock(&jnl->jlock);
 230 }
 231
 232 static __inline__ void
 233 lock_oldstart(journal *jnl)
 234 {
 235         lck_mtx_lock(&jnl->old_start_lock);
 236 }
 237
 238 static __inline__ void
 239 unlock_oldstart(journal *jnl)
 240 {
 241         lck_mtx_unlock(&jnl->old_start_lock);
 242 }
 243
 244
 245
 246 #define JNL_WRITE    0x0001
 247 #define JNL_READ     0x0002
 248 #define JNL_HEADER   0x8000
 249
 250 //
 251 // This function sets up a fake buf and passes it directly to the
 252 // journal device strategy routine (so that it won't get cached in
 253 // the block cache.
 254 //
 255 // It also handles range checking the i/o so that we don't write
 256 // outside the journal boundaries and it will wrap the i/o back
 257 // to the beginning if necessary (skipping over the journal header)
 258 //
 259 static size_t
 260 do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
 261 {
 262     int         err, curlen=len;
 263     size_t      io_sz = 0;
 264     buf_t       bp;
 265     off_t       max_iosize;
 266
 267     if (*offset < 0 || *offset > jnl->jhdr->size) {
 268                 panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
 269     }
 270
 271     if (direction & JNL_WRITE)
 272         max_iosize = jnl->max_write_size;
 273     else if (direction & JNL_READ)
 274         max_iosize = jnl->max_read_size;
 275     else
 276         max_iosize = 128 * 1024;
 277
 278   again:
 279     bp = alloc_io_buf(jnl->jdev, 1);
 280
 281     if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
 282                 if (*offset == jnl->jhdr->size) {
 283                         *offset = jnl->jhdr->jhdr_size;
 284                 } else {
 285                         curlen = (off_t)jnl->jhdr->size - *offset;
 286                 }
 287     }
 288
 289         if (curlen > max_iosize) {
 290                 curlen = max_iosize;
 291         }
 292
 293     if (curlen <= 0) {
 294                 panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %lu\n", curlen, *offset, len);
 295     }
 296
 297         if (*offset == 0 && (direction & JNL_HEADER) == 0) {
 298                 panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
 299         }
 300
 301     if (direction & JNL_READ)
 302             buf_setflags(bp, B_READ);
 303     else {
 304             /*
 305              * don't have to set any flags
 306              */
 307             vnode_startwrite(jnl->jdev);
 308     }
 309     buf_setsize(bp, curlen);
 310     buf_setcount(bp, curlen);
 311     buf_setdataptr(bp, (uintptr_t)data);
 312     buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
 313     buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
 314     if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) {
 315         buf_markfua(bp);
 316     }
 317
 318     err = VNOP_STRATEGY(bp);
 319     if (!err) {
 320                 err = (int)buf_biowait(bp);
 321     }
 322     free_io_buf(bp);
 323
 324     if (err) {
 325         printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err);
 326         return 0;
 327     }
 328
 329     *offset += curlen;
 330     io_sz   += curlen;
 331     if (io_sz != len) {
 332                 // handle wrap-around
 333                 data    = (char *)data + curlen;
 334                 curlen  = len - io_sz;
 335                 if (*offset >= jnl->jhdr->size) {
 336                         *offset = jnl->jhdr->jhdr_size;
 337                 }
 338                 goto again;
 339     }
 340
 341     return io_sz;
 342 }
 343
 344 static size_t
 345 read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
 346 {
 347     return do_journal_io(jnl, offset, data, len, JNL_READ);
 348 }
 349
 350 static size_t
 351 write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
 352 {
 353     return do_journal_io(jnl, offset, data, len, JNL_WRITE);
 354 }
 355
 356
 357 static size_t
 358 read_journal_header(journal *jnl, void *data, size_t len)
 359 {
 360         off_t hdr_offset = 0;
 361
 362         return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER);
 363 }
 364
 365 static int
 366 write_journal_header(journal *jnl)
 367 {
 368     static int num_err_prints = 0;
 369     int ret=0;
 370     off_t jhdr_offset = 0;
 371     struct vfs_context context;
 372
 373     context.vc_thread = current_thread();
 374     context.vc_ucred = NOCRED;
 375     //
 376     // Flush the track cache if we're not doing force-unit-access
 377     // writes.
 378     //
 379     if ((jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
 380         ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
 381     }
 382     if (ret != 0) {
 383         //
 384         // Only print this error if it's a different error than the
 385         // previous one, or if it's the first time for this device
 386         // or if the total number of printfs is less than 25.  We
 387         // allow for up to 25 printfs to insure that some make it
 388         // into the on-disk syslog.  Otherwise if we only printed
 389         // one, it's possible it would never make it to the syslog
 390         // for the root volume and that makes debugging hard.
 391         //
 392         if (   ret != jnl->last_flush_err
 393             || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
 394             || num_err_prints++ < 25) {
 395
 396             printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret);
 397
 398             jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
 399             jnl->last_flush_err = ret;
 400         }
 401     }
 402
 403     jnl->jhdr->checksum = 0;
 404     jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
 405     if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) {
 406         printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name);
 407         jnl->flags |= JOURNAL_INVALID;
 408         return -1;
 409     }
 410
 411     // If we're not doing force-unit-access writes, then we
 412     // have to flush after writing the journal header so that
 413     // a future transaction doesn't sneak out to disk before
 414     // the header does and thus overwrite data that the old
 415     // journal header refers to.  Saw this exact case happen
 416     // on an IDE bus analyzer with Larry Barras so while it
 417     // may seem obscure, it's not.
 418     //
 419     if ((jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
 420         VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
 421     }
 422
 423     return 0;
 424 }
 425
 426
 427
 428 //
 429 // this is a work function used to free up transactions that
 430 // completed. they can't be free'd from buffer_flushed_callback
 431 // because it is called from deep with the disk driver stack
 432 // and thus can't do something that would potentially cause
 433 // paging.  it gets called by each of the journal api entry
 434 // points so stuff shouldn't hang around for too long.
 435 //
 436 static void
 437 free_old_stuff(journal *jnl)
 438 {
 439     transaction *tr, *next;
 440
 441     lock_oldstart(jnl);
 442     tr = jnl->tr_freeme;
 443     jnl->tr_freeme = NULL;
 444     unlock_oldstart(jnl);
 445
 446     for(; tr; tr=next) {
 447         next = tr->next;
 448         FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
 449     }
 450
 451 }
 452
 453
 454
 455 //
 456 // This is our callback that lets us know when a buffer has been
 457 // flushed to disk.  It's called from deep within the driver stack
 458 // and thus is quite limited in what it can do.  Notably, it can
 459 // not initiate any new i/o's or allocate/free memory.
 460 //
 461 static void
 462 buffer_flushed_callback(struct buf *bp, void *arg)
 463 {
 464     transaction  *tr;
 465     journal      *jnl;
 466     transaction  *ctr, *prev=NULL, *next;
 467     size_t        i;
 468     int           bufsize, amt_flushed, total_bytes;
 469
 470
 471     //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n",
 472     //     bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg);
 473
 474     // snarf out the bits we want
 475     bufsize = buf_size(bp);
 476     tr      = (transaction *)arg;
 477
 478     // then we've already seen it
 479     if (tr == NULL) {
 480                 return;
 481     }
 482
 483     CHECK_TRANSACTION(tr);
 484
 485     jnl = tr->jnl;
 486     if (jnl->flags & JOURNAL_INVALID) {
 487                 return;
 488     }
 489
 490     CHECK_JOURNAL(jnl);
 491
 492     amt_flushed = tr->num_killed;
 493     total_bytes = tr->total_bytes;
 494
 495     // update the number of blocks that have been flushed.
 496     // this buf may represent more than one block so take
 497     // that into account.
 498     //
 499     // OSAddAtomic() returns the value of tr->num_flushed before the add
 500     //
 501     amt_flushed += OSAddAtomic(bufsize, (SInt32 *)&tr->num_flushed);
 502
 503
 504     // if this transaction isn't done yet, just return as
 505     // there is nothing to do.
 506     //
 507     // NOTE: we are careful to not reference anything through
 508     //       the tr pointer after doing the OSAddAtomic().  if
 509     //       this if statement fails then we are the last one
 510     //       and then it's ok to dereference "tr".
 511     //
 512     if ((amt_flushed + bufsize) < total_bytes) {
 513                 return;
 514     }
 515
 516     // this will single thread checking the transaction
 517     lock_oldstart(jnl);
 518
 519     if (tr->total_bytes == (int)0xfbadc0de) {
 520         // then someone beat us to it...
 521         unlock_oldstart(jnl);
 522         return;
 523     }
 524
 525     // mark this so that we're the owner of dealing with the
 526     // cleanup for this transaction
 527     tr->total_bytes = 0xfbadc0de;
 528
 529     //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
 530     //   tr, tr->journal_start, tr->journal_end, jnl);
 531
 532     // find this entry in the old_start[] index and mark it completed
 533     for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
 534
 535         if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) {
 536             jnl->old_start[i] &= ~(0x8000000000000000ULL);
 537             break;
 538         }
 539     }
 540
 541     if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
 542         panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
 543             tr->journal_start, tr, jnl);
 544     }
 545
 546
 547     // if we are here then we need to update the journal header
 548     // to reflect that this transaction is complete
 549     if (tr->journal_start == jnl->active_start) {
 550         jnl->active_start = tr->journal_end;
 551         tr->journal_start = tr->journal_end = (off_t)0;
 552     }
 553
 554     // go through the completed_trs list and try to coalesce
 555     // entries, restarting back at the beginning if we have to.
 556     for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) {
 557         if (ctr->journal_start == jnl->active_start) {
 558             jnl->active_start = ctr->journal_end;
 559             if (prev) {
 560                 prev->next = ctr->next;
 561             }
 562             if (ctr == jnl->completed_trs) {
 563                 jnl->completed_trs = ctr->next;
 564             }
 565
 566             next           = jnl->completed_trs;   // this starts us over again
 567             ctr->next      = jnl->tr_freeme;
 568             jnl->tr_freeme = ctr;
 569             ctr            = NULL;
 570         } else if (tr->journal_end == ctr->journal_start) {
 571             ctr->journal_start = tr->journal_start;
 572             next               = jnl->completed_trs;  // this starts us over again
 573             ctr                = NULL;
 574             tr->journal_start  = tr->journal_end = (off_t)0;
 575         } else if (tr->journal_start == ctr->journal_end) {
 576             ctr->journal_end  = tr->journal_end;
 577             next              = ctr->next;
 578             tr->journal_start = tr->journal_end = (off_t)0;
 579         } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) {
 580             // coalesce the next entry with this one and link the next
 581             // entry in at the head of the tr_freeme list
 582             next              = ctr->next;           // temporarily use the "next" variable
 583             ctr->journal_end  = next->journal_end;
 584             ctr->next         = next->next;
 585             next->next        = jnl->tr_freeme;      // link in the next guy at the head of the tr_freeme list
 586             jnl->tr_freeme    = next;
 587
 588             next              = jnl->completed_trs;  // this starts us over again
 589             ctr               = NULL;
 590         } else {
 591             next = ctr->next;
 592         }
 593     }
 594
 595     // if this is true then we didn't merge with anyone
 596     // so link ourselves in at the head of the completed
 597     // transaction list.
 598     if (tr->journal_start != 0) {
 599         // put this entry into the correct sorted place
 600         // in the list instead of just at the head.
 601         //
 602
 603         prev = NULL;
 604         for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
 605             // just keep looping
 606         }
 607
 608         if (ctr == NULL && prev == NULL) {
 609             jnl->completed_trs = tr;
 610             tr->next = NULL;
 611         } else if (ctr == jnl->completed_trs) {
 612             tr->next = jnl->completed_trs;
 613             jnl->completed_trs = tr;
 614         } else {
 615             tr->next = prev->next;
 616             prev->next = tr;
 617         }
 618     } else {
 619         // if we're here this tr got merged with someone else so
 620         // put it on the list to be free'd
 621         tr->next       = jnl->tr_freeme;
 622         jnl->tr_freeme = tr;
 623     }
 624     unlock_oldstart(jnl);
 625 }
 626
 627
 628 #include <libkern/OSByteOrder.h>
 629
 630 #define SWAP16(x) OSSwapInt16(x)
 631 #define SWAP32(x) OSSwapInt32(x)
 632 #define SWAP64(x) OSSwapInt64(x)
 633
 634
 635 static void
 636 swap_journal_header(journal *jnl)
 637 {
 638     jnl->jhdr->magic      = SWAP32(jnl->jhdr->magic);
 639     jnl->jhdr->endian     = SWAP32(jnl->jhdr->endian);
 640     jnl->jhdr->start      = SWAP64(jnl->jhdr->start);
 641     jnl->jhdr->end        = SWAP64(jnl->jhdr->end);
 642     jnl->jhdr->size       = SWAP64(jnl->jhdr->size);
 643     jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
 644     jnl->jhdr->checksum   = SWAP32(jnl->jhdr->checksum);
 645     jnl->jhdr->jhdr_size  = SWAP32(jnl->jhdr->jhdr_size);
 646     jnl->jhdr->sequence_num  = SWAP32(jnl->jhdr->sequence_num);
 647 }
 648
 649 static void
 650 swap_block_list_header(journal *jnl, block_list_header *blhdr)
 651 {
 652     int i;
 653
 654     blhdr->max_blocks = SWAP16(blhdr->max_blocks);
 655     blhdr->num_blocks = SWAP16(blhdr->num_blocks);
 656     blhdr->bytes_used = SWAP32(blhdr->bytes_used);
 657     blhdr->checksum   = SWAP32(blhdr->checksum);
 658     blhdr->flags      = SWAP32(blhdr->flags);
 659
 660     if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) {
 661         printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d).  not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size);
 662         return;
 663     }
 664
 665     for(i=0; i < blhdr->num_blocks; i++) {
 666                 blhdr->binfo[i].bnum    = SWAP64(blhdr->binfo[i].bnum);
 667                 blhdr->binfo[i].bsize   = SWAP32(blhdr->binfo[i].bsize);
 668                 blhdr->binfo[i].b.cksum = SWAP32(blhdr->binfo[i].b.cksum);
 669     }
 670 }
 671
 672
 673 static int
 674 update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
 675 {
 676     int         ret;
 677     struct buf *oblock_bp=NULL;
 678
 679     // first read the block we want.
 680     ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
 681     if (ret != 0) {
 682         printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret);
 683
 684                 if (oblock_bp) {
 685                         buf_brelse(oblock_bp);
 686                         oblock_bp = NULL;
 687                 }
 688
 689                 // let's try to be aggressive here and just re-write the block
 690                 oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META);
 691                 if (oblock_bp == NULL) {
 692                     printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block);
 693                     return -1;
 694                 }
 695     }
 696
 697     // make sure it's the correct size.
 698     if (buf_size(oblock_bp) != bsize) {
 699                 buf_brelse(oblock_bp);
 700                 return -1;
 701     }
 702
 703     // copy the journal data over top of it
 704     memcpy((char *)0 + buf_dataptr(oblock_bp), block_ptr, bsize);
 705
 706     if ((ret = VNOP_BWRITE(oblock_bp)) != 0) {
 707         printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret);
 708         return ret;
 709     }
 710
 711     // and now invalidate it so that if someone else wants to read
 712     // it in a different size they'll be able to do it.
 713     ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
 714     if (oblock_bp) {
 715                 buf_markinvalid(oblock_bp);
 716                 buf_brelse(oblock_bp);
 717     }
 718
 719     return 0;
 720 }
 721
 722 static int
 723 grow_table(struct bucket **buf_ptr, int num_buckets, int new_size)
 724 {
 725     struct bucket *newBuf;
 726     int current_size = num_buckets, i;
 727
 728     // return if newsize is less than the current size
 729     if (new_size < num_buckets) {
 730         return current_size;
 731     }
 732
 733     if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
 734         printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
 735         return -1;
 736     }
 737
 738     //  printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
 739
 740     // copy existing elements
 741     bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));
 742
 743     // initialize the new ones
 744     for(i=num_buckets; i < new_size; i++) {
 745         newBuf[i].block_num = (off_t)-1;
 746     }
 747
 748     // free the old container
 749     FREE(*buf_ptr, M_TEMP);
 750
 751     // reset the buf_ptr
 752     *buf_ptr = newBuf;
 753
 754     return new_size;
 755 }
 756
 757 static int
 758 lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full)
 759 {
 760     int lo, hi, index, matches, i;
 761
 762     if (num_full == 0) {
 763         return 0; // table is empty, so insert at index=0
 764     }
 765
 766     lo = 0;
 767     hi = num_full - 1;
 768     index = -1;
 769
 770     // perform binary search for block_num
 771     do {
 772         int mid = (hi - lo)/2 + lo;
 773         off_t this_num = (*buf_ptr)[mid].block_num;
 774
 775         if (block_num == this_num) {
 776             index = mid;
 777             break;
 778         }
 779
 780         if (block_num < this_num) {
 781             hi = mid;
 782             continue;
 783         }
 784
 785         if (block_num > this_num) {
 786             lo = mid + 1;
 787             continue;
 788         }
 789     } while(lo < hi);
 790
 791     // check if lo and hi converged on the match
 792     if (block_num == (*buf_ptr)[hi].block_num) {
 793         index = hi;
 794     }
 795
 796     // if no existing entry found, find index for new one
 797     if (index == -1) {
 798         index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
 799     } else {
 800         // make sure that we return the right-most index in the case of multiple matches
 801         matches = 0;
 802         i = index + 1;
 803         while(i < num_full && block_num == (*buf_ptr)[i].block_num) {
 804             matches++;
 805             i++;
 806         }
 807
 808         index += matches;
 809     }
 810
 811     return index;
 812 }
 813
 814 static int
 815 insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting)
 816 {
 817     if (!overwriting) {
 818         // grow the table if we're out of space
 819         if (*num_full_ptr >= *num_buckets_ptr) {
 820             int new_size = *num_buckets_ptr * 2;
 821             int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);
 822
 823             if (grow_size < new_size) {
 824                 printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name);
 825                 return -1;
 826             }
 827
 828             *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
 829         }
 830
 831         // if we're not inserting at the end, we need to bcopy
 832         if (blk_index != *num_full_ptr) {
 833             bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
 834         }
 835
 836         (*num_full_ptr)++; // increment only if we're not overwriting
 837     }
 838
 839     // sanity check the values we're about to add
 840     if (offset >= jnl->jhdr->size) {
 841         offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
 842     }
 843     if (size <= 0) {
 844         panic("jnl: insert_block: bad size in insert_block (%lu)\n", size);
 845     }
 846
 847     (*buf_ptr)[blk_index].block_num = num;
 848     (*buf_ptr)[blk_index].block_size = size;
 849     (*buf_ptr)[blk_index].jnl_offset = offset;
 850     (*buf_ptr)[blk_index].cksum = cksum;
 851
 852     return blk_index;
 853 }
 854
 855 static int
 856 do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
 857 {
 858     int num_to_remove, index, i, overwrite, err;
 859     size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset;
 860     off_t overlap, block_start, block_end;
 861
 862     block_start = block_num*jhdr_size;
 863     block_end = block_start + size;
 864     overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);
 865
 866     // first, eliminate any overlap with the previous entry
 867     if (blk_index != 0 && !overwrite) {
 868         off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
 869         off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
 870         overlap = prev_block_end - block_start;
 871         if (overlap > 0) {
 872             if (overlap % jhdr_size != 0) {
 873                 panic("jnl: do_overlap: overlap with previous entry not a multiple of %lu\n", jhdr_size);
 874             }
 875
 876             // if the previous entry completely overlaps this one, we need to break it into two pieces.
 877             if (prev_block_end > block_end) {
 878                 off_t new_num = block_end / jhdr_size;
 879                 size_t new_size = prev_block_end - block_end;
 880
 881                 new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);
 882
 883                 err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0);
 884                 if (err < 0) {
 885                     panic("jnl: do_overlap: error inserting during pre-overlap\n");
 886                 }
 887             }
 888
 889             // Regardless, we need to truncate the previous entry to the beginning of the overlap
 890             (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start;
 891             (*buf_ptr)[blk_index-1].cksum = 0;   // have to blow it away because there's no way to check it
 892         }
 893     }
 894
 895     // then, bail out fast if there's no overlap with the entries that follow
 896     if (!overwrite && block_end <= (*buf_ptr)[blk_index].block_num*jhdr_size) {
 897         return 0; // no overlap, no overwrite
 898     } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (*buf_ptr)[blk_index+1].block_num*jhdr_size)) {
 899
 900         (*buf_ptr)[blk_index].cksum = cksum;   // update this
 901         return 1; // simple overwrite
 902     }
 903
 904     // Otherwise, find all cases of total and partial overlap. We use the special
 905     // block_num of -2 to designate entries that are completely overlapped and must
 906     // be eliminated. The block_num, size, and jnl_offset of partially overlapped
 907     // entries must be adjusted to keep the array consistent.
 908     index = blk_index;
 909     num_to_remove = 0;
 910     while(index < *num_full_ptr && block_end > (*buf_ptr)[index].block_num*jhdr_size) {
 911         if (block_end >= ((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size)) {
 912             (*buf_ptr)[index].block_num = -2; // mark this for deletion
 913             num_to_remove++;
 914         } else {
 915             overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
 916             if (overlap > 0) {
 917                 if (overlap % jhdr_size != 0) {
 918                     panic("jnl: do_overlap: overlap of %lld is not multiple of %lu\n", overlap, jhdr_size);
 919                 }
 920
 921                 // if we partially overlap this entry, adjust its block number, jnl offset, and size
 922                 (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
 923                 (*buf_ptr)[index].cksum = 0;
 924
 925                 new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
 926                 if (new_offset >= jnl->jhdr->size) {
 927                     new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
 928                 }
 929                 (*buf_ptr)[index].jnl_offset = new_offset;
 930
 931                 (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
 932                 if ((*buf_ptr)[index].block_size <= 0) {
 933                     panic("jnl: do_overlap: after overlap, new block size is invalid (%lu)\n", (*buf_ptr)[index].block_size);
 934                     // return -1; // if above panic is removed, return -1 for error
 935                 }
 936             }
 937
 938         }
 939
 940         index++;
 941     }
 942
 943     // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
 944     index--; // start with the last index used within the above loop
 945     while(index >= blk_index) {
 946         if ((*buf_ptr)[index].block_num == -2) {
 947             if (index == *num_full_ptr-1) {
 948                 (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
 949             } else {
 950                 bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
 951             }
 952             (*num_full_ptr)--;
 953         }
 954         index--;
 955     }
 956
 957     // eliminate any stale entries at the end of the table
 958     for(i=*num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
 959         (*buf_ptr)[i].block_num = -1;
 960     }
 961
 962     return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
 963 }
 964
 965 // PR-3105942: Coalesce writes to the same block in journal replay
 966 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
 967 // to be replayed and the corresponding location in the journal which contains
 968 // the most recent data for those blocks. The array is "played" once the all the
 969 // blocks in the journal have been coalesced. The code for the case of conflicting/
 970 // overlapping writes to a single block is the most dense. Because coalescing can
 971 // disrupt the existing time-ordering of blocks in the journal playback, care
 972 // is taken to catch any overlaps and keep the array consistent.
 973 static int
 974 add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
 975 {
 976     int blk_index, overwriting;
 977
 978     // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
 979     // inserted (or the index of the elem to overwrite).
 980     blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);
 981
 982     // check if the index is within bounds (if we're adding this block to the end of
 983     // the table, blk_index will be equal to num_full)
 984     if (blk_index < 0 || blk_index > *num_full_ptr) {
 985         //printf("jnl: add_block: trouble adding block to co_buf\n");
 986         return -1;
 987     } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
 988
 989     // Determine whether we're overwriting an existing entry by checking for overlap
 990     overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr);
 991     if (overwriting < 0) {
 992         return -1; // if we got an error, pass it along
 993     }
 994
 995     // returns the index, or -1 on error
 996     blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting);
 997
 998     return blk_index;
 999 }
1000
1001 static int
1002 replay_journal(journal *jnl)
1003 {
1004     int i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0;
1005     size_t ret;
1006     size_t  max_bsize = 0;              /* protected by block_ptr */
1007     block_list_header *blhdr;
1008     off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start;
1009     char *buff, *block_ptr=NULL;
1010     struct bucket *co_buf;
1011     int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
1012     uint32_t last_sequence_num = 0;
1013
1014     // wrap the start ptr if it points to the very end of the journal
1015     if (jnl->jhdr->start == jnl->jhdr->size) {
1016                 jnl->jhdr->start = jnl->jhdr->jhdr_size;
1017     }
1018     if (jnl->jhdr->end == jnl->jhdr->size) {
1019                 jnl->jhdr->end = jnl->jhdr->jhdr_size;
1020     }
1021
1022     if (jnl->jhdr->start == jnl->jhdr->end) {
1023                 return 0;
1024     }
1025
1026     orig_jnl_start = jnl->jhdr->start;
1027
1028     // allocate memory for the header_block.  we'll read each blhdr into this
1029     if (kmem_alloc(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) {
1030                 printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n",
1031                     jnl->jdev_name, jnl->jhdr->blhdr_size);
1032                 return -1;
1033     }
1034
1035     // allocate memory for the coalesce buffer
1036     if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
1037         printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name);
1038         return -1;
1039     }
1040
1041   restart_replay:
1042
1043     // initialize entries
1044     for(i=0; i < num_buckets; i++) {
1045         co_buf[i].block_num = -1;
1046     }
1047     num_full = 0; // empty at first
1048
1049
1050     printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
1051         jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);
1052
1053     while(check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) {
1054                 offset = blhdr_offset = jnl->jhdr->start;
1055                 ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
1056                 if (ret != (size_t)jnl->jhdr->blhdr_size) {
1057                     printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset);
1058                     bad_blocks = 1;
1059                     goto bad_txn_handling;
1060                 }
1061
1062                 blhdr = (block_list_header *)buff;
1063
1064                 orig_checksum = blhdr->checksum;
1065                 blhdr->checksum = 0;
1066                 if (jnl->flags & JOURNAL_NEED_SWAP) {
1067                         // calculate the checksum based on the unswapped data
1068                         // because it is done byte-at-a-time.
1069                         orig_checksum = SWAP32(orig_checksum);
1070                         checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1071                         swap_block_list_header(jnl, blhdr);
1072                 } else {
1073                         checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1074                 }
1075
1076
1077                 //
1078                 // XXXdbg - if these checks fail, we should replay as much
1079                 //          we can in the hopes that it will still leave the
1080                 //          drive in a better state than if we didn't replay
1081                 //          anything
1082                 //
1083                 if (checksum != orig_checksum) {
1084                     if (check_past_jnl_end && in_uncharted_territory) {
1085
1086                         if (blhdr_offset != jnl->jhdr->end) {
1087                             printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
1088                         }
1089
1090                         check_past_jnl_end = 0;
1091                         jnl->jhdr->end = blhdr_offset;
1092                         continue;
1093                     }
1094
1095                     printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
1096                         jnl->jdev_name, blhdr_offset, orig_checksum, checksum);
1097
1098                     if (blhdr_offset == orig_jnl_start) {
1099                         // if there's nothing in the journal at all, just bail out altogether.
1100                         goto bad_replay;
1101                     }
1102
1103                     bad_blocks = 1;
1104                     goto bad_txn_handling;
1105                 }
1106
1107                 if (   (last_sequence_num != 0)
1108                     && (blhdr->binfo[0].b.sequence_num != 0)
1109                     && (blhdr->binfo[0].b.sequence_num != last_sequence_num)
1110                     && (blhdr->binfo[0].b.sequence_num != last_sequence_num+1)) {
1111
1112                     txn_start_offset = jnl->jhdr->end = blhdr_offset;
1113
1114                     if (check_past_jnl_end) {
1115                         check_past_jnl_end = 0;
1116                         printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
1117                             jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].b.sequence_num, last_sequence_num);
1118                         continue;
1119                     }
1120
1121                     printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
1122                         jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].b.sequence_num, last_sequence_num);
1123                     bad_blocks = 1;
1124                     goto bad_txn_handling;
1125                 }
1126                 last_sequence_num = blhdr->binfo[0].b.sequence_num;
1127
1128                 if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) {
1129                     printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
1130                 }
1131
1132                 if (   blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048
1133                            || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
1134                     printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n",
1135                         jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks);
1136                     bad_blocks = 1;
1137                     goto bad_txn_handling;
1138                 }
1139
1140                 max_bsize = 0;
1141                 for(i=1; i < blhdr->num_blocks; i++) {
1142                         if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
1143                             printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum);
1144                             bad_blocks = 1;
1145                             goto bad_txn_handling;
1146                         }
1147
1148                         if (blhdr->binfo[i].bsize > max_bsize) {
1149                             max_bsize = blhdr->binfo[i].bsize;
1150                         }
1151                 }
1152
1153                 if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) {
1154                     check_block_checksums = 1;
1155                     if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1156                         goto bad_replay;
1157                     }
1158                 } else {
1159                     block_ptr = NULL;
1160                 }
1161
1162                 if (blhdr->flags & BLHDR_FIRST_HEADER) {
1163                     txn_start_offset = blhdr_offset;
1164                 }
1165
1166                 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1167                 //       blhdr->num_blocks-1, jnl->jhdr->start);
1168                 bad_blocks = 0;
1169                 for(i=1; i < blhdr->num_blocks; i++) {
1170                         int size, ret_val;
1171                         off_t number;
1172
1173                         size = blhdr->binfo[i].bsize;
1174                         number = blhdr->binfo[i].bnum;
1175
1176                         // don't add "killed" blocks
1177                         if (number == (off_t)-1) {
1178                             //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1179                         } else {
1180
1181                             if (check_block_checksums) {
1182                                 int32_t disk_cksum;
1183                                 off_t block_offset;
1184
1185                                 block_offset = offset;
1186
1187                                 // read the block so we can check the checksum
1188                                 ret = read_journal_data(jnl, &block_offset, block_ptr, size);
1189                                 if (ret != (size_t)size) {
1190                                     printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
1191                                     bad_blocks = 1;
1192                                     goto bad_txn_handling;
1193                                 }
1194
1195                                 disk_cksum = calc_checksum(block_ptr, size);
1196
1197                                 // there is no need to swap the checksum from disk because
1198                                 // it got swapped when the blhdr was read in.
1199                                 if (blhdr->binfo[i].b.cksum != 0 && disk_cksum != blhdr->binfo[i].b.cksum) {
1200                                     printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
1201                                         jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].b.cksum);
1202                                     printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x  0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
1203                                         *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)],
1204                                         *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]);
1205
1206                                     bad_blocks = 1;
1207                                     goto bad_txn_handling;
1208                                 }
1209                             }
1210
1211
1212                             // add this bucket to co_buf, coalescing where possible
1213                             // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1214                             ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].b.cksum, &num_buckets, &num_full);
1215
1216                             if (ret_val == -1) {
1217                                 printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name);
1218                                 goto bad_replay;
1219                             } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1220                         }
1221
1222                         // increment offset
1223                         offset += size;
1224
1225                         // check if the last block added puts us off the end of the jnl.
1226                         // if so, we need to wrap to the beginning and take any remainder
1227                         // into account
1228                         //
1229                         if (offset >= jnl->jhdr->size) {
1230                             offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
1231                         }
1232                 }
1233
1234                 if (block_ptr) {
1235                     kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1236                     block_ptr = NULL;
1237                 }
1238
1239       bad_txn_handling:
1240                 if (bad_blocks) {
1241                     if (txn_start_offset == 0) {
1242                         printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
1243                         goto bad_replay;
1244                     }
1245
1246                     jnl->jhdr->start = orig_jnl_start;
1247                     jnl->jhdr->end = txn_start_offset;
1248                     check_past_jnl_end = 0;
1249                     last_sequence_num = 0;
1250                     printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1251                     goto restart_replay;
1252                 }
1253
1254                 jnl->jhdr->start += blhdr->bytes_used;
1255                 if (jnl->jhdr->start >= jnl->jhdr->size) {
1256                         // wrap around and skip the journal header block
1257                         jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
1258                 }
1259
1260                 if (jnl->jhdr->start == jnl->jhdr->end) {
1261                     in_uncharted_territory = 1;
1262                 }
1263     }
1264
1265     if (jnl->jhdr->start != jnl->jhdr->end) {
1266         printf("jnl: %s: start %lld != end %lld.  resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1267         jnl->jhdr->end = jnl->jhdr->start;
1268     }
1269
1270     //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1271
1272     /*
1273      * make sure it's at least one page in size, so
1274      * start max_bsize at PAGE_SIZE
1275      */
1276     for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {
1277
1278             if (co_buf[i].block_num == (off_t)-1)
1279                     continue;
1280
1281             if (co_buf[i].block_size > max_bsize)
1282                     max_bsize = co_buf[i].block_size;
1283     }
1284     /*
1285      * round max_bsize up to the nearest PAGE_SIZE multiple
1286      */
1287     if (max_bsize & (PAGE_SIZE - 1)) {
1288             max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1289     }
1290
1291     if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1292         goto bad_replay;
1293     }
1294
1295     // Replay the coalesced entries in the co-buf
1296     for(i=0; i < num_full; i++) {
1297         size_t size = co_buf[i].block_size;
1298         off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
1299         off_t number = co_buf[i].block_num;
1300
1301
1302         // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1303         //      co_buf[i].block_size, co_buf[i].jnl_offset);
1304
1305         if (number == (off_t)-1) {
1306             // printf("jnl: replay_journal: skipping killed fs block\n");
1307         } else {
1308
1309             // do journal read, and set the phys. block
1310             ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
1311             if (ret != size) {
1312                 printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
1313                 goto bad_replay;
1314             }
1315
1316             if (update_fs_block(jnl, block_ptr, number, size) != 0) {
1317                 goto bad_replay;
1318             }
1319         }
1320     }
1321
1322
1323     // done replaying; update jnl header
1324     if (write_journal_header(jnl) != 0) {
1325         goto bad_replay;
1326     }
1327
1328     printf("jnl: %s: journal replay done.\n", jnl->jdev_name);
1329
1330     // free block_ptr
1331     if (block_ptr) {
1332         kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1333         block_ptr = NULL;
1334     }
1335
1336     // free the coalesce buffer
1337     FREE(co_buf, M_TEMP);
1338     co_buf = NULL;
1339
1340     kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1341     return 0;
1342
1343   bad_replay:
1344     if (block_ptr) {
1345                 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1346     }
1347     if (co_buf) {
1348       FREE(co_buf, M_TEMP);
1349     }
1350     kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1351
1352     return -1;
1353 }
1354
1355
1356 #define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
1357 //#define DEFAULT_TRANSACTION_BUFFER_SIZE  (256*1024)  // better performance but uses more mem
1358 #define MAX_TRANSACTION_BUFFER_SIZE      (512*1024)
1359
1360 // XXXdbg - so I can change it in the debugger
1361 int def_tbuffer_size = 0;
1362
1363
1364 //
1365 // This function sets the size of the tbuffer and the
1366 // size of the blhdr.  It assumes that jnl->jhdr->size
1367 // and jnl->jhdr->jhdr_size are already valid.
1368 //
1369 static void
1370 size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
1371 {
1372         //
1373         // one-time initialization based on how much memory
1374         // there is in the machine.
1375         //
1376         if (def_tbuffer_size == 0) {
1377                 if (mem_size < (256*1024*1024)) {
1378                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
1379                 } else if (mem_size < (512*1024*1024)) {
1380                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
1381                 } else if (mem_size < (1024*1024*1024)) {
1382                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
1383                 } else if (mem_size >= (1024*1024*1024)) {
1384                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 4;
1385                 }
1386         }
1387
1388     // size up the transaction buffer... can't be larger than the number
1389     // of blocks that can fit in a block_list_header block.
1390     if (tbuffer_size == 0) {
1391                 jnl->tbuffer_size = def_tbuffer_size;
1392     } else {
1393                 // make sure that the specified tbuffer_size isn't too small
1394                 if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
1395                         tbuffer_size = jnl->jhdr->blhdr_size * 2;
1396                 }
1397                 // and make sure it's an even multiple of the block size
1398                 if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
1399                         tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
1400                 }
1401
1402                 jnl->tbuffer_size = tbuffer_size;
1403     }
1404
1405     if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
1406                 jnl->tbuffer_size = (jnl->jhdr->size / 2);
1407     }
1408
1409     if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
1410                 jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
1411     }
1412
1413     jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
1414     if (jnl->jhdr->blhdr_size < phys_blksz) {
1415         jnl->jhdr->blhdr_size = phys_blksz;
1416     } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
1417                 // have to round up so we're an even multiple of the physical block size
1418                 jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
1419     }
1420 }
1421
1422
1423
1424 static void
1425 get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context)
1426 {
1427     off_t       readblockcnt;
1428     off_t       writeblockcnt;
1429     off_t       readmaxcnt;
1430     off_t       writemaxcnt;
1431     int32_t     features;
1432
1433     if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
1434         if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
1435             const char *name = vnode_name(devvp);
1436             jnl->flags |= JOURNAL_DO_FUA_WRITES;
1437             printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features);
1438         }
1439     }
1440
1441     if (VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context)) {
1442         readmaxcnt = 0;
1443     }
1444
1445     if (readmaxcnt == 0) {
1446         if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context)) {
1447             readmaxcnt = 128 * 1024;
1448         } else {
1449             readmaxcnt = readblockcnt * phys_blksz;
1450         }
1451     }
1452
1453
1454     if (VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context)) {
1455         writemaxcnt = 0;
1456     }
1457
1458     if (writemaxcnt == 0) {
1459         if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context)) {
1460             writemaxcnt = 128 * 1024;
1461         } else {
1462             writemaxcnt = writeblockcnt * phys_blksz;
1463         }
1464     }
1465
1466     jnl->max_read_size  = readmaxcnt;
1467     jnl->max_write_size = writemaxcnt;
1468
1469     // just in case it's still zero...
1470     if (jnl->max_read_size == 0) {
1471         jnl->max_read_size = 128 * 1024;
1472         jnl->max_write_size = 128 * 1024;
1473     }
1474 }
1475
1476
1477 static const char *
1478 get_jdev_name(struct vnode *jvp)
1479 {
1480     const char *jdev_name;
1481
1482     jdev_name = vnode_name(jvp);
1483     if (jdev_name == NULL) {
1484         jdev_name = vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0);
1485     } else {
1486         // this just bumps the refcount on the name so we have our own copy
1487         jdev_name = vfs_addname(jdev_name, strlen(jdev_name), 0, 0);
1488     }
1489
1490     return jdev_name;
1491 }
1492
1493
1494 journal *
1495 journal_create(struct vnode *jvp,
1496                            off_t         offset,
1497                            off_t         journal_size,
1498                            struct vnode *fsvp,
1499                            size_t        min_fs_blksz,
1500                            int32_t       flags,
1501                            int32_t       tbuffer_size,
1502                            void        (*flush)(void *arg),
1503                            void         *arg)
1504 {
1505     journal *jnl;
1506     size_t      phys_blksz;
1507     struct vfs_context context;
1508     const char *jdev_name;
1509
1510     context.vc_thread = current_thread();
1511     context.vc_ucred = FSCRED;
1512
1513     jdev_name = get_jdev_name(jvp);
1514
1515     /* Get the real physical block size. */
1516     if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1517         return NULL;
1518     }
1519
1520     if (phys_blksz > min_fs_blksz) {
1521                 printf("jnl: %s: create: error: phys blksize %lu bigger than min fs blksize %lu\n",
1522                     jdev_name, phys_blksz, min_fs_blksz);
1523                 return NULL;
1524     }
1525
1526     if ((journal_size % phys_blksz) != 0) {
1527                 printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%lx\n",
1528                     jdev_name, journal_size, phys_blksz);
1529                 return NULL;
1530     }
1531
1532
1533     MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1534     memset(jnl, 0, sizeof(*jnl));
1535
1536     jnl->jdev         = jvp;
1537     jnl->jdev_offset  = offset;
1538     jnl->fsdev        = fsvp;
1539     jnl->flush        = flush;
1540     jnl->flush_arg    = arg;
1541     jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
1542     jnl->jdev_name    = jdev_name;
1543     lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1544
1545     get_io_info(jvp, phys_blksz, jnl, &context);
1546
1547     if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1548         printf("jnl: %s: create: could not allocate space for header buffer (%lu bytes)\n", jdev_name, phys_blksz);
1549         goto bad_kmem_alloc;
1550     }
1551
1552     memset(jnl->header_buf, 0, phys_blksz);
1553
1554     jnl->jhdr             = (journal_header *)jnl->header_buf;
1555     jnl->jhdr->magic      = JOURNAL_HEADER_MAGIC;
1556     jnl->jhdr->endian     = ENDIAN_MAGIC;
1557     jnl->jhdr->start      = phys_blksz;    // start at block #1, block #0 is for the jhdr itself
1558     jnl->jhdr->end        = phys_blksz;
1559     jnl->jhdr->size       = journal_size;
1560     jnl->jhdr->jhdr_size  = phys_blksz;
1561     size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1562
1563         jnl->active_start     = jnl->jhdr->start;
1564
1565     // XXXdbg  - for testing you can force the journal to wrap around
1566     // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
1567     // jnl->jhdr->end   = jnl->jhdr->size - (phys_blksz*3);
1568
1569     jnl->jhdr->sequence_num = random() & 0x00ffffff;
1570
1571         lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
1572
1573     if (write_journal_header(jnl) != 0) {
1574         printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name);
1575         goto bad_write;
1576     }
1577
1578     return jnl;
1579
1580
1581   bad_write:
1582     kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1583   bad_kmem_alloc:
1584     if (jdev_name) {
1585         vfs_removename(jdev_name);
1586     }
1587     jnl->jhdr = NULL;
1588     FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1589     return NULL;
1590 }
1591
1592
1593 journal *
1594 journal_open(struct vnode *jvp,
1595                          off_t         offset,
1596                          off_t         journal_size,
1597                          struct vnode *fsvp,
1598                          size_t        min_fs_blksz,
1599                          int32_t       flags,
1600                          int32_t       tbuffer_size,
1601                          void        (*flush)(void *arg),
1602                          void         *arg)
1603 {
1604     journal *jnl;
1605     int      orig_blksz=0;
1606     size_t   phys_blksz;
1607     int      orig_checksum, checksum;
1608     struct vfs_context context;
1609     const char *jdev_name = get_jdev_name(jvp);
1610
1611     context.vc_thread = current_thread();
1612     context.vc_ucred = FSCRED;
1613
1614     /* Get the real physical block size. */
1615     if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1616                 return NULL;
1617     }
1618
1619     if (phys_blksz > min_fs_blksz) {
1620                 printf("jnl: %s: open: error: phys blksize %lu bigger than min fs blksize %lu\n",
1621                     jdev_name, phys_blksz, min_fs_blksz);
1622                 return NULL;
1623     }
1624
1625     if ((journal_size % phys_blksz) != 0) {
1626                 printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%lx\n",
1627                     jdev_name, journal_size, phys_blksz);
1628                 return NULL;
1629     }
1630
1631     MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1632     memset(jnl, 0, sizeof(*jnl));
1633
1634     jnl->jdev         = jvp;
1635     jnl->jdev_offset  = offset;
1636     jnl->fsdev        = fsvp;
1637     jnl->flush        = flush;
1638     jnl->flush_arg    = arg;
1639     jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
1640     jnl->jdev_name    = jdev_name;
1641     lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1642
1643     get_io_info(jvp, phys_blksz, jnl, &context);
1644
1645     if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1646         printf("jnl: %s: create: could not allocate space for header buffer (%lu bytes)\n", jdev_name, phys_blksz);
1647         goto bad_kmem_alloc;
1648     }
1649
1650     jnl->jhdr = (journal_header *)jnl->header_buf;
1651     memset(jnl->jhdr, 0, sizeof(journal_header));
1652
1653     // we have to set this up here so that do_journal_io() will work
1654     jnl->jhdr->jhdr_size = phys_blksz;
1655
1656     if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
1657                 printf("jnl: %s: open: could not read %lu bytes for the journal header.\n",
1658                     jdev_name, phys_blksz);
1659                 goto bad_journal;
1660     }
1661
1662         orig_checksum = jnl->jhdr->checksum;
1663         jnl->jhdr->checksum = 0;
1664
1665         if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1666                 // do this before the swap since it's done byte-at-a-time
1667                 orig_checksum = SWAP32(orig_checksum);
1668                 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1669                 swap_journal_header(jnl);
1670                 jnl->flags |= JOURNAL_NEED_SWAP;
1671         } else {
1672                 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1673         }
1674
1675     if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1676                 printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n",
1677                     jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
1678                 goto bad_journal;
1679     }
1680
1681         // only check if we're the current journal header magic value
1682         if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
1683
1684                 if (orig_checksum != checksum) {
1685                         printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n",
1686                             jdev_name, orig_checksum, checksum);
1687
1688                         //goto bad_journal;
1689                 }
1690         }
1691
1692         // XXXdbg - convert old style magic numbers to the new one
1693         if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
1694                 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
1695         }
1696
1697     if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
1698                 printf("jnl: %s: open: phys_blksz %lu does not match journal header size %d\n",
1699                     jdev_name, phys_blksz, jnl->jhdr->jhdr_size);
1700
1701                 orig_blksz = phys_blksz;
1702                 phys_blksz = jnl->jhdr->jhdr_size;
1703                 if (VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context)) {
1704                     printf("jnl: %s: could not set block size to %lu bytes.\n", jdev_name, phys_blksz);
1705                     goto bad_journal;
1706                 }
1707 //              goto bad_journal;
1708     }
1709
1710     if (   jnl->jhdr->start <= 0
1711                    || jnl->jhdr->start > jnl->jhdr->size
1712                    || jnl->jhdr->start > 1024*1024*1024) {
1713                 printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
1714                     jdev_name, jnl->jhdr->start, jnl->jhdr->size);
1715                 goto bad_journal;
1716     }
1717
1718     if (   jnl->jhdr->end <= 0
1719                    || jnl->jhdr->end > jnl->jhdr->size
1720                    || jnl->jhdr->end > 1024*1024*1024) {
1721                 printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
1722                     jdev_name, jnl->jhdr->end, jnl->jhdr->size);
1723                 goto bad_journal;
1724     }
1725
1726     if (jnl->jhdr->size > 1024*1024*1024) {
1727         printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size);
1728         goto bad_journal;
1729     }
1730
1731 // XXXdbg - can't do these checks because hfs writes all kinds of
1732 //          non-uniform sized blocks even on devices that have a block size
1733 //          that is larger than 512 bytes (i.e. optical media w/2k blocks).
1734 //          therefore these checks will fail and so we just have to punt and
1735 //          do more relaxed checking...
1736 // XXXdbg    if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
1737     if ((jnl->jhdr->start % 512) != 0) {
1738                 printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n",
1739                     jdev_name, jnl->jhdr->start);
1740                 goto bad_journal;
1741     }
1742
1743 //XXXdbg    if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
1744     if ((jnl->jhdr->end % 512) != 0) {
1745                 printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
1746                     jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size);
1747                 goto bad_journal;
1748     }
1749
1750     // take care of replaying the journal if necessary
1751     if (flags & JOURNAL_RESET) {
1752         printf("jnl: %s: journal start/end pointers reset! (jnl %p; s 0x%llx e 0x%llx)\n",
1753             jdev_name, jnl, jnl->jhdr->start, jnl->jhdr->end);
1754         jnl->jhdr->start = jnl->jhdr->end;
1755     } else if (replay_journal(jnl) != 0) {
1756         printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
1757         goto bad_journal;
1758     }
1759
1760     if (orig_blksz != 0) {
1761         VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
1762         phys_blksz = orig_blksz;
1763         if (orig_blksz < jnl->jhdr->jhdr_size) {
1764             printf("jnl: %s: open: jhdr_size is %d but orig phys blk size is %d.  switching.\n",
1765                 jdev_name, jnl->jhdr->jhdr_size, orig_blksz);
1766
1767             jnl->jhdr->jhdr_size = orig_blksz;
1768         }
1769     }
1770
1771     // make sure this is in sync!
1772     jnl->active_start = jnl->jhdr->start;
1773
1774     // set this now, after we've replayed the journal
1775     size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1776
1777     lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
1778
1779     return jnl;
1780
1781   bad_journal:
1782     if (orig_blksz != 0) {
1783         phys_blksz = orig_blksz;
1784         VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
1785     }
1786     kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1787   bad_kmem_alloc:
1788     if (jdev_name) {
1789         vfs_removename(jdev_name);
1790     }
1791     FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1792     return NULL;
1793 }
1794
1795
1796 int
1797 journal_is_clean(struct vnode *jvp,
1798                  off_t         offset,
1799                  off_t         journal_size,
1800                  struct vnode *fsvp,
1801                  size_t        min_fs_block_size)
1802 {
1803     journal jnl;
1804     int     phys_blksz, ret;
1805     int     orig_checksum, checksum;
1806     struct vfs_context context;
1807     const char *jdev_name = get_jdev_name(jvp);
1808
1809     context.vc_thread = current_thread();
1810     context.vc_ucred = FSCRED;
1811
1812     /* Get the real physical block size. */
1813     if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1814         printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
1815         return EINVAL;
1816     }
1817
1818     if (phys_blksz > (int)min_fs_block_size) {
1819         printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %lu\n",
1820             jdev_name, phys_blksz, min_fs_block_size);
1821         return EINVAL;
1822     }
1823
1824     if ((journal_size % phys_blksz) != 0) {
1825         printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1826             jdev_name, journal_size, phys_blksz);
1827         return EINVAL;
1828     }
1829
1830     memset(&jnl, 0, sizeof(jnl));
1831
1832     if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
1833         printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
1834         return ENOMEM;
1835     }
1836
1837     get_io_info(jvp, phys_blksz, &jnl, &context);
1838
1839     jnl.jhdr = (journal_header *)jnl.header_buf;
1840     memset(jnl.jhdr, 0, sizeof(journal_header));
1841
1842     jnl.jdev        = jvp;
1843     jnl.jdev_offset = offset;
1844     jnl.fsdev       = fsvp;
1845
1846     // we have to set this up here so that do_journal_io() will work
1847     jnl.jhdr->jhdr_size = phys_blksz;
1848
1849     if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
1850         printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n",
1851             jdev_name, phys_blksz);
1852         ret = EINVAL;
1853         goto get_out;
1854     }
1855
1856     orig_checksum = jnl.jhdr->checksum;
1857     jnl.jhdr->checksum = 0;
1858
1859     if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1860         // do this before the swap since it's done byte-at-a-time
1861         orig_checksum = SWAP32(orig_checksum);
1862         checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1863         swap_journal_header(&jnl);
1864         jnl.flags |= JOURNAL_NEED_SWAP;
1865     } else {
1866         checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1867     }
1868
1869     if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1870         printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n",
1871             jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
1872         ret = EINVAL;
1873         goto get_out;
1874     }
1875
1876     if (orig_checksum != checksum) {
1877         printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum);
1878         ret = EINVAL;
1879         goto get_out;
1880     }
1881
1882     //
1883     // if the start and end are equal then the journal is clean.
1884     // otherwise it's not clean and therefore an error.
1885     //
1886     if (jnl.jhdr->start == jnl.jhdr->end) {
1887         ret = 0;
1888     } else {
1889         ret = EINVAL;
1890     }
1891
1892   get_out:
1893     kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
1894     if (jdev_name) {
1895         vfs_removename(jdev_name);
1896     }
1897
1898     return ret;
1899
1900
1901 }
1902
1903
1904 void
1905 journal_close(journal *jnl)
1906 {
1907     volatile off_t *start, *end;
1908     int             counter=0;
1909
1910     CHECK_JOURNAL(jnl);
1911
1912         // set this before doing anything that would block so that
1913         // we start tearing things down properly.
1914         //
1915         jnl->flags |= JOURNAL_CLOSE_PENDING;
1916
1917     if (jnl->owner != current_thread()) {
1918                 lock_journal(jnl);
1919     }
1920
1921     //
1922     // only write stuff to disk if the journal is still valid
1923     //
1924     if ((jnl->flags & JOURNAL_INVALID) == 0) {
1925
1926                 if (jnl->active_tr) {
1927                         journal_end_transaction(jnl);
1928                 }
1929
1930                 // flush any buffered transactions
1931                 if (jnl->cur_tr) {
1932                         transaction *tr = jnl->cur_tr;
1933
1934                         jnl->cur_tr = NULL;
1935                         end_transaction(tr, 1, NULL, NULL);   // force it to get flushed
1936                 }
1937
1938                 //start = &jnl->jhdr->start;
1939                 start = &jnl->active_start;
1940                 end   = &jnl->jhdr->end;
1941
1942                 while (*start != *end && counter++ < 5000) {
1943                         //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
1944                         if (jnl->flush) {
1945                                 jnl->flush(jnl->flush_arg);
1946                         }
1947                         tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 2);
1948                 }
1949
1950                 if (*start != *end) {
1951                         printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
1952                             jnl->jdev_name, *start, *end);
1953                 }
1954
1955                 // make sure this is in sync when we close the journal
1956                 jnl->jhdr->start = jnl->active_start;
1957
1958                 // if this fails there's not much we can do at this point...
1959                 write_journal_header(jnl);
1960     } else {
1961                 // if we're here the journal isn't valid any more.
1962                 // so make sure we don't leave any locked blocks lying around
1963                 printf("jnl: %s: close: journal %p, is invalid.  aborting outstanding transactions\n", jnl->jdev_name, jnl);
1964                 if (jnl->active_tr || jnl->cur_tr) {
1965                         transaction *tr;
1966                         if (jnl->active_tr) {
1967                                 tr = jnl->active_tr;
1968                                 jnl->active_tr = NULL;
1969                         } else {
1970                                 tr = jnl->cur_tr;
1971                                 jnl->cur_tr = NULL;
1972                         }
1973
1974                         abort_transaction(jnl, tr);
1975                         if (jnl->active_tr || jnl->cur_tr) {
1976                             panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl);
1977                         }
1978                 }
1979     }
1980
1981     free_old_stuff(jnl);
1982
1983     kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size);
1984     jnl->jhdr = (void *)0xbeefbabe;
1985
1986     if (jnl->jdev_name) {
1987         vfs_removename(jnl->jdev_name);
1988     }
1989
1990     FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1991 }
1992
1993 static void
1994 dump_journal(journal *jnl)
1995 {
1996     transaction *ctr;
1997
1998     printf("journal for dev %s:", jnl->jdev_name);
1999     printf("  jdev_offset %.8llx\n", jnl->jdev_offset);
2000     printf("  magic: 0x%.8x\n", jnl->jhdr->magic);
2001     printf("  start: 0x%.8llx\n", jnl->jhdr->start);
2002     printf("  end:   0x%.8llx\n", jnl->jhdr->end);
2003     printf("  size:  0x%.8llx\n", jnl->jhdr->size);
2004     printf("  blhdr size: %d\n", jnl->jhdr->blhdr_size);
2005     printf("  jhdr size: %d\n", jnl->jhdr->jhdr_size);
2006     printf("  chksum: 0x%.8x\n", jnl->jhdr->checksum);
2007
2008     printf("  completed transactions:\n");
2009     for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) {
2010                 printf("    0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
2011     }
2012 }
2013
2014
2015
2016 static off_t
2017 free_space(journal *jnl)
2018 {
2019     off_t free_space_offset;
2020
2021     if (jnl->jhdr->start < jnl->jhdr->end) {
2022                 free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
2023     } else if (jnl->jhdr->start > jnl->jhdr->end) {
2024                 free_space_offset = jnl->jhdr->start - jnl->jhdr->end;
2025     } else {
2026                 // journal is completely empty
2027                 free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size;
2028     }
2029
2030     return free_space_offset;
2031 }
2032
2033
2034 //
2035 // The journal must be locked on entry to this function.
2036 // The "desired_size" is in bytes.
2037 //
2038 static int
2039 check_free_space(journal *jnl, int desired_size)
2040 {
2041     size_t i;
2042     int    counter=0;
2043
2044     //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
2045 //         desired_size, free_space(jnl));
2046
2047     while (1) {
2048                 int old_start_empty;
2049
2050                 if (counter++ == 5000) {
2051                         dump_journal(jnl);
2052                         panic("jnl: check_free_space: buffer flushing isn't working "
2053                                   "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl,
2054                                   jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
2055                 }
2056                 if (counter > 7500) {
2057                     printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name);
2058                     return ENOSPC;
2059                 }
2060
2061                 // make sure there's space in the journal to hold this transaction
2062                 if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) {
2063                         break;
2064                 }
2065                 //
2066                 // here's where we lazily bump up jnl->jhdr->start.  we'll consume
2067                 // entries until there is enough space for the next transaction.
2068                 //
2069                 old_start_empty = 1;
2070                 lock_oldstart(jnl);
2071                 for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
2072                         int   lcl_counter;
2073
2074                         lcl_counter = 0;
2075                         while (jnl->old_start[i] & 0x8000000000000000LL) {
2076                                 if (lcl_counter++ > 100) {
2077                                         panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
2078                                                   jnl->old_start[i], jnl);
2079                                 }
2080
2081                                 unlock_oldstart(jnl);
2082                                 if (jnl->flush) {
2083                                         jnl->flush(jnl->flush_arg);
2084                                 }
2085                                 tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
2086                                 lock_oldstart(jnl);
2087                         }
2088
2089                         if (jnl->old_start[i] == 0) {
2090                                 continue;
2091                         }
2092
2093                         old_start_empty   = 0;
2094                         jnl->jhdr->start  = jnl->old_start[i];
2095                         jnl->old_start[i] = 0;
2096                         if (free_space(jnl) > desired_size) {
2097                                 unlock_oldstart(jnl);
2098                                 write_journal_header(jnl);
2099                                 lock_oldstart(jnl);
2100                                 break;
2101                         }
2102                 }
2103                 unlock_oldstart(jnl);
2104
2105                 // if we bumped the start, loop and try again
2106                 if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
2107                         continue;
2108                 } else if (old_start_empty) {
2109                         //
2110                         // if there is nothing in old_start anymore then we can
2111                         // bump the jhdr->start to be the same as active_start
2112                         // since it is possible there was only one very large
2113                         // transaction in the old_start array.  if we didn't do
2114                         // this then jhdr->start would never get updated and we
2115                         // would wind up looping until we hit the panic at the
2116                         // start of the loop.
2117                         //
2118                         jnl->jhdr->start = jnl->active_start;
2119                         write_journal_header(jnl);
2120                         continue;
2121                 }
2122
2123
2124                 // if the file system gave us a flush function, call it to so that
2125                 // it can flush some blocks which hopefully will cause some transactions
2126                 // to complete and thus free up space in the journal.
2127                 if (jnl->flush) {
2128                         jnl->flush(jnl->flush_arg);
2129                 }
2130
2131                 // wait for a while to avoid being cpu-bound (this will
2132                 // put us to sleep for 10 milliseconds)
2133                 tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
2134     }
2135
2136     return 0;
2137 }
2138
2139 /*
2140  * Allocate a new active transaction.
2141  */
2142 static errno_t
2143 journal_allocate_transaction(journal *jnl)
2144 {
2145         transaction *tr;
2146
2147         MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
2148     memset(tr, 0, sizeof(transaction));
2149
2150     tr->tbuffer_size = jnl->tbuffer_size;
2151
2152     if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
2153                 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
2154                 jnl->active_tr = NULL;
2155                 return ENOMEM;
2156     }
2157
2158     // journal replay code checksum check depends on this.
2159     memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
2160     // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
2161     memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2162
2163     tr->blhdr = (block_list_header *)tr->tbuffer;
2164     tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2165     tr->blhdr->num_blocks = 1;      // accounts for this header block
2166     tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
2167     tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER;
2168
2169     tr->sequence_num = ++jnl->jhdr->sequence_num;
2170     tr->num_blhdrs  = 1;
2171     tr->total_bytes = jnl->jhdr->blhdr_size;
2172     tr->jnl         = jnl;
2173
2174         jnl->active_tr  = tr;
2175
2176         return 0;
2177 }
2178
2179 int
2180 journal_start_transaction(journal *jnl)
2181 {
2182     int ret;
2183
2184     CHECK_JOURNAL(jnl);
2185
2186     if (jnl->flags & JOURNAL_INVALID) {
2187                 return EINVAL;
2188     }
2189
2190     if (jnl->owner == current_thread()) {
2191                 if (jnl->active_tr == NULL) {
2192                         panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
2193                                   jnl, jnl->owner, current_thread());
2194                 }
2195                 jnl->nested_count++;
2196                 return 0;
2197     }
2198
2199     lock_journal(jnl);
2200
2201     if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
2202                 panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
2203                           jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
2204     }
2205
2206     jnl->owner        = current_thread();
2207     jnl->nested_count = 1;
2208
2209     free_old_stuff(jnl);
2210
2211     // make sure there's room in the journal
2212     if (free_space(jnl) < jnl->tbuffer_size) {
2213         // this is the call that really waits for space to free up
2214         // as well as updating jnl->jhdr->start
2215         if (check_free_space(jnl, jnl->tbuffer_size) != 0) {
2216                 printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name);
2217                 ret = ENOSPC;
2218                 goto bad_start;
2219         }
2220     }
2221
2222     // if there's a buffered transaction, use it.
2223     if (jnl->cur_tr) {
2224                 jnl->active_tr = jnl->cur_tr;
2225                 jnl->cur_tr    = NULL;
2226
2227                 return 0;
2228     }
2229
2230         ret = journal_allocate_transaction(jnl);
2231         if (ret) {
2232                 goto bad_start;
2233         }
2234
2235     // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);
2236
2237     return 0;
2238
2239   bad_start:
2240         jnl->owner        = NULL;
2241         jnl->nested_count = 0;
2242         unlock_journal(jnl);
2243         return ret;
2244 }
2245
2246
2247 int
2248 journal_modify_block_start(journal *jnl, struct buf *bp)
2249 {
2250     transaction *tr;
2251
2252     CHECK_JOURNAL(jnl);
2253
2254     if (jnl->flags & JOURNAL_INVALID) {
2255                 return EINVAL;
2256     }
2257
2258     // XXXdbg - for debugging I want this to be true.  later it may
2259     //          not be necessary.
2260     if ((buf_flags(bp) & B_META) == 0) {
2261                 panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl);
2262     }
2263
2264     tr = jnl->active_tr;
2265     CHECK_TRANSACTION(tr);
2266
2267     if (jnl->owner != current_thread()) {
2268                 panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2269                           jnl, jnl->owner, current_thread());
2270     }
2271
2272     free_old_stuff(jnl);
2273
2274     //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
2275     //   bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2276
2277     // can't allow blocks that aren't an even multiple of the
2278     // underlying block size.
2279     if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) {
2280                 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
2281                           buf_size(bp), jnl->jhdr->jhdr_size);
2282                 return -1;
2283     }
2284
2285     // make sure that this transaction isn't bigger than the whole journal
2286     if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
2287                 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
2288                           tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp);
2289                 return -1;
2290     }
2291
2292     // if the block is dirty and not already locked we have to write
2293     // it out before we muck with it because it has data that belongs
2294     // (presumably) to another transaction.
2295     //
2296     if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) {
2297
2298                 if (buf_flags(bp) & B_ASYNC) {
2299                         panic("modify_block_start: bp @ %p has async flag set!\n", bp);
2300                 }
2301
2302                 // this will cause it to not be buf_brelse()'d
2303                 buf_setflags(bp, B_NORELSE);
2304                 VNOP_BWRITE(bp);
2305     }
2306     buf_setflags(bp, B_LOCKED);
2307
2308     return 0;
2309 }
2310
2311 int
2312 journal_modify_block_abort(journal *jnl, struct buf *bp)
2313 {
2314     transaction *tr;
2315         block_list_header *blhdr;
2316         int i;
2317
2318     CHECK_JOURNAL(jnl);
2319
2320     tr = jnl->active_tr;
2321
2322         //
2323         // if there's no active transaction then we just want to
2324         // call buf_brelse() and return since this is just a block
2325         // that happened to be modified as part of another tr.
2326         //
2327         if (tr == NULL) {
2328                 buf_brelse(bp);
2329                 return 0;
2330         }
2331
2332     if (jnl->flags & JOURNAL_INVALID) {
2333                 return EINVAL;
2334     }
2335
2336     CHECK_TRANSACTION(tr);
2337
2338     if (jnl->owner != current_thread()) {
2339                 panic("jnl: modify_block_abort: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2340                           jnl, jnl->owner, current_thread());
2341     }
2342
2343     free_old_stuff(jnl);
2344
2345     // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
2346
2347     // first check if it's already part of this transaction
2348     for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2349                 for(i=1; i < blhdr->num_blocks; i++) {
2350                         if (bp == blhdr->binfo[i].b.bp) {
2351                                 if (buf_size(bp) != blhdr->binfo[i].bsize) {
2352                                         panic("jnl: bp @ %p changed size on me! (%d vs. %lu, jnl %p)\n",
2353                                                   bp, buf_size(bp), blhdr->binfo[i].bsize, jnl);
2354                                 }
2355                                 break;
2356                         }
2357                 }
2358
2359                 if (i < blhdr->num_blocks) {
2360                         break;
2361                 }
2362     }
2363
2364         //
2365         // if blhdr is null, then this block has only had modify_block_start
2366         // called on it as part of the current transaction.  that means that
2367         // it is ok to clear the LOCKED bit since it hasn't actually been
2368         // modified.  if blhdr is non-null then modify_block_end was called
2369         // on it and so we need to keep it locked in memory.
2370         //
2371         if (blhdr == NULL) {
2372                   buf_clearflags(bp, B_LOCKED);
2373         }
2374
2375     buf_brelse(bp);
2376     return 0;
2377 }
2378
2379
2380 int
2381 journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf *bp, void *arg), void *arg)
2382 {
2383     int                i = 1;
2384     int                tbuffer_offset=0;
2385     char              *blkptr;
2386     block_list_header *blhdr, *prev=NULL;
2387     transaction       *tr;
2388
2389     CHECK_JOURNAL(jnl);
2390
2391     if (jnl->flags & JOURNAL_INVALID) {
2392                 return EINVAL;
2393     }
2394
2395     tr = jnl->active_tr;
2396     CHECK_TRANSACTION(tr);
2397
2398     if (jnl->owner != current_thread()) {
2399                 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2400                           jnl, jnl->owner, current_thread());
2401     }
2402
2403     free_old_stuff(jnl);
2404
2405     //printf("jnl: mod block end:  (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n",
2406     //   bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2407
2408     if ((buf_flags(bp) & B_LOCKED) == 0) {
2409                 panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp, jnl);
2410     }
2411
2412     // first check if it's already part of this transaction
2413     for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2414                 tbuffer_offset = jnl->jhdr->blhdr_size;
2415
2416                 for(i=1; i < blhdr->num_blocks; i++) {
2417                         if (bp == blhdr->binfo[i].b.bp) {
2418                                 if (buf_size(bp) != blhdr->binfo[i].bsize) {
2419                                         panic("jnl: bp @ %p changed size on me! (%d vs. %lu, jnl %p)\n",
2420                                                   bp, buf_size(bp), blhdr->binfo[i].bsize, jnl);
2421                                 }
2422                                 break;
2423                         }
2424                         tbuffer_offset += blhdr->binfo[i].bsize;
2425                 }
2426
2427                 if (i < blhdr->num_blocks) {
2428                         break;
2429                 }
2430     }
2431
2432     if (blhdr == NULL
2433                 && prev
2434                 && (prev->num_blocks+1) <= prev->max_blocks
2435                 && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) {
2436                 blhdr = prev;
2437     } else if (blhdr == NULL) {
2438                 block_list_header *nblhdr;
2439
2440                 if (prev == NULL) {
2441                         panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp);
2442                 }
2443
2444                 // we got to the end of the list, didn't find the block and there's
2445                 // no room in the block_list_header pointed to by prev
2446
2447                 // we allocate another tbuffer and link it in at the end of the list
2448                 // through prev->binfo[0].bnum.  that's a skanky way to do things but
2449                 // avoids having yet another linked list of small data structures to manage.
2450
2451                 if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
2452                         panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n",
2453                                   tr, tr->total_bytes);
2454                 }
2455
2456                 // journal replay code checksum check depends on this.
2457                 memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
2458                 // Fill up the rest of the block with unimportant bytes
2459                 memset(nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2460
2461                 // initialize the new guy
2462                 nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2463                 nblhdr->num_blocks = 1;      // accounts for this header block
2464                 nblhdr->bytes_used = jnl->jhdr->blhdr_size;
2465                 nblhdr->flags = BLHDR_CHECK_CHECKSUMS;
2466
2467                 tr->num_blhdrs++;
2468                 tr->total_bytes += jnl->jhdr->blhdr_size;
2469
2470                 // then link him in at the end
2471                 prev->binfo[0].bnum = (off_t)((long)nblhdr);
2472
2473                 // and finally switch to using the new guy
2474                 blhdr          = nblhdr;
2475                 tbuffer_offset = jnl->jhdr->blhdr_size;
2476                 i              = 1;
2477     }
2478
2479
2480     if ((i+1) > blhdr->max_blocks) {
2481                 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
2482     }
2483
2484         // if the function pointer is not set then copy the
2485         // block of data now.  if the function pointer is set
2486         // the copy will happen after calling the callback in
2487         // end_transaction() just before it goes to disk.
2488         //
2489         if (func == NULL) {
2490                 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
2491                 memcpy(blkptr, (char *)0 + buf_dataptr(bp), buf_size(bp));
2492         }
2493
2494     // if this is true then this is a new block we haven't seen
2495     if (i >= blhdr->num_blocks) {
2496                 int     bsize;
2497                 vnode_t vp;
2498
2499                 vp = buf_vnode(bp);
2500                 vnode_ref(vp);
2501                 bsize = buf_size(bp);
2502
2503                 blhdr->binfo[i].bnum  = (off_t)(buf_blkno(bp));
2504                 blhdr->binfo[i].bsize = bsize;
2505                 blhdr->binfo[i].b.bp    = bp;
2506                 if (func) {
2507                         void *old_func=NULL, *old_arg=NULL;
2508
2509                         buf_setfilter(bp, func, arg, &old_func, &old_arg);
2510                         if (old_func != NULL) {
2511                                 panic("jnl: modify_block_end: old func %p / arg %p", old_func, old_arg);
2512                         }
2513                 }
2514
2515                 blhdr->bytes_used += bsize;
2516                 tr->total_bytes   += bsize;
2517
2518                 blhdr->num_blocks++;
2519     }
2520     buf_bdwrite(bp);
2521
2522     return 0;
2523 }
2524
2525 int
2526 journal_kill_block(journal *jnl, struct buf *bp)
2527 {
2528     int                i;
2529     int                bflags;
2530     block_list_header *blhdr;
2531     transaction       *tr;
2532
2533     CHECK_JOURNAL(jnl);
2534
2535     if (jnl->flags & JOURNAL_INVALID) {
2536                 return EINVAL;
2537     }
2538
2539     tr = jnl->active_tr;
2540     CHECK_TRANSACTION(tr);
2541
2542     if (jnl->owner != current_thread()) {
2543                 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2544                           jnl, jnl->owner, current_thread());
2545     }
2546
2547     free_old_stuff(jnl);
2548
2549     bflags = buf_flags(bp);
2550
2551     if ( !(bflags & B_LOCKED))
2552             panic("jnl: modify_block_end: called with bp not B_LOCKED");
2553
2554     /*
2555      * bp must be BL_BUSY and B_LOCKED
2556      */
2557     // first check if it's already part of this transaction
2558     for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2559
2560                 for(i=1; i < blhdr->num_blocks; i++) {
2561                         if (bp == blhdr->binfo[i].b.bp) {
2562                                 vnode_t vp;
2563
2564                                 buf_clearflags(bp, B_LOCKED);
2565
2566                                 // this undoes the vnode_ref() in journal_modify_block_end()
2567                                 vp = buf_vnode(bp);
2568                                 vnode_rele_ext(vp, 0, 1);
2569
2570                                 // if the block has the DELWRI and FILTER bits sets, then
2571                                 // things are seriously weird.  if it was part of another
2572                                 // transaction then journal_modify_block_start() should
2573                                 // have force it to be written.
2574                                 //
2575                                 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
2576                                 //      panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
2577                                 //} else {
2578                                         tr->num_killed += buf_size(bp);
2579                                 //}
2580                                 blhdr->binfo[i].b.bp   = NULL;
2581                                 blhdr->binfo[i].bnum = (off_t)-1;
2582
2583                                 buf_markinvalid(bp);
2584                                 buf_brelse(bp);
2585
2586                                 break;
2587                         }
2588                 }
2589
2590                 if (i < blhdr->num_blocks) {
2591                         break;
2592                 }
2593     }
2594
2595     return 0;
2596 }
2597
2598
2599 static int
2600 journal_binfo_cmp(const void *a, const void *b)
2601 {
2602     const block_info *bi_a = (const struct block_info *)a;
2603     const block_info *bi_b = (const struct block_info *)b;
2604     daddr64_t res;
2605
2606     if (bi_a->b.bp == NULL) {
2607                 return 1;
2608     }
2609     if (bi_b->b.bp == NULL) {
2610                 return -1;
2611     }
2612
2613     // don't have to worry about negative block
2614     // numbers so this is ok to do.
2615     //
2616     res = (buf_blkno(bi_a->b.bp) - buf_blkno(bi_b->b.bp));
2617
2618     return (int)res;
2619 }
2620
2621
2622 /*
2623  * End a transaction.  If the transaction is small enough, and we're not forcing
2624  * a write to disk, the "active" transaction becomes the "current" transaction,
2625  * and will be reused for the next transaction that is started (group commit).
2626  *
2627  * If the transaction gets written to disk (because force_it is true, or no
2628  * group commit, or the transaction is sufficiently full), the blocks get
2629  * written into the journal first, then the are written asynchronously.  When
2630  * those async writes complete, the transaction can be freed and removed from
2631  * the journal.
2632  *
2633  * An optional callback can be supplied.  If given, it is called after the
2634  * the blocks have been written to the journal, but before the async writes
2635  * of those blocks to their normal on-disk locations.  This is used by
2636  * journal_relocate so that the location of the journal can be changed and
2637  * flushed to disk before the blocks get written to their normal locations.
2638  * Note that the callback is only called if the transaction gets written to
2639  * the journal during this end_transaction call; you probably want to set the
2640  * force_it flag.
2641  *
2642  * Inputs:
2643  *      tr                       Transaction to add to the journal
2644  *      force_it         If true, force this transaction to the on-disk journal immediately.
2645  *      callback         See description above.  Pass NULL for no callback.
2646  *      callback_arg Argument passed to callback routine.
2647  *
2648  * Result
2649  *               0              No errors
2650  *              -1              An error occurred.  The journal is marked invalid.
2651  */
2652 static int
2653 end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg)
2654 {
2655     int                 i, ret, amt;
2656     errno_t             errno;
2657     off_t               end;
2658     journal            *jnl = tr->jnl;
2659     struct buf         *bp, **bparray;
2660     block_list_header  *blhdr=NULL, *next=NULL;
2661     size_t              tbuffer_offset;
2662
2663         if (jnl->cur_tr) {
2664                 panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
2665                           jnl, jnl->cur_tr, tr);
2666         }
2667
2668     // if there weren't any modified blocks in the transaction
2669     // just save off the transaction pointer and return.
2670     if (tr->total_bytes == jnl->jhdr->blhdr_size) {
2671                 jnl->cur_tr = tr;
2672                 return 0;
2673     }
2674
2675     // if our transaction buffer isn't very full, just hang
2676     // on to it and don't actually flush anything.  this is
2677     // what is known as "group commit".  we will flush the
2678     // transaction buffer if it's full or if we have more than
2679     // one of them so we don't start hogging too much memory.
2680     //
2681     if (   force_it == 0
2682                    && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
2683                    && tr->num_blhdrs < 3
2684                    && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) {
2685
2686                 jnl->cur_tr = tr;
2687                 return 0;
2688     }
2689
2690
2691     // if we're here we're going to flush the transaction buffer to disk.
2692     // make sure there is room in the journal first.
2693     check_free_space(jnl, tr->total_bytes);
2694
2695     // range check the end index
2696     if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
2697                 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
2698                           jnl->jhdr->end, jnl->jhdr->size);
2699     }
2700
2701     // this transaction starts where the current journal ends
2702     tr->journal_start = jnl->jhdr->end;
2703     end               = jnl->jhdr->end;
2704
2705         //
2706         // if the first entry in old_start[] isn't free yet, loop calling the
2707         // file system flush routine until it is (or we panic).
2708         //
2709         i = 0;
2710         lock_oldstart(jnl);
2711         while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
2712                 if (jnl->flush) {
2713                         unlock_oldstart(jnl);
2714
2715                         if (jnl->flush) {
2716                                 jnl->flush(jnl->flush_arg);
2717                         }
2718
2719                         // yield the cpu so others can get in to clear the lock bit
2720                         (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);
2721
2722                         lock_oldstart(jnl);
2723                 }
2724                 if (i++ >= 500) {
2725                         panic("jnl: transaction that started at 0x%llx is not completing! jnl %p\n",
2726                                   jnl->old_start[0] & (~0x8000000000000000LL), jnl);
2727                 }
2728         }
2729
2730         //
2731         // slide everyone else down and put our latest guy in the last
2732         // entry in the old_start array
2733         //
2734         memcpy(&jnl->old_start[0], &jnl->old_start[1], sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
2735         jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
2736
2737         unlock_oldstart(jnl);
2738
2739
2740     // for each block, make sure that the physical block # is set
2741     for(blhdr=tr->blhdr; blhdr; blhdr=next) {
2742                 char *blkptr;
2743
2744                 tbuffer_offset = jnl->jhdr->blhdr_size;
2745                 for(i=1; i < blhdr->num_blocks; i++) {
2746                         daddr64_t blkno;
2747                         daddr64_t lblkno;
2748                         struct vnode *vp;
2749
2750                         bp = blhdr->binfo[i].b.bp;
2751
2752                         // if this block has a callback function set, call
2753                         // it now and then copy the data from the bp into
2754                         // the journal.
2755                         if (bp) {
2756                                 void (*func)(struct buf *, void *);
2757                                 void  *arg;
2758
2759                                 buf_setfilter(bp, NULL, NULL, (void **)&func, &arg);
2760
2761                                 if (func) {
2762                                         // acquire the bp here so that we can safely
2763                                         // mess around with its data.  buf_acquire()
2764                                         // will return EAGAIN if the buffer was busy,
2765                                         // so loop trying again.
2766                                         do {
2767                                                 errno = buf_acquire(bp, 0, 0, 0);
2768                                         } while (errno == EAGAIN);
2769
2770                                         if (errno == 0) {
2771
2772                                                 // call the hook function and then copy the
2773                                                 // data into the transaction buffer...
2774                                                 func(bp, arg);
2775
2776                                                 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
2777                                                 memcpy(blkptr, (char *)buf_dataptr(bp), buf_size(bp));
2778
2779                                                 buf_drop(bp);
2780                                         } else {
2781                                                 panic("could not acquire bp %p (err %d)\n", bp, errno);
2782                                         }
2783                                 }
2784
2785                         } else {   // bp == NULL, only true if a block was "killed"
2786                                 if (blhdr->binfo[i].bnum != (off_t)-1) {
2787                                         panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
2788                                                 blhdr->binfo[i].bnum, jnl, tr);
2789                                 }
2790
2791                                 tbuffer_offset += blhdr->binfo[i].bsize;
2792                                 continue;
2793                         }
2794
2795                         tbuffer_offset += blhdr->binfo[i].bsize;
2796
2797                         vp = buf_vnode(bp);
2798                         blkno = buf_blkno(bp);
2799                         lblkno = buf_lblkno(bp);
2800
2801                         if (vp == NULL && lblkno == blkno) {
2802                             printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd.  aborting the transaction (tr %p jnl %p).\n",
2803                                 jnl->jdev_name, bp, lblkno, blkno, tr, jnl);
2804                             goto bad_journal;
2805                         }
2806
2807                         // if the lblkno is the same as blkno and this bp isn't
2808                         // associated with the underlying file system device then
2809                         // we need to call bmap() to get the actual physical block.
2810                         //
2811                         if ((lblkno == blkno) && (vp != jnl->fsdev)) {
2812                                 off_t   f_offset;
2813                                 size_t  contig_bytes;
2814
2815                                 if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) {
2816                                         printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2817                                         goto bad_journal;
2818                                 }
2819                                 if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) {
2820                                         printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2821                                         goto bad_journal;
2822                                 }
2823                                 if ((uint32_t)contig_bytes < buf_count(bp)) {
2824                                         printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2825                                         goto bad_journal;
2826                                 }
2827                                 buf_setblkno(bp, blkno);
2828                         }
2829                         // update this so we write out the correct physical block number!
2830                         blhdr->binfo[i].bnum = (off_t)(blkno);
2831                 }
2832
2833                 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2834     }
2835
2836
2837
2838     for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2839                 amt = blhdr->bytes_used;
2840
2841                 blhdr->binfo[0].b.sequence_num = tr->sequence_num;
2842
2843                 blhdr->checksum = 0;
2844                 blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
2845
2846                 if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) {
2847                     panic("can't allocate %lu bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *));
2848                 }
2849
2850                 // calculate individual block checksums
2851                 tbuffer_offset = jnl->jhdr->blhdr_size;
2852                 for(i=1; i < blhdr->num_blocks; i++) {
2853                     bparray[i] = blhdr->binfo[i].b.bp;
2854                     if (bparray[i]) {
2855                         blhdr->binfo[i].b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], blhdr->binfo[i].bsize);
2856                     } else {
2857                         blhdr->binfo[i].b.cksum = 0;
2858                     }
2859
2860                     tbuffer_offset += blhdr->binfo[i].bsize;
2861                 }
2862
2863                 ret = write_journal_data(jnl, &end, blhdr, amt);
2864
2865                 // always put the bp pointers back
2866                 for(i=1; i < blhdr->num_blocks; i++) {
2867                     blhdr->binfo[i].b.bp = bparray[i];
2868                 }
2869
2870                 kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *));
2871
2872                 if (ret != amt) {
2873                         printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n",
2874                             jnl->jdev_name, ret, amt);
2875
2876                         goto bad_journal;
2877                 }
2878     }
2879
2880     jnl->jhdr->end  = end;    // update where the journal now ends
2881     tr->journal_end = end;    // the transaction ends here too
2882     if (tr->journal_start == 0 || tr->journal_end == 0) {
2883                 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
2884                           tr->journal_start, tr->journal_end);
2885     }
2886
2887     if (write_journal_header(jnl) != 0) {
2888                 goto bad_journal;
2889     }
2890
2891         /*
2892          * If the caller supplied a callback, call it now that the blocks have been
2893          * written to the journal.  This is used by journal_relocate so, for example,
2894          * the file system can change its pointer to the new journal.
2895          */
2896         if (callback != NULL && callback(callback_arg) != 0) {
2897                 goto bad_journal;
2898         }
2899
2900     //
2901     // setup for looping through all the blhdr's.  we null out the
2902     // tbuffer and blhdr fields so that they're not used any more.
2903     //
2904     blhdr       = tr->blhdr;
2905     tr->tbuffer = NULL;
2906     tr->blhdr   = NULL;
2907
2908     // the buffer_flushed_callback will only be called for the
2909     // real blocks that get flushed so we have to account for
2910     // the block_list_headers here.
2911     //
2912     tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
2913
2914     // for each block, set the iodone callback and unlock it
2915     for(; blhdr; blhdr=next) {
2916
2917                 // we can re-order the buf ptrs because everything is written out already
2918                 qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp);
2919
2920                 for(i=1; i < blhdr->num_blocks; i++) {
2921                         if (blhdr->binfo[i].b.bp == NULL) {
2922                                 continue;
2923                         }
2924
2925                         bp = blhdr->binfo[i].b.bp;
2926
2927                         // have to pass BAC_REMOVE here because we're going to bawrite()
2928                         // the buffer when we're done
2929                         do {
2930                                 errno = buf_acquire(bp, BAC_REMOVE, 0, 0);
2931                         } while (errno == EAGAIN);
2932
2933                         if (errno == 0) {
2934                                 struct vnode *save_vp;
2935                                 void *cur_filter;
2936
2937                                 if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
2938                                         if (jnl->flags & JOURNAL_CLOSE_PENDING) {
2939                                             buf_clearflags(bp, B_LOCKED);
2940                                             buf_brelse(bp);
2941                                                 continue;
2942                                         } else {
2943                                                 panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp));
2944                                         }
2945                                 }
2946                                 save_vp = buf_vnode(bp);
2947
2948                                 buf_setfilter(bp, buffer_flushed_callback, tr, &cur_filter, NULL);
2949
2950                                 if (cur_filter) {
2951                                         panic("jnl: bp @ %p (blkno %qd, vp %p) has non-null iodone (%p) buffflushcb %p\n",
2952                                                   bp, buf_blkno(bp), save_vp, cur_filter, buffer_flushed_callback);
2953                                 }
2954                                 buf_clearflags(bp, B_LOCKED);
2955
2956                                 // kicking off the write here helps performance
2957                                 buf_bawrite(bp);
2958                                 // XXXdbg this is good for testing: buf_bdwrite(bp);
2959                                 //buf_bdwrite(bp);
2960
2961                                 // this undoes the vnode_ref() in journal_modify_block_end()
2962                                 vnode_rele_ext(save_vp, 0, 1);
2963                         } else {
2964                                 printf("jnl: %s: end_transaction: could not acquire block %p (errno %d)!\n",
2965                                     jnl->jdev_name,bp, errno);
2966                         }
2967                 }
2968
2969                 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2970
2971                 // we can free blhdr here since we won't need it any more
2972                 blhdr->binfo[0].bnum = 0xdeadc0de;
2973                 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
2974     }
2975
2976     //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
2977     //   tr, tr->journal_start, tr->journal_end);
2978     return 0;
2979
2980
2981   bad_journal:
2982     jnl->flags |= JOURNAL_INVALID;
2983     jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
2984     abort_transaction(jnl, tr);
2985     return -1;
2986 }
2987
2988 static void
2989 abort_transaction(journal *jnl, transaction *tr)
2990 {
2991     int                i;
2992     errno_t             errno;
2993     block_list_header *blhdr, *next;
2994     struct buf        *bp;
2995     struct vnode      *save_vp;
2996
2997     // for each block list header, iterate over the blocks then
2998     // free up the memory associated with the block list.
2999     //
3000     // for each block, clear the lock bit and release it.
3001     //
3002     for(blhdr=tr->blhdr; blhdr; blhdr=next) {
3003
3004                 for(i=1; i < blhdr->num_blocks; i++) {
3005                         if (blhdr->binfo[i].b.bp == NULL) {
3006                                 continue;
3007                         }
3008                         if ( (buf_vnode(blhdr->binfo[i].b.bp) == NULL) ||
3009                              !(buf_flags(blhdr->binfo[i].b.bp) & B_LOCKED) ) {
3010                                 continue;
3011                         }
3012
3013                         errno = buf_meta_bread(buf_vnode(blhdr->binfo[i].b.bp),
3014                                                          buf_lblkno(blhdr->binfo[i].b.bp),
3015                                                          buf_size(blhdr->binfo[i].b.bp),
3016                                                          NOCRED,
3017                                                          &bp);
3018                         if (errno == 0) {
3019                                 if (bp != blhdr->binfo[i].b.bp) {
3020                                         panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n",
3021                                                   bp, blhdr->binfo[i].b.bp, jnl);
3022                                 }
3023
3024                                 // releasing a bp marked invalid
3025                                 // also clears the locked and delayed state
3026                                 buf_markinvalid(bp);
3027                                 save_vp = buf_vnode(bp);
3028
3029                                 buf_brelse(bp);
3030
3031                                 vnode_rele_ext(save_vp, 0, 1);
3032                         } else {
3033                                 printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n",
3034                                     jnl->jdev_name, blhdr->binfo[i].bnum, blhdr->binfo[i].b.bp);
3035                                 if (bp) {
3036                                         buf_brelse(bp);
3037                                 }
3038                         }
3039                 }
3040
3041                 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
3042
3043                 // we can free blhdr here since we won't need it any more
3044                 blhdr->binfo[0].bnum = 0xdeadc0de;
3045                 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
3046     }
3047
3048     tr->tbuffer     = NULL;
3049     tr->blhdr       = NULL;
3050     tr->total_bytes = 0xdbadc0de;
3051         FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
3052 }
3053
3054
3055 int
3056 journal_end_transaction(journal *jnl)
3057 {
3058     int ret;
3059         transaction *tr;
3060
3061     CHECK_JOURNAL(jnl);
3062
3063         if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
3064                 return 0;
3065         }
3066
3067     if (jnl->owner != current_thread()) {
3068                 panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
3069                           jnl, jnl->owner, current_thread());
3070     }
3071
3072     free_old_stuff(jnl);
3073
3074     jnl->nested_count--;
3075     if (jnl->nested_count > 0) {
3076                 return 0;
3077     } else if (jnl->nested_count < 0) {
3078                 panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
3079     }
3080
3081     if (jnl->flags & JOURNAL_INVALID) {
3082                 if (jnl->active_tr) {
3083                         if (jnl->cur_tr != NULL) {
3084                                 panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
3085                                           jnl, jnl->active_tr, jnl->cur_tr);
3086                         }
3087
3088                         tr             = jnl->active_tr;
3089                         jnl->active_tr = NULL;
3090                         abort_transaction(jnl, tr);
3091                 }
3092
3093                 jnl->owner = NULL;
3094                 unlock_journal(jnl);
3095
3096                 return EINVAL;
3097     }
3098
3099     tr = jnl->active_tr;
3100     CHECK_TRANSACTION(tr);
3101
3102     // clear this out here so that when check_free_space() calls
3103     // the FS flush function, we don't panic in journal_flush()
3104     // if the FS were to call that.  note: check_free_space() is
3105     // called from end_transaction().
3106     //
3107     jnl->active_tr = NULL;
3108     ret = end_transaction(tr, 0, NULL, NULL);
3109
3110     jnl->owner = NULL;
3111     unlock_journal(jnl);
3112
3113     return ret;
3114 }
3115
3116
3117 int
3118 journal_flush(journal *jnl)
3119 {
3120     int need_signal = 0;
3121
3122     CHECK_JOURNAL(jnl);
3123
3124     if (jnl->flags & JOURNAL_INVALID) {
3125                 return -1;
3126     }
3127
3128     KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH))
3129         | DBG_FUNC_START, 0, 0, 0, 0, 0);
3130
3131     if (jnl->owner != current_thread()) {
3132                 lock_journal(jnl);
3133                 need_signal = 1;
3134     }
3135
3136     free_old_stuff(jnl);
3137
3138     // if we're not active, flush any buffered transactions
3139     if (jnl->active_tr == NULL && jnl->cur_tr) {
3140                 transaction *tr = jnl->cur_tr;
3141
3142                 jnl->cur_tr = NULL;
3143                 end_transaction(tr, 1, NULL, NULL);   // force it to get flushed
3144     }
3145
3146     if (need_signal) {
3147                 unlock_journal(jnl);
3148     }
3149
3150     KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH))
3151         | DBG_FUNC_END, 0, 0, 0, 0, 0);
3152
3153     return 0;
3154 }
3155
3156 int
3157 journal_active(journal *jnl)
3158 {
3159     if (jnl->flags & JOURNAL_INVALID) {
3160                 return -1;
3161     }
3162
3163     return (jnl->active_tr == NULL) ? 0 : 1;
3164 }
3165
3166 void *
3167 journal_owner(journal *jnl)
3168 {
3169     return jnl->owner;
3170 }
3171
3172 int journal_uses_fua(journal *jnl)
3173 {
3174         if (jnl->flags & JOURNAL_DO_FUA_WRITES)
3175                 return 1;
3176         return 0;
3177 }
3178
3179 /*
3180  * Relocate the journal.
3181  *
3182  * You provide the new starting offset and size for the journal. You may
3183  * optionally provide a new tbuffer_size; passing zero defaults to not
3184  * changing the tbuffer size except as needed to fit within the new journal
3185  * size.
3186  *
3187  * You must have already started a transaction. The transaction may contain
3188  * modified blocks (such as those needed to deallocate the old journal,
3189  * allocate the new journal, and update the location and size of the journal
3190  * in filesystem-private structures). Any transactions prior to the active
3191  * transaction will be flushed to the old journal. The new journal will be
3192  * initialized, and the blocks from the active transaction will be written to
3193  * the new journal.
3194  *
3195  * The caller will need to update the structures that identify the location
3196  * and size of the journal.  These updates should be made in the supplied
3197  * callback routine.  These updates must NOT go into a transaction.  You should
3198  * force these updates to the media before returning from the callback.  In the
3199  * even of a crash, either the old journal will be found, with an empty journal,
3200  * or the new journal will be found with the contents of the active transaction.
3201  *
3202  * Upon return from the callback, the blocks from the active transaction are
3203  * written to their normal locations on disk.
3204  *
3205  * (Remember that we have to ensure that blocks get committed to the journal
3206  * before being committed to their normal locations.  But the blocks don't count
3207  * as committed until the new journal is pointed at.)
3208  *
3209  * Upon return, there is still an active transaction: newly allocated, and
3210  * with no modified blocks.  Call journal_end_transaction as normal.  You may
3211  * modifiy additional blocks before calling journal_end_transaction, and those
3212  * blocks will (eventually) go to the relocated journal.
3213  *
3214  * Inputs:
3215  *      jnl                             The (opened) journal to relocate.
3216  *      offset                  The new journal byte offset (from start of the journal device).
3217  *      journal_size    The size, in bytes, of the new journal.
3218  *      tbuffer_size    The new desired transaction buffer size.  Pass zero to keep
3219  *                                      the same size as the current journal.  The size will be
3220  *                                      modified as needed to fit the new journal.
3221  *      callback                Routine called after the new journal has been initialized,
3222  *                                      and the active transaction written to the new journal, but
3223  *                                      before the blocks are written to their normal locations.
3224  *                                      Pass NULL for no callback.
3225  *      callback_arg    An argument passed to the callback routine.
3226  *
3227  * Result:
3228  *      0                               No errors
3229  *      EINVAL                  The offset is not block aligned
3230  *      EINVAL                  The journal_size is not a multiple of the block size
3231  *      EINVAL                  The journal is invalid
3232  *      (any)                   An error returned by journal_flush.
3233  *
3234  */
3235 int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size,
3236         errno_t (*callback)(void *), void *callback_arg)
3237 {
3238         int ret;
3239         transaction *tr;
3240
3241         /*
3242          * Sanity check inputs, and adjust the size of the transaction buffer.
3243          */
3244     if ((offset % jnl->jhdr->jhdr_size) != 0) {
3245                 printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n",
3246                     jnl->jdev_name, offset, jnl->jhdr->jhdr_size);
3247                 return EINVAL;
3248     }
3249     if ((journal_size % jnl->jhdr->jhdr_size) != 0) {
3250                 printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n",
3251                     jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size);
3252                 return EINVAL;
3253     }
3254
3255     CHECK_JOURNAL(jnl);
3256
3257         /* Guarantee we own the active transaction. */
3258     if (jnl->flags & JOURNAL_INVALID) {
3259                 return EINVAL;
3260     }
3261     if (jnl->owner != current_thread()) {
3262         panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n",
3263                 jnl, jnl->owner, current_thread());
3264         }
3265
3266     if (tbuffer_size == 0)
3267         tbuffer_size = jnl->tbuffer_size;
3268     size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size);
3269
3270         /*
3271          * Flush any non-active transactions.  We have to temporarily hide the
3272          * active transaction to make journal_flush flush out non-active but
3273          * current (unwritten) transactions.
3274          */
3275         tr = jnl->active_tr;
3276         CHECK_TRANSACTION(tr);
3277         jnl->active_tr = NULL;
3278         ret = journal_flush(jnl);
3279         jnl->active_tr = tr;
3280         if (ret) {
3281                 return ret;
3282         }
3283
3284         /* Update the journal's offset and size in memory. */
3285         jnl->jdev_offset = offset;
3286         jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
3287         jnl->jhdr->size = journal_size;
3288         jnl->active_start = jnl->jhdr->start;
3289
3290         /*
3291          * Force the active transaction to be written to the new journal.  Call the
3292          * supplied callback after the blocks have been written to the journal, but
3293          * before they get written to their normal on-disk locations.
3294          */
3295         jnl->active_tr = NULL;
3296         ret = end_transaction(tr, 1, callback, callback_arg);
3297         if (ret) {
3298                 printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret);
3299                 goto bad_journal;
3300         }
3301
3302         /*
3303          * Create a new, empty transaction to be the active transaction.  This way
3304          * our caller can use journal_end_transaction as usual.
3305          */
3306         ret = journal_allocate_transaction(jnl);
3307         if (ret) {
3308                 printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl->jdev_name, ret);
3309                 goto bad_journal;
3310         }
3311
3312         return 0;
3313
3314 bad_journal:
3315     jnl->flags |= JOURNAL_INVALID;
3316     abort_transaction(jnl, tr);
3317     return ret;
3318 }