bsd/vfs/vfs_journal.c

   1 /*
   2  * Copyright (c) 1995-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 //
  29 // This file implements a simple write-ahead journaling layer.
  30 // In theory any file system can make use of it by calling these
  31 // functions when the fs wants to modify meta-data blocks.  See
  32 // vfs_journal.h for a more detailed description of the api and
  33 // data structures.
  34 //
  35 // Dominic Giampaolo (dbg@apple.com)
  36 //
  37
  38 #ifdef KERNEL
  39
  40 #include <sys/param.h>
  41 #include <sys/systm.h>
  42 #include <sys/kernel.h>
  43 #include <sys/file_internal.h>
  44 #include <sys/stat.h>
  45 #include <sys/buf_internal.h>
  46 #include <sys/proc_internal.h>
  47 #include <sys/mount_internal.h>
  48 #include <sys/namei.h>
  49 #include <sys/vnode_internal.h>
  50 #include <sys/ioctl.h>
  51 #include <sys/tty.h>
  52 #include <sys/ubc.h>
  53 #include <sys/malloc.h>
  54 #include <kern/thread.h>
  55 #include <sys/disk.h>
  56 #include <sys/kdebug.h>
  57 #include <miscfs/specfs/specdev.h>
  58 #include <libkern/OSAtomic.h>   /* OSAddAtomic */
  59
  60 extern task_t kernel_task;
  61
  62 #define DBG_JOURNAL_FLUSH 1
  63
  64 #else
  65
  66 #include <stdio.h>
  67 #include <stdlib.h>
  68 #include <string.h>
  69 #include <limits.h>
  70 #include <errno.h>
  71 #include <fcntl.h>
  72 #include <unistd.h>
  73 #include <stdarg.h>
  74 #include <sys/types.h>
  75 #include "compat.h"
  76
  77 #endif   /* KERNEL */
  78
  79 #include "vfs_journal.h"
  80
  81 /* XXX the next prototype should come from <libsa/stdlib.h>, but that header conflicts with libkern */
  82 __private_extern__ void qsort(
  83     void * array,
  84     size_t nmembers,
  85     size_t member_size,
  86     int (*)(const void *, const void *));
  87
  88
  89
  90 // Number of leading bytes of a block_list_header that get checksummed.
  91 // NOTE: this should be enough to get past the header
  92 //       fields as well as the first entry of binfo[]
  93 #define BLHDR_CHECKSUM_SIZE 32
  94
  95
  96 static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg);
  97 static void abort_transaction(journal *jnl, transaction *tr);
  98 static void dump_journal(journal *jnl);
  99
 100 static __inline__ void  lock_journal(journal *jnl);
 101 static __inline__ void  unlock_journal(journal *jnl);
 102 static __inline__ void  lock_oldstart(journal *jnl);
 103 static __inline__ void  unlock_oldstart(journal *jnl);
 104
 105
 106
 107
 108 //
 109 // 3105942 - Coalesce writes to the same block on journal replay
 110 //
     // A bucket is one entry of the coalesce table that grow_table(),
     // lookup_bucket(), insert_block() and do_overlap() operate on.
     //
 111
 112 typedef struct bucket {
 113     off_t   block_num;    // block number; do_overlap() multiplies it by jhdr_size to get a byte address
 114     size_t  jnl_offset;   // offset of the data in the journal; insert_block() wraps values >= jhdr->size
 115     size_t  block_size;   // size of this entry in bytes
 116     int32_t cksum;        // checksum stored with the entry; do_overlap() zeroes it when it truncates an entry
 117 } bucket;
 118
 119 #define STARTING_BUCKETS 256   /* NOTE(review): presumably the initial bucket-table size; confirm at the allocation site */
 120
 121 static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
 122 static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size);
 123 static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full);
 124 static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
 125 static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting);
 126
 127 #define CHECK_JOURNAL(jnl) \
 128     do { \
 129     if (jnl == NULL) {\
 130         panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
 131     }\
 132     if (jnl->jdev == NULL) { \
 133         panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
 134     } \
 135     if (jnl->fsdev == NULL) { \
 136         panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
 137     } \
 138     if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
 139         panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
 140         __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\
 141     }\
 142     if (   jnl->jhdr->start <= 0 \
 143         || jnl->jhdr->start > jnl->jhdr->size\
 144         || jnl->jhdr->start > 1024*1024*1024) {\
 145         panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
 146         __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\
 147     }\
 148     if (   jnl->jhdr->end <= 0 \
 149         || jnl->jhdr->end > jnl->jhdr->size\
 150         || jnl->jhdr->end > 1024*1024*1024) {\
 151         panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
 152         __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\
 153     }\
 154     if (jnl->jhdr->size > 1024*1024*1024) {\
 155         panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
 156         __FILE__, __LINE__, jnl->jhdr->size);\
 157     } \
 158     } while(0)
 159
 160 #define CHECK_TRANSACTION(tr) \
 161     do {\
 162     if (tr == NULL) {\
 163         panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
 164     }\
 165     if (tr->jnl == NULL) {\
 166         panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
 167     }\
 168     if (tr->blhdr != (block_list_header *)tr->tbuffer) {\
 169         panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\
 170     }\
 171     if (tr->total_bytes < 0) {\
 172         panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\
 173     }\
 174     if (tr->journal_start < 0 || tr->journal_start > 1024*1024*1024) {\
 175         panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\
 176     }\
 177     if (tr->journal_end < 0 || tr->journal_end > 1024*1024*1024) {\
 178         panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\
 179     }\
 180     if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) {\
 181         panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\
 182     }\
 183     } while(0)
 184
 185
 186
 187 //
 188 // this isn't a great checksum routine but it will do for now.
 189 // we use it to checksum the journal header and the block list
 190 // headers that are at the start of each transaction.
 191 //
 192 static int
 193 calc_checksum(char *ptr, int len)
 194 {
 195     int i, cksum=0;
 196
 197     // this is a lame checksum but for now it'll do
 198     for(i=0; i < len; i++, ptr++) {
 199                 cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
 200     }
 201
 202     return (~cksum);
 203 }
 204
 205 //
 206 // Journal Locking
 207 //
     // Lock attribute / group objects.  All three are allocated by
     // journal_init().
     //
 208 lck_grp_attr_t *  jnl_group_attr;
 209 lck_attr_t *      jnl_lock_attr;
 210 lck_grp_t *       jnl_mutex_group;
 211
 212 void
 213 journal_init(void)
 214 {
 215         jnl_lock_attr    = lck_attr_alloc_init();
 216         jnl_group_attr   = lck_grp_attr_alloc_init();
 217         jnl_mutex_group  = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
 218 }
 219
 220 static __inline__ void
 221 lock_journal(journal *jnl)
 222 {
 223         lck_mtx_lock(&jnl->jlock);
 224 }
 225
 226 static __inline__ void
 227 unlock_journal(journal *jnl)
 228 {
 229         lck_mtx_unlock(&jnl->jlock);
 230 }
 231
 232 static __inline__ void
 233 lock_oldstart(journal *jnl)
 234 {
 235         lck_mtx_lock(&jnl->old_start_lock);
 236 }
 237
 238 static __inline__ void
 239 unlock_oldstart(journal *jnl)
 240 {
 241         lck_mtx_unlock(&jnl->old_start_lock);
 242 }
 243
 244
 245
 246 #define JNL_WRITE    0x0001   /* do_journal_io() direction bit: write, limited by jnl->max_write_size */
 247 #define JNL_READ     0x0002   /* do_journal_io() direction bit: read (B_READ), limited by jnl->max_read_size */
 248 #define JNL_HEADER   0x8000   /* must be set for do_journal_io() to accept i/o at journal offset 0 */
 249
 250 //
 251 // This function sets up a fake buf and passes it directly to the
 252 // journal device strategy routine (so that it won't get cached in
 253 // the block cache.
 254 //
 255 // It also handles range checking the i/o so that we don't write
 256 // outside the journal boundaries and it will wrap the i/o back
 257 // to the beginning if necessary (skipping over the journal header)
 258 //
 259 static size_t
 260 do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
 261 {
 262     int         err, curlen=len;
 263     size_t      io_sz = 0;
 264     buf_t       bp;
 265     off_t       max_iosize;
 266
 267     if (*offset < 0 || *offset > jnl->jhdr->size) {
 268                 panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
 269     }
 270
 271     if (direction & JNL_WRITE)
 272         max_iosize = jnl->max_write_size;
 273     else if (direction & JNL_READ)
 274         max_iosize = jnl->max_read_size;
 275     else
 276         max_iosize = 128 * 1024;
 277
 278   again:
 279     bp = alloc_io_buf(jnl->jdev, 1);
 280
 281     if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
 282                 if (*offset == jnl->jhdr->size) {
 283                         *offset = jnl->jhdr->jhdr_size;
 284                 } else {
 285                         curlen = (off_t)jnl->jhdr->size - *offset;
 286                 }
 287     }
 288
 289         if (curlen > max_iosize) {
 290                 curlen = max_iosize;
 291         }
 292
 293     if (curlen <= 0) {
 294                 panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %lu\n", curlen, *offset, len);
 295     }
 296
 297         if (*offset == 0 && (direction & JNL_HEADER) == 0) {
 298                 panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
 299         }
 300
 301     if (direction & JNL_READ)
 302             buf_setflags(bp, B_READ);
 303     else {
 304             /*
 305              * don't have to set any flags
 306              */
 307             vnode_startwrite(jnl->jdev);
 308     }
 309     buf_setsize(bp, curlen);
 310     buf_setcount(bp, curlen);
 311     buf_setdataptr(bp, (uintptr_t)data);
 312     buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
 313     buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
 314     if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) {
 315         buf_markfua(bp);
 316     }
 317
 318     err = VNOP_STRATEGY(bp);
 319     if (!err) {
 320                 err = (int)buf_biowait(bp);
 321     }
 322     free_io_buf(bp);
 323
 324     if (err) {
 325         printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err);
 326         return 0;
 327     }
 328
 329     *offset += curlen;
 330     io_sz   += curlen;
 331     if (io_sz != len) {
 332                 // handle wrap-around
 333                 data    = (char *)data + curlen;
 334                 curlen  = len - io_sz;
 335                 if (*offset >= jnl->jhdr->size) {
 336                         *offset = jnl->jhdr->jhdr_size;
 337                 }
 338                 goto again;
 339     }
 340
 341     return io_sz;
 342 }
 343
 344 static size_t
 345 read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
 346 {
 347     return do_journal_io(jnl, offset, data, len, JNL_READ);
 348 }
 349
 350 static size_t
 351 write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
 352 {
 353     return do_journal_io(jnl, offset, data, len, JNL_WRITE);
 354 }
 355
 356
 357 static size_t
 358 read_journal_header(journal *jnl, void *data, size_t len)
 359 {
 360         off_t hdr_offset = 0;
 361
 362         return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER);
 363 }
 364
 365 static int
 366 write_journal_header(journal *jnl)
 367 {
 368     static int num_err_prints = 0;
 369     int ret=0;
 370     off_t jhdr_offset = 0;
 371     struct vfs_context context;
 372
 373     context.vc_thread = current_thread();
 374     context.vc_ucred = NOCRED;
 375     //
 376     // Flush the track cache if we're not doing force-unit-access
 377     // writes.
 378     //
 379     if ((jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
 380         ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
 381     }
 382     if (ret != 0) {
 383         //
 384         // Only print this error if it's a different error than the
 385         // previous one, or if it's the first time for this device
 386         // or if the total number of printfs is less than 25.  We
 387         // allow for up to 25 printfs to insure that some make it
 388         // into the on-disk syslog.  Otherwise if we only printed
 389         // one, it's possible it would never make it to the syslog
 390         // for the root volume and that makes debugging hard.
 391         //
 392         if (   ret != jnl->last_flush_err
 393             || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
 394             || num_err_prints++ < 25) {
 395
 396             printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret);
 397
 398             jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
 399             jnl->last_flush_err = ret;
 400         }
 401     }
 402
 403     jnl->jhdr->checksum = 0;
 404     jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
 405     if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) {
 406         printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name);
 407         jnl->flags |= JOURNAL_INVALID;
 408         return -1;
 409     }
 410
 411     // If we're not doing force-unit-access writes, then we
 412     // have to flush after writing the journal header so that
 413     // a future transaction doesn't sneak out to disk before
 414     // the header does and thus overwrite data that the old
 415     // journal header refers to.  Saw this exact case happen
 416     // on an IDE bus analyzer with Larry Barras so while it
 417     // may seem obscure, it's not.
 418     //
 419     if ((jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
 420         VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
 421     }
 422
 423     return 0;
 424 }
 425
 426
 427
 428 //
 429 // this is a work function used to free up transactions that
 430 // completed. they can't be free'd from buffer_flushed_callback
 431 // because it is called from deep with the disk driver stack
 432 // and thus can't do something that would potentially cause
 433 // paging.  it gets called by each of the journal api entry
 434 // points so stuff shouldn't hang around for too long.
 435 //
 436 static void
 437 free_old_stuff(journal *jnl)
 438 {
 439     transaction *tr, *next;
 440
 441     lock_oldstart(jnl);
 442     tr = jnl->tr_freeme;
 443     jnl->tr_freeme = NULL;
 444     unlock_oldstart(jnl);
 445
 446     for(; tr; tr=next) {
 447         next = tr->next;
 448         FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
 449     }
 450
 451 }
 452
 453
 454
 455 //
 456 // This is our callback that lets us know when a buffer has been
 457 // flushed to disk.  It's called from deep within the driver stack
 458 // and thus is quite limited in what it can do.  Notably, it can
 459 // not initiate any new i/o's or allocate/free memory.
 460 //
     //   bp   the buffer that was flushed; only its size is used
     //   arg  the transaction the buffer belongs to, or NULL (in
     //        which case there is nothing to do)
     //
     // Outline:
     //   1. add the buffer's size to tr->num_flushed; unless that
     //      brings (num_killed + num_flushed) up to total_bytes the
     //      transaction isn't done yet and we return.
     //   2. under the old_start lock, claim the transaction by
     //      storing the 0xfbadc0de marker in tr->total_bytes, clear
     //      the high bit of its old_start[] entry and advance
     //      jnl->active_start if this transaction was at the front.
     //   3. coalesce against jnl->completed_trs; then either insert
     //      tr into that list (kept sorted by journal_start) or, if
     //      it was merged away, push it onto jnl->tr_freeme so that
     //      free_old_stuff() can free it later.
     //
 461 static void
 462 buffer_flushed_callback(struct buf *bp, void *arg)
 463 {
 464     transaction  *tr;
 465     journal      *jnl;
 466     transaction  *ctr, *prev=NULL, *next;
 467     size_t        i;
 468     int           bufsize, amt_flushed, total_bytes;
 469
 470
 471     //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n",
 472     //     bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg);
 473
 474     // snarf out the bits we want
 475     bufsize = buf_size(bp);
 476     tr      = (transaction *)arg;
 477
 478     // then we've already seen it
 479     if (tr == NULL) {
 480                 return;
 481     }
 482
 483     CHECK_TRANSACTION(tr);
 484
 485     jnl = tr->jnl;
 486     if (jnl->flags & JOURNAL_INVALID) {
 487                 return;
 488     }
 489
 490     CHECK_JOURNAL(jnl);
 491
          // these two are read *before* the atomic add below; see the
          // NOTE further down for why.
 492     amt_flushed = tr->num_killed;
 493     total_bytes = tr->total_bytes;
 494
 495     // update the number of bytes that have been flushed.
 496     // this buf may represent more than one block so take
 497     // that into account.
 498     //
 499     // OSAddAtomic() returns the value of tr->num_flushed before the add
 500     //
 501     amt_flushed += OSAddAtomic(bufsize, (SInt32 *)&tr->num_flushed);
 502
 503
 504     // if this transaction isn't done yet, just return as
 505     // there is nothing to do.
 506     //
 507     // NOTE: we are careful to not reference anything through
 508     //       the tr pointer after doing the OSAddAtomic().  if
 509     //       this if statement fails then we are the last one
 510     //       and then it's ok to dereference "tr".
 511     //
 512     if ((amt_flushed + bufsize) < total_bytes) {
 513                 return;
 514     }
 515
 516     // this will single thread checking the transaction
 517     lock_oldstart(jnl);
 518
 519     if (tr->total_bytes == (int)0xfbadc0de) {
 520         // then someone beat us to it...
 521         unlock_oldstart(jnl);
 522         return;
 523     }
 524
 525     // mark this so that we're the owner of dealing with the
 526     // cleanup for this transaction
 527     tr->total_bytes = 0xfbadc0de;
 528
 529     //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
 530     //   tr, tr->journal_start, tr->journal_end, jnl);
 531
 532     // find this entry in the old_start[] index and mark it completed
          // (entries are compared with the high bit masked off, and
          // completing an entry means clearing that bit)
 533     for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
 534
 535         if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) {
 536             jnl->old_start[i] &= ~(0x8000000000000000ULL);
 537             break;
 538         }
 539     }
 540
          // the loop only runs to completion if no entry matched
 541     if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
 542         panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
 543             tr->journal_start, tr, jnl);
 544     }
 545
 546
 547     // if we are here then we need to update the journal header
 548     // to reflect that this transaction is complete
          // (zeroing journal_start/journal_end is how "tr has been
          // absorbed" is recorded; it is tested after the loop below)
 549     if (tr->journal_start == jnl->active_start) {
 550         jnl->active_start = tr->journal_end;
 551         tr->journal_start = tr->journal_end = (off_t)0;
 552     }
 553
 554     // go through the completed_trs list and try to coalesce
 555     // entries, restarting back at the beginning if we have to.
          // (a restart is done by pointing "next" at the list head and
          // setting ctr to NULL, so the loop step also resets prev)
 556     for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) {
 557         if (ctr->journal_start == jnl->active_start) {
                  // ctr is now at the front of the active region:
                  // consume it, unlink it and queue it to be freed
 558             jnl->active_start = ctr->journal_end;
 559             if (prev) {
 560                 prev->next = ctr->next;
 561             }
 562             if (ctr == jnl->completed_trs) {
 563                 jnl->completed_trs = ctr->next;
 564             }
 565
 566             next           = jnl->completed_trs;   // this starts us over again
 567             ctr->next      = jnl->tr_freeme;
 568             jnl->tr_freeme = ctr;
 569             ctr            = NULL;
 570         } else if (tr->journal_end == ctr->journal_start) {
                  // tr sits immediately before ctr: extend ctr backwards
 571             ctr->journal_start = tr->journal_start;
 572             next               = jnl->completed_trs;  // this starts us over again
 573             ctr                = NULL;
 574             tr->journal_start  = tr->journal_end = (off_t)0;
 575         } else if (tr->journal_start == ctr->journal_end) {
                  // tr sits immediately after ctr: extend ctr forwards
 576             ctr->journal_end  = tr->journal_end;
 577             next              = ctr->next;
 578             tr->journal_start = tr->journal_end = (off_t)0;
 579         } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) {
 580             // coalesce the next entry with this one and link the next
 581             // entry in at the head of the tr_freeme list
 582             next              = ctr->next;           // temporarily use the "next" variable
 583             ctr->journal_end  = next->journal_end;
 584             ctr->next         = next->next;
 585             next->next        = jnl->tr_freeme;      // link in the next guy at the head of the tr_freeme list
 586             jnl->tr_freeme    = next;
 587
 588             next              = jnl->completed_trs;  // this starts us over again
 589             ctr               = NULL;
 590         } else {
 591             next = ctr->next;
 592         }
 593     }
 594
 595     // if this is true then we didn't merge with anyone
 596     // so link ourselves in at the head of the completed
 597     // transaction list.
 598     if (tr->journal_start != 0) {
 599         // put this entry into the correct sorted place
 600         // in the list instead of just at the head.
 601         //
 602
 603         prev = NULL;
 604         for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
 605             // just keep looping
 606         }
 607
              // three cases: empty list, insert at the head, or insert
              // after prev (which also covers appending at the tail)
 608         if (ctr == NULL && prev == NULL) {
 609             jnl->completed_trs = tr;
 610             tr->next = NULL;
 611         } else if (ctr == jnl->completed_trs) {
 612             tr->next = jnl->completed_trs;
 613             jnl->completed_trs = tr;
 614         } else {
 615             tr->next = prev->next;
 616             prev->next = tr;
 617         }
 618     } else {
 619         // if we're here this tr got merged with someone else so
 620         // put it on the list to be free'd
 621         tr->next       = jnl->tr_freeme;
 622         jnl->tr_freeme = tr;
 623     }
 624     unlock_oldstart(jnl);
 625 }
 626
 627
 628 #include <libkern/OSByteOrder.h>
 629
 630 #define SWAP16(x) OSSwapInt16(x)   /* byte-swap shorthand for OSSwapInt16() */
 631 #define SWAP32(x) OSSwapInt32(x)   /* byte-swap shorthand for OSSwapInt32() */
 632 #define SWAP64(x) OSSwapInt64(x)   /* byte-swap shorthand for OSSwapInt64() */
 633
 634
 635 static void
 636 swap_journal_header(journal *jnl)
 637 {
 638     jnl->jhdr->magic      = SWAP32(jnl->jhdr->magic);
 639     jnl->jhdr->endian     = SWAP32(jnl->jhdr->endian);
 640     jnl->jhdr->start      = SWAP64(jnl->jhdr->start);
 641     jnl->jhdr->end        = SWAP64(jnl->jhdr->end);
 642     jnl->jhdr->size       = SWAP64(jnl->jhdr->size);
 643     jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
 644     jnl->jhdr->checksum   = SWAP32(jnl->jhdr->checksum);
 645     jnl->jhdr->jhdr_size  = SWAP32(jnl->jhdr->jhdr_size);
 646     jnl->jhdr->sequence_num  = SWAP32(jnl->jhdr->sequence_num);
 647 }
 648
 649 static void
 650 swap_block_list_header(journal *jnl, block_list_header *blhdr)
 651 {
 652     int i;
 653
 654     blhdr->max_blocks = SWAP16(blhdr->max_blocks);
 655     blhdr->num_blocks = SWAP16(blhdr->num_blocks);
 656     blhdr->bytes_used = SWAP32(blhdr->bytes_used);
 657     blhdr->checksum   = SWAP32(blhdr->checksum);
 658     blhdr->flags      = SWAP32(blhdr->flags);
 659
 660     if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) {
 661         printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d).  not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size);
 662         return;
 663     }
 664
 665     for(i=0; i < blhdr->num_blocks; i++) {
 666                 blhdr->binfo[i].bnum    = SWAP64(blhdr->binfo[i].bnum);
 667                 blhdr->binfo[i].bsize   = SWAP32(blhdr->binfo[i].bsize);
 668                 blhdr->binfo[i].b.cksum = SWAP32(blhdr->binfo[i].b.cksum);
 669     }
 670 }
 671
 672
 673 static int
 674 update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
 675 {
 676     int         ret;
 677     struct buf *oblock_bp=NULL;
 678
 679     // first read the block we want.
 680     ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
 681     if (ret != 0) {
 682         printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret);
 683
 684                 if (oblock_bp) {
 685                         buf_brelse(oblock_bp);
 686                         oblock_bp = NULL;
 687                 }
 688
 689                 // let's try to be aggressive here and just re-write the block
 690                 oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META);
 691                 if (oblock_bp == NULL) {
 692                     printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block);
 693                     return -1;
 694                 }
 695     }
 696
 697     // make sure it's the correct size.
 698     if (buf_size(oblock_bp) != bsize) {
 699                 buf_brelse(oblock_bp);
 700                 return -1;
 701     }
 702
 703     // copy the journal data over top of it
 704     memcpy((char *)0 + buf_dataptr(oblock_bp), block_ptr, bsize);
 705
 706     if ((ret = VNOP_BWRITE(oblock_bp)) != 0) {
 707         printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret);
 708         return ret;
 709     }
 710
 711     // and now invalidate it so that if someone else wants to read
 712     // it in a different size they'll be able to do it.
 713     ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
 714     if (oblock_bp) {
 715                 buf_markinvalid(oblock_bp);
 716                 buf_brelse(oblock_bp);
 717     }
 718
 719     return 0;
 720 }
 721
 722 static int
 723 grow_table(struct bucket **buf_ptr, int num_buckets, int new_size)
 724 {
 725     struct bucket *newBuf;
 726     int current_size = num_buckets, i;
 727
 728     // return if newsize is less than the current size
 729     if (new_size < num_buckets) {
 730         return current_size;
 731     }
 732
 733     if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
 734         printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
 735         return -1;
 736     }
 737
 738     //  printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
 739
 740     // copy existing elements
 741     bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));
 742
 743     // initialize the new ones
 744     for(i=num_buckets; i < new_size; i++) {
 745         newBuf[i].block_num = (off_t)-1;
 746     }
 747
 748     // free the old container
 749     FREE(*buf_ptr, M_TEMP);
 750
 751     // reset the buf_ptr
 752     *buf_ptr = newBuf;
 753
 754     return new_size;
 755 }
 756
 757 static int
 758 lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full)
 759 {
 760     int lo, hi, index, matches, i;
 761
 762     if (num_full == 0) {
 763         return 0; // table is empty, so insert at index=0
 764     }
 765
 766     lo = 0;
 767     hi = num_full - 1;
 768     index = -1;
 769
 770     // perform binary search for block_num
 771     do {
 772         int mid = (hi - lo)/2 + lo;
 773         off_t this_num = (*buf_ptr)[mid].block_num;
 774
 775         if (block_num == this_num) {
 776             index = mid;
 777             break;
 778         }
 779
 780         if (block_num < this_num) {
 781             hi = mid;
 782             continue;
 783         }
 784
 785         if (block_num > this_num) {
 786             lo = mid + 1;
 787             continue;
 788         }
 789     } while(lo < hi);
 790
 791     // check if lo and hi converged on the match
 792     if (block_num == (*buf_ptr)[hi].block_num) {
 793         index = hi;
 794     }
 795
 796     // if no existing entry found, find index for new one
 797     if (index == -1) {
 798         index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
 799     } else {
 800         // make sure that we return the right-most index in the case of multiple matches
 801         matches = 0;
 802         i = index + 1;
 803         while(i < num_full && block_num == (*buf_ptr)[i].block_num) {
 804             matches++;
 805             i++;
 806         }
 807
 808         index += matches;
 809     }
 810
 811     return index;
 812 }
 813
 814 static int
 815 insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting)
 816 {
 817     if (!overwriting) {
 818         // grow the table if we're out of space
 819         if (*num_full_ptr >= *num_buckets_ptr) {
 820             int new_size = *num_buckets_ptr * 2;
 821             int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);
 822
 823             if (grow_size < new_size) {
 824                 printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name);
 825                 return -1;
 826             }
 827
 828             *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
 829         }
 830
 831         // if we're not inserting at the end, we need to bcopy
 832         if (blk_index != *num_full_ptr) {
 833             bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
 834         }
 835
 836         (*num_full_ptr)++; // increment only if we're not overwriting
 837     }
 838
 839     // sanity check the values we're about to add
 840     if (offset >= jnl->jhdr->size) {
 841         offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
 842     }
 843     if (size <= 0) {
 844         panic("jnl: insert_block: bad size in insert_block (%lu)\n", size);
 845     }
 846
 847     (*buf_ptr)[blk_index].block_num = num;
 848     (*buf_ptr)[blk_index].block_size = size;
 849     (*buf_ptr)[blk_index].jnl_offset = offset;
 850     (*buf_ptr)[blk_index].cksum = cksum;
 851
 852     return blk_index;
 853 }
 854
 855 static int
 856 do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
 857 {
 858     int num_to_remove, index, i, overwrite, err;
 859     size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset;
 860     off_t overlap, block_start, block_end;
 861
 862     block_start = block_num*jhdr_size;
 863     block_end = block_start + size;
 864     overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);
 865
 866     // first, eliminate any overlap with the previous entry
 867     if (blk_index != 0 && !overwrite) {
 868         off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
 869         off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
 870         overlap = prev_block_end - block_start;
 871         if (overlap > 0) {
 872             if (overlap % jhdr_size != 0) {
 873                 panic("jnl: do_overlap: overlap with previous entry not a multiple of %lu\n", jhdr_size);
 874             }
 875
 876             // if the previous entry completely overlaps this one, we need to break it into two pieces.
 877             if (prev_block_end > block_end) {
 878                 off_t new_num = block_end / jhdr_size;
 879                 size_t new_size = prev_block_end - block_end;
 880
 881                 new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);
 882
 883                 err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0);
 884                 if (err < 0) {
 885                     panic("jnl: do_overlap: error inserting during pre-overlap\n");
 886                 }
 887             }
 888
 889             // Regardless, we need to truncate the previous entry to the beginning of the overlap
 890             (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start;
 891             (*buf_ptr)[blk_index-1].cksum = 0;   // have to blow it away because there's no way to check it
 892         }
 893     }
 894
 895     // then, bail out fast if there's no overlap with the entries that follow
 896     if (!overwrite && block_end <= (*buf_ptr)[blk_index].block_num*jhdr_size) {
 897         return 0; // no overlap, no overwrite
 898     } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (*buf_ptr)[blk_index+1].block_num*jhdr_size)) {
 899
 900         (*buf_ptr)[blk_index].cksum = cksum;   // update this
 901         return 1; // simple overwrite
 902     }
 903
 904     // Otherwise, find all cases of total and partial overlap. We use the special
 905     // block_num of -2 to designate entries that are completely overlapped and must
 906     // be eliminated. The block_num, size, and jnl_offset of partially overlapped
 907     // entries must be adjusted to keep the array consistent.
 908     index = blk_index;
 909     num_to_remove = 0;
 910     while(index < *num_full_ptr && block_end > (*buf_ptr)[index].block_num*jhdr_size) {
 911         if (block_end >= ((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size)) {
 912             (*buf_ptr)[index].block_num = -2; // mark this for deletion
 913             num_to_remove++;
 914         } else {
 915             overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
 916             if (overlap > 0) {
 917                 if (overlap % jhdr_size != 0) {
 918                     panic("jnl: do_overlap: overlap of %lld is not multiple of %lu\n", overlap, jhdr_size);
 919                 }
 920
 921                 // if we partially overlap this entry, adjust its block number, jnl offset, and size
 922                 (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
 923                 (*buf_ptr)[index].cksum = 0;
 924
 925                 new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
 926                 if (new_offset >= jnl->jhdr->size) {
 927                     new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
 928                 }
 929                 (*buf_ptr)[index].jnl_offset = new_offset;
 930
 931                 (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
 932                 if ((*buf_ptr)[index].block_size <= 0) {
 933                     panic("jnl: do_overlap: after overlap, new block size is invalid (%lu)\n", (*buf_ptr)[index].block_size);
 934                     // return -1; // if above panic is removed, return -1 for error
 935                 }
 936             }
 937
 938         }
 939
 940         index++;
 941     }
 942
 943     // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
 944     index--; // start with the last index used within the above loop
 945     while(index >= blk_index) {
 946         if ((*buf_ptr)[index].block_num == -2) {
 947             if (index == *num_full_ptr-1) {
 948                 (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
 949             } else {
 950                 bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
 951             }
 952             (*num_full_ptr)--;
 953         }
 954         index--;
 955     }
 956
 957     // eliminate any stale entries at the end of the table
 958     for(i=*num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
 959         (*buf_ptr)[i].block_num = -1;
 960     }
 961
 962     return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
 963 }
 964
 965 // PR-3105942: Coalesce writes to the same block in journal replay
 966 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
 967 // to be replayed and the corresponding location in the journal which contains
 968 // the most recent data for those blocks. The array is "played" once the all the
 969 // blocks in the journal have been coalesced. The code for the case of conflicting/
 970 // overlapping writes to a single block is the most dense. Because coalescing can
 971 // disrupt the existing time-ordering of blocks in the journal playback, care
 972 // is taken to catch any overlaps and keep the array consistent.
 973 static int
 974 add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
 975 {
 976     int blk_index, overwriting;
 977
 978     // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
 979     // inserted (or the index of the elem to overwrite).
 980     blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);
 981
 982     // check if the index is within bounds (if we're adding this block to the end of
 983     // the table, blk_index will be equal to num_full)
 984     if (blk_index < 0 || blk_index > *num_full_ptr) {
 985         //printf("jnl: add_block: trouble adding block to co_buf\n");
 986         return -1;
 987     } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
 988
 989     // Determine whether we're overwriting an existing entry by checking for overlap
 990     overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr);
 991     if (overwriting < 0) {
 992         return -1; // if we got an error, pass it along
 993     }
 994
 995     // returns the index, or -1 on error
 996     blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting);
 997
 998     return blk_index;
 999 }
1000
1001 static int
1002 replay_journal(journal *jnl)
1003 {
1004     int i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0;
1005     size_t ret;
1006     size_t  max_bsize = 0;              /* protected by block_ptr */
1007     block_list_header *blhdr;
1008     off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start;
1009     char *buff, *block_ptr=NULL;
1010     struct bucket *co_buf;
1011     int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
1012     uint32_t last_sequence_num = 0;
1013
1014     // wrap the start ptr if it points to the very end of the journal
1015     if (jnl->jhdr->start == jnl->jhdr->size) {
1016                 jnl->jhdr->start = jnl->jhdr->jhdr_size;
1017     }
1018     if (jnl->jhdr->end == jnl->jhdr->size) {
1019                 jnl->jhdr->end = jnl->jhdr->jhdr_size;
1020     }
1021
1022     if (jnl->jhdr->start == jnl->jhdr->end) {
1023                 return 0;
1024     }
1025
1026     orig_jnl_start = jnl->jhdr->start;
1027
1028     // allocate memory for the header_block.  we'll read each blhdr into this
1029     if (kmem_alloc(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) {
1030                 printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n",
1031                     jnl->jdev_name, jnl->jhdr->blhdr_size);
1032                 return -1;
1033     }
1034
1035     // allocate memory for the coalesce buffer
1036     if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
1037         printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name);
1038         return -1;
1039     }
1040
1041   restart_replay:
1042
1043     // initialize entries
1044     for(i=0; i < num_buckets; i++) {
1045         co_buf[i].block_num = -1;
1046     }
1047     num_full = 0; // empty at first
1048
1049
1050     printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
1051         jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);
1052
1053     while(check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) {
1054                 offset = blhdr_offset = jnl->jhdr->start;
1055                 ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
1056                 if (ret != (size_t)jnl->jhdr->blhdr_size) {
1057                     printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset);
1058                     bad_blocks = 1;
1059                     goto bad_txn_handling;
1060                 }
1061
1062                 blhdr = (block_list_header *)buff;
1063
1064                 orig_checksum = blhdr->checksum;
1065                 blhdr->checksum = 0;
1066                 if (jnl->flags & JOURNAL_NEED_SWAP) {
1067                         // calculate the checksum based on the unswapped data
1068                         // because it is done byte-at-a-time.
1069                         orig_checksum = SWAP32(orig_checksum);
1070                         checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1071                         swap_block_list_header(jnl, blhdr);
1072                 } else {
1073                         checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1074                 }
1075
1076
1077                 //
1078                 // XXXdbg - if these checks fail, we should replay as much
1079                 //          we can in the hopes that it will still leave the
1080                 //          drive in a better state than if we didn't replay
1081                 //          anything
1082                 //
1083                 if (checksum != orig_checksum) {
1084                     if (check_past_jnl_end && in_uncharted_territory) {
1085
1086                         if (blhdr_offset != jnl->jhdr->end) {
1087                             printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
1088                         }
1089
1090                         check_past_jnl_end = 0;
1091                         jnl->jhdr->end = blhdr_offset;
1092                         continue;
1093                     }
1094
1095                     printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
1096                         jnl->jdev_name, blhdr_offset, orig_checksum, checksum);
1097
1098                     if (blhdr_offset == orig_jnl_start) {
1099                         // if there's nothing in the journal at all, just bail out altogether.
1100                         goto bad_replay;
1101                     }
1102
1103                     bad_blocks = 1;
1104                     goto bad_txn_handling;
1105                 }
1106
1107                 if (blhdr->binfo[0].b.sequence_num < last_sequence_num) {
1108                     txn_start_offset = jnl->jhdr->end = blhdr_offset;
1109
1110                     if (check_past_jnl_end) {
1111                         check_past_jnl_end = 0;
1112                         printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
1113                             jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].b.sequence_num, last_sequence_num);
1114                         continue;
1115                     }
1116
1117                     printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
1118                         jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].b.sequence_num, last_sequence_num);
1119                     bad_blocks = 1;
1120                     goto bad_txn_handling;
1121                 }
1122                 last_sequence_num = blhdr->binfo[0].b.sequence_num;
1123
1124                 if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) {
1125                     printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
1126                 }
1127
1128                 if (   blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048
1129                            || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
1130                     printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n",
1131                         jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks);
1132                     bad_blocks = 1;
1133                     goto bad_txn_handling;
1134                 }
1135
1136                 max_bsize = 0;
1137                 for(i=1; i < blhdr->num_blocks; i++) {
1138                         if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
1139                             printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum);
1140                             bad_blocks = 1;
1141                             goto bad_txn_handling;
1142                         }
1143
1144                         if (blhdr->binfo[i].bsize > max_bsize) {
1145                             max_bsize = blhdr->binfo[i].bsize;
1146                         }
1147                 }
1148
1149                 if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) {
1150                     check_block_checksums = 1;
1151                     if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1152                         goto bad_replay;
1153                     }
1154                 } else {
1155                     block_ptr = NULL;
1156                 }
1157
1158                 if (blhdr->flags & BLHDR_FIRST_HEADER) {
1159                     txn_start_offset = blhdr_offset;
1160                 }
1161
1162                 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1163                 //       blhdr->num_blocks-1, jnl->jhdr->start);
1164                 bad_blocks = 0;
1165                 for(i=1; i < blhdr->num_blocks; i++) {
1166                         int size, ret_val;
1167                         off_t number;
1168
1169                         size = blhdr->binfo[i].bsize;
1170                         number = blhdr->binfo[i].bnum;
1171
1172                         // don't add "killed" blocks
1173                         if (number == (off_t)-1) {
1174                             //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1175                         } else {
1176
1177                             if (check_block_checksums) {
1178                                 int32_t disk_cksum;
1179                                 off_t block_offset;
1180
1181                                 block_offset = offset;
1182
1183                                 // read the block so we can check the checksum
1184                                 ret = read_journal_data(jnl, &block_offset, block_ptr, size);
1185                                 if (ret != (size_t)size) {
1186                                     printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
1187                                     bad_blocks = 1;
1188                                     goto bad_txn_handling;
1189                                 }
1190
1191                                 disk_cksum = calc_checksum(block_ptr, size);
1192
1193                                 // there is no need to swap the checksum from disk because
1194                                 // it got swapped when the blhdr was read in.
1195                                 if (blhdr->binfo[i].b.cksum != 0 && disk_cksum != blhdr->binfo[i].b.cksum) {
1196                                     printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
1197                                         jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].b.cksum);
1198                                     printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x  0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
1199                                         *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)],
1200                                         *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]);
1201
1202                                     bad_blocks = 1;
1203                                     goto bad_txn_handling;
1204                                 }
1205                             }
1206
1207
1208                             // add this bucket to co_buf, coalescing where possible
1209                             // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1210                             ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].b.cksum, &num_buckets, &num_full);
1211
1212                             if (ret_val == -1) {
1213                                 printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name);
1214                                 goto bad_replay;
1215                             } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1216                         }
1217
1218                         // increment offset
1219                         offset += size;
1220
1221                         // check if the last block added puts us off the end of the jnl.
1222                         // if so, we need to wrap to the beginning and take any remainder
1223                         // into account
1224                         //
1225                         if (offset >= jnl->jhdr->size) {
1226                             offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
1227                         }
1228                 }
1229
1230                 if (block_ptr) {
1231                     kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1232                     block_ptr = NULL;
1233                 }
1234
1235       bad_txn_handling:
1236                 if (bad_blocks) {
1237                     if (txn_start_offset == 0) {
1238                         printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
1239                         goto bad_replay;
1240                     }
1241
1242                     jnl->jhdr->start = orig_jnl_start;
1243                     jnl->jhdr->end = txn_start_offset;
1244                     check_past_jnl_end = 0;
1245                     last_sequence_num = 0;
1246                     printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1247                     goto restart_replay;
1248                 }
1249
1250                 jnl->jhdr->start += blhdr->bytes_used;
1251                 if (jnl->jhdr->start >= jnl->jhdr->size) {
1252                         // wrap around and skip the journal header block
1253                         jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
1254                 }
1255
1256                 if (jnl->jhdr->start == jnl->jhdr->end) {
1257                     in_uncharted_territory = 1;
1258                 }
1259     }
1260
1261     if (jnl->jhdr->start != jnl->jhdr->end) {
1262         printf("jnl: %s: start %lld != end %lld.  resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1263         jnl->jhdr->end = jnl->jhdr->start;
1264     }
1265
1266     //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1267
1268     /*
1269      * make sure it's at least one page in size, so
1270      * start max_bsize at PAGE_SIZE
1271      */
1272     for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {
1273
1274             if (co_buf[i].block_num == (off_t)-1)
1275                     continue;
1276
1277             if (co_buf[i].block_size > max_bsize)
1278                     max_bsize = co_buf[i].block_size;
1279     }
1280     /*
1281      * round max_bsize up to the nearest PAGE_SIZE multiple
1282      */
1283     if (max_bsize & (PAGE_SIZE - 1)) {
1284             max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1285     }
1286
1287     if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1288         goto bad_replay;
1289     }
1290
1291     // Replay the coalesced entries in the co-buf
1292     for(i=0; i < num_full; i++) {
1293         size_t size = co_buf[i].block_size;
1294         off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
1295         off_t number = co_buf[i].block_num;
1296
1297
1298         // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1299         //      co_buf[i].block_size, co_buf[i].jnl_offset);
1300
1301         if (number == (off_t)-1) {
1302             // printf("jnl: replay_journal: skipping killed fs block\n");
1303         } else {
1304
1305             // do journal read, and set the phys. block
1306             ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
1307             if (ret != size) {
1308                 printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
1309                 goto bad_replay;
1310             }
1311
1312             if (update_fs_block(jnl, block_ptr, number, size) != 0) {
1313                 goto bad_replay;
1314             }
1315         }
1316     }
1317
1318
1319     // done replaying; update jnl header
1320     if (write_journal_header(jnl) != 0) {
1321         goto bad_replay;
1322     }
1323
1324     printf("jnl: %s: journal replay done.\n", jnl->jdev_name);
1325
1326     // free block_ptr
1327     if (block_ptr) {
1328         kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1329         block_ptr = NULL;
1330     }
1331
1332     // free the coalesce buffer
1333     FREE(co_buf, M_TEMP);
1334     co_buf = NULL;
1335
1336     kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1337     return 0;
1338
1339   bad_replay:
1340     if (block_ptr) {
1341                 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1342     }
1343     if (co_buf) {
1344       FREE(co_buf, M_TEMP);
1345     }
1346     kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1347
1348     return -1;
1349 }
1350
1351
1352 #define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
1353 //#define DEFAULT_TRANSACTION_BUFFER_SIZE  (256*1024)  // better performance but uses more mem
1354 #define MAX_TRANSACTION_BUFFER_SIZE      (512*1024)
1355
1356 // XXXdbg - so I can change it in the debugger
1357 int def_tbuffer_size = 0;
1358
1359
1360 //
1361 // This function sets the size of the tbuffer and the
1362 // size of the blhdr.  It assumes that jnl->jhdr->size
1363 // and jnl->jhdr->jhdr_size are already valid.
1364 //
1365 static void
1366 size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
1367 {
1368         //
1369         // one-time initialization based on how much memory
1370         // there is in the machine.
1371         //
1372         if (def_tbuffer_size == 0) {
1373                 if (mem_size < (256*1024*1024)) {
1374                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
1375                 } else if (mem_size < (512*1024*1024)) {
1376                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
1377                 } else if (mem_size < (1024*1024*1024)) {
1378                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
1379                 } else if (mem_size >= (1024*1024*1024)) {
1380                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 4;
1381                 }
1382         }
1383
1384     // size up the transaction buffer... can't be larger than the number
1385     // of blocks that can fit in a block_list_header block.
1386     if (tbuffer_size == 0) {
1387                 jnl->tbuffer_size = def_tbuffer_size;
1388     } else {
1389                 // make sure that the specified tbuffer_size isn't too small
1390                 if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
1391                         tbuffer_size = jnl->jhdr->blhdr_size * 2;
1392                 }
1393                 // and make sure it's an even multiple of the block size
1394                 if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
1395                         tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
1396                 }
1397
1398                 jnl->tbuffer_size = tbuffer_size;
1399     }
1400
1401     if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
1402                 jnl->tbuffer_size = (jnl->jhdr->size / 2);
1403     }
1404
1405     if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
1406                 jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
1407     }
1408
1409     jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
1410     if (jnl->jhdr->blhdr_size < phys_blksz) {
1411         jnl->jhdr->blhdr_size = phys_blksz;
1412     } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
1413                 // have to round up so we're an even multiple of the physical block size
1414                 jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
1415     }
1416 }
1417
1418
1419
1420 static void
1421 get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context)
1422 {
1423     off_t       readblockcnt;
1424     off_t       writeblockcnt;
1425     off_t       readmaxcnt;
1426     off_t       writemaxcnt;
1427     int32_t     features;
1428
1429     if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
1430         if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
1431             const char *name = vnode_name(devvp);
1432             jnl->flags |= JOURNAL_DO_FUA_WRITES;
1433             printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features);
1434         }
1435     }
1436
1437     if (VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context)) {
1438         readmaxcnt = 0;
1439     }
1440
1441     if (readmaxcnt == 0) {
1442         if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context)) {
1443             readmaxcnt = 128 * 1024;
1444         } else {
1445             readmaxcnt = readblockcnt * phys_blksz;
1446         }
1447     }
1448
1449
1450     if (VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context)) {
1451         writemaxcnt = 0;
1452     }
1453
1454     if (writemaxcnt == 0) {
1455         if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context)) {
1456             writemaxcnt = 128 * 1024;
1457         } else {
1458             writemaxcnt = writeblockcnt * phys_blksz;
1459         }
1460     }
1461
1462     jnl->max_read_size  = readmaxcnt;
1463     jnl->max_write_size = writemaxcnt;
1464
1465     // just in case it's still zero...
1466     if (jnl->max_read_size == 0) {
1467         jnl->max_read_size = 128 * 1024;
1468         jnl->max_write_size = 128 * 1024;
1469     }
1470 }
1471
1472
//
// get_jdev_name
//
// Returns the result of vfs_addname() for the name of the journal device
// vnode, or for the placeholder "unknown-dev" when vnode_name() returns NULL.
// For an existing name this just bumps the refcount on it, so the caller
// ends up with its own copy.
//
static const char *
get_jdev_name(struct vnode *jvp)
{
    const char *name = vnode_name(jvp);

    if (name == NULL) {
        name = "unknown-dev";
    }

    return vfs_addname(name, strlen(name), 0, 0);
}
1488
1489
1490 journal *
1491 journal_create(struct vnode *jvp,
1492                            off_t         offset,
1493                            off_t         journal_size,
1494                            struct vnode *fsvp,
1495                            size_t        min_fs_blksz,
1496                            int32_t       flags,
1497                            int32_t       tbuffer_size,
1498                            void        (*flush)(void *arg),
1499                            void         *arg)
1500 {
1501     journal *jnl;
1502     size_t      phys_blksz;
1503     struct vfs_context context;
1504     const char *jdev_name;
1505
1506     context.vc_thread = current_thread();
1507     context.vc_ucred = FSCRED;
1508
1509     jdev_name = get_jdev_name(jvp);
1510
1511     /* Get the real physical block size. */
1512     if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1513         return NULL;
1514     }
1515
1516     if (phys_blksz > min_fs_blksz) {
1517                 printf("jnl: %s: create: error: phys blksize %lu bigger than min fs blksize %lu\n",
1518                     jdev_name, phys_blksz, min_fs_blksz);
1519                 return NULL;
1520     }
1521
1522     if ((journal_size % phys_blksz) != 0) {
1523                 printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%lx\n",
1524                     jdev_name, journal_size, phys_blksz);
1525                 return NULL;
1526     }
1527
1528
1529     MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1530     memset(jnl, 0, sizeof(*jnl));
1531
1532     jnl->jdev         = jvp;
1533     jnl->jdev_offset  = offset;
1534     jnl->fsdev        = fsvp;
1535     jnl->flush        = flush;
1536     jnl->flush_arg    = arg;
1537     jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
1538     jnl->jdev_name    = jdev_name;
1539     lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1540
1541     get_io_info(jvp, phys_blksz, jnl, &context);
1542
1543     if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1544         printf("jnl: %s: create: could not allocate space for header buffer (%lu bytes)\n", jdev_name, phys_blksz);
1545         goto bad_kmem_alloc;
1546     }
1547
1548     memset(jnl->header_buf, 0, phys_blksz);
1549
1550     jnl->jhdr             = (journal_header *)jnl->header_buf;
1551     jnl->jhdr->magic      = JOURNAL_HEADER_MAGIC;
1552     jnl->jhdr->endian     = ENDIAN_MAGIC;
1553     jnl->jhdr->start      = phys_blksz;    // start at block #1, block #0 is for the jhdr itself
1554     jnl->jhdr->end        = phys_blksz;
1555     jnl->jhdr->size       = journal_size;
1556     jnl->jhdr->jhdr_size  = phys_blksz;
1557     size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1558
1559         jnl->active_start     = jnl->jhdr->start;
1560
1561     // XXXdbg  - for testing you can force the journal to wrap around
1562     // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
1563     // jnl->jhdr->end   = jnl->jhdr->size - (phys_blksz*3);
1564
1565     jnl->jhdr->sequence_num = random() & 0x00ffffff;
1566
1567         lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
1568
1569     if (write_journal_header(jnl) != 0) {
1570         printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name);
1571         goto bad_write;
1572     }
1573
1574     return jnl;
1575
1576
1577   bad_write:
1578     kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1579   bad_kmem_alloc:
1580     if (jdev_name) {
1581         vfs_removename(jdev_name);
1582     }
1583     jnl->jhdr = NULL;
1584     FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1585     return NULL;
1586 }
1587
1588
1589 journal *
1590 journal_open(struct vnode *jvp,
1591                          off_t         offset,
1592                          off_t         journal_size,
1593                          struct vnode *fsvp,
1594                          size_t        min_fs_blksz,
1595                          int32_t       flags,
1596                          int32_t       tbuffer_size,
1597                          void        (*flush)(void *arg),
1598                          void         *arg)
1599 {
1600     journal *jnl;
1601     int      orig_blksz=0;
1602     size_t   phys_blksz;
1603     int      orig_checksum, checksum;
1604     struct vfs_context context;
1605     const char *jdev_name = get_jdev_name(jvp);
1606
1607     context.vc_thread = current_thread();
1608     context.vc_ucred = FSCRED;
1609
1610     /* Get the real physical block size. */
1611     if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1612                 return NULL;
1613     }
1614
1615     if (phys_blksz > min_fs_blksz) {
1616                 printf("jnl: %s: open: error: phys blksize %lu bigger than min fs blksize %lu\n",
1617                     jdev_name, phys_blksz, min_fs_blksz);
1618                 return NULL;
1619     }
1620
1621     if ((journal_size % phys_blksz) != 0) {
1622                 printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%lx\n",
1623                     jdev_name, journal_size, phys_blksz);
1624                 return NULL;
1625     }
1626
1627     MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1628     memset(jnl, 0, sizeof(*jnl));
1629
1630     jnl->jdev         = jvp;
1631     jnl->jdev_offset  = offset;
1632     jnl->fsdev        = fsvp;
1633     jnl->flush        = flush;
1634     jnl->flush_arg    = arg;
1635     jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
1636     jnl->jdev_name    = jdev_name;
1637     lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1638
1639     get_io_info(jvp, phys_blksz, jnl, &context);
1640
1641     if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1642         printf("jnl: %s: create: could not allocate space for header buffer (%lu bytes)\n", jdev_name, phys_blksz);
1643         goto bad_kmem_alloc;
1644     }
1645
1646     jnl->jhdr = (journal_header *)jnl->header_buf;
1647     memset(jnl->jhdr, 0, sizeof(journal_header));
1648
1649     // we have to set this up here so that do_journal_io() will work
1650     jnl->jhdr->jhdr_size = phys_blksz;
1651
1652     if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
1653                 printf("jnl: %s: open: could not read %lu bytes for the journal header.\n",
1654                     jdev_name, phys_blksz);
1655                 goto bad_journal;
1656     }
1657
1658         orig_checksum = jnl->jhdr->checksum;
1659         jnl->jhdr->checksum = 0;
1660
1661         if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1662                 // do this before the swap since it's done byte-at-a-time
1663                 orig_checksum = SWAP32(orig_checksum);
1664                 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1665                 swap_journal_header(jnl);
1666                 jnl->flags |= JOURNAL_NEED_SWAP;
1667         } else {
1668                 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1669         }
1670
1671     if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1672                 printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n",
1673                     jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
1674                 goto bad_journal;
1675     }
1676
1677         // only check if we're the current journal header magic value
1678         if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
1679
1680                 if (orig_checksum != checksum) {
1681                         printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n",
1682                             jdev_name, orig_checksum, checksum);
1683
1684                         //goto bad_journal;
1685                 }
1686         }
1687
1688         // XXXdbg - convert old style magic numbers to the new one
1689         if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
1690                 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
1691         }
1692
1693     if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
1694                 printf("jnl: %s: open: phys_blksz %lu does not match journal header size %d\n",
1695                     jdev_name, phys_blksz, jnl->jhdr->jhdr_size);
1696
1697                 orig_blksz = phys_blksz;
1698                 phys_blksz = jnl->jhdr->jhdr_size;
1699                 if (VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context)) {
1700                     printf("jnl: %s: could not set block size to %lu bytes.\n", jdev_name, phys_blksz);
1701                     goto bad_journal;
1702                 }
1703 //              goto bad_journal;
1704     }
1705
1706     if (   jnl->jhdr->start <= 0
1707                    || jnl->jhdr->start > jnl->jhdr->size
1708                    || jnl->jhdr->start > 1024*1024*1024) {
1709                 printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
1710                     jdev_name, jnl->jhdr->start, jnl->jhdr->size);
1711                 goto bad_journal;
1712     }
1713
1714     if (   jnl->jhdr->end <= 0
1715                    || jnl->jhdr->end > jnl->jhdr->size
1716                    || jnl->jhdr->end > 1024*1024*1024) {
1717                 printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
1718                     jdev_name, jnl->jhdr->end, jnl->jhdr->size);
1719                 goto bad_journal;
1720     }
1721
1722     if (jnl->jhdr->size > 1024*1024*1024) {
1723         printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size);
1724         goto bad_journal;
1725     }
1726
1727 // XXXdbg - can't do these checks because hfs writes all kinds of
1728 //          non-uniform sized blocks even on devices that have a block size
1729 //          that is larger than 512 bytes (i.e. optical media w/2k blocks).
1730 //          therefore these checks will fail and so we just have to punt and
1731 //          do more relaxed checking...
1732 // XXXdbg    if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
1733     if ((jnl->jhdr->start % 512) != 0) {
1734                 printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n",
1735                     jdev_name, jnl->jhdr->start);
1736                 goto bad_journal;
1737     }
1738
1739 //XXXdbg    if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
1740     if ((jnl->jhdr->end % 512) != 0) {
1741                 printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
1742                     jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size);
1743                 goto bad_journal;
1744     }
1745
1746     // take care of replaying the journal if necessary
1747     if (flags & JOURNAL_RESET) {
1748         printf("jnl: %s: journal start/end pointers reset! (jnl %p; s 0x%llx e 0x%llx)\n",
1749             jdev_name, jnl, jnl->jhdr->start, jnl->jhdr->end);
1750         jnl->jhdr->start = jnl->jhdr->end;
1751     } else if (replay_journal(jnl) != 0) {
1752         printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
1753         goto bad_journal;
1754     }
1755
1756     if (orig_blksz != 0) {
1757         VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
1758         phys_blksz = orig_blksz;
1759         if (orig_blksz < jnl->jhdr->jhdr_size) {
1760             printf("jnl: %s: open: jhdr_size is %d but orig phys blk size is %d.  switching.\n",
1761                 jdev_name, jnl->jhdr->jhdr_size, orig_blksz);
1762
1763             jnl->jhdr->jhdr_size = orig_blksz;
1764         }
1765     }
1766
1767     // make sure this is in sync!
1768     jnl->active_start = jnl->jhdr->start;
1769
1770     // set this now, after we've replayed the journal
1771     size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1772
1773     lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
1774
1775     return jnl;
1776
1777   bad_journal:
1778     if (orig_blksz != 0) {
1779         phys_blksz = orig_blksz;
1780         VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
1781     }
1782     kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1783   bad_kmem_alloc:
1784     if (jdev_name) {
1785         vfs_removename(jdev_name);
1786     }
1787     FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1788     return NULL;
1789 }
1790
1791
1792 int
1793 journal_is_clean(struct vnode *jvp,
1794                  off_t         offset,
1795                  off_t         journal_size,
1796                  struct vnode *fsvp,
1797                  size_t        min_fs_block_size)
1798 {
1799     journal jnl;
1800     int     phys_blksz, ret;
1801     int     orig_checksum, checksum;
1802     struct vfs_context context;
1803     const char *jdev_name = get_jdev_name(jvp);
1804
1805     context.vc_thread = current_thread();
1806     context.vc_ucred = FSCRED;
1807
1808     /* Get the real physical block size. */
1809     if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1810         printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
1811         return EINVAL;
1812     }
1813
1814     if (phys_blksz > (int)min_fs_block_size) {
1815         printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %lu\n",
1816             jdev_name, phys_blksz, min_fs_block_size);
1817         return EINVAL;
1818     }
1819
1820     if ((journal_size % phys_blksz) != 0) {
1821         printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1822             jdev_name, journal_size, phys_blksz);
1823         return EINVAL;
1824     }
1825
1826     memset(&jnl, 0, sizeof(jnl));
1827
1828     if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
1829         printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
1830         return ENOMEM;
1831     }
1832
1833     get_io_info(jvp, phys_blksz, &jnl, &context);
1834
1835     jnl.jhdr = (journal_header *)jnl.header_buf;
1836     memset(jnl.jhdr, 0, sizeof(journal_header));
1837
1838     jnl.jdev        = jvp;
1839     jnl.jdev_offset = offset;
1840     jnl.fsdev       = fsvp;
1841
1842     // we have to set this up here so that do_journal_io() will work
1843     jnl.jhdr->jhdr_size = phys_blksz;
1844
1845     if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
1846         printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n",
1847             jdev_name, phys_blksz);
1848         ret = EINVAL;
1849         goto get_out;
1850     }
1851
1852     orig_checksum = jnl.jhdr->checksum;
1853     jnl.jhdr->checksum = 0;
1854
1855     if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1856         // do this before the swap since it's done byte-at-a-time
1857         orig_checksum = SWAP32(orig_checksum);
1858         checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1859         swap_journal_header(&jnl);
1860         jnl.flags |= JOURNAL_NEED_SWAP;
1861     } else {
1862         checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1863     }
1864
1865     if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1866         printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n",
1867             jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
1868         ret = EINVAL;
1869         goto get_out;
1870     }
1871
1872     if (orig_checksum != checksum) {
1873         printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum);
1874         ret = EINVAL;
1875         goto get_out;
1876     }
1877
1878     //
1879     // if the start and end are equal then the journal is clean.
1880     // otherwise it's not clean and therefore an error.
1881     //
1882     if (jnl.jhdr->start == jnl.jhdr->end) {
1883         ret = 0;
1884     } else {
1885         ret = EINVAL;
1886     }
1887
1888   get_out:
1889     kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
1890     if (jdev_name) {
1891         vfs_removename(jdev_name);
1892     }
1893
1894     return ret;
1895
1896
1897 }
1898
1899
1900 void
1901 journal_close(journal *jnl)
1902 {
1903     volatile off_t *start, *end;
1904     int             counter=0;
1905
1906     CHECK_JOURNAL(jnl);
1907
1908         // set this before doing anything that would block so that
1909         // we start tearing things down properly.
1910         //
1911         jnl->flags |= JOURNAL_CLOSE_PENDING;
1912
1913     if (jnl->owner != current_thread()) {
1914                 lock_journal(jnl);
1915     }
1916
1917     //
1918     // only write stuff to disk if the journal is still valid
1919     //
1920     if ((jnl->flags & JOURNAL_INVALID) == 0) {
1921
1922                 if (jnl->active_tr) {
1923                         journal_end_transaction(jnl);
1924                 }
1925
1926                 // flush any buffered transactions
1927                 if (jnl->cur_tr) {
1928                         transaction *tr = jnl->cur_tr;
1929
1930                         jnl->cur_tr = NULL;
1931                         end_transaction(tr, 1, NULL, NULL);   // force it to get flushed
1932                 }
1933
1934                 //start = &jnl->jhdr->start;
1935                 start = &jnl->active_start;
1936                 end   = &jnl->jhdr->end;
1937
1938                 while (*start != *end && counter++ < 5000) {
1939                         //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
1940                         if (jnl->flush) {
1941                                 jnl->flush(jnl->flush_arg);
1942                         }
1943                         tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 2);
1944                 }
1945
1946                 if (*start != *end) {
1947                         printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
1948                             jnl->jdev_name, *start, *end);
1949                 }
1950
1951                 // make sure this is in sync when we close the journal
1952                 jnl->jhdr->start = jnl->active_start;
1953
1954                 // if this fails there's not much we can do at this point...
1955                 write_journal_header(jnl);
1956     } else {
1957                 // if we're here the journal isn't valid any more.
1958                 // so make sure we don't leave any locked blocks lying around
1959                 printf("jnl: %s: close: journal %p, is invalid.  aborting outstanding transactions\n", jnl->jdev_name, jnl);
1960                 if (jnl->active_tr || jnl->cur_tr) {
1961                         transaction *tr;
1962                         if (jnl->active_tr) {
1963                                 tr = jnl->active_tr;
1964                                 jnl->active_tr = NULL;
1965                         } else {
1966                                 tr = jnl->cur_tr;
1967                                 jnl->cur_tr = NULL;
1968                         }
1969
1970                         abort_transaction(jnl, tr);
1971                         if (jnl->active_tr || jnl->cur_tr) {
1972                             panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl);
1973                         }
1974                 }
1975     }
1976
1977     free_old_stuff(jnl);
1978
1979     kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size);
1980     jnl->jhdr = (void *)0xbeefbabe;
1981
1982     if (jnl->jdev_name) {
1983         vfs_removename(jnl->jdev_name);
1984     }
1985
1986     FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1987 }
1988
1989 static void
1990 dump_journal(journal *jnl)
1991 {
1992     transaction *ctr;
1993
1994     printf("journal for dev %s:", jnl->jdev_name);
1995     printf("  jdev_offset %.8llx\n", jnl->jdev_offset);
1996     printf("  magic: 0x%.8x\n", jnl->jhdr->magic);
1997     printf("  start: 0x%.8llx\n", jnl->jhdr->start);
1998     printf("  end:   0x%.8llx\n", jnl->jhdr->end);
1999     printf("  size:  0x%.8llx\n", jnl->jhdr->size);
2000     printf("  blhdr size: %d\n", jnl->jhdr->blhdr_size);
2001     printf("  jhdr size: %d\n", jnl->jhdr->jhdr_size);
2002     printf("  chksum: 0x%.8x\n", jnl->jhdr->checksum);
2003
2004     printf("  completed transactions:\n");
2005     for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) {
2006                 printf("    0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
2007     }
2008 }
2009
2010
2011
2012 static off_t
2013 free_space(journal *jnl)
2014 {
2015     off_t free_space_offset;
2016
2017     if (jnl->jhdr->start < jnl->jhdr->end) {
2018                 free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
2019     } else if (jnl->jhdr->start > jnl->jhdr->end) {
2020                 free_space_offset = jnl->jhdr->start - jnl->jhdr->end;
2021     } else {
2022                 // journal is completely empty
2023                 free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size;
2024     }
2025
2026     return free_space_offset;
2027 }
2028
2029
2030 //
2031 // The journal must be locked on entry to this function.
2032 // The "desired_size" is in bytes.
2033 //
2034 static int
2035 check_free_space(journal *jnl, int desired_size)
2036 {
2037     size_t i;
2038     int    counter=0;
2039
2040     //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
2041 //         desired_size, free_space(jnl));
2042
2043     while (1) {
2044                 int old_start_empty;
2045
2046                 if (counter++ == 5000) {
2047                         dump_journal(jnl);
2048                         panic("jnl: check_free_space: buffer flushing isn't working "
2049                                   "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl,
2050                                   jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
2051                 }
2052                 if (counter > 7500) {
2053                     printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name);
2054                     return ENOSPC;
2055                 }
2056
2057                 // make sure there's space in the journal to hold this transaction
2058                 if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) {
2059                         break;
2060                 }
2061                 //
2062                 // here's where we lazily bump up jnl->jhdr->start.  we'll consume
2063                 // entries until there is enough space for the next transaction.
2064                 //
2065                 old_start_empty = 1;
2066                 lock_oldstart(jnl);
2067                 for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
2068                         int   lcl_counter;
2069
2070                         lcl_counter = 0;
2071                         while (jnl->old_start[i] & 0x8000000000000000LL) {
2072                                 if (lcl_counter++ > 100) {
2073                                         panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
2074                                                   jnl->old_start[i], jnl);
2075                                 }
2076
2077                                 unlock_oldstart(jnl);
2078                                 if (jnl->flush) {
2079                                         jnl->flush(jnl->flush_arg);
2080                                 }
2081                                 tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
2082                                 lock_oldstart(jnl);
2083                         }
2084
2085                         if (jnl->old_start[i] == 0) {
2086                                 continue;
2087                         }
2088
2089                         old_start_empty   = 0;
2090                         jnl->jhdr->start  = jnl->old_start[i];
2091                         jnl->old_start[i] = 0;
2092                         if (free_space(jnl) > desired_size) {
2093                                 unlock_oldstart(jnl);
2094                                 write_journal_header(jnl);
2095                                 lock_oldstart(jnl);
2096                                 break;
2097                         }
2098                 }
2099                 unlock_oldstart(jnl);
2100
2101                 // if we bumped the start, loop and try again
2102                 if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
2103                         continue;
2104                 } else if (old_start_empty) {
2105                         //
2106                         // if there is nothing in old_start anymore then we can
2107                         // bump the jhdr->start to be the same as active_start
2108                         // since it is possible there was only one very large
2109                         // transaction in the old_start array.  if we didn't do
2110                         // this then jhdr->start would never get updated and we
2111                         // would wind up looping until we hit the panic at the
2112                         // start of the loop.
2113                         //
2114                         jnl->jhdr->start = jnl->active_start;
2115                         write_journal_header(jnl);
2116                         continue;
2117                 }
2118
2119
2120                 // if the file system gave us a flush function, call it to so that
2121                 // it can flush some blocks which hopefully will cause some transactions
2122                 // to complete and thus free up space in the journal.
2123                 if (jnl->flush) {
2124                         jnl->flush(jnl->flush_arg);
2125                 }
2126
2127                 // wait for a while to avoid being cpu-bound (this will
2128                 // put us to sleep for 10 milliseconds)
2129                 tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
2130     }
2131
2132     return 0;
2133 }
2134
2135 /*
2136  * Allocate a new active transaction.
2137  */
2138 static errno_t
2139 journal_allocate_transaction(journal *jnl)
2140 {
2141         transaction *tr;
2142
2143         MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
2144     memset(tr, 0, sizeof(transaction));
2145
2146     tr->tbuffer_size = jnl->tbuffer_size;
2147
2148     if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
2149                 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
2150                 jnl->active_tr = NULL;
2151                 return ENOMEM;
2152     }
2153
2154     // journal replay code checksum check depends on this.
2155     memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
2156     // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
2157     memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2158
2159     tr->blhdr = (block_list_header *)tr->tbuffer;
2160     tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2161     tr->blhdr->num_blocks = 1;      // accounts for this header block
2162     tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
2163     tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER;
2164
2165     tr->sequence_num = ++jnl->jhdr->sequence_num;
2166     tr->num_blhdrs  = 1;
2167     tr->total_bytes = jnl->jhdr->blhdr_size;
2168     tr->jnl         = jnl;
2169
2170         jnl->active_tr  = tr;
2171
2172         return 0;
2173 }
2174
2175 int
2176 journal_start_transaction(journal *jnl)
2177 {
2178     int ret;
2179
2180     CHECK_JOURNAL(jnl);
2181
2182     if (jnl->flags & JOURNAL_INVALID) {
2183                 return EINVAL;
2184     }
2185
2186     if (jnl->owner == current_thread()) {
2187                 if (jnl->active_tr == NULL) {
2188                         panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
2189                                   jnl, jnl->owner, current_thread());
2190                 }
2191                 jnl->nested_count++;
2192                 return 0;
2193     }
2194
2195     lock_journal(jnl);
2196
2197     if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
2198                 panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
2199                           jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
2200     }
2201
2202     jnl->owner        = current_thread();
2203     jnl->nested_count = 1;
2204
2205     free_old_stuff(jnl);
2206
2207     // make sure there's room in the journal
2208     if (free_space(jnl) < jnl->tbuffer_size) {
2209         // this is the call that really waits for space to free up
2210         // as well as updating jnl->jhdr->start
2211         if (check_free_space(jnl, jnl->tbuffer_size) != 0) {
2212                 printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name);
2213                 ret = ENOSPC;
2214                 goto bad_start;
2215         }
2216     }
2217
2218     // if there's a buffered transaction, use it.
2219     if (jnl->cur_tr) {
2220                 jnl->active_tr = jnl->cur_tr;
2221                 jnl->cur_tr    = NULL;
2222
2223                 return 0;
2224     }
2225
2226         ret = journal_allocate_transaction(jnl);
2227         if (ret) {
2228                 goto bad_start;
2229         }
2230
2231     // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);
2232
2233     return 0;
2234
2235   bad_start:
2236         jnl->owner        = NULL;
2237         jnl->nested_count = 0;
2238         unlock_journal(jnl);
2239         return ret;
2240 }
2241
2242
2243 int
2244 journal_modify_block_start(journal *jnl, struct buf *bp)
2245 {
2246     transaction *tr;
2247
2248     CHECK_JOURNAL(jnl);
2249
2250     if (jnl->flags & JOURNAL_INVALID) {
2251                 return EINVAL;
2252     }
2253
2254     // XXXdbg - for debugging I want this to be true.  later it may
2255     //          not be necessary.
2256     if ((buf_flags(bp) & B_META) == 0) {
2257                 panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl);
2258     }
2259
2260     tr = jnl->active_tr;
2261     CHECK_TRANSACTION(tr);
2262
2263     if (jnl->owner != current_thread()) {
2264                 panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2265                           jnl, jnl->owner, current_thread());
2266     }
2267
2268     free_old_stuff(jnl);
2269
2270     //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
2271     //   bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2272
2273     // can't allow blocks that aren't an even multiple of the
2274     // underlying block size.
2275     if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) {
2276                 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
2277                           buf_size(bp), jnl->jhdr->jhdr_size);
2278                 return -1;
2279     }
2280
2281     // make sure that this transaction isn't bigger than the whole journal
2282     if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
2283                 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
2284                           tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp);
2285                 return -1;
2286     }
2287
2288     // if the block is dirty and not already locked we have to write
2289     // it out before we muck with it because it has data that belongs
2290     // (presumably) to another transaction.
2291     //
2292     if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) {
2293
2294                 if (buf_flags(bp) & B_ASYNC) {
2295                         panic("modify_block_start: bp @ %p has async flag set!\n", bp);
2296                 }
2297
2298                 // this will cause it to not be buf_brelse()'d
2299                 buf_setflags(bp, B_NORELSE);
2300                 VNOP_BWRITE(bp);
2301     }
2302     buf_setflags(bp, B_LOCKED);
2303
2304     return 0;
2305 }
2306
2307 int
2308 journal_modify_block_abort(journal *jnl, struct buf *bp)
2309 {
2310     transaction *tr;
2311         block_list_header *blhdr;
2312         int i;
2313
2314     CHECK_JOURNAL(jnl);
2315
2316     tr = jnl->active_tr;
2317
2318         //
2319         // if there's no active transaction then we just want to
2320         // call buf_brelse() and return since this is just a block
2321         // that happened to be modified as part of another tr.
2322         //
2323         if (tr == NULL) {
2324                 buf_brelse(bp);
2325                 return 0;
2326         }
2327
2328     if (jnl->flags & JOURNAL_INVALID) {
2329                 return EINVAL;
2330     }
2331
2332     CHECK_TRANSACTION(tr);
2333
2334     if (jnl->owner != current_thread()) {
2335                 panic("jnl: modify_block_abort: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2336                           jnl, jnl->owner, current_thread());
2337     }
2338
2339     free_old_stuff(jnl);
2340
2341     // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
2342
2343     // first check if it's already part of this transaction
2344     for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2345                 for(i=1; i < blhdr->num_blocks; i++) {
2346                         if (bp == blhdr->binfo[i].b.bp) {
2347                                 if (buf_size(bp) != blhdr->binfo[i].bsize) {
2348                                         panic("jnl: bp @ %p changed size on me! (%d vs. %lu, jnl %p)\n",
2349                                                   bp, buf_size(bp), blhdr->binfo[i].bsize, jnl);
2350                                 }
2351                                 break;
2352                         }
2353                 }
2354
2355                 if (i < blhdr->num_blocks) {
2356                         break;
2357                 }
2358     }
2359
2360         //
2361         // if blhdr is null, then this block has only had modify_block_start
2362         // called on it as part of the current transaction.  that means that
2363         // it is ok to clear the LOCKED bit since it hasn't actually been
2364         // modified.  if blhdr is non-null then modify_block_end was called
2365         // on it and so we need to keep it locked in memory.
2366         //
2367         if (blhdr == NULL) {
2368                   buf_clearflags(bp, B_LOCKED);
2369         }
2370
2371     buf_brelse(bp);
2372     return 0;
2373 }
2374
2375
2376 int
2377 journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf *bp, void *arg), void *arg)
2378 {
2379     int                i = 1;
2380     int                tbuffer_offset=0;
2381     char              *blkptr;
2382     block_list_header *blhdr, *prev=NULL;
2383     transaction       *tr;
2384
2385     CHECK_JOURNAL(jnl);
2386
2387     if (jnl->flags & JOURNAL_INVALID) {
2388                 return EINVAL;
2389     }
2390
2391     tr = jnl->active_tr;
2392     CHECK_TRANSACTION(tr);
2393
2394     if (jnl->owner != current_thread()) {
2395                 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2396                           jnl, jnl->owner, current_thread());
2397     }
2398
2399     free_old_stuff(jnl);
2400
2401     //printf("jnl: mod block end:  (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n",
2402     //   bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2403
2404     if ((buf_flags(bp) & B_LOCKED) == 0) {
2405                 panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp, jnl);
2406     }
2407
2408     // first check if it's already part of this transaction
2409     for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2410                 tbuffer_offset = jnl->jhdr->blhdr_size;
2411
2412                 for(i=1; i < blhdr->num_blocks; i++) {
2413                         if (bp == blhdr->binfo[i].b.bp) {
2414                                 if (buf_size(bp) != blhdr->binfo[i].bsize) {
2415                                         panic("jnl: bp @ %p changed size on me! (%d vs. %lu, jnl %p)\n",
2416                                                   bp, buf_size(bp), blhdr->binfo[i].bsize, jnl);
2417                                 }
2418                                 break;
2419                         }
2420                         tbuffer_offset += blhdr->binfo[i].bsize;
2421                 }
2422
2423                 if (i < blhdr->num_blocks) {
2424                         break;
2425                 }
2426     }
2427
2428     if (blhdr == NULL
2429                 && prev
2430                 && (prev->num_blocks+1) <= prev->max_blocks
2431                 && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) {
2432                 blhdr = prev;
2433     } else if (blhdr == NULL) {
2434                 block_list_header *nblhdr;
2435
2436                 if (prev == NULL) {
2437                         panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp);
2438                 }
2439
2440                 // we got to the end of the list, didn't find the block and there's
2441                 // no room in the block_list_header pointed to by prev
2442
2443                 // we allocate another tbuffer and link it in at the end of the list
2444                 // through prev->binfo[0].bnum.  that's a skanky way to do things but
2445                 // avoids having yet another linked list of small data structures to manage.
2446
2447                 if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
2448                         panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n",
2449                                   tr, tr->total_bytes);
2450                 }
2451
2452                 // journal replay code checksum check depends on this.
2453                 memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
2454                 // Fill up the rest of the block with unimportant bytes
2455                 memset(nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2456
2457                 // initialize the new guy
2458                 nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2459                 nblhdr->num_blocks = 1;      // accounts for this header block
2460                 nblhdr->bytes_used = jnl->jhdr->blhdr_size;
2461                 nblhdr->flags = BLHDR_CHECK_CHECKSUMS;
2462
2463                 tr->num_blhdrs++;
2464                 tr->total_bytes += jnl->jhdr->blhdr_size;
2465
2466                 // then link him in at the end
2467                 prev->binfo[0].bnum = (off_t)((long)nblhdr);
2468
2469                 // and finally switch to using the new guy
2470                 blhdr          = nblhdr;
2471                 tbuffer_offset = jnl->jhdr->blhdr_size;
2472                 i              = 1;
2473     }
2474
2475
2476     if ((i+1) > blhdr->max_blocks) {
2477                 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
2478     }
2479
2480         // if the function pointer is not set then copy the
2481         // block of data now.  if the function pointer is set
2482         // the copy will happen after calling the callback in
2483         // end_transaction() just before it goes to disk.
2484         //
2485         if (func == NULL) {
2486                 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
2487                 memcpy(blkptr, (char *)0 + buf_dataptr(bp), buf_size(bp));
2488         }
2489
2490     // if this is true then this is a new block we haven't seen
2491     if (i >= blhdr->num_blocks) {
2492                 int     bsize;
2493                 vnode_t vp;
2494
2495                 vp = buf_vnode(bp);
2496                 vnode_ref(vp);
2497                 bsize = buf_size(bp);
2498
2499                 blhdr->binfo[i].bnum  = (off_t)(buf_blkno(bp));
2500                 blhdr->binfo[i].bsize = bsize;
2501                 blhdr->binfo[i].b.bp    = bp;
2502                 if (func) {
2503                         void *old_func=NULL, *old_arg=NULL;
2504
2505                         buf_setfilter(bp, func, arg, &old_func, &old_arg);
2506                         if (old_func != NULL) {
2507                                 panic("jnl: modify_block_end: old func %p / arg %p", old_func, old_arg);
2508                         }
2509                 }
2510
2511                 blhdr->bytes_used += bsize;
2512                 tr->total_bytes   += bsize;
2513
2514                 blhdr->num_blocks++;
2515     }
2516     buf_bdwrite(bp);
2517
2518     return 0;
2519 }
2520
2521 int
2522 journal_kill_block(journal *jnl, struct buf *bp)
2523 {
2524     int                i;
2525     int                bflags;
2526     block_list_header *blhdr;
2527     transaction       *tr;
2528
2529     CHECK_JOURNAL(jnl);
2530
2531     if (jnl->flags & JOURNAL_INVALID) {
2532                 return EINVAL;
2533     }
2534
2535     tr = jnl->active_tr;
2536     CHECK_TRANSACTION(tr);
2537
2538     if (jnl->owner != current_thread()) {
2539                 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2540                           jnl, jnl->owner, current_thread());
2541     }
2542
2543     free_old_stuff(jnl);
2544
2545     bflags = buf_flags(bp);
2546
2547     if ( !(bflags & B_LOCKED))
2548             panic("jnl: modify_block_end: called with bp not B_LOCKED");
2549
2550     /*
2551      * bp must be BL_BUSY and B_LOCKED
2552      */
2553     // first check if it's already part of this transaction
2554     for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2555
2556                 for(i=1; i < blhdr->num_blocks; i++) {
2557                         if (bp == blhdr->binfo[i].b.bp) {
2558                                 vnode_t vp;
2559
2560                                 buf_clearflags(bp, B_LOCKED);
2561
2562                                 // this undoes the vnode_ref() in journal_modify_block_end()
2563                                 vp = buf_vnode(bp);
2564                                 vnode_rele_ext(vp, 0, 1);
2565
2566                                 // if the block has the DELWRI and FILTER bits sets, then
2567                                 // things are seriously weird.  if it was part of another
2568                                 // transaction then journal_modify_block_start() should
2569                                 // have force it to be written.
2570                                 //
2571                                 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
2572                                 //      panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
2573                                 //} else {
2574                                         tr->num_killed += buf_size(bp);
2575                                 //}
2576                                 blhdr->binfo[i].b.bp   = NULL;
2577                                 blhdr->binfo[i].bnum = (off_t)-1;
2578
2579                                 buf_markinvalid(bp);
2580                                 buf_brelse(bp);
2581
2582                                 break;
2583                         }
2584                 }
2585
2586                 if (i < blhdr->num_blocks) {
2587                         break;
2588                 }
2589     }
2590
2591     return 0;
2592 }
2593
2594
2595 static int
2596 journal_binfo_cmp(const void *a, const void *b)
2597 {
2598     const block_info *bi_a = (const struct block_info *)a;
2599     const block_info *bi_b = (const struct block_info *)b;
2600     daddr64_t res;
2601
2602     if (bi_a->b.bp == NULL) {
2603                 return 1;
2604     }
2605     if (bi_b->b.bp == NULL) {
2606                 return -1;
2607     }
2608
2609     // don't have to worry about negative block
2610     // numbers so this is ok to do.
2611     //
2612     res = (buf_blkno(bi_a->b.bp) - buf_blkno(bi_b->b.bp));
2613
2614     return (int)res;
2615 }
2616
2617
2618 /*
2619  * End a transaction.  If the transaction is small enough, and we're not forcing
2620  * a write to disk, the "active" transaction becomes the "current" transaction,
2621  * and will be reused for the next transaction that is started (group commit).
2622  *
2623  * If the transaction gets written to disk (because force_it is true, or no
2624  * group commit, or the transaction is sufficiently full), the blocks get
2625  * written into the journal first, then the are written asynchronously.  When
2626  * those async writes complete, the transaction can be freed and removed from
2627  * the journal.
2628  *
2629  * An optional callback can be supplied.  If given, it is called after the
2630  * the blocks have been written to the journal, but before the async writes
2631  * of those blocks to their normal on-disk locations.  This is used by
2632  * journal_relocate so that the location of the journal can be changed and
2633  * flushed to disk before the blocks get written to their normal locations.
2634  * Note that the callback is only called if the transaction gets written to
2635  * the journal during this end_transaction call; you probably want to set the
2636  * force_it flag.
2637  *
2638  * Inputs:
2639  *      tr                       Transaction to add to the journal
2640  *      force_it         If true, force this transaction to the on-disk journal immediately.
2641  *      callback         See description above.  Pass NULL for no callback.
2642  *      callback_arg Argument passed to callback routine.
2643  *
2644  * Result
2645  *               0              No errors
2646  *              -1              An error occurred.  The journal is marked invalid.
2647  */
2648 static int
2649 end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg)
2650 {
2651     int                 i, ret, amt;
2652     errno_t             errno;
2653     off_t               end;
2654     journal            *jnl = tr->jnl;
2655     struct buf         *bp, **bparray;
2656     block_list_header  *blhdr=NULL, *next=NULL;
2657     size_t              tbuffer_offset;
2658
2659         if (jnl->cur_tr) {
2660                 panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
2661                           jnl, jnl->cur_tr, tr);
2662         }
2663
2664     // if there weren't any modified blocks in the transaction
2665     // just save off the transaction pointer and return.
2666     if (tr->total_bytes == jnl->jhdr->blhdr_size) {
2667                 jnl->cur_tr = tr;
2668                 return 0;
2669     }
2670
2671     // if our transaction buffer isn't very full, just hang
2672     // on to it and don't actually flush anything.  this is
2673     // what is known as "group commit".  we will flush the
2674     // transaction buffer if it's full or if we have more than
2675     // one of them so we don't start hogging too much memory.
2676     //
2677     if (   force_it == 0
2678                    && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
2679                    && tr->num_blhdrs < 3
2680                    && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) {
2681
2682                 jnl->cur_tr = tr;
2683                 return 0;
2684     }
2685
2686
2687     // if we're here we're going to flush the transaction buffer to disk.
2688     // make sure there is room in the journal first.
2689     check_free_space(jnl, tr->total_bytes);
2690
2691     // range check the end index
2692     if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
2693                 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
2694                           jnl->jhdr->end, jnl->jhdr->size);
2695     }
2696
2697     // this transaction starts where the current journal ends
2698     tr->journal_start = jnl->jhdr->end;
2699     end               = jnl->jhdr->end;
2700
2701         //
2702         // if the first entry in old_start[] isn't free yet, loop calling the
2703         // file system flush routine until it is (or we panic).
2704         //
2705         i = 0;
2706         lock_oldstart(jnl);
2707         while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
2708                 if (jnl->flush) {
2709                         unlock_oldstart(jnl);
2710
2711                         if (jnl->flush) {
2712                                 jnl->flush(jnl->flush_arg);
2713                         }
2714
2715                         // yield the cpu so others can get in to clear the lock bit
2716                         (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);
2717
2718                         lock_oldstart(jnl);
2719                 }
2720                 if (i++ >= 500) {
2721                         panic("jnl: transaction that started at 0x%llx is not completing! jnl %p\n",
2722                                   jnl->old_start[0] & (~0x8000000000000000LL), jnl);
2723                 }
2724         }
2725
2726         //
2727         // slide everyone else down and put our latest guy in the last
2728         // entry in the old_start array
2729         //
2730         memcpy(&jnl->old_start[0], &jnl->old_start[1], sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
2731         jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
2732
2733         unlock_oldstart(jnl);
2734
2735
2736     // for each block, make sure that the physical block # is set
2737     for(blhdr=tr->blhdr; blhdr; blhdr=next) {
2738                 char *blkptr;
2739
2740                 tbuffer_offset = jnl->jhdr->blhdr_size;
2741                 for(i=1; i < blhdr->num_blocks; i++) {
2742                         daddr64_t blkno;
2743                         daddr64_t lblkno;
2744                         struct vnode *vp;
2745
2746                         bp = blhdr->binfo[i].b.bp;
2747
2748                         // if this block has a callback function set, call
2749                         // it now and then copy the data from the bp into
2750                         // the journal.
2751                         if (bp) {
2752                                 void (*func)(struct buf *, void *);
2753                                 void  *arg;
2754
2755                                 buf_setfilter(bp, NULL, NULL, (void **)&func, &arg);
2756
2757                                 if (func) {
2758                                         // acquire the bp here so that we can safely
2759                                         // mess around with its data.  buf_acquire()
2760                                         // will return EAGAIN if the buffer was busy,
2761                                         // so loop trying again.
2762                                         do {
2763                                                 errno = buf_acquire(bp, 0, 0, 0);
2764                                         } while (errno == EAGAIN);
2765
2766                                         if (errno == 0) {
2767
2768                                                 // call the hook function and then copy the
2769                                                 // data into the transaction buffer...
2770                                                 func(bp, arg);
2771
2772                                                 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
2773                                                 memcpy(blkptr, (char *)buf_dataptr(bp), buf_size(bp));
2774
2775                                                 buf_drop(bp);
2776                                         } else {
2777                                                 panic("could not acquire bp %p (err %d)\n", bp, errno);
2778                                         }
2779                                 }
2780
2781                         } else {   // bp == NULL, only true if a block was "killed"
2782                                 if (blhdr->binfo[i].bnum != (off_t)-1) {
2783                                         panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
2784                                                 blhdr->binfo[i].bnum, jnl, tr);
2785                                 }
2786
2787                                 tbuffer_offset += blhdr->binfo[i].bsize;
2788                                 continue;
2789                         }
2790
2791                         tbuffer_offset += blhdr->binfo[i].bsize;
2792
2793                         vp = buf_vnode(bp);
2794                         blkno = buf_blkno(bp);
2795                         lblkno = buf_lblkno(bp);
2796
2797                         if (vp == NULL && lblkno == blkno) {
2798                             printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd.  aborting the transaction (tr %p jnl %p).\n",
2799                                 jnl->jdev_name, bp, lblkno, blkno, tr, jnl);
2800                             goto bad_journal;
2801                         }
2802
2803                         // if the lblkno is the same as blkno and this bp isn't
2804                         // associated with the underlying file system device then
2805                         // we need to call bmap() to get the actual physical block.
2806                         //
2807                         if ((lblkno == blkno) && (vp != jnl->fsdev)) {
2808                                 off_t   f_offset;
2809                                 size_t  contig_bytes;
2810
2811                                 if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) {
2812                                         printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2813                                         goto bad_journal;
2814                                 }
2815                                 if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) {
2816                                         printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2817                                         goto bad_journal;
2818                                 }
2819                                 if ((uint32_t)contig_bytes < buf_count(bp)) {
2820                                         printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2821                                         goto bad_journal;
2822                                 }
2823                                 buf_setblkno(bp, blkno);
2824                         }
2825                         // update this so we write out the correct physical block number!
2826                         blhdr->binfo[i].bnum = (off_t)(blkno);
2827                 }
2828
2829                 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2830     }
2831
2832
2833
2834     for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2835                 amt = blhdr->bytes_used;
2836
2837                 blhdr->binfo[0].b.sequence_num = tr->sequence_num;
2838
2839                 blhdr->checksum = 0;
2840                 blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
2841
2842                 if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, tr->blhdr->num_blocks * sizeof(struct buf *))) {
2843                     panic("can't allocate %lu bytes for bparray\n", tr->blhdr->num_blocks * sizeof(struct buf *));
2844                 }
2845
2846                 // calculate individual block checksums
2847                 tbuffer_offset = jnl->jhdr->blhdr_size;
2848                 for(i=1; i < blhdr->num_blocks; i++) {
2849                     bparray[i] = blhdr->binfo[i].b.bp;
2850                     if (bparray[i]) {
2851                         blhdr->binfo[i].b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], blhdr->binfo[i].bsize);
2852                     } else {
2853                         blhdr->binfo[i].b.cksum = 0;
2854                     }
2855
2856                     tbuffer_offset += blhdr->binfo[i].bsize;
2857                 }
2858
2859                 ret = write_journal_data(jnl, &end, blhdr, amt);
2860
2861                 // always put the bp pointers back
2862                 for(i=1; i < blhdr->num_blocks; i++) {
2863                     blhdr->binfo[i].b.bp = bparray[i];
2864                 }
2865
2866                 kmem_free(kernel_map, (vm_offset_t)bparray, tr->blhdr->num_blocks * sizeof(struct buf *));
2867
2868                 if (ret != amt) {
2869                         printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n",
2870                             jnl->jdev_name, ret, amt);
2871
2872                         goto bad_journal;
2873                 }
2874     }
2875
2876     jnl->jhdr->end  = end;    // update where the journal now ends
2877     tr->journal_end = end;    // the transaction ends here too
2878     if (tr->journal_start == 0 || tr->journal_end == 0) {
2879                 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
2880                           tr->journal_start, tr->journal_end);
2881     }
2882
2883     if (write_journal_header(jnl) != 0) {
2884                 goto bad_journal;
2885     }
2886
2887         /*
2888          * If the caller supplied a callback, call it now that the blocks have been
2889          * written to the journal.  This is used by journal_relocate so, for example,
2890          * the file system can change its pointer to the new journal.
2891          */
2892         if (callback != NULL && callback(callback_arg) != 0) {
2893                 goto bad_journal;
2894         }
2895
2896     //
2897     // setup for looping through all the blhdr's.  we null out the
2898     // tbuffer and blhdr fields so that they're not used any more.
2899     //
2900     blhdr       = tr->blhdr;
2901     tr->tbuffer = NULL;
2902     tr->blhdr   = NULL;
2903
2904     // the buffer_flushed_callback will only be called for the
2905     // real blocks that get flushed so we have to account for
2906     // the block_list_headers here.
2907     //
2908     tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
2909
2910     // for each block, set the iodone callback and unlock it
2911     for(; blhdr; blhdr=next) {
2912
2913                 // we can re-order the buf ptrs because everything is written out already
2914                 qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp);
2915
2916                 for(i=1; i < blhdr->num_blocks; i++) {
2917                         if (blhdr->binfo[i].b.bp == NULL) {
2918                                 continue;
2919                         }
2920
2921                         bp = blhdr->binfo[i].b.bp;
2922
2923                         // have to pass BAC_REMOVE here because we're going to bawrite()
2924                         // the buffer when we're done
2925                         do {
2926                                 errno = buf_acquire(bp, BAC_REMOVE, 0, 0);
2927                         } while (errno == EAGAIN);
2928
2929                         if (errno == 0) {
2930                                 struct vnode *save_vp;
2931                                 void *cur_filter;
2932
2933                                 if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
2934                                         if (jnl->flags & JOURNAL_CLOSE_PENDING) {
2935                                             buf_clearflags(bp, B_LOCKED);
2936                                             buf_brelse(bp);
2937                                                 continue;
2938                                         } else {
2939                                                 panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp));
2940                                         }
2941                                 }
2942                                 save_vp = buf_vnode(bp);
2943
2944                                 buf_setfilter(bp, buffer_flushed_callback, tr, &cur_filter, NULL);
2945
2946                                 if (cur_filter) {
2947                                         panic("jnl: bp @ %p (blkno %qd, vp %p) has non-null iodone (%p) buffflushcb %p\n",
2948                                                   bp, buf_blkno(bp), save_vp, cur_filter, buffer_flushed_callback);
2949                                 }
2950                                 buf_clearflags(bp, B_LOCKED);
2951
2952                                 // kicking off the write here helps performance
2953                                 buf_bawrite(bp);
2954                                 // XXXdbg this is good for testing: buf_bdwrite(bp);
2955                                 //buf_bdwrite(bp);
2956
2957                                 // this undoes the vnode_ref() in journal_modify_block_end()
2958                                 vnode_rele_ext(save_vp, 0, 1);
2959                         } else {
2960                                 printf("jnl: %s: end_transaction: could not acquire block %p (errno %d)!\n",
2961                                     jnl->jdev_name,bp, errno);
2962                         }
2963                 }
2964
2965                 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2966
2967                 // we can free blhdr here since we won't need it any more
2968                 blhdr->binfo[0].bnum = 0xdeadc0de;
2969                 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
2970     }
2971
2972     //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
2973     //   tr, tr->journal_start, tr->journal_end);
2974     return 0;
2975
2976
2977   bad_journal:
2978     jnl->flags |= JOURNAL_INVALID;
2979     jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
2980     abort_transaction(jnl, tr);
2981     return -1;
2982 }
2983
2984 static void
2985 abort_transaction(journal *jnl, transaction *tr)
2986 {
2987     int                i;
2988     errno_t             errno;
2989     block_list_header *blhdr, *next;
2990     struct buf        *bp;
2991     struct vnode      *save_vp;
2992
2993     // for each block list header, iterate over the blocks then
2994     // free up the memory associated with the block list.
2995     //
2996     // for each block, clear the lock bit and release it.
2997     //
2998     for(blhdr=tr->blhdr; blhdr; blhdr=next) {
2999
3000                 for(i=1; i < blhdr->num_blocks; i++) {
3001                         if (blhdr->binfo[i].b.bp == NULL) {
3002                                 continue;
3003                         }
3004                         if ( (buf_vnode(blhdr->binfo[i].b.bp) == NULL) ||
3005                              !(buf_flags(blhdr->binfo[i].b.bp) & B_LOCKED) ) {
3006                                 continue;
3007                         }
3008
3009                         errno = buf_meta_bread(buf_vnode(blhdr->binfo[i].b.bp),
3010                                                          buf_lblkno(blhdr->binfo[i].b.bp),
3011                                                          buf_size(blhdr->binfo[i].b.bp),
3012                                                          NOCRED,
3013                                                          &bp);
3014                         if (errno == 0) {
3015                                 if (bp != blhdr->binfo[i].b.bp) {
3016                                         panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n",
3017                                                   bp, blhdr->binfo[i].b.bp, jnl);
3018                                 }
3019
3020                                 // releasing a bp marked invalid
3021                                 // also clears the locked and delayed state
3022                                 buf_markinvalid(bp);
3023                                 save_vp = buf_vnode(bp);
3024
3025                                 buf_brelse(bp);
3026
3027                                 vnode_rele_ext(save_vp, 0, 1);
3028                         } else {
3029                                 printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n",
3030                                     jnl->jdev_name, blhdr->binfo[i].bnum, blhdr->binfo[i].b.bp);
3031                                 if (bp) {
3032                                         buf_brelse(bp);
3033                                 }
3034                         }
3035                 }
3036
3037                 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
3038
3039                 // we can free blhdr here since we won't need it any more
3040                 blhdr->binfo[0].bnum = 0xdeadc0de;
3041                 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
3042     }
3043
3044     tr->tbuffer     = NULL;
3045     tr->blhdr       = NULL;
3046     tr->total_bytes = 0xdbadc0de;
3047         FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
3048 }
3049
3050
3051 int
3052 journal_end_transaction(journal *jnl)
3053 {
3054     int ret;
3055         transaction *tr;
3056
3057     CHECK_JOURNAL(jnl);
3058
3059         if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
3060                 return 0;
3061         }
3062
3063     if (jnl->owner != current_thread()) {
3064                 panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
3065                           jnl, jnl->owner, current_thread());
3066     }
3067
3068     free_old_stuff(jnl);
3069
3070     jnl->nested_count--;
3071     if (jnl->nested_count > 0) {
3072                 return 0;
3073     } else if (jnl->nested_count < 0) {
3074                 panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
3075     }
3076
3077     if (jnl->flags & JOURNAL_INVALID) {
3078                 if (jnl->active_tr) {
3079                         if (jnl->cur_tr != NULL) {
3080                                 panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
3081                                           jnl, jnl->active_tr, jnl->cur_tr);
3082                         }
3083
3084                         tr             = jnl->active_tr;
3085                         jnl->active_tr = NULL;
3086                         abort_transaction(jnl, tr);
3087                 }
3088
3089                 jnl->owner = NULL;
3090                 unlock_journal(jnl);
3091
3092                 return EINVAL;
3093     }
3094
3095     tr = jnl->active_tr;
3096     CHECK_TRANSACTION(tr);
3097
3098     // clear this out here so that when check_free_space() calls
3099     // the FS flush function, we don't panic in journal_flush()
3100     // if the FS were to call that.  note: check_free_space() is
3101     // called from end_transaction().
3102     //
3103     jnl->active_tr = NULL;
3104     ret = end_transaction(tr, 0, NULL, NULL);
3105
3106     jnl->owner = NULL;
3107     unlock_journal(jnl);
3108
3109     return ret;
3110 }
3111
3112
3113 int
3114 journal_flush(journal *jnl)
3115 {
3116     int need_signal = 0;
3117
3118     CHECK_JOURNAL(jnl);
3119
3120     if (jnl->flags & JOURNAL_INVALID) {
3121                 return -1;
3122     }
3123
3124     KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH))
3125         | DBG_FUNC_START, 0, 0, 0, 0, 0);
3126
3127     if (jnl->owner != current_thread()) {
3128                 lock_journal(jnl);
3129                 need_signal = 1;
3130     }
3131
3132     free_old_stuff(jnl);
3133
3134     // if we're not active, flush any buffered transactions
3135     if (jnl->active_tr == NULL && jnl->cur_tr) {
3136                 transaction *tr = jnl->cur_tr;
3137
3138                 jnl->cur_tr = NULL;
3139                 end_transaction(tr, 1, NULL, NULL);   // force it to get flushed
3140     }
3141
3142     if (need_signal) {
3143                 unlock_journal(jnl);
3144     }
3145
3146     KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH))
3147         | DBG_FUNC_END, 0, 0, 0, 0, 0);
3148
3149     return 0;
3150 }
3151
3152 int
3153 journal_active(journal *jnl)
3154 {
3155     if (jnl->flags & JOURNAL_INVALID) {
3156                 return -1;
3157     }
3158
3159     return (jnl->active_tr == NULL) ? 0 : 1;
3160 }
3161
3162 void *
3163 journal_owner(journal *jnl)
3164 {
3165     return jnl->owner;
3166 }
3167
3168 int journal_uses_fua(journal *jnl)
3169 {
3170         if (jnl->flags & JOURNAL_DO_FUA_WRITES)
3171                 return 1;
3172         return 0;
3173 }
3174
3175 /*
3176  * Relocate the journal.
3177  *
3178  * You provide the new starting offset and size for the journal. You may
3179  * optionally provide a new tbuffer_size; passing zero defaults to not
3180  * changing the tbuffer size except as needed to fit within the new journal
3181  * size.
3182  *
3183  * You must have already started a transaction. The transaction may contain
3184  * modified blocks (such as those needed to deallocate the old journal,
3185  * allocate the new journal, and update the location and size of the journal
3186  * in filesystem-private structures). Any transactions prior to the active
3187  * transaction will be flushed to the old journal. The new journal will be
3188  * initialized, and the blocks from the active transaction will be written to
3189  * the new journal.
3190  *
3191  * The caller will need to update the structures that identify the location
3192  * and size of the journal.  These updates should be made in the supplied
3193  * callback routine.  These updates must NOT go into a transaction.  You should
3194  * force these updates to the media before returning from the callback.  In the
3195  * even of a crash, either the old journal will be found, with an empty journal,
3196  * or the new journal will be found with the contents of the active transaction.
3197  *
3198  * Upon return from the callback, the blocks from the active transaction are
3199  * written to their normal locations on disk.
3200  *
3201  * (Remember that we have to ensure that blocks get committed to the journal
3202  * before being committed to their normal locations.  But the blocks don't count
3203  * as committed until the new journal is pointed at.)
3204  *
3205  * Upon return, there is still an active transaction: newly allocated, and
3206  * with no modified blocks.  Call journal_end_transaction as normal.  You may
3207  * modifiy additional blocks before calling journal_end_transaction, and those
3208  * blocks will (eventually) go to the relocated journal.
3209  *
3210  * Inputs:
3211  *      jnl                             The (opened) journal to relocate.
3212  *      offset                  The new journal byte offset (from start of the journal device).
3213  *      journal_size    The size, in bytes, of the new journal.
3214  *      tbuffer_size    The new desired transaction buffer size.  Pass zero to keep
3215  *                                      the same size as the current journal.  The size will be
3216  *                                      modified as needed to fit the new journal.
3217  *      callback                Routine called after the new journal has been initialized,
3218  *                                      and the active transaction written to the new journal, but
3219  *                                      before the blocks are written to their normal locations.
3220  *                                      Pass NULL for no callback.
3221  *      callback_arg    An argument passed to the callback routine.
3222  *
3223  * Result:
3224  *      0                               No errors
3225  *      EINVAL                  The offset is not block aligned
3226  *      EINVAL                  The journal_size is not a multiple of the block size
3227  *      EINVAL                  The journal is invalid
3228  *      (any)                   An error returned by journal_flush.
3229  *
3230  */
3231 int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size,
3232         errno_t (*callback)(void *), void *callback_arg)
3233 {
3234         int ret;
3235         transaction *tr;
3236
3237         /*
3238          * Sanity check inputs, and adjust the size of the transaction buffer.
3239          */
3240     if ((offset % jnl->jhdr->jhdr_size) != 0) {
3241                 printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n",
3242                     jnl->jdev_name, offset, jnl->jhdr->jhdr_size);
3243                 return EINVAL;
3244     }
3245     if ((journal_size % jnl->jhdr->jhdr_size) != 0) {
3246                 printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n",
3247                     jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size);
3248                 return EINVAL;
3249     }
3250
3251     CHECK_JOURNAL(jnl);
3252
3253         /* Guarantee we own the active transaction. */
3254     if (jnl->flags & JOURNAL_INVALID) {
3255                 return EINVAL;
3256     }
3257     if (jnl->owner != current_thread()) {
3258         panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n",
3259                 jnl, jnl->owner, current_thread());
3260         }
3261
3262     if (tbuffer_size == 0)
3263         tbuffer_size = jnl->tbuffer_size;
3264     size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size);
3265
3266         /*
3267          * Flush any non-active transactions.  We have to temporarily hide the
3268          * active transaction to make journal_flush flush out non-active but
3269          * current (unwritten) transactions.
3270          */
3271         tr = jnl->active_tr;
3272         CHECK_TRANSACTION(tr);
3273         jnl->active_tr = NULL;
3274         ret = journal_flush(jnl);
3275         jnl->active_tr = tr;
3276         if (ret) {
3277                 return ret;
3278         }
3279
3280         /* Update the journal's offset and size in memory. */
3281         jnl->jdev_offset = offset;
3282         jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
3283         jnl->jhdr->size = journal_size;
3284         jnl->active_start = jnl->jhdr->start;
3285
3286         /*
3287          * Force the active transaction to be written to the new journal.  Call the
3288          * supplied callback after the blocks have been written to the journal, but
3289          * before they get written to their normal on-disk locations.
3290          */
3291         jnl->active_tr = NULL;
3292         ret = end_transaction(tr, 1, callback, callback_arg);
3293         if (ret) {
3294                 printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret);
3295                 goto bad_journal;
3296         }
3297
3298         /*
3299          * Create a new, empty transaction to be the active transaction.  This way
3300          * our caller can use journal_end_transaction as usual.
3301          */
3302         ret = journal_allocate_transaction(jnl);
3303         if (ret) {
3304                 printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl->jdev_name, ret);
3305                 goto bad_journal;
3306         }
3307
3308         return 0;
3309
3310 bad_journal:
3311     jnl->flags |= JOURNAL_INVALID;
3312     abort_transaction(jnl, tr);
3313     return ret;
3314 }