bsd/vfs/vfs_journal.c

   1 /*
   2  * Copyright (c) 1995-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 //
  29 // This file implements a simple write-ahead journaling layer.
  30 // In theory any file system can make use of it by calling these
  31 // functions when the fs wants to modify meta-data blocks.  See
  32 // vfs_journal.h for a more detailed description of the api and
  33 // data structures.
  34 //
  35 // Dominic Giampaolo (dbg@apple.com)
  36 //
  37
  38 #ifdef KERNEL
  39
  40 #include <sys/param.h>
  41 #include <sys/systm.h>
  42 #include <sys/kernel.h>
  43 #include <sys/file_internal.h>
  44 #include <sys/stat.h>
  45 #include <sys/buf_internal.h>
  46 #include <sys/proc_internal.h>
  47 #include <sys/mount_internal.h>
  48 #include <sys/namei.h>
  49 #include <sys/vnode_internal.h>
  50 #include <sys/ioctl.h>
  51 #include <sys/tty.h>
  52 #include <sys/ubc.h>
  53 #include <sys/malloc.h>
  54 #include <kern/thread.h>
  55 #include <sys/disk.h>
  56 #include <sys/kdebug.h>
  57 #include <miscfs/specfs/specdev.h>
  58 #include <libkern/OSAtomic.h>   /* OSAddAtomic */
  59
  60 extern task_t kernel_task;
  61
  62 #define DBG_JOURNAL_FLUSH 1
  63
  64 #else
  65
  66 #include <stdio.h>
  67 #include <stdlib.h>
  68 #include <string.h>
  69 #include <limits.h>
  70 #include <errno.h>
  71 #include <fcntl.h>
  72 #include <unistd.h>
  73 #include <stdarg.h>
  74 #include <sys/types.h>
  75 #include "compat.h"
  76
  77 #endif   /* KERNEL */
  78
  79 #include "vfs_journal.h"
  80
  81 /* XXX the next prototype should come from <libsa/stdlib.h>, but that header conflicts with libkern */
  82 __private_extern__ void qsort(
  83     void * array,
  84     size_t nmembers,
  85     size_t member_size,
  86     int (*)(const void *, const void *));
  87
  88
  89
  90 // number of bytes to checksum in a block_list_header
  91 // NOTE: this should be enough to clear out the header
  92 //       fields as well as the first entry of binfo[]
  93 #define BLHDR_CHECKSUM_SIZE 32
  94
  95
  96 static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg);
  97 static void abort_transaction(journal *jnl, transaction *tr);
  98 static void dump_journal(journal *jnl);
  99
 100 static __inline__ void  lock_journal(journal *jnl);
 101 static __inline__ void  unlock_journal(journal *jnl);
 102 static __inline__ void  lock_oldstart(journal *jnl);
 103 static __inline__ void  unlock_oldstart(journal *jnl);
 104
 105
 106
 107
 108 //
 109 // 3105942 - Coalesce writes to the same block on journal replay
 110 //
 111
 112 typedef struct bucket {
         // One slot of the table used to coalesce writes to the same block
         // on journal replay.  lookup_bucket() binary-searches the table on
         // block_num and insert_block() fills in every field of a slot.
 113     off_t   block_num;      // block number; grow_table() sets it to -1 in the slots it adds
 114     size_t  jnl_offset;     // offset of the data; insert_block() wraps values >= jhdr->size back past the journal header
 115     size_t  block_size;     // size of the entry; do_overlap() adds it to block_num*jhdr_size to find where the entry ends
 116     int32_t cksum;          // checksum supplied by the caller; do_overlap() zeroes it when it truncates an entry
 117 } bucket;
 118
 119 #define STARTING_BUCKETS 256   // presumably the initial table capacity -- its use is not visible in this part of the file; confirm
 120
 121 static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
 122 static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size);
 123 static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full);
 124 static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
 125 static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting);
 126
 127 #define CHECK_JOURNAL(jnl) \
 128     do { \
 129     if (jnl == NULL) {\
 130         panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
 131     }\
 132     if (jnl->jdev == NULL) { \
 133         panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
 134     } \
 135     if (jnl->fsdev == NULL) { \
 136         panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
 137     } \
 138     if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
 139         panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
 140         __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\
 141     }\
 142     if (   jnl->jhdr->start <= 0 \
 143         || jnl->jhdr->start > jnl->jhdr->size\
 144         || jnl->jhdr->start > 1024*1024*1024) {\
 145         panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
 146         __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\
 147     }\
 148     if (   jnl->jhdr->end <= 0 \
 149         || jnl->jhdr->end > jnl->jhdr->size\
 150         || jnl->jhdr->end > 1024*1024*1024) {\
 151         panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
 152         __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\
 153     }\
 154     if (jnl->jhdr->size > 1024*1024*1024) {\
 155         panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
 156         __FILE__, __LINE__, jnl->jhdr->size);\
 157     } \
 158     } while(0)
 159
 160 #define CHECK_TRANSACTION(tr) \
 161     do {\
 162     if (tr == NULL) {\
 163         panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
 164     }\
 165     if (tr->jnl == NULL) {\
 166         panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
 167     }\
 168     if (tr->blhdr != (block_list_header *)tr->tbuffer) {\
 169         panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\
 170     }\
 171     if (tr->total_bytes < 0) {\
 172         panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\
 173     }\
 174     if (tr->journal_start < 0 || tr->journal_start > 1024*1024*1024) {\
 175         panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\
 176     }\
 177     if (tr->journal_end < 0 || tr->journal_end > 1024*1024*1024) {\
 178         panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\
 179     }\
 180     if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) {\
 181         panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\
 182     }\
 183     } while(0)
 184
 185
 186
 187 //
 188 // this isn't a great checksum routine but it will do for now.
 189 // we use it to checksum the journal header and the block list
 190 // headers that are at the start of each transaction.
 191 //
 192 static int
 193 calc_checksum(char *ptr, int len)
 194 {
 195     int i, cksum=0;
 196
 197     // this is a lame checksum but for now it'll do
 198     for(i=0; i < len; i++, ptr++) {
 199                 cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
 200     }
 201
 202     return (~cksum);
 203 }
 204
 205 //
 206 // Journal Locking
 207 //
 208 lck_grp_attr_t *  jnl_group_attr;    // lock group attributes; allocated by journal_init()
 209 lck_attr_t *      jnl_lock_attr;     // lock attributes; allocated by journal_init()
 210 lck_grp_t *       jnl_mutex_group;   // the "jnl-mutex" lock group; allocated by journal_init()
 211
 212 void
 213 journal_init(void)
 214 {
             // Allocate the file-scope lock attribute, lock group attribute and
             // "jnl-mutex" lock group objects.  The return values are stored
             // without being checked.
 215         jnl_lock_attr    = lck_attr_alloc_init();
 216         jnl_group_attr   = lck_grp_attr_alloc_init();
 217         jnl_mutex_group  = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
 218 }
 219
 220 static __inline__ void
 221 lock_journal(journal *jnl)
 222 {
             // take the journal's jlock mutex
 223         lck_mtx_lock(&jnl->jlock);
 224 }
 225
 226 static __inline__ void
 227 unlock_journal(journal *jnl)
 228 {
             // release the journal's jlock mutex
 229         lck_mtx_unlock(&jnl->jlock);
 230 }
 231
 232 static __inline__ void
 233 lock_oldstart(journal *jnl)
 234 {
             // take old_start_lock; free_old_stuff() and buffer_flushed_callback()
             // hold it while touching old_start[], completed_trs and tr_freeme
 235         lck_mtx_lock(&jnl->old_start_lock);
 236 }
 237
 238 static __inline__ void
 239 unlock_oldstart(journal *jnl)
 240 {
             // release old_start_lock (see lock_oldstart())
 241         lck_mtx_unlock(&jnl->old_start_lock);
 242 }
 243
 244
 245
 246 #define JNL_WRITE    0x0001   // do_journal_io(): write; each request is limited to jnl->max_write_size
 247 #define JNL_READ     0x0002   // do_journal_io(): read (sets B_READ); each request is limited to jnl->max_read_size
 248 #define JNL_HEADER   0x8000   // do_journal_io(): must be set for i/o at offset 0 (the journal header), otherwise it panics
 249
 250 //
 251 // This function sets up a fake buf and passes it directly to the
 252 // journal device strategy routine (so that it won't get cached in
 253 // the block cache.
 254 //
 255 // It also handles range checking the i/o so that we don't write
 256 // outside the journal boundaries and it will wrap the i/o back
 257 // to the beginning if necessary (skipping over the journal header)
 258 //
 259 static size_t
 260 do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
 261 {
 262     int         err, curlen=len;
 263     size_t      io_sz = 0;
 264     buf_t       bp;
 265     off_t       max_iosize;
 266
 267     if (*offset < 0 || *offset > jnl->jhdr->size) {
 268                 panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
 269     }
 270
 271     if (direction & JNL_WRITE)
 272         max_iosize = jnl->max_write_size;
 273     else if (direction & JNL_READ)
 274         max_iosize = jnl->max_read_size;
 275     else
 276         max_iosize = 128 * 1024;
 277
 278   again:
 279     bp = alloc_io_buf(jnl->jdev, 1);
 280
 281     if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
 282                 if (*offset == jnl->jhdr->size) {
 283                         *offset = jnl->jhdr->jhdr_size;
 284                 } else {
 285                         curlen = (off_t)jnl->jhdr->size - *offset;
 286                 }
 287     }
 288
 289         if (curlen > max_iosize) {
 290                 curlen = max_iosize;
 291         }
 292
 293     if (curlen <= 0) {
 294                 panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %lu\n", curlen, *offset, len);
 295     }
 296
 297         if (*offset == 0 && (direction & JNL_HEADER) == 0) {
 298                 panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
 299         }
 300
 301     if (direction & JNL_READ)
 302             buf_setflags(bp, B_READ);
 303     else {
 304             /*
 305              * don't have to set any flags
 306              */
 307             vnode_startwrite(jnl->jdev);
 308     }
 309     buf_setsize(bp, curlen);
 310     buf_setcount(bp, curlen);
 311     buf_setdataptr(bp, (uintptr_t)data);
 312     buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
 313     buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
 314     if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) {
 315         buf_markfua(bp);
 316     }
 317
 318     err = VNOP_STRATEGY(bp);
 319     if (!err) {
 320                 err = (int)buf_biowait(bp);
 321     }
 322     free_io_buf(bp);
 323
 324     if (err) {
 325         printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err);
 326         return 0;
 327     }
 328
 329     *offset += curlen;
 330     io_sz   += curlen;
 331     if (io_sz != len) {
 332                 // handle wrap-around
 333                 data    = (char *)data + curlen;
 334                 curlen  = len - io_sz;
 335                 if (*offset >= jnl->jhdr->size) {
 336                         *offset = jnl->jhdr->jhdr_size;
 337                 }
 338                 goto again;
 339     }
 340
 341     return io_sz;
 342 }
 343
 344 static size_t
 345 read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
 346 {
         // read len bytes from the journal at *offset into data; the result
         // is whatever do_journal_io() returns (len on success, 0 on i/o error)
 347     return do_journal_io(jnl, offset, data, len, JNL_READ);
 348 }
 349
 350 static size_t
 351 write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
 352 {
         // write len bytes from data to the journal at *offset; the result
         // is whatever do_journal_io() returns (len on success, 0 on i/o error)
 353     return do_journal_io(jnl, offset, data, len, JNL_WRITE);
 354 }
 355
 356
 357 static size_t
 358 read_journal_header(journal *jnl, void *data, size_t len)
 359 {
             // the header lives at journal offset 0; do_journal_io() only
             // accepts i/o at that offset when JNL_HEADER is set
 360         off_t hdr_offset = 0;
 361
 362         return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER);
 363 }
 364
 365 static int
 366 write_journal_header(journal *jnl)
 367 {
 368     static int num_err_prints = 0;
 369     int ret=0;
 370     off_t jhdr_offset = 0;
 371     struct vfs_context context;
 372
 373     context.vc_thread = current_thread();
 374     context.vc_ucred = NOCRED;
 375     //
 376     // Flush the track cache if we're not doing force-unit-access
 377     // writes.
 378     //
 379     if ((jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
 380         ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
 381     }
 382     if (ret != 0) {
 383         //
 384         // Only print this error if it's a different error than the
 385         // previous one, or if it's the first time for this device
 386         // or if the total number of printfs is less than 25.  We
 387         // allow for up to 25 printfs to insure that some make it
 388         // into the on-disk syslog.  Otherwise if we only printed
 389         // one, it's possible it would never make it to the syslog
 390         // for the root volume and that makes debugging hard.
 391         //
 392         if (   ret != jnl->last_flush_err
 393             || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
 394             || num_err_prints++ < 25) {
 395
 396             printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret);
 397
 398             jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
 399             jnl->last_flush_err = ret;
 400         }
 401     }
 402
 403     jnl->jhdr->checksum = 0;
 404     jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
 405     if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) {
 406         printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name);
 407         jnl->flags |= JOURNAL_INVALID;
 408         return -1;
 409     }
 410
 411     // If we're not doing force-unit-access writes, then we
 412     // have to flush after writing the journal header so that
 413     // a future transaction doesn't sneak out to disk before
 414     // the header does and thus overwrite data that the old
 415     // journal header refers to.  Saw this exact case happen
 416     // on an IDE bus analyzer with Larry Barras so while it
 417     // may seem obscure, it's not.
 418     //
 419     if ((jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
 420         VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
 421     }
 422
 423     return 0;
 424 }
 425
 426
 427
 428 //
 429 // this is a work function used to free up transactions that
 430 // completed. they can't be free'd from buffer_flushed_callback
 431 // because it is called from deep with the disk driver stack
 432 // and thus can't do something that would potentially cause
 433 // paging.  it gets called by each of the journal api entry
 434 // points so stuff shouldn't hang around for too long.
 435 //
 436 static void
 437 free_old_stuff(journal *jnl)
 438 {
 439     transaction *tr, *next;
 440
 441     lock_oldstart(jnl);
 442     tr = jnl->tr_freeme;
 443     jnl->tr_freeme = NULL;
 444     unlock_oldstart(jnl);
 445
 446     for(; tr; tr=next) {
 447         next = tr->next;
 448         FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
 449     }
 450
 451 }
 452
 453
 454
 455 //
 456 // This is our callback that lets us know when a buffer has been
 457 // flushed to disk.  It's called from deep within the driver stack
 458 // and thus is quite limited in what it can do.  Notably, it can
 459 // not initiate any new i/o's or allocate/free memory.
 460 //
 461 static void
 462 buffer_flushed_callback(struct buf *bp, void *arg)
 463 {
         // bp  - the buffer that was flushed; only its size is used here
         // arg - the transaction the buffer belongs to (NULL: nothing to do)
         //
         // Adds the buffer's size to tr->num_flushed.  Once num_killed plus
         // the flushed bytes reach tr->total_bytes the transaction is
         // complete: its old_start[] entry is marked completed, active_start
         // is advanced where possible, and the transaction is merged into the
         // completed_trs list or parked on tr_freeme for free_old_stuff().
 464     transaction  *tr;
 465     journal      *jnl;
 466     transaction  *ctr, *prev=NULL, *next;
 467     size_t        i;
 468     int           bufsize, amt_flushed, total_bytes;
 469
 470
 471     //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n",
 472     //     bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg);
 473
 474     // snarf out the bits we want
 475     bufsize = buf_size(bp);
 476     tr      = (transaction *)arg;
 477
 478     // then we've already seen it
 479     if (tr == NULL) {
 480                 return;
 481     }
 482
         // NOTE(review): CHECK_TRANSACTION panics when total_bytes < 0, and
         // the 0xfbadc0de marker stored further down is negative if
         // total_bytes is an int -- so a late callback for an already
         // completed tr would presumably panic here instead of reaching the
         // "someone beat us to it" test below.  Confirm.
 483     CHECK_TRANSACTION(tr);
 484
 485     jnl = tr->jnl;
 486     if (jnl->flags & JOURNAL_INVALID) {
 487                 return;
 488     }
 489
 490     CHECK_JOURNAL(jnl);
 491
         // read these before the atomic add; see the NOTE below
 492     amt_flushed = tr->num_killed;
 493     total_bytes = tr->total_bytes;
 494
 495     // update the number of blocks that have been flushed.
 496     // this buf may represent more than one block so take
 497     // that into account.
 498     //
 499     // OSAddAtomic() returns the value of tr->num_flushed before the add
 500     //
 501     amt_flushed += OSAddAtomic(bufsize, (SInt32 *)&tr->num_flushed);
 502
 503
 504     // if this transaction isn't done yet, just return as
 505     // there is nothing to do.
 506     //
 507     // NOTE: we are careful to not reference anything through
 508     //       the tr pointer after doing the OSAddAtomic().  if
 509     //       this if statement fails then we are the last one
 510     //       and then it's ok to dereference "tr".
 511     //
 512     if ((amt_flushed + bufsize) < total_bytes) {
 513                 return;
 514     }
 515
 516     // this will single thread checking the transaction
 517     lock_oldstart(jnl);
 518
 519     if (tr->total_bytes == (int)0xfbadc0de) {
 520         // then someone beat us to it...
 521         unlock_oldstart(jnl);
 522         return;
 523     }
 524
 525     // mark this so that we're the owner of dealing with the
 526     // cleanup for this transaction
 527     tr->total_bytes = 0xfbadc0de;
 528
 529     //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
 530     //   tr, tr->journal_start, tr->journal_end, jnl);
 531
 532     // find this entry in the old_start[] index and mark it completed
         // (the top bit of an old_start[] entry is masked off for the
         // comparison and cleared on a match)
 533     for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
 534
 535         if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) {
 536             jnl->old_start[i] &= ~(0x8000000000000000ULL);
 537             break;
 538         }
 539     }
 540
 541     if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
 542         panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
 543             tr->journal_start, tr, jnl);
 544     }
 545
 546
 547     // if we are here then we need to update the journal header
 548     // to reflect that this transaction is complete
         // (a tr whose journal_start/journal_end have been zeroed counts as
         // merged; see the journal_start != 0 test after the loop)
 549     if (tr->journal_start == jnl->active_start) {
 550         jnl->active_start = tr->journal_end;
 551         tr->journal_start = tr->journal_end = (off_t)0;
 552     }
 553
 554     // go through the completed_trs list and try to coalesce
 555     // entries, restarting back at the beginning if we have to.
         // (a restart sets ctr to NULL and next to the list head, so the
         // loop step "prev=ctr, ctr=next" resets prev to NULL as well)
 556     for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) {
 557         if (ctr->journal_start == jnl->active_start) {
                 // ctr begins where the active region starts: advance
                 // active_start past it, unlink it and queue it for freeing
 558             jnl->active_start = ctr->journal_end;
 559             if (prev) {
 560                 prev->next = ctr->next;
 561             }
 562             if (ctr == jnl->completed_trs) {
 563                 jnl->completed_trs = ctr->next;
 564             }
 565
 566             next           = jnl->completed_trs;   // this starts us over again
 567             ctr->next      = jnl->tr_freeme;
 568             jnl->tr_freeme = ctr;
 569             ctr            = NULL;
 570         } else if (tr->journal_end == ctr->journal_start) {
                 // tr ends where ctr begins: extend ctr backwards over tr
 571             ctr->journal_start = tr->journal_start;
 572             next               = jnl->completed_trs;  // this starts us over again
 573             ctr                = NULL;
 574             tr->journal_start  = tr->journal_end = (off_t)0;
 575         } else if (tr->journal_start == ctr->journal_end) {
                 // tr begins where ctr ends: extend ctr forwards over tr
 576             ctr->journal_end  = tr->journal_end;
 577             next              = ctr->next;
 578             tr->journal_start = tr->journal_end = (off_t)0;
 579         } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) {
 580             // coalesce the next entry with this one and link the next
 581             // entry in at the head of the tr_freeme list
 582             next              = ctr->next;           // temporarily use the "next" variable
 583             ctr->journal_end  = next->journal_end;
 584             ctr->next         = next->next;
 585             next->next        = jnl->tr_freeme;      // link in the next guy at the head of the tr_freeme list
 586             jnl->tr_freeme    = next;
 587
 588             next              = jnl->completed_trs;  // this starts us over again
 589             ctr               = NULL;
 590         } else {
 591             next = ctr->next;
 592         }
 593     }
 594
 595     // if this is true then we didn't merge with anyone
 596     // so link ourselves in at the head of the completed
 597     // transaction list.
 598     if (tr->journal_start != 0) {
 599         // put this entry into the correct sorted place
 600         // in the list instead of just at the head.
 601         //
 602
 603         prev = NULL;
 604         for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
 605             // just keep looping
 606         }
 607
             // three cases: empty list, insert at the head, or insert after prev
 608         if (ctr == NULL && prev == NULL) {
 609             jnl->completed_trs = tr;
 610             tr->next = NULL;
 611         } else if (ctr == jnl->completed_trs) {
 612             tr->next = jnl->completed_trs;
 613             jnl->completed_trs = tr;
 614         } else {
 615             tr->next = prev->next;
 616             prev->next = tr;
 617         }
 618     } else {
 619         // if we're here this tr got merged with someone else so
 620         // put it on the list to be free'd
 621         tr->next       = jnl->tr_freeme;
 622         jnl->tr_freeme = tr;
 623     }
 624     unlock_oldstart(jnl);
 625 }
 626
 627
 628 #include <libkern/OSByteOrder.h>
 629
 630 #define SWAP16(x) OSSwapInt16(x)   // byte-swap a 16-bit value
 631 #define SWAP32(x) OSSwapInt32(x)   // byte-swap a 32-bit value
 632 #define SWAP64(x) OSSwapInt64(x)   // byte-swap a 64-bit value
 633
 634
 635 static void
 636 swap_journal_header(journal *jnl)
 637 {
 638     jnl->jhdr->magic      = SWAP32(jnl->jhdr->magic);
 639     jnl->jhdr->endian     = SWAP32(jnl->jhdr->endian);
 640     jnl->jhdr->start      = SWAP64(jnl->jhdr->start);
 641     jnl->jhdr->end        = SWAP64(jnl->jhdr->end);
 642     jnl->jhdr->size       = SWAP64(jnl->jhdr->size);
 643     jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
 644     jnl->jhdr->checksum   = SWAP32(jnl->jhdr->checksum);
 645     jnl->jhdr->jhdr_size  = SWAP32(jnl->jhdr->jhdr_size);
 646     jnl->jhdr->sequence_num  = SWAP32(jnl->jhdr->sequence_num);
 647 }
 648
 649 static void
 650 swap_block_list_header(journal *jnl, block_list_header *blhdr)
 651 {
 652     int i;
 653
 654     blhdr->max_blocks = SWAP16(blhdr->max_blocks);
 655     blhdr->num_blocks = SWAP16(blhdr->num_blocks);
 656     blhdr->bytes_used = SWAP32(blhdr->bytes_used);
 657     blhdr->checksum   = SWAP32(blhdr->checksum);
 658     blhdr->flags      = SWAP32(blhdr->flags);
 659
 660     if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) {
 661         printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d).  not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size);
 662         return;
 663     }
 664
 665     for(i=0; i < blhdr->num_blocks; i++) {
 666                 blhdr->binfo[i].bnum    = SWAP64(blhdr->binfo[i].bnum);
 667                 blhdr->binfo[i].bsize   = SWAP32(blhdr->binfo[i].bsize);
 668                 blhdr->binfo[i].b.cksum = SWAP32(blhdr->binfo[i].b.cksum);
 669     }
 670 }
 671
 672
 673 static int
 674 update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
 675 {
 676     int         ret;
 677     struct buf *oblock_bp=NULL;
 678
 679     // first read the block we want.
 680     ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
 681     if (ret != 0) {
 682         printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret);
 683
 684                 if (oblock_bp) {
 685                         buf_brelse(oblock_bp);
 686                         oblock_bp = NULL;
 687                 }
 688
 689                 // let's try to be aggressive here and just re-write the block
 690                 oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META);
 691                 if (oblock_bp == NULL) {
 692                     printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block);
 693                     return -1;
 694                 }
 695     }
 696
 697     // make sure it's the correct size.
 698     if (buf_size(oblock_bp) != bsize) {
 699                 buf_brelse(oblock_bp);
 700                 return -1;
 701     }
 702
 703     // copy the journal data over top of it
 704     memcpy((char *)0 + buf_dataptr(oblock_bp), block_ptr, bsize);
 705
 706     if ((ret = VNOP_BWRITE(oblock_bp)) != 0) {
 707         printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret);
 708         return ret;
 709     }
 710
 711     // and now invalidate it so that if someone else wants to read
 712     // it in a different size they'll be able to do it.
 713     ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
 714     if (oblock_bp) {
 715                 buf_markinvalid(oblock_bp);
 716                 buf_brelse(oblock_bp);
 717     }
 718
 719     return 0;
 720 }
 721
 722 static int
 723 grow_table(struct bucket **buf_ptr, int num_buckets, int new_size)
 724 {
 725     struct bucket *newBuf;
 726     int current_size = num_buckets, i;
 727
 728     // return if newsize is less than the current size
 729     if (new_size < num_buckets) {
 730         return current_size;
 731     }
 732
 733     if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
 734         printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
 735         return -1;
 736     }
 737
 738     //  printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
 739
 740     // copy existing elements
 741     bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));
 742
 743     // initialize the new ones
 744     for(i=num_buckets; i < new_size; i++) {
 745         newBuf[i].block_num = (off_t)-1;
 746     }
 747
 748     // free the old container
 749     FREE(*buf_ptr, M_TEMP);
 750
 751     // reset the buf_ptr
 752     *buf_ptr = newBuf;
 753
 754     return new_size;
 755 }
 756
 757 static int
 758 lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full)
 759 {
 760     int lo, hi, index, matches, i;
 761
 762     if (num_full == 0) {
 763         return 0; // table is empty, so insert at index=0
 764     }
 765
 766     lo = 0;
 767     hi = num_full - 1;
 768     index = -1;
 769
 770     // perform binary search for block_num
 771     do {
 772         int mid = (hi - lo)/2 + lo;
 773         off_t this_num = (*buf_ptr)[mid].block_num;
 774
 775         if (block_num == this_num) {
 776             index = mid;
 777             break;
 778         }
 779
 780         if (block_num < this_num) {
 781             hi = mid;
 782             continue;
 783         }
 784
 785         if (block_num > this_num) {
 786             lo = mid + 1;
 787             continue;
 788         }
 789     } while(lo < hi);
 790
 791     // check if lo and hi converged on the match
 792     if (block_num == (*buf_ptr)[hi].block_num) {
 793         index = hi;
 794     }
 795
 796     // if no existing entry found, find index for new one
 797     if (index == -1) {
 798         index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
 799     } else {
 800         // make sure that we return the right-most index in the case of multiple matches
 801         matches = 0;
 802         i = index + 1;
 803         while(i < num_full && block_num == (*buf_ptr)[i].block_num) {
 804             matches++;
 805             i++;
 806         }
 807
 808         index += matches;
 809     }
 810
 811     return index;
 812 }
 813
 814 static int
 815 insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting)
 816 {
 817     if (!overwriting) {
 818         // grow the table if we're out of space
 819         if (*num_full_ptr >= *num_buckets_ptr) {
 820             int new_size = *num_buckets_ptr * 2;
 821             int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);
 822
 823             if (grow_size < new_size) {
 824                 printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name);
 825                 return -1;
 826             }
 827
 828             *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
 829         }
 830
 831         // if we're not inserting at the end, we need to bcopy
 832         if (blk_index != *num_full_ptr) {
 833             bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
 834         }
 835
 836         (*num_full_ptr)++; // increment only if we're not overwriting
 837     }
 838
 839     // sanity check the values we're about to add
 840     if (offset >= jnl->jhdr->size) {
 841         offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
 842     }
 843     if (size <= 0) {
 844         panic("jnl: insert_block: bad size in insert_block (%lu)\n", size);
 845     }
 846
 847     (*buf_ptr)[blk_index].block_num = num;
 848     (*buf_ptr)[blk_index].block_size = size;
 849     (*buf_ptr)[blk_index].jnl_offset = offset;
 850     (*buf_ptr)[blk_index].cksum = cksum;
 851
 852     return blk_index;
 853 }
 854
 855 static int
 856 do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
 857 {
 858     int num_to_remove, index, i, overwrite, err;
 859     size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset;
 860     off_t overlap, block_start, block_end;
 861
 862     block_start = block_num*jhdr_size;
 863     block_end = block_start + size;
 864     overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);
 865
 866     // first, eliminate any overlap with the previous entry
 867     if (blk_index != 0 && !overwrite) {
 868         off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
 869         off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
 870         overlap = prev_block_end - block_start;
 871         if (overlap > 0) {
 872             if (overlap % jhdr_size != 0) {
 873                 panic("jnl: do_overlap: overlap with previous entry not a multiple of %lu\n", jhdr_size);
 874             }
 875
 876             // if the previous entry completely overlaps this one, we need to break it into two pieces.
 877             if (prev_block_end > block_end) {
 878                 off_t new_num = block_end / jhdr_size;
 879                 size_t new_size = prev_block_end - block_end;
 880
 881                 new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);
 882
 883                 err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0);
 884                 if (err < 0) {
 885                     panic("jnl: do_overlap: error inserting during pre-overlap\n");
 886                 }
 887             }
 888
 889             // Regardless, we need to truncate the previous entry to the beginning of the overlap
 890             (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start;
 891             (*buf_ptr)[blk_index-1].cksum = 0;   // have to blow it away because there's no way to check it
 892         }
 893     }
 894
 895     // then, bail out fast if there's no overlap with the entries that follow
 896     if (!overwrite && block_end <= (*buf_ptr)[blk_index].block_num*jhdr_size) {
 897         return 0; // no overlap, no overwrite
 898     } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (*buf_ptr)[blk_index+1].block_num*jhdr_size)) {
 899
 900         (*buf_ptr)[blk_index].cksum = cksum;   // update this
 901         return 1; // simple overwrite
 902     }
 903
 904     // Otherwise, find all cases of total and partial overlap. We use the special
 905     // block_num of -2 to designate entries that are completely overlapped and must
 906     // be eliminated. The block_num, size, and jnl_offset of partially overlapped
 907     // entries must be adjusted to keep the array consistent.
 908     index = blk_index;
 909     num_to_remove = 0;
 910     while(index < *num_full_ptr && block_end > (*buf_ptr)[index].block_num*jhdr_size) {
 911         if (block_end >= ((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size)) {
 912             (*buf_ptr)[index].block_num = -2; // mark this for deletion
 913             num_to_remove++;
 914         } else {
 915             overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
 916             if (overlap > 0) {
 917                 if (overlap % jhdr_size != 0) {
 918                     panic("jnl: do_overlap: overlap of %lld is not multiple of %lu\n", overlap, jhdr_size);
 919                 }
 920
 921                 // if we partially overlap this entry, adjust its block number, jnl offset, and size
 922                 (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
 923                 (*buf_ptr)[index].cksum = 0;
 924
 925                 new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
 926                 if (new_offset >= jnl->jhdr->size) {
 927                     new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
 928                 }
 929                 (*buf_ptr)[index].jnl_offset = new_offset;
 930
 931                 (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
 932                 if ((*buf_ptr)[index].block_size <= 0) {
 933                     panic("jnl: do_overlap: after overlap, new block size is invalid (%lu)\n", (*buf_ptr)[index].block_size);
 934                     // return -1; // if above panic is removed, return -1 for error
 935                 }
 936             }
 937
 938         }
 939
 940         index++;
 941     }
 942
 943     // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
 944     index--; // start with the last index used within the above loop
 945     while(index >= blk_index) {
 946         if ((*buf_ptr)[index].block_num == -2) {
 947             if (index == *num_full_ptr-1) {
 948                 (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
 949             } else {
 950                 bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
 951             }
 952             (*num_full_ptr)--;
 953         }
 954         index--;
 955     }
 956
 957     // eliminate any stale entries at the end of the table
 958     for(i=*num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
 959         (*buf_ptr)[i].block_num = -1;
 960     }
 961
 962     return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
 963 }
 964
 965 // PR-3105942: Coalesce writes to the same block in journal replay
 966 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
 967 // to be replayed and the corresponding location in the journal which contains
 968 // the most recent data for those blocks. The array is "played" once the all the
 969 // blocks in the journal have been coalesced. The code for the case of conflicting/
 970 // overlapping writes to a single block is the most dense. Because coalescing can
 971 // disrupt the existing time-ordering of blocks in the journal playback, care
 972 // is taken to catch any overlaps and keep the array consistent.
 973 static int
 974 add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
 975 {
 976     int blk_index, overwriting;
 977
 978     // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
 979     // inserted (or the index of the elem to overwrite).
 980     blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);
 981
 982     // check if the index is within bounds (if we're adding this block to the end of
 983     // the table, blk_index will be equal to num_full)
 984     if (blk_index < 0 || blk_index > *num_full_ptr) {
 985         //printf("jnl: add_block: trouble adding block to co_buf\n");
 986         return -1;
 987     } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
 988
 989     // Determine whether we're overwriting an existing entry by checking for overlap
 990     overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr);
 991     if (overwriting < 0) {
 992         return -1; // if we got an error, pass it along
 993     }
 994
 995     // returns the index, or -1 on error
 996     blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting);
 997
 998     return blk_index;
 999 }
1000
1001 static int
1002 replay_journal(journal *jnl)
1003 {
1004     int i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0;
1005     size_t ret;
1006     size_t  max_bsize = 0;              /* protected by block_ptr */
1007     block_list_header *blhdr;
1008     off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start;
1009     char *buff, *block_ptr=NULL;
1010     struct bucket *co_buf;
1011     int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
1012     uint32_t last_sequence_num = 0;
1013
1014     // wrap the start ptr if it points to the very end of the journal
1015     if (jnl->jhdr->start == jnl->jhdr->size) {
1016                 jnl->jhdr->start = jnl->jhdr->jhdr_size;
1017     }
1018     if (jnl->jhdr->end == jnl->jhdr->size) {
1019                 jnl->jhdr->end = jnl->jhdr->jhdr_size;
1020     }
1021
1022     if (jnl->jhdr->start == jnl->jhdr->end) {
1023                 return 0;
1024     }
1025
1026     orig_jnl_start = jnl->jhdr->start;
1027
1028     // allocate memory for the header_block.  we'll read each blhdr into this
1029     if (kmem_alloc(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) {
1030                 printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n",
1031                     jnl->jdev_name, jnl->jhdr->blhdr_size);
1032                 return -1;
1033     }
1034
1035     // allocate memory for the coalesce buffer
1036     if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
1037         printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name);
1038         return -1;
1039     }
1040
1041   restart_replay:
1042
1043     // initialize entries
1044     for(i=0; i < num_buckets; i++) {
1045         co_buf[i].block_num = -1;
1046     }
1047     num_full = 0; // empty at first
1048
1049
1050     printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
1051         jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);
1052
1053     while(check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) {
1054                 offset = blhdr_offset = jnl->jhdr->start;
1055                 ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
1056                 if (ret != (size_t)jnl->jhdr->blhdr_size) {
1057                     printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset);
1058                     bad_blocks = 1;
1059                     goto bad_txn_handling;
1060                 }
1061
1062                 blhdr = (block_list_header *)buff;
1063
1064                 orig_checksum = blhdr->checksum;
1065                 blhdr->checksum = 0;
1066                 if (jnl->flags & JOURNAL_NEED_SWAP) {
1067                         // calculate the checksum based on the unswapped data
1068                         // because it is done byte-at-a-time.
1069                         orig_checksum = SWAP32(orig_checksum);
1070                         checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1071                         swap_block_list_header(jnl, blhdr);
1072                 } else {
1073                         checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1074                 }
1075
1076
1077                 //
1078                 // XXXdbg - if these checks fail, we should replay as much
1079                 //          we can in the hopes that it will still leave the
1080                 //          drive in a better state than if we didn't replay
1081                 //          anything
1082                 //
1083                 if (checksum != orig_checksum) {
1084                     if (check_past_jnl_end && in_uncharted_territory) {
1085
1086                         if (blhdr_offset != jnl->jhdr->end) {
1087                             printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
1088                         }
1089
1090                         check_past_jnl_end = 0;
1091                         jnl->jhdr->end = blhdr_offset;
1092                         continue;
1093                     }
1094
1095                     printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
1096                         jnl->jdev_name, blhdr_offset, orig_checksum, checksum);
1097
1098                     if (blhdr_offset == orig_jnl_start) {
1099                         // if there's nothing in the journal at all, just bail out altogether.
1100                         goto bad_replay;
1101                     }
1102
1103                     bad_blocks = 1;
1104                     goto bad_txn_handling;
1105                 }
1106
1107                 if (   (last_sequence_num != 0)
1108                     && (blhdr->binfo[0].b.sequence_num != 0)
1109                     && (blhdr->binfo[0].b.sequence_num != last_sequence_num)
1110                     && (blhdr->binfo[0].b.sequence_num != last_sequence_num+1)) {
1111
1112                     txn_start_offset = jnl->jhdr->end = blhdr_offset;
1113
1114                     if (check_past_jnl_end) {
1115                         check_past_jnl_end = 0;
1116                         printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
1117                             jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].b.sequence_num, last_sequence_num);
1118                         continue;
1119                     }
1120
1121                     printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
1122                         jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].b.sequence_num, last_sequence_num);
1123                     bad_blocks = 1;
1124                     goto bad_txn_handling;
1125                 }
1126                 last_sequence_num = blhdr->binfo[0].b.sequence_num;
1127
1128                 if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) {
1129                     if (last_sequence_num == 0) {
1130                         check_past_jnl_end = 0;
1131                         printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n",
1132                             jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1133                         if (jnl->jhdr->start != jnl->jhdr->end) {
1134                             jnl->jhdr->start = jnl->jhdr->end;
1135                         }
1136                         continue;
1137                     }
1138                     printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
1139                 }
1140
1141                 if (   blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048
1142                            || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
1143                     printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n",
1144                         jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks);
1145                     bad_blocks = 1;
1146                     goto bad_txn_handling;
1147                 }
1148
1149                 max_bsize = 0;
1150                 for(i=1; i < blhdr->num_blocks; i++) {
1151                         if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
1152                             printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum);
1153                             bad_blocks = 1;
1154                             goto bad_txn_handling;
1155                         }
1156
1157                         if (blhdr->binfo[i].bsize > max_bsize) {
1158                             max_bsize = blhdr->binfo[i].bsize;
1159                         }
1160                 }
1161
1162                 if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) {
1163                     check_block_checksums = 1;
1164                     if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1165                         goto bad_replay;
1166                     }
1167                 } else {
1168                     block_ptr = NULL;
1169                 }
1170
1171                 if (blhdr->flags & BLHDR_FIRST_HEADER) {
1172                     txn_start_offset = blhdr_offset;
1173                 }
1174
1175                 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1176                 //       blhdr->num_blocks-1, jnl->jhdr->start);
1177                 bad_blocks = 0;
1178                 for(i=1; i < blhdr->num_blocks; i++) {
1179                         int size, ret_val;
1180                         off_t number;
1181
1182                         size = blhdr->binfo[i].bsize;
1183                         number = blhdr->binfo[i].bnum;
1184
1185                         // don't add "killed" blocks
1186                         if (number == (off_t)-1) {
1187                             //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1188                         } else {
1189
1190                             if (check_block_checksums) {
1191                                 int32_t disk_cksum;
1192                                 off_t block_offset;
1193
1194                                 block_offset = offset;
1195
1196                                 // read the block so we can check the checksum
1197                                 ret = read_journal_data(jnl, &block_offset, block_ptr, size);
1198                                 if (ret != (size_t)size) {
1199                                     printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
1200                                     bad_blocks = 1;
1201                                     goto bad_txn_handling;
1202                                 }
1203
1204                                 disk_cksum = calc_checksum(block_ptr, size);
1205
1206                                 // there is no need to swap the checksum from disk because
1207                                 // it got swapped when the blhdr was read in.
1208                                 if (blhdr->binfo[i].b.cksum != 0 && disk_cksum != blhdr->binfo[i].b.cksum) {
1209                                     printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
1210                                         jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].b.cksum);
1211                                     printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x  0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
1212                                         *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)],
1213                                         *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]);
1214
1215                                     bad_blocks = 1;
1216                                     goto bad_txn_handling;
1217                                 }
1218                             }
1219
1220
1221                             // add this bucket to co_buf, coalescing where possible
1222                             // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1223                             ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].b.cksum, &num_buckets, &num_full);
1224
1225                             if (ret_val == -1) {
1226                                 printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name);
1227                                 goto bad_replay;
1228                             } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1229                         }
1230
1231                         // increment offset
1232                         offset += size;
1233
1234                         // check if the last block added puts us off the end of the jnl.
1235                         // if so, we need to wrap to the beginning and take any remainder
1236                         // into account
1237                         //
1238                         if (offset >= jnl->jhdr->size) {
1239                             offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
1240                         }
1241                 }
1242
1243                 if (block_ptr) {
1244                     kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1245                     block_ptr = NULL;
1246                 }
1247
1248       bad_txn_handling:
1249                 if (bad_blocks) {
1250                     if (txn_start_offset == 0) {
1251                         printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
1252                         goto bad_replay;
1253                     }
1254
1255                     jnl->jhdr->start = orig_jnl_start;
1256                     jnl->jhdr->end = txn_start_offset;
1257                     check_past_jnl_end = 0;
1258                     last_sequence_num = 0;
1259                     printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1260                     goto restart_replay;
1261                 }
1262
1263                 jnl->jhdr->start += blhdr->bytes_used;
1264                 if (jnl->jhdr->start >= jnl->jhdr->size) {
1265                         // wrap around and skip the journal header block
1266                         jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
1267                 }
1268
1269                 if (jnl->jhdr->start == jnl->jhdr->end) {
1270                     in_uncharted_territory = 1;
1271                 }
1272     }
1273
1274     if (jnl->jhdr->start != jnl->jhdr->end) {
1275         printf("jnl: %s: start %lld != end %lld.  resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1276         jnl->jhdr->end = jnl->jhdr->start;
1277     }
1278
1279     //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1280
1281     /*
1282      * make sure it's at least one page in size, so
1283      * start max_bsize at PAGE_SIZE
1284      */
1285     for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {
1286
1287             if (co_buf[i].block_num == (off_t)-1)
1288                     continue;
1289
1290             if (co_buf[i].block_size > max_bsize)
1291                     max_bsize = co_buf[i].block_size;
1292     }
1293     /*
1294      * round max_bsize up to the nearest PAGE_SIZE multiple
1295      */
1296     if (max_bsize & (PAGE_SIZE - 1)) {
1297             max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1298     }
1299
1300     if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1301         goto bad_replay;
1302     }
1303
1304     // Replay the coalesced entries in the co-buf
1305     for(i=0; i < num_full; i++) {
1306         size_t size = co_buf[i].block_size;
1307         off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
1308         off_t number = co_buf[i].block_num;
1309
1310
1311         // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1312         //      co_buf[i].block_size, co_buf[i].jnl_offset);
1313
1314         if (number == (off_t)-1) {
1315             // printf("jnl: replay_journal: skipping killed fs block\n");
1316         } else {
1317
1318             // do journal read, and set the phys. block
1319             ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
1320             if (ret != size) {
1321                 printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
1322                 goto bad_replay;
1323             }
1324
1325             if (update_fs_block(jnl, block_ptr, number, size) != 0) {
1326                 goto bad_replay;
1327             }
1328         }
1329     }
1330
1331
1332     // done replaying; update jnl header
1333     if (write_journal_header(jnl) != 0) {
1334         goto bad_replay;
1335     }
1336
1337     printf("jnl: %s: journal replay done.\n", jnl->jdev_name);
1338
1339     // free block_ptr
1340     if (block_ptr) {
1341         kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1342         block_ptr = NULL;
1343     }
1344
1345     // free the coalesce buffer
1346     FREE(co_buf, M_TEMP);
1347     co_buf = NULL;
1348
1349     kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1350     return 0;
1351
1352   bad_replay:
1353     if (block_ptr) {
1354                 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1355     }
1356     if (co_buf) {
1357       FREE(co_buf, M_TEMP);
1358     }
1359     kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1360
1361     return -1;
1362 }
1363
1364
1365 #define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
1366 //#define DEFAULT_TRANSACTION_BUFFER_SIZE  (256*1024)  // better performance but uses more mem
1367 #define MAX_TRANSACTION_BUFFER_SIZE      (512*1024)
1368
1369 // XXXdbg - so I can change it in the debugger
1370 int def_tbuffer_size = 0;
1371
1372
1373 //
1374 // This function sets the size of the tbuffer and the
1375 // size of the blhdr.  It assumes that jnl->jhdr->size
1376 // and jnl->jhdr->jhdr_size are already valid.
1377 //
1378 static void
1379 size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
1380 {
1381         //
1382         // one-time initialization based on how much memory
1383         // there is in the machine.
1384         //
1385         if (def_tbuffer_size == 0) {
1386                 if (mem_size < (256*1024*1024)) {
1387                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
1388                 } else if (mem_size < (512*1024*1024)) {
1389                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
1390                 } else if (mem_size < (1024*1024*1024)) {
1391                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
1392                 } else if (mem_size >= (1024*1024*1024)) {
1393                         def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 4;
1394                 }
1395         }
1396
1397     // size up the transaction buffer... can't be larger than the number
1398     // of blocks that can fit in a block_list_header block.
1399     if (tbuffer_size == 0) {
1400                 jnl->tbuffer_size = def_tbuffer_size;
1401     } else {
1402                 // make sure that the specified tbuffer_size isn't too small
1403                 if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
1404                         tbuffer_size = jnl->jhdr->blhdr_size * 2;
1405                 }
1406                 // and make sure it's an even multiple of the block size
1407                 if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
1408                         tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
1409                 }
1410
1411                 jnl->tbuffer_size = tbuffer_size;
1412     }
1413
1414     if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
1415                 jnl->tbuffer_size = (jnl->jhdr->size / 2);
1416     }
1417
1418     if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
1419                 jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
1420     }
1421
1422     jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
1423     if (jnl->jhdr->blhdr_size < phys_blksz) {
1424         jnl->jhdr->blhdr_size = phys_blksz;
1425     } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
1426                 // have to round up so we're an even multiple of the physical block size
1427                 jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
1428     }
1429 }
1430
1431
1432
1433 static void
1434 get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context)
1435 {
1436     off_t       readblockcnt;
1437     off_t       writeblockcnt;
1438     off_t       readmaxcnt;
1439     off_t       writemaxcnt;
1440     int32_t     features;
1441
1442     if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
1443         if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
1444             const char *name = vnode_name(devvp);
1445             jnl->flags |= JOURNAL_DO_FUA_WRITES;
1446             printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features);
1447         }
1448     }
1449
1450     if (VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context)) {
1451         readmaxcnt = 0;
1452     }
1453
1454     if (readmaxcnt == 0) {
1455         if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context)) {
1456             readmaxcnt = 128 * 1024;
1457         } else {
1458             readmaxcnt = readblockcnt * phys_blksz;
1459         }
1460     }
1461
1462
1463     if (VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context)) {
1464         writemaxcnt = 0;
1465     }
1466
1467     if (writemaxcnt == 0) {
1468         if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context)) {
1469             writemaxcnt = 128 * 1024;
1470         } else {
1471             writemaxcnt = writeblockcnt * phys_blksz;
1472         }
1473     }
1474
1475     jnl->max_read_size  = readmaxcnt;
1476     jnl->max_write_size = writemaxcnt;
1477
1478     // just in case it's still zero...
1479     if (jnl->max_read_size == 0) {
1480         jnl->max_read_size = 128 * 1024;
1481         jnl->max_write_size = 128 * 1024;
1482     }
1483 }
1484
1485
//
// get_jdev_name() returns the name to use for the journal device jvp in log
// messages: the vnode's own name, or "unknown-dev" when it has none.  The
// name is passed through vfs_addname() (for an existing name this just bumps
// the refcount so we have our own copy); journal_create() gives it back with
// vfs_removename() on its failure path.
//
static const char *
get_jdev_name(struct vnode *jvp)
{
    const char *name = vnode_name(jvp);

    if (name == NULL) {
        name = "unknown-dev";
    }

    return vfs_addname(name, strlen(name), 0, 0);
}
1501
1502
1503 journal *
1504 journal_create(struct vnode *jvp,
1505                            off_t         offset,
1506                            off_t         journal_size,
1507                            struct vnode *fsvp,
1508                            size_t        min_fs_blksz,
1509                            int32_t       flags,
1510                            int32_t       tbuffer_size,
1511                            void        (*flush)(void *arg),
1512                            void         *arg)
1513 {
1514     journal *jnl;
1515     size_t      phys_blksz;
1516     struct vfs_context context;
1517     const char *jdev_name;
1518
1519     context.vc_thread = current_thread();
1520     context.vc_ucred = FSCRED;
1521
1522     jdev_name = get_jdev_name(jvp);
1523
1524     /* Get the real physical block size. */
1525     if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1526         return NULL;
1527     }
1528
1529     if (phys_blksz > min_fs_blksz) {
1530                 printf("jnl: %s: create: error: phys blksize %lu bigger than min fs blksize %lu\n",
1531                     jdev_name, phys_blksz, min_fs_blksz);
1532                 return NULL;
1533     }
1534
1535     if ((journal_size % phys_blksz) != 0) {
1536                 printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%lx\n",
1537                     jdev_name, journal_size, phys_blksz);
1538                 return NULL;
1539     }
1540
1541
1542     MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1543     memset(jnl, 0, sizeof(*jnl));
1544
1545     jnl->jdev         = jvp;
1546     jnl->jdev_offset  = offset;
1547     jnl->fsdev        = fsvp;
1548     jnl->flush        = flush;
1549     jnl->flush_arg    = arg;
1550     jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
1551     jnl->jdev_name    = jdev_name;
1552     lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1553
1554     get_io_info(jvp, phys_blksz, jnl, &context);
1555
1556     if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1557         printf("jnl: %s: create: could not allocate space for header buffer (%lu bytes)\n", jdev_name, phys_blksz);
1558         goto bad_kmem_alloc;
1559     }
1560
1561     memset(jnl->header_buf, 0, phys_blksz);
1562
1563     jnl->jhdr             = (journal_header *)jnl->header_buf;
1564     jnl->jhdr->magic      = JOURNAL_HEADER_MAGIC;
1565     jnl->jhdr->endian     = ENDIAN_MAGIC;
1566     jnl->jhdr->start      = phys_blksz;    // start at block #1, block #0 is for the jhdr itself
1567     jnl->jhdr->end        = phys_blksz;
1568     jnl->jhdr->size       = journal_size;
1569     jnl->jhdr->jhdr_size  = phys_blksz;
1570     size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1571
1572         jnl->active_start     = jnl->jhdr->start;
1573
1574     // XXXdbg  - for testing you can force the journal to wrap around
1575     // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
1576     // jnl->jhdr->end   = jnl->jhdr->size - (phys_blksz*3);
1577
1578     jnl->jhdr->sequence_num = random() & 0x00ffffff;
1579
1580         lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
1581
1582     if (write_journal_header(jnl) != 0) {
1583         printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name);
1584         goto bad_write;
1585     }
1586
1587     return jnl;
1588
1589
1590   bad_write:
1591     kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1592   bad_kmem_alloc:
1593     if (jdev_name) {
1594         vfs_removename(jdev_name);
1595     }
1596     jnl->jhdr = NULL;
1597     FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1598     return NULL;
1599 }
1600
1601
1602 journal *
1603 journal_open(struct vnode *jvp,
1604                          off_t         offset,
1605                          off_t         journal_size,
1606                          struct vnode *fsvp,
1607                          size_t        min_fs_blksz,
1608                          int32_t       flags,
1609                          int32_t       tbuffer_size,
1610                          void        (*flush)(void *arg),
1611                          void         *arg)
1612 {
1613     journal *jnl;
1614     int      orig_blksz=0;
1615     size_t   phys_blksz;
1616     int      orig_checksum, checksum;
1617     struct vfs_context context;
1618     const char *jdev_name = get_jdev_name(jvp);
1619
1620     context.vc_thread = current_thread();
1621     context.vc_ucred = FSCRED;
1622
1623     /* Get the real physical block size. */
1624     if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1625                 return NULL;
1626     }
1627
1628     if (phys_blksz > min_fs_blksz) {
1629                 printf("jnl: %s: open: error: phys blksize %lu bigger than min fs blksize %lu\n",
1630                     jdev_name, phys_blksz, min_fs_blksz);
1631                 return NULL;
1632     }
1633
1634     if ((journal_size % phys_blksz) != 0) {
1635                 printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%lx\n",
1636                     jdev_name, journal_size, phys_blksz);
1637                 return NULL;
1638     }
1639
1640     MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1641     memset(jnl, 0, sizeof(*jnl));
1642
1643     jnl->jdev         = jvp;
1644     jnl->jdev_offset  = offset;
1645     jnl->fsdev        = fsvp;
1646     jnl->flush        = flush;
1647     jnl->flush_arg    = arg;
1648     jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
1649     jnl->jdev_name    = jdev_name;
1650     lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1651
1652     get_io_info(jvp, phys_blksz, jnl, &context);
1653
1654     if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1655         printf("jnl: %s: create: could not allocate space for header buffer (%lu bytes)\n", jdev_name, phys_blksz);
1656         goto bad_kmem_alloc;
1657     }
1658
1659     jnl->jhdr = (journal_header *)jnl->header_buf;
1660     memset(jnl->jhdr, 0, sizeof(journal_header));
1661
1662     // we have to set this up here so that do_journal_io() will work
1663     jnl->jhdr->jhdr_size = phys_blksz;
1664
1665     if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
1666                 printf("jnl: %s: open: could not read %lu bytes for the journal header.\n",
1667                     jdev_name, phys_blksz);
1668                 goto bad_journal;
1669     }
1670
1671         orig_checksum = jnl->jhdr->checksum;
1672         jnl->jhdr->checksum = 0;
1673
1674         if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1675                 // do this before the swap since it's done byte-at-a-time
1676                 orig_checksum = SWAP32(orig_checksum);
1677                 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1678                 swap_journal_header(jnl);
1679                 jnl->flags |= JOURNAL_NEED_SWAP;
1680         } else {
1681                 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1682         }
1683
1684     if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1685                 printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n",
1686                     jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
1687                 goto bad_journal;
1688     }
1689
1690         // only check if we're the current journal header magic value
1691         if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
1692
1693                 if (orig_checksum != checksum) {
1694                         printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n",
1695                             jdev_name, orig_checksum, checksum);
1696
1697                         //goto bad_journal;
1698                 }
1699         }
1700
1701         // XXXdbg - convert old style magic numbers to the new one
1702         if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
1703                 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
1704         }
1705
1706     if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
1707                 printf("jnl: %s: open: phys_blksz %lu does not match journal header size %d\n",
1708                     jdev_name, phys_blksz, jnl->jhdr->jhdr_size);
1709
1710                 orig_blksz = phys_blksz;
1711                 phys_blksz = jnl->jhdr->jhdr_size;
1712                 if (VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context)) {
1713                     printf("jnl: %s: could not set block size to %lu bytes.\n", jdev_name, phys_blksz);
1714                     goto bad_journal;
1715                 }
1716 //              goto bad_journal;
1717     }
1718
1719     if (   jnl->jhdr->start <= 0
1720                    || jnl->jhdr->start > jnl->jhdr->size
1721                    || jnl->jhdr->start > 1024*1024*1024) {
1722                 printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
1723                     jdev_name, jnl->jhdr->start, jnl->jhdr->size);
1724                 goto bad_journal;
1725     }
1726
1727     if (   jnl->jhdr->end <= 0
1728                    || jnl->jhdr->end > jnl->jhdr->size
1729                    || jnl->jhdr->end > 1024*1024*1024) {
1730                 printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
1731                     jdev_name, jnl->jhdr->end, jnl->jhdr->size);
1732                 goto bad_journal;
1733     }
1734
1735     if (jnl->jhdr->size > 1024*1024*1024) {
1736         printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size);
1737         goto bad_journal;
1738     }
1739
1740 // XXXdbg - can't do these checks because hfs writes all kinds of
1741 //          non-uniform sized blocks even on devices that have a block size
1742 //          that is larger than 512 bytes (i.e. optical media w/2k blocks).
1743 //          therefore these checks will fail and so we just have to punt and
1744 //          do more relaxed checking...
1745 // XXXdbg    if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
1746     if ((jnl->jhdr->start % 512) != 0) {
1747                 printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n",
1748                     jdev_name, jnl->jhdr->start);
1749                 goto bad_journal;
1750     }
1751
1752 //XXXdbg    if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
1753     if ((jnl->jhdr->end % 512) != 0) {
1754                 printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
1755                     jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size);
1756                 goto bad_journal;
1757     }
1758
1759     // take care of replaying the journal if necessary
1760     if (flags & JOURNAL_RESET) {
1761         printf("jnl: %s: journal start/end pointers reset! (jnl %p; s 0x%llx e 0x%llx)\n",
1762             jdev_name, jnl, jnl->jhdr->start, jnl->jhdr->end);
1763         jnl->jhdr->start = jnl->jhdr->end;
1764     } else if (replay_journal(jnl) != 0) {
1765         printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
1766         goto bad_journal;
1767     }
1768
1769     if (orig_blksz != 0) {
1770         VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
1771         phys_blksz = orig_blksz;
1772         if (orig_blksz < jnl->jhdr->jhdr_size) {
1773             printf("jnl: %s: open: jhdr_size is %d but orig phys blk size is %d.  switching.\n",
1774                 jdev_name, jnl->jhdr->jhdr_size, orig_blksz);
1775
1776             jnl->jhdr->jhdr_size = orig_blksz;
1777         }
1778     }
1779
1780     // make sure this is in sync!
1781     jnl->active_start = jnl->jhdr->start;
1782
1783     // set this now, after we've replayed the journal
1784     size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1785
1786     lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
1787
1788     return jnl;
1789
1790   bad_journal:
1791     if (orig_blksz != 0) {
1792         phys_blksz = orig_blksz;
1793         VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
1794     }
1795     kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1796   bad_kmem_alloc:
1797     if (jdev_name) {
1798         vfs_removename(jdev_name);
1799     }
1800     FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1801     return NULL;
1802 }
1803
1804
1805 int
1806 journal_is_clean(struct vnode *jvp,
1807                  off_t         offset,
1808                  off_t         journal_size,
1809                  struct vnode *fsvp,
1810                  size_t        min_fs_block_size)
1811 {
1812     journal jnl;
1813     int     phys_blksz, ret;
1814     int     orig_checksum, checksum;
1815     struct vfs_context context;
1816     const char *jdev_name = get_jdev_name(jvp);
1817
1818     context.vc_thread = current_thread();
1819     context.vc_ucred = FSCRED;
1820
1821     /* Get the real physical block size. */
1822     if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1823         printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
1824         return EINVAL;
1825     }
1826
1827     if (phys_blksz > (int)min_fs_block_size) {
1828         printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %lu\n",
1829             jdev_name, phys_blksz, min_fs_block_size);
1830         return EINVAL;
1831     }
1832
1833     if ((journal_size % phys_blksz) != 0) {
1834         printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1835             jdev_name, journal_size, phys_blksz);
1836         return EINVAL;
1837     }
1838
1839     memset(&jnl, 0, sizeof(jnl));
1840
1841     if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
1842         printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
1843         return ENOMEM;
1844     }
1845
1846     get_io_info(jvp, phys_blksz, &jnl, &context);
1847
1848     jnl.jhdr = (journal_header *)jnl.header_buf;
1849     memset(jnl.jhdr, 0, sizeof(journal_header));
1850
1851     jnl.jdev        = jvp;
1852     jnl.jdev_offset = offset;
1853     jnl.fsdev       = fsvp;
1854
1855     // we have to set this up here so that do_journal_io() will work
1856     jnl.jhdr->jhdr_size = phys_blksz;
1857
1858     if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
1859         printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n",
1860             jdev_name, phys_blksz);
1861         ret = EINVAL;
1862         goto get_out;
1863     }
1864
1865     orig_checksum = jnl.jhdr->checksum;
1866     jnl.jhdr->checksum = 0;
1867
1868     if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1869         // do this before the swap since it's done byte-at-a-time
1870         orig_checksum = SWAP32(orig_checksum);
1871         checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1872         swap_journal_header(&jnl);
1873         jnl.flags |= JOURNAL_NEED_SWAP;
1874     } else {
1875         checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1876     }
1877
1878     if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1879         printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n",
1880             jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
1881         ret = EINVAL;
1882         goto get_out;
1883     }
1884
1885     if (orig_checksum != checksum) {
1886         printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum);
1887         ret = EINVAL;
1888         goto get_out;
1889     }
1890
1891     //
1892     // if the start and end are equal then the journal is clean.
1893     // otherwise it's not clean and therefore an error.
1894     //
1895     if (jnl.jhdr->start == jnl.jhdr->end) {
1896         ret = 0;
1897     } else {
1898         ret = EINVAL;
1899     }
1900
1901   get_out:
1902     kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
1903     if (jdev_name) {
1904         vfs_removename(jdev_name);
1905     }
1906
1907     return ret;
1908
1909
1910 }
1911
1912
1913 void
1914 journal_close(journal *jnl)
1915 {
1916     volatile off_t *start, *end;
1917     int             counter=0;
1918
1919     CHECK_JOURNAL(jnl);
1920
1921         // set this before doing anything that would block so that
1922         // we start tearing things down properly.
1923         //
1924         jnl->flags |= JOURNAL_CLOSE_PENDING;
1925
1926     if (jnl->owner != current_thread()) {
1927                 lock_journal(jnl);
1928     }
1929
1930     //
1931     // only write stuff to disk if the journal is still valid
1932     //
1933     if ((jnl->flags & JOURNAL_INVALID) == 0) {
1934
1935                 if (jnl->active_tr) {
1936                         journal_end_transaction(jnl);
1937                 }
1938
1939                 // flush any buffered transactions
1940                 if (jnl->cur_tr) {
1941                         transaction *tr = jnl->cur_tr;
1942
1943                         jnl->cur_tr = NULL;
1944                         end_transaction(tr, 1, NULL, NULL);   // force it to get flushed
1945                 }
1946
1947                 //start = &jnl->jhdr->start;
1948                 start = &jnl->active_start;
1949                 end   = &jnl->jhdr->end;
1950
1951                 while (*start != *end && counter++ < 5000) {
1952                         //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
1953                         if (jnl->flush) {
1954                                 jnl->flush(jnl->flush_arg);
1955                         }
1956                         tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 2);
1957                 }
1958
1959                 if (*start != *end) {
1960                         printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
1961                             jnl->jdev_name, *start, *end);
1962                 }
1963
1964                 // make sure this is in sync when we close the journal
1965                 jnl->jhdr->start = jnl->active_start;
1966
1967                 // if this fails there's not much we can do at this point...
1968                 write_journal_header(jnl);
1969     } else {
1970                 // if we're here the journal isn't valid any more.
1971                 // so make sure we don't leave any locked blocks lying around
1972                 printf("jnl: %s: close: journal %p, is invalid.  aborting outstanding transactions\n", jnl->jdev_name, jnl);
1973                 if (jnl->active_tr || jnl->cur_tr) {
1974                         transaction *tr;
1975                         if (jnl->active_tr) {
1976                                 tr = jnl->active_tr;
1977                                 jnl->active_tr = NULL;
1978                         } else {
1979                                 tr = jnl->cur_tr;
1980                                 jnl->cur_tr = NULL;
1981                         }
1982
1983                         abort_transaction(jnl, tr);
1984                         if (jnl->active_tr || jnl->cur_tr) {
1985                             panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl);
1986                         }
1987                 }
1988     }
1989
1990     free_old_stuff(jnl);
1991
1992     kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size);
1993     jnl->jhdr = (void *)0xbeefbabe;
1994
1995     if (jnl->jdev_name) {
1996         vfs_removename(jnl->jdev_name);
1997     }
1998
1999     FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
2000 }
2001
2002 static void
2003 dump_journal(journal *jnl)
2004 {
2005     transaction *ctr;
2006
2007     printf("journal for dev %s:", jnl->jdev_name);
2008     printf("  jdev_offset %.8llx\n", jnl->jdev_offset);
2009     printf("  magic: 0x%.8x\n", jnl->jhdr->magic);
2010     printf("  start: 0x%.8llx\n", jnl->jhdr->start);
2011     printf("  end:   0x%.8llx\n", jnl->jhdr->end);
2012     printf("  size:  0x%.8llx\n", jnl->jhdr->size);
2013     printf("  blhdr size: %d\n", jnl->jhdr->blhdr_size);
2014     printf("  jhdr size: %d\n", jnl->jhdr->jhdr_size);
2015     printf("  chksum: 0x%.8x\n", jnl->jhdr->checksum);
2016
2017     printf("  completed transactions:\n");
2018     for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) {
2019                 printf("    0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
2020     }
2021 }
2022
2023
2024
2025 static off_t
2026 free_space(journal *jnl)
2027 {
2028     off_t free_space_offset;
2029
2030     if (jnl->jhdr->start < jnl->jhdr->end) {
2031                 free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
2032     } else if (jnl->jhdr->start > jnl->jhdr->end) {
2033                 free_space_offset = jnl->jhdr->start - jnl->jhdr->end;
2034     } else {
2035                 // journal is completely empty
2036                 free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size;
2037     }
2038
2039     return free_space_offset;
2040 }
2041
2042
2043 //
2044 // The journal must be locked on entry to this function.
2045 // The "desired_size" is in bytes.
2046 //
2047 static int
2048 check_free_space(journal *jnl, int desired_size)
2049 {
2050     size_t i;
2051     int    counter=0;
2052
2053     //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
2054 //         desired_size, free_space(jnl));
2055
2056     while (1) {
2057                 int old_start_empty;
2058
2059                 if (counter++ == 5000) {
2060                         dump_journal(jnl);
2061                         panic("jnl: check_free_space: buffer flushing isn't working "
2062                                   "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl,
2063                                   jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
2064                 }
2065                 if (counter > 7500) {
2066                     printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name);
2067                     return ENOSPC;
2068                 }
2069
2070                 // make sure there's space in the journal to hold this transaction
2071                 if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) {
2072                         break;
2073                 }
2074                 //
2075                 // here's where we lazily bump up jnl->jhdr->start.  we'll consume
2076                 // entries until there is enough space for the next transaction.
2077                 //
2078                 old_start_empty = 1;
2079                 lock_oldstart(jnl);
2080                 for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
2081                         int   lcl_counter;
2082
2083                         lcl_counter = 0;
2084                         while (jnl->old_start[i] & 0x8000000000000000LL) {
2085                                 if (lcl_counter++ > 1000) {
2086                                         panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
2087                                                   jnl->old_start[i], jnl);
2088                                 }
2089
2090                                 unlock_oldstart(jnl);
2091                                 if (jnl->flush) {
2092                                         jnl->flush(jnl->flush_arg);
2093                                 }
2094                                 tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
2095                                 lock_oldstart(jnl);
2096                         }
2097
2098                         if (jnl->old_start[i] == 0) {
2099                                 continue;
2100                         }
2101
2102                         old_start_empty   = 0;
2103                         jnl->jhdr->start  = jnl->old_start[i];
2104                         jnl->old_start[i] = 0;
2105                         if (free_space(jnl) > desired_size) {
2106                                 unlock_oldstart(jnl);
2107                                 write_journal_header(jnl);
2108                                 lock_oldstart(jnl);
2109                                 break;
2110                         }
2111                 }
2112                 unlock_oldstart(jnl);
2113
2114                 // if we bumped the start, loop and try again
2115                 if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
2116                         continue;
2117                 } else if (old_start_empty) {
2118                         //
2119                         // if there is nothing in old_start anymore then we can
2120                         // bump the jhdr->start to be the same as active_start
2121                         // since it is possible there was only one very large
2122                         // transaction in the old_start array.  if we didn't do
2123                         // this then jhdr->start would never get updated and we
2124                         // would wind up looping until we hit the panic at the
2125                         // start of the loop.
2126                         //
2127                         jnl->jhdr->start = jnl->active_start;
2128                         write_journal_header(jnl);
2129                         continue;
2130                 }
2131
2132
2133                 // if the file system gave us a flush function, call it to so that
2134                 // it can flush some blocks which hopefully will cause some transactions
2135                 // to complete and thus free up space in the journal.
2136                 if (jnl->flush) {
2137                         jnl->flush(jnl->flush_arg);
2138                 }
2139
2140                 // wait for a while to avoid being cpu-bound (this will
2141                 // put us to sleep for 10 milliseconds)
2142                 tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
2143     }
2144
2145     return 0;
2146 }
2147
2148 /*
2149  * Allocate a new active transaction.
2150  */
2151 static errno_t
2152 journal_allocate_transaction(journal *jnl)
2153 {
2154         transaction *tr;
2155
2156         MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
2157     memset(tr, 0, sizeof(transaction));
2158
2159     tr->tbuffer_size = jnl->tbuffer_size;
2160
2161     if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
2162                 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
2163                 jnl->active_tr = NULL;
2164                 return ENOMEM;
2165     }
2166
2167     // journal replay code checksum check depends on this.
2168     memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
2169     // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
2170     memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2171
2172     tr->blhdr = (block_list_header *)tr->tbuffer;
2173     tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2174     tr->blhdr->num_blocks = 1;      // accounts for this header block
2175     tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
2176     tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER;
2177
2178     tr->sequence_num = ++jnl->jhdr->sequence_num;
2179     tr->num_blhdrs  = 1;
2180     tr->total_bytes = jnl->jhdr->blhdr_size;
2181     tr->jnl         = jnl;
2182
2183         jnl->active_tr  = tr;
2184
2185         return 0;
2186 }
2187
2188 int
2189 journal_start_transaction(journal *jnl)
2190 {
2191     int ret;
2192
2193     CHECK_JOURNAL(jnl);
2194
2195     if (jnl->flags & JOURNAL_INVALID) {
2196                 return EINVAL;
2197     }
2198
2199     if (jnl->owner == current_thread()) {
2200                 if (jnl->active_tr == NULL) {
2201                         panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
2202                                   jnl, jnl->owner, current_thread());
2203                 }
2204                 jnl->nested_count++;
2205                 return 0;
2206     }
2207
2208     lock_journal(jnl);
2209
2210     if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
2211                 panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
2212                           jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
2213     }
2214
2215     jnl->owner        = current_thread();
2216     jnl->nested_count = 1;
2217
2218     free_old_stuff(jnl);
2219
2220     // make sure there's room in the journal
2221     if (free_space(jnl) < jnl->tbuffer_size) {
2222         // this is the call that really waits for space to free up
2223         // as well as updating jnl->jhdr->start
2224         if (check_free_space(jnl, jnl->tbuffer_size) != 0) {
2225                 printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name);
2226                 ret = ENOSPC;
2227                 goto bad_start;
2228         }
2229     }
2230
2231     // if there's a buffered transaction, use it.
2232     if (jnl->cur_tr) {
2233                 jnl->active_tr = jnl->cur_tr;
2234                 jnl->cur_tr    = NULL;
2235
2236                 return 0;
2237     }
2238
2239         ret = journal_allocate_transaction(jnl);
2240         if (ret) {
2241                 goto bad_start;
2242         }
2243
2244     // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);
2245
2246     return 0;
2247
2248   bad_start:
2249         jnl->owner        = NULL;
2250         jnl->nested_count = 0;
2251         unlock_journal(jnl);
2252         return ret;
2253 }
2254
2255
2256 int
2257 journal_modify_block_start(journal *jnl, struct buf *bp)
2258 {
2259     transaction *tr;
2260
2261     CHECK_JOURNAL(jnl);
2262
2263     if (jnl->flags & JOURNAL_INVALID) {
2264                 return EINVAL;
2265     }
2266
2267     // XXXdbg - for debugging I want this to be true.  later it may
2268     //          not be necessary.
2269     if ((buf_flags(bp) & B_META) == 0) {
2270                 panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl);
2271     }
2272
2273     tr = jnl->active_tr;
2274     CHECK_TRANSACTION(tr);
2275
2276     if (jnl->owner != current_thread()) {
2277                 panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2278                           jnl, jnl->owner, current_thread());
2279     }
2280
2281     free_old_stuff(jnl);
2282
2283     //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
2284     //   bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2285
2286     // can't allow blocks that aren't an even multiple of the
2287     // underlying block size.
2288     if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) {
2289                 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
2290                           buf_size(bp), jnl->jhdr->jhdr_size);
2291                 return -1;
2292     }
2293
2294     // make sure that this transaction isn't bigger than the whole journal
2295     if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
2296                 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
2297                           tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp);
2298                 return -1;
2299     }
2300
2301     // if the block is dirty and not already locked we have to write
2302     // it out before we muck with it because it has data that belongs
2303     // (presumably) to another transaction.
2304     //
2305     if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) {
2306
2307                 if (buf_flags(bp) & B_ASYNC) {
2308                         panic("modify_block_start: bp @ %p has async flag set!\n", bp);
2309                 }
2310
2311                 // this will cause it to not be buf_brelse()'d
2312                 buf_setflags(bp, B_NORELSE);
2313                 VNOP_BWRITE(bp);
2314     }
2315     buf_setflags(bp, B_LOCKED);
2316
2317     return 0;
2318 }
2319
2320 int
2321 journal_modify_block_abort(journal *jnl, struct buf *bp)
2322 {
2323     transaction *tr;
2324         block_list_header *blhdr;
2325         int i;
2326
2327     CHECK_JOURNAL(jnl);
2328
2329     tr = jnl->active_tr;
2330
2331         //
2332         // if there's no active transaction then we just want to
2333         // call buf_brelse() and return since this is just a block
2334         // that happened to be modified as part of another tr.
2335         //
2336         if (tr == NULL) {
2337                 buf_brelse(bp);
2338                 return 0;
2339         }
2340
2341     if (jnl->flags & JOURNAL_INVALID) {
2342                 return EINVAL;
2343     }
2344
2345     CHECK_TRANSACTION(tr);
2346
2347     if (jnl->owner != current_thread()) {
2348                 panic("jnl: modify_block_abort: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2349                           jnl, jnl->owner, current_thread());
2350     }
2351
2352     free_old_stuff(jnl);
2353
2354     // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
2355
2356     // first check if it's already part of this transaction
2357     for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2358                 for(i=1; i < blhdr->num_blocks; i++) {
2359                         if (bp == blhdr->binfo[i].b.bp) {
2360                                 if (buf_size(bp) != blhdr->binfo[i].bsize) {
2361                                         panic("jnl: bp @ %p changed size on me! (%d vs. %lu, jnl %p)\n",
2362                                                   bp, buf_size(bp), blhdr->binfo[i].bsize, jnl);
2363                                 }
2364                                 break;
2365                         }
2366                 }
2367
2368                 if (i < blhdr->num_blocks) {
2369                         break;
2370                 }
2371     }
2372
2373         //
2374         // if blhdr is null, then this block has only had modify_block_start
2375         // called on it as part of the current transaction.  that means that
2376         // it is ok to clear the LOCKED bit since it hasn't actually been
2377         // modified.  if blhdr is non-null then modify_block_end was called
2378         // on it and so we need to keep it locked in memory.
2379         //
2380         if (blhdr == NULL) {
2381                   buf_clearflags(bp, B_LOCKED);
2382         }
2383
2384     buf_brelse(bp);
2385     return 0;
2386 }
2387
2388
2389 int
2390 journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf *bp, void *arg), void *arg)
2391 {
2392     int                i = 1;
2393     int                tbuffer_offset=0;
2394     char              *blkptr;
2395     block_list_header *blhdr, *prev=NULL;
2396     transaction       *tr;
2397
2398     CHECK_JOURNAL(jnl);
2399
2400     if (jnl->flags & JOURNAL_INVALID) {
2401                 return EINVAL;
2402     }
2403
2404     tr = jnl->active_tr;
2405     CHECK_TRANSACTION(tr);
2406
2407     if (jnl->owner != current_thread()) {
2408                 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2409                           jnl, jnl->owner, current_thread());
2410     }
2411
2412     free_old_stuff(jnl);
2413
2414     //printf("jnl: mod block end:  (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n",
2415     //   bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2416
2417     if ((buf_flags(bp) & B_LOCKED) == 0) {
2418                 panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp, jnl);
2419     }
2420
2421     // first check if it's already part of this transaction
2422     for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2423                 tbuffer_offset = jnl->jhdr->blhdr_size;
2424
2425                 for(i=1; i < blhdr->num_blocks; i++) {
2426                         if (bp == blhdr->binfo[i].b.bp) {
2427                                 if (buf_size(bp) != blhdr->binfo[i].bsize) {
2428                                         panic("jnl: bp @ %p changed size on me! (%d vs. %lu, jnl %p)\n",
2429                                                   bp, buf_size(bp), blhdr->binfo[i].bsize, jnl);
2430                                 }
2431                                 break;
2432                         }
2433                         tbuffer_offset += blhdr->binfo[i].bsize;
2434                 }
2435
2436                 if (i < blhdr->num_blocks) {
2437                         break;
2438                 }
2439     }
2440
2441     if (blhdr == NULL
2442                 && prev
2443                 && (prev->num_blocks+1) <= prev->max_blocks
2444                 && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) {
2445                 blhdr = prev;
2446     } else if (blhdr == NULL) {
2447                 block_list_header *nblhdr;
2448
2449                 if (prev == NULL) {
2450                         panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp);
2451                 }
2452
2453                 // we got to the end of the list, didn't find the block and there's
2454                 // no room in the block_list_header pointed to by prev
2455
2456                 // we allocate another tbuffer and link it in at the end of the list
2457                 // through prev->binfo[0].bnum.  that's a skanky way to do things but
2458                 // avoids having yet another linked list of small data structures to manage.
2459
2460                 if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
2461                         panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n",
2462                                   tr, tr->total_bytes);
2463                 }
2464
2465                 // journal replay code checksum check depends on this.
2466                 memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
2467                 // Fill up the rest of the block with unimportant bytes
2468                 memset(nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2469
2470                 // initialize the new guy
2471                 nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2472                 nblhdr->num_blocks = 1;      // accounts for this header block
2473                 nblhdr->bytes_used = jnl->jhdr->blhdr_size;
2474                 nblhdr->flags = BLHDR_CHECK_CHECKSUMS;
2475
2476                 tr->num_blhdrs++;
2477                 tr->total_bytes += jnl->jhdr->blhdr_size;
2478
2479                 // then link him in at the end
2480                 prev->binfo[0].bnum = (off_t)((long)nblhdr);
2481
2482                 // and finally switch to using the new guy
2483                 blhdr          = nblhdr;
2484                 tbuffer_offset = jnl->jhdr->blhdr_size;
2485                 i              = 1;
2486     }
2487
2488
2489     if ((i+1) > blhdr->max_blocks) {
2490                 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
2491     }
2492
2493         // if the function pointer is not set then copy the
2494         // block of data now.  if the function pointer is set
2495         // the copy will happen after calling the callback in
2496         // end_transaction() just before it goes to disk.
2497         //
2498         if (func == NULL) {
2499                 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
2500                 memcpy(blkptr, (char *)0 + buf_dataptr(bp), buf_size(bp));
2501         }
2502
2503     // if this is true then this is a new block we haven't seen
2504     if (i >= blhdr->num_blocks) {
2505                 int     bsize;
2506                 vnode_t vp;
2507
2508                 vp = buf_vnode(bp);
2509                 vnode_ref(vp);
2510                 bsize = buf_size(bp);
2511
2512                 blhdr->binfo[i].bnum  = (off_t)(buf_blkno(bp));
2513                 blhdr->binfo[i].bsize = bsize;
2514                 blhdr->binfo[i].b.bp    = bp;
2515                 if (func) {
2516                         void *old_func=NULL, *old_arg=NULL;
2517
2518                         buf_setfilter(bp, func, arg, &old_func, &old_arg);
2519                         if (old_func != NULL) {
2520                                 panic("jnl: modify_block_end: old func %p / arg %p", old_func, old_arg);
2521                         }
2522                 }
2523
2524                 blhdr->bytes_used += bsize;
2525                 tr->total_bytes   += bsize;
2526
2527                 blhdr->num_blocks++;
2528     }
2529     buf_bdwrite(bp);
2530
2531     return 0;
2532 }
2533
2534 int
2535 journal_kill_block(journal *jnl, struct buf *bp)
2536 {
2537     int                i;
2538     int                bflags;
2539     block_list_header *blhdr;
2540     transaction       *tr;
2541
2542     CHECK_JOURNAL(jnl);
2543
2544     if (jnl->flags & JOURNAL_INVALID) {
2545                 return EINVAL;
2546     }
2547
2548     tr = jnl->active_tr;
2549     CHECK_TRANSACTION(tr);
2550
2551     if (jnl->owner != current_thread()) {
2552                 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2553                           jnl, jnl->owner, current_thread());
2554     }
2555
2556     free_old_stuff(jnl);
2557
2558     bflags = buf_flags(bp);
2559
2560     if ( !(bflags & B_LOCKED))
2561             panic("jnl: modify_block_end: called with bp not B_LOCKED");
2562
2563     /*
2564      * bp must be BL_BUSY and B_LOCKED
2565      */
2566     // first check if it's already part of this transaction
2567     for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2568
2569                 for(i=1; i < blhdr->num_blocks; i++) {
2570                         if (bp == blhdr->binfo[i].b.bp) {
2571                                 vnode_t vp;
2572
2573                                 buf_clearflags(bp, B_LOCKED);
2574
2575                                 // this undoes the vnode_ref() in journal_modify_block_end()
2576                                 vp = buf_vnode(bp);
2577                                 vnode_rele_ext(vp, 0, 1);
2578
2579                                 // if the block has the DELWRI and FILTER bits sets, then
2580                                 // things are seriously weird.  if it was part of another
2581                                 // transaction then journal_modify_block_start() should
2582                                 // have force it to be written.
2583                                 //
2584                                 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
2585                                 //      panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
2586                                 //} else {
2587                                         tr->num_killed += buf_size(bp);
2588                                 //}
2589                                 blhdr->binfo[i].b.bp   = NULL;
2590                                 blhdr->binfo[i].bnum = (off_t)-1;
2591
2592                                 buf_markinvalid(bp);
2593                                 buf_brelse(bp);
2594
2595                                 break;
2596                         }
2597                 }
2598
2599                 if (i < blhdr->num_blocks) {
2600                         break;
2601                 }
2602     }
2603
2604     return 0;
2605 }
2606
2607
2608 static int
2609 journal_binfo_cmp(const void *a, const void *b)
2610 {
2611     const block_info *bi_a = (const struct block_info *)a;
2612     const block_info *bi_b = (const struct block_info *)b;
2613     daddr64_t res;
2614
2615     if (bi_a->b.bp == NULL) {
2616                 return 1;
2617     }
2618     if (bi_b->b.bp == NULL) {
2619                 return -1;
2620     }
2621
2622     // don't have to worry about negative block
2623     // numbers so this is ok to do.
2624     //
2625     res = (buf_blkno(bi_a->b.bp) - buf_blkno(bi_b->b.bp));
2626
2627     return (int)res;
2628 }
2629
2630
2631 /*
2632  * End a transaction.  If the transaction is small enough, and we're not forcing
2633  * a write to disk, the "active" transaction becomes the "current" transaction,
2634  * and will be reused for the next transaction that is started (group commit).
2635  *
2636  * If the transaction gets written to disk (because force_it is true, or no
2637  * group commit, or the transaction is sufficiently full), the blocks get
2638  * written into the journal first, then the are written asynchronously.  When
2639  * those async writes complete, the transaction can be freed and removed from
2640  * the journal.
2641  *
2642  * An optional callback can be supplied.  If given, it is called after the
2643  * the blocks have been written to the journal, but before the async writes
2644  * of those blocks to their normal on-disk locations.  This is used by
2645  * journal_relocate so that the location of the journal can be changed and
2646  * flushed to disk before the blocks get written to their normal locations.
2647  * Note that the callback is only called if the transaction gets written to
2648  * the journal during this end_transaction call; you probably want to set the
2649  * force_it flag.
2650  *
2651  * Inputs:
2652  *      tr                       Transaction to add to the journal
2653  *      force_it         If true, force this transaction to the on-disk journal immediately.
2654  *      callback         See description above.  Pass NULL for no callback.
2655  *      callback_arg Argument passed to callback routine.
2656  *
2657  * Result
2658  *               0              No errors
2659  *              -1              An error occurred.  The journal is marked invalid.
2660  */
2661 static int
2662 end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg)
2663 {
2664     int                 i, ret, amt;
2665     errno_t             errno;
2666     off_t               end;
2667     journal            *jnl = tr->jnl;
2668     struct buf         *bp, **bparray;
2669     block_list_header  *blhdr=NULL, *next=NULL;
2670     size_t              tbuffer_offset;
2671
2672         if (jnl->cur_tr) {
2673                 panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
2674                           jnl, jnl->cur_tr, tr);
2675         }
2676
2677     // if there weren't any modified blocks in the transaction
2678     // just save off the transaction pointer and return.
2679     if (tr->total_bytes == jnl->jhdr->blhdr_size) {
2680                 jnl->cur_tr = tr;
2681                 return 0;
2682     }
2683
2684     // if our transaction buffer isn't very full, just hang
2685     // on to it and don't actually flush anything.  this is
2686     // what is known as "group commit".  we will flush the
2687     // transaction buffer if it's full or if we have more than
2688     // one of them so we don't start hogging too much memory.
2689     //
2690     if (   force_it == 0
2691                    && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
2692                    && tr->num_blhdrs < 3
2693                    && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) {
2694
2695                 jnl->cur_tr = tr;
2696                 return 0;
2697     }
2698
2699
2700     // if we're here we're going to flush the transaction buffer to disk.
2701     // make sure there is room in the journal first.
2702     check_free_space(jnl, tr->total_bytes);
2703
2704     // range check the end index
2705     if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
2706                 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
2707                           jnl->jhdr->end, jnl->jhdr->size);
2708     }
2709
2710     // this transaction starts where the current journal ends
2711     tr->journal_start = jnl->jhdr->end;
2712     end               = jnl->jhdr->end;
2713
2714         //
2715         // if the first entry in old_start[] isn't free yet, loop calling the
2716         // file system flush routine until it is (or we panic).
2717         //
2718         i = 0;
2719         lock_oldstart(jnl);
2720         while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
2721                 if (jnl->flush) {
2722                         unlock_oldstart(jnl);
2723
2724                         if (jnl->flush) {
2725                                 jnl->flush(jnl->flush_arg);
2726                         }
2727
2728                         // yield the cpu so others can get in to clear the lock bit
2729                         (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);
2730
2731                         lock_oldstart(jnl);
2732                 }
2733                 if (i++ >= 500) {
2734                         panic("jnl: transaction that started at 0x%llx is not completing! jnl %p\n",
2735                                   jnl->old_start[0] & (~0x8000000000000000LL), jnl);
2736                 }
2737         }
2738
2739         //
2740         // slide everyone else down and put our latest guy in the last
2741         // entry in the old_start array
2742         //
2743         memcpy(&jnl->old_start[0], &jnl->old_start[1], sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
2744         jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
2745
2746         unlock_oldstart(jnl);
2747
2748
2749     // for each block, make sure that the physical block # is set
2750     for(blhdr=tr->blhdr; blhdr; blhdr=next) {
2751                 char *blkptr;
2752
2753                 tbuffer_offset = jnl->jhdr->blhdr_size;
2754                 for(i=1; i < blhdr->num_blocks; i++) {
2755                         daddr64_t blkno;
2756                         daddr64_t lblkno;
2757                         struct vnode *vp;
2758
2759                         bp = blhdr->binfo[i].b.bp;
2760
2761                         // if this block has a callback function set, call
2762                         // it now and then copy the data from the bp into
2763                         // the journal.
2764                         if (bp) {
2765                                 void (*func)(struct buf *, void *);
2766                                 void  *arg;
2767
2768                                 buf_setfilter(bp, NULL, NULL, (void **)&func, &arg);
2769
2770                                 if (func) {
2771                                         // acquire the bp here so that we can safely
2772                                         // mess around with its data.  buf_acquire()
2773                                         // will return EAGAIN if the buffer was busy,
2774                                         // so loop trying again.
2775                                         do {
2776                                                 errno = buf_acquire(bp, 0, 0, 0);
2777                                         } while (errno == EAGAIN);
2778
2779                                         if (errno == 0) {
2780
2781                                                 // call the hook function and then copy the
2782                                                 // data into the transaction buffer...
2783                                                 func(bp, arg);
2784
2785                                                 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
2786                                                 memcpy(blkptr, (char *)buf_dataptr(bp), buf_size(bp));
2787
2788                                                 buf_drop(bp);
2789                                         } else {
2790                                                 panic("could not acquire bp %p (err %d)\n", bp, errno);
2791                                         }
2792                                 }
2793
2794                         } else {   // bp == NULL, only true if a block was "killed"
2795                                 if (blhdr->binfo[i].bnum != (off_t)-1) {
2796                                         panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
2797                                                 blhdr->binfo[i].bnum, jnl, tr);
2798                                 }
2799
2800                                 tbuffer_offset += blhdr->binfo[i].bsize;
2801                                 continue;
2802                         }
2803
2804                         tbuffer_offset += blhdr->binfo[i].bsize;
2805
2806                         vp = buf_vnode(bp);
2807                         blkno = buf_blkno(bp);
2808                         lblkno = buf_lblkno(bp);
2809
2810                         if (vp == NULL && lblkno == blkno) {
2811                             printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd.  aborting the transaction (tr %p jnl %p).\n",
2812                                 jnl->jdev_name, bp, lblkno, blkno, tr, jnl);
2813                             goto bad_journal;
2814                         }
2815
2816                         // if the lblkno is the same as blkno and this bp isn't
2817                         // associated with the underlying file system device then
2818                         // we need to call bmap() to get the actual physical block.
2819                         //
2820                         if ((lblkno == blkno) && (vp != jnl->fsdev)) {
2821                                 off_t   f_offset;
2822                                 size_t  contig_bytes;
2823
2824                                 if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) {
2825                                         printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2826                                         goto bad_journal;
2827                                 }
2828                                 if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) {
2829                                         printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2830                                         goto bad_journal;
2831                                 }
2832                                 if ((uint32_t)contig_bytes < buf_count(bp)) {
2833                                         printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
2834                                         goto bad_journal;
2835                                 }
2836                                 buf_setblkno(bp, blkno);
2837                         }
2838                         // update this so we write out the correct physical block number!
2839                         blhdr->binfo[i].bnum = (off_t)(blkno);
2840                 }
2841
2842                 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2843     }
2844
2845
2846
2847     for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2848                 amt = blhdr->bytes_used;
2849
2850                 blhdr->binfo[0].b.sequence_num = tr->sequence_num;
2851
2852                 blhdr->checksum = 0;
2853                 blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
2854
2855                 if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) {
2856                     panic("can't allocate %lu bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *));
2857                 }
2858
2859                 // calculate individual block checksums
2860                 tbuffer_offset = jnl->jhdr->blhdr_size;
2861                 for(i=1; i < blhdr->num_blocks; i++) {
2862                     bparray[i] = blhdr->binfo[i].b.bp;
2863                     if (bparray[i]) {
2864                         blhdr->binfo[i].b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], blhdr->binfo[i].bsize);
2865                     } else {
2866                         blhdr->binfo[i].b.cksum = 0;
2867                     }
2868
2869                     tbuffer_offset += blhdr->binfo[i].bsize;
2870                 }
2871
2872                 ret = write_journal_data(jnl, &end, blhdr, amt);
2873
2874                 // always put the bp pointers back
2875                 for(i=1; i < blhdr->num_blocks; i++) {
2876                     blhdr->binfo[i].b.bp = bparray[i];
2877                 }
2878
2879                 kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *));
2880
2881                 if (ret != amt) {
2882                         printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n",
2883                             jnl->jdev_name, ret, amt);
2884
2885                         goto bad_journal;
2886                 }
2887     }
2888
2889     jnl->jhdr->end  = end;    // update where the journal now ends
2890     tr->journal_end = end;    // the transaction ends here too
2891     if (tr->journal_start == 0 || tr->journal_end == 0) {
2892                 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
2893                           tr->journal_start, tr->journal_end);
2894     }
2895
2896     if (write_journal_header(jnl) != 0) {
2897                 goto bad_journal;
2898     }
2899
2900         /*
2901          * If the caller supplied a callback, call it now that the blocks have been
2902          * written to the journal.  This is used by journal_relocate so, for example,
2903          * the file system can change its pointer to the new journal.
2904          */
2905         if (callback != NULL && callback(callback_arg) != 0) {
2906                 goto bad_journal;
2907         }
2908
2909     //
2910     // setup for looping through all the blhdr's.  we null out the
2911     // tbuffer and blhdr fields so that they're not used any more.
2912     //
2913     blhdr       = tr->blhdr;
2914     tr->tbuffer = NULL;
2915     tr->blhdr   = NULL;
2916
2917     // the buffer_flushed_callback will only be called for the
2918     // real blocks that get flushed so we have to account for
2919     // the block_list_headers here.
2920     //
2921     tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
2922
2923     // for each block, set the iodone callback and unlock it
2924     for(; blhdr; blhdr=next) {
2925
2926                 // we can re-order the buf ptrs because everything is written out already
2927                 qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp);
2928
2929                 for(i=1; i < blhdr->num_blocks; i++) {
2930                         if (blhdr->binfo[i].b.bp == NULL) {
2931                                 continue;
2932                         }
2933
2934                         bp = blhdr->binfo[i].b.bp;
2935
2936                         // have to pass BAC_REMOVE here because we're going to bawrite()
2937                         // the buffer when we're done
2938                         do {
2939                                 errno = buf_acquire(bp, BAC_REMOVE, 0, 0);
2940                         } while (errno == EAGAIN);
2941
2942                         if (errno == 0) {
2943                                 struct vnode *save_vp;
2944                                 void *cur_filter;
2945
2946                                 if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
2947                                         if (jnl->flags & JOURNAL_CLOSE_PENDING) {
2948                                             buf_clearflags(bp, B_LOCKED);
2949                                             buf_brelse(bp);
2950                                                 continue;
2951                                         } else {
2952                                                 panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp));
2953                                         }
2954                                 }
2955                                 save_vp = buf_vnode(bp);
2956
2957                                 buf_setfilter(bp, buffer_flushed_callback, tr, &cur_filter, NULL);
2958
2959                                 if (cur_filter) {
2960                                         panic("jnl: bp @ %p (blkno %qd, vp %p) has non-null iodone (%p) buffflushcb %p\n",
2961                                                   bp, buf_blkno(bp), save_vp, cur_filter, buffer_flushed_callback);
2962                                 }
2963                                 buf_clearflags(bp, B_LOCKED);
2964
2965                                 // kicking off the write here helps performance
2966                                 buf_bawrite(bp);
2967                                 // XXXdbg this is good for testing: buf_bdwrite(bp);
2968                                 //buf_bdwrite(bp);
2969
2970                                 // this undoes the vnode_ref() in journal_modify_block_end()
2971                                 vnode_rele_ext(save_vp, 0, 1);
2972                         } else {
2973                                 printf("jnl: %s: end_transaction: could not acquire block %p (errno %d)!\n",
2974                                     jnl->jdev_name,bp, errno);
2975                         }
2976                 }
2977
2978                 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2979
2980                 // we can free blhdr here since we won't need it any more
2981                 blhdr->binfo[0].bnum = 0xdeadc0de;
2982                 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
2983     }
2984
2985     //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
2986     //   tr, tr->journal_start, tr->journal_end);
2987     return 0;
2988
2989
2990   bad_journal:
2991     jnl->flags |= JOURNAL_INVALID;
2992     jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
2993     abort_transaction(jnl, tr);
2994     return -1;
2995 }
2996
2997 static void
2998 abort_transaction(journal *jnl, transaction *tr)
2999 {
3000     int                i;
3001     errno_t             errno;
3002     block_list_header *blhdr, *next;
3003     struct buf        *bp;
3004     struct vnode      *save_vp;
3005
3006     // for each block list header, iterate over the blocks then
3007     // free up the memory associated with the block list.
3008     //
3009     // for each block, clear the lock bit and release it.
3010     //
3011     for(blhdr=tr->blhdr; blhdr; blhdr=next) {
3012
3013                 for(i=1; i < blhdr->num_blocks; i++) {
3014                         if (blhdr->binfo[i].b.bp == NULL) {
3015                                 continue;
3016                         }
3017                         if ( (buf_vnode(blhdr->binfo[i].b.bp) == NULL) ||
3018                              !(buf_flags(blhdr->binfo[i].b.bp) & B_LOCKED) ) {
3019                                 continue;
3020                         }
3021
3022                         errno = buf_meta_bread(buf_vnode(blhdr->binfo[i].b.bp),
3023                                                          buf_lblkno(blhdr->binfo[i].b.bp),
3024                                                          buf_size(blhdr->binfo[i].b.bp),
3025                                                          NOCRED,
3026                                                          &bp);
3027                         if (errno == 0) {
3028                                 if (bp != blhdr->binfo[i].b.bp) {
3029                                         panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n",
3030                                                   bp, blhdr->binfo[i].b.bp, jnl);
3031                                 }
3032
3033                                 // releasing a bp marked invalid
3034                                 // also clears the locked and delayed state
3035                                 buf_markinvalid(bp);
3036                                 save_vp = buf_vnode(bp);
3037
3038                                 buf_brelse(bp);
3039
3040                                 vnode_rele_ext(save_vp, 0, 1);
3041                         } else {
3042                                 printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n",
3043                                     jnl->jdev_name, blhdr->binfo[i].bnum, blhdr->binfo[i].b.bp);
3044                                 if (bp) {
3045                                         buf_brelse(bp);
3046                                 }
3047                         }
3048                 }
3049
3050                 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
3051
3052                 // we can free blhdr here since we won't need it any more
3053                 blhdr->binfo[0].bnum = 0xdeadc0de;
3054                 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
3055     }
3056
3057     tr->tbuffer     = NULL;
3058     tr->blhdr       = NULL;
3059     tr->total_bytes = 0xdbadc0de;
3060         FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
3061 }
3062
3063
3064 int
3065 journal_end_transaction(journal *jnl)
3066 {
3067     int ret;
3068         transaction *tr;
3069
3070     CHECK_JOURNAL(jnl);
3071
3072         if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
3073                 return 0;
3074         }
3075
3076     if (jnl->owner != current_thread()) {
3077                 panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
3078                           jnl, jnl->owner, current_thread());
3079     }
3080
3081     free_old_stuff(jnl);
3082
3083     jnl->nested_count--;
3084     if (jnl->nested_count > 0) {
3085                 return 0;
3086     } else if (jnl->nested_count < 0) {
3087                 panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
3088     }
3089
3090     if (jnl->flags & JOURNAL_INVALID) {
3091                 if (jnl->active_tr) {
3092                         if (jnl->cur_tr != NULL) {
3093                                 panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
3094                                           jnl, jnl->active_tr, jnl->cur_tr);
3095                         }
3096
3097                         tr             = jnl->active_tr;
3098                         jnl->active_tr = NULL;
3099                         abort_transaction(jnl, tr);
3100                 }
3101
3102                 jnl->owner = NULL;
3103                 unlock_journal(jnl);
3104
3105                 return EINVAL;
3106     }
3107
3108     tr = jnl->active_tr;
3109     CHECK_TRANSACTION(tr);
3110
3111     // clear this out here so that when check_free_space() calls
3112     // the FS flush function, we don't panic in journal_flush()
3113     // if the FS were to call that.  note: check_free_space() is
3114     // called from end_transaction().
3115     //
3116     jnl->active_tr = NULL;
3117     ret = end_transaction(tr, 0, NULL, NULL);
3118
3119     jnl->owner = NULL;
3120     unlock_journal(jnl);
3121
3122     return ret;
3123 }
3124
3125
3126 int
3127 journal_flush(journal *jnl)
3128 {
3129     int need_signal = 0;
3130
3131     CHECK_JOURNAL(jnl);
3132
3133     if (jnl->flags & JOURNAL_INVALID) {
3134                 return -1;
3135     }
3136
3137     KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH))
3138         | DBG_FUNC_START, 0, 0, 0, 0, 0);
3139
3140     if (jnl->owner != current_thread()) {
3141                 lock_journal(jnl);
3142                 need_signal = 1;
3143     }
3144
3145     free_old_stuff(jnl);
3146
3147     // if we're not active, flush any buffered transactions
3148     if (jnl->active_tr == NULL && jnl->cur_tr) {
3149                 transaction *tr = jnl->cur_tr;
3150
3151                 jnl->cur_tr = NULL;
3152                 end_transaction(tr, 1, NULL, NULL);   // force it to get flushed
3153     }
3154
3155     if (need_signal) {
3156                 unlock_journal(jnl);
3157     }
3158
3159     KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH))
3160         | DBG_FUNC_END, 0, 0, 0, 0, 0);
3161
3162     return 0;
3163 }
3164
3165 int
3166 journal_active(journal *jnl)
3167 {
3168     if (jnl->flags & JOURNAL_INVALID) {
3169                 return -1;
3170     }
3171
3172     return (jnl->active_tr == NULL) ? 0 : 1;
3173 }
3174
3175 void *
3176 journal_owner(journal *jnl)
3177 {
3178     return jnl->owner;
3179 }
3180
3181 int journal_uses_fua(journal *jnl)
3182 {
3183         if (jnl->flags & JOURNAL_DO_FUA_WRITES)
3184                 return 1;
3185         return 0;
3186 }
3187
3188 /*
3189  * Relocate the journal.
3190  *
3191  * You provide the new starting offset and size for the journal. You may
3192  * optionally provide a new tbuffer_size; passing zero defaults to not
3193  * changing the tbuffer size except as needed to fit within the new journal
3194  * size.
3195  *
3196  * You must have already started a transaction. The transaction may contain
3197  * modified blocks (such as those needed to deallocate the old journal,
3198  * allocate the new journal, and update the location and size of the journal
3199  * in filesystem-private structures). Any transactions prior to the active
3200  * transaction will be flushed to the old journal. The new journal will be
3201  * initialized, and the blocks from the active transaction will be written to
3202  * the new journal.
3203  *
3204  * The caller will need to update the structures that identify the location
3205  * and size of the journal.  These updates should be made in the supplied
3206  * callback routine.  These updates must NOT go into a transaction.  You should
3207  * force these updates to the media before returning from the callback.  In the
3208  * even of a crash, either the old journal will be found, with an empty journal,
3209  * or the new journal will be found with the contents of the active transaction.
3210  *
3211  * Upon return from the callback, the blocks from the active transaction are
3212  * written to their normal locations on disk.
3213  *
3214  * (Remember that we have to ensure that blocks get committed to the journal
3215  * before being committed to their normal locations.  But the blocks don't count
3216  * as committed until the new journal is pointed at.)
3217  *
3218  * Upon return, there is still an active transaction: newly allocated, and
3219  * with no modified blocks.  Call journal_end_transaction as normal.  You may
3220  * modifiy additional blocks before calling journal_end_transaction, and those
3221  * blocks will (eventually) go to the relocated journal.
3222  *
3223  * Inputs:
3224  *      jnl                             The (opened) journal to relocate.
3225  *      offset                  The new journal byte offset (from start of the journal device).
3226  *      journal_size    The size, in bytes, of the new journal.
3227  *      tbuffer_size    The new desired transaction buffer size.  Pass zero to keep
3228  *                                      the same size as the current journal.  The size will be
3229  *                                      modified as needed to fit the new journal.
3230  *      callback                Routine called after the new journal has been initialized,
3231  *                                      and the active transaction written to the new journal, but
3232  *                                      before the blocks are written to their normal locations.
3233  *                                      Pass NULL for no callback.
3234  *      callback_arg    An argument passed to the callback routine.
3235  *
3236  * Result:
3237  *      0                               No errors
3238  *      EINVAL                  The offset is not block aligned
3239  *      EINVAL                  The journal_size is not a multiple of the block size
3240  *      EINVAL                  The journal is invalid
3241  *      (any)                   An error returned by journal_flush.
3242  *
3243  */
3244 int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size,
3245         errno_t (*callback)(void *), void *callback_arg)
3246 {
3247         int ret;
3248         transaction *tr;
3249
3250         /*
3251          * Sanity check inputs, and adjust the size of the transaction buffer.
3252          */
3253     if ((offset % jnl->jhdr->jhdr_size) != 0) {
3254                 printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n",
3255                     jnl->jdev_name, offset, jnl->jhdr->jhdr_size);
3256                 return EINVAL;
3257     }
3258     if ((journal_size % jnl->jhdr->jhdr_size) != 0) {
3259                 printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n",
3260                     jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size);
3261                 return EINVAL;
3262     }
3263
3264     CHECK_JOURNAL(jnl);
3265
3266         /* Guarantee we own the active transaction. */
3267     if (jnl->flags & JOURNAL_INVALID) {
3268                 return EINVAL;
3269     }
3270     if (jnl->owner != current_thread()) {
3271         panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n",
3272                 jnl, jnl->owner, current_thread());
3273         }
3274
3275     if (tbuffer_size == 0)
3276         tbuffer_size = jnl->tbuffer_size;
3277     size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size);
3278
3279         /*
3280          * Flush any non-active transactions.  We have to temporarily hide the
3281          * active transaction to make journal_flush flush out non-active but
3282          * current (unwritten) transactions.
3283          */
3284         tr = jnl->active_tr;
3285         CHECK_TRANSACTION(tr);
3286         jnl->active_tr = NULL;
3287         ret = journal_flush(jnl);
3288         jnl->active_tr = tr;
3289         if (ret) {
3290                 return ret;
3291         }
3292
3293         /* Update the journal's offset and size in memory. */
3294         jnl->jdev_offset = offset;
3295         jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
3296         jnl->jhdr->size = journal_size;
3297         jnl->active_start = jnl->jhdr->start;
3298
3299         /*
3300          * Force the active transaction to be written to the new journal.  Call the
3301          * supplied callback after the blocks have been written to the journal, but
3302          * before they get written to their normal on-disk locations.
3303          */
3304         jnl->active_tr = NULL;
3305         ret = end_transaction(tr, 1, callback, callback_arg);
3306         if (ret) {
3307                 printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret);
3308                 goto bad_journal;
3309         }
3310
3311         /*
3312          * Create a new, empty transaction to be the active transaction.  This way
3313          * our caller can use journal_end_transaction as usual.
3314          */
3315         ret = journal_allocate_transaction(jnl);
3316         if (ret) {
3317                 printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl->jdev_name, ret);
3318                 goto bad_journal;
3319         }
3320
3321         return 0;
3322
3323 bad_journal:
3324     jnl->flags |= JOURNAL_INVALID;
3325     abort_transaction(jnl, tr);
3326     return ret;
3327 }