/*
 * Copyright (c) 2002-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
//
// This file implements a simple write-ahead journaling layer.
// In theory any file system can make use of it by calling these
// functions when the fs wants to modify meta-data blocks. See
// vfs_journal.h for a more detailed description of the api and
// data structures.
//
// Dominic Giampaolo (dbg@apple.com)
//

#ifdef KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file_internal.h>
#include <sys/stat.h>
#include <sys/buf_internal.h>
#include <sys/proc_internal.h>
#include <sys/mount_internal.h>
#include <sys/namei.h>
#include <sys/vnode_internal.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/ubc.h>
#include <sys/malloc.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/kalloc.h>
#include <sys/disk.h>
#include <sys/kdebug.h>
#include <miscfs/specfs/specdev.h>
#include <libkern/OSAtomic.h>	/* OSAddAtomic */

kern_return_t	thread_terminate(thread_t);

/*
 * Set sysctl vfs.generic.jnl.kdebug.trim=1 to enable KERNEL_DEBUG_CONSTANT
 * logging of trim-related calls within the journal.  (They're
 * disabled by default because there can be a lot of these events,
 * and we don't want to overwhelm the kernel debug buffer.  If you
 * want to watch these events in particular, just set the sysctl.)
 */
static int jnl_kdebug = 0;
SYSCTL_DECL(_vfs_generic);
SYSCTL_NODE(_vfs_generic, OID_AUTO, jnl, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal");
SYSCTL_NODE(_vfs_generic_jnl, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal kdebug");
SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &jnl_kdebug, 0, "Enable kdebug logging for journal TRIM");

#define DBG_JOURNAL_FLUSH			FSDBG_CODE(DBG_JOURNAL, 1)
#define DBG_JOURNAL_TRIM_ADD			FSDBG_CODE(DBG_JOURNAL, 2)
#define DBG_JOURNAL_TRIM_REMOVE			FSDBG_CODE(DBG_JOURNAL, 3)
#define DBG_JOURNAL_TRIM_REMOVE_PENDING		FSDBG_CODE(DBG_JOURNAL, 4)
#define DBG_JOURNAL_TRIM_REALLOC		FSDBG_CODE(DBG_JOURNAL, 5)
#define DBG_JOURNAL_TRIM_FLUSH			FSDBG_CODE(DBG_JOURNAL, 6)
#define DBG_JOURNAL_TRIM_UNMAP			FSDBG_CODE(DBG_JOURNAL, 7)

/*
 * Cap the journal max size to 2GB.  On HFS, it will attempt to occupy
 * a full allocation block if the current size is smaller than the allocation
 * block on which it resides.  Once we hit the exabyte filesystem range, then
 * it will use 2GB allocation blocks.  As a result, make the cap 2GB.
 */
#define MAX_JOURNAL_SIZE 0x80000000U

#include <sys/sdt.h> /* DTRACE_IO1 */
#else

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdarg.h>
#include <sys/types.h>
#include "compat.h"

#endif   /* KERNEL */

#include "vfs_journal.h"

#include <sys/kdebug.h>

#if 0
#undef KERNEL_DEBUG
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif


#ifndef CONFIG_HFS_TRIM
#define CONFIG_HFS_TRIM 0
#endif


#if JOURNALING

//
// By default, we grow the list of extents to trim by 4K at a time.
// We'll opt to flush a transaction if it contains at least
// JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number
// of modified blocks is small).
//
enum {
	JOURNAL_DEFAULT_TRIM_BYTES   = 4096,
	JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t),
	JOURNAL_FLUSH_TRIM_EXTENTS   = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16
};
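// Worked example (illustrative): with dk_extent_t being a pair of 64-bit
// fields (offset, length), i.e. 16 bytes, the defaults above work out to
// 4096 / 16 = 256 extents per trim list, and a flush is triggered once a
// transaction accumulates 256 * 15 / 16 = 240 extents to be trimmed.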

unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS;
SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush");

/* XXX the next prototype should come from <libsa/stdlib.h>, but that conflicts with libkern */
__private_extern__ void qsort(
	void * array,
	size_t nmembers,
	size_t member_size,
	int (*)(const void *, const void *));


// number of bytes to checksum in a block_list_header
// NOTE: this should be enough to cover the header
//       fields as well as the first entry of binfo[]
#define BLHDR_CHECKSUM_SIZE 32

static void lock_condition(journal *jnl, boolean_t *condition, const char *condition_name);
static void wait_condition(journal *jnl, boolean_t *condition, const char *condition_name);
static void unlock_condition(journal *jnl, boolean_t *condition);
static void finish_end_thread(transaction *tr);
static void write_header_thread(journal *jnl);
static int finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg);
static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait);
static void abort_transaction(journal *jnl, transaction *tr);
static void dump_journal(journal *jnl);

static __inline__ void lock_oldstart(journal *jnl);
static __inline__ void unlock_oldstart(journal *jnl);
static __inline__ void lock_flush(journal *jnl);
static __inline__ void unlock_flush(journal *jnl);


//
// 3105942 - Coalesce writes to the same block on journal replay
//

typedef struct bucket {
	off_t	 block_num;
	uint32_t jnl_offset;
	uint32_t block_size;
	int32_t	 cksum;
} bucket;

#define STARTING_BUCKETS 256

static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size);
static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full);
static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting);

#define CHECK_JOURNAL(jnl) \
	do { \
	if (jnl == NULL) { \
		panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \
	} \
	if (jnl->jdev == NULL) { \
		panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \
	} \
	if (jnl->fsdev == NULL) { \
		panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \
	} \
	if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) { \
		panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \
		__FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \
	} \
	if (   jnl->jhdr->start <= 0 \
	    || jnl->jhdr->start > jnl->jhdr->size) { \
		panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
		__FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \
	} \
	if (   jnl->jhdr->end <= 0 \
	    || jnl->jhdr->end > jnl->jhdr->size) { \
		panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
		__FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \
	} \
	} while(0)

#define CHECK_TRANSACTION(tr) \
	do { \
	if (tr == NULL) { \
		panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \
	} \
	if (tr->jnl == NULL) { \
		panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \
	} \
	if (tr->blhdr != (block_list_header *)tr->tbuffer) { \
		panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \
	} \
	if (tr->total_bytes < 0) { \
		panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \
	} \
	if (tr->journal_start < 0) { \
		panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \
	} \
	if (tr->journal_end < 0) { \
		panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \
	} \
	if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) { \
		panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \
	} \
	} while(0)



//
// this isn't a great checksum routine but it will do for now.
// we use it to checksum the journal header and the block list
// headers that are at the start of each transaction.
//
static unsigned int
calc_checksum(char *ptr, int len)
{
	int i;
	unsigned int cksum=0;

	// this is a lame checksum but for now it'll do
	for(i = 0; i < len; i++, ptr++) {
		cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
	}

	return (~cksum);
}
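//
// Usage note (illustrative): callers always zero the stored checksum field
// before recomputing, so the stored value never feeds back into itself.
// write_journal_header() below does exactly this:
//
//	jnl->jhdr->checksum = 0;
//	jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
//
// Verification at replay time follows the same pattern: save the on-disk
// value, zero the field, recompute, and compare (see replay_journal()).
//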

//
// Journal Locking
//
lck_grp_attr_t *jnl_group_attr;
lck_attr_t     *jnl_lock_attr;
lck_grp_t      *jnl_mutex_group;

void
journal_init(void)
{
	jnl_lock_attr   = lck_attr_alloc_init();
	jnl_group_attr  = lck_grp_attr_alloc_init();
	jnl_mutex_group = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
}

__inline__ void
journal_lock(journal *jnl)
{
	lck_mtx_lock(&jnl->jlock);
	if (jnl->owner) {
		panic ("jnl: owner is %p, expected NULL\n", jnl->owner);
	}
	jnl->owner = current_thread();
}

__inline__ void
journal_unlock(journal *jnl)
{
	jnl->owner = NULL;
	lck_mtx_unlock(&jnl->jlock);
}

static __inline__ void
lock_flush(journal *jnl)
{
	lck_mtx_lock(&jnl->flock);
}

static __inline__ void
unlock_flush(journal *jnl)
{
	lck_mtx_unlock(&jnl->flock);
}

static __inline__ void
lock_oldstart(journal *jnl)
{
	lck_mtx_lock(&jnl->old_start_lock);
}

static __inline__ void
unlock_oldstart(journal *jnl)
{
	lck_mtx_unlock(&jnl->old_start_lock);
}



#define JNL_WRITE    0x0001
#define JNL_READ     0x0002
#define JNL_HEADER   0x8000
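//
// These flags form the "direction" argument to do_journal_io() below.
// JNL_WRITE/JNL_READ select the transfer direction; JNL_HEADER must be set
// explicitly for any i/o that touches offset 0 (the journal header), or
// do_journal_io() panics. For example, read_journal_header() passes
// JNL_READ|JNL_HEADER while ordinary transaction i/o passes just JNL_READ
// or JNL_WRITE.
//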

//
// This function sets up a fake buf and passes it directly to the
// journal device strategy routine (so that it won't get cached in
// the block cache).
//
// It also handles range checking the i/o so that we don't write
// outside the journal boundaries and it will wrap the i/o back
// to the beginning if necessary (skipping over the journal header)
//
static size_t
do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
{
	int		err, curlen=len;
	size_t		io_sz = 0;
	buf_t		bp;
	off_t		max_iosize;
	struct bufattr	*bap;
	boolean_t	was_vm_privileged = FALSE;
	boolean_t	need_vm_privilege = FALSE;

	if (jnl->fsmount) {
		if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT)
			need_vm_privilege = TRUE;
	}

	if (*offset < 0 || *offset > jnl->jhdr->size) {
		panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
	}

	if (direction & JNL_WRITE)
		max_iosize = jnl->max_write_size;
	else if (direction & JNL_READ)
		max_iosize = jnl->max_read_size;
	else
		max_iosize = 128 * 1024;

again:
	bp = alloc_io_buf(jnl->jdev, 1);

	if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
		if (*offset == jnl->jhdr->size) {
			*offset = jnl->jhdr->jhdr_size;
		} else {
			curlen = (off_t)jnl->jhdr->size - *offset;
		}
	}

	if (curlen > max_iosize) {
		curlen = max_iosize;
	}

	if (curlen <= 0) {
		panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %zd\n", curlen, *offset, len);
	}

	if (*offset == 0 && (direction & JNL_HEADER) == 0) {
		panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
	}

	/*
	 * As alluded to in the block comment at the top of the function, we use a "fake" iobuf
	 * here and issue directly to the disk device that the journal protects since we don't
	 * want this to enter the block cache.  As a result, we lose the ability to mark it
	 * as a metadata buf_t for the layers below us that may care.  If we were to
	 * simply attach the B_META flag into the b_flags this may confuse things further
	 * since this is an iobuf, not a metadata buffer.
	 *
	 * To address this, we use the extended bufattr struct embedded in the bp.
	 * Explicitly mark the buf here as a metadata buffer in its bufattr flags.
	 */
	bap = &bp->b_attr;
	bap->ba_flags |= BA_META;

	if (direction & JNL_READ)
		buf_setflags(bp, B_READ);
	else {
		/*
		 * don't have to set any flags
		 */
		vnode_startwrite(jnl->jdev);
	}
	buf_setsize(bp, curlen);
	buf_setcount(bp, curlen);
	buf_setdataptr(bp, (uintptr_t)data);
	buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
	buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));

	if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) {
		buf_markfua(bp);
	}

	if (need_vm_privilege == TRUE) {
		/*
		 * if we block waiting for memory, and there is enough pressure to
		 * cause us to try and create a new swap file, we may end up deadlocking
		 * due to waiting for the journal on the swap file creation path...
		 * by making ourselves vm_privileged, we give ourselves the best chance
		 * of not blocking
		 */
		was_vm_privileged = set_vm_privilege(TRUE);
	}
	DTRACE_IO1(journal__start, buf_t, bp);
	err = VNOP_STRATEGY(bp);
	if (!err) {
		err = (int)buf_biowait(bp);
	}
	DTRACE_IO1(journal__done, buf_t, bp);

	if (need_vm_privilege == TRUE && was_vm_privileged == FALSE)
		set_vm_privilege(FALSE);

	free_io_buf(bp);

	if (err) {
		printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err);
		return 0;
	}

	*offset += curlen;
	io_sz   += curlen;

	if (io_sz != len) {
		// handle wrap-around
		data    = (char *)data + curlen;
		curlen  = len - io_sz;
		if (*offset >= jnl->jhdr->size) {
			*offset = jnl->jhdr->jhdr_size;
		}
		goto again;
	}

	return io_sz;
}
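//
// Worked example of the wrap-around above (illustrative numbers): with a
// 1 MB journal (jhdr->size = 0x100000), a 4K header (jhdr_size = 0x1000),
// and a 32K write starting at offset 0xFC000, the first pass is clipped to
// 16K (curlen = size - offset), the offset then wraps to jhdr_size (0x1000,
// skipping over the header block), and a second pass writes the remaining
// 16K.
//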

static size_t
read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
{
	return do_journal_io(jnl, offset, data, len, JNL_READ);
}

static size_t
write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
{
	return do_journal_io(jnl, offset, data, len, JNL_WRITE);
}


static size_t
read_journal_header(journal *jnl, void *data, size_t len)
{
	off_t hdr_offset = 0;

	return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER);
}

static int
write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num)
{
	static int num_err_prints = 0;
	int ret=0;
	off_t jhdr_offset = 0;
	struct vfs_context context;

	context.vc_thread = current_thread();
	context.vc_ucred = NOCRED;
	//
	// Flush the track cache if we're not doing force-unit-access
	// writes.
	//
	if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {

		dk_synchronize_t sync_request = {
			.options = DK_SYNCHRONIZE_OPTION_BARRIER,
		};

		/*
		 * If device doesn't support barrier-only flush, or
		 * the journal is on a different device, use full flush.
		 */
		if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) {
			sync_request.options = 0;
			jnl->flush_counter++;
		}

		ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, &context);
	}
	if (ret != 0) {
		//
		// Only print this error if it's a different error than the
		// previous one, or if it's the first time for this device
		// or if the total number of printfs is less than 25.  We
		// allow for up to 25 printfs to ensure that some make it
		// into the on-disk syslog.  Otherwise if we only printed
		// one, it's possible it would never make it to the syslog
		// for the root volume and that makes debugging hard.
		//
		if (   ret != jnl->last_flush_err
		    || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
		    || num_err_prints++ < 25) {

			printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret);

			jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
			jnl->last_flush_err = ret;
		}
	}

	jnl->jhdr->sequence_num = sequence_num;
	jnl->jhdr->checksum = 0;
	jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);

	if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) {
		printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name);
		jnl->flags |= JOURNAL_INVALID;
		return -1;
	}

	// If we're not doing force-unit-access writes, then we
	// have to flush after writing the journal header so that
	// a future transaction doesn't sneak out to disk before
	// the header does and thus overwrite data that the old
	// journal header refers to.  Saw this exact case happen
	// on an IDE bus analyzer with Larry Barras so while it
	// may seem obscure, it's not.
	//
	if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {

		dk_synchronize_t sync_request = {
			.options = DK_SYNCHRONIZE_OPTION_BARRIER,
		};

		/*
		 * If device doesn't support barrier-only flush, or
		 * the journal is on a different device, use full flush.
		 */
		if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) {
			sync_request.options = 0;
			jnl->flush_counter++;
		}

		VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, &context);
	}

	return 0;
}



//
// this is a work function used to free up transactions that
// completed. they can't be free'd from buffer_flushed_callback
// because it is called from deep within the disk driver stack
// and thus can't do something that would potentially cause
// paging.  it gets called by each of the journal api entry
// points so stuff shouldn't hang around for too long.
//
static void
free_old_stuff(journal *jnl)
{
	transaction *tr, *next;
	block_list_header *blhdr=NULL, *next_blhdr=NULL;

	if (jnl->tr_freeme == NULL)
		return;

	lock_oldstart(jnl);
	tr = jnl->tr_freeme;
	jnl->tr_freeme = NULL;
	unlock_oldstart(jnl);

	for(; tr; tr=next) {
		for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) {
			next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum);
			blhdr->binfo[0].bnum = 0xdeadc0de;

			kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);

			KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0);
		}
		next = tr->next;
		FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
	}
}
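//
// Note on the loop above: a transaction's block_list_headers form a
// singly-linked chain, with the next header's address stashed in the first
// binfo entry (binfo[0].bnum) of the previous one.  The 0xdeadc0de poison
// value written back before freeing is a debugging aid, so a use-after-free
// of the chain is easy to spot in a kernel core dump.
//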



//
// This is our callback that lets us know when a buffer has been
// flushed to disk.  It's called from deep within the driver stack
// and thus is quite limited in what it can do.  Notably, it can
// not initiate any new i/o's or allocate/free memory.
//
static void
buffer_flushed_callback(struct buf *bp, void *arg)
{
	transaction	*tr;
	journal		*jnl;
	transaction	*ctr, *prev=NULL, *next;
	size_t		i;
	int		bufsize, amt_flushed, total_bytes;


	//printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n",
	//	   bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg);

	// snarf out the bits we want
	bufsize = buf_size(bp);
	tr      = (transaction *)arg;

	// a NULL transaction means we've already seen (and handled) this one
	if (tr == NULL) {
		return;
	}

	CHECK_TRANSACTION(tr);

	jnl = tr->jnl;

	CHECK_JOURNAL(jnl);

	amt_flushed = tr->num_killed;
	total_bytes = tr->total_bytes;

	// update the number of blocks that have been flushed.
	// this buf may represent more than one block so take
	// that into account.
	//
	// OSAddAtomic() returns the value of tr->num_flushed before the add
	//
	amt_flushed += OSAddAtomic(bufsize, &tr->num_flushed);


	// if this transaction isn't done yet, just return as
	// there is nothing to do.
	//
	// NOTE: we are careful to not reference anything through
	//       the tr pointer after doing the OSAddAtomic().  if
	//       this if statement fails then we are the last one
	//       and then it's ok to dereference "tr".
	//
	if ((amt_flushed + bufsize) < total_bytes) {
		return;
	}

	// this will single thread checking the transaction
	lock_oldstart(jnl);

	if (tr->total_bytes == (int)0xfbadc0de) {
		// then someone beat us to it...
		unlock_oldstart(jnl);
		return;
	}

	// mark this so that we're the owner of dealing with the
	// cleanup for this transaction
	tr->total_bytes = 0xfbadc0de;

	if (jnl->flags & JOURNAL_INVALID)
		goto transaction_done;

	//printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
	//   tr, tr->journal_start, tr->journal_end, jnl);

	// find this entry in the old_start[] index and mark it completed
	for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {

		if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) {
			jnl->old_start[i] &= ~(0x8000000000000000ULL);
			break;
		}
	}

	if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
		panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
		      tr->journal_start, tr, jnl);
	}


	// if we are here then we need to update the journal header
	// to reflect that this transaction is complete
	if (tr->journal_start == jnl->active_start) {
		jnl->active_start = tr->journal_end;
		tr->journal_start = tr->journal_end = (off_t)0;
	}

	// go through the completed_trs list and try to coalesce
	// entries, restarting back at the beginning if we have to.
	for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) {
		if (ctr->journal_start == jnl->active_start) {
			jnl->active_start = ctr->journal_end;
			if (prev) {
				prev->next = ctr->next;
			}
			if (ctr == jnl->completed_trs) {
				jnl->completed_trs = ctr->next;
			}

			next           = jnl->completed_trs;   // this starts us over again
			ctr->next      = jnl->tr_freeme;
			jnl->tr_freeme = ctr;
			ctr            = NULL;
		} else if (tr->journal_end == ctr->journal_start) {
			ctr->journal_start = tr->journal_start;
			next               = jnl->completed_trs;  // this starts us over again
			ctr                = NULL;
			tr->journal_start  = tr->journal_end = (off_t)0;
		} else if (tr->journal_start == ctr->journal_end) {
			ctr->journal_end  = tr->journal_end;
			next              = ctr->next;
			tr->journal_start = tr->journal_end = (off_t)0;
		} else if (ctr->next && ctr->journal_end == ctr->next->journal_start) {
			// coalesce the next entry with this one and link the next
			// entry in at the head of the tr_freeme list
			next              = ctr->next;           // temporarily use the "next" variable
			ctr->journal_end  = next->journal_end;
			ctr->next         = next->next;
			next->next        = jnl->tr_freeme;      // link in the next guy at the head of the tr_freeme list
			jnl->tr_freeme    = next;

			next              = jnl->completed_trs;  // this starts us over again
			ctr               = NULL;
		} else {
			next = ctr->next;
		}
	}

	// if this is true then we didn't merge with anyone
	// so link ourselves in at the head of the completed
	// transaction list.
	if (tr->journal_start != 0) {
		// put this entry into the correct sorted place
		// in the list instead of just at the head.
		//

		prev = NULL;
		for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
			// just keep looping
		}

		if (ctr == NULL && prev == NULL) {
			jnl->completed_trs = tr;
			tr->next           = NULL;
		} else if (ctr == jnl->completed_trs) {
			tr->next           = jnl->completed_trs;
			jnl->completed_trs = tr;
		} else {
			tr->next   = prev->next;
			prev->next = tr;
		}
	} else {
		// if we're here this tr got merged with someone else so
		// put it on the list to be free'd
		tr->next       = jnl->tr_freeme;
		jnl->tr_freeme = tr;
	}
transaction_done:
	unlock_oldstart(jnl);

	unlock_condition(jnl, &jnl->asyncIO);
}
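//
// Note on old_start[] (illustrative): each slot records the journal start
// offset of a recently-ended transaction, with the top bit
// (0x8000000000000000ULL) set while that transaction's buffers are still
// in flight.  The loop above clears the top bit once everything has hit
// the platter, which is what later allows the journal header's start
// pointer to advance past that transaction.
//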


#include <libkern/OSByteOrder.h>

#define SWAP16(x) OSSwapInt16(x)
#define SWAP32(x) OSSwapInt32(x)
#define SWAP64(x) OSSwapInt64(x)


static void
swap_journal_header(journal *jnl)
{
	jnl->jhdr->magic      = SWAP32(jnl->jhdr->magic);
	jnl->jhdr->endian     = SWAP32(jnl->jhdr->endian);
	jnl->jhdr->start      = SWAP64(jnl->jhdr->start);
	jnl->jhdr->end        = SWAP64(jnl->jhdr->end);
	jnl->jhdr->size       = SWAP64(jnl->jhdr->size);
	jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
	jnl->jhdr->checksum   = SWAP32(jnl->jhdr->checksum);
	jnl->jhdr->jhdr_size  = SWAP32(jnl->jhdr->jhdr_size);
	jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num);
}

static void
swap_block_list_header(journal *jnl, block_list_header *blhdr)
{
	int i;

	blhdr->max_blocks = SWAP16(blhdr->max_blocks);
	blhdr->num_blocks = SWAP16(blhdr->num_blocks);
	blhdr->bytes_used = SWAP32(blhdr->bytes_used);
	blhdr->checksum   = SWAP32(blhdr->checksum);
	blhdr->flags      = SWAP32(blhdr->flags);

	if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) {
		printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d).  not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size);
		return;
	}

	for(i = 0; i < blhdr->num_blocks; i++) {
		blhdr->binfo[i].bnum         = SWAP64(blhdr->binfo[i].bnum);
		blhdr->binfo[i].u.bi.bsize   = SWAP32(blhdr->binfo[i].u.bi.bsize);
		blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum);
	}
}


static int
update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
{
	int		ret;
	struct buf	*oblock_bp=NULL;
	boolean_t	was_vm_privileged = FALSE;


	// first read the block we want.
	ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
	if (ret != 0) {
		printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret);

		if (oblock_bp) {
			buf_brelse(oblock_bp);
			oblock_bp = NULL;
		}

		// let's try to be aggressive here and just re-write the block
		oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META);
		if (oblock_bp == NULL) {
			printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block);
			return -1;
		}
	}

	// make sure it's the correct size.
	if (buf_size(oblock_bp) != bsize) {
		buf_brelse(oblock_bp);
		return -1;
	}

	// copy the journal data over top of it
	memcpy((char *)buf_dataptr(oblock_bp), block_ptr, bsize);

	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
		/*
		 * if we block waiting for memory, and there is enough pressure to
		 * cause us to try and create a new swap file, we may end up deadlocking
		 * due to waiting for the journal on the swap file creation path...
		 * by making ourselves vm_privileged, we give ourselves the best chance
		 * of not blocking
		 */
		was_vm_privileged = set_vm_privilege(TRUE);
	}
	ret = VNOP_BWRITE(oblock_bp);

	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
		set_vm_privilege(FALSE);

	if (ret != 0) {
		printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret);
		return ret;
	}
	// and now invalidate it so that if someone else wants to read
	// it in a different size they'll be able to do it.
	ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
	if (oblock_bp) {
		buf_markinvalid(oblock_bp);
		buf_brelse(oblock_bp);
	}

	return 0;
}

static int
grow_table(struct bucket **buf_ptr, int num_buckets, int new_size)
{
	struct bucket *newBuf;
	int current_size = num_buckets, i;

	// return if new_size is less than the current size
	if (new_size < num_buckets) {
		return current_size;
	}

	if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
		printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
		return -1;
	}

	//  printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);

	// copy existing elements
	bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));

	// initialize the new ones
	for(i = num_buckets; i < new_size; i++) {
		newBuf[i].block_num = (off_t)-1;
	}

	// free the old container
	FREE(*buf_ptr, M_TEMP);

	// reset the buf_ptr
	*buf_ptr = newBuf;

	return new_size;
}
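//
// Sizing note: the only caller in this file, insert_block(), invokes
// grow_table() with new_size = 2 * num_buckets whenever the coalesce table
// fills up, so the table grows geometrically from STARTING_BUCKETS (256)
// entries and the amortized cost of an insertion stays constant.
//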

static int
lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full)
{
	int lo, hi, index, matches, i;

	if (num_full == 0) {
		return 0; // table is empty, so insert at index=0
	}

	lo = 0;
	hi = num_full - 1;
	index = -1;

	// perform binary search for block_num
	do {
		int mid = (hi - lo)/2 + lo;
		off_t this_num = (*buf_ptr)[mid].block_num;

		if (block_num == this_num) {
			index = mid;
			break;
		}

		if (block_num < this_num) {
			hi = mid;
			continue;
		}

		if (block_num > this_num) {
			lo = mid + 1;
			continue;
		}
	} while (lo < hi);

	// check if lo and hi converged on the match
	if (block_num == (*buf_ptr)[hi].block_num) {
		index = hi;
	}

	// if no existing entry found, find index for new one
	if (index == -1) {
		index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
	} else {
		// make sure that we return the right-most index in the case of multiple matches
		matches = 0;
		i = index + 1;
		while (i < num_full && block_num == (*buf_ptr)[i].block_num) {
			matches++;
			i++;
		}

		index += matches;
	}

	return index;
}
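//
// Worked example (illustrative): with block_nums [5, 7, 7, 9] and
// num_full == 4, lookup_bucket(buf, 7, 4) first hits the 7 at index 1,
// then walks right past the duplicate and returns 2 (the right-most 7).
// Searching for 8 finds no match and returns 3, the insertion point that
// keeps the table sorted.
//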

static int
insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting)
{
	if (!overwriting) {
		// grow the table if we're out of space
		if (*num_full_ptr >= *num_buckets_ptr) {
			int new_size = *num_buckets_ptr * 2;
			int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);

			if (grow_size < new_size) {
				printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name);
				return -1;
			}

			*num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
		}

		// if we're not inserting at the end, we need to bcopy
		if (blk_index != *num_full_ptr) {
			bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
		}

		(*num_full_ptr)++; // increment only if we're not overwriting
	}

	// sanity check the values we're about to add
	if ((off_t)offset >= jnl->jhdr->size) {
		offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
	}
	if (size <= 0) {
		panic("jnl: insert_block: bad size in insert_block (%zd)\n", size);
	}

	(*buf_ptr)[blk_index].block_num = num;
	(*buf_ptr)[blk_index].block_size = size;
	(*buf_ptr)[blk_index].jnl_offset = offset;
	(*buf_ptr)[blk_index].cksum = cksum;

	return blk_index;
}

static int
do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
{
	int	num_to_remove, index, i, overwrite, err;
	size_t	jhdr_size = jnl->jhdr->jhdr_size, new_offset;
	off_t	overlap, block_start, block_end;

	block_start = block_num*jhdr_size;
	block_end = block_start + size;
	overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);

	// first, eliminate any overlap with the previous entry
	if (blk_index != 0 && !overwrite) {
		off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
		off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
		overlap = prev_block_end - block_start;
		if (overlap > 0) {
			if (overlap % jhdr_size != 0) {
				panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size);
			}

			// if the previous entry completely overlaps this one, we need to break it into two pieces.
			if (prev_block_end > block_end) {
				off_t new_num = block_end / jhdr_size;
				size_t new_size = prev_block_end - block_end;

				new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);

				err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0);
				if (err < 0) {
					panic("jnl: do_overlap: error inserting during pre-overlap\n");
				}
			}

			// Regardless, we need to truncate the previous entry to the beginning of the overlap
			(*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start;
			(*buf_ptr)[blk_index-1].cksum = 0;   // have to blow it away because there's no way to check it
		}
	}

	// then, bail out fast if there's no overlap with the entries that follow
	if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) {
		return 0; // no overlap, no overwrite
	} else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) {

		(*buf_ptr)[blk_index].cksum = cksum;   // update this
		return 1; // simple overwrite
	}

	// Otherwise, find all cases of total and partial overlap. We use the special
	// block_num of -2 to designate entries that are completely overlapped and must
	// be eliminated. The block_num, size, and jnl_offset of partially overlapped
	// entries must be adjusted to keep the array consistent.
	index = blk_index;
	num_to_remove = 0;
	while (index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) {
		if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) {
			(*buf_ptr)[index].block_num = -2; // mark this for deletion
			num_to_remove++;
		} else {
			overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
			if (overlap > 0) {
				if (overlap % jhdr_size != 0) {
					panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size);
				}

				// if we partially overlap this entry, adjust its block number, jnl offset, and size
				(*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
				(*buf_ptr)[index].cksum = 0;

				new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
				if ((off_t)new_offset >= jnl->jhdr->size) {
					new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
				}
				(*buf_ptr)[index].jnl_offset = new_offset;

				(*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
				if ((*buf_ptr)[index].block_size <= 0) {
					panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size);
					// return -1; // if above panic is removed, return -1 for error
				}
			}

		}

		index++;
	}

	// bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
	index--; // start with the last index used within the above loop
	while (index >= blk_index) {
		if ((*buf_ptr)[index].block_num == -2) {
			if (index == *num_full_ptr-1) {
				(*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
			} else {
				bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
			}
			(*num_full_ptr)--;
		}
		index--;
	}

	// eliminate any stale entries at the end of the table
	for(i = *num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
		(*buf_ptr)[i].block_num = -1;
	}

	return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
}
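//
// Worked example for the "split" case above (illustrative, using
// jhdr_size == 1 for simplicity): suppose an existing entry covers blocks
// [10, 14) and a new write covers [11, 13). The existing entry is split in
// two: a tail piece for [13, 14) is inserted with its jnl_offset advanced
// by 3 (the distance from block 10 to block 13), and the original entry is
// truncated to cover just [10, 11). The new write then lands between them,
// so replay still applies exactly one journal copy to every block.
//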

// PR-3105942: Coalesce writes to the same block in journal replay
// We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
// to be replayed and the corresponding location in the journal which contains
// the most recent data for those blocks. The array is "played" once all the
// blocks in the journal have been coalesced. The code for the case of conflicting/
// overlapping writes to a single block is the most dense. Because coalescing can
// disrupt the existing time-ordering of blocks in the journal playback, care
// is taken to catch any overlaps and keep the array consistent.
static int
add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
{
	int	blk_index, overwriting;

	// on return from lookup_bucket(), blk_index is the index into the table where block_num should be
	// inserted (or the index of the elem to overwrite).
	blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);

	// check if the index is within bounds (if we're adding this block to the end of
	// the table, blk_index will be equal to num_full)
	if (blk_index < 0 || blk_index > *num_full_ptr) {
		//printf("jnl: add_block: trouble adding block to co_buf\n");
		return -1;
	} // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);

	// Determine whether we're overwriting an existing entry by checking for overlap
	overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr);
	if (overwriting < 0) {
		return -1; // if we got an error, pass it along
	}

	// returns the index, or -1 on error
	blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting);

	return blk_index;
}
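//
// Replay sketch (summarizing the function below): replay proceeds in three
// phases. First, walk the transactions from jhdr->start, validating each
// block_list_header checksum and sequence number, and feed every block into
// the coalescing table via add_block(). Second, once the whole journal has
// been scanned, "play" the table: read each surviving extent out of the
// journal and write it to its home location with update_fs_block(). Third,
// mark the journal empty by rewriting the journal header. On a read error
// mid-scan, replay restarts and replays just the transactions known to be
// good (up to three retries).
//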
static int
replay_journal(journal *jnl)
{
	int		i, bad_blocks=0;
	unsigned int	orig_checksum, checksum, check_block_checksums = 0;
	size_t		ret;
	size_t		max_bsize = 0;		/* protected by block_ptr */
	block_list_header *blhdr;
	off_t		offset, txn_start_offset=0, blhdr_offset, orig_jnl_start;
	char		*buff, *block_ptr=NULL;
	struct bucket	*co_buf;
	int		num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
	uint32_t	last_sequence_num = 0;
	int		replay_retry_count = 0;

	// wrap the start ptr if it points to the very end of the journal
	if (jnl->jhdr->start == jnl->jhdr->size) {
		jnl->jhdr->start = jnl->jhdr->jhdr_size;
	}
	if (jnl->jhdr->end == jnl->jhdr->size) {
		jnl->jhdr->end = jnl->jhdr->jhdr_size;
	}

	if (jnl->jhdr->start == jnl->jhdr->end) {
		return 0;
	}

	orig_jnl_start = jnl->jhdr->start;

	// allocate memory for the header_block.  we'll read each blhdr into this
	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size, VM_KERN_MEMORY_FILE)) {
		printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n",
		       jnl->jdev_name, jnl->jhdr->blhdr_size);
		return -1;
	}

	// allocate memory for the coalesce buffer
	if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
		printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name);
		return -1;
	}

restart_replay:

	// initialize entries
	for(i = 0; i < num_buckets; i++) {
		co_buf[i].block_num = -1;
	}
	num_full = 0; // empty at first


	printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
	       jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);

	while (check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) {
		offset = blhdr_offset = jnl->jhdr->start;
		ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
		if (ret != (size_t)jnl->jhdr->blhdr_size) {
			printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset);
			bad_blocks = 1;
			goto bad_txn_handling;
		}

		blhdr = (block_list_header *)buff;

		orig_checksum = blhdr->checksum;
		blhdr->checksum = 0;
		if (jnl->flags & JOURNAL_NEED_SWAP) {
			// calculate the checksum based on the unswapped data
			// because it is done byte-at-a-time.
			orig_checksum = (unsigned int)SWAP32(orig_checksum);
			checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
			swap_block_list_header(jnl, blhdr);
		} else {
			checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
		}


		//
		// XXXdbg - if these checks fail, we should replay as much
		// as we can in the hopes that it will still leave the
		// drive in a better state than if we didn't replay
		// anything
		//
		if (checksum != orig_checksum) {
			if (check_past_jnl_end && in_uncharted_territory) {

				if (blhdr_offset != jnl->jhdr->end) {
					printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
				}

				check_past_jnl_end = 0;
				jnl->jhdr->end = blhdr_offset;
				continue;
			}

			printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
			       jnl->jdev_name, blhdr_offset, orig_checksum, checksum);

			if (blhdr_offset == orig_jnl_start) {
				// if there's nothing in the journal at all, just bail out altogether.
				goto bad_replay;
			}

			bad_blocks = 1;
			goto bad_txn_handling;
		}

		if (   (last_sequence_num != 0)
		    && (blhdr->binfo[0].u.bi.b.sequence_num != 0)
		    && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num)
		    && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) {

			txn_start_offset = jnl->jhdr->end = blhdr_offset;

			if (check_past_jnl_end) {
				check_past_jnl_end = 0;
				printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
				       jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
				continue;
			}

			printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
			       jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
			bad_blocks = 1;
			goto bad_txn_handling;
		}
		last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num;

		if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) {
			if (last_sequence_num == 0) {
				check_past_jnl_end = 0;
				printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n",
				       jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
				if (jnl->jhdr->start != jnl->jhdr->end) {
					jnl->jhdr->start = jnl->jhdr->end;
				}
				continue;
			}
			printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
		}

		if (   blhdr->max_blocks <= 0 || blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size)
		    || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
			printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n",
			       jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks);
			bad_blocks = 1;
			goto bad_txn_handling;
		}

		max_bsize = 0;
		for (i = 1; i < blhdr->num_blocks; i++) {
			if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
				printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum);
				bad_blocks = 1;
				goto bad_txn_handling;
			}

			if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) {
				max_bsize = blhdr->binfo[i].u.bi.bsize;
			}
		}

		if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) {
			check_block_checksums = 1;
			if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize, VM_KERN_MEMORY_FILE)) {
				goto bad_replay;
			}
		} else {
			block_ptr = NULL;
		}

		if (blhdr->flags & BLHDR_FIRST_HEADER) {
			txn_start_offset = blhdr_offset;
		}

		//printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
		//       blhdr->num_blocks-1, jnl->jhdr->start);
		bad_blocks = 0;
		for (i = 1; i < blhdr->num_blocks; i++) {
			int size, ret_val;
			off_t number;

			size = blhdr->binfo[i].u.bi.bsize;
			number = blhdr->binfo[i].bnum;

			// don't add "killed" blocks
			if (number == (off_t)-1) {
				//printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
			} else {

				if (check_block_checksums) {
					int32_t disk_cksum;
					off_t block_offset;

					block_offset = offset;

					// read the block so we can check the checksum
					ret = read_journal_data(jnl, &block_offset, block_ptr, size);
					if (ret != (size_t)size) {
						printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
						bad_blocks = 1;
						goto bad_txn_handling;
					}

					disk_cksum = calc_checksum(block_ptr, size);

					// there is no need to swap the checksum from disk because
					// it got swapped when the blhdr was read in.
					if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) {
						printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
						       jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum);
						printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x  0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
						       *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)],
						       *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]);

						bad_blocks = 1;
						goto bad_txn_handling;
					}
				}


				// add this bucket to co_buf, coalescing where possible
				// printf("jnl: replay_journal: adding block 0x%llx\n", number);
				ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full);

				if (ret_val == -1) {
					printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name);
					goto bad_replay;
				} // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
			}

			// increment offset
			offset += size;

			// check if the last block added puts us off the end of the jnl.
			// if so, we need to wrap to the beginning and take any remainder
			// into account
			//
			if (offset >= jnl->jhdr->size) {
				offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
			}
		}

		if (block_ptr) {
			kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
			block_ptr = NULL;
		}

bad_txn_handling:
		if (bad_blocks) {
			/* Journal replay got an error before it found any valid
			 * transactions; abort replay */
			if (txn_start_offset == 0) {
				printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
				goto bad_replay;
			}

			/* Repeated error during journal replay, abort replay */
			if (replay_retry_count == 3) {
				printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name);
				goto bad_replay;
			}
			replay_retry_count++;

			/* There was an error replaying the journal (possibly
			 * EIO/ENXIO from the device).  So retry replaying all
			 * the good transactions that we found before getting
			 * the error.
			 */
			jnl->jhdr->start = orig_jnl_start;
			jnl->jhdr->end = txn_start_offset;
			check_past_jnl_end = 0;
			last_sequence_num = 0;
			printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
			goto restart_replay;
		}

		jnl->jhdr->start += blhdr->bytes_used;
		if (jnl->jhdr->start >= jnl->jhdr->size) {
			// wrap around and skip the journal header block
			jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
		}

		if (jnl->jhdr->start == jnl->jhdr->end) {
			in_uncharted_territory = 1;
		}
	}

	if (jnl->jhdr->start != jnl->jhdr->end) {
		printf("jnl: %s: start %lld != end %lld.  resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
		jnl->jhdr->end = jnl->jhdr->start;
	}

	//printf("jnl: replay_journal: replaying %d blocks\n", num_full);

	/*
	 * make sure it's at least one page in size, so
	 * start max_bsize at PAGE_SIZE
	 */
	for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {

		if (co_buf[i].block_num == (off_t)-1)
			continue;

		if (co_buf[i].block_size > max_bsize)
			max_bsize = co_buf[i].block_size;
	}
	/*
	 * round max_bsize up to the nearest PAGE_SIZE multiple
	 */
	if (max_bsize & (PAGE_SIZE - 1)) {
		max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
	}

	if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize, VM_KERN_MEMORY_FILE)) {
		goto bad_replay;
	}

	// Replay the coalesced entries in the co-buf
	for(i = 0; i < num_full; i++) {
		size_t size = co_buf[i].block_size;
		off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
		off_t number = co_buf[i].block_num;


		// printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
		//	  co_buf[i].block_size, co_buf[i].jnl_offset);

		if (number == (off_t)-1) {
			// printf("jnl: replay_journal: skipping killed fs block\n");
		} else {

			// do journal read, and set the phys. block
			ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
			if (ret != size) {
				printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
				goto bad_replay;
			}

			if (update_fs_block(jnl, block_ptr, number, size) != 0) {
				goto bad_replay;
			}
		}
	}


	// done replaying; update jnl header
	if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
		goto bad_replay;
	}

	printf("jnl: %s: journal replay done.\n", jnl->jdev_name);

	// free block_ptr
	if (block_ptr) {
		kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
		block_ptr = NULL;
	}

	// free the coalesce buffer
	FREE(co_buf, M_TEMP);
	co_buf = NULL;

	kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
	return 0;

bad_replay:
	if (block_ptr) {
		kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
	}
	if (co_buf) {
		FREE(co_buf, M_TEMP);
	}
	kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);

	return -1;
}


#define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
#define MAX_TRANSACTION_BUFFER_SIZE      (3072*1024)

// XXXdbg - so I can change it in the debugger
int def_tbuffer_size = 0;


//
// This function sets the size of the tbuffer and the
// size of the blhdr.  It assumes that jnl->jhdr->size
// and jnl->jhdr->jhdr_size are already valid.
//
static void
size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
{
	//
	// one-time initialization based on how much memory
	// there is in the machine.
	//
	if (def_tbuffer_size == 0) {
		if (max_mem < (256*1024*1024)) {
			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
		} else if (max_mem < (512*1024*1024)) {
			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
		} else if (max_mem < (1024*1024*1024)) {
			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
		} else {
			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (max_mem / (256*1024*1024));
		}
	}

	// size up the transaction buffer... can't be larger than the number
	// of blocks that can fit in a block_list_header block.
	if (tbuffer_size == 0) {
		jnl->tbuffer_size = def_tbuffer_size;
	} else {
		// make sure that the specified tbuffer_size isn't too small
		if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
			tbuffer_size = jnl->jhdr->blhdr_size * 2;
		}
		// and make sure it's an even multiple of the block size
		if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
			tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
		}

		jnl->tbuffer_size = tbuffer_size;
	}

	if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
		jnl->tbuffer_size = (jnl->jhdr->size / 2);
	}

	if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
		jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
	}

	jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
	if (jnl->jhdr->blhdr_size < phys_blksz) {
		jnl->jhdr->blhdr_size = phys_blksz;
	} else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
		// have to round up so we're an even multiple of the physical block size
		jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
	}
}
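//
// Worked example for the blhdr sizing above (illustrative, and assuming a
// 16-byte block_info): with jhdr_size = 4096 and a 1 MB tbuffer, the
// tbuffer holds 1 MB / 4096 = 256 journal blocks, so blhdr_size comes out
// to 256 * 16 = 4096 bytes, exactly one block's worth of block_info
// entries. The phys_blksz rounding below then leaves it unchanged.
//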
1620
1621 static void
1622 get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context)
1623 {
1624 off_t readblockcnt;
1625 off_t writeblockcnt;
1626 off_t readmaxcnt=0, tmp_readmaxcnt;
1627 off_t writemaxcnt=0, tmp_writemaxcnt;
1628 off_t readsegcnt, writesegcnt;
1629 int32_t features;
1630
1631 if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
1632 if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
1633 const char *name = vnode_getname_printable(devvp);
1634 jnl->flags |= JOURNAL_DO_FUA_WRITES;
1635 printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features);
1636 vnode_putname_printable(name);
1637 }
1638 if (features & DK_FEATURE_UNMAP) {
1639 jnl->flags |= JOURNAL_USE_UNMAP;
1640 }
1641
1642 if (features & DK_FEATURE_BARRIER) {
1643 jnl->flags |= JOURNAL_FEATURE_BARRIER;
1644 }
1645 }
1646
1647 //
1648 // First check the max read size via several different mechanisms...
1649 //
1650 VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context);
1651
1652 if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context) == 0) {
1653 tmp_readmaxcnt = readblockcnt * phys_blksz;
1654 if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) {
1655 readmaxcnt = tmp_readmaxcnt;
1656 }
1657 }
1658
1659 if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt, 0, context)) {
1660 readsegcnt = 0;
1661 }
1662
1663 if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) {
1664 readmaxcnt = readsegcnt * PAGE_SIZE;
1665 }
1666
1667 if (readmaxcnt == 0) {
1668 readmaxcnt = 128 * 1024;
1669 } else if (readmaxcnt > UINT32_MAX) {
1670 readmaxcnt = UINT32_MAX;
1671 }
1672
1673
1674 //
1675 // Now check the max write size via several different mechanisms...
1676 //
1677 VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context);
1678
1679 if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context) == 0) {
1680 tmp_writemaxcnt = writeblockcnt * phys_blksz;
1681 if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) {
1682 writemaxcnt = tmp_writemaxcnt;
1683 }
1684 }
1685
1686 if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt, 0, context)) {
1687 writesegcnt = 0;
1688 }
1689
1690 if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) {
1691 writemaxcnt = writesegcnt * PAGE_SIZE;
1692 }
1693
1694 if (writemaxcnt == 0) {
1695 writemaxcnt = 128 * 1024;
1696 } else if (writemaxcnt > UINT32_MAX) {
1697 writemaxcnt = UINT32_MAX;
1698 }
1699
1700 jnl->max_read_size = readmaxcnt;
1701 jnl->max_write_size = writemaxcnt;
1702 // printf("jnl: %s: max read/write: %lld k / %lld k\n",
1703 // jnl->jdev_name ? jnl->jdev_name : "unknown",
1704 // jnl->max_read_size/1024, jnl->max_write_size/1024);
1705 }
1706
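//
// Illustrative sketch (hypothetical numbers, not from the original source):
// how the capping logic above combines the ioctls. Suppose a device with
// 512-byte blocks that does not answer DKIOCGETMAXBYTECOUNTREAD, but
// reports DKIOCGETMAXBLOCKCOUNTREAD = 2048 and
// DKIOCGETMAXSEGMENTCOUNTREAD = 32 (assuming 4 KB pages):
//
//     readmaxcnt = 2048 * 512     = 1048576    // from the block count
//     readmaxcnt = 32 * PAGE_SIZE = 131072     // segment count is smaller, it wins
//
// A device that reports nothing at all falls back to the 128 KB default.
//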
1707
1708 journal *
1709 journal_create(struct vnode *jvp,
1710 off_t offset,
1711 off_t journal_size,
1712 struct vnode *fsvp,
1713 size_t min_fs_blksz,
1714 int32_t flags,
1715 int32_t tbuffer_size,
1716 void (*flush)(void *arg),
1717 void *arg,
1718 struct mount *fsmount)
1719 {
1720 journal *jnl;
1721 uint32_t phys_blksz, new_txn_base;
1722 u_int32_t min_size;
1723 struct vfs_context context;
1724 const char *jdev_name;
1725 /*
1726 * Cap the journal max size to 2GB. On HFS, the journal will attempt to
1727 * occupy a full allocation block if its current size is smaller than the
1728 * allocation block on which it resides. Once we hit the exabyte filesystem
1729 * range, allocation blocks grow to 2GB. As a result, make the cap 2GB.
1730 */
1731 context.vc_thread = current_thread();
1732 context.vc_ucred = FSCRED;
1733
1734 jdev_name = vnode_getname_printable(jvp);
1735
1736 /* Get the real physical block size. */
1737 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1738 goto cleanup_jdev_name;
1739 }
1740
1741 if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
1742 printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size);
1743 goto cleanup_jdev_name;
1744 }
1745
1746 min_size = phys_blksz * (phys_blksz / sizeof(block_info));
1747 /* Reject journals that are too small given the sector size of the device */
1748 if (journal_size < min_size) {
1749 printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n",
1750 jdev_name, journal_size, phys_blksz);
1751 goto cleanup_jdev_name;
1752 }
1753
1754 if (phys_blksz > min_fs_blksz) {
1755 printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n",
1756 jdev_name, phys_blksz, min_fs_blksz);
1757 goto cleanup_jdev_name;
1758 }
1759
1760 if ((journal_size % phys_blksz) != 0) {
1761 printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
1762 jdev_name, journal_size, phys_blksz);
1763 goto cleanup_jdev_name;
1764 }
1765
1766
1767 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1768 memset(jnl, 0, sizeof(*jnl));
1769
1770 jnl->jdev = jvp;
1771 jnl->jdev_offset = offset;
1772 jnl->fsdev = fsvp;
1773 jnl->flush = flush;
1774 jnl->flush_arg = arg;
1775 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
1776 jnl->jdev_name = jdev_name;
1777 lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1778
1779 // Keep a pointer to the mount around for use in IO throttling.
1780 jnl->fsmount = fsmount;
1781 // XXX: This lock discipline looks correct based on dounmount(), but it
1782 // doesn't seem to be documented anywhere.
1783 mount_ref(fsmount, 0);
1784
1785 get_io_info(jvp, phys_blksz, jnl, &context);
1786
1787 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) {
1788 printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz);
1789 goto bad_kmem_alloc;
1790 }
1791 jnl->header_buf_size = phys_blksz;
1792
1793 jnl->jhdr = (journal_header *)jnl->header_buf;
1794 memset(jnl->jhdr, 0, sizeof(journal_header));
1795
1796 // we have to set this up here so that do_journal_io() will work
1797 jnl->jhdr->jhdr_size = phys_blksz;
1798
1799 //
1800 // We try to read the journal header to see if there is already one
1801 // out there. If there is, it may contain transactions that we could
1802 // mistakenly replay. That would require that we pick a sequence number
1803 // a little less than the old one, that there is then a crash, and that
1804 // the last txn written ends right at the start of a txn from the previous
1805 // incarnation of this file system. If all that happens we would
1806 // replay the transactions from the old file system and that would
1807 // destroy your disk. Although it is extremely unlikely for all those
1808 // conditions to happen, the probability is non-zero and the result is
1809 // severe - you lose your file system. Therefore if we find a valid
1810 // journal header and the sequence number is non-zero we write junk
1811 // over the entire journal so that there is no way we will encounter
1812 // any old transactions. This is slow but should be a rare event
1813 // since most tools erase the journal.
1814 //
1815 if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz
1816 && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC
1817 && jnl->jhdr->sequence_num != 0) {
1818
1819 new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff;
1820 printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base);
1821
1822 #if 0
1823 int i;
1824 off_t pos=0;
1825
1826 for(i = 1; i < journal_size / phys_blksz; i++) {
1827 pos = i*phys_blksz;
1828
1829 // we don't really care what data we write just so long
1830 // as it's not a valid transaction header. since we have
1831 // the header_buf sitting around we'll use that.
1832 write_journal_data(jnl, &pos, jnl->header_buf, phys_blksz);
1833 }
1834 printf("jnl: create: done clearing journal (i=%d)\n", i);
1835 #endif
1836 } else {
1837 new_txn_base = random() & 0x00ffffff;
1838 }
1839
1840 memset(jnl->header_buf, 0, phys_blksz);
1841
1842 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
1843 jnl->jhdr->endian = ENDIAN_MAGIC;
1844 jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself
1845 jnl->jhdr->end = phys_blksz;
1846 jnl->jhdr->size = journal_size;
1847 jnl->jhdr->jhdr_size = phys_blksz;
1848 size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1849
1850 jnl->active_start = jnl->jhdr->start;
1851
1852 // XXXdbg - for testing you can force the journal to wrap around
1853 // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
1854 // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);
1855
1856 jnl->jhdr->sequence_num = new_txn_base;
1857
1858 lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
1859 lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
1860 lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
1861
1862
1863 jnl->flushing = FALSE;
1864 jnl->asyncIO = FALSE;
1865 jnl->flush_aborted = FALSE;
1866 jnl->writing_header = FALSE;
1867 jnl->async_trim = NULL;
1868 jnl->sequence_num = jnl->jhdr->sequence_num;
1869
1870 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
1871 printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name);
1872 goto bad_write;
1873 }
1874
1875 goto journal_create_complete;
1876
1877
1878 bad_write:
1879 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1880 bad_kmem_alloc:
1881 jnl->jhdr = NULL;
1882 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1883 mount_drop(fsmount, 0);
1884 cleanup_jdev_name:
1885 vnode_putname_printable(jdev_name);
1886 jnl = NULL;
1887 journal_create_complete:
1888 return jnl;
1889 }
1890
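//
// Illustrative usage sketch (hypothetical caller and names, not from the
// original source): a file system would typically create a fresh journal
// at newfs/first-mount time along these lines:
//
//     jnl = journal_create(jvp,                  // journal device vnode
//                          jnl_offset,           // byte offset on that device
//                          8 * 1024 * 1024,      // e.g. an 8 MB journal
//                          fsvp,                 // file system device vnode
//                          fs_block_size,        // min fs block size
//                          0,                    // option flags
//                          0,                    // 0 == default tbuffer size
//                          my_fs_flush,          // hypothetical flush callback
//                          my_fs,                // argument for the callback
//                          mp);                  // mount, for IO throttling
//     if (jnl == NULL)
//             ; // creation failed; run non-journaled or fail the mount
//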
1891
1892 journal *
1893 journal_open(struct vnode *jvp,
1894 off_t offset,
1895 off_t journal_size,
1896 struct vnode *fsvp,
1897 size_t min_fs_blksz,
1898 int32_t flags,
1899 int32_t tbuffer_size,
1900 void (*flush)(void *arg),
1901 void *arg,
1902 struct mount *fsmount)
1903 {
1904 journal *jnl;
1905 uint32_t orig_blksz=0;
1906 uint32_t phys_blksz;
1907 u_int32_t min_size = 0;
1908 int orig_checksum, checksum;
1909 struct vfs_context context;
1910 const char *jdev_name = vnode_getname_printable(jvp);
1911
1912 context.vc_thread = current_thread();
1913 context.vc_ucred = FSCRED;
1914
1915 /* Get the real physical block size. */
1916 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1917 goto cleanup_jdev_name;
1918 }
1919
1920 if (phys_blksz > min_fs_blksz) {
1921 printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n",
1922 jdev_name, phys_blksz, min_fs_blksz);
1923 goto cleanup_jdev_name;
1924 }
1925
1926 if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) {
1927 printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size);
1928 goto cleanup_jdev_name;
1929 }
1930
1931 min_size = phys_blksz * (phys_blksz / sizeof(block_info));
1932 /* Reject journals that are too small given the sector size of the device */
1933 if (journal_size < min_size) {
1934 printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n",
1935 jdev_name, journal_size, phys_blksz);
1936 goto cleanup_jdev_name;
1937 }
1938
1939 if ((journal_size % phys_blksz) != 0) {
1940 printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1941 jdev_name, journal_size, phys_blksz);
1942 goto cleanup_jdev_name;
1943 }
1944
1945 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1946 memset(jnl, 0, sizeof(*jnl));
1947
1948 jnl->jdev = jvp;
1949 jnl->jdev_offset = offset;
1950 jnl->fsdev = fsvp;
1951 jnl->flush = flush;
1952 jnl->flush_arg = arg;
1953 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
1954 jnl->jdev_name = jdev_name;
1955 lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1956
1957 /* We need a reference to the mount to later pass to the throttling code for
1958 * IO accounting.
1959 */
1960 jnl->fsmount = fsmount;
1961 mount_ref(fsmount, 0);
1962
1963 get_io_info(jvp, phys_blksz, jnl, &context);
1964
1965 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) {
1966 printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz);
1967 goto bad_kmem_alloc;
1968 }
1969 jnl->header_buf_size = phys_blksz;
1970
1971 jnl->jhdr = (journal_header *)jnl->header_buf;
1972 memset(jnl->jhdr, 0, sizeof(journal_header));
1973
1974 // we have to set this up here so that do_journal_io() will work
1975 jnl->jhdr->jhdr_size = phys_blksz;
1976
1977 if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
1978 printf("jnl: %s: open: could not read %u bytes for the journal header.\n",
1979 jdev_name, phys_blksz);
1980 goto bad_journal;
1981 }
1982
1983 orig_checksum = jnl->jhdr->checksum;
1984 jnl->jhdr->checksum = 0;
1985
1986 if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1987 // do this before the swap since it's done byte-at-a-time
1988 orig_checksum = SWAP32(orig_checksum);
1989 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1990 swap_journal_header(jnl);
1991 jnl->flags |= JOURNAL_NEED_SWAP;
1992 } else {
1993 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1994 }
1995
1996 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1997 printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n",
1998 jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
1999 goto bad_journal;
2000 }
2001
2002 // only check if we're the current journal header magic value
2003 if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
2004
2005 if (orig_checksum != checksum) {
2006 printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n",
2007 jdev_name, orig_checksum, checksum);
2008
2009 //goto bad_journal;
2010 }
2011 }
2012
2013 // XXXdbg - convert old style magic numbers to the new one
2014 if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
2015 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
2016 }
2017
2018 if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
2019 /*
2020 * The volume has probably been resized (such that we had to adjust the
2021 * logical sector size), or copied to media with a different logical
2022 * sector size.
2023 *
2024 * Temporarily change the device's logical block size to match the
2025 * journal's header size. This will allow us to replay the journal
2026 * safely. If the replay succeeds, we will update the journal's header
2027 * size (later in this function).
2028 */
2029 orig_blksz = phys_blksz;
2030 phys_blksz = jnl->jhdr->jhdr_size;
2031 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
2032 printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
2033 jdev_name, orig_blksz, phys_blksz);
2034 }
2035
2036 if ( jnl->jhdr->start <= 0
2037 || jnl->jhdr->start > jnl->jhdr->size
2038 || jnl->jhdr->start > 1024*1024*1024) {
2039 printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
2040 jdev_name, jnl->jhdr->start, jnl->jhdr->size);
2041 goto bad_journal;
2042 }
2043
2044 if ( jnl->jhdr->end <= 0
2045 || jnl->jhdr->end > jnl->jhdr->size
2046 || jnl->jhdr->end > 1024*1024*1024) {
2047 printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
2048 jdev_name, jnl->jhdr->end, jnl->jhdr->size);
2049 goto bad_journal;
2050 }
2051
2052 if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) {
2053 printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size);
2054 goto bad_journal;
2055 }
2056
2057 // XXXdbg - can't do these checks because hfs writes all kinds of
2058 // non-uniform sized blocks even on devices that have a block size
2059 // that is larger than 512 bytes (i.e. optical media w/2k blocks).
2060 // therefore these checks will fail and so we just have to punt and
2061 // do more relaxed checking...
2062 // XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
2063 if ((jnl->jhdr->start % 512) != 0) {
2064 printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n",
2065 jdev_name, jnl->jhdr->start);
2066 goto bad_journal;
2067 }
2068
2069 //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
2070 if ((jnl->jhdr->end % 512) != 0) {
2071 printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
2072 jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size);
2073 goto bad_journal;
2074 }
2075
2076 // take care of replaying the journal if necessary
2077 if (flags & JOURNAL_RESET) {
2078 printf("jnl: %s: journal start/end pointers reset! (s 0x%llx e 0x%llx)\n",
2079 jdev_name, jnl->jhdr->start, jnl->jhdr->end);
2080 jnl->jhdr->start = jnl->jhdr->end;
2081 } else if (replay_journal(jnl) != 0) {
2082 printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
2083 goto bad_journal;
2084 }
2085
2086 /*
2087 * When we get here, we know that the journal is empty (jnl->jhdr->start ==
2088 * jnl->jhdr->end). If the device's logical block size was different from
2089 * the journal's header size, then we can now restore the device's logical
2090 * block size and update the journal's header size to match.
2091 *
2092 * Note that we also adjust the journal's start and end so that they will
2093 * be aligned on the new block size. We pick a new sequence number to
2094 * avoid any problems if a replay found previous transactions using the old
2095 * journal header size. (See the comments in journal_create(), above.)
2096 */
2097
2098 if (orig_blksz != 0) {
2099 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
2100 phys_blksz = orig_blksz;
2101
2102 orig_blksz = 0;
2103
2104 jnl->jhdr->jhdr_size = phys_blksz;
2105 jnl->jhdr->start = phys_blksz;
2106 jnl->jhdr->end = phys_blksz;
2107 jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
2108 (journal_size / phys_blksz) +
2109 (random() % 16384)) & 0x00ffffff;
2110
2111 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
2112 printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
2113 goto bad_journal;
2114 }
2115 }
2116
2117 // make sure this is in sync!
2118 jnl->active_start = jnl->jhdr->start;
2119 jnl->sequence_num = jnl->jhdr->sequence_num;
2120
2121 // set this now, after we've replayed the journal
2122 size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
2123
2124 // TODO: Does this need to change if the device's logical block size changed?
2125 if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
2126 printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
2127 jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
2128 goto bad_journal;
2129 }
2130
2131 lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
2132 lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
2133 lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
2134
2135 goto journal_open_complete;
2136
2137 bad_journal:
2138 if (orig_blksz != 0) {
2139 phys_blksz = orig_blksz;
2140 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
2141 printf("jnl: %s: open: restored block size after error\n", jdev_name);
2142 }
2143 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
2144 bad_kmem_alloc:
2145 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
2146 mount_drop(fsmount, 0);
2147 cleanup_jdev_name:
2148 vnode_putname_printable(jdev_name);
2149 jnl = NULL;
2150 journal_open_complete:
2151 return jnl;
2152 }
2153
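//
// Illustrative note (not from the original source): journal_open() is the
// counterpart used on subsequent mounts. Unlike journal_create() it keeps
// the on-disk header, replays any committed transactions, and only then
// sizes the tbuffer. A mount path might look like (hypothetical names):
//
//     jnl = journal_open(jvp, jnl_offset, jnl_size, fsvp, fs_block_size,
//                        0, 0, my_fs_flush, my_fs, mp);
//     if (jnl == NULL)
//             ; // bad header or failed replay; caller decides what to do
//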
2154
2155 int
2156 journal_is_clean(struct vnode *jvp,
2157 off_t offset,
2158 off_t journal_size,
2159 struct vnode *fsvp,
2160 size_t min_fs_block_size)
2161 {
2162 journal jnl;
2163 uint32_t phys_blksz;
2164 int ret;
2165 int orig_checksum, checksum;
2166 struct vfs_context context;
2167 const char *jdev_name = vnode_getname_printable(jvp);
2168
2169 context.vc_thread = current_thread();
2170 context.vc_ucred = FSCRED;
2171
2172 /* Get the real physical block size. */
2173 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
2174 printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
2175 ret = EINVAL;
2176 goto cleanup_jdev_name;
2177 }
2178
2179 if (phys_blksz > (uint32_t)min_fs_block_size) {
2180 printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
2181 jdev_name, phys_blksz, min_fs_block_size);
2182 ret = EINVAL;
2183 goto cleanup_jdev_name;
2184 }
2185
2186 if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
2187 printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size);
2188 ret = EINVAL;
2189 goto cleanup_jdev_name;
2190 }
2191
2192 if ((journal_size % phys_blksz) != 0) {
2193 printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
2194 jdev_name, journal_size, phys_blksz);
2195 ret = EINVAL;
2196 goto cleanup_jdev_name;
2197 }
2198
2199 memset(&jnl, 0, sizeof(jnl));
2200
2201 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) {
2202 printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
2203 ret = ENOMEM;
2204 goto cleanup_jdev_name;
2205 }
2206 jnl.header_buf_size = phys_blksz;
2207
2208 get_io_info(jvp, phys_blksz, &jnl, &context);
2209
2210 jnl.jhdr = (journal_header *)jnl.header_buf;
2211 memset(jnl.jhdr, 0, sizeof(journal_header));
2212
2213 jnl.jdev = jvp;
2214 jnl.jdev_offset = offset;
2215 jnl.fsdev = fsvp;
2216
2217 // we have to set this up here so that do_journal_io() will work
2218 jnl.jhdr->jhdr_size = phys_blksz;
2219
2220 if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
2221 printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n",
2222 jdev_name, phys_blksz);
2223 ret = EINVAL;
2224 goto get_out;
2225 }
2226
2227 orig_checksum = jnl.jhdr->checksum;
2228 jnl.jhdr->checksum = 0;
2229
2230 if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
2231 // do this before the swap since it's done byte-at-a-time
2232 orig_checksum = SWAP32(orig_checksum);
2233 checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
2234 swap_journal_header(&jnl);
2235 jnl.flags |= JOURNAL_NEED_SWAP;
2236 } else {
2237 checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
2238 }
2239
2240 if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
2241 printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n",
2242 jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
2243 ret = EINVAL;
2244 goto get_out;
2245 }
2246
2247 if (orig_checksum != checksum) {
2248 printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum);
2249 ret = EINVAL;
2250 goto get_out;
2251 }
2252
2253 //
2254 // if the start and end are equal then the journal is clean.
2255 // otherwise it's not clean and therefore an error.
2256 //
2257 if (jnl.jhdr->start == jnl.jhdr->end) {
2258 ret = 0;
2259 } else {
2260 ret = EBUSY; // so the caller can differentiate an invalid journal from a "busy" one
2261 }
2262
2263 get_out:
2264 kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
2265 cleanup_jdev_name:
2266 vnode_putname_printable(jdev_name);
2267 return ret;
2268 }
2269
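//
// Illustrative sketch (hypothetical caller, not from the original source):
// a read-only mount can use journal_is_clean() to decide whether the
// volume can be mounted without replaying:
//
//     switch (journal_is_clean(jvp, jnl_offset, jnl_size, fsvp, fs_block_size)) {
//     case 0:     // start == end: nothing to replay
//             break;
//     case EBUSY: // valid journal, but it has pending transactions
//             break;
//     default:    // EINVAL/ENOMEM: not a usable journal
//             break;
//     }
//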
2270
2271 void
2272 journal_close(journal *jnl)
2273 {
2274 volatile off_t *start, *end;
2275 int counter=0;
2276
2277 CHECK_JOURNAL(jnl);
2278
2279 // set this before doing anything that would block so that
2280 // we start tearing things down properly.
2281 //
2282 jnl->flags |= JOURNAL_CLOSE_PENDING;
2283
2284 if (jnl->owner != current_thread()) {
2285 journal_lock(jnl);
2286 }
2287
2288 wait_condition(jnl, &jnl->flushing, "journal_close");
2289
2290 //
2291 // only write stuff to disk if the journal is still valid
2292 //
2293 if ((jnl->flags & JOURNAL_INVALID) == 0) {
2294
2295 if (jnl->active_tr) {
2296 /*
2297 * "journal_end_transaction" will fire the flush asynchronously
2298 */
2299 journal_end_transaction(jnl);
2300 }
2301
2302 // flush any buffered transactions
2303 if (jnl->cur_tr) {
2304 transaction *tr = jnl->cur_tr;
2305
2306 jnl->cur_tr = NULL;
2307 /*
2308 * "end_transaction" will wait for any in-progress flush to complete
2309 * before flushing "cur_tr" synchronously("must_wait" == TRUE)
2310 */
2311 end_transaction(tr, 1, NULL, NULL, FALSE, TRUE);
2312 }
2313 /*
2314 * if there was an "active_tr", make sure we wait for
2315 * it to flush if there was no "cur_tr" to process
2316 */
2317 wait_condition(jnl, &jnl->flushing, "journal_close");
2318
2319 //start = &jnl->jhdr->start;
2320 start = &jnl->active_start;
2321 end = &jnl->jhdr->end;
2322
2323 while (*start != *end && counter++ < 5000) {
2324 //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
2325 if (jnl->flush) {
2326 jnl->flush(jnl->flush_arg);
2327 }
2328 tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 2);
2329 }
2330
2331 if (*start != *end) {
2332 printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
2333 jnl->jdev_name, *start, *end);
2334 }
2335
2336 // make sure this is in sync when we close the journal
2337 jnl->jhdr->start = jnl->active_start;
2338
2339 // if this fails there's not much we can do at this point...
2340 write_journal_header(jnl, 1, jnl->sequence_num);
2341 } else {
2342 // if we're here the journal isn't valid any more.
2343 // so make sure we don't leave any locked blocks lying around
2344 printf("jnl: %s: close: journal is invalid. aborting outstanding transactions\n", jnl->jdev_name);
2345 if (jnl->active_tr || jnl->cur_tr) {
2346 transaction *tr;
2347
2348 if (jnl->active_tr) {
2349 tr = jnl->active_tr;
2350 jnl->active_tr = NULL;
2351 } else {
2352 tr = jnl->cur_tr;
2353 jnl->cur_tr = NULL;
2354 }
2355 abort_transaction(jnl, tr);
2356
2357 if (jnl->active_tr || jnl->cur_tr) {
2358 panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl);
2359 }
2360 }
2361 }
2362 wait_condition(jnl, &jnl->asyncIO, "journal_close");
2363
2364 free_old_stuff(jnl);
2365
2366 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
2367 jnl->jhdr = (void *)0xbeefbabe;
2368
2369 // Release reference on the mount
2370 if (jnl->fsmount)
2371 mount_drop(jnl->fsmount, 0);
2372
2373 vnode_putname_printable(jnl->jdev_name);
2374
2375 journal_unlock(jnl);
2376 lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group);
2377 lck_mtx_destroy(&jnl->jlock, jnl_mutex_group);
2378 lck_mtx_destroy(&jnl->flock, jnl_mutex_group);
2379 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
2380 }
2381
2382 static void
2383 dump_journal(journal *jnl)
2384 {
2385 transaction *ctr;
2386
2387 printf("journal for dev %s:", jnl->jdev_name);
2388 printf(" jdev_offset %.8llx\n", jnl->jdev_offset);
2389 printf(" magic: 0x%.8x\n", jnl->jhdr->magic);
2390 printf(" start: 0x%.8llx\n", jnl->jhdr->start);
2391 printf(" end: 0x%.8llx\n", jnl->jhdr->end);
2392 printf(" size: 0x%.8llx\n", jnl->jhdr->size);
2393 printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size);
2394 printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size);
2395 printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum);
2396
2397 printf(" completed transactions:\n");
2398 for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) {
2399 printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
2400 }
2401 }
2402
2403
2404
2405 static off_t
2406 free_space(journal *jnl)
2407 {
2408 off_t free_space_offset;
2409
2410 if (jnl->jhdr->start < jnl->jhdr->end) {
2411 free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
2412 } else if (jnl->jhdr->start > jnl->jhdr->end) {
2413 free_space_offset = jnl->jhdr->start - jnl->jhdr->end;
2414 } else {
2415 // journal is completely empty
2416 free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size;
2417 }
2418
2419 return free_space_offset;
2420 }
2421
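//
// Illustrative sketch (not from the original source): the three cases
// below just measure the circular log. For a 0x100000-byte journal with
// a 0x200-byte header block:
//
//     start 0x0200, end 0x0200 -> empty:   0x100000 - 0x200          = 0xffe00
//     start 0x0400, end 0x9400 -> normal:  0x100000 - 0x9000 - 0x200 = 0xf6e00
//     start 0x9400, end 0x0400 -> wrapped: 0x9400 - 0x0400           = 0x9000
//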
2422
2423 //
2424 // The journal must be locked on entry to this function.
2425 // The "desired_size" is in bytes.
2426 //
2427 static int
2428 check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write, uint32_t sequence_num)
2429 {
2430 size_t i;
2431 int counter=0;
2432
2433 //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
2434 // desired_size, free_space(jnl));
2435
2436 if (delayed_header_write)
2437 *delayed_header_write = FALSE;
2438
2439 while (1) {
2440 int old_start_empty;
2441
2442 // make sure there's space in the journal to hold this transaction
2443 if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) {
2444 break;
2445 }
2446 if (counter++ == 5000) {
2447 dump_journal(jnl);
2448 panic("jnl: check_free_space: buffer flushing isn't working "
2449 "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl,
2450 jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
2451 }
2452 if (counter > 7500) {
2453 printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name);
2454 return ENOSPC;
2455 }
2456
2457 //
2458 // here's where we lazily bump up jnl->jhdr->start. we'll consume
2459 // entries until there is enough space for the next transaction.
2460 //
2461 old_start_empty = 1;
2462 lock_oldstart(jnl);
2463
2464 for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
2465 int lcl_counter;
2466
2467 lcl_counter = 0;
2468 while (jnl->old_start[i] & 0x8000000000000000LL) {
2469 if (lcl_counter++ > 10000) {
2470 panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
2471 jnl->old_start[i], jnl);
2472 }
2473
2474 unlock_oldstart(jnl);
2475 if (jnl->flush) {
2476 jnl->flush(jnl->flush_arg);
2477 }
2478 tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
2479 lock_oldstart(jnl);
2480 }
2481
2482 if (jnl->old_start[i] == 0) {
2483 continue;
2484 }
2485
2486 old_start_empty = 0;
2487 jnl->jhdr->start = jnl->old_start[i];
2488 jnl->old_start[i] = 0;
2489
2490 if (free_space(jnl) > desired_size) {
2491
2492 if (delayed_header_write)
2493 *delayed_header_write = TRUE;
2494 else {
2495 unlock_oldstart(jnl);
2496 write_journal_header(jnl, 1, sequence_num);
2497 lock_oldstart(jnl);
2498 }
2499 break;
2500 }
2501 }
2502 unlock_oldstart(jnl);
2503
2504 // if we bumped the start, loop and try again
2505 if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
2506 continue;
2507 } else if (old_start_empty) {
2508 //
2509 // if there is nothing in old_start anymore then we can
2510 // bump the jhdr->start to be the same as active_start
2511 // since it is possible there was only one very large
2512 // transaction in the old_start array. if we didn't do
2513 // this then jhdr->start would never get updated and we
2514 // would wind up looping until we hit the panic at the
2515 // start of the loop.
2516 //
2517 jnl->jhdr->start = jnl->active_start;
2518
2519 if (delayed_header_write)
2520 *delayed_header_write = TRUE;
2521 else
2522 write_journal_header(jnl, 1, sequence_num);
2523 continue;
2524 }
2525
2526
2527 // if the file system gave us a flush function, call it so that
2528 // it can flush some blocks which hopefully will cause some transactions
2529 // to complete and thus free up space in the journal.
2530 if (jnl->flush) {
2531 jnl->flush(jnl->flush_arg);
2532 }
2533
2534 // wait for a while to avoid being cpu-bound (this will
2535 // put us to sleep for 10 milliseconds)
2536 tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
2537 }
2538
2539 return 0;
2540 }
2541
2542 /*
2543 * Allocate a new active transaction.
2544 */
2545 static errno_t
2546 journal_allocate_transaction(journal *jnl)
2547 {
2548 transaction *tr;
2549 boolean_t was_vm_privileged = FALSE;
2550 kern_return_t retval;
2551
2552 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
2553 /*
2554 * the disk driver can allocate memory on this path...
2555 * if we block waiting for memory, and there is enough pressure to
2556 * cause us to try and create a new swap file, we may end up deadlocking
2557 * due to waiting for the journal on the swap file creation path...
2558 * by making ourselves vm_privileged, we give ourselves the best chance
2559 * of not blocking
2560 */
2561 was_vm_privileged = set_vm_privilege(TRUE);
2562 }
2563 MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
2564 memset(tr, 0, sizeof(transaction));
2565
2566 tr->tbuffer_size = jnl->tbuffer_size;
2567
2568 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size, VM_KERN_MEMORY_FILE);
2569
2570 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
2571 set_vm_privilege(FALSE);
2572
2573 if (retval) {
2574 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
2575 jnl->active_tr = NULL;
2576 return ENOMEM;
2577 }
2578
2579 // journal replay code checksum check depends on this.
2580 memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
2581 // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
2582 memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2583
2584 tr->blhdr = (block_list_header *)tr->tbuffer;
2585 tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2586 tr->blhdr->num_blocks = 1; // accounts for this header block
2587 tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
2588 tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER;
2589
2590 tr->sequence_num = ++jnl->sequence_num;
2591 tr->num_blhdrs = 1;
2592 tr->total_bytes = jnl->jhdr->blhdr_size;
2593 tr->jnl = jnl;
2594
2595 jnl->active_tr = tr;
2596
2597 return 0;
2598 }
2599
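//
// Illustrative sketch (not from the original source): for a 4096-byte
// blhdr_size and sizeof(block_info) == 16 (assumed), the header built
// above starts out as
//
//     max_blocks = 4096/16 - 1 = 255   // binfo[0] is reserved to chain headers
//     num_blocks = 1                   // the header block itself
//     bytes_used = 4096                // one blhdr_size worth of the tbuffer
//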
2600 int
2601 journal_start_transaction(journal *jnl)
2602 {
2603 int ret;
2604
2605 CHECK_JOURNAL(jnl);
2606
2607 free_old_stuff(jnl);
2608
2609 if (jnl->flags & JOURNAL_INVALID) {
2610 return EINVAL;
2611 }
2612 if (jnl->owner == current_thread()) {
2613 if (jnl->active_tr == NULL) {
2614 panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
2615 jnl, jnl->owner, current_thread());
2616 }
2617 jnl->nested_count++;
2618 return 0;
2619 }
2620
2621 journal_lock(jnl);
2622
2623 if (jnl->nested_count != 0 || jnl->active_tr != NULL) {
2624 panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
2625 jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
2626 }
2627
2628 jnl->nested_count = 1;
2629
2630 #if JOE
2631 // make sure there's room in the journal
2632 if (free_space(jnl) < jnl->tbuffer_size) {
2633
2634 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0);
2635
2636 // this is the call that really waits for space to free up
2637 // as well as updating jnl->jhdr->start
2638 if (check_free_space(jnl, jnl->tbuffer_size, NULL, jnl->sequence_num) != 0) {
2639 printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name);
2640 ret = ENOSPC;
2641 goto bad_start;
2642 }
2643 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, 0, 0, 0, 0);
2644 }
2645 #endif
2646
2647 // if there's a buffered transaction, use it.
2648 if (jnl->cur_tr) {
2649 jnl->active_tr = jnl->cur_tr;
2650 jnl->cur_tr = NULL;
2651
2652 return 0;
2653 }
2654
2655 ret = journal_allocate_transaction(jnl);
2656 if (ret) {
2657 goto bad_start;
2658 }
2659
2660 // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);
2661
2662 return 0;
2663
2664 bad_start:
2665 jnl->nested_count = 0;
2666 journal_unlock(jnl);
2667
2668 return ret;
2669 }
2670
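//
// Illustrative usage sketch (hypothetical caller, not from the original
// source): the normal per-transaction pattern a file system follows is
//
//     if (journal_start_transaction(jnl) == 0) {
//             journal_modify_block_start(jnl, bp);   // lock bp, flush if dirty
//             // ... modify the meta-data in bp ...
//             journal_modify_block_end(jnl, bp, NULL, NULL);
//             journal_end_transaction(jnl);          // defined later in this file
//     }
//
// Re-entrant calls from the owning thread just bump nested_count.
//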
2671
2672 int
2673 journal_modify_block_start(journal *jnl, struct buf *bp)
2674 {
2675 transaction *tr;
2676 boolean_t was_vm_privileged = FALSE;
2677
2678 CHECK_JOURNAL(jnl);
2679
2680
2681 free_old_stuff(jnl);
2682
2683 if (jnl->flags & JOURNAL_INVALID) {
2684 return EINVAL;
2685 }
2686
2687 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
2688 /*
2689 * if we block waiting for memory, and there is enough pressure to
2690 * cause us to try and create a new swap file, we may end up deadlocking
2691 * due to waiting for the journal on the swap file creation path...
2692 * by making ourselves vm_privileged, we give ourselves the best chance
2693 * of not blocking
2694 */
2695 was_vm_privileged = set_vm_privilege(TRUE);
2696 }
2697
2698 // XXXdbg - for debugging I want this to be true. later it may
2699 // not be necessary.
2700 if ((buf_flags(bp) & B_META) == 0) {
2701 panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl);
2702 }
2703
2704 tr = jnl->active_tr;
2705 CHECK_TRANSACTION(tr);
2706
2707 if (jnl->owner != current_thread()) {
2708 panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2709 jnl, jnl->owner, current_thread());
2710 }
2711
2712 //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
2713 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2714
2715 // can't allow blocks that aren't an even multiple of the
2716 // underlying block size.
2717 if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) {
2718 uint32_t phys_blksz, bad=0;
2719
2720 if (VNOP_IOCTL(jnl->jdev, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) {
2721 bad = 1;
2722 } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) {
2723 if (phys_blksz < 512) {
2724 panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n",
2725 phys_blksz, buf_size(bp), jnl->jhdr->jhdr_size);
2726 }
2727
2728 if ((buf_size(bp) % phys_blksz) != 0) {
2729 bad = 1;
2730 } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) {
2731 jnl->jhdr->jhdr_size = phys_blksz;
2732 } else {
2733 // the phys_blksz is now larger... need to realloc the jhdr
2734 char *new_header_buf;
2735
2736 printf("jnl: %s: phys blksz got bigger (was: %d/%d now %d)\n",
2737 jnl->jdev_name, jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz);
2738 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&new_header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) {
2739 printf("jnl: modify_block_start: %s: create: phys blksz change (was %d, now %d) but could not allocate space for new header\n",
2740 jnl->jdev_name, jnl->jhdr->jhdr_size, phys_blksz);
2741 bad = 1;
2742 } else {
2743 memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size);
2744 memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size));
2745 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
2746 jnl->header_buf = new_header_buf;
2747 jnl->header_buf_size = phys_blksz;
2748
2749 jnl->jhdr = (journal_header *)jnl->header_buf;
2750 jnl->jhdr->jhdr_size = phys_blksz;
2751 }
2752 }
2753 } else {
2754 bad = 1;
2755 }
2756
2757 if (bad) {
2758 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
2759 buf_size(bp), jnl->jhdr->jhdr_size);
2760
2761 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
2762 set_vm_privilege(FALSE);
2763 return -1;
2764 }
2765 }
2766
2767 // make sure that this transaction isn't bigger than the whole journal
2768 if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
2769 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
2770 tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp);
2771
2772 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
2773 set_vm_privilege(FALSE);
2774 return -1;
2775 }
2776
2777 // if the block is dirty and not already locked we have to write
2778 // it out before we muck with it because it has data that belongs
2779 // (presumably) to another transaction.
2780 //
2781 if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) {
2782
2783 if (buf_flags(bp) & B_ASYNC) {
2784 panic("modify_block_start: bp @ %p has async flag set!\n", bp);
2785 }
2786 if (bp->b_shadow_ref)
2787 panic("modify_block_start: dirty bp @ %p has shadows!\n", bp);
2788
2789 // this will cause it to not be buf_brelse()'d
2790 buf_setflags(bp, B_NORELSE);
2791 VNOP_BWRITE(bp);
2792 }
2793 buf_setflags(bp, B_LOCKED);
2794
2795 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
2796 set_vm_privilege(FALSE);
2797
2798 return 0;
2799 }
2800
2801 int
2802 journal_modify_block_abort(journal *jnl, struct buf *bp)
2803 {
2804 transaction *tr;
2805 block_list_header *blhdr;
2806 int i;
2807
2808 CHECK_JOURNAL(jnl);
2809
2810 free_old_stuff(jnl);
2811
2812 tr = jnl->active_tr;
2813
2814 //
2815 // if there's no active transaction then we just want to
2816 // call buf_brelse() and return since this is just a block
2817 // that happened to be modified as part of another tr.
2818 //
2819 if (tr == NULL) {
2820 buf_brelse(bp);
2821 return 0;
2822 }
2823
2824 if (jnl->flags & JOURNAL_INVALID) {
2825 /* Still need to buf_brelse(). Callers assume we consume the bp. */
2826 buf_brelse(bp);
2827 return EINVAL;
2828 }
2829
2830 CHECK_TRANSACTION(tr);
2831
2832 if (jnl->owner != current_thread()) {
2833 panic("jnl: modify_block_abort: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2834 jnl, jnl->owner, current_thread());
2835 }
2836
2837 // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
2838
2839 // first check if it's already part of this transaction
2840 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
2841 for (i = 1; i < blhdr->num_blocks; i++) {
2842 if (bp == blhdr->binfo[i].u.bp) {
2843 break;
2844 }
2845 }
2846
2847 if (i < blhdr->num_blocks) {
2848 break;
2849 }
2850 }
2851
2852 //
2853 // if blhdr is null, then this block has only had modify_block_start
2854 // called on it as part of the current transaction. that means that
2855 // it is ok to clear the LOCKED bit since it hasn't actually been
2856 // modified. if blhdr is non-null then modify_block_end was called
2857 // on it and so we need to keep it locked in memory.
2858 //
2859 if (blhdr == NULL) {
2860 buf_clearflags(bp, B_LOCKED);
2861 }
2862
2863 buf_brelse(bp);
2864 return 0;
2865 }
2866
2867
2868 int
2869 journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, void *arg), void *arg)
2870 {
2871 int i = 1;
2872 int tbuffer_offset=0;
2873 block_list_header *blhdr, *prev=NULL;
2874 transaction *tr;
2875
2876 CHECK_JOURNAL(jnl);
2877
2878 free_old_stuff(jnl);
2879
2880 if (jnl->flags & JOURNAL_INVALID) {
2881 /* Still need to buf_brelse(). Callers assume we consume the bp. */
2882 buf_brelse(bp);
2883 return EINVAL;
2884 }
2885
2886 tr = jnl->active_tr;
2887 CHECK_TRANSACTION(tr);
2888
2889 if (jnl->owner != current_thread()) {
2890 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2891 jnl, jnl->owner, current_thread());
2892 }
2893
2894 //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n",
2895 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2896
2897 if ((buf_flags(bp) & B_LOCKED) == 0) {
2898 panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp, jnl);
2899 }
2900
2901 // first check if it's already part of this transaction
2902 for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
2903 tbuffer_offset = jnl->jhdr->blhdr_size;
2904
2905 for (i = 1; i < blhdr->num_blocks; i++) {
2906 if (bp == blhdr->binfo[i].u.bp) {
2907 break;
2908 }
2909 if (blhdr->binfo[i].bnum != (off_t)-1) {
2910 tbuffer_offset += buf_size(blhdr->binfo[i].u.bp);
2911 } else {
2912 tbuffer_offset += blhdr->binfo[i].u.bi.bsize;
2913 }
2914 }
2915
2916 if (i < blhdr->num_blocks) {
2917 break;
2918 }
2919 }
2920
2921 if (blhdr == NULL
2922 && prev
2923 && (prev->num_blocks+1) <= prev->max_blocks
2924 && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) {
2925 blhdr = prev;
2926
2927 } else if (blhdr == NULL) {
2928 block_list_header *nblhdr;
2929 if (prev == NULL) {
2930 panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp);
2931 }
2932
2933 // we got to the end of the list, didn't find the block and there's
2934 // no room in the block_list_header pointed to by prev
2935
2936 // we allocate another tbuffer and link it in at the end of the list
2937 // through prev->binfo[0].bnum. that's a skanky way to do things but
2938 // avoids having yet another linked list of small data structures to manage.
2939
2940 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size, VM_KERN_MEMORY_FILE)) {
2941 panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n",
2942 tr, tr->total_bytes);
2943 }
2944
2945 // journal replay code checksum check depends on this.
2946 memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
2947 // Fill up the rest of the block with unimportant bytes
2948 memset((char *)nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2949
2950 // initialize the new guy
2951 nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2952 nblhdr->num_blocks = 1; // accounts for this header block
2953 nblhdr->bytes_used = jnl->jhdr->blhdr_size;
2954 nblhdr->flags = BLHDR_CHECK_CHECKSUMS;
2955
2956 tr->num_blhdrs++;
2957 tr->total_bytes += jnl->jhdr->blhdr_size;
2958
2959 // then link him in at the end
2960 prev->binfo[0].bnum = (off_t)((long)nblhdr);
2961
2962 // and finally switch to using the new guy
2963 blhdr = nblhdr;
2964 tbuffer_offset = jnl->jhdr->blhdr_size;
2965 i = 1;
2966 }
2967
2968
2969 if ((i+1) > blhdr->max_blocks) {
2970 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
2971 }
2972
2973 // if this is true then this is a new block we haven't seen
2974 if (i >= blhdr->num_blocks) {
2975 int bsize;
2976 vnode_t vp;
2977
2978 vp = buf_vnode(bp);
2979 if (vnode_ref(vp)) {
2980 // Nobody checks the return values, so...
2981 jnl->flags |= JOURNAL_INVALID;
2982
2983 buf_brelse(bp);
2984
2985 // We're probably here due to a force unmount, so EIO is appropriate
2986 return EIO;
2987 }
2988
2989 bsize = buf_size(bp);
2990
2991 blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp));
2992 blhdr->binfo[i].u.bp = bp;
2993
2994 KERNEL_DEBUG_CONSTANT(0x3018004, VM_KERNEL_ADDRPERM(vp), blhdr->binfo[i].bnum, bsize, 0, 0);
2995
2996 if (func) {
2997 void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL;
2998
2999 buf_setfilter(bp, func, arg, &old_func, &old_arg);
3000 if (old_func != NULL && old_func != func) {
3001 panic("jnl: modify_block_end: old func %p / arg %p (func %p)", old_func, old_arg, func);
3002 }
3003 }
3004
3005 blhdr->bytes_used += bsize;
3006 tr->total_bytes += bsize;
3007
3008 blhdr->num_blocks++;
3009 }
3010 buf_bdwrite(bp);
3011
3012 return 0;
3013 }
3014
3015 int
3016 journal_kill_block(journal *jnl, struct buf *bp)
3017 {
3018 int i;
3019 int bflags;
3020 block_list_header *blhdr;
3021 transaction *tr;
3022
3023 CHECK_JOURNAL(jnl);
3024
3025 free_old_stuff(jnl);
3026
3027 if (jnl->flags & JOURNAL_INVALID) {
3028 buf_brelse(bp);
3029 return 0;
3030 }
3031
3032 tr = jnl->active_tr;
3033 CHECK_TRANSACTION(tr);
3034
3035 if (jnl->owner != current_thread()) {
3036 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
3037 jnl, jnl->owner, current_thread());
3038 }
3039
3040 bflags = buf_flags(bp);
3041
3042 if ( !(bflags & B_LOCKED))
3043 panic("jnl: modify_block_end: called with bp not B_LOCKED");
3044
3045 /*
3046 * bp must be BL_BUSY and B_LOCKED
3047 * first check if it's already part of this transaction
3048 */
3049 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
3050
3051 for (i = 1; i < blhdr->num_blocks; i++) {
3052 if (bp == blhdr->binfo[i].u.bp) {
3053 vnode_t vp;
3054
3055 buf_clearflags(bp, B_LOCKED);
3056
3057 // this undoes the vnode_ref() in journal_modify_block_end()
3058 vp = buf_vnode(bp);
3059 vnode_rele_ext(vp, 0, 1);
3060
3061 // if the block has the DELWRI and FILTER bits set, then
3062 // things are seriously weird. if it was part of another
3063 // transaction then journal_modify_block_start() should
3064 // have forced it to be written.
3065 //
3066 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
3067 // panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
3068 //} else {
3069 tr->num_killed += buf_size(bp);
3070 //}
3071 blhdr->binfo[i].bnum = (off_t)-1;
3072 blhdr->binfo[i].u.bp = NULL;
3073 blhdr->binfo[i].u.bi.bsize = buf_size(bp);
3074
3075 buf_markinvalid(bp);
3076 buf_brelse(bp);
3077
3078 return 0;
3079 }
3080 }
3081 }
3082
3083 /*
3084 * We did not find the block in any transaction buffer but we still
3085 * need to release it or else it will be left locked forever.
3086 */
3087 buf_brelse(bp);
3088
3089 return 0;
3090 }
3091
3092 /*
3093 ;________________________________________________________________________________
3094 ;
3095 ; Routine: journal_trim_set_callback
3096 ;
3097 ; Function: Provide the journal with a routine to be called back when a
3098 ; TRIM has (or would have) been issued to the device. That
3099 ; is, the transaction has been flushed to the device, and the
3100 ; blocks freed by the transaction are now safe for reuse.
3101 ;
3102 ; CAUTION: If the journal becomes invalid (e.g., due to an I/O
3103 ; error when trying to write to the journal), this callback
3104 ; will stop getting called, even if extents got freed before
3105 ; the journal became invalid!
3106 ;
3107 ; Input Arguments:
3108 ; jnl - The journal structure for the filesystem.
3109 ; callback - The function to call when the TRIM is complete.
3110 ; arg - An argument to be passed to callback.
3111 ;________________________________________________________________________________
3112 */
3113 __private_extern__ void
3114 journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg)
3115 {
3116 jnl->trim_callback = callback;
3117 jnl->trim_callback_arg = arg;
3118 }
3119
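//
// Illustrative sketch (hypothetical names; the callback signature is an
// assumption based on how jnl_trim_callback_t is used, not shown here):
//
//     static void my_trim_done(void *arg, uint32_t extent_count,
//                              const dk_extent_t *extents)
//     {
//             // e.g. add extents[0..extent_count) to a free extent cache
//     }
//
//     journal_trim_set_callback(jnl, my_trim_done, my_fs);
//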
3120
3121 /*
3122 ;________________________________________________________________________________
3123 ;
3124 ; Routine: trim_realloc
3125 ;
3126 ; Function: Increase the amount of memory allocated for the list of extents
3127 ; to be unmapped (trimmed). This routine will be called when
3128 ; adding an extent to the list, and the list already occupies
3129 ; all of the space allocated to it. This routine returns ENOMEM
3130 ; if unable to allocate more space, or 0 if the extent list was
3131 ; grown successfully.
3132 ;
3133 ; Input Arguments:
3134 ; trim - The trim list to be resized.
3135 ;
3136 ; Output:
3137 ; (result) - ENOMEM or 0.
3138 ;
3139 ; Side effects:
3140 ; The allocated_count and extents fields of trim are updated
3141 ; if the function returned 0.
3142 ;________________________________________________________________________________
3143 */
3144 static int
3145 trim_realloc(journal *jnl, struct jnl_trim_list *trim)
3146 {
3147 void *new_extents;
3148 uint32_t new_allocated_count;
3149 boolean_t was_vm_privileged = FALSE;
3150
3151 if (jnl_kdebug)
3152 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, VM_KERNEL_ADDRPERM(trim), 0, trim->allocated_count, trim->extent_count, 0);
3153
3154 new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS;
3155
3156 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
3157 /*
3158 * if we block waiting for memory, and there is enough pressure to
3159 * cause us to try and create a new swap file, we may end up deadlocking
3160 * due to waiting for the journal on the swap file creation path...
3161 * by making ourselves vm_privileged, we give ourselves the best chance
3162 * of not blocking
3163 */
3164 was_vm_privileged = set_vm_privilege(TRUE);
3165 }
3166 new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t));
3167 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
3168 set_vm_privilege(FALSE);
3169
3170 if (new_extents == NULL) {
3171 printf("jnl: trim_realloc: unable to grow extent list!\n");
3172 /*
3173 * Since we could be called when allocating space previously marked
3174 * to be trimmed, we need to empty out the list to be safe.
3175 */
3176 trim->extent_count = 0;
3177 if (jnl_kdebug)
3178 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, ENOMEM, 0, trim->allocated_count, 0, 0);
3179 return ENOMEM;
3180 }
3181
3182 /* Copy the old extent list to the newly allocated list. */
3183 if (trim->extents != NULL) {
3184 memmove(new_extents,
3185 trim->extents,
3186 trim->allocated_count * sizeof(dk_extent_t));
3187 kfree(trim->extents,
3188 trim->allocated_count * sizeof(dk_extent_t));
3189 }
3190
3191 trim->allocated_count = new_allocated_count;
3192 trim->extents = new_extents;
3193
3194 if (jnl_kdebug)
3195 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, 0, 0, new_allocated_count, trim->extent_count, 0);
3196
3197 return 0;
3198 }
3199
3200 /*
3201 ;________________________________________________________________________________
3202 ;
3203 ; Routine: trim_search_extent
3204 ;
3205 ; Function: Search the given extent list to see if any of its extents
3206 ; overlap the given extent.
3207 ;
3208 ; Input Arguments:
3209 ; trim - The trim list to be searched.
3210 ; offset - The first byte of the range to be searched for.
3211 ; length - The number of bytes of the extent being searched for.
3212 ; overlap_start - (output) start of the overlapping extent, if found
3213 ; overlap_len - (output) length of the overlapping extent, if found
3214 ;
3215 ; Output:
3216 ; (result) - TRUE if one or more extents overlap, FALSE otherwise.
3217 ;________________________________________________________________________________
3218 */
3219 static int
3220 trim_search_extent(struct jnl_trim_list *trim, uint64_t offset,
3221 uint64_t length, uint64_t *overlap_start, uint64_t *overlap_len)
3222 {
3223 uint64_t end = offset + length;
3224 uint32_t lower = 0; /* Lowest index to search */
3225 uint32_t upper = trim->extent_count; /* Highest index to search + 1 */
3226 uint32_t middle;
3227
3228 /* A binary search over the extent list. */
3229 while (lower < upper) {
3230 middle = (lower + upper) / 2;
3231
3232 if (trim->extents[middle].offset >= end)
3233 upper = middle;
3234 else if (trim->extents[middle].offset + trim->extents[middle].length <= offset)
3235 lower = middle + 1;
3236 else {
3237 if (overlap_start) {
3238 *overlap_start = trim->extents[middle].offset;
3239 }
3240 if (overlap_len) {
3241 *overlap_len = trim->extents[middle].length;
3242 }
3243 return TRUE;
3244 }
3245 }
3246
3247 return FALSE;
3248 }
3249
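//
// Illustrative sketch (not from the original source): with a sorted,
// non-overlapping list such as
//
//     extents[] = { {0x1000, 0x100}, {0x4000, 0x200}, {0x9000, 0x100} }
//
// a query of offset 0x4100, length 0x40 probes the middle entry, sees
// that it neither starts at/after 0x4140 nor ends at/before 0x4100, and
// returns TRUE with *overlap_start == 0x4000, *overlap_len == 0x200.
// A query of offset 0x2000, length 0x100 narrows lower/upper to an empty
// range and returns FALSE.
//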
3250
3251 /*
3252 ;________________________________________________________________________________
3253 ;
3254 ; Routine: journal_trim_add_extent
3255 ;
3256 ; Function: Keep track of extents that have been freed as part of this
3257 ; transaction. If the underlying device supports TRIM (UNMAP),
3258 ; then those extents will be trimmed/unmapped once the
3259 ; transaction has been written to the journal. (For example,
3260 ; SSDs can support trim/unmap and avoid having to recopy those
3261 ; blocks when doing wear leveling, and may reuse the same
3262 ; physical blocks for different logical blocks.)
3263 ;
3264 ; HFS also uses this, in combination with journal_trim_set_callback,
3265 ; to add recently freed extents to its free extent cache, but
3266 ; only after the transaction that freed them is committed to
3267 ; disk. (This reduces the chance of overwriting live data in
3268 ; a way that causes data loss if a transaction never gets
3269 ; written to the journal.)
3270 ;
3271 ; Input Arguments:
3272 ; jnl - The journal for the volume containing the byte range.
3273 ; offset - The first byte of the range to be trimmed.
3274 ; length - The number of bytes of the extent being trimmed.
3275 ;________________________________________________________________________________
3276 */
3277 __private_extern__ int
3278 journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
3279 {
3280 uint64_t end;
3281 transaction *tr;
3282 dk_extent_t *extent;
3283 uint32_t insert_index;
3284 uint32_t replace_count;
3285
3286 CHECK_JOURNAL(jnl);
3287
3288 /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */
3289 if (jnl->flags & JOURNAL_INVALID) {
3290 return EINVAL;
3291 }
3292
3293 tr = jnl->active_tr;
3294 CHECK_TRANSACTION(tr);
3295
3296 if (jnl_kdebug)
3297 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
3298
3299 if (jnl->owner != current_thread()) {
3300 panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
3301 jnl, jnl->owner, current_thread());
3302 }
3303
3304 free_old_stuff(jnl);
3305
3306 end = offset + length;
3307
3308 /*
3309 * Find the range of existing extents that can be combined with the
3310 * input extent. We start by counting the number of extents that end
3311 * strictly before the input extent, then count the number of extents
3312 * that overlap or are contiguous with the input extent.
3313 */
3314 extent = tr->trim.extents;
3315 insert_index = 0;
3316 while (insert_index < tr->trim.extent_count && extent->offset + extent->length < offset) {
3317 ++insert_index;
3318 ++extent;
3319 }
3320 replace_count = 0;
3321 while (insert_index + replace_count < tr->trim.extent_count && extent->offset <= end) {
3322 ++replace_count;
3323 ++extent;
3324 }
3325
3326 /*
3327 * If none of the existing extents can be combined with the input extent,
3328 * then just insert it in the list (before item number insert_index).
3329 */
3330 if (replace_count == 0) {
3331 /* If the list was already full, we need to grow it. */
3332 if (tr->trim.extent_count == tr->trim.allocated_count) {
3333 if (trim_realloc(jnl, &tr->trim) != 0) {
3334 printf("jnl: trim_add_extent: out of memory!");
3335 if (jnl_kdebug)
3336 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0);
3337 return ENOMEM;
3338 }
3339 }
3340
3341 /* Shift any existing extents with larger offsets. */
3342 if (insert_index < tr->trim.extent_count) {
3343 memmove(&tr->trim.extents[insert_index+1],
3344 &tr->trim.extents[insert_index],
3345 (tr->trim.extent_count - insert_index) * sizeof(dk_extent_t));
3346 }
3347 tr->trim.extent_count++;
3348
3349 /* Store the new extent in the list. */
3350 tr->trim.extents[insert_index].offset = offset;
3351 tr->trim.extents[insert_index].length = length;
3352
3353 /* We're done. */
3354 if (jnl_kdebug)
3355 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0);
3356 return 0;
3357 }
3358
3359 /*
3360 * Update extent number insert_index to be the union of the input extent
3361 * and all of the replaced extents.
3362 */
3363 if (tr->trim.extents[insert_index].offset < offset)
3364 offset = tr->trim.extents[insert_index].offset;
3365 extent = &tr->trim.extents[insert_index + replace_count - 1];
3366 if (extent->offset + extent->length > end)
3367 end = extent->offset + extent->length;
3368 tr->trim.extents[insert_index].offset = offset;
3369 tr->trim.extents[insert_index].length = end - offset;
3370
3371 /*
3372 * If we were replacing more than one existing extent, then shift any
3373 * extents with larger offsets, and update the count of extents.
3374 *
3375 * We're going to leave extent #insert_index alone since it was just updated, above.
3376 * We need to move extents from index (insert_index + replace_count) through the end of
3377 * the list by (replace_count - 1) positions so that they overwrite extent #(insert_index + 1).
3378 */
3379 if (replace_count > 1 && (insert_index + replace_count) < tr->trim.extent_count) {
3380 memmove(&tr->trim.extents[insert_index + 1],
3381 &tr->trim.extents[insert_index + replace_count],
3382 (tr->trim.extent_count - insert_index - replace_count) * sizeof(dk_extent_t));
3383 }
3384 tr->trim.extent_count -= replace_count - 1;
3385
3386 if (jnl_kdebug)
3387 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0);
3388 return 0;
3389 }
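
/*
 * Usage sketch (illustrative only; the block numbers and size are made
 * up): an FS freeing eight contiguous 4 KiB blocks inside a transaction
 * might do:
 *
 *	journal_start_transaction(jnl);
 *	// ... clear blocks 100..107 in the allocation bitmap ...
 *	journal_trim_add_extent(jnl, 100 * 4096ULL, 8 * 4096ULL);
 *	journal_end_transaction(jnl);
 *
 * The extent is only unmapped (and reported through the trim callback)
 * after the transaction has been committed to the on-disk journal.
 */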
3390
3391 /*
3392 * journal_trim_extent_overlap
3393 *
3394 * Return 1 if there are any pending TRIMs that overlap with the given offset and length.
3395 * Return 0 otherwise.
3396 */
3397
3398 int journal_trim_extent_overlap (journal *jnl, uint64_t offset, uint64_t length, uint64_t *end) {
3399 transaction *tr = NULL;
3400 int overlap = 0;
3401
3402 uint64_t overlap_start;
3403 uint64_t overlap_len;
3404 tr = jnl->active_tr;
3405 CHECK_TRANSACTION(tr);
3406
3407 /*
3408 * There are two lists that need to be examined for potential overlaps:
3409 *
3410 * The first is the current transaction. Since this function requires that
3411 * a transaction be active when this is called, this is the "active_tr"
3412 * pointer in the journal struct. This has a trimlist pointer which needs
3413 * to be searched.
3414 */
3415 overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len);
3416 if (overlap == 0) {
3417 /*
3418 * The second is the async trim list, which is only done if the current
3419 * transaction group (active transaction) did not overlap with our target
3420 * extent. This async trim list is the set of all previously
3421 * committed transaction groups whose I/Os are now in-flight. We need to hold the
3422 * trim lock in order to search this list. If we grab the list before the
3423 * TRIM has completed, then we will search it. If it is grabbed AFTER the
3424 * TRIM has completed, then the pointer will be zeroed out and we won't have
3425 * to check anything.
3426 */
3427 lck_rw_lock_shared (&jnl->trim_lock);
3428 if (jnl->async_trim != NULL) {
3429 overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len);
3430 }
3431 lck_rw_unlock_shared (&jnl->trim_lock);
3432 }
3433
3434 if (overlap) {
3435 /* compute the end (min) of the overlapping range */
3436 if ( (overlap_start + overlap_len) < (offset + length)) {
3437 *end = (overlap_start + overlap_len);
3438 }
3439 else {
3440 *end = (offset + length);
3441 }
3442 }
3443
3444
3445 return overlap;
3446 }
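
/*
 * Illustrative use (hypothetical caller): before overwriting
 * [offset, offset+length), clamp the range against any pending TRIM.
 * Note that *end receives the smaller of the overlap's end and the
 * input range's end:
 *
 *	uint64_t end;
 *	if (journal_trim_extent_overlap(jnl, offset, length, &end)) {
 *		// some of [offset, end) still has an unmap in flight
 *	}
 */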
3447
3448 /*
3449 * journal_request_immediate_flush
3450 *
3451 * FS requests that the journal flush immediately upon the
3452 * active transaction's completion.
3453 *
3454 * Returns 0 if the operation succeeds.
3455 * Returns EPERM if we failed to leave the hint.
3456 */
3457 int
3458 journal_request_immediate_flush (journal *jnl) {
3459
3460 transaction *tr = NULL;
3461 /*
3462 * Is a transaction still in progress? This must be
3463 * called while a transaction is open.
3464 */
3465 tr = jnl->active_tr;
3466 if (tr != NULL) {
3467 CHECK_TRANSACTION(tr);
3468 tr->flush_on_completion = TRUE;
3469 }
3470 else {
3471 return EPERM;
3472 }
3473 return 0;
3474 }
3475
3476
3477
3478 /*
3479 ;________________________________________________________________________________
3480 ;
3481 ; Routine: trim_remove_extent
3482 ;
3483 ; Function: Indicate that a range of bytes, some of which may have previously
3484 ; been passed to journal_trim_add_extent, is now allocated.
3485 ; Any overlapping ranges currently in the journal's trim list will
3486 ; be removed. If the underlying device supports TRIM (UNMAP), then
3487 ; these extents will not be trimmed/unmapped when the transaction
3488 ; is written to the journal.
3489 ;
3490 ; HFS also uses this to prevent newly allocated space from being
3491 ; added to its free extent cache (if some portion of the newly
3492 ; allocated space was recently freed).
3493 ;
3494 ; Input Arguments:
3495 ; trim - The trim list to update.
3496 ; offset - The first byte of the range to be trimmed.
3497 ; length - The number of bytes of the extent being trimmed.
3498 ;________________________________________________________________________________
3499 */
3500 static int
3501 trim_remove_extent(journal *jnl, struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
3502 {
3503 u_int64_t end;
3504 dk_extent_t *extent;
3505 u_int32_t keep_before;
3506 u_int32_t keep_after;
3507
3508 end = offset + length;
3509
3510 /*
3511 * Find any existing extents that start before or end after the input
3512 * extent. These extents will be modified if they overlap the input
3513 * extent. Other extents between them will be deleted.
3514 */
3515 extent = trim->extents;
3516 keep_before = 0;
3517 while (keep_before < trim->extent_count && extent->offset < offset) {
3518 ++keep_before;
3519 ++extent;
3520 }
3521 keep_after = keep_before;
3522 if (keep_after > 0) {
3523 /* See if previous extent extends beyond both ends of input extent. */
3524 --keep_after;
3525 --extent;
3526 }
3527 while (keep_after < trim->extent_count && (extent->offset + extent->length) <= end) {
3528 ++keep_after;
3529 ++extent;
3530 }
3531
3532 /*
3533 * When we get here, the first keep_before extents (0 .. keep_before-1)
3534 * start before the input extent, and extents (keep_after .. extent_count-1)
3535 * end after the input extent. We'll need to keep all of those extents,
3536 * but possibly modify #(keep_before-1) and #keep_after to remove the portion
3537 * that overlaps with the input extent.
3538 */
3539
3540 /*
3541 * Does the input extent start after and end before the same existing
3542 * extent? If so, we have to "punch a hole" in that extent and convert
3543 * it to two separate extents.
3544 */
3545 if (keep_before > keep_after) {
3546 /* If the list was already full, we need to grow it. */
3547 if (trim->extent_count == trim->allocated_count) {
3548 if (trim_realloc(jnl, trim) != 0) {
3549 printf("jnl: trim_remove_extent: out of memory!");
3550 return ENOMEM;
3551 }
3552 }
3553
3554 /*
3555 * Make room for a new extent by shifting extents #keep_after and later
3556 * up by one position. When we're done, extents #keep_before and
3557 * #keep_after will be identical, and we can fall through to removing
3558 * the portion that overlaps the input extent.
3559 */
3560 memmove(&trim->extents[keep_before],
3561 &trim->extents[keep_after],
3562 (trim->extent_count - keep_after) * sizeof(dk_extent_t));
3563 ++trim->extent_count;
3564 ++keep_after;
3565
3566 /*
3567 * Fall through. We now have the case where the length of extent
3568 * #(keep_before - 1) needs to be updated, and the start of extent
3569 * #(keep_after) needs to be updated.
3570 */
3571 }
3572
3573 /*
3574 * May need to truncate the end of extent #(keep_before - 1) if it overlaps
3575 * the input extent.
3576 */
3577 if (keep_before > 0) {
3578 extent = &trim->extents[keep_before - 1];
3579 if (extent->offset + extent->length > offset) {
3580 extent->length = offset - extent->offset;
3581 }
3582 }
3583
3584 /*
3585 * May need to update the start of extent #(keep_after) if it overlaps the
3586 * input extent.
3587 */
3588 if (keep_after < trim->extent_count) {
3589 extent = &trim->extents[keep_after];
3590 if (extent->offset < end) {
3591 extent->length = extent->offset + extent->length - end;
3592 extent->offset = end;
3593 }
3594 }
3595
3596 /*
3597 * If there were whole extents that overlapped the input extent, get rid
3598 * of them by shifting any following extents, and updating the count.
3599 */
3600 if (keep_after > keep_before && keep_after < trim->extent_count) {
3601 memmove(&trim->extents[keep_before],
3602 &trim->extents[keep_after],
3603 (trim->extent_count - keep_after) * sizeof(dk_extent_t));
3604 }
3605 trim->extent_count -= keep_after - keep_before;
3606
3607 return 0;
3608 }
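
/*
 * Worked example (illustrative): removing offset=40, length=20 from a
 * trim list holding the single extent {0,100} takes the hole-punch path
 * above: keep_before = 1 and keep_after = 0, so the extent is
 * duplicated, the first copy is truncated to {0,40}, and the second has
 * its start moved up to yield {60,40}, leaving { {0,40}, {60,40} }.
 */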
3609
3610 /*
3611 ;________________________________________________________________________________
3612 ;
3613 ; Routine: journal_trim_remove_extent
3614 ;
3615 ; Function: Make note that a range of bytes, some of which may have previously
3616 ; been passed to journal_trim_add_extent, is now in use on the
3617 ; volume. The given bytes will not be trimmed as part of
3618 ; this transaction, nor as part of a pending trim of a
3619 ; transaction being asynchronously flushed.
3620 ;
3621 ; Input Arguments:
3622 ; jnl - The journal for the volume containing the byte range.
3623 ; offset - The first byte of the range to be trimmed.
3624 ; length - The number of bytes of the extent being trimmed.
3625 ;________________________________________________________________________________
3626 */
3627 __private_extern__ int
3628 journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
3629 {
3630 int error = 0;
3631 transaction *tr;
3632
3633 CHECK_JOURNAL(jnl);
3634
3635 /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */
3636 if (jnl->flags & JOURNAL_INVALID) {
3637 return EINVAL;
3638 }
3639
3640 tr = jnl->active_tr;
3641 CHECK_TRANSACTION(tr);
3642
3643 if (jnl_kdebug)
3644 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
3645
3646 if (jnl->owner != current_thread()) {
3647 panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
3648 jnl, jnl->owner, current_thread());
3649 }
3650
3651 free_old_stuff(jnl);
3652
3653 error = trim_remove_extent(jnl, &tr->trim, offset, length);
3654 if (error == 0) {
3655 int found = FALSE;
3656
3657 /*
3658 * See if a pending trim has any extents that overlap with the
3659 * one we were given.
3660 */
3661 lck_rw_lock_shared(&jnl->trim_lock);
3662 if (jnl->async_trim != NULL)
3663 found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL);
3664 lck_rw_unlock_shared(&jnl->trim_lock);
3665
3666 if (found) {
3667 /*
3668 * There was an overlap, so avoid trimming the extent we
3669 * just allocated. (Otherwise, it might get trimmed after
3670 * we've written to it, which will cause that data to be
3671 * corrupted.)
3672 */
3673 uint32_t async_extent_count = 0;
3674
3675 if (jnl_kdebug)
3676 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, 0, 0);
3677 lck_rw_lock_exclusive(&jnl->trim_lock);
3678 if (jnl->async_trim != NULL) {
3679 error = trim_remove_extent(jnl, jnl->async_trim, offset, length);
3680 async_extent_count = jnl->async_trim->extent_count;
3681 }
3682 lck_rw_unlock_exclusive(&jnl->trim_lock);
3683 if (jnl_kdebug)
3684 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_END, error, 0, 0, async_extent_count, 0);
3685 }
3686 }
3687
3688 if (jnl_kdebug)
3689 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_END, error, 0, 0, tr->trim.extent_count, 0);
3690 return error;
3691 }
3692
3693
3694 static int
3695 journal_trim_flush(journal *jnl, transaction *tr)
3696 {
3697 int errno = 0;
3698 boolean_t was_vm_privileged = FALSE;
3699
3700 if (jnl_kdebug)
3701 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
3702
3703 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
3704 /*
3705 * the disk driver can allocate memory on this path...
3706 * if we block waiting for memory, and there is enough pressure to
3707 * cause us to try and create a new swap file, we may end up deadlocking
3708 * due to waiting for the journal on the swap file creation path...
3709 * by making ourselves vm_privileged, we give ourselves the best chance
3710 * of not blocking
3711 */
3712 was_vm_privileged = set_vm_privilege(TRUE);
3713 }
3714 lck_rw_lock_shared(&jnl->trim_lock);
3715 if (tr->trim.extent_count > 0) {
3716 dk_unmap_t unmap;
3717
3718 bzero(&unmap, sizeof(unmap));
3719 if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) {
3720 unmap.extents = tr->trim.extents;
3721 unmap.extentsCount = tr->trim.extent_count;
3722 if (jnl_kdebug)
3723 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
3724 errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel());
3725 if (jnl_kdebug)
3726 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0);
3727 }
3728
3729 /*
3730 * Call back into the file system to tell them that we have
3731 * trimmed some extents and that they can now be reused.
3732 *
3733 * CAUTION: If the journal becomes invalid (e.g., due to an I/O
3734 * error when trying to write to the journal), this callback
3735 * will stop getting called, even if extents got freed before
3736 * the journal became invalid!
3737 */
3738 if (jnl->trim_callback)
3739 jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents);
3740 }
3741 lck_rw_unlock_shared(&jnl->trim_lock);
3742
3743 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
3744 set_vm_privilege(FALSE);
3745 /*
3746 * If the transaction we're flushing was the async transaction, then
3747 * tell the current transaction that there is no pending trim
3748 * any more.
3749 *
3750 * NOTE: Since we released the lock, another thread could have
3751 * removed one or more extents from our list. That's not a
3752 * problem since any writes to the re-allocated blocks
3753 * would get sent to the device after the DKIOCUNMAP.
3754 */
3755 lck_rw_lock_exclusive(&jnl->trim_lock);
3756 if (jnl->async_trim == &tr->trim)
3757 jnl->async_trim = NULL;
3758 lck_rw_unlock_exclusive(&jnl->trim_lock);
3759
3760 /*
3761 * By the time we get here, no other thread can discover the address
3762 * of "tr", so it is safe for us to manipulate tr->trim without
3763 * holding any locks.
3764 */
3765 if (tr->trim.extents) {
3766 kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
3767 tr->trim.allocated_count = 0;
3768 tr->trim.extent_count = 0;
3769 tr->trim.extents = NULL;
3770 }
3771
3772 if (jnl_kdebug)
3773 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_END, errno, 0, 0, 0, 0);
3774
3775 return errno;
3776 }
3777
3778 static int
3779 journal_binfo_cmp(const void *a, const void *b)
3780 {
3781 const block_info *bi_a = (const struct block_info *)a;
3782 const block_info *bi_b = (const struct block_info *)b;
3783 daddr64_t res;
3784
3785 if (bi_a->bnum == (off_t)-1) {
3786 return 1;
3787 }
3788 if (bi_b->bnum == (off_t)-1) {
3789 return -1;
3790 }
3791
3792 // don't have to worry about negative block
3793 // numbers so this is ok to do.
3794 //
3795 res = (buf_blkno(bi_a->u.bp) - buf_blkno(bi_b->u.bp));
3796
3797 return (int)res;
3798 }
3799
3800
3801 /*
3802 * End a transaction. If the transaction is small enough, and we're not forcing
3803 * a write to disk, the "active" transaction becomes the "current" transaction,
3804 * and will be reused for the next transaction that is started (group commit).
3805 *
3806 * If the transaction gets written to disk (because force_it is true, or no
3807 * group commit, or the transaction is sufficiently full), the blocks get
3808 * written into the journal first, then they are written asynchronously. When
3809 * those async writes complete, the transaction can be freed and removed from
3810 * the journal.
3811 *
3812 * An optional callback can be supplied. If given, it is called after the
3813 * blocks have been written to the journal, but before the async writes
3814 * of those blocks to their normal on-disk locations. This is used by
3815 * journal_relocate so that the location of the journal can be changed and
3816 * flushed to disk before the blocks get written to their normal locations.
3817 * Note that the callback is only called if the transaction gets written to
3818 * the journal during this end_transaction call; you probably want to set the
3819 * force_it flag.
3820 *
3821 * Inputs:
3822 * tr Transaction to add to the journal
3823 * force_it If true, force this transaction to the on-disk journal immediately.
3824 * callback See description above. Pass NULL for no callback.
3825 * callback_arg Argument passed to callback routine.
3826 *
3827 * Result
3828 * 0 No errors
3829 * -1 An error occurred. The journal is marked invalid.
3830 */
3831 static int
3832 end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait)
3833 {
3834 block_list_header *blhdr=NULL, *next=NULL;
3835 int i, ret_val = 0;
3836 errno_t errno;
3837 journal *jnl = tr->jnl;
3838 struct buf *bp;
3839 size_t tbuffer_offset;
3840 boolean_t drop_lock_early;
3841
3842 if (jnl->cur_tr) {
3843 panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
3844 jnl, jnl->cur_tr, tr);
3845 }
3846
3847 // if there weren't any modified blocks in the transaction
3848 // just save off the transaction pointer and return.
3849 if (tr->total_bytes == jnl->jhdr->blhdr_size) {
3850 jnl->cur_tr = tr;
3851 goto done;
3852 }
3853
3854 // if our transaction buffer isn't very full, just hang
3855 // on to it and don't actually flush anything. this is
3856 // what is known as "group commit". we will flush the
3857 // transaction buffer if it's full or if we have more than
3858 // one of them so we don't start hogging too much memory.
3859 //
3860 // We also check the device supports UNMAP/TRIM, and if so,
3861 // the number of extents waiting to be trimmed. If it is
3862 // small enough, then keep accumulating more (so we can
3863 // reduce the overhead of trimming). If there was a prior
3864 // trim error, then we stop issuing trims for this
3865 // volume, so we can also coalesce transactions.
3866 //
3867 if ( force_it == 0
3868 && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
3869 && tr->num_blhdrs < 3
3870 && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))
3871 && (!(jnl->flags & JOURNAL_USE_UNMAP) || (tr->trim.extent_count < jnl_trim_flush_limit))) {
3872
3873 jnl->cur_tr = tr;
3874 goto done;
3875 }
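
	/*
	 * (Illustrative numbers only: with a 128 KiB tbuffer and a single
	 * block list header, the test above keeps buffering until
	 * total_bytes exceeds 128 KiB - 16 KiB = 112 KiB, a third blhdr
	 * is needed, or -- when UNMAP is in use -- the trim list reaches
	 * jnl_trim_flush_limit extents.)
	 */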
3876
3877 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_START, jnl, tr, drop_lock, must_wait, 0);
3878
3879 lock_condition(jnl, &jnl->flushing, "end_transaction");
3880
3881 /*
3882 * if the previous 'finish_end_transaction' was being run
3883 * asynchronously, it could have encountered a condition
3884 * that caused it to mark the journal invalid... if that
3885 * occurred while we were waiting for it to finish, we
3886 * need to notice and abort the current transaction
3887 */
3888 if ((jnl->flags & JOURNAL_INVALID) || jnl->flush_aborted == TRUE) {
3889 unlock_condition(jnl, &jnl->flushing);
3890
3891 abort_transaction(jnl, tr);
3892 ret_val = -1;
3893 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
3894 goto done;
3895 }
3896
3897 /*
3898 * Store a pointer to this transaction's trim list so that
3899 * future transactions can find it.
3900 *
3901 * Note: if there are no extents in the trim list, then don't
3902 * bother saving the pointer since nothing can add new extents
3903 * to the list (and other threads/transactions only care if
3904 * there is a trim pending).
3905 */
3906 lck_rw_lock_exclusive(&jnl->trim_lock);
3907 if (jnl->async_trim != NULL)
3908 panic("jnl: end_transaction: async_trim already non-NULL!");
3909 if (tr->trim.extent_count > 0)
3910 jnl->async_trim = &tr->trim;
3911 lck_rw_unlock_exclusive(&jnl->trim_lock);
3912
3913 /*
3914 * snapshot the transaction sequence number while we are still behind
3915 * the journal lock since it will be bumped upon the start of the
3916 * next transaction group which may overlap the current journal flush...
3917 * we pass the snapshot into write_journal_header during the journal
3918 * flush so that it can write the correct version in the header...
3919 * because we hold the 'flushing' condition variable for the duration
3920 * of the journal flush, 'saved_sequence_num' remains stable
3921 */
3922 jnl->saved_sequence_num = jnl->sequence_num;
3923
3924 /*
3925 * if we're here we're going to flush the transaction buffer to disk.
3926 * 'check_free_space' will not return until there is enough free
3927 * space for this transaction in the journal and jnl->old_start[0]
3928 * is available for use
3929 */
3930 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0);
3931
3932 check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num);
3933
3934 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, tr->delayed_header_write, 0, 0, 0);
3935
3936 // range check the end index
3937 if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
3938 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
3939 jnl->jhdr->end, jnl->jhdr->size);
3940 }
3941 if (tr->delayed_header_write == TRUE) {
3942 thread_t thread = THREAD_NULL;
3943
3944 lock_condition(jnl, &jnl->writing_header, "end_transaction");
3945 /*
3946 * fire up a thread to write the journal header
3947 * asynchronously... when it finishes, it will call
3948 * unlock_condition... we can overlap the preparation of
3949 * the log and buffers during this time
3950 */
3951 kernel_thread_start((thread_continue_t)write_header_thread, jnl, &thread);
3952 } else
3953 jnl->write_header_failed = FALSE;
3954
3955
3956 // this transaction starts where the current journal ends
3957 tr->journal_start = jnl->jhdr->end;
3958
3959 lock_oldstart(jnl);
3960 /*
3961 * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy.
3962 * Slide everyone else down and put our latest guy in the last
3963 * entry in the old_start array.
3964 */
3965 memcpy(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void *), sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
3966 jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
3967
3968 unlock_oldstart(jnl);
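
	/*
	 * (The top bit set on the old_start entry above marks the
	 * transaction as still in flight; it is cleared once all of the
	 * transaction's buffers have been flushed -- see
	 * buffer_flushed_callback -- or stripped if the journal is
	 * invalidated.)
	 */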
3969
3970
3971 for (blhdr = tr->blhdr; blhdr; blhdr = next) {
3972 char *blkptr;
3973 buf_t sbp;
3974 int32_t bsize;
3975
3976 tbuffer_offset = jnl->jhdr->blhdr_size;
3977
3978 for (i = 1; i < blhdr->num_blocks; i++) {
3979
3980 if (blhdr->binfo[i].bnum != (off_t)-1) {
3981 void (*func)(buf_t, void *);
3982 void *arg;
3983
3984 bp = blhdr->binfo[i].u.bp;
3985
3986 if (bp == NULL) {
3987 panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
3988 blhdr->binfo[i].bnum, jnl, tr);
3989 }
3990 /*
3991 * acquire the bp here so that we can safely
3992 * mess around with its data. buf_acquire()
3993 * will return EAGAIN if the buffer was busy,
3994 * so loop trying again.
3995 */
3996 do {
3997 errno = buf_acquire(bp, BAC_REMOVE, 0, 0);
3998 } while (errno == EAGAIN);
3999
4000 if (errno)
4001 panic("could not acquire bp %p (err %d)\n", bp, errno);
4002
4003 if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
4004 if (jnl->flags & JOURNAL_CLOSE_PENDING) {
4005 buf_clearflags(bp, B_LOCKED);
4006 buf_brelse(bp);
4007
4008 /*
4009 * this is an odd case that appears to happen occasionally.
4010 * make sure we mark this block as no longer valid
4011 * so that we don't process it in "finish_end_transaction", since
4012 * the bp that is recorded in our array no longer belongs
4013 * to us (normally we substitute a shadow bp to be processed);
4014 * issuing a 'buf_bawrite' on a stale buf_t pointer leads
4015 * to all kinds of problems.
4016 */
4017 blhdr->binfo[i].bnum = (off_t)-1;
4018 continue;
4019 } else {
4020 panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp));
4021 }
4022 }
4023 bsize = buf_size(bp);
4024
4025 buf_setfilter(bp, NULL, NULL, &func, &arg);
4026
4027 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
4028
4029 sbp = buf_create_shadow_priv(bp, FALSE, (uintptr_t)blkptr, 0, 0);
4030
4031 if (sbp == NULL)
4032 panic("jnl: buf_create_shadow returned NULL");
4033
4034 /*
4035 * copy the data into the transaction buffer...
4036 */
4037 memcpy(blkptr, (char *)buf_dataptr(bp), bsize);
4038
4039 buf_clearflags(bp, B_LOCKED);
4040 buf_markclean(bp);
4041 buf_drop(bp);
4042
4043 /*
4044 * adopt the shadow buffer for this block
4045 */
4046 if (func) {
4047 /*
4048 * transfer FS hook function to the
4049 * shadow buffer... it will get called
4050 * in finish_end_transaction
4051 */
4052 buf_setfilter(sbp, func, arg, NULL, NULL);
4053 }
4054 blhdr->binfo[i].u.bp = sbp;
4055
4056 } else {
4057 // bnum == -1, only true if a block was "killed"
4058 bsize = blhdr->binfo[i].u.bi.bsize;
4059 }
4060 tbuffer_offset += bsize;
4061 }
4062 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
4063 }
4064 /*
4065 * if callback != NULL, we don't want to drop the journal
4066 * lock, or complete end_transaction asynchronously, since
4067 * the caller is expecting the callback to run in the calling
4068 * context
4069 *
4070 * if drop_lock == FALSE, we can't complete end_transaction
4071 * asynchronously
4072 */
4073 if (callback)
4074 drop_lock_early = FALSE;
4075 else
4076 drop_lock_early = drop_lock;
4077
4078 if (drop_lock_early == FALSE)
4079 must_wait = TRUE;
4080
4081 if (drop_lock_early == TRUE) {
4082 journal_unlock(jnl);
4083 drop_lock = FALSE;
4084 }
4085 if (must_wait == TRUE)
4086 ret_val = finish_end_transaction(tr, callback, callback_arg);
4087 else {
4088 thread_t thread = THREAD_NULL;
4089
4090 /*
4091 * fire up a thread to complete processing this transaction
4092 * asynchronously... when it finishes, it will call
4093 * unlock_condition
4094 */
4095 kernel_thread_start((thread_continue_t)finish_end_thread, tr, &thread);
4096 }
4097 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
4098 done:
4099 if (drop_lock == TRUE) {
4100 journal_unlock(jnl);
4101 }
4102 return (ret_val);
4103 }
4104
4105
4106 static void
4107 finish_end_thread(transaction *tr)
4108 {
4109 proc_set_task_policy(current_task(), current_thread(),
4110 TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
4111
4112 finish_end_transaction(tr, NULL, NULL);
4113
4114 thread_deallocate(current_thread());
4115 thread_terminate(current_thread());
4116 }
4117
4118 static void
4119 write_header_thread(journal *jnl)
4120 {
4121 proc_set_task_policy(current_task(), current_thread(),
4122 TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
4123
4124 if (write_journal_header(jnl, 1, jnl->saved_sequence_num))
4125 jnl->write_header_failed = TRUE;
4126 else
4127 jnl->write_header_failed = FALSE;
4128 unlock_condition(jnl, &jnl->writing_header);
4129
4130 thread_deallocate(current_thread());
4131 thread_terminate(current_thread());
4132 }
4133
4134 static int
4135 finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg)
4136 {
4137 int i, amt;
4138 int ret = 0;
4139 off_t end;
4140 journal *jnl = tr->jnl;
4141 buf_t bp, *bparray;
4142 vnode_t vp;
4143 block_list_header *blhdr=NULL, *next=NULL;
4144 size_t tbuffer_offset;
4145 int bufs_written = 0;
4146 int ret_val = 0;
4147 boolean_t was_vm_privileged = FALSE;
4148
4149 KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_START, jnl, tr, 0, 0, 0);
4150
4151 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
4152 /*
4153 * if we block waiting for memory, and there is enough pressure to
4154 * cause us to try and create a new swap file, we may end up deadlocking
4155 * due to waiting for the journal on the swap file creation path...
4156 * by making ourselves vm_privileged, we give ourselves the best chance
4157 * of not blocking
4158 */
4159 was_vm_privileged = set_vm_privilege(TRUE);
4160 }
4161 end = jnl->jhdr->end;
4162
4163 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
4164
4165 amt = blhdr->bytes_used;
4166
4167 blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num;
4168
4169 blhdr->checksum = 0;
4170 blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
4171
4172 if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *), VM_KERN_MEMORY_FILE)) {
4173 panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *));
4174 }
4175 tbuffer_offset = jnl->jhdr->blhdr_size;
4176
4177 for (i = 1; i < blhdr->num_blocks; i++) {
4178 void (*func)(buf_t, void *);
4179 void *arg;
4180 int32_t bsize;
4181
4182 /*
4183 * finish preparing the shadow buf_t before
4184 * calculating the individual block checksums
4185 */
4186 if (blhdr->binfo[i].bnum != (off_t)-1) {
4187 daddr64_t blkno;
4188 daddr64_t lblkno;
4189
4190 bp = blhdr->binfo[i].u.bp;
4191
4192 vp = buf_vnode(bp);
4193 blkno = buf_blkno(bp);
4194 lblkno = buf_lblkno(bp);
4195
4196 if (vp == NULL && lblkno == blkno) {
4197 printf("jnl: %s: end_tr: bad news! buffer w/null vp and l/blkno = %qd/%qd. aborting the transaction.\n",
4198 jnl->jdev_name, lblkno, blkno);
4199 ret_val = -1;
4200 goto bad_journal;
4201 }
4202
4203 // if the lblkno is the same as blkno and this bp isn't
4204 // associated with the underlying file system device then
4205 // we need to call bmap() to get the actual physical block.
4206 //
4207 if ((lblkno == blkno) && (vp != jnl->fsdev)) {
4208 off_t f_offset;
4209 size_t contig_bytes;
4210
4211 if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) {
4212 printf("jnl: %s: end_tr: vnop_blktooff failed\n", jnl->jdev_name);
4213 ret_val = -1;
4214 goto bad_journal;
4215 }
4216 if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) {
4217 printf("jnl: %s: end_tr: can't blockmap the buffer", jnl->jdev_name);
4218 ret_val = -1;
4219 goto bad_journal;
4220 }
4221 if ((uint32_t)contig_bytes < buf_count(bp)) {
4222 printf("jnl: %s: end_tr: blk not physically contiguous on disk\n", jnl->jdev_name);
4223 ret_val = -1;
4224 goto bad_journal;
4225 }
4226 buf_setblkno(bp, blkno);
4227 }
4228 // update this so we write out the correct physical block number!
4229 blhdr->binfo[i].bnum = (off_t)(blkno);
4230
4231 /*
4232 * pick up the FS hook function (if any) and prepare
4233 * to fire this buffer off in the next pass
4234 */
4235 buf_setfilter(bp, buffer_flushed_callback, tr, &func, &arg);
4236
4237 if (func) {
4238 /*
4239 * call the hook function supplied by the filesystem...
4240 * this needs to happen BEFORE calc_checksum in case
4241 * the FS morphs the data in the buffer
4242 */
4243 func(bp, arg);
4244 }
4245 bparray[i] = bp;
4246 bsize = buf_size(bp);
4247 blhdr->binfo[i].u.bi.bsize = bsize;
4248 blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], bsize);
4249 } else {
4250 bparray[i] = NULL;
4251 bsize = blhdr->binfo[i].u.bi.bsize;
4252 blhdr->binfo[i].u.bi.b.cksum = 0;
4253 }
4254 tbuffer_offset += bsize;
4255 }
4256 /*
4257 * if we fired off the journal_write_header asynchronously in
4258 * 'end_transaction', we need to wait for its completion
4259 * before writing the actual journal data
4260 */
4261 wait_condition(jnl, &jnl->writing_header, "finish_end_transaction");
4262
4263 if (jnl->write_header_failed == FALSE)
4264 ret = write_journal_data(jnl, &end, blhdr, amt);
4265 else
4266 ret_val = -1;
4267 /*
4268 * put the bp pointers back so that we can
4269 * make the final pass on them
4270 */
4271 for (i = 1; i < blhdr->num_blocks; i++)
4272 blhdr->binfo[i].u.bp = bparray[i];
4273
4274 kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *));
4275
4276 if (ret_val == -1)
4277 goto bad_journal;
4278
4279 if (ret != amt) {
4280 printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n",
4281 jnl->jdev_name, ret, amt);
4282
4283 ret_val = -1;
4284 goto bad_journal;
4285 }
4286 }
4287 jnl->jhdr->end = end; // update where the journal now ends
4288 tr->journal_end = end; // the transaction ends here too
4289
4290 if (tr->journal_start == 0 || tr->journal_end == 0) {
4291 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
4292 tr->journal_start, tr->journal_end);
4293 }
4294
4295 if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) {
4296 ret_val = -1;
4297 goto bad_journal;
4298 }
4299 /*
4300 * If the caller supplied a callback, call it now that the blocks have been
4301 * written to the journal. This is used by journal_relocate so, for example,
4302 * the file system can change its pointer to the new journal.
4303 */
4304 if (callback != NULL && callback(callback_arg) != 0) {
4305 ret_val = -1;
4306 goto bad_journal;
4307 }
4308
4309 //
4310 // Send a DKIOCUNMAP for the extents trimmed by this transaction, and
4311 // free up the extent list.
4312 //
4313 journal_trim_flush(jnl, tr);
4314
4315 // the buffer_flushed_callback will only be called for the
4316 // real blocks that get flushed so we have to account for
4317 // the block_list_headers here.
4318 //
4319 tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
4320
4321 lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction");
4322
4323 //
4324 // setup for looping through all the blhdr's.
4325 //
4326 for (blhdr = tr->blhdr; blhdr; blhdr = next) {
4327 uint16_t num_blocks;
4328
4329 /*
4330 * grab this info ahead of issuing the buf_bawrites...
4331 * once the last one goes out, it's possible for blhdr
4332 * to be freed (especially if we get preempted) before
4333 * we do the last check of num_blocks or
4334 * grab the next blhdr pointer...
4335 */
4336 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
4337 num_blocks = blhdr->num_blocks;
4338
4339 /*
4340 * we can re-order the buf ptrs because everything is written out already
4341 */
4342 qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp);
4343
4344 /*
4345 * need to make sure that the loop issuing the buf_bawrite's
4346 * does not touch blhdr once the last buf_bawrite has been
4347 * issued... at that point, we no longer have a legitimate
4348 * reference on the associated storage since it will be
4349 * released upon the completion of that last buf_bawrite
4350 */
4351 for (i = num_blocks-1; i >= 1; i--) {
4352 if (blhdr->binfo[i].bnum != (off_t)-1)
4353 break;
4354 num_blocks--;
4355 }
4356 for (i = 1; i < num_blocks; i++) {
4357
4358 if ((bp = blhdr->binfo[i].u.bp)) {
4359 vp = buf_vnode(bp);
4360
4361 buf_bawrite(bp);
4362
4363 // this undoes the vnode_ref() in journal_modify_block_end()
4364 vnode_rele_ext(vp, 0, 1);
4365
4366 bufs_written++;
4367 }
4368 }
4369 }
4370 if (bufs_written == 0) {
4371 /*
4372 * since we didn't issue any buf_bawrite's, there is no
4373 * async trigger to cause the memory associated with this
4374 * transaction to be freed... so, move it to the garbage
4375 * list now
4376 */
4377 lock_oldstart(jnl);
4378
4379 tr->next = jnl->tr_freeme;
4380 jnl->tr_freeme = tr;
4381
4382 unlock_oldstart(jnl);
4383
4384 unlock_condition(jnl, &jnl->asyncIO);
4385 }
4386
4387 //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
4388 // tr, tr->journal_start, tr->journal_end);
4389
4390 bad_journal:
4391 if (ret_val == -1) {
4392 abort_transaction(jnl, tr); // cleans up list of extents to be trimmed
4393
4394 /*
4395 * 'flush_aborted' is protected by the flushing condition... we need to
4396 * set it before dropping the condition so that it will be
4397 * noticed in 'end_transaction'... we add this additional
4398 * aborted condition so that we can drop the 'flushing' condition
4399 * before grabbing the journal lock... this avoids a deadlock
4400 * in 'end_transaction' which is holding the journal lock while
4401 * waiting for the 'flushing' condition to clear...
4402 * everyone else will notice the JOURNAL_INVALID flag
4403 */
4404 jnl->flush_aborted = TRUE;
4405
4406 unlock_condition(jnl, &jnl->flushing);
4407 journal_lock(jnl);
4408
4409 jnl->flags |= JOURNAL_INVALID;
4410 jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
4411
4412 journal_unlock(jnl);
4413 } else
4414 unlock_condition(jnl, &jnl->flushing);
4415
4416 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
4417 set_vm_privilege(FALSE);
4418
4419 KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_END, jnl, tr, bufs_written, ret_val, 0);
4420
4421 return (ret_val);
4422 }
4423
4424
4425 static void
4426 lock_condition(journal *jnl, boolean_t *condition, const char *condition_name)
4427 {
4428
4429 KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_START, jnl, condition, 0, 0, 0);
4430
4431 lock_flush(jnl);
4432
4433 while (*condition == TRUE)
4434 msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL);
4435
4436 *condition = TRUE;
4437 unlock_flush(jnl);
4438
4439 KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_END, jnl, condition, 0, 0, 0);
4440 }
4441
4442 static void
4443 wait_condition(journal *jnl, boolean_t *condition, const char *condition_name)
4444 {
4445
4446 if (*condition == FALSE)
4447 return;
4448
4449 KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_START, jnl, condition, 0, 0, 0);
4450
4451 lock_flush(jnl);
4452
4453 while (*condition == TRUE)
4454 msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL);
4455
4456 unlock_flush(jnl);
4457
4458 KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_END, jnl, condition, 0, 0, 0);
4459 }
4460
4461 static void
4462 unlock_condition(journal *jnl, boolean_t *condition)
4463 {
4464 lock_flush(jnl);
4465
4466 *condition = FALSE;
4467 wakeup(condition);
4468
4469 unlock_flush(jnl);
4470 }
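
/*
 * (The three helpers above implement a simple ownership flag protected
 * by jnl->flock: lock_condition waits for the flag to clear and then
 * claims it, wait_condition only waits for it to clear, and
 * unlock_condition clears it and wakes any waiters.)
 */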
4471
4472 static void
4473 abort_transaction(journal *jnl, transaction *tr)
4474 {
4475 block_list_header *blhdr, *next;
4476
4477 // for each block list header, iterate over the blocks then
4478 // free up the memory associated with the block list.
4479 //
4480 // find each of the primary blocks (i.e. the list could
4481 // contain a mix of shadowed and real buf_t's depending
4482 // on when the abort condition was detected) and mark them
4483 // clean and locked in the cache... this at least allows
4484 * the FS a consistent view between its in-core data structures
4485 // and the meta-data held in the cache
4486 //
4487 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_START, jnl, tr, 0, 0, 0);
4488
4489 for (blhdr = tr->blhdr; blhdr; blhdr = next) {
4490 int i;
4491
4492 for (i = 1; i < blhdr->num_blocks; i++) {
4493 buf_t bp, tbp, sbp;
4494 vnode_t bp_vp;
4495 errno_t errno;
4496
4497 if (blhdr->binfo[i].bnum == (off_t)-1)
4498 continue;
4499
4500 tbp = blhdr->binfo[i].u.bp;
4501
4502 bp_vp = buf_vnode(tbp);
4503
4504 if (buf_shadow(tbp)) {
4505 sbp = tbp;
4506 buf_setfilter(tbp, NULL, NULL, NULL, NULL);
4507 } else {
4508 assert(ISSET(buf_flags(tbp), B_LOCKED));
4509
4510 sbp = NULL;
4511
4512 do {
4513 errno = buf_acquire(tbp, BAC_REMOVE, 0, 0);
4514 } while (errno == EAGAIN);
4515
4516 if (!errno) {
4517 buf_setfilter(tbp, NULL, NULL, NULL, NULL);
4518 buf_brelse(tbp);
4519 }
4520 }
4521
4522 if (bp_vp) {
4523 errno = buf_meta_bread(bp_vp,
4524 buf_lblkno(tbp),
4525 buf_size(tbp),
4526 NOCRED,
4527 &bp);
4528 if (errno == 0) {
4529 if (sbp == NULL && bp != tbp && (buf_flags(tbp) & B_LOCKED)) {
4530 panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n",
4531 bp, tbp, jnl);
4532 }
4533 /*
4534 * once the journal has been marked INVALID and aborted,
4535 * NO meta data can be written back to the disk, so
4536 * mark the buf_t clean and make sure it's locked in the cache
4537 * note: if we found a shadow, the real buf_t needs to be relocked
4538 */
4539 buf_setflags(bp, B_LOCKED);
4540 buf_markclean(bp);
4541 buf_brelse(bp);
4542
4543 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_NONE, jnl, tr, bp, 0, 0);
4544
4545 /*
4546 * this undoes the vnode_ref() in journal_modify_block_end()
4547 */
4548 vnode_rele_ext(bp_vp, 0, 1);
4549 } else {
4550 printf("jnl: %s: abort_tr: could not find block %lld for vnode!\n",
4551 jnl->jdev_name, blhdr->binfo[i].bnum);
4552 if (bp) {
4553 buf_brelse(bp);
4554 }
4555 }
4556 }
4557 if (sbp)
4558 buf_brelse(sbp);
4559 }
4560 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
4561
4562 // we can free blhdr here since we won't need it any more
4563 blhdr->binfo[0].bnum = 0xdeadc0de;
4564 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
4565 }
4566
4567 /*
4568 * If the transaction we're aborting was the async transaction, then
4569 * tell the current transaction that there is no pending trim
4570 * any more.
4571 */
4572 lck_rw_lock_exclusive(&jnl->trim_lock);
4573 if (jnl->async_trim == &tr->trim)
4574 jnl->async_trim = NULL;
4575 lck_rw_unlock_exclusive(&jnl->trim_lock);
4576
4577
4578 if (tr->trim.extents) {
4579 kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
4580 }
4581 tr->trim.allocated_count = 0;
4582 tr->trim.extent_count = 0;
4583 tr->trim.extents = NULL;
4584 tr->tbuffer = NULL;
4585 tr->blhdr = NULL;
4586 tr->total_bytes = 0xdbadc0de;
4587 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
4588
4589 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_END, jnl, tr, 0, 0, 0);
4590 }
4591
4592
4593 int
4594 journal_end_transaction(journal *jnl)
4595 {
4596 int ret;
4597 transaction *tr;
4598
4599 CHECK_JOURNAL(jnl);
4600
4601 free_old_stuff(jnl);
4602
4603 if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
4604 return 0;
4605 }
4606
4607 if (jnl->owner != current_thread()) {
4608 panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
4609 jnl, jnl->owner, current_thread());
4610 }
4611 jnl->nested_count--;
4612
4613 if (jnl->nested_count > 0) {
4614 return 0;
4615 } else if (jnl->nested_count < 0) {
4616 panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
4617 }
4618
4619 if (jnl->flags & JOURNAL_INVALID) {
4620 if (jnl->active_tr) {
4621 if (jnl->cur_tr != NULL) {
4622 panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
4623 jnl, jnl->active_tr, jnl->cur_tr);
4624 }
4625 tr = jnl->active_tr;
4626 jnl->active_tr = NULL;
4627
4628 abort_transaction(jnl, tr);
4629 }
4630 journal_unlock(jnl);
4631
4632 return EINVAL;
4633 }
4634
4635 tr = jnl->active_tr;
4636 CHECK_TRANSACTION(tr);
4637
4638 // clear this out here so that when check_free_space() calls
4639 // the FS flush function, we don't panic in journal_flush()
4640 // if the FS were to call that. note: check_free_space() is
4641 // called from end_transaction().
4642 //
4643 jnl->active_tr = NULL;
4644
4645 /* Examine the force-journal-flush state in the active txn */
4646 if (tr->flush_on_completion == TRUE) {
4647 /*
4648 * If the FS requested it, disallow group commit and force the
4649 * transaction out to disk immediately.
4650 */
4651 ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE);
4652 }
4653 else {
4654 /* in the common path we can simply use the double-buffered journal */
4655 ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
4656 }
4657
4658 return ret;
4659 }
4660
4661
4662 /*
4663 * Flush the contents of the journal to the disk.
4664 *
4665 * Input:
4666 * wait_for_IO -
4667 * If TRUE, wait to write in-memory journal to the disk
4668 * consistently, and also wait to write all asynchronous
4669 * metadata blocks to their corresponding locations
4670 * consistently on the disk. This means that the journal
4671 * is empty at this point and does not contain any
4672 * transactions. This is overkill in normal scenarios
4673 * but is useful whenever the metadata blocks are required
4674 * to be consistent on-disk instead of just the journal
4675 * being consistent, such as before live verification
4676 * and live volume resizing.
4677 *
4678 * If FALSE, only wait to write in-memory journal to the
4679 * disk consistently. This means that the journal still
4680 * contains uncommitted transactions and the file system
4681 * metadata blocks in the journal transactions might be
4682 * written asynchronously to the disk. But there is no
4683 * guarantee that they are written to the disk before
4684 * returning to the caller. Note that this option is
4685 * sufficient for file system data integrity as it
4686 * guarantees consistent journal content on the disk.
4687 */
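
/*
 * Illustrative calls (hypothetical call sites):
 *
 *	journal_flush(jnl, 0);                   // journal consistent on disk
 *	journal_flush(jnl, JOURNAL_WAIT_FOR_IO); // journal drained; metadata on disk
 *
 * Adding JOURNAL_FLUSH_FULL additionally issues DKIOCSYNCHRONIZE so
 * that the device's write cache is flushed unless another full flush
 * already happened in the meantime.
 */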
4688 int
4689 journal_flush(journal *jnl, journal_flush_options_t options)
4690 {
4691 boolean_t drop_lock = FALSE;
4692 errno_t error = 0;
4693 uint32_t flush_count;
4694
4695 CHECK_JOURNAL(jnl);
4696
4697 free_old_stuff(jnl);
4698
4699 if (jnl->flags & JOURNAL_INVALID) {
4700 return -1;
4701 }
4702
4703 KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl, 0, 0, 0, 0);
4704
4705 if (jnl->owner != current_thread()) {
4706 journal_lock(jnl);
4707 drop_lock = TRUE;
4708 }
4709
4710 if (ISSET(options, JOURNAL_FLUSH_FULL))
4711 flush_count = jnl->flush_counter;
4712
4713 // if we're not active, flush any buffered transactions
4714 if (jnl->active_tr == NULL && jnl->cur_tr) {
4715 transaction *tr = jnl->cur_tr;
4716
4717 jnl->cur_tr = NULL;
4718
4719 if (ISSET(options, JOURNAL_WAIT_FOR_IO)) {
4720 wait_condition(jnl, &jnl->flushing, "journal_flush");
4721 wait_condition(jnl, &jnl->asyncIO, "journal_flush");
4722 }
4723 /*
4724 * "end_transction" will wait for any current async flush
4725 * to complete, before flushing "cur_tr"... because we've
4726 * specified the 'must_wait' arg as TRUE, it will then
4727 * synchronously flush the "cur_tr"
4728 */
4729 end_transaction(tr, 1, NULL, NULL, drop_lock, TRUE); // force it to get flushed
4730
4731 } else {
4732 if (drop_lock == TRUE) {
4733 journal_unlock(jnl);
4734 }
4735
4736 /* Because of the pipelined journal, journal transactions
4737 * might be in the process of being flushed on another thread.
4738 * If there is nothing to flush currently, we should
4739 * synchronize ourselves with the pipelined journal thread
4740 * to ensure that all inflight transactions, if any, are
4741 * flushed before we return success to the caller.
4742 */
4743 wait_condition(jnl, &jnl->flushing, "journal_flush");
4744 }
4745 if (ISSET(options, JOURNAL_WAIT_FOR_IO)) {
4746 wait_condition(jnl, &jnl->asyncIO, "journal_flush");
4747 }
4748
4749 if (ISSET(options, JOURNAL_FLUSH_FULL)) {
4750
4751 dk_synchronize_t sync_request = {
4752 .options = 0,
4753 };
4754
4755 // We need a full cache flush. If it has not been done, do it here.
4756 if (flush_count == jnl->flush_counter)
4757 error = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, vfs_context_kernel());
4758
4759 // If external journal partition is enabled, flush filesystem data partition.
4760 if (jnl->jdev != jnl->fsdev)
4761 error = VNOP_IOCTL(jnl->fsdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, vfs_context_kernel());
4762
4763 }
4764
4765 KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_END, jnl, 0, 0, 0, 0);
4766
4767 return 0;
4768 }
4769
4770 int
4771 journal_active(journal *jnl)
4772 {
4773 if (jnl->flags & JOURNAL_INVALID) {
4774 return -1;
4775 }
4776
4777 return (jnl->active_tr == NULL) ? 0 : 1;
4778 }
4779
4780 void *
4781 journal_owner(journal *jnl)
4782 {
4783 return jnl->owner;
4784 }
4785
4786 int journal_uses_fua(journal *jnl)
4787 {
4788 if (jnl->flags & JOURNAL_DO_FUA_WRITES)
4789 return 1;
4790 return 0;
4791 }
4792
4793 /*
4794 * Relocate the journal.
4795 *
4796 * You provide the new starting offset and size for the journal. You may
4797 * optionally provide a new tbuffer_size; passing zero defaults to not
4798 * changing the tbuffer size except as needed to fit within the new journal
4799 * size.
4800 *
4801 * You must have already started a transaction. The transaction may contain
4802 * modified blocks (such as those needed to deallocate the old journal,
4803 * allocate the new journal, and update the location and size of the journal
4804 * in filesystem-private structures). Any transactions prior to the active
4805 * transaction will be flushed to the old journal. The new journal will be
4806 * initialized, and the blocks from the active transaction will be written to
4807 * the new journal.
4808 *
4809 * The caller will need to update the structures that identify the location
4810 * and size of the journal. These updates should be made in the supplied
4811 * callback routine. These updates must NOT go into a transaction. You should
4812 * force these updates to the media before returning from the callback. In the
4813 * event of a crash, either the old journal will be found and it will be empty,
4814 * or the new journal will be found with the contents of the active transaction.
4815 *
4816 * Upon return from the callback, the blocks from the active transaction are
4817 * written to their normal locations on disk.
4818 *
4819 * (Remember that we have to ensure that blocks get committed to the journal
4820 * before being committed to their normal locations. But the blocks don't count
4821 * as committed until the new journal is pointed at.)
4822 *
4823 * Upon return, there is still an active transaction: newly allocated, and
4824 * with no modified blocks. Call journal_end_transaction as normal. You may
4825 * modify additional blocks before calling journal_end_transaction, and those
4826 * blocks will (eventually) go to the relocated journal.
4827 *
4828 * Inputs:
4829 * jnl The (opened) journal to relocate.
4830 * offset The new journal byte offset (from start of the journal device).
4831 * journal_size The size, in bytes, of the new journal.
4832 * tbuffer_size The new desired transaction buffer size. Pass zero to keep
4833 * the same size as the current journal. The size will be
4834 * modified as needed to fit the new journal.
4835 * callback Routine called after the new journal has been initialized,
4836 * and the active transaction written to the new journal, but
4837 * before the blocks are written to their normal locations.
4838 * Pass NULL for no callback.
4839 * callback_arg An argument passed to the callback routine.
4840 *
4841 * Result:
4842 * 0 No errors
4843 * EINVAL The offset is not block aligned
4844 * EINVAL The journal_size is not a multiple of the block size
4845 * EINVAL The journal is invalid
4846 * (any) An error returned by journal_flush.
4847 *
4848 */
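
/*
 * Illustrative (hypothetical) callback sketch -- my_fs and
 * my_fs_write_volume_header are stand-ins for filesystem-private code:
 *
 *	static errno_t
 *	update_journal_location(void *arg)
 *	{
 *		struct my_fs *fs = arg;
 *		// record the new journal offset/size in the volume header
 *		// and synchronously force it to the media (no transaction!)
 *		return my_fs_write_volume_header(fs);
 *	}
 *
 *	ret = journal_relocate(jnl, new_offset, new_size, 0,
 *	                       update_journal_location, fs);
 */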
4849 int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size,
4850 errno_t (*callback)(void *), void *callback_arg)
4851 {
4852 int ret;
4853 transaction *tr;
4854 size_t i = 0;
4855
4856 /*
4857 * Sanity check inputs, and adjust the size of the transaction buffer.
4858 */
4859 if ((offset % jnl->jhdr->jhdr_size) != 0) {
4860 printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n",
4861 jnl->jdev_name, offset, jnl->jhdr->jhdr_size);
4862 return EINVAL;
4863 }
4864 if ((journal_size % jnl->jhdr->jhdr_size) != 0) {
4865 printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n",
4866 jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size);
4867 return EINVAL;
4868 }
4869
4870 CHECK_JOURNAL(jnl);
4871
4872 /* Guarantee we own the active transaction. */
4873 if (jnl->flags & JOURNAL_INVALID) {
4874 return EINVAL;
4875 }
4876 if (jnl->owner != current_thread()) {
4877 panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n",
4878 jnl, jnl->owner, current_thread());
4879 }
4880
4881 if (tbuffer_size == 0)
4882 tbuffer_size = jnl->tbuffer_size;
4883 size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size);
4884
4885 /*
4886 * Flush any non-active transactions. We have to temporarily hide the
4887 * active transaction to make journal_flush flush out non-active but
4888 * current (unwritten) transactions.
4889 */
4890 tr = jnl->active_tr;
4891 CHECK_TRANSACTION(tr);
4892 jnl->active_tr = NULL;
4893 ret = journal_flush(jnl, JOURNAL_WAIT_FOR_IO);
4894 jnl->active_tr = tr;
4895
4896 if (ret) {
4897 return ret;
4898 }
4899 wait_condition(jnl, &jnl->flushing, "end_transaction");
4900
4901 /*
4902 * At this point, we have completely flushed the contents of the current
4903 * journal to disk (and have asynchronously written all of the txns to
4904 * their actual desired locations). As a result, we can (and must) clear
4905 * out the old_start array. If we do not, then if the last written transaction
4906 * started at the beginning of the journal (starting 1 block into the
4907 * journal file), it could confuse the buffer_flushed callback. This is
4908 * because we're about to reset the start/end pointers of the journal header
4909 * below.
4910 */
4911 lock_oldstart(jnl);
4912 for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) {
4913 jnl->old_start[i] = 0;
4914 }
4915 unlock_oldstart(jnl);
4916
4917 /* Update the journal's offset and size in memory. */
4918 jnl->jdev_offset = offset;
4919 jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
4920 jnl->jhdr->size = journal_size;
4921 jnl->active_start = jnl->jhdr->start;
4922
4923 /*
4924 * Force the active transaction to be written to the new journal. Call the
4925 * supplied callback after the blocks have been written to the journal, but
4926 * before they get written to their normal on-disk locations.
4927 */
4928 jnl->active_tr = NULL;
4929 ret = end_transaction(tr, 1, callback, callback_arg, FALSE, TRUE);
4930 if (ret) {
4931 printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret);
4932 goto bad_journal;
4933 }
4934
4935 /*
4936 * Create a new, empty transaction to be the active transaction. This way
4937 * our caller can use journal_end_transaction as usual.
4938 */
4939 ret = journal_allocate_transaction(jnl);
4940 if (ret) {
4941 printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl->jdev_name, ret);
4942 goto bad_journal;
4943 }
4944
4945 return 0;
4946
4947 bad_journal:
4948 jnl->flags |= JOURNAL_INVALID;
4949 abort_transaction(jnl, tr);
4950 return ret;
4951 }
4952
4953 uint32_t journal_current_txn(journal *jnl)
4954 {
4955 return jnl->sequence_num + (jnl->active_tr || jnl->cur_tr ? 0 : 1);
4956 }
4957
4958 #else // !JOURNALING - so provide stub functions
4959
4960 int journal_uses_fua(__unused journal *jnl)
4961 {
4962 return 0;
4963 }
4964
4965 journal *
4966 journal_create(__unused struct vnode *jvp,
4967 __unused off_t offset,
4968 __unused off_t journal_size,
4969 __unused struct vnode *fsvp,
4970 __unused size_t min_fs_blksz,
4971 __unused int32_t flags,
4972 __unused int32_t tbuffer_size,
4973 __unused void (*flush)(void *arg),
4974 __unused void *arg,
4975 __unused struct mount *fsmount)
4976 {
4977 return NULL;
4978 }
4979
4980 journal *
4981 journal_open(__unused struct vnode *jvp,
4982 __unused off_t offset,
4983 __unused off_t journal_size,
4984 __unused struct vnode *fsvp,
4985 __unused size_t min_fs_blksz,
4986 __unused int32_t flags,
4987 __unused int32_t tbuffer_size,
4988 __unused void (*flush)(void *arg),
4989 __unused void *arg,
4990 __unused struct mount *fsmount)
4991 {
4992 return NULL;
4993 }
4994
4995
4996 int
4997 journal_modify_block_start(__unused journal *jnl, __unused struct buf *bp)
4998 {
4999 return EINVAL;
5000 }
5001
5002 int
5003 journal_modify_block_end(__unused journal *jnl,
5004 __unused struct buf *bp,
5005 __unused void (*func)(struct buf *bp, void *arg),
5006 __unused void *arg)
5007 {
5008 return EINVAL;
5009 }
5010
5011 int
5012 journal_kill_block(__unused journal *jnl, __unused struct buf *bp)
5013 {
5014 return EINVAL;
5015 }
5016
5017 int journal_relocate(__unused journal *jnl,
5018 __unused off_t offset,
5019 __unused off_t journal_size,
5020 __unused int32_t tbuffer_size,
5021 __unused errno_t (*callback)(void *),
5022 __unused void *callback_arg)
5023 {
5024 return EINVAL;
5025 }
5026
5027 void
5028 journal_close(__unused journal *jnl)
5029 {
5030 }
5031
5032 int
5033 journal_start_transaction(__unused journal *jnl)
5034 {
5035 return EINVAL;
5036 }
5037
5038 int
5039 journal_end_transaction(__unused journal *jnl)
5040 {
5041 return EINVAL;
5042 }
5043
5044 int
5045 journal_flush(__unused journal *jnl, __unused journal_flush_options_t options)
5046 {
5047 return EINVAL;
5048 }
5049
5050 int
5051 journal_is_clean(__unused struct vnode *jvp,
5052 __unused off_t offset,
5053 __unused off_t journal_size,
5054 __unused struct vnode *fsvp,
5055 __unused size_t min_fs_block_size)
5056 {
5057 return 0;
5058 }
5059
5060
5061 void *
5062 journal_owner(__unused journal *jnl)
5063 {
5064 return NULL;
5065 }
5066
5067 void
5068 journal_lock(__unused journal *jnl)
5069 {
5070 return;
5071 }
5072
5073 void
5074 journal_unlock(__unused journal *jnl)
5075 {
5076 return;
5077 }
5078
5079 __private_extern__ int
5080 journal_trim_add_extent(__unused journal *jnl,
5081 __unused uint64_t offset,
5082 __unused uint64_t length)
5083 {
5084 return 0;
5085 }
5086
5087 int
5088 journal_request_immediate_flush(__unused journal *jnl)
5089 {
5090 return 0;
5091 }
5092
5093 __private_extern__ int
5094 journal_trim_remove_extent(__unused journal *jnl,
5095 __unused uint64_t offset,
5096 __unused uint64_t length)
5097 {
5098 return 0;
5099 }
5100
5101 int journal_trim_extent_overlap(__unused journal *jnl,
5102 __unused uint64_t offset,
5103 __unused uint64_t length,
5104 __unused uint64_t *end)
5105 {
5106 return 0;
5107 }
5108
5109 #endif // !JOURNALING