X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/6d2010ae8f7a6078e10b361c6962983bab233e0f..fe8ab488e9161c46dd9885d58fc52996dc0249ff:/bsd/vfs/vfs_journal.c

diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c
index 4999f814b..628e5e7dc 100644
--- a/bsd/vfs/vfs_journal.c
+++ b/bsd/vfs/vfs_journal.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2002-2014 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -115,20 +115,22 @@ SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &
 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
 #endif
 
+
 #ifndef CONFIG_HFS_TRIM
 #define CONFIG_HFS_TRIM 0
 #endif
 
+
 #if JOURNALING
 
 //
-// By default, we grow the list of extents to trim by one page at a time.
+// By default, we grow the list of extents to trim by 4K at a time.
 // We'll opt to flush a transaction if it contains at least
 // JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number
 // of modified blocks is small).
 //
 enum {
-	JOURNAL_DEFAULT_TRIM_BYTES = PAGE_SIZE,
+	JOURNAL_DEFAULT_TRIM_BYTES = 4096,
 	JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t),
 	JOURNAL_FLUSH_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16
 };
@@ -136,8 +138,7 @@ enum {
 unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS;
 SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush");
 
-
-/* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */
+/* XXX next prototype should be from libsa/stdlib.h> but conflicts libkern */
 __private_extern__ void qsort(
 	void * array,
 	size_t nmembers,
@@ -161,8 +162,6 @@ static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(vo
 static void abort_transaction(journal *jnl, transaction *tr);
 static void dump_journal(journal *jnl);
 
-static __inline__ void lock_journal(journal *jnl);
-static __inline__ void unlock_journal(journal *jnl);
 static __inline__ void lock_oldstart(journal *jnl);
 static __inline__ void unlock_oldstart(journal *jnl);
 static __inline__ void lock_flush(journal *jnl);
@@ -247,10 +246,11 @@ static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, of
 // we use it to checksum the journal header and the block list
 // headers that are at the start of each transaction.
 //
-static int
+static unsigned int
 calc_checksum(char *ptr, int len)
 {
-	int i, cksum=0;
+	int i;
+	unsigned int cksum=0;
 
 	// this is a lame checksum but for now it'll do
 	for(i = 0; i < len; i++, ptr++) {
@@ -275,15 +275,20 @@ journal_init(void)
 	jnl_mutex_group  = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
 }
 
-static __inline__ void
-lock_journal(journal *jnl)
+__inline__ void
+journal_lock(journal *jnl)
 {
 	lck_mtx_lock(&jnl->jlock);
+	if (jnl->owner) {
+		panic ("jnl: owner is %p, expected NULL\n", jnl->owner);
+	}
+	jnl->owner = current_thread();
 }
 
-static __inline__ void
-unlock_journal(journal *jnl)
+__inline__ void
+journal_unlock(journal *jnl)
 {
+	jnl->owner = NULL;
 	lck_mtx_unlock(&jnl->jlock);
 }
 
@@ -333,6 +338,7 @@ do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction
 	size_t	io_sz = 0;
 	buf_t	bp;
 	off_t	max_iosize;
+	struct bufattr *bap;
 
 	if (*offset < 0 || *offset > jnl->jhdr->size) {
 		panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
@@ -368,6 +374,20 @@ again:
 		panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
 	}
 
+	/*
+	 * As alluded to in the block comment at the top of the function, we use a "fake" iobuf
+	 * here and issue directly to the disk device that the journal protects since we don't
+	 * want this to enter the block cache.  As a result, we lose the ability to mark it
+	 * as a metadata buf_t for the layers below us that may care.  If we were to
+	 * simply attach the B_META flag into the b_flags this may confuse things further
+	 * since this is an iobuf, not a metadata buffer.
+	 *
+	 * To address this, we use the extended bufattr struct embedded in the bp.
+	 * Explicitly mark the buf here as a metadata buffer in its bufattr flags.
+	 */
+	bap = &bp->b_attr;
+	bap->ba_flags |= BA_META;
+
 	if (direction & JNL_READ)
 		buf_setflags(bp, B_READ);
 	else {
@@ -570,9 +590,6 @@ buffer_flushed_callback(struct buf *bp, void *arg)
 	CHECK_TRANSACTION(tr);
 
 	jnl = tr->jnl;
-	if (jnl->flags & JOURNAL_INVALID) {
-		return;
-	}
 
 	CHECK_JOURNAL(jnl);
 
@@ -613,6 +630,9 @@ buffer_flushed_callback(struct buf *bp, void *arg)
 	// cleanup for this transaction
 	tr->total_bytes = 0xfbadc0de;
 
+	if (jnl->flags & JOURNAL_INVALID)
+		goto transaction_done;
+
 	//printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
 	//   tr, tr->journal_start, tr->journal_end, jnl);
 
@@ -708,6 +728,7 @@ buffer_flushed_callback(struct buf *bp, void *arg)
 		tr->next       = jnl->tr_freeme;
 		jnl->tr_freeme = tr;
 	}
+transaction_done:
 	unlock_oldstart(jnl);
 
 	unlock_condition(jnl, &jnl->asyncIO);
@@ -1090,7 +1111,8 @@ add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, _
 static int
 replay_journal(journal *jnl)
 {
-	int		i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0;
+	int		i, bad_blocks=0;
+	unsigned int	orig_checksum, checksum, check_block_checksums = 0;
 	size_t		ret;
 	size_t		max_bsize = 0;		/* protected by block_ptr */
 	block_list_header *blhdr;
@@ -1099,6 +1121,7 @@ replay_journal(journal *jnl)
 	struct bucket *co_buf;
 	int		num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
 	uint32_t	last_sequence_num = 0;
+	int		replay_retry_count = 0;
 
 	// wrap the start ptr if it points to the very end of the journal
 	if (jnl->jhdr->start == jnl->jhdr->size) {
@@ -1155,7 +1178,7 @@ restart_replay:
 		if (jnl->flags & JOURNAL_NEED_SWAP) {
 			// calculate the checksum based on the unswapped data
 			// because it is done byte-at-a-time.
-			orig_checksum = SWAP32(orig_checksum);
+			orig_checksum = (unsigned int)SWAP32(orig_checksum);
 			checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
 			swap_block_list_header(jnl, blhdr);
 		} else {
@@ -1336,11 +1359,25 @@ restart_replay:
 
 bad_txn_handling:
 		if (bad_blocks) {
+			/* Journal replay got an error before it found any valid
+			 * transactions; abort replay */
 			if (txn_start_offset == 0) {
 				printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
 				goto bad_replay;
 			}
 
+			/* Repeated errors during journal replay; abort replay */
+			if (replay_retry_count == 3) {
+				printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name);
+				goto bad_replay;
+			}
+			replay_retry_count++;
+
+			/* There was an error replaying the journal (possibly
+			 * EIO/ENXIO from the device).  So retry replaying all
+			 * the good transactions that we found before getting
+			 * the error.
+			 */
 			jnl->jhdr->start = orig_jnl_start;
 			jnl->jhdr->end = txn_start_offset;
 			check_past_jnl_end = 0;
@@ -1452,7 +1489,7 @@ bad_replay:
 
 #define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
-#define MAX_TRANSACTION_BUFFER_SIZE      (2048*1024)
+#define MAX_TRANSACTION_BUFFER_SIZE      (3072*1024)
 
 // XXXdbg - so I can change it in the debugger
 int def_tbuffer_size = 0;
@@ -1471,14 +1508,14 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
 	// there is in the machine.
 	//
 	if (def_tbuffer_size == 0) {
-		if (mem_size < (256*1024*1024)) {
+		if (max_mem < (256*1024*1024)) {
 			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
-		} else if (mem_size < (512*1024*1024)) {
+		} else if (max_mem < (512*1024*1024)) {
 			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
-		} else if (mem_size < (1024*1024*1024)) {
+		} else if (max_mem < (1024*1024*1024)) {
 			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
 		} else {
-			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (mem_size / (256*1024*1024));
+			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (max_mem / (256*1024*1024));
 		}
 	}
 
@@ -1516,8 +1553,6 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
 	}
 }
 
-
-
 static void
 get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context)
 {
@@ -1530,9 +1565,10 @@ get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_con
 
 	if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
 		if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
-			const char *name = vnode_name(devvp);
+			const char *name = vnode_getname_printable(devvp);
 			jnl->flags |= JOURNAL_DO_FUA_WRITES;
-			printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features);
+			printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features);
+			vnode_putname_printable(name);
 		}
 		if (features & DK_FEATURE_UNMAP) {
 			jnl->flags |= JOURNAL_USE_UNMAP;
@@ -1600,23 +1636,6 @@ get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_con
 }
 
 
-static const char *
-get_jdev_name(struct vnode *jvp)
-{
-	const char *jdev_name;
-
-	jdev_name = vnode_name(jvp);
-	if (jdev_name == NULL) {
-		jdev_name = vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0);
-	} else {
-		// this just bumps the refcount on the name so we have our own copy
-		jdev_name = vfs_addname(jdev_name, strlen(jdev_name), 0, 0);
-	}
-
-	return jdev_name;
-}
-
-
 journal *
 journal_create(struct vnode *jvp,
 	       off_t         offset,
@@ -1626,7 +1645,8 @@ journal_create(struct vnode *jvp,
 	       int32_t       flags,
 	       int32_t       tbuffer_size,
 	       void        (*flush)(void *arg),
-	       void         *arg)
+	       void         *arg,
+	       struct mount *fsmount)
 {
 	journal		*jnl;
 	uint32_t	phys_blksz, new_txn_base;
@@ -1642,36 +1662,36 @@ journal_create(struct vnode *jvp,
 	context.vc_thread = current_thread();
 	context.vc_ucred = FSCRED;
 
-	jdev_name = get_jdev_name(jvp);
+	jdev_name = vnode_getname_printable(jvp);
 
 	/* Get the real physical block size. */
 	if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 	if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
-		printf("jnl: create: journal size %lld looks bogus.\n", journal_size);
-		return NULL;
+		printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size);
+		goto cleanup_jdev_name;
 	}
 
 	min_size = phys_blksz * (phys_blksz / sizeof(block_info));
 	/* Reject journals that are too small given the sector size of the device */
 	if (journal_size < min_size) {
-		printf("jnl: create: journal size (%lld) too small given sector size of (%u)\n",
-		       journal_size, phys_blksz);
-		return NULL;
+		printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n",
+		       jdev_name, journal_size, phys_blksz);
+		goto cleanup_jdev_name;
 	}
 
 	if (phys_blksz > min_fs_blksz) {
 		printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n",
 		       jdev_name, phys_blksz, min_fs_blksz);
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 	if ((journal_size % phys_blksz) != 0) {
 		printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
 		       jdev_name, journal_size, phys_blksz);
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 
@@ -1687,6 +1707,12 @@ journal_create(struct vnode *jvp,
 	jnl->jdev_name   = jdev_name;
 	lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
 
+	// Keep a pointer to the mount around for use in IO throttling.
+	jnl->fsmount = fsmount;
+	// XXX: This lock discipline looks correct based on dounmount(), but it
+	// doesn't seem to be documented anywhere.
+	mount_ref(fsmount, 0);
+
 	get_io_info(jvp, phys_blksz, jnl, &context);
 
 	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
@@ -1722,7 +1748,7 @@ journal_create(struct vnode *jvp,
 	    && jnl->jhdr->sequence_num != 0) {
 		new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff;
-		printf("jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base);
+		printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base);
 
 #if 0
 		int i;
@@ -1763,7 +1789,8 @@ journal_create(struct vnode *jvp,
 	lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
 	lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
 	lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
-	
+
+
 	jnl->flushing = FALSE;
 	jnl->asyncIO = FALSE;
 	jnl->flush_aborted = FALSE;
@@ -1776,19 +1803,20 @@ journal_create(struct vnode *jvp,
 		goto bad_write;
 	}
 
-	return jnl;
+	goto journal_create_complete;
 
 
 bad_write:
 	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
 bad_kmem_alloc:
-	if (jdev_name) {
-		vfs_removename(jdev_name);
-	}
 	jnl->jhdr = NULL;
 	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
-
-	return NULL;
+	mount_drop(fsmount, 0);
+cleanup_jdev_name:
+	vnode_putname_printable(jdev_name);
+	jnl = NULL;
+journal_create_complete:
+	return jnl;
 }
 
 
@@ -1801,7 +1829,8 @@ journal_open(struct vnode *jvp,
 	     int32_t       flags,
 	     int32_t       tbuffer_size,
 	     void        (*flush)(void *arg),
-	     void         *arg)
+	     void         *arg,
+	     struct mount *fsmount)
 {
 	journal		*jnl;
 	uint32_t	orig_blksz=0;
@@ -1809,39 +1838,39 @@ journal_open(struct vnode *jvp,
 	u_int32_t	min_size = 0;
 	int		orig_checksum, checksum;
 	struct vfs_context context;
-	const char	*jdev_name = get_jdev_name(jvp);
+	const char	*jdev_name = vnode_getname_printable(jvp);
 
 	context.vc_thread = current_thread();
 	context.vc_ucred = FSCRED;
 
 	/* Get the real physical block size. */
 	if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 	if (phys_blksz > min_fs_blksz) {
 		printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n",
 		       jdev_name, phys_blksz, min_fs_blksz);
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 	if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) {
-		printf("jnl: open: journal size %lld looks bogus.\n", journal_size);
-		return NULL;
+		printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size);
+		goto cleanup_jdev_name;
 	}
 
 	min_size = phys_blksz * (phys_blksz / sizeof(block_info));
 	/* Reject journals that are too small given the sector size of the device */
 	if (journal_size < min_size) {
-		printf("jnl: open: journal size (%lld) too small given sector size of (%u)\n",
-		       journal_size, phys_blksz);
-		return NULL;
+		printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n",
+		       jdev_name, journal_size, phys_blksz);
+		goto cleanup_jdev_name;
	}
 
 	if ((journal_size % phys_blksz) != 0) {
 		printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
 		       jdev_name, journal_size, phys_blksz);
-		return NULL;
+		goto cleanup_jdev_name;
 	}
 
 	MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
@@ -1856,6 +1885,12 @@ journal_open(struct vnode *jvp,
 	jnl->jdev_name   = jdev_name;
 	lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
 
+	/* We need a reference to the mount to later pass to the throttling code for
+	 * IO accounting.
+	 */
+	jnl->fsmount = fsmount;
+	mount_ref(fsmount, 0);
+
 	get_io_info(jvp, phys_blksz, jnl, &context);
 
 	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
@@ -1911,26 +1946,24 @@ journal_open(struct vnode *jvp,
 		jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
 	}
 
-	if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
-		/*
-		 * The volume has probably been resized (such that we had to adjust the
-		 * logical sector size), or copied to media with a different logical
-		 * sector size.
-		 *
-		 * Temporarily change the device's logical block size to match the
-		 * journal's header size.  This will allow us to replay the journal
-		 * safely.  If the replay succeeds, we will update the journal's header
-		 * size (later in this function).
-		 */
-
-		orig_blksz = phys_blksz;
-		phys_blksz = jnl->jhdr->jhdr_size;
-		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
+	if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
+		/*
+		 * The volume has probably been resized (such that we had to adjust the
+		 * logical sector size), or copied to media with a different logical
+		 * sector size.
+		 *
+		 * Temporarily change the device's logical block size to match the
+		 * journal's header size.  This will allow us to replay the journal
+		 * safely.  If the replay succeeds, we will update the journal's header
+		 * size (later in this function).
+		 */
+		orig_blksz = phys_blksz;
+		phys_blksz = jnl->jhdr->jhdr_size;
+		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
+		printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
+		       jdev_name, orig_blksz, phys_blksz);
+	}
 
-		printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
-		       jdev_name, orig_blksz, phys_blksz);
-	}
-
 	if (   jnl->jhdr->start <= 0
 	    || jnl->jhdr->start > jnl->jhdr->size
 	    || jnl->jhdr->start > 1024*1024*1024) {
@@ -1980,68 +2013,73 @@ journal_open(struct vnode *jvp,
 		printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
 		goto bad_journal;
 	}
-
-	/*
-	 * When we get here, we know that the journal is empty (jnl->jhdr->start ==
-	 * jnl->jhdr->end).  If the device's logical block size was different from
-	 * the journal's header size, then we can now restore the device's logical
-	 * block size and update the journal's header size to match.
-	 *
-	 * Note that we also adjust the journal's start and end so that they will
-	 * be aligned on the new block size.  We pick a new sequence number to
-	 * avoid any problems if a replay found previous transactions using the old
-	 * journal header size.  (See the comments in journal_create(), above.)
-	 */
-	if (orig_blksz != 0) {
-		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
-		phys_blksz = orig_blksz;
-		orig_blksz = 0;
-		printf("jnl: %s: open: restored block size to %u\n", jdev_name, phys_blksz);
-		jnl->jhdr->jhdr_size = phys_blksz;
-		jnl->jhdr->start = phys_blksz;
-		jnl->jhdr->end = phys_blksz;
-		jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
-					   (journal_size / phys_blksz) +
-					   (random() % 16384)) & 0x00ffffff;
+	/*
+	 * When we get here, we know that the journal is empty (jnl->jhdr->start ==
+	 * jnl->jhdr->end).  If the device's logical block size was different from
+	 * the journal's header size, then we can now restore the device's logical
+	 * block size and update the journal's header size to match.
+	 *
+	 * Note that we also adjust the journal's start and end so that they will
+	 * be aligned on the new block size.  We pick a new sequence number to
+	 * avoid any problems if a replay found previous transactions using the old
+	 * journal header size.  (See the comments in journal_create(), above.)
+	 */
 
-		if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
-			printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
+	if (orig_blksz != 0) {
+		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
+		phys_blksz = orig_blksz;
+
+		orig_blksz = 0;
+
+		jnl->jhdr->jhdr_size = phys_blksz;
+		jnl->jhdr->start = phys_blksz;
+		jnl->jhdr->end = phys_blksz;
+		jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
+					   (journal_size / phys_blksz) +
+					   (random() % 16384)) & 0x00ffffff;
+
+		if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
+			printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
+			goto bad_journal;
+		}
+	}
+
+	// make sure this is in sync!
+	jnl->active_start = jnl->jhdr->start;
+	jnl->sequence_num = jnl->jhdr->sequence_num;
+
+	// set this now, after we've replayed the journal
+	size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
+
+	// TODO: Does this need to change if the device's logical block size changed?
+	if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
+		printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
+		       jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
 		goto bad_journal;
 	}
-	}
-
-	// make sure this is in sync!
-	jnl->active_start = jnl->jhdr->start;
-	jnl->sequence_num = jnl->jhdr->sequence_num;
-
-	// set this now, after we've replayed the journal
-	size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
-
-	// TODO: Does this need to change if the device's logical block size changed?
-	if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
-		printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
-		       jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
-		goto bad_journal;
-	}
-
-	lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
-
-	return jnl;
-
-bad_journal:
-	if (orig_blksz != 0) {
-		phys_blksz = orig_blksz;
-		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
-		printf("jnl: %s: open: restored block size to %u after error\n", jdev_name, orig_blksz);
-	}
-	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
-bad_kmem_alloc:
-	if (jdev_name) {
-		vfs_removename(jdev_name);
-	}
-	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
-	return NULL;
+
+	lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
+	lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
+	lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
+
+	goto journal_open_complete;
+
+bad_journal:
+	if (orig_blksz != 0) {
+		phys_blksz = orig_blksz;
+		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
+		printf("jnl: %s: open: restored block size after error\n", jdev_name);
	}
+	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
+bad_kmem_alloc:
+	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
+	mount_drop(fsmount, 0);
+cleanup_jdev_name:
+	vnode_putname_printable(jdev_name);
+	jnl = NULL;
+journal_open_complete:
+	return jnl;
 }
 
 
@@ -2057,7 +2095,7 @@ journal_is_clean(struct vnode *jvp,
 	int	ret;
 	int	orig_checksum, checksum;
 	struct vfs_context context;
-	const char *jdev_name = get_jdev_name(jvp);
+	const char *jdev_name = vnode_getname_printable(jvp);
 
 	context.vc_thread = current_thread();
 	context.vc_ucred = FSCRED;
@@ -2065,31 +2103,36 @@ journal_is_clean(struct vnode *jvp,
 	/* Get the real physical block size. */
 	if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
 		printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
-		return EINVAL;
+		ret = EINVAL;
+		goto cleanup_jdev_name;
 	}
 
 	if (phys_blksz > (uint32_t)min_fs_block_size) {
 		printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
 		       jdev_name, phys_blksz, min_fs_block_size);
-		return EINVAL;
+		ret = EINVAL;
+		goto cleanup_jdev_name;
 	}
 
 	if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
-		printf("jnl: is_clean: journal size %lld looks bogus.\n", journal_size);
-		return EINVAL;
+		printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size);
+		ret = EINVAL;
+		goto cleanup_jdev_name;
 	}
 
 	if ((journal_size % phys_blksz) != 0) {
 		printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
 		       jdev_name, journal_size, phys_blksz);
-		return EINVAL;
+		ret = EINVAL;
+		goto cleanup_jdev_name;
 	}
 
 	memset(&jnl, 0, sizeof(jnl));
 
 	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
 		printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
-		return ENOMEM;
+		ret = ENOMEM;
+		goto cleanup_jdev_name;
 	}
 	jnl.header_buf_size = phys_blksz;
 
@@ -2150,12 +2193,9 @@ journal_is_clean(struct vnode *jvp,
 
 get_out:
 	kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
-	if (jdev_name) {
-		vfs_removename(jdev_name);
-	}
-
-	return ret;
-
+cleanup_jdev_name:
+	vnode_putname_printable(jdev_name);
+	return ret;
 }
 
 
@@ -2173,7 +2213,7 @@ journal_close(journal *jnl)
 	jnl->flags |= JOURNAL_CLOSE_PENDING;
 
 	if (jnl->owner != current_thread()) {
-		lock_journal(jnl);
+		journal_lock(jnl);
 	}
 
 	wait_condition(jnl, &jnl->flushing, "journal_close");
@@ -2251,16 +2291,23 @@ journal_close(journal *jnl)
 			}
 		}
 	}
+	wait_condition(jnl, &jnl->asyncIO, "journal_close");
 
 	free_old_stuff(jnl);
 
 	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
 	jnl->jhdr = (void *)0xbeefbabe;
 
-	if (jnl->jdev_name) {
-		vfs_removename(jnl->jdev_name);
-	}
+	// Release reference on the mount
+	if (jnl->fsmount)
+		mount_drop(jnl->fsmount, 0);
+
+	vnode_putname_printable(jnl->jdev_name);
 
+	journal_unlock(jnl);
+	lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group);
+	lck_mtx_destroy(&jnl->jlock, jnl_mutex_group);
+	lck_mtx_destroy(&jnl->flock, jnl_mutex_group);
 	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
 }
 
@@ -2351,7 +2398,7 @@ check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write
 
 			lcl_counter = 0;
 			while (jnl->old_start[i] & 0x8000000000000000LL) {
-				if (lcl_counter++ > 1000) {
+				if (lcl_counter++ > 10000) {
 					panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n", jnl->old_start[i], jnl);
 				}
@@ -2431,7 +2478,19 @@ static errno_t
 journal_allocate_transaction(journal *jnl)
 {
 	transaction *tr;
+	boolean_t was_vm_privileged;
 
+	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+		/*
+		 * the disk driver can allocate memory on this path...
+		 * if we block waiting for memory, and there is enough pressure to
+		 * cause us to try and create a new swap file, we may end up deadlocking
+		 * due to waiting for the journal on the swap file creation path...
+		 * by making ourselves vm_privileged, we give ourselves the best chance
+		 * of not blocking
+		 */
+		was_vm_privileged = set_vm_privilege(TRUE);
+	}
 	MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
 	memset(tr, 0, sizeof(transaction));
 
@@ -2442,6 +2501,8 @@ journal_allocate_transaction(journal *jnl)
 		jnl->active_tr = NULL;
 		return ENOMEM;
 	}
+	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+		set_vm_privilege(FALSE);
 
 	// journal replay code checksum check depends on this.
 	memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
@@ -2484,14 +2545,14 @@ journal_start_transaction(journal *jnl)
 		jnl->nested_count++;
 		return 0;
 	}
-	lock_journal(jnl);
 
-	if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
+	journal_lock(jnl);
+
+	if (jnl->nested_count != 0 || jnl->active_tr != NULL) {
 		panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
 		      jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
 	}
 
-	jnl->owner = current_thread();
 	jnl->nested_count = 1;
 
 #if JOE
@@ -2529,9 +2590,8 @@ journal_start_transaction(journal *jnl)
 	return 0;
 
 bad_start:
-	jnl->owner = NULL;
 	jnl->nested_count = 0;
-	unlock_journal(jnl);
+	journal_unlock(jnl);
 
 	return ret;
 }
@@ -2829,6 +2889,8 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, vo
 		blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp));
 		blhdr->binfo[i].u.bp = bp;
 
+		KERNEL_DEBUG_CONSTANT(0x3018004, VM_KERNEL_ADDRPERM(vp), blhdr->binfo[i].bnum, bsize, 0, 0);
+
 		if (func) {
 			void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL;
 
@@ -2922,7 +2984,6 @@ journal_kill_block(journal *jnl, struct buf *bp)
 
 	return 0;
 }
-
 
 /*
 ;________________________________________________________________________________
 ;
@@ -2976,16 +3037,31 @@ journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg)
 ;________________________________________________________________________________
 */
 static int
-trim_realloc(struct jnl_trim_list *trim)
+trim_realloc(journal *jnl, struct jnl_trim_list *trim)
 {
 	void *new_extents;
 	uint32_t new_allocated_count;
+	boolean_t was_vm_privileged;
 
 	if (jnl_kdebug)
-		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, trim, 0, trim->allocated_count, trim->extent_count, 0);
+		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, VM_KERNEL_ADDRPERM(trim), 0, trim->allocated_count, trim->extent_count, 0);
 
 	new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS;
+
+	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+		/*
+		 * if we block waiting for memory, and there is enough pressure to
+		 * cause us to try and create a new swap file, we may end up deadlocking
+		 * due to waiting for the journal on the swap file creation path...
+		 * by making ourselves vm_privileged, we give ourselves the best chance
+		 * of not blocking
+		 */
+		was_vm_privileged = set_vm_privilege(TRUE);
+	}
 	new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t));
+	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+		set_vm_privilege(FALSE);
+
 	if (new_extents == NULL) {
 		printf("jnl: trim_realloc: unable to grow extent list!\n");
 		/*
@@ -3016,44 +3092,53 @@ trim_realloc(struct jnl_trim_list *trim)
 	return 0;
 }
 
-
 /*
-;________________________________________________________________________________
-;
-; Routine:	trim_search_extent
-;
-; Function:	Search the given extent list to see if any of its extents
-;		overlap the given extent.
-;
-; Input Arguments:
-;	trim		- The trim list to be searched.
-;	offset		- The first byte of the range to be searched for.
-;	length		- The number of bytes of the extent being searched for.
-;
-; Output:
-;	(result)	- TRUE if one or more extents overlap, FALSE otherwise.
-;________________________________________________________________________________
-*/
+ ;________________________________________________________________________________
+ ;
+ ; Routine:	trim_search_extent
+ ;
+ ; Function:	Search the given extent list to see if any of its extents
+ ;		overlap the given extent.
+ ;
+ ; Input Arguments:
+ ;	trim		- The trim list to be searched.
+ ;	offset		- The first byte of the range to be searched for.
+ ;	length		- The number of bytes of the extent being searched for.
+ ;	overlap_start	- start of the overlapping extent
+ ;	overlap_len	- length of the overlapping extent
+ ;
+ ; Output:
+ ;	(result)	- TRUE if one or more extents overlap, FALSE otherwise.
+ ;________________________________________________________________________________
+ */
 static int
-trim_search_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
+trim_search_extent(struct jnl_trim_list *trim, uint64_t offset,
+		   uint64_t length, uint64_t *overlap_start, uint64_t *overlap_len)
 {
 	uint64_t end = offset + length;
 	uint32_t lower = 0;			/* Lowest index to search */
 	uint32_t upper = trim->extent_count;	/* Highest index to search + 1 */
 	uint32_t middle;
-	
+
 	/* A binary search over the extent list. */
 	while (lower < upper) {
 		middle = (lower + upper) / 2;
-		
+
 		if (trim->extents[middle].offset >= end)
 			upper = middle;
 		else if (trim->extents[middle].offset + trim->extents[middle].length <= offset)
 			lower = middle + 1;
-		else
+		else {
+			if (overlap_start) {
+				*overlap_start = trim->extents[middle].offset;
+			}
+			if (overlap_len) {
+				*overlap_len = trim->extents[middle].length;
+			}
 			return TRUE;
+		}
 	}
-	
+
 	return FALSE;
 }
@@ -3092,7 +3177,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 	dk_extent_t *extent;
 	uint32_t insert_index;
 	uint32_t replace_count;
-	
+
 	CHECK_JOURNAL(jnl);
 
 	/* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set?  I think so... */
@@ -3104,7 +3189,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 	CHECK_TRANSACTION(tr);
 
 	if (jnl_kdebug)
-		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
+		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
 
 	if (jnl->owner != current_thread()) {
 		panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
@@ -3112,9 +3197,9 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 	}
 
 	free_old_stuff(jnl);
-	
+
 	end = offset + length;
-	
+
 	/*
 	 * Find the range of existing extents that can be combined with the
 	 * input extent.  We start by counting the number of extents that end
@@ -3132,7 +3217,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 		++replace_count;
 		++extent;
 	}
-	
+
 	/*
	 * If none of the existing extents can be combined with the input extent,
	 * then just insert it in the list (before item number insert_index).
@@ -3140,7 +3225,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 	if (replace_count == 0) {
 		/* If the list was already full, we need to grow it. */
 		if (tr->trim.extent_count == tr->trim.allocated_count) {
-			if (trim_realloc(&tr->trim) != 0) {
+			if (trim_realloc(jnl, &tr->trim) != 0) {
 				printf("jnl: trim_add_extent: out of memory!");
 				if (jnl_kdebug)
 					KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0);
@@ -3198,6 +3283,92 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 	return 0;
 }
 
+/*
+ * journal_trim_extent_overlap
+ *
+ * Return 1 if there are any pending TRIMs that overlap with the given offset and length
+ * Return 0 otherwise.
+ */
+
+int journal_trim_extent_overlap (journal *jnl, uint64_t offset, uint64_t length, uint64_t *end) {
+	transaction *tr = NULL;
+	int overlap = 0;
+
+	uint64_t overlap_start;
+	uint64_t overlap_len;
+	tr = jnl->active_tr;
+	CHECK_TRANSACTION(tr);
+
+	/*
+	 * There are two lists that need to be examined for potential overlaps:
+	 *
+	 * The first is the current transaction. Since this function requires that
+	 * a transaction be active when this is called, this is the "active_tr"
+	 * pointer in the journal struct.  This has a trimlist pointer which needs
+	 * to be searched.
+	 */
+	overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len);
+	if (overlap == 0) {
+		/*
+		 * The second is the async trim list, which is only done if the current
+		 * transaction group (active transaction) did not overlap with our target
+		 * extent. This async trim list is the set of all previously
+		 * committed transaction groups whose I/Os are now in-flight. We need to hold the
+		 * trim lock in order to search this list. If we grab the list before the
+		 * TRIM has completed, then we will compare it. If it is grabbed AFTER the
+		 * TRIM has completed, then the pointer will be zeroed out and we won't have
+		 * to check anything.
+		 */
+		lck_rw_lock_shared (&jnl->trim_lock);
+		if (jnl->async_trim != NULL) {
+			overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len);
+		}
+		lck_rw_unlock_shared (&jnl->trim_lock);
+	}
+
+	if (overlap) {
+		/* compute the end (min) of the overlapping range */
+		if ( (overlap_start + overlap_len) < (offset + length)) {
+			*end = (overlap_start + overlap_len);
+		}
+		else {
+			*end = (offset + length);
+		}
+	}
+
+
+	return overlap;
+}
+
+/*
+ * journal_request_immediate_flush
+ *
+ * FS requests that the journal flush immediately upon the
+ * active transaction's completion.
+ *
+ * Returns 0 if operation succeeds
+ * Returns EPERM if we failed to leave hint
+ */
+int
+journal_request_immediate_flush (journal *jnl) {
+
+	transaction *tr = NULL;
+	/*
+	 * Is a transaction still in process? You must do
+	 * this while there are txns open
+	 */
+	tr = jnl->active_tr;
+	if (tr != NULL) {
+		CHECK_TRANSACTION(tr);
+		tr->flush_on_completion = TRUE;
+	}
+	else {
+		return EPERM;
+	}
+	return 0;
+}
+
+
 
 /*
 ;________________________________________________________________________________
@@ -3222,7 +3393,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 ;________________________________________________________________________________
 */
 static int
-trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
+trim_remove_extent(journal *jnl, struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
 {
 	u_int64_t	end;
 	dk_extent_t	*extent;
@@ -3269,7 +3440,7 @@ trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
 	if (keep_before > keep_after) {
 		/* If the list was already full, we need to grow it. */
 		if (trim->extent_count == trim->allocated_count) {
-			if (trim_realloc(trim) != 0) {
+			if (trim_realloc(jnl, trim) != 0) {
 				printf("jnl: trim_remove_extent: out of memory!");
 				return ENOMEM;
 			}
@@ -3331,24 +3502,23 @@ trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
 	return 0;
 }
 
-
 /*
-;________________________________________________________________________________
-;
-; Routine:	journal_trim_remove_extent
-;
-; Function:	Make note of a range of bytes, some of which may have previously
-;		been passed to journal_trim_add_extent, is now in use on the
-;		volume.  The given bytes will be not be trimmed as part of
-;		this transaction, or a pending trim of a transaction being
-;		asynchronously flushed.
-;
-; Input Arguments:
-;	jnl	- The journal for the volume containing the byte range.
-;	offset	- The first byte of the range to be trimmed.
-;	length	- The number of bytes of the extent being trimmed.
-;________________________________________________________________________________
-*/
+ ;________________________________________________________________________________
+ ;
+ ; Routine:	journal_trim_remove_extent
+ ;
+ ; Function:	Make note that a range of bytes, some of which may have previously
+ ;		been passed to journal_trim_add_extent, is now in use on the
+ ;		volume.  The given bytes will not be trimmed as part of
+ ;		this transaction, or a pending trim of a transaction being
+ ;		asynchronously flushed.
+ ;
+ ; Input Arguments:
+ ;	jnl	- The journal for the volume containing the byte range.
+ ;	offset	- The first byte of the range to be trimmed.
+ ;	length	- The number of bytes of the extent being trimmed.
+ ;________________________________________________________________________________
+ */
 __private_extern__ int
 journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 {
@@ -3366,7 +3536,7 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 	CHECK_TRANSACTION(tr);
 
 	if (jnl_kdebug)
-		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
+		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
 
 	if (jnl->owner != current_thread()) {
 		panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
@@ -3374,8 +3544,8 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 	}
 
 	free_old_stuff(jnl);
-	
-	error = trim_remove_extent(&tr->trim, offset, length);
+
+	error = trim_remove_extent(jnl, &tr->trim, offset, length);
 	if (error == 0) {
 		int found = FALSE;
 
@@ -3385,7 +3555,7 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 		 */
 		lck_rw_lock_shared(&jnl->trim_lock);
 		if (jnl->async_trim != NULL)
-			found = trim_search_extent(jnl->async_trim, offset, length);
+			found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL);
 		lck_rw_unlock_shared(&jnl->trim_lock);
 
 		if (found) {
@@ -3398,10 +3568,10 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
 			uint32_t async_extent_count = 0;
 
 			if (jnl_kdebug)
-				KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, jnl, offset, length, 0, 0);
+				KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, 0, 0);
 			lck_rw_lock_exclusive(&jnl->trim_lock);
 			if (jnl->async_trim != NULL) {
-				error = trim_remove_extent(jnl->async_trim, offset, length);
+				error = trim_remove_extent(jnl, jnl->async_trim, offset, length);
 				async_extent_count = jnl->async_trim->extent_count;
 			}
 			lck_rw_unlock_exclusive(&jnl->trim_lock);
@@ -3420,31 +3590,37 @@ static int
 journal_trim_flush(journal *jnl, transaction *tr)
 {
 	int errno = 0;
+	boolean_t was_vm_privileged;
 
 	if (jnl_kdebug)
-		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0);
+		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
 
+	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+		/*
+		 * the disk driver can allocate memory on this path...
+		 * if we block waiting for memory, and there is enough pressure to
+		 * cause us to try and create a new swap file, we may end up deadlocking
+		 * due to waiting for the journal on the swap file creation path...
+		 * by making ourselves vm_privileged, we give ourselves the best chance
+		 * of not blocking
+		 */
+		was_vm_privileged = set_vm_privilege(TRUE);
+	}
+
+	lck_rw_lock_shared(&jnl->trim_lock);
 	if (tr->trim.extent_count > 0) {
 		dk_unmap_t unmap;
 
 		bzero(&unmap, sizeof(unmap));
-		lck_rw_lock_shared(&jnl->trim_lock);
 		if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) {
 			unmap.extents = tr->trim.extents;
 			unmap.extentsCount = tr->trim.extent_count;
 			if (jnl_kdebug)
-				KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0);
+				KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
 			errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel());
 			if (jnl_kdebug)
 				KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0);
-			if (errno) {
-				printf("jnl: error %d from DKIOCUNMAP (extents=%lx, count=%u); disabling trim for %s\n",
-				       errno, (unsigned long) (unmap.extents), unmap.extentsCount,
-				       jnl->jdev_name);
-				jnl->flags &= ~JOURNAL_USE_UNMAP;
-			}
 		}
-		
+
 		/*
 		 * Call back into the file system to tell them that we have
 		 * trimmed some extents and that they can now be reused.
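
A minimal sketch of how a filesystem might use the two entry points added above, journal_trim_extent_overlap() and journal_request_immediate_flush(); both require an open transaction. fs_reuse_blocks() is a hypothetical caller name, not part of this patch, and only the two journal_* calls are the real API:

/*
 * Hypothetical caller: decide whether blocks about to be reused still
 * have a TRIM pending against them, and if so ask the journal to flush
 * as soon as the active transaction completes.
 */
static int
fs_reuse_blocks(journal *jnl, uint64_t offset, uint64_t length)
{
	uint64_t overlap_end = 0;

	/* Searches the active transaction's trim list, then (under the
	 * trim lock) the async trim list of in-flight transactions. */
	if (journal_trim_extent_overlap(jnl, offset, length, &overlap_end)) {
		/* Returns EPERM if no transaction is open to carry the hint. */
		return journal_request_immediate_flush(jnl);
	}
	return 0;
}
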
@@ -3456,10 +3632,11 @@ journal_trim_flush(journal *jnl, transaction *tr)
 		 */
 		if (jnl->trim_callback)
 			jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents);
-		
-		lck_rw_unlock_shared(&jnl->trim_lock);
 	}
+	lck_rw_unlock_shared(&jnl->trim_lock);
 
+	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+		set_vm_privilege(FALSE);
 	/*
 	 * If the transaction we're flushing was the async transaction, then
 	 * tell the current transaction that there is no pending trim
@@ -3475,6 +3652,11 @@ journal_trim_flush(journal *jnl, transaction *tr)
 		jnl->async_trim = NULL;
 	lck_rw_unlock_exclusive(&jnl->trim_lock);
 
+	/*
+	 * By the time we get here, no other thread can discover the address
+	 * of "tr", so it is safe for us to manipulate tr->trim without
+	 * holding any locks.
+	 */
 	if (tr->trim.extents) {
 		kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
 		tr->trim.allocated_count = 0;
@@ -3488,7 +3670,6 @@ journal_trim_flush(journal *jnl, transaction *tr)
 	return errno;
 }
 
-
 static int
 journal_binfo_cmp(const void *a, const void *b)
 {
@@ -3564,7 +3745,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
 		jnl->cur_tr = tr;
 		goto done;
 	}
-	
+
 	// if our transaction buffer isn't very full, just hang
 	// on to it and don't actually flush anything.  this is
 	// what is known as "group commit".  we will flush the
@@ -3607,7 +3788,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
 		KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
 		goto done;
 	}
-	
+
 	/*
 	 * Store a pointer to this transaction's trim list so that
 	 * future transactions can find it.
@@ -3634,7 +3815,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
 	 * of the journal flush, 'saved_sequence_num' remains stable
 	 */
 	jnl->saved_sequence_num = jnl->sequence_num;
-	
+
 	/*
 	 * if we're here we're going to flush the transaction buffer to disk.
 	 * 'check_free_space' will not return until there is enough free
@@ -3793,8 +3974,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
 		must_wait = TRUE;
 
 	if (drop_lock_early == TRUE) {
-		jnl->owner = NULL;
-		unlock_journal(jnl);
+		journal_unlock(jnl);
 		drop_lock = FALSE;
 	}
 	if (must_wait == TRUE)
@@ -3812,8 +3992,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
 	KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
 done:
 	if (drop_lock == TRUE) {
-		jnl->owner = NULL;
-		unlock_journal(jnl);
+		journal_unlock(jnl);
 	}
 	return (ret_val);
 }
@@ -3822,14 +4001,8 @@ done:
 static void
 finish_end_thread(transaction *tr)
 {
-#if !CONFIG_EMBEDDED
-	proc_apply_thread_selfdiskacc(IOPOL_PASSIVE);
-#else /* !CONFIG_EMBEDDED */
-	struct uthread	*ut;
-
-	ut = get_bsdthread_info(current_thread());
-	ut->uu_iopol_disk = IOPOL_PASSIVE;
-#endif /* !CONFIG_EMBEDDED */
+	proc_set_task_policy(current_task(), current_thread(),
+			     TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
 
 	finish_end_transaction(tr, NULL, NULL);
 
@@ -3840,14 +4013,8 @@ finish_end_thread(transaction *tr)
 static void
 write_header_thread(journal *jnl)
 {
-#if !CONFIG_EMBEDDED
-	proc_apply_thread_selfdiskacc(IOPOL_PASSIVE);
-#else /* !CONFIG_EMBEDDED */
-	struct uthread	*ut;
-
-	ut = get_bsdthread_info(current_thread());
-	ut->uu_iopol_disk = IOPOL_PASSIVE;
-#endif /* !CONFIG_EMBEDDED */
+	proc_set_task_policy(current_task(), current_thread(),
+			     TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
 
 	if (write_journal_header(jnl, 1, jnl->saved_sequence_num))
 		jnl->write_header_failed = TRUE;
@@ -3878,6 +4045,7 @@ finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callba
 	end  = jnl->jhdr->end;
 
 	for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
+		boolean_t was_vm_privileged;
 
 		amt = blhdr->bytes_used;
 
@@ -3886,9 +4054,22 @@ finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callba
 		blhdr->checksum = 0;
 		blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
 
+		if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+			/*
+			 * if we block waiting for memory, and there is enough pressure to
+			 * cause us to try and create a new swap file, we may end up deadlocking
+			 * due to waiting for the journal on the swap file creation path...
+			 * by making ourselves vm_privileged, we give ourselves the best chance
+			 * of not blocking
+			 */
+			was_vm_privileged = set_vm_privilege(TRUE);
+		}
 		if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) {
 			panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *));
 		}
+		if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+			set_vm_privilege(FALSE);
+
 		tbuffer_offset = jnl->jhdr->blhdr_size;
 
 		for (i = 1; i < blhdr->num_blocks; i++) {
@@ -4119,13 +4300,13 @@ bad_journal:
 		jnl->flush_aborted = TRUE;
 
 		unlock_condition(jnl, &jnl->flushing);
-		lock_journal(jnl);
+		journal_lock(jnl);
 		jnl->flags |= JOURNAL_INVALID;
 		jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
 		abort_transaction(jnl, tr);		// cleans up list of extents to be trimmed
-		unlock_journal(jnl);
+		journal_unlock(jnl);
 	} else
 		unlock_condition(jnl, &jnl->flushing);
@@ -4249,7 +4430,7 @@ abort_transaction(journal *jnl, transaction *tr)
 				 */
 				vnode_rele_ext(bp_vp, 0, 1);
 			} else {
-				printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n",
+				printf("jnl: %s: abort_tr: could not find block %lld vp %p!\n",
 				       jnl->jdev_name, blhdr->binfo[i].bnum, tbp);
 				if (bp) {
 					buf_brelse(bp);
@@ -4276,6 +4457,7 @@ abort_transaction(journal *jnl, transaction *tr)
 		jnl->async_trim = NULL;
 	lck_rw_unlock_exclusive(&jnl->trim_lock);
 
+
 	if (tr->trim.extents) {
 		kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
 	}
@@ -4328,8 +4510,7 @@ journal_end_transaction(journal *jnl)
 			abort_transaction(jnl, tr);
 		}
 
-		jnl->owner = NULL;
-		unlock_journal(jnl);
+		journal_unlock(jnl);
 
 		return EINVAL;
 	}
@@ -4343,7 +4524,19 @@ journal_end_transaction(journal *jnl)
 	// called from end_transaction().
 	//
 	jnl->active_tr = NULL;
-	ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
+
+	/* Examine the force-journal-flush state in the active txn */
+	if (tr->flush_on_completion == TRUE) {
+		/*
+		 * If the FS requested it, disallow group commit and force the
+		 * transaction out to disk immediately.
+		 */
+		ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE);
+	}
+	else {
+		/* in the common path we can simply use the double-buffered journal */
+		ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
+	}
 
 	return ret;
 }
@@ -4391,7 +4584,7 @@ journal_flush(journal *jnl, boolean_t wait_for_IO)
 	KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl, 0, 0, 0, 0);
 
 	if (jnl->owner != current_thread()) {
-		lock_journal(jnl);
+		journal_lock(jnl);
 		drop_lock = TRUE;
 	}
 
@@ -4415,7 +4608,7 @@ journal_flush(journal *jnl, boolean_t wait_for_IO)
 	} else  {
 		if (drop_lock == TRUE) {
-			unlock_journal(jnl);
+			journal_unlock(jnl);
 		}
 
 		/* Because of pipelined journal, the journal transactions
@@ -4520,7 +4713,8 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu
 {
 	int		ret;
 	transaction	*tr;
-	
+	size_t		i = 0;
+
 	/*
	 * Sanity check inputs, and adjust the size of the transaction buffer.
	 */
@@ -4565,7 +4759,23 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu
 		return ret;
 	}
 	wait_condition(jnl, &jnl->flushing, "end_transaction");
-	
+
+	/*
+	 * At this point, we have completely flushed the contents of the current
+	 * journal to disk (and have asynchronously written all of the txns to
+	 * their actual desired locations).  As a result, we can (and must) clear
+	 * out the old_start array.  If we do not, then if the last written transaction
+	 * started at the beginning of the journal (starting 1 block into the
+	 * journal file) it could confuse the buffer_flushed callback. This is
+	 * because we're about to reset the start/end pointers of the journal header
+	 * below.
+	 */
+	lock_oldstart(jnl);
+	for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) {
+		jnl->old_start[i] = 0;
+	}
+	unlock_oldstart(jnl);
+
 	/* Update the journal's offset and size in memory. */
 	jnl->jdev_offset = offset;
 	jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
@@ -4619,7 +4829,8 @@ journal_create(__unused struct vnode *jvp,
 	       __unused int32_t flags,
 	       __unused int32_t tbuffer_size,
 	       __unused void (*flush)(void *arg),
-	       __unused void *arg)
+	       __unused void *arg,
+	       __unused struct mount *fsmount)
 {
 	return NULL;
 }
@@ -4633,7 +4844,8 @@ journal_open(__unused struct vnode *jvp,
 	     __unused int32_t flags,
 	     __unused int32_t tbuffer_size,
 	     __unused void (*flush)(void *arg),
-	     __unused void *arg)
+	     __unused void *arg,
+	     __unused struct mount *fsmount)
 {
 	return NULL;
 }
@@ -4709,4 +4921,47 @@ journal_owner(__unused journal *jnl)
 {
 	return NULL;
 }
+
+void
+journal_lock(__unused journal *jnl)
+{
+	return;
+}
+
+void
+journal_unlock(__unused journal *jnl)
+{
+	return;
+}
+
+__private_extern__ int
+journal_trim_add_extent(__unused journal *jnl,
+			__unused uint64_t offset,
+			__unused uint64_t length)
+{
+	return 0;
+}
+
+int
+journal_request_immediate_flush(__unused journal *jnl)
+{
+	return 0;
+}
+
+__private_extern__ int
+journal_trim_remove_extent(__unused journal *jnl,
+			   __unused uint64_t offset,
+			   __unused uint64_t length)
+{
+	return 0;
+}
+
+int journal_trim_extent_overlap(__unused journal *jnl,
+				__unused uint64_t offset,
+				__unused uint64_t length,
+				__unused uint64_t *end)
+{
+	return 0;
+}
+
 #endif	// !JOURNALING
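
A few self-contained sketches of the mechanisms this patch touches. First, the bufattr marking added in do_journal_io(): journal I/O uses an iobuf that bypasses the buffer cache, so the patch tags the buffer as metadata through its embedded bufattr rather than through b_flags. Reduced to a standalone helper (the function name is hypothetical; in the patch the two statements are inlined):

/* Hypothetical helper; in the patch the two statements sit inline in
 * do_journal_io().  BA_META in the bufattr tells the layers below that
 * this is metadata I/O without overloading b_flags on an iobuf, which
 * would be misleading since this is not a cached metadata buffer. */
static void
jnl_mark_iobuf_metadata(buf_t bp)
{
	struct bufattr *bap = &bp->b_attr;

	bap->ba_flags |= BA_META;
}
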
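The set_vm_privilege() bracket added in journal_allocate_transaction(), trim_realloc(), journal_trim_flush(), and finish_end_transaction() is the same three-step pattern each time. A sketch of it factored into one place, assuming only the calls the patch itself uses (set_vm_privilege() returns the previous privilege state); the helper name is hypothetical:

/*
 * Hypothetical helper.  When the journal protects the volume that backs
 * swap (MNTK_SWAP_MOUNT), a blocking allocation could wait on swap-file
 * creation, which in turn waits on this journal; running vm-privileged
 * for the duration of the allocation gives the best chance of avoiding
 * that deadlock.
 */
static void *
jnl_kalloc_swapsafe(journal *jnl, size_t size)
{
	boolean_t was_vm_privileged = FALSE;
	void *p;

	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT)
		was_vm_privileged = set_vm_privilege(TRUE);

	p = kalloc(size);

	/* Only drop the privilege if this thread did not already hold it. */
	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
		set_vm_privilege(FALSE);

	return p;
}
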
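trim_search_extent() is a plain binary search over a sorted, non-overlapping extent list. The same predicate restated as a self-contained function, with types simplified from dk_extent_t, for reasoning about the algorithm outside the kernel:

#include <stdint.h>

typedef struct { uint64_t offset; uint64_t length; } extent_t;

/* Simplified restatement of trim_search_extent(): extents are sorted by
 * offset and non-overlapping; returns 1 if [offset, offset+length)
 * intersects any of them, 0 otherwise. */
static int
extent_list_overlaps(const extent_t *ext, uint32_t count,
		     uint64_t offset, uint64_t length)
{
	uint64_t end = offset + length;
	uint32_t lower = 0, upper = count;

	while (lower < upper) {
		uint32_t middle = (lower + upper) / 2;

		if (ext[middle].offset >= end)
			upper = middle;		/* candidate is entirely after the range */
		else if (ext[middle].offset + ext[middle].length <= offset)
			lower = middle + 1;	/* candidate is entirely before the range */
		else
			return 1;		/* partial overlap found */
	}
	return 0;
}
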
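The compatibility check retained in journal_open() (the "jhdr size and blhdr size are not compatible" failure) enforces that one block-list header never describes more blocks than the journal can hold. A worked example with assumed numbers; in particular, sizeof(block_info) is taken as 16 bytes for the arithmetic, which is an assumption and not stated in the diff:

/*
 * Worked example of the invariant, with assumed sizes:
 *
 *   jhdr_size = blhdr_size = 4096 bytes, journal size = 8 MiB
 *
 *   slots per block list : blhdr_size / sizeof(block_info) - 1
 *                        = 4096 / 16 - 1          = 255
 *   blocks in journal    : size / jhdr_size
 *                        = 8388608 / 4096         = 2048
 *
 * open succeeds because 255 <= 2048; a journal holding fewer blocks
 * than one full block list can reference would be rejected.
 */
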
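Finally, the interface change that threads through the whole patch: journal_create() and journal_open() now take a trailing struct mount so the journal can hold a mount_ref() for IO-throttling accounting, released in journal_close() and on every error path. A hedged mount-time sketch; the parameter order follows this file's journal_open(), and the fs_* names are hypothetical:

/*
 * Sketch only: everything except journal_open() and its parameters is a
 * hypothetical caller.  Note the new trailing fsmount argument.
 */
static journal *
fs_mount_journal(struct mount *mp, struct vnode *jvp, struct vnode *fsvp,
		 off_t jnl_offset, off_t jnl_size, size_t fs_blksz,
		 void (*flush_cb)(void *), void *cb_arg)
{
	journal *jnl;

	jnl = journal_open(jvp, jnl_offset, jnl_size, fsvp, fs_blksz,
			   0,		/* flags */
			   0,		/* tbuffer_size: 0 selects the default */
			   flush_cb, cb_arg,
			   mp);		/* new in this revision: mount_ref'd by the journal */
	if (jnl == NULL) {
		/* open failed; journal_open() already dropped its mount and name refs */
	}
	return jnl;
}
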