/*
- * Copyright (c) 2002-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2002-2014 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif
+
#ifndef CONFIG_HFS_TRIM
#define CONFIG_HFS_TRIM 0
#endif
+
#if JOURNALING
//
-// By default, we grow the list of extents to trim by one page at a time.
+// By default, we grow the list of extents to trim by 4K at a time.
// We'll opt to flush a transaction if it contains at least
// JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number
// of modified blocks is small).
//
enum {
- JOURNAL_DEFAULT_TRIM_BYTES = PAGE_SIZE,
+ JOURNAL_DEFAULT_TRIM_BYTES = 4096,
JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t),
JOURNAL_FLUSH_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16
};
unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS;
SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush");
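//
// For scale (assuming the usual <sys/disk.h> definition of dk_extent_t as a
// pair of uint64_t fields, i.e. 16 bytes per extent): a 4K list holds
// 4096/16 = 256 extents, so the flush threshold above works out to
// 256 * 15 / 16 = 240 trimmed extents per transaction.
//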
-
-/* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */
+/* XXX next prototype should be from <libsa/stdlib.h> but conflicts with libkern */
__private_extern__ void qsort(
void * array,
size_t nmembers,
static void abort_transaction(journal *jnl, transaction *tr);
static void dump_journal(journal *jnl);
-static __inline__ void lock_journal(journal *jnl);
-static __inline__ void unlock_journal(journal *jnl);
static __inline__ void lock_oldstart(journal *jnl);
static __inline__ void unlock_oldstart(journal *jnl);
static __inline__ void lock_flush(journal *jnl);
// we use it to checksum the journal header and the block list
// headers that are at the start of each transaction.
//
-static int
+static unsigned int
calc_checksum(char *ptr, int len)
{
- int i, cksum=0;
+ int i;
+ unsigned int cksum=0;
// this is a lame checksum but for now it'll do
for(i = 0; i < len; i++, ptr++) {
jnl_mutex_group = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
}
-static __inline__ void
-lock_journal(journal *jnl)
+__inline__ void
+journal_lock(journal *jnl)
{
lck_mtx_lock(&jnl->jlock);
+ if (jnl->owner) {
+ panic("jnl: owner is %p, expected NULL\n", jnl->owner);
+ }
+ jnl->owner = current_thread();
}
-static __inline__ void
-unlock_journal(journal *jnl)
+__inline__ void
+journal_unlock(journal *jnl)
{
+ jnl->owner = NULL;
lck_mtx_unlock(&jnl->jlock);
}
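
/*
 * Usage sketch (illustrative, not part of this change): the journal mutex now
 * doubles as the owner lock, so callers pair these entry points instead of
 * taking jnl->jlock directly:
 *
 *	journal_lock(jnl);	// acquires jlock, records current_thread() as owner
 *	... mutate journal state ...
 *	journal_unlock(jnl);	// clears owner, then drops jlock
 *
 * Tracking the owner lets paths such as journal_end_transaction() assert that
 * the calling thread actually holds the journal.
 */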
size_t io_sz = 0;
buf_t bp;
off_t max_iosize;
+ struct bufattr *bap;
if (*offset < 0 || *offset > jnl->jhdr->size) {
panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
}
+ /*
+ * As alluded to in the block comment at the top of the function, we use a "fake" iobuf
+ * here and issue directly to the disk device that the journal protects since we don't
+ * want this to enter the block cache. As a result, we lose the ability to mark it
+ * as a metadata buf_t for the layers below us that may care. If we were to
+ * simply set the B_META flag in b_flags, this may confuse things further
+ * since this is an iobuf, not a metadata buffer.
+ *
+ * To address this, we use the extended bufattr struct embedded in the bp.
+ * Explicitly mark the buf here as a metadata buffer in its bufattr flags.
+ */
+ bap = &bp->b_attr;
+ bap->ba_flags |= BA_META;
+
if (direction & JNL_READ)
buf_setflags(bp, B_READ);
else {
CHECK_TRANSACTION(tr);
jnl = tr->jnl;
- if (jnl->flags & JOURNAL_INVALID) {
- return;
- }
CHECK_JOURNAL(jnl);
// cleanup for this transaction
tr->total_bytes = 0xfbadc0de;
+ if (jnl->flags & JOURNAL_INVALID)
+ goto transaction_done;
+
//printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
// tr, tr->journal_start, tr->journal_end, jnl);
tr->next = jnl->tr_freeme;
jnl->tr_freeme = tr;
}
+transaction_done:
unlock_oldstart(jnl);
unlock_condition(jnl, &jnl->asyncIO);
static int
replay_journal(journal *jnl)
{
- int i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0;
+ int i, bad_blocks=0;
+ unsigned int orig_checksum, checksum, check_block_checksums = 0;
size_t ret;
size_t max_bsize = 0; /* protected by block_ptr */
block_list_header *blhdr;
struct bucket *co_buf;
int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
uint32_t last_sequence_num = 0;
+ int replay_retry_count = 0;
// wrap the start ptr if it points to the very end of the journal
if (jnl->jhdr->start == jnl->jhdr->size) {
if (jnl->flags & JOURNAL_NEED_SWAP) {
// calculate the checksum based on the unswapped data
// because it is done byte-at-a-time.
- orig_checksum = SWAP32(orig_checksum);
+ orig_checksum = (unsigned int)SWAP32(orig_checksum);
checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
swap_block_list_header(jnl, blhdr);
} else {
bad_txn_handling:
if (bad_blocks) {
+ /* Journal replay got an error before it found any valid
+ * transactions; abort replay */
if (txn_start_offset == 0) {
printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
goto bad_replay;
}
+ /* Repeated errors during journal replay; abort replay */
+ if (replay_retry_count == 3) {
+ printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name);
+ goto bad_replay;
+ }
+ replay_retry_count++;
+
+ /* There was an error replaying the journal (possibly
+ * EIO/ENXIO from the device). So retry replaying all
+ * the good transactions that we found before getting
+ * the error.
+ */
jnl->jhdr->start = orig_jnl_start;
jnl->jhdr->end = txn_start_offset;
check_past_jnl_end = 0;
#define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024)
-#define MAX_TRANSACTION_BUFFER_SIZE (2048*1024)
+#define MAX_TRANSACTION_BUFFER_SIZE (3072*1024)
// XXXdbg - so I can change it in the debugger
int def_tbuffer_size = 0;
// there is in the machine.
//
if (def_tbuffer_size == 0) {
- if (mem_size < (256*1024*1024)) {
+ if (max_mem < (256*1024*1024)) {
def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
- } else if (mem_size < (512*1024*1024)) {
+ } else if (max_mem < (512*1024*1024)) {
def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
- } else if (mem_size < (1024*1024*1024)) {
+ } else if (max_mem < (1024*1024*1024)) {
def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
} else {
- def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (mem_size / (256*1024*1024));
+ def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (max_mem / (256*1024*1024));
}
}
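// Worked example (comment only): with 512 MB of RAM this picks 256 KB; at
// 2 GB it picks 128 KB * (2048/256) = 1 MB. Very large machines would
// exceed MAX_TRANSACTION_BUFFER_SIZE, which is presumably why the result
// is clamped later in this function.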
}
}
-
-
static void
get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context)
{
if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
- const char *name = vnode_name(devvp);
+ const char *name = vnode_getname_printable(devvp);
jnl->flags |= JOURNAL_DO_FUA_WRITES;
- printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features);
+ printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features);
+ vnode_putname_printable(name);
}
if (features & DK_FEATURE_UNMAP) {
jnl->flags |= JOURNAL_USE_UNMAP;
}
-static const char *
-get_jdev_name(struct vnode *jvp)
-{
- const char *jdev_name;
-
- jdev_name = vnode_name(jvp);
- if (jdev_name == NULL) {
- jdev_name = vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0);
- } else {
- // this just bumps the refcount on the name so we have our own copy
- jdev_name = vfs_addname(jdev_name, strlen(jdev_name), 0, 0);
- }
-
- return jdev_name;
-}
-
-
journal *
journal_create(struct vnode *jvp,
off_t offset,
int32_t flags,
int32_t tbuffer_size,
void (*flush)(void *arg),
- void *arg)
+ void *arg,
+ struct mount *fsmount)
{
journal *jnl;
uint32_t phys_blksz, new_txn_base;
context.vc_thread = current_thread();
context.vc_ucred = FSCRED;
- jdev_name = get_jdev_name(jvp);
+ jdev_name = vnode_getname_printable(jvp);
/* Get the real physical block size. */
if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
- return NULL;
+ goto cleanup_jdev_name;
}
if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
- printf("jnl: create: journal size %lld looks bogus.\n", journal_size);
- return NULL;
+ printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size);
+ goto cleanup_jdev_name;
}
min_size = phys_blksz * (phys_blksz / sizeof(block_info));
/* Reject journals that are too small given the sector size of the device */
if (journal_size < min_size) {
- printf("jnl: create: journal size (%lld) too small given sector size of (%u)\n",
- journal_size, phys_blksz);
- return NULL;
+ printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n",
+ jdev_name, journal_size, phys_blksz);
+ goto cleanup_jdev_name;
}
if (phys_blksz > min_fs_blksz) {
printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n",
jdev_name, phys_blksz, min_fs_blksz);
- return NULL;
+ goto cleanup_jdev_name;
}
if ((journal_size % phys_blksz) != 0) {
printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
jdev_name, journal_size, phys_blksz);
- return NULL;
+ goto cleanup_jdev_name;
}
jnl->jdev_name = jdev_name;
lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
+ // Keep a pointer to the mount around for use in IO throttling.
+ jnl->fsmount = fsmount;
+ // XXX: This lock discipline looks correct based on dounmount(), but it
+ // doesn't seem to be documented anywhere.
+ mount_ref(fsmount, 0);
+
get_io_info(jvp, phys_blksz, jnl, &context);
if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
&& jnl->jhdr->sequence_num != 0) {
new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff;
- printf("jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base);
+ printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base);
#if 0
int i;
lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
-
+
+
jnl->flushing = FALSE;
jnl->asyncIO = FALSE;
jnl->flush_aborted = FALSE;
goto bad_write;
}
- return jnl;
+ goto journal_create_complete;
bad_write:
kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
bad_kmem_alloc:
- if (jdev_name) {
- vfs_removename(jdev_name);
- }
jnl->jhdr = NULL;
FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
-
- return NULL;
+ mount_drop(fsmount, 0);
+cleanup_jdev_name:
+ vnode_putname_printable(jdev_name);
+ jnl = NULL;
+journal_create_complete:
+ return jnl;
}
int32_t flags,
int32_t tbuffer_size,
void (*flush)(void *arg),
- void *arg)
+ void *arg,
+ struct mount *fsmount)
{
journal *jnl;
uint32_t orig_blksz=0;
u_int32_t min_size = 0;
int orig_checksum, checksum;
struct vfs_context context;
- const char *jdev_name = get_jdev_name(jvp);
+ const char *jdev_name = vnode_getname_printable(jvp);
context.vc_thread = current_thread();
context.vc_ucred = FSCRED;
/* Get the real physical block size. */
if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
- return NULL;
+ goto cleanup_jdev_name;
}
if (phys_blksz > min_fs_blksz) {
printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n",
jdev_name, phys_blksz, min_fs_blksz);
- return NULL;
+ goto cleanup_jdev_name;
}
if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) {
- printf("jnl: open: journal size %lld looks bogus.\n", journal_size);
- return NULL;
+ printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size);
+ goto cleanup_jdev_name;
}
min_size = phys_blksz * (phys_blksz / sizeof(block_info));
/* Reject journals that are too small given the sector size of the device */
if (journal_size < min_size) {
- printf("jnl: open: journal size (%lld) too small given sector size of (%u)\n",
- journal_size, phys_blksz);
- return NULL;
+ printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n",
+ jdev_name, journal_size, phys_blksz);
+ goto cleanup_jdev_name;
}
if ((journal_size % phys_blksz) != 0) {
printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
jdev_name, journal_size, phys_blksz);
- return NULL;
+ goto cleanup_jdev_name;
}
MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
jnl->jdev_name = jdev_name;
lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
+ /* We need a reference to the mount to later pass to the throttling code for
+ * IO accounting.
+ */
+ jnl->fsmount = fsmount;
+ mount_ref(fsmount, 0);
+
get_io_info(jvp, phys_blksz, jnl, &context);
if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
}
- if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
- /*
- * The volume has probably been resized (such that we had to adjust the
- * logical sector size), or copied to media with a different logical
- * sector size.
- *
- * Temporarily change the device's logical block size to match the
- * journal's header size. This will allow us to replay the journal
- * safely. If the replay succeeds, we will update the journal's header
- * size (later in this function).
- */
-
- orig_blksz = phys_blksz;
- phys_blksz = jnl->jhdr->jhdr_size;
- VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
+ if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
+ /*
+ * The volume has probably been resized (such that we had to adjust the
+ * logical sector size), or copied to media with a different logical
+ * sector size.
+ *
+ * Temporarily change the device's logical block size to match the
+ * journal's header size. This will allow us to replay the journal
+ * safely. If the replay succeeds, we will update the journal's header
+ * size (later in this function).
+ */
+ orig_blksz = phys_blksz;
+ phys_blksz = jnl->jhdr->jhdr_size;
+ VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
+ printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
+ jdev_name, orig_blksz, phys_blksz);
+ }
- printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
- jdev_name, orig_blksz, phys_blksz);
- }
-
if ( jnl->jhdr->start <= 0
|| jnl->jhdr->start > jnl->jhdr->size
|| jnl->jhdr->start > 1024*1024*1024) {
printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
goto bad_journal;
}
-
- /*
- * When we get here, we know that the journal is empty (jnl->jhdr->start ==
- * jnl->jhdr->end). If the device's logical block size was different from
- * the journal's header size, then we can now restore the device's logical
- * block size and update the journal's header size to match.
- *
- * Note that we also adjust the journal's start and end so that they will
- * be aligned on the new block size. We pick a new sequence number to
- * avoid any problems if a replay found previous transactions using the old
- * journal header size. (See the comments in journal_create(), above.)
- */
- if (orig_blksz != 0) {
- VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
- phys_blksz = orig_blksz;
- orig_blksz = 0;
- printf("jnl: %s: open: restored block size to %u\n", jdev_name, phys_blksz);
- jnl->jhdr->jhdr_size = phys_blksz;
- jnl->jhdr->start = phys_blksz;
- jnl->jhdr->end = phys_blksz;
- jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
- (journal_size / phys_blksz) +
- (random() % 16384)) & 0x00ffffff;
+ /*
+ * When we get here, we know that the journal is empty (jnl->jhdr->start ==
+ * jnl->jhdr->end). If the device's logical block size was different from
+ * the journal's header size, then we can now restore the device's logical
+ * block size and update the journal's header size to match.
+ *
+ * Note that we also adjust the journal's start and end so that they will
+ * be aligned on the new block size. We pick a new sequence number to
+ * avoid any problems if a replay found previous transactions using the old
+ * journal header size. (See the comments in journal_create(), above.)
+ */
- if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
- printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
+ if (orig_blksz != 0) {
+ VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
+ phys_blksz = orig_blksz;
+
+ orig_blksz = 0;
+
+ jnl->jhdr->jhdr_size = phys_blksz;
+ jnl->jhdr->start = phys_blksz;
+ jnl->jhdr->end = phys_blksz;
+ jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
+ (journal_size / phys_blksz) +
+ (random() % 16384)) & 0x00ffffff;
+
+ if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
+ printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
+ goto bad_journal;
+ }
+ }
+
+ // make sure this is in sync!
+ jnl->active_start = jnl->jhdr->start;
+ jnl->sequence_num = jnl->jhdr->sequence_num;
+
+ // set this now, after we've replayed the journal
+ size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
+
+ // TODO: Does this need to change if the device's logical block size changed?
+ if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
+ printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
+ jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
goto bad_journal;
}
- }
-
- // make sure this is in sync!
- jnl->active_start = jnl->jhdr->start;
- jnl->sequence_num = jnl->jhdr->sequence_num;
-
- // set this now, after we've replayed the journal
- size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
-
- // TODO: Does this need to change if the device's logical block size changed?
- if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
- printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
- jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
- goto bad_journal;
- }
-
- lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
-
- return jnl;
-
- bad_journal:
- if (orig_blksz != 0) {
- phys_blksz = orig_blksz;
- VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
- printf("jnl: %s: open: restored block size to %u after error\n", jdev_name, orig_blksz);
- }
- kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
- bad_kmem_alloc:
- if (jdev_name) {
- vfs_removename(jdev_name);
- }
- FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
- return NULL;
+
+ lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
+ lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
+ lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
+
+ goto journal_open_complete;
+
+bad_journal:
+ if (orig_blksz != 0) {
+ phys_blksz = orig_blksz;
+ VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
+ printf("jnl: %s: open: restored block size after error\n", jdev_name);
+ }
+ kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
+bad_kmem_alloc:
+ FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
+ mount_drop(fsmount, 0);
+cleanup_jdev_name:
+ vnode_putname_printable(jdev_name);
+ jnl = NULL;
+journal_open_complete:
+ return jnl;
}
int ret;
int orig_checksum, checksum;
struct vfs_context context;
- const char *jdev_name = get_jdev_name(jvp);
+ const char *jdev_name = vnode_getname_printable(jvp);
context.vc_thread = current_thread();
context.vc_ucred = FSCRED;
/* Get the real physical block size. */
if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
- return EINVAL;
+ ret = EINVAL;
+ goto cleanup_jdev_name;
}
if (phys_blksz > (uint32_t)min_fs_block_size) {
printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
jdev_name, phys_blksz, min_fs_block_size);
- return EINVAL;
+ ret = EINVAL;
+ goto cleanup_jdev_name;
}
if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
- printf("jnl: is_clean: journal size %lld looks bogus.\n", journal_size);
- return EINVAL;
+ printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size);
+ ret = EINVAL;
+ goto cleanup_jdev_name;
}
if ((journal_size % phys_blksz) != 0) {
printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
jdev_name, journal_size, phys_blksz);
- return EINVAL;
+ ret = EINVAL;
+ goto cleanup_jdev_name;
}
memset(&jnl, 0, sizeof(jnl));
if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
- return ENOMEM;
+ ret = ENOMEM;
+ goto cleanup_jdev_name;
}
jnl.header_buf_size = phys_blksz;
get_out:
kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
- if (jdev_name) {
- vfs_removename(jdev_name);
- }
-
- return ret;
-
+cleanup_jdev_name:
+ vnode_putname_printable(jdev_name);
+ return ret;
}
jnl->flags |= JOURNAL_CLOSE_PENDING;
if (jnl->owner != current_thread()) {
- lock_journal(jnl);
+ journal_lock(jnl);
}
wait_condition(jnl, &jnl->flushing, "journal_close");
}
}
}
+ wait_condition(jnl, &jnl->asyncIO, "journal_close");
free_old_stuff(jnl);
kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
jnl->jhdr = (void *)0xbeefbabe;
- if (jnl->jdev_name) {
- vfs_removename(jnl->jdev_name);
- }
+ // Release reference on the mount
+ if (jnl->fsmount)
+ mount_drop(jnl->fsmount, 0);
+
+ vnode_putname_printable(jnl->jdev_name);
+ journal_unlock(jnl);
+ lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group);
+ lck_mtx_destroy(&jnl->jlock, jnl_mutex_group);
+ lck_mtx_destroy(&jnl->flock, jnl_mutex_group);
FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
}
lcl_counter = 0;
while (jnl->old_start[i] & 0x8000000000000000LL) {
- if (lcl_counter++ > 1000) {
+ if (lcl_counter++ > 10000) {
panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
jnl->old_start[i], jnl);
}
journal_allocate_transaction(journal *jnl)
{
transaction *tr;
+ boolean_t was_vm_privileged;
+ if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+ /*
+ * the disk driver can allocate memory on this path...
+ * if we block waiting for memory, and there is enough pressure to
+ * cause us to try and create a new swap file, we may end up deadlocking
+ * due to waiting for the journal on the swap file creation path...
+ * by making ourselves vm_privileged, we give ourselves the best chance
+ * of not blocking
+ */
+ was_vm_privileged = set_vm_privilege(TRUE);
+ }
MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
memset(tr, 0, sizeof(transaction));
jnl->active_tr = NULL;
return ENOMEM;
}
+ if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+ set_vm_privilege(FALSE);
// journal replay code checksum check depends on this.
memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
jnl->nested_count++;
return 0;
}
- lock_journal(jnl);
- if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
+ journal_lock(jnl);
+
+ if (jnl->nested_count != 0 || jnl->active_tr != NULL) {
panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
}
- jnl->owner = current_thread();
jnl->nested_count = 1;
#if JOE
return 0;
bad_start:
- jnl->owner = NULL;
jnl->nested_count = 0;
- unlock_journal(jnl);
+ journal_unlock(jnl);
return ret;
}
blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp));
blhdr->binfo[i].u.bp = bp;
+ KERNEL_DEBUG_CONSTANT(0x3018004, VM_KERNEL_ADDRPERM(vp), blhdr->binfo[i].bnum, bsize, 0, 0);
+
if (func) {
void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL;
return 0;
}
-
/*
;________________________________________________________________________________
;
;________________________________________________________________________________
*/
static int
-trim_realloc(struct jnl_trim_list *trim)
+trim_realloc(journal *jnl, struct jnl_trim_list *trim)
{
void *new_extents;
uint32_t new_allocated_count;
+ boolean_t was_vm_privileged;
if (jnl_kdebug)
- KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, trim, 0, trim->allocated_count, trim->extent_count, 0);
+ KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, VM_KERNEL_ADDRPERM(trim), 0, trim->allocated_count, trim->extent_count, 0);
new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS;
+
+ if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+ /*
+ * if we block waiting for memory, and there is enough pressure to
+ * cause us to try and create a new swap file, we may end up deadlocking
+ * due to waiting for the journal on the swap file creation path...
+ * by making ourselves vm_privileged, we give ourselves the best chance
+ * of not blocking
+ */
+ was_vm_privileged = set_vm_privilege(TRUE);
+ }
new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t));
+ if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+ set_vm_privilege(FALSE);
+
if (new_extents == NULL) {
printf("jnl: trim_realloc: unable to grow extent list!\n");
/*
return 0;
}
-
/*
-;________________________________________________________________________________
-;
-; Routine: trim_search_extent
-;
-; Function: Search the given extent list to see if any of its extents
-; overlap the given extent.
-;
-; Input Arguments:
-; trim - The trim list to be searched.
-; offset - The first byte of the range to be searched for.
-; length - The number of bytes of the extent being searched for.
-;
-; Output:
-; (result) - TRUE if one or more extents overlap, FALSE otherwise.
-;________________________________________________________________________________
-*/
+ ;________________________________________________________________________________
+ ;
+ ; Routine: trim_search_extent
+ ;
+ ; Function: Search the given extent list to see if any of its extents
+ ; overlap the given extent.
+ ;
+ ; Input Arguments:
+ ; trim - The trim list to be searched.
+ ; offset - The first byte of the range to be searched for.
+ ; length - The number of bytes of the extent being searched for.
+ ; overlap_start - start of the overlapping extent
+ ; overlap_len - length of the overlapping extent
+ ;
+ ; Output:
+ ; (result) - TRUE if one or more extents overlap, FALSE otherwise.
+ ;________________________________________________________________________________
+ */
static int
-trim_search_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
+trim_search_extent(struct jnl_trim_list *trim, uint64_t offset,
+ uint64_t length, uint64_t *overlap_start, uint64_t *overlap_len)
{
uint64_t end = offset + length;
uint32_t lower = 0; /* Lowest index to search */
uint32_t upper = trim->extent_count; /* Highest index to search + 1 */
uint32_t middle;
-
+
/* A binary search over the extent list. */
while (lower < upper) {
middle = (lower + upper) / 2;
-
+
if (trim->extents[middle].offset >= end)
upper = middle;
else if (trim->extents[middle].offset + trim->extents[middle].length <= offset)
lower = middle + 1;
- else
+ else {
+ if (overlap_start) {
+ *overlap_start = trim->extents[middle].offset;
+ }
+ if (overlap_len) {
+ *overlap_len = trim->extents[middle].length;
+ }
return TRUE;
+ }
}
-
+
return FALSE;
}
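
/*
 * Worked example (illustrative): the extents are kept sorted by offset and
 * non-overlapping, which is what makes the binary search sound. Given
 * extents (offset,length) = {(0,16), (32,16), (64,16)} and a query of
 * offset=40, length=8 (end=48): the middle probe (32,16) neither starts at
 * or beyond the query end nor ends at or before the query offset, so it
 * overlaps and (overlap_start, overlap_len) = (32, 16) is reported.
 */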
dk_extent_t *extent;
uint32_t insert_index;
uint32_t replace_count;
-
+
CHECK_JOURNAL(jnl);
/* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */
CHECK_TRANSACTION(tr);
if (jnl_kdebug)
- KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
+ KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
if (jnl->owner != current_thread()) {
panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
}
free_old_stuff(jnl);
-
+
end = offset + length;
-
+
/*
* Find the range of existing extents that can be combined with the
* input extent. We start by counting the number of extents that end
++replace_count;
++extent;
}
-
+
/*
* If none of the existing extents can be combined with the input extent,
* then just insert it in the list (before item number insert_index).
if (replace_count == 0) {
/* If the list was already full, we need to grow it. */
if (tr->trim.extent_count == tr->trim.allocated_count) {
- if (trim_realloc(&tr->trim) != 0) {
+ if (trim_realloc(jnl, &tr->trim) != 0) {
printf("jnl: trim_add_extent: out of memory!");
if (jnl_kdebug)
KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0);
return 0;
}
+/*
+ * journal_trim_extent_overlap
+ *
+ * Return 1 if there are any pending TRIMs that overlap with the given offset and length
+ * Return 0 otherwise.
+ */
+
+int
+journal_trim_extent_overlap(journal *jnl, uint64_t offset, uint64_t length, uint64_t *end)
+{
+ transaction *tr = NULL;
+ int overlap = 0;
+
+ uint64_t overlap_start;
+ uint64_t overlap_len;
+ tr = jnl->active_tr;
+ CHECK_TRANSACTION(tr);
+
+ /*
+ * There are two lists that need to be examined for potential overlaps:
+ *
+ * The first is the current transaction. Since this function requires that
+ * a transaction be active when this is called, this is the "active_tr"
+ * pointer in the journal struct. This has a trimlist pointer which needs
+ * to be searched.
+ */
+ overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len);
+ if (overlap == 0) {
+ /*
+ * The second is the async trim list, which is only done if the current
+ * transaction group (active transaction) did not overlap with our target
+ * extent. This async trim list is the set of all previously
+ * committed transaction groups whose I/Os are now in-flight. We need to hold the
+ * trim lock in order to search this list. If we grab the list before the
+ * TRIM I/O has completed, we search it; if we grab it after the TRIM has
+ * completed, the pointer will have been zeroed out and there is nothing
+ * to check.
+ */
+ lck_rw_lock_shared (&jnl->trim_lock);
+ if (jnl->async_trim != NULL) {
+ overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len);
+ }
+ lck_rw_unlock_shared (&jnl->trim_lock);
+ }
+
+ if (overlap) {
+ /* compute the end (min) of the overlapping range */
+ if ((overlap_start + overlap_len) < (offset + length)) {
+ *end = (overlap_start + overlap_len);
+ }
+ else {
+ *end = (offset + length);
+ }
+ }
+
+ return overlap;
+}
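+
+/*
+ * Hypothetical caller sketch (names invented for illustration): a file
+ * system about to reuse a just-freed range could check for a pending TRIM
+ * and flush the journal before touching the overlapping blocks:
+ *
+ *	uint64_t overlap_end;
+ *	if (journal_trim_extent_overlap(jnl, blk_offset, blk_len, &overlap_end)) {
+ *		// a pending TRIM covers part of [blk_offset, blk_offset+blk_len),
+ *		// ending at overlap_end; force the journal out before reuse
+ *		journal_request_immediate_flush(jnl);
+ *	}
+ */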
+
+/*
+ * journal_request_immediate_flush
+ *
+ * FS requests that the journal flush immediately upon the
+ * active transaction's completion.
+ *
+ * Returns 0 if operation succeeds
+ * Returns EPERM if we failed to leave hint
+ */
+int
+journal_request_immediate_flush(journal *jnl)
+{
+
+ transaction *tr = NULL;
+ /*
+ * Is a transaction still in progress? The hint can only
+ * be left while a transaction is open.
+ */
+ tr = jnl->active_tr;
+ if (tr != NULL) {
+ CHECK_TRANSACTION(tr);
+ tr->flush_on_completion = TRUE;
+ }
+ else {
+ return EPERM;
+ }
+ return 0;
+}
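+
+/*
+ * Assumed usage (sketch): the hint is left inside the active transaction,
+ * after which journal_end_transaction() takes the synchronous path instead
+ * of group commit:
+ *
+ *	journal_request_immediate_flush(jnl);	// while the txn is still open
+ *	journal_end_transaction(jnl);		// forced out to disk immediately
+ */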
+
+
/*
;________________________________________________________________________________
;________________________________________________________________________________
*/
static int
-trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
+trim_remove_extent(journal *jnl, struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
{
u_int64_t end;
dk_extent_t *extent;
if (keep_before > keep_after) {
/* If the list was already full, we need to grow it. */
if (trim->extent_count == trim->allocated_count) {
- if (trim_realloc(trim) != 0) {
+ if (trim_realloc(jnl, trim) != 0) {
printf("jnl: trim_remove_extent: out of memory!");
return ENOMEM;
}
return 0;
}
-
/*
-;________________________________________________________________________________
-;
-; Routine: journal_trim_remove_extent
-;
-; Function: Make note of a range of bytes, some of which may have previously
-; been passed to journal_trim_add_extent, is now in use on the
-; volume. The given bytes will be not be trimmed as part of
-; this transaction, or a pending trim of a transaction being
-; asynchronously flushed.
-;
-; Input Arguments:
-; jnl - The journal for the volume containing the byte range.
-; offset - The first byte of the range to be trimmed.
-; length - The number of bytes of the extent being trimmed.
-;________________________________________________________________________________
-*/
+ ;________________________________________________________________________________
+ ;
+ ; Routine: journal_trim_remove_extent
+ ;
+ ; Function: Make note that a range of bytes, some of which may have previously
+ ; been passed to journal_trim_add_extent, is now in use on the
+ ; volume. The given bytes will not be trimmed as part of
+ ; this transaction, or a pending trim of a transaction being
+ ; asynchronously flushed.
+ ;
+ ; Input Arguments:
+ ; jnl - The journal for the volume containing the byte range.
+ ; offset - The first byte of the range to be trimmed.
+ ; length - The number of bytes of the extent being trimmed.
+ ;________________________________________________________________________________
+ */
__private_extern__ int
journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
{
CHECK_TRANSACTION(tr);
if (jnl_kdebug)
- KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
+ KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
if (jnl->owner != current_thread()) {
panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
}
free_old_stuff(jnl);
-
- error = trim_remove_extent(&tr->trim, offset, length);
+
+ error = trim_remove_extent(jnl, &tr->trim, offset, length);
if (error == 0) {
int found = FALSE;
*/
lck_rw_lock_shared(&jnl->trim_lock);
if (jnl->async_trim != NULL)
- found = trim_search_extent(jnl->async_trim, offset, length);
+ found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL);
lck_rw_unlock_shared(&jnl->trim_lock);
if (found) {
uint32_t async_extent_count = 0;
if (jnl_kdebug)
- KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, jnl, offset, length, 0, 0);
+ KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, 0, 0);
lck_rw_lock_exclusive(&jnl->trim_lock);
if (jnl->async_trim != NULL) {
- error = trim_remove_extent(jnl->async_trim, offset, length);
+ error = trim_remove_extent(jnl, jnl->async_trim, offset, length);
async_extent_count = jnl->async_trim->extent_count;
}
lck_rw_unlock_exclusive(&jnl->trim_lock);
journal_trim_flush(journal *jnl, transaction *tr)
{
int errno = 0;
+ boolean_t was_vm_privileged;
if (jnl_kdebug)
- KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0);
+ KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
+ if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+ /*
+ * the disk driver can allocate memory on this path...
+ * if we block waiting for memory, and there is enough pressure to
+ * cause us to try and create a new swap file, we may end up deadlocking
+ * due to waiting for the journal on the swap file creation path...
+ * by making ourselves vm_privileged, we give ourselves the best chance
+ * of not blocking
+ */
+ was_vm_privileged = set_vm_privilege(TRUE);
+ }
+ lck_rw_lock_shared(&jnl->trim_lock);
if (tr->trim.extent_count > 0) {
dk_unmap_t unmap;
bzero(&unmap, sizeof(unmap));
- lck_rw_lock_shared(&jnl->trim_lock);
if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) {
unmap.extents = tr->trim.extents;
unmap.extentsCount = tr->trim.extent_count;
if (jnl_kdebug)
- KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0);
+ KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel());
if (jnl_kdebug)
KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0);
- if (errno) {
- printf("jnl: error %d from DKIOCUNMAP (extents=%lx, count=%u); disabling trim for %s\n",
- errno, (unsigned long) (unmap.extents), unmap.extentsCount,
- jnl->jdev_name);
- jnl->flags &= ~JOURNAL_USE_UNMAP;
- }
}
-
+
/*
* Call back into the file system to tell them that we have
* trimmed some extents and that they can now be reused.
*/
if (jnl->trim_callback)
jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents);
-
- lck_rw_unlock_shared(&jnl->trim_lock);
}
+ lck_rw_unlock_shared(&jnl->trim_lock);
+ if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+ set_vm_privilege(FALSE);
/*
* If the transaction we're flushing was the async transaction, then
* tell the current transaction that there is no pending trim
jnl->async_trim = NULL;
lck_rw_unlock_exclusive(&jnl->trim_lock);
+ /*
+ * By the time we get here, no other thread can discover the address
+ * of "tr", so it is safe for us to manipulate tr->trim without
+ * holding any locks.
+ */
if (tr->trim.extents) {
kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
tr->trim.allocated_count = 0;
return errno;
}
-
static int
journal_binfo_cmp(const void *a, const void *b)
{
jnl->cur_tr = tr;
goto done;
}
-
+
// if our transaction buffer isn't very full, just hang
// on to it and don't actually flush anything. this is
// what is known as "group commit". we will flush the
KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
goto done;
}
-
+
/*
* Store a pointer to this transaction's trim list so that
* future transactions can find it.
* of the journal flush, 'saved_sequence_num' remains stable
*/
jnl->saved_sequence_num = jnl->sequence_num;
-
+
/*
* if we're here we're going to flush the transaction buffer to disk.
* 'check_free_space' will not return until there is enough free
must_wait = TRUE;
if (drop_lock_early == TRUE) {
- jnl->owner = NULL;
- unlock_journal(jnl);
+ journal_unlock(jnl);
drop_lock = FALSE;
}
if (must_wait == TRUE)
KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
done:
if (drop_lock == TRUE) {
- jnl->owner = NULL;
- unlock_journal(jnl);
+ journal_unlock(jnl);
}
return (ret_val);
}
static void
finish_end_thread(transaction *tr)
{
-#if !CONFIG_EMBEDDED
- proc_apply_thread_selfdiskacc(IOPOL_PASSIVE);
-#else /* !CONFIG_EMBEDDED */
- struct uthread *ut;
-
- ut = get_bsdthread_info(current_thread());
- ut->uu_iopol_disk = IOPOL_PASSIVE;
-#endif /* !CONFIG_EMBEDDED */
+ proc_set_task_policy(current_task(), current_thread(),
+ TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
finish_end_transaction(tr, NULL, NULL);
static void
write_header_thread(journal *jnl)
{
-#if !CONFIG_EMBEDDED
- proc_apply_thread_selfdiskacc(IOPOL_PASSIVE);
-#else /* !CONFIG_EMBEDDED */
- struct uthread *ut;
-
- ut = get_bsdthread_info(current_thread());
- ut->uu_iopol_disk = IOPOL_PASSIVE;
-#endif /* !CONFIG_EMBEDDED */
+ proc_set_task_policy(current_task(), current_thread(),
+ TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
if (write_journal_header(jnl, 1, jnl->saved_sequence_num))
jnl->write_header_failed = TRUE;
end = jnl->jhdr->end;
for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
+ boolean_t was_vm_privileged;
amt = blhdr->bytes_used;
blhdr->checksum = 0;
blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
+ if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
+ /*
+ * if we block waiting for memory, and there is enough pressure to
+ * cause us to try and create a new swap file, we may end up deadlocking
+ * due to waiting for the journal on the swap file creation path...
+ * by making ourselves vm_privileged, we give ourselves the best chance
+ * of not blocking
+ */
+ was_vm_privileged = set_vm_privilege(TRUE);
+ }
if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) {
panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *));
}
+ if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
+ set_vm_privilege(FALSE);
+
tbuffer_offset = jnl->jhdr->blhdr_size;
for (i = 1; i < blhdr->num_blocks; i++) {
jnl->flush_aborted = TRUE;
unlock_condition(jnl, &jnl->flushing);
- lock_journal(jnl);
+ journal_lock(jnl);
jnl->flags |= JOURNAL_INVALID;
jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
abort_transaction(jnl, tr); // cleans up list of extents to be trimmed
- unlock_journal(jnl);
+ journal_unlock(jnl);
} else
unlock_condition(jnl, &jnl->flushing);
*/
vnode_rele_ext(bp_vp, 0, 1);
} else {
- printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n",
+ printf("jnl: %s: abort_tr: could not find block %lld vp %p!\n",
jnl->jdev_name, blhdr->binfo[i].bnum, tbp);
if (bp) {
buf_brelse(bp);
jnl->async_trim = NULL;
lck_rw_unlock_exclusive(&jnl->trim_lock);
+
if (tr->trim.extents) {
kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
}
abort_transaction(jnl, tr);
}
- jnl->owner = NULL;
- unlock_journal(jnl);
+ journal_unlock(jnl);
return EINVAL;
}
// called from end_transaction().
//
jnl->active_tr = NULL;
- ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
+
+ /* Examine the force-journal-flush state in the active txn */
+ if (tr->flush_on_completion == TRUE) {
+ /*
+ * If the FS requested it, disallow group commit and force the
+ * transaction out to disk immediately.
+ */
+ ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE);
+ }
+ else {
+ /* in the common path we can simply use the double-buffered journal */
+ ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
+ }
return ret;
}
KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl, 0, 0, 0, 0);
if (jnl->owner != current_thread()) {
- lock_journal(jnl);
+ journal_lock(jnl);
drop_lock = TRUE;
}
} else {
if (drop_lock == TRUE) {
- unlock_journal(jnl);
+ journal_unlock(jnl);
}
/* Because of the pipelined journal, the journal transactions
{
int ret;
transaction *tr;
-
+ size_t i = 0;
+
/*
* Sanity check inputs, and adjust the size of the transaction buffer.
*/
return ret;
}
wait_condition(jnl, &jnl->flushing, "end_transaction");
-
+
+ /*
+ * At this point, we have completely flushed the contents of the current
+ * journal to disk (and have asynchronously written all of the txns to
+ * their actual desired locations). As a result, we can (and must) clear
+ * out the old_start array. If we do not, then if the last written transaction
+ * started at the beginning of the journal (starting 1 block into the
+ * journal file) it could confuse the buffer_flushed callback. This is
+ * because we're about to reset the start/end pointers of the journal header
+ * below.
+ */
+ lock_oldstart(jnl);
+ for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) {
+ jnl->old_start[i] = 0;
+ }
+ unlock_oldstart(jnl);
+
/* Update the journal's offset and size in memory. */
jnl->jdev_offset = offset;
jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
__unused int32_t flags,
__unused int32_t tbuffer_size,
__unused void (*flush)(void *arg),
- __unused void *arg)
+ __unused void *arg,
+ __unused struct mount *fsmount)
{
return NULL;
}
__unused int32_t flags,
__unused int32_t tbuffer_size,
__unused void (*flush)(void *arg),
- __unused void *arg)
+ __unused void *arg,
+ __unused struct mount *fsmount)
{
return NULL;
}
{
return NULL;
}
+
+void
+journal_lock(__unused journal *jnl)
+{
+ return;
+}
+
+void
+journal_unlock(__unused journal *jnl)
+{
+ return;
+}
+
+__private_extern__ int
+journal_trim_add_extent(__unused journal *jnl,
+ __unused uint64_t offset,
+ __unused uint64_t length)
+{
+ return 0;
+}
+
+int
+journal_request_immediate_flush(__unused journal *jnl)
+{
+ return 0;
+}
+
+__private_extern__ int
+journal_trim_remove_extent(__unused journal *jnl,
+ __unused uint64_t offset,
+ __unused uint64_t length)
+{
+ return 0;
+}
+
+int
+journal_trim_extent_overlap(__unused journal *jnl,
+ __unused uint64_t offset,
+ __unused uint64_t length,
+ __unused uint64_t *end)
+{
+ return 0;
+}
+
#endif // !JOURNALING