git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2002-2014 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	//
	29	// This file implements a simple write-ahead journaling layer.
	30	// In theory any file system can make use of it by calling these
	31	// functions when the fs wants to modify meta-data blocks. See
	32	// vfs_journal.h for a more detailed description of the api and
	33	// data structures.
	34	//
	35	// Dominic Giampaolo (dbg@apple.com)
	36	//
	37
	38	#ifdef KERNEL
	39
	40	#include <sys/param.h>
	41	#include <sys/systm.h>
	42	#include <sys/kernel.h>
	43	#include <sys/file_internal.h>
	44	#include <sys/stat.h>
	45	#include <sys/buf_internal.h>
	46	#include <sys/proc_internal.h>
	47	#include <sys/mount_internal.h>
	48	#include <sys/namei.h>
	49	#include <sys/vnode_internal.h>
	50	#include <sys/ioctl.h>
	51	#include <sys/tty.h>
	52	#include <sys/ubc.h>
	53	#include <sys/malloc.h>
	54	#include <kern/task.h>
	55	#include <kern/thread.h>
	56	#include <kern/kalloc.h>
	57	#include <sys/disk.h>
	58	#include <sys/kdebug.h>
	59	#include <miscfs/specfs/specdev.h>
	60	#include <libkern/OSAtomic.h> /* OSAddAtomic */
	61
	62	kern_return_t thread_terminate(thread_t);
	63
	64	/*
	65	* Set sysctl vfs.generic.jnl.kdebug.trim=1 to enable KERNEL_DEBUG_CONSTANT
	66	* logging of trim-related calls within the journal. (They're
	67	* disabled by default because there can be a lot of these events,
	68	* and we don't want to overwhelm the kernel debug buffer. If you
	69	* want to watch these events in particular, just set the sysctl.)
	70	*/
	71	static int jnl_kdebug = 0;
	72	SYSCTL_DECL(_vfs_generic);
	73	SYSCTL_NODE(_vfs_generic, OID_AUTO, jnl, CTLFLAG_RW\|CTLFLAG_LOCKED, 0, "Journal");
	74	SYSCTL_NODE(_vfs_generic_jnl, OID_AUTO, kdebug, CTLFLAG_RW\|CTLFLAG_LOCKED, 0, "Journal kdebug");
	75	SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW\|CTLFLAG_LOCKED, &jnl_kdebug, 0, "Enable kdebug logging for journal TRIM");
	76
	77	#define DBG_JOURNAL_FLUSH FSDBG_CODE(DBG_JOURNAL, 1)
	78	#define DBG_JOURNAL_TRIM_ADD FSDBG_CODE(DBG_JOURNAL, 2)
	79	#define DBG_JOURNAL_TRIM_REMOVE FSDBG_CODE(DBG_JOURNAL, 3)
	80	#define DBG_JOURNAL_TRIM_REMOVE_PENDING FSDBG_CODE(DBG_JOURNAL, 4)
	81	#define DBG_JOURNAL_TRIM_REALLOC FSDBG_CODE(DBG_JOURNAL, 5)
	82	#define DBG_JOURNAL_TRIM_FLUSH FSDBG_CODE(DBG_JOURNAL, 6)
	83	#define DBG_JOURNAL_TRIM_UNMAP FSDBG_CODE(DBG_JOURNAL, 7)
	84
	85	/*
	86	* Cap the journal max size to 2GB. On HFS, it will attempt to occupy
	87	* a full allocation block if the current size is smaller than the allocation
	88	* block on which it resides. Once we hit the exabyte filesystem range, then
	89	* it will use 2GB allocation blocks. As a result, make the cap 2GB.
	90	*/
	91	#define MAX_JOURNAL_SIZE 0x80000000U
	92
	93	#include <sys/sdt.h> /* DTRACE_IO1 */
	94	#else
	95
	96	#include <stdio.h>
	97	#include <stdlib.h>
	98	#include <string.h>
	99	#include <limits.h>
	100	#include <errno.h>
	101	#include <fcntl.h>
	102	#include <unistd.h>
	103	#include <stdarg.h>
	104	#include <sys/types.h>
	105	#include "compat.h"
	106
	107	#endif /* KERNEL */
	108
	109	#include "vfs_journal.h"
	110
	111	#include <sys/kdebug.h>
	112
	113	#if 0
	114	#undef KERNEL_DEBUG
	115	#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
	116	#endif
	117
	118
	119	#ifndef CONFIG_HFS_TRIM
	120	#define CONFIG_HFS_TRIM 0
	121	#endif
	122
	123
	124	#if JOURNALING
	125
	126	//
	127	// By default, we grow the list of extents to trim by 4K at a time.
	128	// We'll opt to flush a transaction if it contains at least
	129	// JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number
	130	// of modified blocks is small).
	131	//
	132	enum {
	133	JOURNAL_DEFAULT_TRIM_BYTES = 4096,
	134	JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t),
	135	JOURNAL_FLUSH_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16
	136	};
	137
	138	unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS;
	139	SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush");
	140
	141	/* XXX next prototype should be from libsa/stdlib.h> but conflicts libkern */
	142	__private_extern__ void qsort(
	143	void * array,
	144	size_t nmembers,
	145	size_t member_size,
	146	int ()(const void , const void *));
	147
	148
	149
	150	// number of bytes to checksum in a block_list_header
	151	// NOTE: this should be enough to clear out the header
	152	// fields as well as the first entry of binfo[]
	153	#define BLHDR_CHECKSUM_SIZE 32
	154
	155	static void lock_condition(journal jnl, boolean_t condition, const char *condition_name);
	156	static void wait_condition(journal jnl, boolean_t condition, const char *condition_name);
	157	static void unlock_condition(journal jnl, boolean_t condition);
	158	static void finish_end_thread(transaction *tr);
	159	static void write_header_thread(journal *jnl);
	160	static int finish_end_transaction(transaction tr, errno_t (callback)(void), void callback_arg);
	161	static int end_transaction(transaction tr, int force_it, errno_t (callback)(void), void callback_arg, boolean_t drop_lock, boolean_t must_wait);
	162	static void abort_transaction(journal jnl, transaction tr);
	163	static void dump_journal(journal *jnl);
	164
	165	static __inline__ void lock_oldstart(journal *jnl);
	166	static __inline__ void unlock_oldstart(journal *jnl);
	167	static __inline__ void lock_flush(journal *jnl);
	168	static __inline__ void unlock_flush(journal *jnl);
	169
	170
	171	//
	172	// 3105942 - Coalesce writes to the same block on journal replay
	173	//
	174
	// One entry of the replay coalescing table (see add_block/insert_block).
	typedef struct bucket {
		off_t		block_num;	// block number; (off_t)-1 marks an unused slot (see grow_table)
		uint32_t	jnl_offset;	// offset of the block's data within the journal
		uint32_t	block_size;	// size of the block's data in bytes
		int32_t		cksum;		// checksum recorded for the block (0 when it can't be checked)
	} bucket;
	181
	182	#define STARTING_BUCKETS 256
	183
	184	static int add_block(journal jnl, struct bucket buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int num_buckets_ptr, int *num_full_ptr);
	185	static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size);
	186	static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full);
	187	static int do_overlap(journal jnl, struct bucket buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int32_t cksum, int num_buckets_ptr, int *num_full_ptr);
	188	static int insert_block(journal jnl, struct bucket buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int num_buckets_ptr, int *num_full_ptr, int overwriting);
	189
	// Sanity-check a journal pointer: panics if the pointer, its jdev or its
	// fsdev is NULL, if the header magic is wrong, or if the header's start/end
	// offsets fall outside (0, size].
	#define CHECK_JOURNAL(jnl) \
		do { \
			if ((jnl) == NULL) { \
				panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \
			} \
			if ((jnl)->jdev == NULL) { \
				panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \
			} \
			if ((jnl)->fsdev == NULL) { \
				panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \
			} \
			if ((jnl)->jhdr->magic != JOURNAL_HEADER_MAGIC) { \
				panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \
				      __FILE__, __LINE__, (jnl)->jhdr->magic, JOURNAL_HEADER_MAGIC); \
			} \
			if (   (jnl)->jhdr->start <= 0 \
			    || (jnl)->jhdr->start > (jnl)->jhdr->size) { \
				panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
				      __FILE__, __LINE__, (jnl)->jhdr->start, (jnl)->jhdr->size); \
			} \
			if (   (jnl)->jhdr->end <= 0 \
			    || (jnl)->jhdr->end > (jnl)->jhdr->size) { \
				panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
				      __FILE__, __LINE__, (jnl)->jhdr->end, (jnl)->jhdr->size); \
			} \
		} while(0)
	216
	// Sanity-check a transaction pointer: panics on a NULL transaction or NULL
	// tr->jnl, on blhdr not aliasing tbuffer, on negative byte counts/offsets,
	// or on a max_blocks value outside (0, journal size / header size].
	#define CHECK_TRANSACTION(tr) \
		do { \
			if ((tr) == NULL) { \
				panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \
			} \
			if ((tr)->jnl == NULL) { \
				panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \
			} \
			if ((tr)->blhdr != (block_list_header *)(tr)->tbuffer) { \
				panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, (tr)->blhdr, (tr)->tbuffer); \
			} \
			if ((tr)->total_bytes < 0) { \
				panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, (tr)->total_bytes); \
			} \
			if ((tr)->journal_start < 0) { \
				panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, (tr)->journal_start); \
			} \
			if ((tr)->journal_end < 0) { \
				panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, (tr)->journal_end); \
			} \
			if ((tr)->blhdr && ((tr)->blhdr->max_blocks <= 0 || (tr)->blhdr->max_blocks > ((tr)->jnl->jhdr->size/(tr)->jnl->jhdr->jhdr_size))) { \
				panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, (tr)->blhdr->max_blocks); \
			} \
		} while(0)
	241
	242
	243
	244	//
	245	// this isn't a great checksum routine but it will do for now.
	246	// we use it to checksum the journal header and the block list
	247	// headers that are at the start of each transaction.
	248	//
	//
	// Compute the simple rolling checksum over `len` bytes starting at `ptr`.
	// Each byte is read as an unsigned value, mixed in with
	//     cksum = (cksum << 8) ^ (cksum + byte)
	// and the bitwise complement of the result is returned (so an empty
	// buffer yields ~0).
	//
	static unsigned int
	calc_checksum(char *ptr, int len)
	{
		int i;
		unsigned int cksum=0;

		// this is a lame checksum but for now it'll do
		for(i = 0; i < len; i++, ptr++) {
			// dereference the byte (as unsigned); casting the pointer itself
			// would fold the buffer's address into the checksum
			cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
		}

		return (~cksum);
	}
	262
	263	//
	264	// Journal Locking
	265	//
	266	lck_grp_attr_t * jnl_group_attr;
	267	lck_attr_t * jnl_lock_attr;
	268	lck_grp_t * jnl_mutex_group;
	269
	270	void
	271	journal_init(void)
	272	{
	273	jnl_lock_attr = lck_attr_alloc_init();
	274	jnl_group_attr = lck_grp_attr_alloc_init();
	275	jnl_mutex_group = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
	276	}
	277
	278	__inline__ void
	279	journal_lock(journal *jnl)
	280	{
	281	lck_mtx_lock(&jnl->jlock);
	282	if (jnl->owner) {
	283	panic ("jnl: owner is %p, expected NULL\n", jnl->owner);
	284	}
	285	jnl->owner = current_thread();
	286	}
	287
	288	__inline__ void
	289	journal_unlock(journal *jnl)
	290	{
	291	jnl->owner = NULL;
	292	lck_mtx_unlock(&jnl->jlock);
	293	}
	294
	295	static __inline__ void
	296	lock_flush(journal *jnl)
	297	{
	298	lck_mtx_lock(&jnl->flock);
	299	}
	300
	301	static __inline__ void
	302	unlock_flush(journal *jnl)
	303	{
	304	lck_mtx_unlock(&jnl->flock);
	305	}
	306
	307	static __inline__ void
	308	lock_oldstart(journal *jnl)
	309	{
	310	lck_mtx_lock(&jnl->old_start_lock);
	311	}
	312
	313	static __inline__ void
	314	unlock_oldstart(journal *jnl)
	315	{
	316	lck_mtx_unlock(&jnl->old_start_lock);
	317	}
	318
	319
	320
	321	#define JNL_WRITE 0x0001
	322	#define JNL_READ 0x0002
	323	#define JNL_HEADER 0x8000
	324
	325	//
	326	// This function sets up a fake buf and passes it directly to the
	327	// journal device strategy routine (so that it won't get cached in
	328	// the block cache.
	329	//
	330	// It also handles range checking the i/o so that we don't write
	331	// outside the journal boundaries and it will wrap the i/o back
	332	// to the beginning if necessary (skipping over the journal header)
	333	//
	334	static size_t
	335	do_journal_io(journal jnl, off_t offset, void *data, size_t len, int direction)
	336	{
	337	int err, curlen=len;
	338	size_t io_sz = 0;
	339	buf_t bp;
	340	off_t max_iosize;
	341	struct bufattr *bap;
	342
	343	if (offset < 0 \|\| offset > jnl->jhdr->size) {
	344	panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
	345	}
	346
	347	if (direction & JNL_WRITE)
	348	max_iosize = jnl->max_write_size;
	349	else if (direction & JNL_READ)
	350	max_iosize = jnl->max_read_size;
	351	else
	352	max_iosize = 128 * 1024;
	353
	354	again:
	355	bp = alloc_io_buf(jnl->jdev, 1);
	356
	357	if (offset + (off_t)curlen > jnl->jhdr->size && offset != 0 && jnl->jhdr->size != 0) {
	358	if (*offset == jnl->jhdr->size) {
	359	*offset = jnl->jhdr->jhdr_size;
	360	} else {
	361	curlen = (off_t)jnl->jhdr->size - *offset;
	362	}
	363	}
	364
	365	if (curlen > max_iosize) {
	366	curlen = max_iosize;
	367	}
	368
	369	if (curlen <= 0) {
	370	panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %zd\n", curlen, *offset, len);
	371	}
	372
	373	if (*offset == 0 && (direction & JNL_HEADER) == 0) {
	374	panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
	375	}
	376
	377	/*
	378	* As alluded to in the block comment at the top of the function, we use a "fake" iobuf
	379	* here and issue directly to the disk device that the journal protects since we don't
	380	* want this to enter the block cache. As a result, we lose the ability to mark it
	381	* as a metadata buf_t for the layers below us that may care. If we were to
	382	* simply attach the B_META flag into the b_flags this may confuse things further
	383	* since this is an iobuf, not a metadata buffer.
	384	*
	385	* To address this, we use the extended bufattr struct embedded in the bp.
	386	* Explicitly mark the buf here as a metadata buffer in its bufattr flags.
	387	*/
	388	bap = &bp->b_attr;
	389	bap->ba_flags \|= BA_META;
	390
	391	if (direction & JNL_READ)
	392	buf_setflags(bp, B_READ);
	393	else {
	394	/*
	395	* don't have to set any flags
	396	*/
	397	vnode_startwrite(jnl->jdev);
	398	}
	399	buf_setsize(bp, curlen);
	400	buf_setcount(bp, curlen);
	401	buf_setdataptr(bp, (uintptr_t)data);
	402	buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
	403	buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
	404
	405	if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) {
	406	buf_markfua(bp);
	407	}
	408
	409	DTRACE_IO1(journal__start, buf_t, bp);
	410	err = VNOP_STRATEGY(bp);
	411	if (!err) {
	412	err = (int)buf_biowait(bp);
	413	}
	414	DTRACE_IO1(journal__done, buf_t, bp);
	415	free_io_buf(bp);
	416
	417	if (err) {
	418	printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err);
	419	return 0;
	420	}
	421
	422	*offset += curlen;
	423	io_sz += curlen;
	424
	425	if (io_sz != len) {
	426	// handle wrap-around
	427	data = (char *)data + curlen;
	428	curlen = len - io_sz;
	429	if (*offset >= jnl->jhdr->size) {
	430	*offset = jnl->jhdr->jhdr_size;
	431	}
	432	goto again;
	433	}
	434
	435	return io_sz;
	436	}
	437
	438	static size_t
	439	read_journal_data(journal jnl, off_t offset, void *data, size_t len)
	440	{
	441	return do_journal_io(jnl, offset, data, len, JNL_READ);
	442	}
	443
	444	static size_t
	445	write_journal_data(journal jnl, off_t offset, void *data, size_t len)
	446	{
	447	return do_journal_io(jnl, offset, data, len, JNL_WRITE);
	448	}
	449
	450
	451	static size_t
	452	read_journal_header(journal jnl, void data, size_t len)
	453	{
	454	off_t hdr_offset = 0;
	455
	456	return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ\|JNL_HEADER);
	457	}
	458
	459	static int
	460	write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num)
	461	{
	462	static int num_err_prints = 0;
	463	int ret=0;
	464	off_t jhdr_offset = 0;
	465	struct vfs_context context;
	466
	467	context.vc_thread = current_thread();
	468	context.vc_ucred = NOCRED;
	469	//
	470	// Flush the track cache if we're not doing force-unit-access
	471	// writes.
	472	//
	473	if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
	474	ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
	475	}
	476	if (ret != 0) {
	477	//
	478	// Only print this error if it's a different error than the
	479	// previous one, or if it's the first time for this device
	480	// or if the total number of printfs is less than 25. We
	481	// allow for up to 25 printfs to insure that some make it
	482	// into the on-disk syslog. Otherwise if we only printed
	483	// one, it's possible it would never make it to the syslog
	484	// for the root volume and that makes debugging hard.
	485	//
	486	if ( ret != jnl->last_flush_err
	487	\|\| (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
	488	\|\| num_err_prints++ < 25) {
	489
	490	printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret);
	491
	492	jnl->flags \|= JOURNAL_FLUSHCACHE_ERR;
	493	jnl->last_flush_err = ret;
	494	}
	495	}
	496
	497	jnl->jhdr->sequence_num = sequence_num;
	498	jnl->jhdr->checksum = 0;
	499	jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
	500
	501	if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE\|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) {
	502	printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name);
	503	jnl->flags \|= JOURNAL_INVALID;
	504	return -1;
	505	}
	506
	507	// If we're not doing force-unit-access writes, then we
	508	// have to flush after writing the journal header so that
	509	// a future transaction doesn't sneak out to disk before
	510	// the header does and thus overwrite data that the old
	511	// journal header refers to. Saw this exact case happen
	512	// on an IDE bus analyzer with Larry Barras so while it
	513	// may seem obscure, it's not.
	514	//
	515	if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
	516	VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
	517	}
	518
	519	return 0;
	520	}
	521
	522
	523
	524	//
	525	// this is a work function used to free up transactions that
	526	// completed. they can't be free'd from buffer_flushed_callback
	527	// because it is called from deep with the disk driver stack
	528	// and thus can't do something that would potentially cause
	529	// paging. it gets called by each of the journal api entry
	530	// points so stuff shouldn't hang around for too long.
	531	//
	532	static void
	533	free_old_stuff(journal *jnl)
	534	{
	535	transaction tr, next;
	536	block_list_header blhdr=NULL, next_blhdr=NULL;
	537
	538	if (jnl->tr_freeme == NULL)
	539	return;
	540
	541	lock_oldstart(jnl);
	542	tr = jnl->tr_freeme;
	543	jnl->tr_freeme = NULL;
	544	unlock_oldstart(jnl);
	545
	546	for(; tr; tr=next) {
	547	for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) {
	548	next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum);
	549	blhdr->binfo[0].bnum = 0xdeadc0de;
	550
	551	kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
	552
	553	KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0);
	554	}
	555	next = tr->next;
	556	FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
	557	}
	558	}
	559
	560
	561
	562	//
	563	// This is our callback that lets us know when a buffer has been
	564	// flushed to disk. It's called from deep within the driver stack
	565	// and thus is quite limited in what it can do. Notably, it can
	566	// not initiate any new i/o's or allocate/free memory.
	567	//
	568	static void
	569	buffer_flushed_callback(struct buf bp, void arg)
	570	{
	571	transaction *tr;
	572	journal *jnl;
	573	transaction ctr, prev=NULL, *next;
	574	size_t i;
	575	int bufsize, amt_flushed, total_bytes;
	576
	577
	578	//printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n",
	579	// bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg);
	580
	581	// snarf out the bits we want
	582	bufsize = buf_size(bp);
	583	tr = (transaction *)arg;
	584
	585	// then we've already seen it
	586	if (tr == NULL) {
	587	return;
	588	}
	589
	590	CHECK_TRANSACTION(tr);
	591
	592	jnl = tr->jnl;
	593
	594	CHECK_JOURNAL(jnl);
	595
	596	amt_flushed = tr->num_killed;
	597	total_bytes = tr->total_bytes;
	598
	599	// update the number of blocks that have been flushed.
	600	// this buf may represent more than one block so take
	601	// that into account.
	602	//
	603	// OSAddAtomic() returns the value of tr->num_flushed before the add
	604	//
	605	amt_flushed += OSAddAtomic(bufsize, &tr->num_flushed);
	606
	607
	608	// if this transaction isn't done yet, just return as
	609	// there is nothing to do.
	610	//
	611	// NOTE: we are careful to not reference anything through
	612	// the tr pointer after doing the OSAddAtomic(). if
	613	// this if statement fails then we are the last one
	614	// and then it's ok to dereference "tr".
	615	//
	616	if ((amt_flushed + bufsize) < total_bytes) {
	617	return;
	618	}
	619
	620	// this will single thread checking the transaction
	621	lock_oldstart(jnl);
	622
	623	if (tr->total_bytes == (int)0xfbadc0de) {
	624	// then someone beat us to it...
	625	unlock_oldstart(jnl);
	626	return;
	627	}
	628
	629	// mark this so that we're the owner of dealing with the
	630	// cleanup for this transaction
	631	tr->total_bytes = 0xfbadc0de;
	632
	633	if (jnl->flags & JOURNAL_INVALID)
	634	goto transaction_done;
	635
	636	//printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
	637	// tr, tr->journal_start, tr->journal_end, jnl);
	638
	639	// find this entry in the old_start[] index and mark it completed
	640	for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
	641
	642	if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) {
	643	jnl->old_start[i] &= ~(0x8000000000000000ULL);
	644	break;
	645	}
	646	}
	647
	648	if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
	649	panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
	650	tr->journal_start, tr, jnl);
	651	}
	652
	653
	654	// if we are here then we need to update the journal header
	655	// to reflect that this transaction is complete
	656	if (tr->journal_start == jnl->active_start) {
	657	jnl->active_start = tr->journal_end;
	658	tr->journal_start = tr->journal_end = (off_t)0;
	659	}
	660
	661	// go through the completed_trs list and try to coalesce
	662	// entries, restarting back at the beginning if we have to.
	663	for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) {
	664	if (ctr->journal_start == jnl->active_start) {
	665	jnl->active_start = ctr->journal_end;
	666	if (prev) {
	667	prev->next = ctr->next;
	668	}
	669	if (ctr == jnl->completed_trs) {
	670	jnl->completed_trs = ctr->next;
	671	}
	672
	673	next = jnl->completed_trs; // this starts us over again
	674	ctr->next = jnl->tr_freeme;
	675	jnl->tr_freeme = ctr;
	676	ctr = NULL;
	677	} else if (tr->journal_end == ctr->journal_start) {
	678	ctr->journal_start = tr->journal_start;
	679	next = jnl->completed_trs; // this starts us over again
	680	ctr = NULL;
	681	tr->journal_start = tr->journal_end = (off_t)0;
	682	} else if (tr->journal_start == ctr->journal_end) {
	683	ctr->journal_end = tr->journal_end;
	684	next = ctr->next;
	685	tr->journal_start = tr->journal_end = (off_t)0;
	686	} else if (ctr->next && ctr->journal_end == ctr->next->journal_start) {
	687	// coalesce the next entry with this one and link the next
	688	// entry in at the head of the tr_freeme list
	689	next = ctr->next; // temporarily use the "next" variable
	690	ctr->journal_end = next->journal_end;
	691	ctr->next = next->next;
	692	next->next = jnl->tr_freeme; // link in the next guy at the head of the tr_freeme list
	693	jnl->tr_freeme = next;
	694
	695	next = jnl->completed_trs; // this starts us over again
	696	ctr = NULL;
	697	} else {
	698	next = ctr->next;
	699	}
	700	}
	701
	702	// if this is true then we didn't merge with anyone
	703	// so link ourselves in at the head of the completed
	704	// transaction list.
	705	if (tr->journal_start != 0) {
	706	// put this entry into the correct sorted place
	707	// in the list instead of just at the head.
	708	//
	709
	710	prev = NULL;
	711	for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
	712	// just keep looping
	713	}
	714
	715	if (ctr == NULL && prev == NULL) {
	716	jnl->completed_trs = tr;
	717	tr->next = NULL;
	718	} else if (ctr == jnl->completed_trs) {
	719	tr->next = jnl->completed_trs;
	720	jnl->completed_trs = tr;
	721	} else {
	722	tr->next = prev->next;
	723	prev->next = tr;
	724	}
	725	} else {
	726	// if we're here this tr got merged with someone else so
	727	// put it on the list to be free'd
	728	tr->next = jnl->tr_freeme;
	729	jnl->tr_freeme = tr;
	730	}
	731	transaction_done:
	732	unlock_oldstart(jnl);
	733
	734	unlock_condition(jnl, &jnl->asyncIO);
	735	}
	736
	737
	738	#include <libkern/OSByteOrder.h>
	739
	740	#define SWAP16(x) OSSwapInt16(x)
	741	#define SWAP32(x) OSSwapInt32(x)
	742	#define SWAP64(x) OSSwapInt64(x)
	743
	744
	745	static void
	746	swap_journal_header(journal *jnl)
	747	{
	748	jnl->jhdr->magic = SWAP32(jnl->jhdr->magic);
	749	jnl->jhdr->endian = SWAP32(jnl->jhdr->endian);
	750	jnl->jhdr->start = SWAP64(jnl->jhdr->start);
	751	jnl->jhdr->end = SWAP64(jnl->jhdr->end);
	752	jnl->jhdr->size = SWAP64(jnl->jhdr->size);
	753	jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
	754	jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum);
	755	jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size);
	756	jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num);
	757	}
	758
	759	static void
	760	swap_block_list_header(journal jnl, block_list_header blhdr)
	761	{
	762	int i;
	763
	764	blhdr->max_blocks = SWAP16(blhdr->max_blocks);
	765	blhdr->num_blocks = SWAP16(blhdr->num_blocks);
	766	blhdr->bytes_used = SWAP32(blhdr->bytes_used);
	767	blhdr->checksum = SWAP32(blhdr->checksum);
	768	blhdr->flags = SWAP32(blhdr->flags);
	769
	770	if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) {
	771	printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d). not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size);
	772	return;
	773	}
	774
	775	for(i = 0; i < blhdr->num_blocks; i++) {
	776	blhdr->binfo[i].bnum = SWAP64(blhdr->binfo[i].bnum);
	777	blhdr->binfo[i].u.bi.bsize = SWAP32(blhdr->binfo[i].u.bi.bsize);
	778	blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum);
	779	}
	780	}
	781
	782
	783	static int
	784	update_fs_block(journal jnl, void block_ptr, off_t fs_block, size_t bsize)
	785	{
	786	int ret;
	787	struct buf *oblock_bp=NULL;
	788
	789	// first read the block we want.
	790	ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
	791	if (ret != 0) {
	792	printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret);
	793
	794	if (oblock_bp) {
	795	buf_brelse(oblock_bp);
	796	oblock_bp = NULL;
	797	}
	798
	799	// let's try to be aggressive here and just re-write the block
	800	oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META);
	801	if (oblock_bp == NULL) {
	802	printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block);
	803	return -1;
	804	}
	805	}
	806
	807	// make sure it's the correct size.
	808	if (buf_size(oblock_bp) != bsize) {
	809	buf_brelse(oblock_bp);
	810	return -1;
	811	}
	812
	813	// copy the journal data over top of it
	814	memcpy((char *)buf_dataptr(oblock_bp), block_ptr, bsize);
	815
	816	if ((ret = VNOP_BWRITE(oblock_bp)) != 0) {
	817	printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret);
	818	return ret;
	819	}
	820
	821	// and now invalidate it so that if someone else wants to read
	822	// it in a different size they'll be able to do it.
	823	ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
	824	if (oblock_bp) {
	825	buf_markinvalid(oblock_bp);
	826	buf_brelse(oblock_bp);
	827	}
	828
	829	return 0;
	830	}
	831
	832	static int
	833	grow_table(struct bucket **buf_ptr, int num_buckets, int new_size)
	834	{
	835	struct bucket *newBuf;
	836	int current_size = num_buckets, i;
	837
	838	// return if newsize is less than the current size
	839	if (new_size < num_buckets) {
	840	return current_size;
	841	}
	842
	843	if ((MALLOC(newBuf, struct bucket , new_sizesizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
	844	printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
	845	return -1;
	846	}
	847
	848	// printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
	849
	850	// copy existing elements
	851	bcopy(buf_ptr, newBuf, num_bucketssizeof(struct bucket));
	852
	853	// initialize the new ones
	854	for(i = num_buckets; i < new_size; i++) {
	855	newBuf[i].block_num = (off_t)-1;
	856	}
	857
	858	// free the old container
	859	FREE(*buf_ptr, M_TEMP);
	860
	861	// reset the buf_ptr
	862	*buf_ptr = newBuf;
	863
	864	return new_size;
	865	}
	866
	867	static int
	868	lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full)
	869	{
	870	int lo, hi, index, matches, i;
	871
	872	if (num_full == 0) {
	873	return 0; // table is empty, so insert at index=0
	874	}
	875
	876	lo = 0;
	877	hi = num_full - 1;
	878	index = -1;
	879
	880	// perform binary search for block_num
	881	do {
	882	int mid = (hi - lo)/2 + lo;
	883	off_t this_num = (*buf_ptr)[mid].block_num;
	884
	885	if (block_num == this_num) {
	886	index = mid;
	887	break;
	888	}
	889
	890	if (block_num < this_num) {
	891	hi = mid;
	892	continue;
	893	}
	894
	895	if (block_num > this_num) {
	896	lo = mid + 1;
	897	continue;
	898	}
	899	} while (lo < hi);
	900
	901	// check if lo and hi converged on the match
	902	if (block_num == (*buf_ptr)[hi].block_num) {
	903	index = hi;
	904	}
	905
	906	// if no existing entry found, find index for new one
	907	if (index == -1) {
	908	index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
	909	} else {
	910	// make sure that we return the right-most index in the case of multiple matches
	911	matches = 0;
	912	i = index + 1;
	913	while (i < num_full && block_num == (*buf_ptr)[i].block_num) {
	914	matches++;
	915	i++;
	916	}
	917
	918	index += matches;
	919	}
	920
	921	return index;
	922	}
	923
	924	static int
	925	insert_block(journal jnl, struct bucket buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int num_buckets_ptr, int *num_full_ptr, int overwriting)
	926	{
	927	if (!overwriting) {
	928	// grow the table if we're out of space
	929	if (num_full_ptr >= num_buckets_ptr) {
	930	int new_size = num_buckets_ptr 2;
	931	int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);
	932
	933	if (grow_size < new_size) {
	934	printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name);
	935	return -1;
	936	}
	937
	938	*num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
	939	}
	940
	941	// if we're not inserting at the end, we need to bcopy
	942	if (blk_index != *num_full_ptr) {
	943	bcopy( (buf_ptr)+(blk_index), (buf_ptr)+(blk_index+1), (num_full_ptr-blk_index)sizeof(struct bucket) );
	944	}
	945
	946	(*num_full_ptr)++; // increment only if we're not overwriting
	947	}
	948
	949	// sanity check the values we're about to add
	950	if ((off_t)offset >= jnl->jhdr->size) {
	951	offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
	952	}
	953	if (size <= 0) {
	954	panic("jnl: insert_block: bad size in insert_block (%zd)\n", size);
	955	}
	956
	957	(*buf_ptr)[blk_index].block_num = num;
	958	(*buf_ptr)[blk_index].block_size = size;
	959	(*buf_ptr)[blk_index].jnl_offset = offset;
	960	(*buf_ptr)[blk_index].cksum = cksum;
	961
	962	return blk_index;
	963	}
	964
	965	static int
	966	do_overlap(journal jnl, struct bucket buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int num_buckets_ptr, int *num_full_ptr)
	967	{
	968	int num_to_remove, index, i, overwrite, err;
	969	size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset;
	970	off_t overlap, block_start, block_end;
	971
	972	block_start = block_num*jhdr_size;
	973	block_end = block_start + size;
	974	overwrite = (block_num == (buf_ptr)[blk_index].block_num && size >= (buf_ptr)[blk_index].block_size);
	975
	976	// first, eliminate any overlap with the previous entry
	977	if (blk_index != 0 && !overwrite) {
	978	off_t prev_block_start = (buf_ptr)[blk_index-1].block_numjhdr_size;
	979	off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
	980	overlap = prev_block_end - block_start;
	981	if (overlap > 0) {
	982	if (overlap % jhdr_size != 0) {
	983	panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size);
	984	}
	985
	986	// if the previous entry completely overlaps this one, we need to break it into two pieces.
	987	if (prev_block_end > block_end) {
	988	off_t new_num = block_end / jhdr_size;
	989	size_t new_size = prev_block_end - block_end;
	990
	991	new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);
	992
	993	err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0);
	994	if (err < 0) {
	995	panic("jnl: do_overlap: error inserting during pre-overlap\n");
	996	}
	997	}
	998
	999	// Regardless, we need to truncate the previous entry to the beginning of the overlap
	1000	(*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start;
	1001	(*buf_ptr)[blk_index-1].cksum = 0; // have to blow it away because there's no way to check it
	1002	}
	1003	}
	1004
	1005	// then, bail out fast if there's no overlap with the entries that follow
	1006	if (!overwrite && block_end <= (off_t)((buf_ptr)[blk_index].block_numjhdr_size)) {
	1007	return 0; // no overlap, no overwrite
	1008	} else if (overwrite && (blk_index + 1 >= num_full_ptr \|\| block_end <= (off_t)((buf_ptr)[blk_index+1].block_num*jhdr_size))) {
	1009
	1010	(*buf_ptr)[blk_index].cksum = cksum; // update this
	1011	return 1; // simple overwrite
	1012	}
	1013
	1014	// Otherwise, find all cases of total and partial overlap. We use the special
	1015	// block_num of -2 to designate entries that are completely overlapped and must
	1016	// be eliminated. The block_num, size, and jnl_offset of partially overlapped
	1017	// entries must be adjusted to keep the array consistent.
	1018	index = blk_index;
	1019	num_to_remove = 0;
	1020	while (index < num_full_ptr && block_end > (off_t)((buf_ptr)[index].block_num*jhdr_size)) {
	1021	if (block_end >= (off_t)(((buf_ptr)[index].block_numjhdr_size + (*buf_ptr)[index].block_size))) {
	1022	(*buf_ptr)[index].block_num = -2; // mark this for deletion
	1023	num_to_remove++;
	1024	} else {
	1025	overlap = block_end - (buf_ptr)[index].block_numjhdr_size;
	1026	if (overlap > 0) {
	1027	if (overlap % jhdr_size != 0) {
	1028	panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size);
	1029	}
	1030
	1031	// if we partially overlap this entry, adjust its block number, jnl offset, and size
	1032	(*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
	1033	(*buf_ptr)[index].cksum = 0;
	1034
	1035	new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
	1036	if ((off_t)new_offset >= jnl->jhdr->size) {
	1037	new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
	1038	}
	1039	(*buf_ptr)[index].jnl_offset = new_offset;
	1040
	1041	(*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
	1042	if ((*buf_ptr)[index].block_size <= 0) {
	1043	panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size);
	1044	// return -1; // if above panic is removed, return -1 for error
	1045	}
	1046	}
	1047
	1048	}
	1049
	1050	index++;
	1051	}
	1052
	1053	// bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
	1054	index--; // start with the last index used within the above loop
	1055	while (index >= blk_index) {
	1056	if ((*buf_ptr)[index].block_num == -2) {
	1057	if (index == *num_full_ptr-1) {
	1058	(*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
	1059	} else {
	1060	bcopy( (buf_ptr)+(index+1), (buf_ptr)+(index), (num_full_ptr - (index + 1)) sizeof(struct bucket) );
	1061	}
	1062	(*num_full_ptr)--;
	1063	}
	1064	index--;
	1065	}
	1066
	1067	// eliminate any stale entries at the end of the table
	1068	for(i = num_full_ptr; i < (num_full_ptr + num_to_remove); i++) {
	1069	(*buf_ptr)[i].block_num = -1;
	1070	}
	1071
	1072	return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
	1073	}
	1074
	1075	// PR-3105942: Coalesce writes to the same block in journal replay
	1076	// We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
	1077	// to be replayed and the corresponding location in the journal which contains
	1078	// the most recent data for those blocks. The array is "played" once the all the
	1079	// blocks in the journal have been coalesced. The code for the case of conflicting/
	1080	// overlapping writes to a single block is the most dense. Because coalescing can
	1081	// disrupt the existing time-ordering of blocks in the journal playback, care
	1082	// is taken to catch any overlaps and keep the array consistent.
	1083	static int
	1084	add_block(journal jnl, struct bucket buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int num_buckets_ptr, int *num_full_ptr)
	1085	{
	1086	int blk_index, overwriting;
	1087
	1088	// on return from lookup_bucket(), blk_index is the index into the table where block_num should be
	1089	// inserted (or the index of the elem to overwrite).
	1090	blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);
	1091
	1092	// check if the index is within bounds (if we're adding this block to the end of
	1093	// the table, blk_index will be equal to num_full)
	1094	if (blk_index < 0 \|\| blk_index > *num_full_ptr) {
	1095	//printf("jnl: add_block: trouble adding block to co_buf\n");
	1096	return -1;
	1097	} // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
	1098
	1099	// Determine whether we're overwriting an existing entry by checking for overlap
	1100	overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr);
	1101	if (overwriting < 0) {
	1102	return -1; // if we got an error, pass it along
	1103	}
	1104
	1105	// returns the index, or -1 on error
	1106	blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting);
	1107
	1108	return blk_index;
	1109	}
	1110
	1111	static int
	1112	replay_journal(journal *jnl)
	1113	{
	1114	int i, bad_blocks=0;
	1115	unsigned int orig_checksum, checksum, check_block_checksums = 0;
	1116	size_t ret;
	1117	size_t max_bsize = 0; /* protected by block_ptr */
	1118	block_list_header *blhdr;
	1119	off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start;
	1120	char buff, block_ptr=NULL;
	1121	struct bucket *co_buf;
	1122	int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
	1123	uint32_t last_sequence_num = 0;
	1124	int replay_retry_count = 0;
	1125
	1126	// wrap the start ptr if it points to the very end of the journal
	1127	if (jnl->jhdr->start == jnl->jhdr->size) {
	1128	jnl->jhdr->start = jnl->jhdr->jhdr_size;
	1129	}
	1130	if (jnl->jhdr->end == jnl->jhdr->size) {
	1131	jnl->jhdr->end = jnl->jhdr->jhdr_size;
	1132	}
	1133
	1134	if (jnl->jhdr->start == jnl->jhdr->end) {
	1135	return 0;
	1136	}
	1137
	1138	orig_jnl_start = jnl->jhdr->start;
	1139
	1140	// allocate memory for the header_block. we'll read each blhdr into this
	1141	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) {
	1142	printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n",
	1143	jnl->jdev_name, jnl->jhdr->blhdr_size);
	1144	return -1;
	1145	}
	1146
	1147	// allocate memory for the coalesce buffer
	1148	if ((MALLOC(co_buf, struct bucket , num_bucketssizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
	1149	printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name);
	1150	return -1;
	1151	}
	1152
	1153	restart_replay:
	1154
	1155	// initialize entries
	1156	for(i = 0; i < num_buckets; i++) {
	1157	co_buf[i].block_num = -1;
	1158	}
	1159	num_full = 0; // empty at first
	1160
	1161
	1162	printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
	1163	jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);
	1164
	1165	while (check_past_jnl_end \|\| jnl->jhdr->start != jnl->jhdr->end) {
	1166	offset = blhdr_offset = jnl->jhdr->start;
	1167	ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
	1168	if (ret != (size_t)jnl->jhdr->blhdr_size) {
	1169	printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset);
	1170	bad_blocks = 1;
	1171	goto bad_txn_handling;
	1172	}
	1173
	1174	blhdr = (block_list_header *)buff;
	1175
	1176	orig_checksum = blhdr->checksum;
	1177	blhdr->checksum = 0;
	1178	if (jnl->flags & JOURNAL_NEED_SWAP) {
	1179	// calculate the checksum based on the unswapped data
	1180	// because it is done byte-at-a-time.
	1181	orig_checksum = (unsigned int)SWAP32(orig_checksum);
	1182	checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
	1183	swap_block_list_header(jnl, blhdr);
	1184	} else {
	1185	checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
	1186	}
	1187
	1188
	1189	//
	1190	// XXXdbg - if these checks fail, we should replay as much
	1191	// we can in the hopes that it will still leave the
	1192	// drive in a better state than if we didn't replay
	1193	// anything
	1194	//
	1195	if (checksum != orig_checksum) {
	1196	if (check_past_jnl_end && in_uncharted_territory) {
	1197
	1198	if (blhdr_offset != jnl->jhdr->end) {
	1199	printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
	1200	}
	1201
	1202	check_past_jnl_end = 0;
	1203	jnl->jhdr->end = blhdr_offset;
	1204	continue;
	1205	}
	1206
	1207	printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
	1208	jnl->jdev_name, blhdr_offset, orig_checksum, checksum);
	1209
	1210	if (blhdr_offset == orig_jnl_start) {
	1211	// if there's nothing in the journal at all, just bail out altogether.
	1212	goto bad_replay;
	1213	}
	1214
	1215	bad_blocks = 1;
	1216	goto bad_txn_handling;
	1217	}
	1218
	1219	if ( (last_sequence_num != 0)
	1220	&& (blhdr->binfo[0].u.bi.b.sequence_num != 0)
	1221	&& (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num)
	1222	&& (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) {
	1223
	1224	txn_start_offset = jnl->jhdr->end = blhdr_offset;
	1225
	1226	if (check_past_jnl_end) {
	1227	check_past_jnl_end = 0;
	1228	printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
	1229	jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
	1230	continue;
	1231	}
	1232
	1233	printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
	1234	jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
	1235	bad_blocks = 1;
	1236	goto bad_txn_handling;
	1237	}
	1238	last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num;
	1239
	1240	if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) {
	1241	if (last_sequence_num == 0) {
	1242	check_past_jnl_end = 0;
	1243	printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n",
	1244	jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
	1245	if (jnl->jhdr->start != jnl->jhdr->end) {
	1246	jnl->jhdr->start = jnl->jhdr->end;
	1247	}
	1248	continue;
	1249	}
	1250	printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
	1251	}
	1252
	1253	if ( blhdr->max_blocks <= 0 \|\| blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size)
	1254	\|\| blhdr->num_blocks <= 0 \|\| blhdr->num_blocks > blhdr->max_blocks) {
	1255	printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n",
	1256	jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks);
	1257	bad_blocks = 1;
	1258	goto bad_txn_handling;
	1259	}
	1260
	1261	max_bsize = 0;
	1262	for (i = 1; i < blhdr->num_blocks; i++) {
	1263	if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
	1264	printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum);
	1265	bad_blocks = 1;
	1266	goto bad_txn_handling;
	1267	}
	1268
	1269	if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) {
	1270	max_bsize = blhdr->binfo[i].u.bi.bsize;
	1271	}
	1272	}
	1273
	1274	if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) {
	1275	check_block_checksums = 1;
	1276	if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
	1277	goto bad_replay;
	1278	}
	1279	} else {
	1280	block_ptr = NULL;
	1281	}
	1282
	1283	if (blhdr->flags & BLHDR_FIRST_HEADER) {
	1284	txn_start_offset = blhdr_offset;
	1285	}
	1286
	1287	//printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
	1288	// blhdr->num_blocks-1, jnl->jhdr->start);
	1289	bad_blocks = 0;
	1290	for (i = 1; i < blhdr->num_blocks; i++) {
	1291	int size, ret_val;
	1292	off_t number;
	1293
	1294	size = blhdr->binfo[i].u.bi.bsize;
	1295	number = blhdr->binfo[i].bnum;
	1296
	1297	// don't add "killed" blocks
	1298	if (number == (off_t)-1) {
	1299	//printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
	1300	} else {
	1301
	1302	if (check_block_checksums) {
	1303	int32_t disk_cksum;
	1304	off_t block_offset;
	1305
	1306	block_offset = offset;
	1307
	1308	// read the block so we can check the checksum
	1309	ret = read_journal_data(jnl, &block_offset, block_ptr, size);
	1310	if (ret != (size_t)size) {
	1311	printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
	1312	bad_blocks = 1;
	1313	goto bad_txn_handling;
	1314	}
	1315
	1316	disk_cksum = calc_checksum(block_ptr, size);
	1317
	1318	// there is no need to swap the checksum from disk because
	1319	// it got swapped when the blhdr was read in.
	1320	if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) {
	1321	printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
	1322	jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum);
	1323	printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
	1324	(int )&block_ptr[0sizeof(int)], (int )&block_ptr[1sizeof(int)], (int )&block_ptr[2sizeof(int)], (int )&block_ptr[3sizeof(int)],
	1325	(int )&block_ptr[4sizeof(int)], (int )&block_ptr[5sizeof(int)], (int )&block_ptr[6sizeof(int)], (int )&block_ptr[7sizeof(int)]);
	1326
	1327	bad_blocks = 1;
	1328	goto bad_txn_handling;
	1329	}
	1330	}
	1331
	1332
	1333	// add this bucket to co_buf, coalescing where possible
	1334	// printf("jnl: replay_journal: adding block 0x%llx\n", number);
	1335	ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full);
	1336
	1337	if (ret_val == -1) {
	1338	printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name);
	1339	goto bad_replay;
	1340	} // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
	1341	}
	1342
	1343	// increment offset
	1344	offset += size;
	1345
	1346	// check if the last block added puts us off the end of the jnl.
	1347	// if so, we need to wrap to the beginning and take any remainder
	1348	// into account
	1349	//
	1350	if (offset >= jnl->jhdr->size) {
	1351	offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
	1352	}
	1353	}
	1354
	1355	if (block_ptr) {
	1356	kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
	1357	block_ptr = NULL;
	1358	}
	1359
	1360	bad_txn_handling:
	1361	if (bad_blocks) {
	1362	/* Journal replay got error before it found any valid
	1363	* transations, abort replay */
	1364	if (txn_start_offset == 0) {
	1365	printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
	1366	goto bad_replay;
	1367	}
	1368
	1369	/* Repeated error during journal replay, abort replay */
	1370	if (replay_retry_count == 3) {
	1371	printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name);
	1372	goto bad_replay;
	1373	}
	1374	replay_retry_count++;
	1375
	1376	/* There was an error replaying the journal (possibly
	1377	* EIO/ENXIO from the device). So retry replaying all
	1378	* the good transactions that we found before getting
	1379	* the error.
	1380	*/
	1381	jnl->jhdr->start = orig_jnl_start;
	1382	jnl->jhdr->end = txn_start_offset;
	1383	check_past_jnl_end = 0;
	1384	last_sequence_num = 0;
	1385	printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
	1386	goto restart_replay;
	1387	}
	1388
	1389	jnl->jhdr->start += blhdr->bytes_used;
	1390	if (jnl->jhdr->start >= jnl->jhdr->size) {
	1391	// wrap around and skip the journal header block
	1392	jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
	1393	}
	1394
	1395	if (jnl->jhdr->start == jnl->jhdr->end) {
	1396	in_uncharted_territory = 1;
	1397	}
	1398	}
	1399
	1400	if (jnl->jhdr->start != jnl->jhdr->end) {
	1401	printf("jnl: %s: start %lld != end %lld. resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
	1402	jnl->jhdr->end = jnl->jhdr->start;
	1403	}
	1404
	1405	//printf("jnl: replay_journal: replaying %d blocks\n", num_full);
	1406
	1407	/*
	1408	* make sure it's at least one page in size, so
	1409	* start max_bsize at PAGE_SIZE
	1410	*/
	1411	for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {
	1412
	1413	if (co_buf[i].block_num == (off_t)-1)
	1414	continue;
	1415
	1416	if (co_buf[i].block_size > max_bsize)
	1417	max_bsize = co_buf[i].block_size;
	1418	}
	1419	/*
	1420	* round max_bsize up to the nearest PAGE_SIZE multiple
	1421	*/
	1422	if (max_bsize & (PAGE_SIZE - 1)) {
	1423	max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
	1424	}
	1425
	1426	if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
	1427	goto bad_replay;
	1428	}
	1429
	1430	// Replay the coalesced entries in the co-buf
	1431	for(i = 0; i < num_full; i++) {
	1432	size_t size = co_buf[i].block_size;
	1433	off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
	1434	off_t number = co_buf[i].block_num;
	1435
	1436
	1437	// printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
	1438	// co_buf[i].block_size, co_buf[i].jnl_offset);
	1439
	1440	if (number == (off_t)-1) {
	1441	// printf("jnl: replay_journal: skipping killed fs block\n");
	1442	} else {
	1443
	1444	// do journal read, and set the phys. block
	1445	ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
	1446	if (ret != size) {
	1447	printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
	1448	goto bad_replay;
	1449	}
	1450
	1451	if (update_fs_block(jnl, block_ptr, number, size) != 0) {
	1452	goto bad_replay;
	1453	}
	1454	}
	1455	}
	1456
	1457
	1458	// done replaying; update jnl header
	1459	if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
	1460	goto bad_replay;
	1461	}
	1462
	1463	printf("jnl: %s: journal replay done.\n", jnl->jdev_name);
	1464
	1465	// free block_ptr
	1466	if (block_ptr) {
	1467	kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
	1468	block_ptr = NULL;
	1469	}
	1470
	1471	// free the coalesce buffer
	1472	FREE(co_buf, M_TEMP);
	1473	co_buf = NULL;
	1474
	1475	kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
	1476	return 0;
	1477
	1478	bad_replay:
	1479	if (block_ptr) {
	1480	kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
	1481	}
	1482	if (co_buf) {
	1483	FREE(co_buf, M_TEMP);
	1484	}
	1485	kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
	1486
	1487	return -1;
	1488	}
	1489
	1490
	1491	#define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024)
	1492	#define MAX_TRANSACTION_BUFFER_SIZE (3072*1024)
	1493
	1494	// XXXdbg - so I can change it in the debugger
	1495	int def_tbuffer_size = 0;
	1496
	1497
	1498	//
	1499	// This function sets the size of the tbuffer and the
	1500	// size of the blhdr. It assumes that jnl->jhdr->size
	1501	// and jnl->jhdr->jhdr_size are already valid.
	1502	//
	1503	static void
	1504	size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
	1505	{
	1506	//
	1507	// one-time initialization based on how much memory
	1508	// there is in the machine.
	1509	//
	1510	if (def_tbuffer_size == 0) {
	1511	if (max_mem < (25610241024)) {
	1512	def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
	1513	} else if (max_mem < (51210241024)) {
	1514	def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
	1515	} else if (max_mem < (102410241024)) {
	1516	def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
	1517	} else {
	1518	def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (max_mem / (25610241024));
	1519	}
	1520	}
	1521
	1522	// size up the transaction buffer... can't be larger than the number
	1523	// of blocks that can fit in a block_list_header block.
	1524	if (tbuffer_size == 0) {
	1525	jnl->tbuffer_size = def_tbuffer_size;
	1526	} else {
	1527	// make sure that the specified tbuffer_size isn't too small
	1528	if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
	1529	tbuffer_size = jnl->jhdr->blhdr_size * 2;
	1530	}
	1531	// and make sure it's an even multiple of the block size
	1532	if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
	1533	tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
	1534	}
	1535
	1536	jnl->tbuffer_size = tbuffer_size;
	1537	}
	1538
	1539	if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
	1540	jnl->tbuffer_size = (jnl->jhdr->size / 2);
	1541	}
	1542
	1543	if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
	1544	jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
	1545	}
	1546
	1547	jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
	1548	if (jnl->jhdr->blhdr_size < phys_blksz) {
	1549	jnl->jhdr->blhdr_size = phys_blksz;
	1550	} else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
	1551	// have to round up so we're an even multiple of the physical block size
	1552	jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
	1553	}
	1554	}
	1555
	1556	static void
	1557	get_io_info(struct vnode devvp, size_t phys_blksz, journal jnl, struct vfs_context *context)
	1558	{
	1559	off_t readblockcnt;
	1560	off_t writeblockcnt;
	1561	off_t readmaxcnt=0, tmp_readmaxcnt;
	1562	off_t writemaxcnt=0, tmp_writemaxcnt;
	1563	off_t readsegcnt, writesegcnt;
	1564	int32_t features;
	1565
	1566	if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
	1567	if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
	1568	const char *name = vnode_getname_printable(devvp);
	1569	jnl->flags \|= JOURNAL_DO_FUA_WRITES;
	1570	printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features);
	1571	vnode_putname_printable(name);
	1572	}
	1573	if (features & DK_FEATURE_UNMAP) {
	1574	jnl->flags \|= JOURNAL_USE_UNMAP;
	1575	}
	1576	}
	1577
	1578	//
	1579	// First check the max read size via several different mechanisms...
	1580	//
	1581	VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context);
	1582
	1583	if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context) == 0) {
	1584	tmp_readmaxcnt = readblockcnt * phys_blksz;
	1585	if (readmaxcnt == 0 \|\| (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) {
	1586	readmaxcnt = tmp_readmaxcnt;
	1587	}
	1588	}
	1589
	1590	if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt, 0, context)) {
	1591	readsegcnt = 0;
	1592	}
	1593
	1594	if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) {
	1595	readmaxcnt = readsegcnt * PAGE_SIZE;
	1596	}
	1597
	1598	if (readmaxcnt == 0) {
	1599	readmaxcnt = 128 * 1024;
	1600	} else if (readmaxcnt > UINT32_MAX) {
	1601	readmaxcnt = UINT32_MAX;
	1602	}
	1603
	1604
	1605	//
	1606	// Now check the max writes size via several different mechanisms...
	1607	//
	1608	VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context);
	1609
	1610	if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context) == 0) {
	1611	tmp_writemaxcnt = writeblockcnt * phys_blksz;
	1612	if (writemaxcnt == 0 \|\| (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) {
	1613	writemaxcnt = tmp_writemaxcnt;
	1614	}
	1615	}
	1616
	1617	if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt, 0, context)) {
	1618	writesegcnt = 0;
	1619	}
	1620
	1621	if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) {
	1622	writemaxcnt = writesegcnt * PAGE_SIZE;
	1623	}
	1624
	1625	if (writemaxcnt == 0) {
	1626	writemaxcnt = 128 * 1024;
	1627	} else if (writemaxcnt > UINT32_MAX) {
	1628	writemaxcnt = UINT32_MAX;
	1629	}
	1630
	1631	jnl->max_read_size = readmaxcnt;
	1632	jnl->max_write_size = writemaxcnt;
	1633	// printf("jnl: %s: max read/write: %lld k / %lld k\n",
	1634	// jnl->jdev_name ? jnl->jdev_name : "unknown",
	1635	// jnl->max_read_size/1024, jnl->max_write_size/1024);
	1636	}
	1637
	1638
	1639	journal *
	1640	journal_create(struct vnode *jvp,
	1641	off_t offset,
	1642	off_t journal_size,
	1643	struct vnode *fsvp,
	1644	size_t min_fs_blksz,
	1645	int32_t flags,
	1646	int32_t tbuffer_size,
	1647	void (flush)(void arg),
	1648	void *arg,
	1649	struct mount *fsmount)
	1650	{
	1651	journal *jnl;
	1652	uint32_t phys_blksz, new_txn_base;
	1653	u_int32_t min_size;
	1654	struct vfs_context context;
	1655	const char *jdev_name;
	1656	/*
	1657	* Cap the journal max size to 2GB. On HFS, it will attempt to occupy
	1658	* a full allocation block if the current size is smaller than the allocation
	1659	* block on which it resides. Once we hit the exabyte filesystem range, then
	1660	* it will use 2GB allocation blocks. As a result, make the cap 2GB.
	1661	*/
	1662	context.vc_thread = current_thread();
	1663	context.vc_ucred = FSCRED;
	1664
	1665	jdev_name = vnode_getname_printable(jvp);
	1666
	1667	/* Get the real physical block size. */
	1668	if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
	1669	goto cleanup_jdev_name;
	1670	}
	1671
	1672	if (journal_size < (256*1024) \|\| journal_size > (MAX_JOURNAL_SIZE)) {
	1673	printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size);
	1674	goto cleanup_jdev_name;
	1675	}
	1676
	1677	min_size = phys_blksz * (phys_blksz / sizeof(block_info));
	1678	/* Reject journals that are too small given the sector size of the device */
	1679	if (journal_size < min_size) {
	1680	printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n",
	1681	jdev_name, journal_size, phys_blksz);
	1682	goto cleanup_jdev_name;
	1683	}
	1684
	1685	if (phys_blksz > min_fs_blksz) {
	1686	printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n",
	1687	jdev_name, phys_blksz, min_fs_blksz);
	1688	goto cleanup_jdev_name;
	1689	}
	1690
	1691	if ((journal_size % phys_blksz) != 0) {
	1692	printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
	1693	jdev_name, journal_size, phys_blksz);
	1694	goto cleanup_jdev_name;
	1695	}
	1696
	1697
	1698	MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
	1699	memset(jnl, 0, sizeof(*jnl));
	1700
	1701	jnl->jdev = jvp;
	1702	jnl->jdev_offset = offset;
	1703	jnl->fsdev = fsvp;
	1704	jnl->flush = flush;
	1705	jnl->flush_arg = arg;
	1706	jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
	1707	jnl->jdev_name = jdev_name;
	1708	lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
	1709
	1710	// Keep a point to the mount around for use in IO throttling.
	1711	jnl->fsmount = fsmount;
	1712	// XXX: This lock discipline looks correct based on dounmount(), but it
	1713	// doesn't seem to be documented anywhere.
	1714	mount_ref(fsmount, 0);
	1715
	1716	get_io_info(jvp, phys_blksz, jnl, &context);
	1717
	1718	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
	1719	printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz);
	1720	goto bad_kmem_alloc;
	1721	}
	1722	jnl->header_buf_size = phys_blksz;
	1723
	1724	jnl->jhdr = (journal_header *)jnl->header_buf;
	1725	memset(jnl->jhdr, 0, sizeof(journal_header));
	1726
	1727	// we have to set this up here so that do_journal_io() will work
	1728	jnl->jhdr->jhdr_size = phys_blksz;
	1729
	1730	//
	1731	// We try and read the journal header to see if there is already one
	1732	// out there. If there is, it's possible that it has transactions
	1733	// in it that we might replay if we happen to pick a sequence number
	1734	// that is a little less than the old one, there is a crash and the
	1735	// last txn written ends right at the start of a txn from the previous
	1736	// incarnation of this file system. If all that happens we would
	1737	// replay the transactions from the old file system and that would
	1738	// destroy your disk. Although it is extremely unlikely for all those
	1739	// conditions to happen, the probability is non-zero and the result is
	1740	// severe - you lose your file system. Therefore if we find a valid
	1741	// journal header and the sequence number is non-zero we write junk
	1742	// over the entire journal so that there is no way we will encounter
	1743	// any old transactions. This is slow but should be a rare event
	1744	// since most tools erase the journal.
	1745	//
	1746	if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz
	1747	&& jnl->jhdr->magic == JOURNAL_HEADER_MAGIC
	1748	&& jnl->jhdr->sequence_num != 0) {
	1749
	1750	new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff;
	1751	printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base);
	1752
	1753	#if 0
	1754	int i;
	1755	off_t pos=0;
	1756
	1757	for(i = 1; i < journal_size / phys_blksz; i++) {
	1758	pos = i*phys_blksz;
	1759
	1760	// we don't really care what data we write just so long
	1761	// as it's not a valid transaction header. since we have
	1762	// the header_buf sitting around we'll use that.
	1763	write_journal_data(jnl, &pos, jnl->header_buf, phys_blksz);
	1764	}
	1765	printf("jnl: create: done clearing journal (i=%d)\n", i);
	1766	#endif
	1767	} else {
	1768	new_txn_base = random() & 0x00ffffff;
	1769	}
	1770
	1771	memset(jnl->header_buf, 0, phys_blksz);
	1772
	1773	jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
	1774	jnl->jhdr->endian = ENDIAN_MAGIC;
	1775	jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself
	1776	jnl->jhdr->end = phys_blksz;
	1777	jnl->jhdr->size = journal_size;
	1778	jnl->jhdr->jhdr_size = phys_blksz;
	1779	size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
	1780
	1781	jnl->active_start = jnl->jhdr->start;
	1782
	1783	// XXXdbg - for testing you can force the journal to wrap around
	1784	// jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
	1785	// jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);
	1786
	1787	jnl->jhdr->sequence_num = new_txn_base;
	1788
	1789	lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
	1790	lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
	1791	lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
	1792
	1793
	1794	jnl->flushing = FALSE;
	1795	jnl->asyncIO = FALSE;
	1796	jnl->flush_aborted = FALSE;
	1797	jnl->writing_header = FALSE;
	1798	jnl->async_trim = NULL;
	1799	jnl->sequence_num = jnl->jhdr->sequence_num;
	1800
	1801	if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
	1802	printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name);
	1803	goto bad_write;
	1804	}
	1805
	1806	goto journal_create_complete;
	1807
	1808
	1809	bad_write:
	1810	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
	1811	bad_kmem_alloc:
	1812	jnl->jhdr = NULL;
	1813	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
	1814	mount_drop(fsmount, 0);
	1815	cleanup_jdev_name:
	1816	vnode_putname_printable(jdev_name);
	1817	jnl = NULL;
	1818	journal_create_complete:
	1819	return jnl;
	1820	}
	1821
	1822
	1823	journal *
	1824	journal_open(struct vnode *jvp,
	1825	off_t offset,
	1826	off_t journal_size,
	1827	struct vnode *fsvp,
	1828	size_t min_fs_blksz,
	1829	int32_t flags,
	1830	int32_t tbuffer_size,
	1831	void (flush)(void arg),
	1832	void *arg,
	1833	struct mount *fsmount)
	1834	{
	1835	journal *jnl;
	1836	uint32_t orig_blksz=0;
	1837	uint32_t phys_blksz;
	1838	u_int32_t min_size = 0;
	1839	int orig_checksum, checksum;
	1840	struct vfs_context context;
	1841	const char *jdev_name = vnode_getname_printable(jvp);
	1842
	1843	context.vc_thread = current_thread();
	1844	context.vc_ucred = FSCRED;
	1845
	1846	/* Get the real physical block size. */
	1847	if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
	1848	goto cleanup_jdev_name;
	1849	}
	1850
	1851	if (phys_blksz > min_fs_blksz) {
	1852	printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n",
	1853	jdev_name, phys_blksz, min_fs_blksz);
	1854	goto cleanup_jdev_name;
	1855	}
	1856
	1857	if (journal_size < (2561024) \|\| journal_size > (10241024*1024)) {
	1858	printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size);
	1859	goto cleanup_jdev_name;
	1860	}
	1861
	1862	min_size = phys_blksz * (phys_blksz / sizeof(block_info));
	1863	/* Reject journals that are too small given the sector size of the device */
	1864	if (journal_size < min_size) {
	1865	printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n",
	1866	jdev_name, journal_size, phys_blksz);
	1867	goto cleanup_jdev_name;
	1868	}
	1869
	1870	if ((journal_size % phys_blksz) != 0) {
	1871	printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
	1872	jdev_name, journal_size, phys_blksz);
	1873	goto cleanup_jdev_name;
	1874	}
	1875
	1876	MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
	1877	memset(jnl, 0, sizeof(*jnl));
	1878
	1879	jnl->jdev = jvp;
	1880	jnl->jdev_offset = offset;
	1881	jnl->fsdev = fsvp;
	1882	jnl->flush = flush;
	1883	jnl->flush_arg = arg;
	1884	jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
	1885	jnl->jdev_name = jdev_name;
	1886	lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
	1887
	1888	/* We need a reference to the mount to later pass to the throttling code for
	1889	* IO accounting.
	1890	*/
	1891	jnl->fsmount = fsmount;
	1892	mount_ref(fsmount, 0);
	1893
	1894	get_io_info(jvp, phys_blksz, jnl, &context);
	1895
	1896	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
	1897	printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz);
	1898	goto bad_kmem_alloc;
	1899	}
	1900	jnl->header_buf_size = phys_blksz;
	1901
	1902	jnl->jhdr = (journal_header *)jnl->header_buf;
	1903	memset(jnl->jhdr, 0, sizeof(journal_header));
	1904
	1905	// we have to set this up here so that do_journal_io() will work
	1906	jnl->jhdr->jhdr_size = phys_blksz;
	1907
	1908	if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
	1909	printf("jnl: %s: open: could not read %u bytes for the journal header.\n",
	1910	jdev_name, phys_blksz);
	1911	goto bad_journal;
	1912	}
	1913
	1914	orig_checksum = jnl->jhdr->checksum;
	1915	jnl->jhdr->checksum = 0;
	1916
	1917	if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
	1918	// do this before the swap since it's done byte-at-a-time
	1919	orig_checksum = SWAP32(orig_checksum);
	1920	checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
	1921	swap_journal_header(jnl);
	1922	jnl->flags \|= JOURNAL_NEED_SWAP;
	1923	} else {
	1924	checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
	1925	}
	1926
	1927	if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
	1928	printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n",
	1929	jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
	1930	goto bad_journal;
	1931	}
	1932
	1933	// only check if we're the current journal header magic value
	1934	if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
	1935
	1936	if (orig_checksum != checksum) {
	1937	printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n",
	1938	jdev_name, orig_checksum, checksum);
	1939
	1940	//goto bad_journal;
	1941	}
	1942	}
	1943
	1944	// XXXdbg - convert old style magic numbers to the new one
	1945	if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
	1946	jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
	1947	}
	1948
	1949	if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
	1950	/*
	1951	* The volume has probably been resized (such that we had to adjust the
	1952	* logical sector size), or copied to media with a different logical
	1953	* sector size.
	1954	*
	1955	* Temporarily change the device's logical block size to match the
	1956	* journal's header size. This will allow us to replay the journal
	1957	* safely. If the replay succeeds, we will update the journal's header
	1958	* size (later in this function).
	1959	*/
	1960	orig_blksz = phys_blksz;
	1961	phys_blksz = jnl->jhdr->jhdr_size;
	1962	VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
	1963	printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
	1964	jdev_name, orig_blksz, phys_blksz);
	1965	}
	1966
	1967	if ( jnl->jhdr->start <= 0
	1968	\|\| jnl->jhdr->start > jnl->jhdr->size
	1969	\|\| jnl->jhdr->start > 102410241024) {
	1970	printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
	1971	jdev_name, jnl->jhdr->start, jnl->jhdr->size);
	1972	goto bad_journal;
	1973	}
	1974
	1975	if ( jnl->jhdr->end <= 0
	1976	\|\| jnl->jhdr->end > jnl->jhdr->size
	1977	\|\| jnl->jhdr->end > 102410241024) {
	1978	printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
	1979	jdev_name, jnl->jhdr->end, jnl->jhdr->size);
	1980	goto bad_journal;
	1981	}
	1982
	1983	if (jnl->jhdr->size < (2561024) \|\| jnl->jhdr->size > 10241024*1024) {
	1984	printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size);
	1985	goto bad_journal;
	1986	}
	1987
	1988	// XXXdbg - can't do these checks because hfs writes all kinds of
	1989	// non-uniform sized blocks even on devices that have a block size
	1990	// that is larger than 512 bytes (i.e. optical media w/2k blocks).
	1991	// therefore these checks will fail and so we just have to punt and
	1992	// do more relaxed checking...
	1993	// XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
	1994	if ((jnl->jhdr->start % 512) != 0) {
	1995	printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n",
	1996	jdev_name, jnl->jhdr->start);
	1997	goto bad_journal;
	1998	}
	1999
	2000	//XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
	2001	if ((jnl->jhdr->end % 512) != 0) {
	2002	printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
	2003	jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size);
	2004	goto bad_journal;
	2005	}
	2006
	2007	// take care of replaying the journal if necessary
	2008	if (flags & JOURNAL_RESET) {
	2009	printf("jnl: %s: journal start/end pointers reset! (jnl %p; s 0x%llx e 0x%llx)\n",
	2010	jdev_name, jnl, jnl->jhdr->start, jnl->jhdr->end);
	2011	jnl->jhdr->start = jnl->jhdr->end;
	2012	} else if (replay_journal(jnl) != 0) {
	2013	printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
	2014	goto bad_journal;
	2015	}
	2016
	2017	/*
	2018	* When we get here, we know that the journal is empty (jnl->jhdr->start ==
	2019	* jnl->jhdr->end). If the device's logical block size was different from
	2020	* the journal's header size, then we can now restore the device's logical
	2021	* block size and update the journal's header size to match.
	2022	*
	2023	* Note that we also adjust the journal's start and end so that they will
	2024	* be aligned on the new block size. We pick a new sequence number to
	2025	* avoid any problems if a replay found previous transactions using the old
	2026	* journal header size. (See the comments in journal_create(), above.)
	2027	*/
	2028
	2029	if (orig_blksz != 0) {
	2030	VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
	2031	phys_blksz = orig_blksz;
	2032
	2033	orig_blksz = 0;
	2034
	2035	jnl->jhdr->jhdr_size = phys_blksz;
	2036	jnl->jhdr->start = phys_blksz;
	2037	jnl->jhdr->end = phys_blksz;
	2038	jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
	2039	(journal_size / phys_blksz) +
	2040	(random() % 16384)) & 0x00ffffff;
	2041
	2042	if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
	2043	printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
	2044	goto bad_journal;
	2045	}
	2046	}
	2047
	2048	// make sure this is in sync!
	2049	jnl->active_start = jnl->jhdr->start;
	2050	jnl->sequence_num = jnl->jhdr->sequence_num;
	2051
	2052	// set this now, after we've replayed the journal
	2053	size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
	2054
	2055	// TODO: Does this need to change if the device's logical block size changed?
	2056	if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
	2057	printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
	2058	jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
	2059	goto bad_journal;
	2060	}
	2061
	2062	lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
	2063	lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
	2064	lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
	2065
	2066	goto journal_open_complete;
	2067
	2068	bad_journal:
	2069	if (orig_blksz != 0) {
	2070	phys_blksz = orig_blksz;
	2071	VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
	2072	printf("jnl: %s: open: restored block size after error\n", jdev_name);
	2073	}
	2074	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
	2075	bad_kmem_alloc:
	2076	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
	2077	mount_drop(fsmount, 0);
	2078	cleanup_jdev_name:
	2079	vnode_putname_printable(jdev_name);
	2080	jnl = NULL;
	2081	journal_open_complete:
	2082	return jnl;
	2083	}
	2084
	2085
	2086	int
	2087	journal_is_clean(struct vnode *jvp,
	2088	off_t offset,
	2089	off_t journal_size,
	2090	struct vnode *fsvp,
	2091	size_t min_fs_block_size)
	2092	{
	2093	journal jnl;
	2094	uint32_t phys_blksz;
	2095	int ret;
	2096	int orig_checksum, checksum;
	2097	struct vfs_context context;
	2098	const char *jdev_name = vnode_getname_printable(jvp);
	2099
	2100	context.vc_thread = current_thread();
	2101	context.vc_ucred = FSCRED;
	2102
	2103	/* Get the real physical block size. */
	2104	if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
	2105	printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
	2106	ret = EINVAL;
	2107	goto cleanup_jdev_name;
	2108	}
	2109
	2110	if (phys_blksz > (uint32_t)min_fs_block_size) {
	2111	printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
	2112	jdev_name, phys_blksz, min_fs_block_size);
	2113	ret = EINVAL;
	2114	goto cleanup_jdev_name;
	2115	}
	2116
	2117	if (journal_size < (256*1024) \|\| journal_size > (MAX_JOURNAL_SIZE)) {
	2118	printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size);
	2119	ret = EINVAL;
	2120	goto cleanup_jdev_name;
	2121	}
	2122
	2123	if ((journal_size % phys_blksz) != 0) {
	2124	printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
	2125	jdev_name, journal_size, phys_blksz);
	2126	ret = EINVAL;
	2127	goto cleanup_jdev_name;
	2128	}
	2129
	2130	memset(&jnl, 0, sizeof(jnl));
	2131
	2132	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
	2133	printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
	2134	ret = ENOMEM;
	2135	goto cleanup_jdev_name;
	2136	}
	2137	jnl.header_buf_size = phys_blksz;
	2138
	2139	get_io_info(jvp, phys_blksz, &jnl, &context);
	2140
	2141	jnl.jhdr = (journal_header *)jnl.header_buf;
	2142	memset(jnl.jhdr, 0, sizeof(journal_header));
	2143
	2144	jnl.jdev = jvp;
	2145	jnl.jdev_offset = offset;
	2146	jnl.fsdev = fsvp;
	2147
	2148	// we have to set this up here so that do_journal_io() will work
	2149	jnl.jhdr->jhdr_size = phys_blksz;
	2150
	2151	if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
	2152	printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n",
	2153	jdev_name, phys_blksz);
	2154	ret = EINVAL;
	2155	goto get_out;
	2156	}
	2157
	2158	orig_checksum = jnl.jhdr->checksum;
	2159	jnl.jhdr->checksum = 0;
	2160
	2161	if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
	2162	// do this before the swap since it's done byte-at-a-time
	2163	orig_checksum = SWAP32(orig_checksum);
	2164	checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
	2165	swap_journal_header(&jnl);
	2166	jnl.flags \|= JOURNAL_NEED_SWAP;
	2167	} else {
	2168	checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
	2169	}
	2170
	2171	if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
	2172	printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n",
	2173	jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
	2174	ret = EINVAL;
	2175	goto get_out;
	2176	}
	2177
	2178	if (orig_checksum != checksum) {
	2179	printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum);
	2180	ret = EINVAL;
	2181	goto get_out;
	2182	}
	2183
	2184	//
	2185	// if the start and end are equal then the journal is clean.
	2186	// otherwise it's not clean and therefore an error.
	2187	//
	2188	if (jnl.jhdr->start == jnl.jhdr->end) {
	2189	ret = 0;
	2190	} else {
	2191	ret = EBUSY; // so the caller can differentiate an invalid journal from a "busy" one
	2192	}
	2193
	2194	get_out:
	2195	kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
	2196	cleanup_jdev_name:
	2197	vnode_putname_printable(jdev_name);
	2198	return ret;
	2199	}
	2200
	2201
	2202	void
	2203	journal_close(journal *jnl)
	2204	{
	2205	volatile off_t start, end;
	2206	int counter=0;
	2207
	2208	CHECK_JOURNAL(jnl);
	2209
	2210	// set this before doing anything that would block so that
	2211	// we start tearing things down properly.
	2212	//
	2213	jnl->flags \|= JOURNAL_CLOSE_PENDING;
	2214
	2215	if (jnl->owner != current_thread()) {
	2216	journal_lock(jnl);
	2217	}
	2218
	2219	wait_condition(jnl, &jnl->flushing, "journal_close");
	2220
	2221	//
	2222	// only write stuff to disk if the journal is still valid
	2223	//
	2224	if ((jnl->flags & JOURNAL_INVALID) == 0) {
	2225
	2226	if (jnl->active_tr) {
	2227	/*
	2228	* "journal_end_transaction" will fire the flush asynchronously
	2229	*/
	2230	journal_end_transaction(jnl);
	2231	}
	2232
	2233	// flush any buffered transactions
	2234	if (jnl->cur_tr) {
	2235	transaction *tr = jnl->cur_tr;
	2236
	2237	jnl->cur_tr = NULL;
	2238	/*
	2239	* "end_transaction" will wait for any in-progress flush to complete
	2240	* before flushing "cur_tr" synchronously("must_wait" == TRUE)
	2241	*/
	2242	end_transaction(tr, 1, NULL, NULL, FALSE, TRUE);
	2243	}
	2244	/*
	2245	* if there was an "active_tr", make sure we wait for
	2246	* it to flush if there was no "cur_tr" to process
	2247	*/
	2248	wait_condition(jnl, &jnl->flushing, "journal_close");
	2249
	2250	//start = &jnl->jhdr->start;
	2251	start = &jnl->active_start;
	2252	end = &jnl->jhdr->end;
	2253
	2254	while (start != end && counter++ < 5000) {
	2255	//printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", start, end);
	2256	if (jnl->flush) {
	2257	jnl->flush(jnl->flush_arg);
	2258	}
	2259	tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 2);
	2260	}
	2261
	2262	if (start != end) {
	2263	printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
	2264	jnl->jdev_name, start, end);
	2265	}
	2266
	2267	// make sure this is in sync when we close the journal
	2268	jnl->jhdr->start = jnl->active_start;
	2269
	2270	// if this fails there's not much we can do at this point...
	2271	write_journal_header(jnl, 1, jnl->sequence_num);
	2272	} else {
	2273	// if we're here the journal isn't valid any more.
	2274	// so make sure we don't leave any locked blocks lying around
	2275	printf("jnl: %s: close: journal %p, is invalid. aborting outstanding transactions\n", jnl->jdev_name, jnl);
	2276
	2277	if (jnl->active_tr \|\| jnl->cur_tr) {
	2278	transaction *tr;
	2279
	2280	if (jnl->active_tr) {
	2281	tr = jnl->active_tr;
	2282	jnl->active_tr = NULL;
	2283	} else {
	2284	tr = jnl->cur_tr;
	2285	jnl->cur_tr = NULL;
	2286	}
	2287	abort_transaction(jnl, tr);
	2288
	2289	if (jnl->active_tr \|\| jnl->cur_tr) {
	2290	panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl);
	2291	}
	2292	}
	2293	}
	2294	wait_condition(jnl, &jnl->asyncIO, "journal_close");
	2295
	2296	free_old_stuff(jnl);
	2297
	2298	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
	2299	jnl->jhdr = (void *)0xbeefbabe;
	2300
	2301	// Release reference on the mount
	2302	if (jnl->fsmount)
	2303	mount_drop(jnl->fsmount, 0);
	2304
	2305	vnode_putname_printable(jnl->jdev_name);
	2306
	2307	journal_unlock(jnl);
	2308	lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group);
	2309	lck_mtx_destroy(&jnl->jlock, jnl_mutex_group);
	2310	lck_mtx_destroy(&jnl->flock, jnl_mutex_group);
	2311	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
	2312	}
	2313
	2314	static void
	2315	dump_journal(journal *jnl)
	2316	{
	2317	transaction *ctr;
	2318
	2319	printf("journal for dev %s:", jnl->jdev_name);
	2320	printf(" jdev_offset %.8llx\n", jnl->jdev_offset);
	2321	printf(" magic: 0x%.8x\n", jnl->jhdr->magic);
	2322	printf(" start: 0x%.8llx\n", jnl->jhdr->start);
	2323	printf(" end: 0x%.8llx\n", jnl->jhdr->end);
	2324	printf(" size: 0x%.8llx\n", jnl->jhdr->size);
	2325	printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size);
	2326	printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size);
	2327	printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum);
	2328
	2329	printf(" completed transactions:\n");
	2330	for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) {
	2331	printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
	2332	}
	2333	}
	2334
	2335
	2336
	2337	static off_t
	2338	free_space(journal *jnl)
	2339	{
	2340	off_t free_space_offset;
	2341
	2342	if (jnl->jhdr->start < jnl->jhdr->end) {
	2343	free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
	2344	} else if (jnl->jhdr->start > jnl->jhdr->end) {
	2345	free_space_offset = jnl->jhdr->start - jnl->jhdr->end;
	2346	} else {
	2347	// journal is completely empty
	2348	free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size;
	2349	}
	2350
	2351	return free_space_offset;
	2352	}
	2353
	2354
	2355	//
	2356	// The journal must be locked on entry to this function.
	2357	// The "desired_size" is in bytes.
	2358	//
	2359	static int
	2360	check_free_space(journal jnl, int desired_size, boolean_t delayed_header_write, uint32_t sequence_num)
	2361	{
	2362	size_t i;
	2363	int counter=0;
	2364
	2365	//printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
	2366	// desired_size, free_space(jnl));
	2367
	2368	if (delayed_header_write)
	2369	*delayed_header_write = FALSE;
	2370
	2371	while (1) {
	2372	int old_start_empty;
	2373
	2374	// make sure there's space in the journal to hold this transaction
	2375	if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) {
	2376	break;
	2377	}
	2378	if (counter++ == 5000) {
	2379	dump_journal(jnl);
	2380	panic("jnl: check_free_space: buffer flushing isn't working "
	2381	"(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl,
	2382	jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
	2383	}
	2384	if (counter > 7500) {
	2385	printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name);
	2386	return ENOSPC;
	2387	}
	2388
	2389	//
	2390	// here's where we lazily bump up jnl->jhdr->start. we'll consume
	2391	// entries until there is enough space for the next transaction.
	2392	//
	2393	old_start_empty = 1;
	2394	lock_oldstart(jnl);
	2395
	2396	for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
	2397	int lcl_counter;
	2398
	2399	lcl_counter = 0;
	2400	while (jnl->old_start[i] & 0x8000000000000000LL) {
	2401	if (lcl_counter++ > 10000) {
	2402	panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
	2403	jnl->old_start[i], jnl);
	2404	}
	2405
	2406	unlock_oldstart(jnl);
	2407	if (jnl->flush) {
	2408	jnl->flush(jnl->flush_arg);
	2409	}
	2410	tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
	2411	lock_oldstart(jnl);
	2412	}
	2413
	2414	if (jnl->old_start[i] == 0) {
	2415	continue;
	2416	}
	2417
	2418	old_start_empty = 0;
	2419	jnl->jhdr->start = jnl->old_start[i];
	2420	jnl->old_start[i] = 0;
	2421
	2422	if (free_space(jnl) > desired_size) {
	2423
	2424	if (delayed_header_write)
	2425	*delayed_header_write = TRUE;
	2426	else {
	2427	unlock_oldstart(jnl);
	2428	write_journal_header(jnl, 1, sequence_num);
	2429	lock_oldstart(jnl);
	2430	}
	2431	break;
	2432	}
	2433	}
	2434	unlock_oldstart(jnl);
	2435
	2436	// if we bumped the start, loop and try again
	2437	if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
	2438	continue;
	2439	} else if (old_start_empty) {
	2440	//
	2441	// if there is nothing in old_start anymore then we can
	2442	// bump the jhdr->start to be the same as active_start
	2443	// since it is possible there was only one very large
	2444	// transaction in the old_start array. if we didn't do
	2445	// this then jhdr->start would never get updated and we
	2446	// would wind up looping until we hit the panic at the
	2447	// start of the loop.
	2448	//
	2449	jnl->jhdr->start = jnl->active_start;
	2450
	2451	if (delayed_header_write)
	2452	*delayed_header_write = TRUE;
	2453	else
	2454	write_journal_header(jnl, 1, sequence_num);
	2455	continue;
	2456	}
	2457
	2458
	2459	// if the file system gave us a flush function, call it to so that
	2460	// it can flush some blocks which hopefully will cause some transactions
	2461	// to complete and thus free up space in the journal.
	2462	if (jnl->flush) {
	2463	jnl->flush(jnl->flush_arg);
	2464	}
	2465
	2466	// wait for a while to avoid being cpu-bound (this will
	2467	// put us to sleep for 10 milliseconds)
	2468	tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
	2469	}
	2470
	2471	return 0;
	2472	}
	2473
	2474	/*
	2475	* Allocate a new active transaction.
	2476	*/
	2477	static errno_t
	2478	journal_allocate_transaction(journal *jnl)
	2479	{
	2480	transaction *tr;
	2481	boolean_t was_vm_privileged;
	2482
	2483	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
	2484	/*
	2485	* the disk driver can allocate memory on this path...
	2486	* if we block waiting for memory, and there is enough pressure to
	2487	* cause us to try and create a new swap file, we may end up deadlocking
	2488	* due to waiting for the journal on the swap file creation path...
	2489	* by making ourselves vm_privileged, we give ourselves the best chance
	2490	* of not blocking
	2491	*/
	2492	was_vm_privileged = set_vm_privilege(TRUE);
	2493	}
	2494	MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
	2495	memset(tr, 0, sizeof(transaction));
	2496
	2497	tr->tbuffer_size = jnl->tbuffer_size;
	2498
	2499	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
	2500	FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
	2501	jnl->active_tr = NULL;
	2502	return ENOMEM;
	2503	}
	2504	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
	2505	set_vm_privilege(FALSE);
	2506
	2507	// journal replay code checksum check depends on this.
	2508	memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
	2509	// Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
	2510	memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
	2511
	2512	tr->blhdr = (block_list_header *)tr->tbuffer;
	2513	tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
	2514	tr->blhdr->num_blocks = 1; // accounts for this header block
	2515	tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
	2516	tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS \| BLHDR_FIRST_HEADER;
	2517
	2518	tr->sequence_num = ++jnl->sequence_num;
	2519	tr->num_blhdrs = 1;
	2520	tr->total_bytes = jnl->jhdr->blhdr_size;
	2521	tr->jnl = jnl;
	2522
	2523	jnl->active_tr = tr;
	2524
	2525	return 0;
	2526	}
	2527
	2528	int
	2529	journal_start_transaction(journal *jnl)
	2530	{
	2531	int ret;
	2532
	2533	CHECK_JOURNAL(jnl);
	2534
	2535	free_old_stuff(jnl);
	2536
	2537	if (jnl->flags & JOURNAL_INVALID) {
	2538	return EINVAL;
	2539	}
	2540	if (jnl->owner == current_thread()) {
	2541	if (jnl->active_tr == NULL) {
	2542	panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
	2543	jnl, jnl->owner, current_thread());
	2544	}
	2545	jnl->nested_count++;
	2546	return 0;
	2547	}
	2548
	2549	journal_lock(jnl);
	2550
	2551	if (jnl->nested_count != 0 \|\| jnl->active_tr != NULL) {
	2552	panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
	2553	jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
	2554	}
	2555
	2556	jnl->nested_count = 1;
	2557
	2558	#if JOE
	2559	// make sure there's room in the journal
	2560	if (free_space(jnl) < jnl->tbuffer_size) {
	2561
	2562	KERNEL_DEBUG(0xbbbbc030 \| DBG_FUNC_START, jnl, 0, 0, 0, 0);
	2563
	2564	// this is the call that really waits for space to free up
	2565	// as well as updating jnl->jhdr->start
	2566	if (check_free_space(jnl, jnl->tbuffer_size, NULL, jnl->sequence_num) != 0) {
	2567	printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name);
	2568	ret = ENOSPC;
	2569	goto bad_start;
	2570	}
	2571	KERNEL_DEBUG(0xbbbbc030 \| DBG_FUNC_END, jnl, 0, 0, 0, 0);
	2572	}
	2573	#endif
	2574
	2575	// if there's a buffered transaction, use it.
	2576	if (jnl->cur_tr) {
	2577	jnl->active_tr = jnl->cur_tr;
	2578	jnl->cur_tr = NULL;
	2579
	2580	return 0;
	2581	}
	2582
	2583	ret = journal_allocate_transaction(jnl);
	2584	if (ret) {
	2585	goto bad_start;
	2586	}
	2587
	2588	// printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);
	2589
	2590	return 0;
	2591
	2592	bad_start:
	2593	jnl->nested_count = 0;
	2594	journal_unlock(jnl);
	2595
	2596	return ret;
	2597	}
	2598
	2599
	2600	int
	2601	journal_modify_block_start(journal jnl, struct buf bp)
	2602	{
	2603	transaction *tr;
	2604
	2605	CHECK_JOURNAL(jnl);
	2606
	2607
	2608	free_old_stuff(jnl);
	2609
	2610	if (jnl->flags & JOURNAL_INVALID) {
	2611	return EINVAL;
	2612	}
	2613
	2614	// XXXdbg - for debugging I want this to be true. later it may
	2615	// not be necessary.
	2616	if ((buf_flags(bp) & B_META) == 0) {
	2617	panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl);
	2618	}
	2619
	2620	tr = jnl->active_tr;
	2621	CHECK_TRANSACTION(tr);
	2622
	2623	if (jnl->owner != current_thread()) {
	2624	panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
	2625	jnl, jnl->owner, current_thread());
	2626	}
	2627
	2628	//printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
	2629	// bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
	2630
	2631	// can't allow blocks that aren't an even multiple of the
	2632	// underlying block size.
	2633	if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) {
	2634	uint32_t phys_blksz, bad=0;
	2635
	2636	if (VNOP_IOCTL(jnl->jdev, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) {
	2637	bad = 1;
	2638	} else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) {
	2639	if (phys_blksz < 512) {
	2640	panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n",
	2641	phys_blksz, buf_size(bp), jnl->jhdr->jhdr_size);
	2642	}
	2643
	2644	if ((buf_size(bp) % phys_blksz) != 0) {
	2645	bad = 1;
	2646	} else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) {
	2647	jnl->jhdr->jhdr_size = phys_blksz;
	2648	} else {
	2649	// the phys_blksz is now larger... need to realloc the jhdr
	2650	char *new_header_buf;
	2651
	2652	printf("jnl: %s: phys blksz got bigger (was: %d/%d now %d)\n",
	2653	jnl->jdev_name, jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz);
	2654	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&new_header_buf, phys_blksz)) {
	2655	printf("jnl: modify_block_start: %s: create: phys blksz change (was %d, now %d) but could not allocate space for new header\n",
	2656	jnl->jdev_name, jnl->jhdr->jhdr_size, phys_blksz);
	2657	bad = 1;
	2658	} else {
	2659	memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size);
	2660	memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size));
	2661	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
	2662	jnl->header_buf = new_header_buf;
	2663	jnl->header_buf_size = phys_blksz;
	2664
	2665	jnl->jhdr = (journal_header *)jnl->header_buf;
	2666	jnl->jhdr->jhdr_size = phys_blksz;
	2667	}
	2668	}
	2669	} else {
	2670	bad = 1;
	2671	}
	2672
	2673	if (bad) {
	2674	panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
	2675	buf_size(bp), jnl->jhdr->jhdr_size);
	2676	return -1;
	2677	}
	2678	}
	2679
	2680	// make sure that this transaction isn't bigger than the whole journal
	2681	if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
	2682	panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
	2683	tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp);
	2684	return -1;
	2685	}
	2686
	2687	// if the block is dirty and not already locked we have to write
	2688	// it out before we muck with it because it has data that belongs
	2689	// (presumably) to another transaction.
	2690	//
	2691	if ((buf_flags(bp) & (B_DELWRI \| B_LOCKED)) == B_DELWRI) {
	2692
	2693	if (buf_flags(bp) & B_ASYNC) {
	2694	panic("modify_block_start: bp @ %p has async flag set!\n", bp);
	2695	}
	2696	if (bp->b_shadow_ref)
	2697	panic("modify_block_start: dirty bp @ %p has shadows!\n", bp);
	2698
	2699	// this will cause it to not be buf_brelse()'d
	2700	buf_setflags(bp, B_NORELSE);
	2701	VNOP_BWRITE(bp);
	2702	}
	2703	buf_setflags(bp, B_LOCKED);
	2704
	2705	return 0;
	2706	}
	2707
	2708	int
	2709	journal_modify_block_abort(journal jnl, struct buf bp)
	2710	{
	2711	transaction *tr;
	2712	block_list_header *blhdr;
	2713	int i;
	2714
	2715	CHECK_JOURNAL(jnl);
	2716
	2717	free_old_stuff(jnl);
	2718
	2719	tr = jnl->active_tr;
	2720
	2721	//
	2722	// if there's no active transaction then we just want to
	2723	// call buf_brelse() and return since this is just a block
	2724	// that happened to be modified as part of another tr.
	2725	//
	2726	if (tr == NULL) {
	2727	buf_brelse(bp);
	2728	return 0;
	2729	}
	2730
	2731	if (jnl->flags & JOURNAL_INVALID) {
	2732	/* Still need to buf_brelse(). Callers assume we consume the bp. */
	2733	buf_brelse(bp);
	2734	return EINVAL;
	2735	}
	2736
	2737	CHECK_TRANSACTION(tr);
	2738
	2739	if (jnl->owner != current_thread()) {
	2740	panic("jnl: modify_block_abort: called w/out a transaction! jnl %p, owner %p, curact %p\n",
	2741	jnl, jnl->owner, current_thread());
	2742	}
	2743
	2744	// printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
	2745
	2746	// first check if it's already part of this transaction
	2747	for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
	2748	for (i = 1; i < blhdr->num_blocks; i++) {
	2749	if (bp == blhdr->binfo[i].u.bp) {
	2750	break;
	2751	}
	2752	}
	2753
	2754	if (i < blhdr->num_blocks) {
	2755	break;
	2756	}
	2757	}
	2758
	2759	//
	2760	// if blhdr is null, then this block has only had modify_block_start
	2761	// called on it as part of the current transaction. that means that
	2762	// it is ok to clear the LOCKED bit since it hasn't actually been
	2763	// modified. if blhdr is non-null then modify_block_end was called
	2764	// on it and so we need to keep it locked in memory.
	2765	//
	2766	if (blhdr == NULL) {
	2767	buf_clearflags(bp, B_LOCKED);
	2768	}
	2769
	2770	buf_brelse(bp);
	2771	return 0;
	2772	}
	2773
	2774
	2775	int
	2776	journal_modify_block_end(journal jnl, struct buf bp, void (func)(buf_t bp, void arg), void *arg)
	2777	{
	2778	int i = 1;
	2779	int tbuffer_offset=0;
	2780	block_list_header blhdr, prev=NULL;
	2781	transaction *tr;
	2782
	2783	CHECK_JOURNAL(jnl);
	2784
	2785	free_old_stuff(jnl);
	2786
	2787	if (jnl->flags & JOURNAL_INVALID) {
	2788	/* Still need to buf_brelse(). Callers assume we consume the bp. */
	2789	buf_brelse(bp);
	2790	return EINVAL;
	2791	}
	2792
	2793	tr = jnl->active_tr;
	2794	CHECK_TRANSACTION(tr);
	2795
	2796	if (jnl->owner != current_thread()) {
	2797	panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
	2798	jnl, jnl->owner, current_thread());
	2799	}
	2800
	2801	//printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n",
	2802	// bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
	2803
	2804	if ((buf_flags(bp) & B_LOCKED) == 0) {
	2805	panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp, jnl);
	2806	}
	2807
	2808	// first check if it's already part of this transaction
	2809	for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
	2810	tbuffer_offset = jnl->jhdr->blhdr_size;
	2811
	2812	for (i = 1; i < blhdr->num_blocks; i++) {
	2813	if (bp == blhdr->binfo[i].u.bp) {
	2814	break;
	2815	}
	2816	if (blhdr->binfo[i].bnum != (off_t)-1) {
	2817	tbuffer_offset += buf_size(blhdr->binfo[i].u.bp);
	2818	} else {
	2819	tbuffer_offset += blhdr->binfo[i].u.bi.bsize;
	2820	}
	2821	}
	2822
	2823	if (i < blhdr->num_blocks) {
	2824	break;
	2825	}
	2826	}
	2827
	2828	if (blhdr == NULL
	2829	&& prev
	2830	&& (prev->num_blocks+1) <= prev->max_blocks
	2831	&& (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) {
	2832	blhdr = prev;
	2833
	2834	} else if (blhdr == NULL) {
	2835	block_list_header *nblhdr;
	2836	if (prev == NULL) {
	2837	panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp);
	2838	}
	2839
	2840	// we got to the end of the list, didn't find the block and there's
	2841	// no room in the block_list_header pointed to by prev
	2842
	2843	// we allocate another tbuffer and link it in at the end of the list
	2844	// through prev->binfo[0].bnum. that's a skanky way to do things but
	2845	// avoids having yet another linked list of small data structures to manage.
	2846
	2847	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
	2848	panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n",
	2849	tr, tr->total_bytes);
	2850	}
	2851
	2852	// journal replay code checksum check depends on this.
	2853	memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
	2854	// Fill up the rest of the block with unimportant bytes
	2855	memset(nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
	2856
	2857	// initialize the new guy
	2858	nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
	2859	nblhdr->num_blocks = 1; // accounts for this header block
	2860	nblhdr->bytes_used = jnl->jhdr->blhdr_size;
	2861	nblhdr->flags = BLHDR_CHECK_CHECKSUMS;
	2862
	2863	tr->num_blhdrs++;
	2864	tr->total_bytes += jnl->jhdr->blhdr_size;
	2865
	2866	// then link him in at the end
	2867	prev->binfo[0].bnum = (off_t)((long)nblhdr);
	2868
	2869	// and finally switch to using the new guy
	2870	blhdr = nblhdr;
	2871	tbuffer_offset = jnl->jhdr->blhdr_size;
	2872	i = 1;
	2873	}
	2874
	2875
	2876	if ((i+1) > blhdr->max_blocks) {
	2877	panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
	2878	}
	2879
	2880	// if this is true then this is a new block we haven't seen
	2881	if (i >= blhdr->num_blocks) {
	2882	int bsize;
	2883	vnode_t vp;
	2884
	2885	vp = buf_vnode(bp);
	2886	vnode_ref(vp);
	2887	bsize = buf_size(bp);
	2888
	2889	blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp));
	2890	blhdr->binfo[i].u.bp = bp;
	2891
	2892	KERNEL_DEBUG_CONSTANT(0x3018004, VM_KERNEL_ADDRPERM(vp), blhdr->binfo[i].bnum, bsize, 0, 0);
	2893
	2894	if (func) {
	2895	void (old_func)(buf_t, void )=NULL, *old_arg=NULL;
	2896
	2897	buf_setfilter(bp, func, arg, &old_func, &old_arg);
	2898	if (old_func != NULL && old_func != func) {
	2899	panic("jnl: modify_block_end: old func %p / arg %p (func %p)", old_func, old_arg, func);
	2900	}
	2901	}
	2902
	2903	blhdr->bytes_used += bsize;
	2904	tr->total_bytes += bsize;
	2905
	2906	blhdr->num_blocks++;
	2907	}
	2908	buf_bdwrite(bp);
	2909
	2910	return 0;
	2911	}
	2912
	2913	int
	2914	journal_kill_block(journal jnl, struct buf bp)
	2915	{
	2916	int i;
	2917	int bflags;
	2918	block_list_header *blhdr;
	2919	transaction *tr;
	2920
	2921	CHECK_JOURNAL(jnl);
	2922
	2923	free_old_stuff(jnl);
	2924
	2925	if (jnl->flags & JOURNAL_INVALID) {
	2926	return EINVAL;
	2927	}
	2928
	2929	tr = jnl->active_tr;
	2930	CHECK_TRANSACTION(tr);
	2931
	2932	if (jnl->owner != current_thread()) {
	2933	panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
	2934	jnl, jnl->owner, current_thread());
	2935	}
	2936
	2937	bflags = buf_flags(bp);
	2938
	2939	if ( !(bflags & B_LOCKED))
	2940	panic("jnl: modify_block_end: called with bp not B_LOCKED");
	2941
	2942	/*
	2943	* bp must be BL_BUSY and B_LOCKED
	2944	* first check if it's already part of this transaction
	2945	*/
	2946	for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
	2947
	2948	for (i = 1; i < blhdr->num_blocks; i++) {
	2949	if (bp == blhdr->binfo[i].u.bp) {
	2950	vnode_t vp;
	2951
	2952	buf_clearflags(bp, B_LOCKED);
	2953
	2954	// this undoes the vnode_ref() in journal_modify_block_end()
	2955	vp = buf_vnode(bp);
	2956	vnode_rele_ext(vp, 0, 1);
	2957
	2958	// if the block has the DELWRI and FILTER bits sets, then
	2959	// things are seriously weird. if it was part of another
	2960	// transaction then journal_modify_block_start() should
	2961	// have force it to be written.
	2962	//
	2963	//if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
	2964	// panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
	2965	//} else {
	2966	tr->num_killed += buf_size(bp);
	2967	//}
	2968	blhdr->binfo[i].bnum = (off_t)-1;
	2969	blhdr->binfo[i].u.bp = NULL;
	2970	blhdr->binfo[i].u.bi.bsize = buf_size(bp);
	2971
	2972	buf_markinvalid(bp);
	2973	buf_brelse(bp);
	2974
	2975	break;
	2976	}
	2977	}
	2978
	2979	if (i < blhdr->num_blocks) {
	2980	break;
	2981	}
	2982	}
	2983
	2984	return 0;
	2985	}
	2986
	2987	/*
	2988	;________________________________________________________________________________
	2989	;
	2990	; Routine: journal_trim_set_callback
	2991	;
	2992	; Function: Provide the journal with a routine to be called back when a
	2993	; TRIM has (or would have) been issued to the device. That
	2994	; is, the transaction has been flushed to the device, and the
	2995	; blocks freed by the transaction are now safe for reuse.
	2996	;
	2997	; CAUTION: If the journal becomes invalid (eg., due to an I/O
	2998	; error when trying to write to the journal), this callback
	2999	; will stop getting called, even if extents got freed before
	3000	; the journal became invalid!
	3001	;
	3002	; Input Arguments:
	3003	; jnl - The journal structure for the filesystem.
	3004	; callback - The function to call when the TRIM is complete.
	3005	; arg - An argument to be passed to callback.
	3006	;________________________________________________________________________________
	3007	*/
	3008	__private_extern__ void
	3009	journal_trim_set_callback(journal jnl, jnl_trim_callback_t callback, void arg)
	3010	{
	3011	jnl->trim_callback = callback;
	3012	jnl->trim_callback_arg = arg;
	3013	}
	3014
	3015
	3016	/*
	3017	;________________________________________________________________________________
	3018	;
	3019	; Routine: journal_trim_realloc
	3020	;
	3021	; Function: Increase the amount of memory allocated for the list of extents
	3022	; to be unmapped (trimmed). This routine will be called when
	3023	; adding an extent to the list, and the list already occupies
	3024	; all of the space allocated to it. This routine returns ENOMEM
	3025	; if unable to allocate more space, or 0 if the extent list was
	3026	; grown successfully.
	3027	;
	3028	; Input Arguments:
	3029	; trim - The trim list to be resized.
	3030	;
	3031	; Output:
	3032	; (result) - ENOMEM or 0.
	3033	;
	3034	; Side effects:
	3035	; The allocated_count and extents fields of tr->trim are updated
	3036	; if the function returned 0.
	3037	;________________________________________________________________________________
	3038	*/
	3039	static int
	3040	trim_realloc(journal jnl, struct jnl_trim_list trim)
	3041	{
	3042	void *new_extents;
	3043	uint32_t new_allocated_count;
	3044	boolean_t was_vm_privileged;
	3045
	3046	if (jnl_kdebug)
	3047	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC \| DBG_FUNC_START, VM_KERNEL_ADDRPERM(trim), 0, trim->allocated_count, trim->extent_count, 0);
	3048
	3049	new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS;
	3050
	3051	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
	3052	/*
	3053	* if we block waiting for memory, and there is enough pressure to
	3054	* cause us to try and create a new swap file, we may end up deadlocking
	3055	* due to waiting for the journal on the swap file creation path...
	3056	* by making ourselves vm_privileged, we give ourselves the best chance
	3057	* of not blocking
	3058	*/
	3059	was_vm_privileged = set_vm_privilege(TRUE);
	3060	}
	3061	new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t));
	3062	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
	3063	set_vm_privilege(FALSE);
	3064
	3065	if (new_extents == NULL) {
	3066	printf("jnl: trim_realloc: unable to grow extent list!\n");
	3067	/*
	3068	* Since we could be called when allocating space previously marked
	3069	* to be trimmed, we need to empty out the list to be safe.
	3070	*/
	3071	trim->extent_count = 0;
	3072	if (jnl_kdebug)
	3073	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC \| DBG_FUNC_END, ENOMEM, 0, trim->allocated_count, 0, 0);
	3074	return ENOMEM;
	3075	}
	3076
	3077	/* Copy the old extent list to the newly allocated list. */
	3078	if (trim->extents != NULL) {
	3079	memmove(new_extents,
	3080	trim->extents,
	3081	trim->allocated_count * sizeof(dk_extent_t));
	3082	kfree(trim->extents,
	3083	trim->allocated_count * sizeof(dk_extent_t));
	3084	}
	3085
	3086	trim->allocated_count = new_allocated_count;
	3087	trim->extents = new_extents;
	3088
	3089	if (jnl_kdebug)
	3090	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC \| DBG_FUNC_END, 0, 0, new_allocated_count, trim->extent_count, 0);
	3091
	3092	return 0;
	3093	}
	3094
	3095	/*
	3096	;________________________________________________________________________________
	3097	;
	3098	; Routine: trim_search_extent
	3099	;
	3100	; Function: Search the given extent list to see if any of its extents
	3101	; overlap the given extent.
	3102	;
	3103	; Input Arguments:
	3104	; trim - The trim list to be searched.
	3105	; offset - The first byte of the range to be searched for.
	3106	; length - The number of bytes of the extent being searched for.
	3107	; overlap_start - start of the overlapping extent
	3108	; overlap_len - length of the overlapping extent
	3109	;
	3110	; Output:
	3111	; (result) - TRUE if one or more extents overlap, FALSE otherwise.
	3112	;________________________________________________________________________________
	3113	*/
	3114	static int
	3115	trim_search_extent(struct jnl_trim_list *trim, uint64_t offset,
	3116	uint64_t length, uint64_t overlap_start, uint64_t overlap_len)
	3117	{
	3118	uint64_t end = offset + length;
	3119	uint32_t lower = 0; /* Lowest index to search */
	3120	uint32_t upper = trim->extent_count; /* Highest index to search + 1 */
	3121	uint32_t middle;
	3122
	3123	/* A binary search over the extent list. */
	3124	while (lower < upper) {
	3125	middle = (lower + upper) / 2;
	3126
	3127	if (trim->extents[middle].offset >= end)
	3128	upper = middle;
	3129	else if (trim->extents[middle].offset + trim->extents[middle].length <= offset)
	3130	lower = middle + 1;
	3131	else {
	3132	if (overlap_start) {
	3133	*overlap_start = trim->extents[middle].offset;
	3134	}
	3135	if (overlap_len) {
	3136	*overlap_len = trim->extents[middle].length;
	3137	}
	3138	return TRUE;
	3139	}
	3140	}
	3141
	3142	return FALSE;
	3143	}
	3144
	3145
	3146	/*
	3147	;________________________________________________________________________________
	3148	;
	3149	; Routine: journal_trim_add_extent
	3150	;
	3151	; Function: Keep track of extents that have been freed as part of this
	3152	; transaction. If the underlying device supports TRIM (UNMAP),
	3153	; then those extents will be trimmed/unmapped once the
	3154	; transaction has been written to the journal. (For example,
	3155	; SSDs can support trim/unmap and avoid having to recopy those
	3156	; blocks when doing wear leveling, and may reuse the same
	3157	; phsyical blocks for different logical blocks.)
	3158	;
	3159	; HFS also uses this, in combination with journal_trim_set_callback,
	3160	; to add recently freed extents to its free extent cache, but
	3161	; only after the transaction that freed them is committed to
	3162	; disk. (This reduces the chance of overwriting live data in
	3163	; a way that causes data loss if a transaction never gets
	3164	; written to the journal.)
	3165	;
	3166	; Input Arguments:
	3167	; jnl - The journal for the volume containing the byte range.
	3168	; offset - The first byte of the range to be trimmed.
	3169	; length - The number of bytes of the extent being trimmed.
	3170	;________________________________________________________________________________
	3171	*/
	3172	__private_extern__ int
	3173	journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
	3174	{
	3175	uint64_t end;
	3176	transaction *tr;
	3177	dk_extent_t *extent;
	3178	uint32_t insert_index;
	3179	uint32_t replace_count;
	3180
	3181	CHECK_JOURNAL(jnl);
	3182
	3183	/* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */
	3184	if (jnl->flags & JOURNAL_INVALID) {
	3185	return EINVAL;
	3186	}
	3187
	3188	tr = jnl->active_tr;
	3189	CHECK_TRANSACTION(tr);
	3190
	3191	if (jnl_kdebug)
	3192	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD \| DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
	3193
	3194	if (jnl->owner != current_thread()) {
	3195	panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
	3196	jnl, jnl->owner, current_thread());
	3197	}
	3198
	3199	free_old_stuff(jnl);
	3200
	3201	end = offset + length;
	3202
	3203	/*
	3204	* Find the range of existing extents that can be combined with the
	3205	* input extent. We start by counting the number of extents that end
	3206	* strictly before the input extent, then count the number of extents
	3207	* that overlap or are contiguous with the input extent.
	3208	*/
	3209	extent = tr->trim.extents;
	3210	insert_index = 0;
	3211	while (insert_index < tr->trim.extent_count && extent->offset + extent->length < offset) {
	3212	++insert_index;
	3213	++extent;
	3214	}
	3215	replace_count = 0;
	3216	while (insert_index + replace_count < tr->trim.extent_count && extent->offset <= end) {
	3217	++replace_count;
	3218	++extent;
	3219	}
	3220
	3221	/*
	3222	* If none of the existing extents can be combined with the input extent,
	3223	* then just insert it in the list (before item number insert_index).
	3224	*/
	3225	if (replace_count == 0) {
	3226	/* If the list was already full, we need to grow it. */
	3227	if (tr->trim.extent_count == tr->trim.allocated_count) {
	3228	if (trim_realloc(jnl, &tr->trim) != 0) {
	3229	printf("jnl: trim_add_extent: out of memory!");
	3230	if (jnl_kdebug)
	3231	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD \| DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0);
	3232	return ENOMEM;
	3233	}
	3234	}
	3235
	3236	/* Shift any existing extents with larger offsets. */
	3237	if (insert_index < tr->trim.extent_count) {
	3238	memmove(&tr->trim.extents[insert_index+1],
	3239	&tr->trim.extents[insert_index],
	3240	(tr->trim.extent_count - insert_index) * sizeof(dk_extent_t));
	3241	}
	3242	tr->trim.extent_count++;
	3243
	3244	/* Store the new extent in the list. */
	3245	tr->trim.extents[insert_index].offset = offset;
	3246	tr->trim.extents[insert_index].length = length;
	3247
	3248	/* We're done. */
	3249	if (jnl_kdebug)
	3250	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD \| DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0);
	3251	return 0;
	3252	}
	3253
	3254	/*
	3255	* Update extent number insert_index to be the union of the input extent
	3256	* and all of the replaced extents.
	3257	*/
	3258	if (tr->trim.extents[insert_index].offset < offset)
	3259	offset = tr->trim.extents[insert_index].offset;
	3260	extent = &tr->trim.extents[insert_index + replace_count - 1];
	3261	if (extent->offset + extent->length > end)
	3262	end = extent->offset + extent->length;
	3263	tr->trim.extents[insert_index].offset = offset;
	3264	tr->trim.extents[insert_index].length = end - offset;
	3265
	3266	/*
	3267	* If we were replacing more than one existing extent, then shift any
	3268	* extents with larger offsets, and update the count of extents.
	3269	*
	3270	* We're going to leave extent #insert_index alone since it was just updated, above.
	3271	* We need to move extents from index (insert_index + replace_count) through the end of
	3272	* the list by (replace_count - 1) positions so that they overwrite extent #(insert_index + 1).
	3273	*/
	3274	if (replace_count > 1 && (insert_index + replace_count) < tr->trim.extent_count) {
	3275	memmove(&tr->trim.extents[insert_index + 1],
	3276	&tr->trim.extents[insert_index + replace_count],
	3277	(tr->trim.extent_count - insert_index - replace_count) * sizeof(dk_extent_t));
	3278	}
	3279	tr->trim.extent_count -= replace_count - 1;
	3280
	3281	if (jnl_kdebug)
	3282	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD \| DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0);
	3283	return 0;
	3284	}
	3285
	3286	/*
	3287	* journal_trim_extent_overlap
	3288	*
	3289	* Return 1 if there are any pending TRIMs that overlap with the given offset and length
	3290	* Return 0 otherwise.
	3291	*/
	3292
	3293	int journal_trim_extent_overlap (journal jnl, uint64_t offset, uint64_t length, uint64_t end) {
	3294	transaction *tr = NULL;
	3295	int overlap = 0;
	3296
	3297	uint64_t overlap_start;
	3298	uint64_t overlap_len;
	3299	tr = jnl->active_tr;
	3300	CHECK_TRANSACTION(tr);
	3301
	3302	/*
	3303	* There are two lists that need to be examined for potential overlaps:
	3304	*
	3305	* The first is the current transaction. Since this function requires that
	3306	* a transaction be active when this is called, this is the "active_tr"
	3307	* pointer in the journal struct. This has a trimlist pointer which needs
	3308	* to be searched.
	3309	*/
	3310	overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len);
	3311	if (overlap == 0) {
	3312	/*
	3313	* The second is the async trim list, which is only done if the current
	3314	* transaction group (active transaction) did not overlap with our target
	3315	* extent. This async trim list is the set of all previously
	3316	* committed transaction groups whose I/Os are now in-flight. We need to hold the
	3317	* trim lock in order to search this list. If we grab the list before the
	3318	* TRIM has completed, then we will compare it. If it is grabbed AFTER the
	3319	* TRIM has completed, then the pointer will be zeroed out and we won't have
	3320	* to check anything.
	3321	*/
	3322	lck_rw_lock_shared (&jnl->trim_lock);
	3323	if (jnl->async_trim != NULL) {
	3324	overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len);
	3325	}
	3326	lck_rw_unlock_shared (&jnl->trim_lock);
	3327	}
	3328
	3329	if (overlap) {
	3330	/* compute the end (min) of the overlapping range */
	3331	if ( (overlap_start + overlap_len) < (offset + length)) {
	3332	*end = (overlap_start + overlap_len);
	3333	}
	3334	else {
	3335	*end = (offset + length);
	3336	}
	3337	}
	3338
	3339
	3340	return overlap;
	3341	}
	3342
	3343	/*
	3344	* journal_request_immediate_flush
	3345	*
	3346	* FS requests that the journal flush immediately upon the
	3347	* active transaction's completion.
	3348	*
	3349	* Returns 0 if operation succeeds
	3350	* Returns EPERM if we failed to leave hint
	3351	*/
	3352	int
	3353	journal_request_immediate_flush (journal *jnl) {
	3354
	3355	transaction *tr = NULL;
	3356	/*
	3357	* Is a transaction still in process? You must do
	3358	* this while there are txns open
	3359	*/
	3360	tr = jnl->active_tr;
	3361	if (tr != NULL) {
	3362	CHECK_TRANSACTION(tr);
	3363	tr->flush_on_completion = TRUE;
	3364	}
	3365	else {
	3366	return EPERM;
	3367	}
	3368	return 0;
	3369	}
	3370
	3371
	3372
	3373	/*
	3374	;________________________________________________________________________________
	3375	;
	3376	; Routine: trim_remove_extent
	3377	;
	3378	; Function: Indicate that a range of bytes, some of which may have previously
	3379	; been passed to journal_trim_add_extent, is now allocated.
	3380	; Any overlapping ranges currently in the journal's trim list will
	3381	; be removed. If the underlying device supports TRIM (UNMAP), then
	3382	; these extents will not be trimmed/unmapped when the transaction
	3383	; is written to the journal.
	3384	;
	3385	; HFS also uses this to prevent newly allocated space from being
	3386	; added to its free extent cache (if some portion of the newly
	3387	; allocated space was recently freed).
	3388	;
	3389	; Input Arguments:
	3390	; trim - The trim list to update.
	3391	; offset - The first byte of the range to be trimmed.
	3392	; length - The number of bytes of the extent being trimmed.
	3393	;________________________________________________________________________________
	3394	*/
	3395	static int
	3396	trim_remove_extent(journal jnl, struct jnl_trim_list trim, uint64_t offset, uint64_t length)
	3397	{
	3398	u_int64_t end;
	3399	dk_extent_t *extent;
	3400	u_int32_t keep_before;
	3401	u_int32_t keep_after;
	3402
	3403	end = offset + length;
	3404
	3405	/*
	3406	* Find any existing extents that start before or end after the input
	3407	* extent. These extents will be modified if they overlap the input
	3408	* extent. Other extents between them will be deleted.
	3409	*/
	3410	extent = trim->extents;
	3411	keep_before = 0;
	3412	while (keep_before < trim->extent_count && extent->offset < offset) {
	3413	++keep_before;
	3414	++extent;
	3415	}
	3416	keep_after = keep_before;
	3417	if (keep_after > 0) {
	3418	/* See if previous extent extends beyond both ends of input extent. */
	3419	--keep_after;
	3420	--extent;
	3421	}
	3422	while (keep_after < trim->extent_count && (extent->offset + extent->length) <= end) {
	3423	++keep_after;
	3424	++extent;
	3425	}
	3426
	3427	/*
	3428	* When we get here, the first keep_before extents (0 .. keep_before-1)
	3429	* start before the input extent, and extents (keep_after .. extent_count-1)
	3430	* end after the input extent. We'll need to keep, all of those extents,
	3431	* but possibly modify #(keep_before-1) and #keep_after to remove the portion
	3432	* that overlaps with the input extent.
	3433	*/
	3434
	3435	/*
	3436	* Does the input extent start after and end before the same existing
	3437	* extent? If so, we have to "punch a hole" in that extent and convert
	3438	* it to two separate extents.
	3439	*/
	3440	if (keep_before > keep_after) {
	3441	/* If the list was already full, we need to grow it. */
	3442	if (trim->extent_count == trim->allocated_count) {
	3443	if (trim_realloc(jnl, trim) != 0) {
	3444	printf("jnl: trim_remove_extent: out of memory!");
	3445	return ENOMEM;
	3446	}
	3447	}
	3448
	3449	/*
	3450	* Make room for a new extent by shifting extents #keep_after and later
	3451	* down by one extent. When we're done, extents #keep_before and
	3452	* #keep_after will be identical, and we can fall through to removing
	3453	* the portion that overlaps the input extent.
	3454	*/
	3455	memmove(&trim->extents[keep_before],
	3456	&trim->extents[keep_after],
	3457	(trim->extent_count - keep_after) * sizeof(dk_extent_t));
	3458	++trim->extent_count;
	3459	++keep_after;
	3460
	3461	/*
	3462	* Fall through. We now have the case where the length of extent
	3463	* #(keep_before - 1) needs to be updated, and the start of extent
	3464	* #(keep_after) needs to be updated.
	3465	*/
	3466	}
	3467
	3468	/*
	3469	* May need to truncate the end of extent #(keep_before - 1) if it overlaps
	3470	* the input extent.
	3471	*/
	3472	if (keep_before > 0) {
	3473	extent = &trim->extents[keep_before - 1];
	3474	if (extent->offset + extent->length > offset) {
	3475	extent->length = offset - extent->offset;
	3476	}
	3477	}
	3478
	3479	/*
	3480	* May need to update the start of extent #(keep_after) if it overlaps the
	3481	* input extent.
	3482	*/
	3483	if (keep_after < trim->extent_count) {
	3484	extent = &trim->extents[keep_after];
	3485	if (extent->offset < end) {
	3486	extent->length = extent->offset + extent->length - end;
	3487	extent->offset = end;
	3488	}
	3489	}
	3490
	3491	/*
	3492	* If there were whole extents that overlapped the input extent, get rid
	3493	* of them by shifting any following extents, and updating the count.
	3494	*/
	3495	if (keep_after > keep_before && keep_after < trim->extent_count) {
	3496	memmove(&trim->extents[keep_before],
	3497	&trim->extents[keep_after],
	3498	(trim->extent_count - keep_after) * sizeof(dk_extent_t));
	3499	}
	3500	trim->extent_count -= keep_after - keep_before;
	3501
	3502	return 0;
	3503	}
	3504
	3505	/*
	3506	;________________________________________________________________________________
	3507	;
	3508	; Routine: journal_trim_remove_extent
	3509	;
	3510	; Function: Make note of a range of bytes, some of which may have previously
	3511	; been passed to journal_trim_add_extent, is now in use on the
	3512	; volume. The given bytes will be not be trimmed as part of
	3513	; this transaction, or a pending trim of a transaction being
	3514	; asynchronously flushed.
	3515	;
	3516	; Input Arguments:
	3517	; jnl - The journal for the volume containing the byte range.
	3518	; offset - The first byte of the range to be trimmed.
	3519	; length - The number of bytes of the extent being trimmed.
	3520	;________________________________________________________________________________
	3521	*/
	3522	__private_extern__ int
	3523	journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
	3524	{
	3525	int error = 0;
	3526	transaction *tr;
	3527
	3528	CHECK_JOURNAL(jnl);
	3529
	3530	/* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */
	3531	if (jnl->flags & JOURNAL_INVALID) {
	3532	return EINVAL;
	3533	}
	3534
	3535	tr = jnl->active_tr;
	3536	CHECK_TRANSACTION(tr);
	3537
	3538	if (jnl_kdebug)
	3539	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE \| DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
	3540
	3541	if (jnl->owner != current_thread()) {
	3542	panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
	3543	jnl, jnl->owner, current_thread());
	3544	}
	3545
	3546	free_old_stuff(jnl);
	3547
	3548	error = trim_remove_extent(jnl, &tr->trim, offset, length);
	3549	if (error == 0) {
	3550	int found = FALSE;
	3551
	3552	/*
	3553	* See if a pending trim has any extents that overlap with the
	3554	* one we were given.
	3555	*/
	3556	lck_rw_lock_shared(&jnl->trim_lock);
	3557	if (jnl->async_trim != NULL)
	3558	found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL);
	3559	lck_rw_unlock_shared(&jnl->trim_lock);
	3560
	3561	if (found) {
	3562	/*
	3563	* There was an overlap, so avoid trimming the extent we
	3564	* just allocated. (Otherwise, it might get trimmed after
	3565	* we've written to it, which will cause that data to be
	3566	* corrupted.)
	3567	*/
	3568	uint32_t async_extent_count = 0;
	3569
	3570	if (jnl_kdebug)
	3571	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING \| DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, 0, 0);
	3572	lck_rw_lock_exclusive(&jnl->trim_lock);
	3573	if (jnl->async_trim != NULL) {
	3574	error = trim_remove_extent(jnl, jnl->async_trim, offset, length);
	3575	async_extent_count = jnl->async_trim->extent_count;
	3576	}
	3577	lck_rw_unlock_exclusive(&jnl->trim_lock);
	3578	if (jnl_kdebug)
	3579	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING \| DBG_FUNC_END, error, 0, 0, async_extent_count, 0);
	3580	}
	3581	}
	3582
	3583	if (jnl_kdebug)
	3584	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE \| DBG_FUNC_END, error, 0, 0, tr->trim.extent_count, 0);
	3585	return error;
	3586	}
	3587
	3588
	3589	static int
	3590	journal_trim_flush(journal jnl, transaction tr)
	3591	{
	3592	int errno = 0;
	3593	boolean_t was_vm_privileged;
	3594
	3595	if (jnl_kdebug)
	3596	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH \| DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
	3597
	3598	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
	3599	/*
	3600	* the disk driver can allocate memory on this path...
	3601	* if we block waiting for memory, and there is enough pressure to
	3602	* cause us to try and create a new swap file, we may end up deadlocking
	3603	* due to waiting for the journal on the swap file creation path...
	3604	* by making ourselves vm_privileged, we give ourselves the best chance
	3605	* of not blocking
	3606	*/
	3607	was_vm_privileged = set_vm_privilege(TRUE);
	3608	}
	3609	lck_rw_lock_shared(&jnl->trim_lock);
	3610	if (tr->trim.extent_count > 0) {
	3611	dk_unmap_t unmap;
	3612
	3613	bzero(&unmap, sizeof(unmap));
	3614	if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) {
	3615	unmap.extents = tr->trim.extents;
	3616	unmap.extentsCount = tr->trim.extent_count;
	3617	if (jnl_kdebug)
	3618	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP \| DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
	3619	errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel());
	3620	if (jnl_kdebug)
	3621	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP \| DBG_FUNC_END, errno, 0, 0, 0, 0);
	3622	}
	3623
	3624	/*
	3625	* Call back into the file system to tell them that we have
	3626	* trimmed some extents and that they can now be reused.
	3627	*
	3628	* CAUTION: If the journal becomes invalid (eg., due to an I/O
	3629	* error when trying to write to the journal), this callback
	3630	* will stop getting called, even if extents got freed before
	3631	* the journal became invalid!
	3632	*/
	3633	if (jnl->trim_callback)
	3634	jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents);
	3635	}
	3636	lck_rw_unlock_shared(&jnl->trim_lock);
	3637
	3638	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
	3639	set_vm_privilege(FALSE);
	3640	/*
	3641	* If the transaction we're flushing was the async transaction, then
	3642	* tell the current transaction that there is no pending trim
	3643	* any more.
	3644	*
	3645	* NOTE: Since we released the lock, another thread could have
	3646	* removed one or more extents from our list. That's not a
	3647	* problem since any writes to the re-allocated blocks
	3648	* would get sent to the device after the DKIOCUNMAP.
	3649	*/
	3650	lck_rw_lock_exclusive(&jnl->trim_lock);
	3651	if (jnl->async_trim == &tr->trim)
	3652	jnl->async_trim = NULL;
	3653	lck_rw_unlock_exclusive(&jnl->trim_lock);
	3654
	3655	/*
	3656	* By the time we get here, no other thread can discover the address
	3657	* of "tr", so it is safe for us to manipulate tr->trim without
	3658	* holding any locks.
	3659	*/
	3660	if (tr->trim.extents) {
	3661	kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
	3662	tr->trim.allocated_count = 0;
	3663	tr->trim.extent_count = 0;
	3664	tr->trim.extents = NULL;
	3665	}
	3666
	3667	if (jnl_kdebug)
	3668	KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH \| DBG_FUNC_END, errno, 0, 0, 0, 0);
	3669
	3670	return errno;
	3671	}
	3672
	3673	static int
	3674	journal_binfo_cmp(const void a, const void b)
	3675	{
	3676	const block_info bi_a = (const struct block_info )a;
	3677	const block_info bi_b = (const struct block_info )b;
	3678	daddr64_t res;
	3679
	3680	if (bi_a->bnum == (off_t)-1) {
	3681	return 1;
	3682	}
	3683	if (bi_b->bnum == (off_t)-1) {
	3684	return -1;
	3685	}
	3686
	3687	// don't have to worry about negative block
	3688	// numbers so this is ok to do.
	3689	//
	3690	res = (buf_blkno(bi_a->u.bp) - buf_blkno(bi_b->u.bp));
	3691
	3692	return (int)res;
	3693	}
	3694
	3695
	3696	/*
	3697	* End a transaction. If the transaction is small enough, and we're not forcing
	3698	* a write to disk, the "active" transaction becomes the "current" transaction,
	3699	* and will be reused for the next transaction that is started (group commit).
	3700	*
	3701	* If the transaction gets written to disk (because force_it is true, or no
	3702	* group commit, or the transaction is sufficiently full), the blocks get
	3703	* written into the journal first, then the are written asynchronously. When
	3704	* those async writes complete, the transaction can be freed and removed from
	3705	* the journal.
	3706	*
	3707	* An optional callback can be supplied. If given, it is called after the
	3708	* the blocks have been written to the journal, but before the async writes
	3709	* of those blocks to their normal on-disk locations. This is used by
	3710	* journal_relocate so that the location of the journal can be changed and
	3711	* flushed to disk before the blocks get written to their normal locations.
	3712	* Note that the callback is only called if the transaction gets written to
	3713	* the journal during this end_transaction call; you probably want to set the
	3714	* force_it flag.
	3715	*
	3716	* Inputs:
	3717	* tr Transaction to add to the journal
	3718	* force_it If true, force this transaction to the on-disk journal immediately.
	3719	* callback See description above. Pass NULL for no callback.
	3720	* callback_arg Argument passed to callback routine.
	3721	*
	3722	* Result
	3723	* 0 No errors
	3724	* -1 An error occurred. The journal is marked invalid.
	3725	*/
	3726	static int
	3727	end_transaction(transaction tr, int force_it, errno_t (callback)(void), void callback_arg, boolean_t drop_lock, boolean_t must_wait)
	3728	{
	3729	block_list_header blhdr=NULL, next=NULL;
	3730	int i, ret_val = 0;
	3731	errno_t errno;
	3732	journal *jnl = tr->jnl;
	3733	struct buf *bp;
	3734	size_t tbuffer_offset;
	3735	boolean_t drop_lock_early;
	3736
	3737	if (jnl->cur_tr) {
	3738	panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
	3739	jnl, jnl->cur_tr, tr);
	3740	}
	3741
	3742	// if there weren't any modified blocks in the transaction
	3743	// just save off the transaction pointer and return.
	3744	if (tr->total_bytes == jnl->jhdr->blhdr_size) {
	3745	jnl->cur_tr = tr;
	3746	goto done;
	3747	}
	3748
	3749	// if our transaction buffer isn't very full, just hang
	3750	// on to it and don't actually flush anything. this is
	3751	// what is known as "group commit". we will flush the
	3752	// transaction buffer if it's full or if we have more than
	3753	// one of them so we don't start hogging too much memory.
	3754	//
	3755	// We also check the device supports UNMAP/TRIM, and if so,
	3756	// the number of extents waiting to be trimmed. If it is
	3757	// small enough, then keep accumulating more (so we can
	3758	// reduce the overhead of trimming). If there was a prior
	3759	// trim error, then we stop issuing trims for this
	3760	// volume, so we can also coalesce transactions.
	3761	//
	3762	if ( force_it == 0
	3763	&& (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
	3764	&& tr->num_blhdrs < 3
	3765	&& (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))
	3766	&& (!(jnl->flags & JOURNAL_USE_UNMAP) \|\| (tr->trim.extent_count < jnl_trim_flush_limit))) {
	3767
	3768	jnl->cur_tr = tr;
	3769	goto done;
	3770	}
	3771
	3772	KERNEL_DEBUG(0xbbbbc018\|DBG_FUNC_START, jnl, tr, drop_lock, must_wait, 0);
	3773
	3774	lock_condition(jnl, &jnl->flushing, "end_transaction");
	3775
	3776	/*
	3777	* if the previous 'finish_end_transaction' was being run
	3778	* asynchronously, it could have encountered a condition
	3779	* that caused it to mark the journal invalid... if that
	3780	* occurred while we were waiting for it to finish, we
	3781	* need to notice and abort the current transaction
	3782	*/
	3783	if ((jnl->flags & JOURNAL_INVALID) \|\| jnl->flush_aborted == TRUE) {
	3784	unlock_condition(jnl, &jnl->flushing);
	3785
	3786	abort_transaction(jnl, tr);
	3787	ret_val = -1;
	3788	KERNEL_DEBUG(0xbbbbc018\|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
	3789	goto done;
	3790	}
	3791
	3792	/*
	3793	* Store a pointer to this transaction's trim list so that
	3794	* future transactions can find it.
	3795	*
	3796	* Note: if there are no extents in the trim list, then don't
	3797	* bother saving the pointer since nothing can add new extents
	3798	* to the list (and other threads/transactions only care if
	3799	* there is a trim pending).
	3800	*/
	3801	lck_rw_lock_exclusive(&jnl->trim_lock);
	3802	if (jnl->async_trim != NULL)
	3803	panic("jnl: end_transaction: async_trim already non-NULL!");
	3804	if (tr->trim.extent_count > 0)
	3805	jnl->async_trim = &tr->trim;
	3806	lck_rw_unlock_exclusive(&jnl->trim_lock);
	3807
	3808	/*
	3809	* snapshot the transaction sequence number while we are still behind
	3810	* the journal lock since it will be bumped upon the start of the
	3811	* next transaction group which may overlap the current journal flush...
	3812	* we pass the snapshot into write_journal_header during the journal
	3813	* flush so that it can write the correct version in the header...
	3814	* because we hold the 'flushing' condition variable for the duration
	3815	* of the journal flush, 'saved_sequence_num' remains stable
	3816	*/
	3817	jnl->saved_sequence_num = jnl->sequence_num;
	3818
	3819	/*
	3820	* if we're here we're going to flush the transaction buffer to disk.
	3821	* 'check_free_space' will not return untl there is enough free
	3822	* space for this transaction in the journal and jnl->old_start[0]
	3823	* is avaiable for use
	3824	*/
	3825	KERNEL_DEBUG(0xbbbbc030 \| DBG_FUNC_START, jnl, 0, 0, 0, 0);
	3826
	3827	check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num);
	3828
	3829	KERNEL_DEBUG(0xbbbbc030 \| DBG_FUNC_END, jnl, tr->delayed_header_write, 0, 0, 0);
	3830
	3831	// range check the end index
	3832	if (jnl->jhdr->end <= 0 \|\| jnl->jhdr->end > jnl->jhdr->size) {
	3833	panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
	3834	jnl->jhdr->end, jnl->jhdr->size);
	3835	}
	3836	if (tr->delayed_header_write == TRUE) {
	3837	thread_t thread = THREAD_NULL;
	3838
	3839	lock_condition(jnl, &jnl->writing_header, "end_transaction");
	3840	/*
	3841	* fire up a thread to write the journal header
	3842	* asynchronously... when it finishes, it will call
	3843	* unlock_condition... we can overlap the preparation of
	3844	* the log and buffers during this time
	3845	*/
	3846	kernel_thread_start((thread_continue_t)write_header_thread, jnl, &thread);
	3847	} else
	3848	jnl->write_header_failed = FALSE;
	3849
	3850
	3851	// this transaction starts where the current journal ends
	3852	tr->journal_start = jnl->jhdr->end;
	3853
	3854	lock_oldstart(jnl);
	3855	/*
	3856	* Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy.
	3857	* slide everyone else down and put our latest guy in the last
	3858	* entry in the old_start array
	3859	*/
	3860	memcpy(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void ), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void ), sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
	3861	jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start \| 0x8000000000000000LL;
	3862
	3863	unlock_oldstart(jnl);
	3864
	3865
	3866	for (blhdr = tr->blhdr; blhdr; blhdr = next) {
	3867	char *blkptr;
	3868	buf_t sbp;
	3869	int32_t bsize;
	3870
	3871	tbuffer_offset = jnl->jhdr->blhdr_size;
	3872
	3873	for (i = 1; i < blhdr->num_blocks; i++) {
	3874
	3875	if (blhdr->binfo[i].bnum != (off_t)-1) {
	3876	void (func)(buf_t, void );
	3877	void *arg;
	3878
	3879	bp = blhdr->binfo[i].u.bp;
	3880
	3881	if (bp == NULL) {
	3882	panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
	3883	blhdr->binfo[i].bnum, jnl, tr);
	3884	}
	3885	/*
	3886	* acquire the bp here so that we can safely
	3887	* mess around with its data. buf_acquire()
	3888	* will return EAGAIN if the buffer was busy,
	3889	* so loop trying again.
	3890	*/
	3891	do {
	3892	errno = buf_acquire(bp, BAC_REMOVE, 0, 0);
	3893	} while (errno == EAGAIN);
	3894
	3895	if (errno)
	3896	panic("could not acquire bp %p (err %d)\n", bp, errno);
	3897
	3898	if ((buf_flags(bp) & (B_LOCKED\|B_DELWRI)) != (B_LOCKED\|B_DELWRI)) {
	3899	if (jnl->flags & JOURNAL_CLOSE_PENDING) {
	3900	buf_clearflags(bp, B_LOCKED);
	3901	buf_brelse(bp);
	3902
	3903	/*
	3904	* this is an odd case that appears to happen occasionally
	3905	* make sure we mark this block as no longer valid
	3906	* so that we don't process it in "finish_end_transaction" since
	3907	* the bp that is recorded in our array no longer belongs
	3908	* to us (normally we substitute a shadow bp to be processed
	3909	* issuing a 'buf_bawrite' on a stale buf_t pointer leads
	3910	* to all kinds of problems.
	3911	*/
	3912	blhdr->binfo[i].bnum = (off_t)-1;
	3913	continue;
	3914	} else {
	3915	panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp));
	3916	}
	3917	}
	3918	bsize = buf_size(bp);
	3919
	3920	buf_setfilter(bp, NULL, NULL, &func, &arg);
	3921
	3922	blkptr = (char )&((char )blhdr)[tbuffer_offset];
	3923
	3924	sbp = buf_create_shadow_priv(bp, FALSE, (uintptr_t)blkptr, 0, 0);
	3925
	3926	if (sbp == NULL)
	3927	panic("jnl: buf_create_shadow returned NULL");
	3928
	3929	/*
	3930	* copy the data into the transaction buffer...
	3931	*/
	3932	memcpy(blkptr, (char *)buf_dataptr(bp), bsize);
	3933
	3934	buf_clearflags(bp, B_LOCKED);
	3935	buf_markclean(bp);
	3936	buf_drop(bp);
	3937
	3938	/*
	3939	* adopt the shadow buffer for this block
	3940	*/
	3941	if (func) {
	3942	/*
	3943	* transfer FS hook function to the
	3944	* shadow buffer... it will get called
	3945	* in finish_end_transaction
	3946	*/
	3947	buf_setfilter(sbp, func, arg, NULL, NULL);
	3948	}
	3949	blhdr->binfo[i].u.bp = sbp;
	3950
	3951	} else {
	3952	// bnum == -1, only true if a block was "killed"
	3953	bsize = blhdr->binfo[i].u.bi.bsize;
	3954	}
	3955	tbuffer_offset += bsize;
	3956	}
	3957	next = (block_list_header *)((long)blhdr->binfo[0].bnum);
	3958	}
	3959	/*
	3960	* if callback != NULL, we don't want to drop the journal
	3961	* lock, or complete end_transaction asynchronously, since
	3962	* the caller is expecting the callback to run in the calling
	3963	* context
	3964	*
	3965	* if drop_lock == FALSE, we can't complete end_transaction
	3966	* asynchronously
	3967	*/
	3968	if (callback)
	3969	drop_lock_early = FALSE;
	3970	else
	3971	drop_lock_early = drop_lock;
	3972
	3973	if (drop_lock_early == FALSE)
	3974	must_wait = TRUE;
	3975
	3976	if (drop_lock_early == TRUE) {
	3977	journal_unlock(jnl);
	3978	drop_lock = FALSE;
	3979	}
	3980	if (must_wait == TRUE)
	3981	ret_val = finish_end_transaction(tr, callback, callback_arg);
	3982	else {
	3983	thread_t thread = THREAD_NULL;
	3984
	3985	/*
	3986	* fire up a thread to complete processing this transaction
	3987	* asynchronously... when it finishes, it will call
	3988	* unlock_condition
	3989	*/
	3990	kernel_thread_start((thread_continue_t)finish_end_thread, tr, &thread);
	3991	}
	3992	KERNEL_DEBUG(0xbbbbc018\|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
	3993	done:
	3994	if (drop_lock == TRUE) {
	3995	journal_unlock(jnl);
	3996	}
	3997	return (ret_val);
	3998	}
	3999
	4000
	4001	static void
	4002	finish_end_thread(transaction *tr)
	4003	{
	4004	proc_set_task_policy(current_task(), current_thread(),
	4005	TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
	4006
	4007	finish_end_transaction(tr, NULL, NULL);
	4008
	4009	thread_deallocate(current_thread());
	4010	thread_terminate(current_thread());
	4011	}
	4012
	4013	static void
	4014	write_header_thread(journal *jnl)
	4015	{
	4016	proc_set_task_policy(current_task(), current_thread(),
	4017	TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
	4018
	4019	if (write_journal_header(jnl, 1, jnl->saved_sequence_num))
	4020	jnl->write_header_failed = TRUE;
	4021	else
	4022	jnl->write_header_failed = FALSE;
	4023	unlock_condition(jnl, &jnl->writing_header);
	4024
	4025	thread_deallocate(current_thread());
	4026	thread_terminate(current_thread());
	4027	}
	4028
	4029	static int
	4030	finish_end_transaction(transaction tr, errno_t (callback)(void), void callback_arg)
	4031	{
	4032	int i, amt;
	4033	int ret = 0;
	4034	off_t end;
	4035	journal *jnl = tr->jnl;
	4036	buf_t bp, *bparray;
	4037	vnode_t vp;
	4038	block_list_header blhdr=NULL, next=NULL;
	4039	size_t tbuffer_offset;
	4040	int bufs_written = 0;
	4041	int ret_val = 0;
	4042
	4043	KERNEL_DEBUG(0xbbbbc028\|DBG_FUNC_START, jnl, tr, 0, 0, 0);
	4044
	4045	end = jnl->jhdr->end;
	4046
	4047	for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
	4048	boolean_t was_vm_privileged;
	4049
	4050	amt = blhdr->bytes_used;
	4051
	4052	blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num;
	4053
	4054	blhdr->checksum = 0;
	4055	blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
	4056
	4057	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
	4058	/*
	4059	* if we block waiting for memory, and there is enough pressure to
	4060	* cause us to try and create a new swap file, we may end up deadlocking
	4061	* due to waiting for the journal on the swap file creation path...
	4062	* by making ourselves vm_privileged, we give ourselves the best chance
	4063	* of not blocking
	4064	*/
	4065	was_vm_privileged = set_vm_privilege(TRUE);
	4066	}
	4067	if (kmem_alloc(kernel_map, (vm_offset_t )&bparray, blhdr->num_blocks sizeof(struct buf *))) {
	4068	panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *));
	4069	}
	4070	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
	4071	set_vm_privilege(FALSE);
	4072
	4073	tbuffer_offset = jnl->jhdr->blhdr_size;
	4074
	4075	for (i = 1; i < blhdr->num_blocks; i++) {
	4076	void (func)(buf_t, void );
	4077	void *arg;
	4078	int32_t bsize;
	4079
	4080	/*
	4081	* finish preparing the shadow buf_t before
	4082	* calculating the individual block checksums
	4083	*/
	4084	if (blhdr->binfo[i].bnum != (off_t)-1) {
	4085	daddr64_t blkno;
	4086	daddr64_t lblkno;
	4087
	4088	bp = blhdr->binfo[i].u.bp;
	4089
	4090	vp = buf_vnode(bp);
	4091	blkno = buf_blkno(bp);
	4092	lblkno = buf_lblkno(bp);
	4093
	4094	if (vp == NULL && lblkno == blkno) {
	4095	printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd. aborting the transaction (tr %p jnl %p).\n",
	4096	jnl->jdev_name, bp, lblkno, blkno, tr, jnl);
	4097	ret_val = -1;
	4098	goto bad_journal;
	4099	}
	4100
	4101	// if the lblkno is the same as blkno and this bp isn't
	4102	// associated with the underlying file system device then
	4103	// we need to call bmap() to get the actual physical block.
	4104	//
	4105	if ((lblkno == blkno) && (vp != jnl->fsdev)) {
	4106	off_t f_offset;
	4107	size_t contig_bytes;
	4108
	4109	if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) {
	4110	printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
	4111	ret_val = -1;
	4112	goto bad_journal;
	4113	}
	4114	if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) {
	4115	printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
	4116	ret_val = -1;
	4117	goto bad_journal;
	4118	}
	4119	if ((uint32_t)contig_bytes < buf_count(bp)) {
	4120	printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
	4121	ret_val = -1;
	4122	goto bad_journal;
	4123	}
	4124	buf_setblkno(bp, blkno);
	4125	}
	4126	// update this so we write out the correct physical block number!
	4127	blhdr->binfo[i].bnum = (off_t)(blkno);
	4128
	4129	/*
	4130	* pick up the FS hook function (if any) and prepare
	4131	* to fire this buffer off in the next pass
	4132	*/
	4133	buf_setfilter(bp, buffer_flushed_callback, tr, &func, &arg);
	4134
	4135	if (func) {
	4136	/*
	4137	* call the hook function supplied by the filesystem...
	4138	* this needs to happen BEFORE cacl_checksum in case
	4139	* the FS morphs the data in the buffer
	4140	*/
	4141	func(bp, arg);
	4142	}
	4143	bparray[i] = bp;
	4144	bsize = buf_size(bp);
	4145	blhdr->binfo[i].u.bi.bsize = bsize;
	4146	blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], bsize);
	4147	} else {
	4148	bparray[i] = NULL;
	4149	bsize = blhdr->binfo[i].u.bi.bsize;
	4150	blhdr->binfo[i].u.bi.b.cksum = 0;
	4151	}
	4152	tbuffer_offset += bsize;
	4153	}
	4154	/*
	4155	* if we fired off the journal_write_header asynchronously in
	4156	* 'end_transaction', we need to wait for its completion
	4157	* before writing the actual journal data
	4158	*/
	4159	wait_condition(jnl, &jnl->writing_header, "finish_end_transaction");
	4160
	4161	if (jnl->write_header_failed == FALSE)
	4162	ret = write_journal_data(jnl, &end, blhdr, amt);
	4163	else
	4164	ret_val = -1;
	4165	/*
	4166	* put the bp pointers back so that we can
	4167	* make the final pass on them
	4168	*/
	4169	for (i = 1; i < blhdr->num_blocks; i++)
	4170	blhdr->binfo[i].u.bp = bparray[i];
	4171
	4172	kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *));
	4173
	4174	if (ret_val == -1)
	4175	goto bad_journal;
	4176
	4177	if (ret != amt) {
	4178	printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n",
	4179	jnl->jdev_name, ret, amt);
	4180
	4181	ret_val = -1;
	4182	goto bad_journal;
	4183	}
	4184	}
	4185	jnl->jhdr->end = end; // update where the journal now ends
	4186	tr->journal_end = end; // the transaction ends here too
	4187
	4188	if (tr->journal_start == 0 \|\| tr->journal_end == 0) {
	4189	panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
	4190	tr->journal_start, tr->journal_end);
	4191	}
	4192
	4193	if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) {
	4194	ret_val = -1;
	4195	goto bad_journal;
	4196	}
	4197	/*
	4198	* If the caller supplied a callback, call it now that the blocks have been
	4199	* written to the journal. This is used by journal_relocate so, for example,
	4200	* the file system can change its pointer to the new journal.
	4201	*/
	4202	if (callback != NULL && callback(callback_arg) != 0) {
	4203	ret_val = -1;
	4204	goto bad_journal;
	4205	}
	4206
	4207	//
	4208	// Send a DKIOCUNMAP for the extents trimmed by this transaction, and
	4209	// free up the extent list.
	4210	//
	4211	journal_trim_flush(jnl, tr);
	4212
	4213	// the buffer_flushed_callback will only be called for the
	4214	// real blocks that get flushed so we have to account for
	4215	// the block_list_headers here.
	4216	//
	4217	tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
	4218
	4219	lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction");
	4220
	4221	//
	4222	// setup for looping through all the blhdr's.
	4223	//
	4224	for (blhdr = tr->blhdr; blhdr; blhdr = next) {
	4225	uint16_t num_blocks;
	4226
	4227	/*
	4228	* grab this info ahead of issuing the buf_bawrites...
	4229	* once the last one goes out, its possible for blhdr
	4230	* to be freed (especially if we get preempted) before
	4231	* we do the last check of num_blocks or
	4232	* grab the next blhdr pointer...
	4233	*/
	4234	next = (block_list_header *)((long)blhdr->binfo[0].bnum);
	4235	num_blocks = blhdr->num_blocks;
	4236
	4237	/*
	4238	* we can re-order the buf ptrs because everything is written out already
	4239	*/
	4240	qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp);
	4241
	4242	/*
	4243	* need to make sure that the loop issuing the buf_bawrite's
	4244	* does not touch blhdr once the last buf_bawrite has been
	4245	* issued... at that point, we no longer have a legitmate
	4246	* reference on the associated storage since it will be
	4247	* released upon the completion of that last buf_bawrite
	4248	*/
	4249	for (i = num_blocks-1; i >= 1; i--) {
	4250	if (blhdr->binfo[i].bnum != (off_t)-1)
	4251	break;
	4252	num_blocks--;
	4253	}
	4254	for (i = 1; i < num_blocks; i++) {
	4255
	4256	if ((bp = blhdr->binfo[i].u.bp)) {
	4257	vp = buf_vnode(bp);
	4258
	4259	buf_bawrite(bp);
	4260
	4261	// this undoes the vnode_ref() in journal_modify_block_end()
	4262	vnode_rele_ext(vp, 0, 1);
	4263
	4264	bufs_written++;
	4265	}
	4266	}
	4267	}
	4268	if (bufs_written == 0) {
	4269	/*
	4270	* since we didn't issue any buf_bawrite's, there is no
	4271	* async trigger to cause the memory associated with this
	4272	* transaction to be freed... so, move it to the garbage
	4273	* list now
	4274	*/
	4275	lock_oldstart(jnl);
	4276
	4277	tr->next = jnl->tr_freeme;
	4278	jnl->tr_freeme = tr;
	4279
	4280	unlock_oldstart(jnl);
	4281
	4282	unlock_condition(jnl, &jnl->asyncIO);
	4283	}
	4284
	4285	//printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
	4286	// tr, tr->journal_start, tr->journal_end);
	4287
	4288	bad_journal:
	4289	if (ret_val == -1) {
	4290	/*
	4291	* 'flush_aborted' is protected by the flushing condition... we need to
	4292	* set it before dropping the condition so that it will be
	4293	* noticed in 'end_transaction'... we add this additional
	4294	* aborted condition so that we can drop the 'flushing' condition
	4295	* before grabbing the journal lock... this avoids a deadlock
	4296	* in 'end_transaction' which is holding the journal lock while
	4297	* waiting for the 'flushing' condition to clear...
	4298	* everyone else will notice the JOURNAL_INVALID flag
	4299	*/
	4300	jnl->flush_aborted = TRUE;
	4301
	4302	unlock_condition(jnl, &jnl->flushing);
	4303	journal_lock(jnl);
	4304
	4305	jnl->flags \|= JOURNAL_INVALID;
	4306	jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
	4307	abort_transaction(jnl, tr); // cleans up list of extents to be trimmed
	4308
	4309	journal_unlock(jnl);
	4310	} else
	4311	unlock_condition(jnl, &jnl->flushing);
	4312
	4313	KERNEL_DEBUG(0xbbbbc028\|DBG_FUNC_END, jnl, tr, bufs_written, ret_val, 0);
	4314
	4315	return (ret_val);
	4316	}
	4317
	4318
	4319	static void
	4320	lock_condition(journal jnl, boolean_t condition, const char *condition_name)
	4321	{
	4322
	4323	KERNEL_DEBUG(0xbbbbc020\|DBG_FUNC_START, jnl, condition, 0, 0, 0);
	4324
	4325	lock_flush(jnl);
	4326
	4327	while (*condition == TRUE)
	4328	msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL);
	4329
	4330	*condition = TRUE;
	4331	unlock_flush(jnl);
	4332
	4333	KERNEL_DEBUG(0xbbbbc020\|DBG_FUNC_END, jnl, condition, 0, 0, 0);
	4334	}
	4335
	4336	static void
	4337	wait_condition(journal jnl, boolean_t condition, const char *condition_name)
	4338	{
	4339
	4340	if (*condition == FALSE)
	4341	return;
	4342
	4343	KERNEL_DEBUG(0xbbbbc02c\|DBG_FUNC_START, jnl, condition, 0, 0, 0);
	4344
	4345	lock_flush(jnl);
	4346
	4347	while (*condition == TRUE)
	4348	msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL);
	4349
	4350	unlock_flush(jnl);
	4351
	4352	KERNEL_DEBUG(0xbbbbc02c\|DBG_FUNC_END, jnl, condition, 0, 0, 0);
	4353	}
	4354
	4355	static void
	4356	unlock_condition(journal jnl, boolean_t condition)
	4357	{
	4358	lock_flush(jnl);
	4359
	4360	*condition = FALSE;
	4361	wakeup(condition);
	4362
	4363	unlock_flush(jnl);
	4364	}
	4365
	4366	static void
	4367	abort_transaction(journal jnl, transaction tr)
	4368	{
	4369	block_list_header blhdr, next;
	4370
	4371	// for each block list header, iterate over the blocks then
	4372	// free up the memory associated with the block list.
	4373	//
	4374	// find each of the primary blocks (i.e. the list could
	4375	// contain a mix of shadowed and real buf_t's depending
	4376	// on when the abort condition was detected) and mark them
	4377	// clean and locked in the cache... this at least allows
	4378	// the FS a consistent view between it's incore data structures
	4379	// and the meta-data held in the cache
	4380	//
	4381	KERNEL_DEBUG(0xbbbbc034\|DBG_FUNC_START, jnl, tr, 0, 0, 0);
	4382
	4383	for (blhdr = tr->blhdr; blhdr; blhdr = next) {
	4384	int i;
	4385
	4386	for (i = 1; i < blhdr->num_blocks; i++) {
	4387	buf_t bp, tbp, sbp;
	4388	vnode_t bp_vp;
	4389	errno_t errno;
	4390
	4391	if (blhdr->binfo[i].bnum == (off_t)-1)
	4392	continue;
	4393
	4394	tbp = blhdr->binfo[i].u.bp;
	4395
	4396	bp_vp = buf_vnode(tbp);
	4397
	4398	buf_setfilter(tbp, NULL, NULL, NULL, NULL);
	4399
	4400	if (buf_shadow(tbp))
	4401	sbp = tbp;
	4402	else
	4403	sbp = NULL;
	4404
	4405	if (bp_vp) {
	4406	errno = buf_meta_bread(bp_vp,
	4407	buf_lblkno(tbp),
	4408	buf_size(tbp),
	4409	NOCRED,
	4410	&bp);
	4411	if (errno == 0) {
	4412	if (sbp == NULL && bp != tbp && (buf_flags(tbp) & B_LOCKED)) {
	4413	panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n",
	4414	bp, tbp, jnl);
	4415	}
	4416	/*
	4417	* once the journal has been marked INVALID and aborted,
	4418	* NO meta data can be written back to the disk, so
	4419	* mark the buf_t clean and make sure it's locked in the cache
	4420	* note: if we found a shadow, the real buf_t needs to be relocked
	4421	*/
	4422	buf_setflags(bp, B_LOCKED);
	4423	buf_markclean(bp);
	4424	buf_brelse(bp);
	4425
	4426	KERNEL_DEBUG(0xbbbbc034\|DBG_FUNC_NONE, jnl, tr, bp, 0, 0);
	4427
	4428	/*
	4429	* this undoes the vnode_ref() in journal_modify_block_end()
	4430	*/
	4431	vnode_rele_ext(bp_vp, 0, 1);
	4432	} else {
	4433	printf("jnl: %s: abort_tr: could not find block %lld vp %p!\n",
	4434	jnl->jdev_name, blhdr->binfo[i].bnum, tbp);
	4435	if (bp) {
	4436	buf_brelse(bp);
	4437	}
	4438	}
	4439	}
	4440	if (sbp)
	4441	buf_brelse(sbp);
	4442	}
	4443	next = (block_list_header *)((long)blhdr->binfo[0].bnum);
	4444
	4445	// we can free blhdr here since we won't need it any more
	4446	blhdr->binfo[0].bnum = 0xdeadc0de;
	4447	kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
	4448	}
	4449
	4450	/*
	4451	* If the transaction we're aborting was the async transaction, then
	4452	* tell the current transaction that there is no pending trim
	4453	* any more.
	4454	*/
	4455	lck_rw_lock_exclusive(&jnl->trim_lock);
	4456	if (jnl->async_trim == &tr->trim)
	4457	jnl->async_trim = NULL;
	4458	lck_rw_unlock_exclusive(&jnl->trim_lock);
	4459
	4460
	4461	if (tr->trim.extents) {
	4462	kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
	4463	}
	4464	tr->trim.allocated_count = 0;
	4465	tr->trim.extent_count = 0;
	4466	tr->trim.extents = NULL;
	4467	tr->tbuffer = NULL;
	4468	tr->blhdr = NULL;
	4469	tr->total_bytes = 0xdbadc0de;
	4470	FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
	4471
	4472	KERNEL_DEBUG(0xbbbbc034\|DBG_FUNC_END, jnl, tr, 0, 0, 0);
	4473	}
	4474
	4475
	4476	int
	4477	journal_end_transaction(journal *jnl)
	4478	{
	4479	int ret;
	4480	transaction *tr;
	4481
	4482	CHECK_JOURNAL(jnl);
	4483
	4484	free_old_stuff(jnl);
	4485
	4486	if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
	4487	return 0;
	4488	}
	4489
	4490	if (jnl->owner != current_thread()) {
	4491	panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
	4492	jnl, jnl->owner, current_thread());
	4493	}
	4494	jnl->nested_count--;
	4495
	4496	if (jnl->nested_count > 0) {
	4497	return 0;
	4498	} else if (jnl->nested_count < 0) {
	4499	panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
	4500	}
	4501
	4502	if (jnl->flags & JOURNAL_INVALID) {
	4503	if (jnl->active_tr) {
	4504	if (jnl->cur_tr != NULL) {
	4505	panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
	4506	jnl, jnl->active_tr, jnl->cur_tr);
	4507	}
	4508	tr = jnl->active_tr;
	4509	jnl->active_tr = NULL;
	4510
	4511	abort_transaction(jnl, tr);
	4512	}
	4513	journal_unlock(jnl);
	4514
	4515	return EINVAL;
	4516	}
	4517
	4518	tr = jnl->active_tr;
	4519	CHECK_TRANSACTION(tr);
	4520
	4521	// clear this out here so that when check_free_space() calls
	4522	// the FS flush function, we don't panic in journal_flush()
	4523	// if the FS were to call that. note: check_free_space() is
	4524	// called from end_transaction().
	4525	//
	4526	jnl->active_tr = NULL;
	4527
	4528	/* Examine the force-journal-flush state in the active txn */
	4529	if (tr->flush_on_completion == TRUE) {
	4530	/*
	4531	* If the FS requested it, disallow group commit and force the
	4532	* transaction out to disk immediately.
	4533	*/
	4534	ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE);
	4535	}
	4536	else {
	4537	/* in the common path we can simply use the double-buffered journal */
	4538	ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
	4539	}
	4540
	4541	return ret;
	4542	}
	4543
	4544
	4545	/*
	4546	* Flush the contents of the journal to the disk.
	4547	*
	4548	* Input:
	4549	* wait_for_IO -
	4550	* If TRUE, wait to write in-memory journal to the disk
	4551	* consistently, and also wait to write all asynchronous
	4552	* metadata blocks to its corresponding locations
	4553	* consistently on the disk. This means that the journal
	4554	* is empty at this point and does not contain any
	4555	* transactions. This is overkill in normal scenarios
	4556	* but is useful whenever the metadata blocks are required
	4557	* to be consistent on-disk instead of just the journal
	4558	* being consistent; like before live verification
	4559	* and live volume resizing.
	4560	*
	4561	* If FALSE, only wait to write in-memory journal to the
	4562	* disk consistently. This means that the journal still
	4563	* contains uncommitted transactions and the file system
	4564	* metadata blocks in the journal transactions might be
	4565	* written asynchronously to the disk. But there is no
	4566	* guarantee that they are written to the disk before
	4567	* returning to the caller. Note that this option is
	4568	* sufficient for file system data integrity as it
	4569	* guarantees consistent journal content on the disk.
	4570	*/
	4571	int
	4572	journal_flush(journal *jnl, boolean_t wait_for_IO)
	4573	{
	4574	boolean_t drop_lock = FALSE;
	4575
	4576	CHECK_JOURNAL(jnl);
	4577
	4578	free_old_stuff(jnl);
	4579
	4580	if (jnl->flags & JOURNAL_INVALID) {
	4581	return -1;
	4582	}
	4583
	4584	KERNEL_DEBUG(DBG_JOURNAL_FLUSH \| DBG_FUNC_START, jnl, 0, 0, 0, 0);
	4585
	4586	if (jnl->owner != current_thread()) {
	4587	journal_lock(jnl);
	4588	drop_lock = TRUE;
	4589	}
	4590
	4591	// if we're not active, flush any buffered transactions
	4592	if (jnl->active_tr == NULL && jnl->cur_tr) {
	4593	transaction *tr = jnl->cur_tr;
	4594
	4595	jnl->cur_tr = NULL;
	4596
	4597	if (wait_for_IO) {
	4598	wait_condition(jnl, &jnl->flushing, "journal_flush");
	4599	wait_condition(jnl, &jnl->asyncIO, "journal_flush");
	4600	}
	4601	/*
	4602	* "end_transction" will wait for any current async flush
	4603	* to complete, before flushing "cur_tr"... because we've
	4604	* specified the 'must_wait' arg as TRUE, it will then
	4605	* synchronously flush the "cur_tr"
	4606	*/
	4607	end_transaction(tr, 1, NULL, NULL, drop_lock, TRUE); // force it to get flushed
	4608
	4609	} else {
	4610	if (drop_lock == TRUE) {
	4611	journal_unlock(jnl);
	4612	}
	4613
	4614	/* Because of pipelined journal, the journal transactions
	4615	* might be in process of being flushed on another thread.
	4616	* If there is nothing to flush currently, we should
	4617	* synchronize ourselves with the pipelined journal thread
	4618	* to ensure that all inflight transactions, if any, are
	4619	* flushed before we return success to caller.
	4620	*/
	4621	wait_condition(jnl, &jnl->flushing, "journal_flush");
	4622	}
	4623	if (wait_for_IO) {
	4624	wait_condition(jnl, &jnl->asyncIO, "journal_flush");
	4625	}
	4626
	4627	KERNEL_DEBUG(DBG_JOURNAL_FLUSH \| DBG_FUNC_END, jnl, 0, 0, 0, 0);
	4628
	4629	return 0;
	4630	}
	4631
	4632	int
	4633	journal_active(journal *jnl)
	4634	{
	4635	if (jnl->flags & JOURNAL_INVALID) {
	4636	return -1;
	4637	}
	4638
	4639	return (jnl->active_tr == NULL) ? 0 : 1;
	4640	}
	4641
	4642	void *
	4643	journal_owner(journal *jnl)
	4644	{
	4645	return jnl->owner;
	4646	}
	4647
	4648	int journal_uses_fua(journal *jnl)
	4649	{
	4650	if (jnl->flags & JOURNAL_DO_FUA_WRITES)
	4651	return 1;
	4652	return 0;
	4653	}
	4654
	4655	/*
	4656	* Relocate the journal.
	4657	*
	4658	* You provide the new starting offset and size for the journal. You may
	4659	* optionally provide a new tbuffer_size; passing zero defaults to not
	4660	* changing the tbuffer size except as needed to fit within the new journal
	4661	* size.
	4662	*
	4663	* You must have already started a transaction. The transaction may contain
	4664	* modified blocks (such as those needed to deallocate the old journal,
	4665	* allocate the new journal, and update the location and size of the journal
	4666	* in filesystem-private structures). Any transactions prior to the active
	4667	* transaction will be flushed to the old journal. The new journal will be
	4668	* initialized, and the blocks from the active transaction will be written to
	4669	* the new journal.
	4670	*
	4671	* The caller will need to update the structures that identify the location
	4672	* and size of the journal. These updates should be made in the supplied
	4673	* callback routine. These updates must NOT go into a transaction. You should
	4674	* force these updates to the media before returning from the callback. In the
	4675	* even of a crash, either the old journal will be found, with an empty journal,
	4676	* or the new journal will be found with the contents of the active transaction.
	4677	*
	4678	* Upon return from the callback, the blocks from the active transaction are
	4679	* written to their normal locations on disk.
	4680	*
	4681	* (Remember that we have to ensure that blocks get committed to the journal
	4682	* before being committed to their normal locations. But the blocks don't count
	4683	* as committed until the new journal is pointed at.)
	4684	*
	4685	* Upon return, there is still an active transaction: newly allocated, and
	4686	* with no modified blocks. Call journal_end_transaction as normal. You may
	4687	* modifiy additional blocks before calling journal_end_transaction, and those
	4688	* blocks will (eventually) go to the relocated journal.
	4689	*
	4690	* Inputs:
	4691	* jnl The (opened) journal to relocate.
	4692	* offset The new journal byte offset (from start of the journal device).
	4693	* journal_size The size, in bytes, of the new journal.
	4694	* tbuffer_size The new desired transaction buffer size. Pass zero to keep
	4695	* the same size as the current journal. The size will be
	4696	* modified as needed to fit the new journal.
	4697	* callback Routine called after the new journal has been initialized,
	4698	* and the active transaction written to the new journal, but
	4699	* before the blocks are written to their normal locations.
	4700	* Pass NULL for no callback.
	4701	* callback_arg An argument passed to the callback routine.
	4702	*
	4703	* Result:
	4704	* 0 No errors
	4705	* EINVAL The offset is not block aligned
	4706	* EINVAL The journal_size is not a multiple of the block size
	4707	* EINVAL The journal is invalid
	4708	* (any) An error returned by journal_flush.
	4709	*
	4710	*/
	4711	int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size,
	4712	errno_t (callback)(void ), void *callback_arg)
	4713	{
	4714	int ret;
	4715	transaction *tr;
	4716	size_t i = 0;
	4717
	4718	/*
	4719	* Sanity check inputs, and adjust the size of the transaction buffer.
	4720	*/
	4721	if ((offset % jnl->jhdr->jhdr_size) != 0) {
	4722	printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n",
	4723	jnl->jdev_name, offset, jnl->jhdr->jhdr_size);
	4724	return EINVAL;
	4725	}
	4726	if ((journal_size % jnl->jhdr->jhdr_size) != 0) {
	4727	printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n",
	4728	jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size);
	4729	return EINVAL;
	4730	}
	4731
	4732	CHECK_JOURNAL(jnl);
	4733
	4734	/* Guarantee we own the active transaction. */
	4735	if (jnl->flags & JOURNAL_INVALID) {
	4736	return EINVAL;
	4737	}
	4738	if (jnl->owner != current_thread()) {
	4739	panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n",
	4740	jnl, jnl->owner, current_thread());
	4741	}
	4742
	4743	if (tbuffer_size == 0)
	4744	tbuffer_size = jnl->tbuffer_size;
	4745	size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size);
	4746
	4747	/*
	4748	* Flush any non-active transactions. We have to temporarily hide the
	4749	* active transaction to make journal_flush flush out non-active but
	4750	* current (unwritten) transactions.
	4751	*/
	4752	tr = jnl->active_tr;
	4753	CHECK_TRANSACTION(tr);
	4754	jnl->active_tr = NULL;
	4755	ret = journal_flush(jnl, TRUE);
	4756	jnl->active_tr = tr;
	4757
	4758	if (ret) {
	4759	return ret;
	4760	}
	4761	wait_condition(jnl, &jnl->flushing, "end_transaction");
	4762
	4763	/*
	4764	* At this point, we have completely flushed the contents of the current
	4765	* journal to disk (and have asynchronously written all of the txns to
	4766	* their actual desired locations). As a result, we can (and must) clear
	4767	* out the old_start array. If we do not, then if the last written transaction
	4768	* started at the beginning of the journal (starting 1 block into the
	4769	* journal file) it could confuse the buffer_flushed callback. This is
	4770	* because we're about to reset the start/end pointers of the journal header
	4771	* below.
	4772	*/
	4773	lock_oldstart(jnl);
	4774	for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) {
	4775	jnl->old_start[i] = 0;
	4776	}
	4777	unlock_oldstart(jnl);
	4778
	4779	/* Update the journal's offset and size in memory. */
	4780	jnl->jdev_offset = offset;
	4781	jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
	4782	jnl->jhdr->size = journal_size;
	4783	jnl->active_start = jnl->jhdr->start;
	4784
	4785	/*
	4786	* Force the active transaction to be written to the new journal. Call the
	4787	* supplied callback after the blocks have been written to the journal, but
	4788	* before they get written to their normal on-disk locations.
	4789	*/
	4790	jnl->active_tr = NULL;
	4791	ret = end_transaction(tr, 1, callback, callback_arg, FALSE, TRUE);
	4792	if (ret) {
	4793	printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret);
	4794	goto bad_journal;
	4795	}
	4796
	4797	/*
	4798	* Create a new, empty transaction to be the active transaction. This way
	4799	* our caller can use journal_end_transaction as usual.
	4800	*/
	4801	ret = journal_allocate_transaction(jnl);
	4802	if (ret) {
	4803	printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl->jdev_name, ret);
	4804	goto bad_journal;
	4805	}
	4806
	4807	return 0;
	4808
	4809	bad_journal:
	4810	jnl->flags \|= JOURNAL_INVALID;
	4811	abort_transaction(jnl, tr);
	4812	return ret;
	4813	}
	4814
	4815
	4816	#else // !JOURNALING - so provide stub functions
	4817
	4818	int journal_uses_fua(__unused journal *jnl)
	4819	{
	4820	return 0;
	4821	}
	4822
	4823	journal *
	4824	journal_create(__unused struct vnode *jvp,
	4825	__unused off_t offset,
	4826	__unused off_t journal_size,
	4827	__unused struct vnode *fsvp,
	4828	__unused size_t min_fs_blksz,
	4829	__unused int32_t flags,
	4830	__unused int32_t tbuffer_size,
	4831	__unused void (flush)(void arg),
	4832	__unused void *arg,
	4833	__unused struct mount *fsmount)
	4834	{
	4835	return NULL;
	4836	}
	4837
	4838	journal *
	4839	journal_open(__unused struct vnode *jvp,
	4840	__unused off_t offset,
	4841	__unused off_t journal_size,
	4842	__unused struct vnode *fsvp,
	4843	__unused size_t min_fs_blksz,
	4844	__unused int32_t flags,
	4845	__unused int32_t tbuffer_size,
	4846	__unused void (flush)(void arg),
	4847	__unused void *arg,
	4848	__unused struct mount *fsmount)
	4849	{
	4850	return NULL;
	4851	}
	4852
	4853
	4854	int
	4855	journal_modify_block_start(__unused journal jnl, __unused struct buf bp)
	4856	{
	4857	return EINVAL;
	4858	}
	4859
	4860	int
	4861	journal_modify_block_end(__unused journal *jnl,
	4862	__unused struct buf *bp,
	4863	__unused void (func)(struct buf bp, void *arg),
	4864	__unused void *arg)
	4865	{
	4866	return EINVAL;
	4867	}
	4868
	4869	int
	4870	journal_kill_block(__unused journal jnl, __unused struct buf bp)
	4871	{
	4872	return EINVAL;
	4873	}
	4874
	4875	int journal_relocate(__unused journal *jnl,
	4876	__unused off_t offset,
	4877	__unused off_t journal_size,
	4878	__unused int32_t tbuffer_size,
	4879	__unused errno_t (callback)(void ),
	4880	__unused void *callback_arg)
	4881	{
	4882	return EINVAL;
	4883	}
	4884
	4885	void
	4886	journal_close(__unused journal *jnl)
	4887	{
	4888	}
	4889
	4890	int
	4891	journal_start_transaction(__unused journal *jnl)
	4892	{
	4893	return EINVAL;
	4894	}
	4895
	4896	int
	4897	journal_end_transaction(__unused journal *jnl)
	4898	{
	4899	return EINVAL;
	4900	}
	4901
	4902	int
	4903	journal_flush(__unused journal *jnl, __unused boolean_t wait_for_IO)
	4904	{
	4905	return EINVAL;
	4906	}
	4907
	4908	int
	4909	journal_is_clean(__unused struct vnode *jvp,
	4910	__unused off_t offset,
	4911	__unused off_t journal_size,
	4912	__unused struct vnode *fsvp,
	4913	__unused size_t min_fs_block_size)
	4914	{
	4915	return 0;
	4916	}
	4917
	4918
	4919	void *
	4920	journal_owner(__unused journal *jnl)
	4921	{
	4922	return NULL;
	4923	}
	4924
	4925	void
	4926	journal_lock(__unused journal *jnl)
	4927	{
	4928	return;
	4929	}
	4930
	4931	void
	4932	journal_unlock(__unused journal *jnl)
	4933	{
	4934	return;
	4935	}
	4936
	4937	__private_extern__ int
	4938	journal_trim_add_extent(__unused journal *jnl,
	4939	__unused uint64_t offset,
	4940	__unused uint64_t length)
	4941	{
	4942	return 0;
	4943	}
	4944
	4945	int
	4946	journal_request_immediate_flush(__unused journal *jnl)
	4947	{
	4948	return 0;
	4949	}
	4950
	4951	__private_extern__ int
	4952	journal_trim_remove_extent(__unused journal *jnl,
	4953	__unused uint64_t offset,
	4954	__unused uint64_t length)
	4955	{
	4956	return 0;
	4957	}
	4958
	4959	int journal_trim_extent_overlap(__unused journal *jnl,
	4960	__unused uint64_t offset,
	4961	__unused uint64_t length,
	4962	__unused uint64_t *end)
	4963	{
	4964	return 0;
	4965	}
	4966
	4967	#endif // !JOURNALING