/*
 * Copyright (c) 1995-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
29 // This file implements a simple write-ahead journaling layer.
30 // In theory any file system can make use of it by calling these
31 // functions when the fs wants to modify meta-data blocks. See
32 // vfs_journal.h for a more detailed description of the api and
35 // Dominic Giampaolo (dbg@apple.com)
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/file_internal.h>
45 #include <sys/buf_internal.h>
46 #include <sys/proc_internal.h>
47 #include <sys/mount_internal.h>
48 #include <sys/namei.h>
49 #include <sys/vnode_internal.h>
50 #include <sys/ioctl.h>
53 #include <sys/malloc.h>
54 #include <kern/thread.h>
56 #include <sys/kdebug.h>
57 #include <miscfs/specfs/specdev.h>
58 #include <libkern/OSAtomic.h> /* OSAddAtomic */
60 extern task_t kernel_task
;
62 #define DBG_JOURNAL_FLUSH 1
74 #include <sys/types.h>
79 #include "vfs_journal.h"
81 /* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */
82 __private_extern__
void qsort(
86 int (*)(const void *, const void *));
90 // number of bytes to checksum in a block_list_header
91 // NOTE: this should be enough to clear out the header
92 // fields as well as the first entry of binfo[]
93 #define BLHDR_CHECKSUM_SIZE 32
96 static int end_transaction(transaction
*tr
, int force_it
, errno_t (*callback
)(void*), void *callback_arg
);
97 static void abort_transaction(journal
*jnl
, transaction
*tr
);
98 static void dump_journal(journal
*jnl
);
100 static __inline__
void lock_journal(journal
*jnl
);
101 static __inline__
void unlock_journal(journal
*jnl
);
102 static __inline__
void lock_oldstart(journal
*jnl
);
103 static __inline__
void unlock_oldstart(journal
*jnl
);
109 // 3105942 - Coalesce writes to the same block on journal replay
112 typedef struct bucket
{
119 #define STARTING_BUCKETS 256
121 static int add_block(journal
*jnl
, struct bucket
**buf_ptr
, off_t block_num
, size_t size
, size_t offset
, int32_t cksum
, int *num_buckets_ptr
, int *num_full_ptr
);
122 static int grow_table(struct bucket
**buf_ptr
, int num_buckets
, int new_size
);
123 static int lookup_bucket(struct bucket
**buf_ptr
, off_t block_num
, int num_full
);
124 static int do_overlap(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t block_num
, size_t size
, size_t offset
, int32_t cksum
, int *num_buckets_ptr
, int *num_full_ptr
);
125 static int insert_block(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t num
, size_t size
, size_t offset
, int32_t cksum
, int *num_buckets_ptr
, int *num_full_ptr
, int overwriting
);
127 #define CHECK_JOURNAL(jnl) \
130 panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
132 if (jnl->jdev == NULL) { \
133 panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
135 if (jnl->fsdev == NULL) { \
136 panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
138 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
139 panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
140 __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\
142 if ( jnl->jhdr->start <= 0 \
143 || jnl->jhdr->start > jnl->jhdr->size\
144 || jnl->jhdr->start > 1024*1024*1024) {\
145 panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
146 __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\
148 if ( jnl->jhdr->end <= 0 \
149 || jnl->jhdr->end > jnl->jhdr->size\
150 || jnl->jhdr->end > 1024*1024*1024) {\
151 panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
152 __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\
154 if (jnl->jhdr->size > 1024*1024*1024) {\
155 panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
156 __FILE__, __LINE__, jnl->jhdr->size);\
160 #define CHECK_TRANSACTION(tr) \
163 panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
165 if (tr->jnl == NULL) {\
166 panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
168 if (tr->blhdr != (block_list_header *)tr->tbuffer) {\
169 panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\
171 if (tr->total_bytes < 0) {\
172 panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\
174 if (tr->journal_start < 0 || tr->journal_start > 1024*1024*1024) {\
175 panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\
177 if (tr->journal_end < 0 || tr->journal_end > 1024*1024*1024) {\
178 panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\
180 if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) {\
181 panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\
188 // this isn't a great checksum routine but it will do for now.
189 // we use it to checksum the journal header and the block list
190 // headers that are at the start of each transaction.
// this isn't a great checksum routine but it will do for now.
// we use it to checksum the journal header and the block list
// headers that are at the start of each transaction.
//
// Folds each byte of ptr[0..len) into a rolling 32-bit accumulator
// and returns the bitwise complement of the result.  Deterministic,
// order-sensitive, and cheap; NOT cryptographic.
//
// ptr: bytes to checksum (read-only; not modified despite non-const type)
// len: number of bytes to fold in
// returns: ~cksum (so an all-zero input still yields a non-zero value)
static int
calc_checksum(char *ptr, int len)
{
    int i, cksum = 0;

    // this is a lame checksum but for now it'll do
    for (i = 0; i < len; i++, ptr++) {
        // shift the running value left one byte and mix in the next byte
        cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
    }

    return (~cksum);
}
208 lck_grp_attr_t
* jnl_group_attr
;
209 lck_attr_t
* jnl_lock_attr
;
210 lck_grp_t
* jnl_mutex_group
;
215 jnl_lock_attr
= lck_attr_alloc_init();
216 jnl_group_attr
= lck_grp_attr_alloc_init();
217 jnl_mutex_group
= lck_grp_alloc_init("jnl-mutex", jnl_group_attr
);
220 static __inline__
void
221 lock_journal(journal
*jnl
)
223 lck_mtx_lock(&jnl
->jlock
);
226 static __inline__
void
227 unlock_journal(journal
*jnl
)
229 lck_mtx_unlock(&jnl
->jlock
);
232 static __inline__
void
233 lock_oldstart(journal
*jnl
)
235 lck_mtx_lock(&jnl
->old_start_lock
);
238 static __inline__
void
239 unlock_oldstart(journal
*jnl
)
241 lck_mtx_unlock(&jnl
->old_start_lock
);
246 #define JNL_WRITE 0x0001
247 #define JNL_READ 0x0002
248 #define JNL_HEADER 0x8000
251 // This function sets up a fake buf and passes it directly to the
252 // journal device strategy routine (so that it won't get cached in
255 // It also handles range checking the i/o so that we don't write
256 // outside the journal boundaries and it will wrap the i/o back
257 // to the beginning if necessary (skipping over the journal header)
260 do_journal_io(journal
*jnl
, off_t
*offset
, void *data
, size_t len
, int direction
)
267 if (*offset
< 0 || *offset
> jnl
->jhdr
->size
) {
268 panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset
, jnl
->jhdr
->size
);
271 if (direction
& JNL_WRITE
)
272 max_iosize
= jnl
->max_write_size
;
273 else if (direction
& JNL_READ
)
274 max_iosize
= jnl
->max_read_size
;
276 max_iosize
= 128 * 1024;
279 bp
= alloc_io_buf(jnl
->jdev
, 1);
281 if (*offset
+ (off_t
)curlen
> jnl
->jhdr
->size
&& *offset
!= 0 && jnl
->jhdr
->size
!= 0) {
282 if (*offset
== jnl
->jhdr
->size
) {
283 *offset
= jnl
->jhdr
->jhdr_size
;
285 curlen
= (off_t
)jnl
->jhdr
->size
- *offset
;
289 if (curlen
> max_iosize
) {
294 panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %lu\n", curlen
, *offset
, len
);
297 if (*offset
== 0 && (direction
& JNL_HEADER
) == 0) {
298 panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen
, data
);
301 if (direction
& JNL_READ
)
302 buf_setflags(bp
, B_READ
);
305 * don't have to set any flags
307 vnode_startwrite(jnl
->jdev
);
309 buf_setsize(bp
, curlen
);
310 buf_setcount(bp
, curlen
);
311 buf_setdataptr(bp
, (uintptr_t)data
);
312 buf_setblkno(bp
, (daddr64_t
) ((jnl
->jdev_offset
+ *offset
) / (off_t
)jnl
->jhdr
->jhdr_size
));
313 buf_setlblkno(bp
, (daddr64_t
) ((jnl
->jdev_offset
+ *offset
) / (off_t
)jnl
->jhdr
->jhdr_size
));
314 if ((direction
& JNL_WRITE
) && (jnl
->flags
& JOURNAL_DO_FUA_WRITES
)) {
318 err
= VNOP_STRATEGY(bp
);
320 err
= (int)buf_biowait(bp
);
325 printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl
->jdev_name
, err
);
332 // handle wrap-around
333 data
= (char *)data
+ curlen
;
334 curlen
= len
- io_sz
;
335 if (*offset
>= jnl
->jhdr
->size
) {
336 *offset
= jnl
->jhdr
->jhdr_size
;
345 read_journal_data(journal
*jnl
, off_t
*offset
, void *data
, size_t len
)
347 return do_journal_io(jnl
, offset
, data
, len
, JNL_READ
);
351 write_journal_data(journal
*jnl
, off_t
*offset
, void *data
, size_t len
)
353 return do_journal_io(jnl
, offset
, data
, len
, JNL_WRITE
);
358 read_journal_header(journal
*jnl
, void *data
, size_t len
)
360 off_t hdr_offset
= 0;
362 return do_journal_io(jnl
, &hdr_offset
, data
, len
, JNL_READ
|JNL_HEADER
);
366 write_journal_header(journal
*jnl
)
368 static int num_err_prints
= 0;
370 off_t jhdr_offset
= 0;
371 struct vfs_context context
;
373 context
.vc_thread
= current_thread();
374 context
.vc_ucred
= NOCRED
;
376 // Flush the track cache if we're not doing force-unit-access
379 if ((jnl
->flags
& JOURNAL_DO_FUA_WRITES
) == 0) {
380 ret
= VNOP_IOCTL(jnl
->jdev
, DKIOCSYNCHRONIZECACHE
, NULL
, FWRITE
, &context
);
384 // Only print this error if it's a different error than the
385 // previous one, or if it's the first time for this device
386 // or if the total number of printfs is less than 25. We
387 // allow for up to 25 printfs to insure that some make it
388 // into the on-disk syslog. Otherwise if we only printed
389 // one, it's possible it would never make it to the syslog
390 // for the root volume and that makes debugging hard.
392 if ( ret
!= jnl
->last_flush_err
393 || (jnl
->flags
& JOURNAL_FLUSHCACHE_ERR
) == 0
394 || num_err_prints
++ < 25) {
396 printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl
->jdev_name
, ret
);
398 jnl
->flags
|= JOURNAL_FLUSHCACHE_ERR
;
399 jnl
->last_flush_err
= ret
;
403 jnl
->jhdr
->checksum
= 0;
404 jnl
->jhdr
->checksum
= calc_checksum((char *)jnl
->jhdr
, JOURNAL_HEADER_CKSUM_SIZE
);
405 if (do_journal_io(jnl
, &jhdr_offset
, jnl
->header_buf
, jnl
->jhdr
->jhdr_size
, JNL_WRITE
|JNL_HEADER
) != (size_t)jnl
->jhdr
->jhdr_size
) {
406 printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl
->jdev_name
);
407 jnl
->flags
|= JOURNAL_INVALID
;
411 // If we're not doing force-unit-access writes, then we
412 // have to flush after writing the journal header so that
413 // a future transaction doesn't sneak out to disk before
414 // the header does and thus overwrite data that the old
415 // journal header refers to. Saw this exact case happen
416 // on an IDE bus analyzer with Larry Barras so while it
417 // may seem obscure, it's not.
419 if ((jnl
->flags
& JOURNAL_DO_FUA_WRITES
) == 0) {
420 VNOP_IOCTL(jnl
->jdev
, DKIOCSYNCHRONIZECACHE
, NULL
, FWRITE
, &context
);
429 // this is a work function used to free up transactions that
430 // completed. they can't be free'd from buffer_flushed_callback
431 // because it is called from deep with the disk driver stack
432 // and thus can't do something that would potentially cause
433 // paging. it gets called by each of the journal api entry
434 // points so stuff shouldn't hang around for too long.
437 free_old_stuff(journal
*jnl
)
439 transaction
*tr
, *next
;
443 jnl
->tr_freeme
= NULL
;
444 unlock_oldstart(jnl
);
448 FREE_ZONE(tr
, sizeof(transaction
), M_JNL_TR
);
456 // This is our callback that lets us know when a buffer has been
457 // flushed to disk. It's called from deep within the driver stack
458 // and thus is quite limited in what it can do. Notably, it can
459 // not initiate any new i/o's or allocate/free memory.
462 buffer_flushed_callback(struct buf
*bp
, void *arg
)
466 transaction
*ctr
, *prev
=NULL
, *next
;
468 int bufsize
, amt_flushed
, total_bytes
;
471 //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n",
472 // bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg);
474 // snarf out the bits we want
475 bufsize
= buf_size(bp
);
476 tr
= (transaction
*)arg
;
478 // then we've already seen it
483 CHECK_TRANSACTION(tr
);
486 if (jnl
->flags
& JOURNAL_INVALID
) {
492 amt_flushed
= tr
->num_killed
;
493 total_bytes
= tr
->total_bytes
;
495 // update the number of blocks that have been flushed.
496 // this buf may represent more than one block so take
497 // that into account.
499 // OSAddAtomic() returns the value of tr->num_flushed before the add
501 amt_flushed
+= OSAddAtomic(bufsize
, (SInt32
*)&tr
->num_flushed
);
504 // if this transaction isn't done yet, just return as
505 // there is nothing to do.
507 // NOTE: we are careful to not reference anything through
508 // the tr pointer after doing the OSAddAtomic(). if
509 // this if statement fails then we are the last one
510 // and then it's ok to dereference "tr".
512 if ((amt_flushed
+ bufsize
) < total_bytes
) {
516 // this will single thread checking the transaction
519 if (tr
->total_bytes
== (int)0xfbadc0de) {
520 // then someone beat us to it...
521 unlock_oldstart(jnl
);
525 // mark this so that we're the owner of dealing with the
526 // cleanup for this transaction
527 tr
->total_bytes
= 0xfbadc0de;
529 //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
530 // tr, tr->journal_start, tr->journal_end, jnl);
532 // find this entry in the old_start[] index and mark it completed
533 for(i
=0; i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]); i
++) {
535 if ((off_t
)(jnl
->old_start
[i
] & ~(0x8000000000000000ULL
)) == tr
->journal_start
) {
536 jnl
->old_start
[i
] &= ~(0x8000000000000000ULL
);
541 if (i
>= sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0])) {
542 panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
543 tr
->journal_start
, tr
, jnl
);
547 // if we are here then we need to update the journal header
548 // to reflect that this transaction is complete
549 if (tr
->journal_start
== jnl
->active_start
) {
550 jnl
->active_start
= tr
->journal_end
;
551 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
554 // go through the completed_trs list and try to coalesce
555 // entries, restarting back at the beginning if we have to.
556 for(ctr
=jnl
->completed_trs
; ctr
; prev
=ctr
, ctr
=next
) {
557 if (ctr
->journal_start
== jnl
->active_start
) {
558 jnl
->active_start
= ctr
->journal_end
;
560 prev
->next
= ctr
->next
;
562 if (ctr
== jnl
->completed_trs
) {
563 jnl
->completed_trs
= ctr
->next
;
566 next
= jnl
->completed_trs
; // this starts us over again
567 ctr
->next
= jnl
->tr_freeme
;
568 jnl
->tr_freeme
= ctr
;
570 } else if (tr
->journal_end
== ctr
->journal_start
) {
571 ctr
->journal_start
= tr
->journal_start
;
572 next
= jnl
->completed_trs
; // this starts us over again
574 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
575 } else if (tr
->journal_start
== ctr
->journal_end
) {
576 ctr
->journal_end
= tr
->journal_end
;
578 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
579 } else if (ctr
->next
&& ctr
->journal_end
== ctr
->next
->journal_start
) {
580 // coalesce the next entry with this one and link the next
581 // entry in at the head of the tr_freeme list
582 next
= ctr
->next
; // temporarily use the "next" variable
583 ctr
->journal_end
= next
->journal_end
;
584 ctr
->next
= next
->next
;
585 next
->next
= jnl
->tr_freeme
; // link in the next guy at the head of the tr_freeme list
586 jnl
->tr_freeme
= next
;
588 next
= jnl
->completed_trs
; // this starts us over again
595 // if this is true then we didn't merge with anyone
596 // so link ourselves in at the head of the completed
598 if (tr
->journal_start
!= 0) {
599 // put this entry into the correct sorted place
600 // in the list instead of just at the head.
604 for(ctr
=jnl
->completed_trs
; ctr
&& tr
->journal_start
> ctr
->journal_start
; prev
=ctr
, ctr
=ctr
->next
) {
608 if (ctr
== NULL
&& prev
== NULL
) {
609 jnl
->completed_trs
= tr
;
611 } else if (ctr
== jnl
->completed_trs
) {
612 tr
->next
= jnl
->completed_trs
;
613 jnl
->completed_trs
= tr
;
615 tr
->next
= prev
->next
;
619 // if we're here this tr got merged with someone else so
620 // put it on the list to be free'd
621 tr
->next
= jnl
->tr_freeme
;
624 unlock_oldstart(jnl
);
628 #include <libkern/OSByteOrder.h>
630 #define SWAP16(x) OSSwapInt16(x)
631 #define SWAP32(x) OSSwapInt32(x)
632 #define SWAP64(x) OSSwapInt64(x)
636 swap_journal_header(journal
*jnl
)
638 jnl
->jhdr
->magic
= SWAP32(jnl
->jhdr
->magic
);
639 jnl
->jhdr
->endian
= SWAP32(jnl
->jhdr
->endian
);
640 jnl
->jhdr
->start
= SWAP64(jnl
->jhdr
->start
);
641 jnl
->jhdr
->end
= SWAP64(jnl
->jhdr
->end
);
642 jnl
->jhdr
->size
= SWAP64(jnl
->jhdr
->size
);
643 jnl
->jhdr
->blhdr_size
= SWAP32(jnl
->jhdr
->blhdr_size
);
644 jnl
->jhdr
->checksum
= SWAP32(jnl
->jhdr
->checksum
);
645 jnl
->jhdr
->jhdr_size
= SWAP32(jnl
->jhdr
->jhdr_size
);
646 jnl
->jhdr
->sequence_num
= SWAP32(jnl
->jhdr
->sequence_num
);
650 swap_block_list_header(journal
*jnl
, block_list_header
*blhdr
)
654 blhdr
->max_blocks
= SWAP16(blhdr
->max_blocks
);
655 blhdr
->num_blocks
= SWAP16(blhdr
->num_blocks
);
656 blhdr
->bytes_used
= SWAP32(blhdr
->bytes_used
);
657 blhdr
->checksum
= SWAP32(blhdr
->checksum
);
658 blhdr
->flags
= SWAP32(blhdr
->flags
);
660 if (blhdr
->num_blocks
>= ((jnl
->jhdr
->blhdr_size
/ sizeof(block_info
)) - 1)) {
661 printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d). not swapping.\n", jnl
->jdev_name
, blhdr
->num_blocks
, jnl
->jhdr
->blhdr_size
);
665 for(i
=0; i
< blhdr
->num_blocks
; i
++) {
666 blhdr
->binfo
[i
].bnum
= SWAP64(blhdr
->binfo
[i
].bnum
);
667 blhdr
->binfo
[i
].bsize
= SWAP32(blhdr
->binfo
[i
].bsize
);
668 blhdr
->binfo
[i
].b
.cksum
= SWAP32(blhdr
->binfo
[i
].b
.cksum
);
674 update_fs_block(journal
*jnl
, void *block_ptr
, off_t fs_block
, size_t bsize
)
677 struct buf
*oblock_bp
=NULL
;
679 // first read the block we want.
680 ret
= buf_meta_bread(jnl
->fsdev
, (daddr64_t
)fs_block
, bsize
, NOCRED
, &oblock_bp
);
682 printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl
->jdev_name
, fs_block
, ret
);
685 buf_brelse(oblock_bp
);
689 // let's try to be aggressive here and just re-write the block
690 oblock_bp
= buf_getblk(jnl
->fsdev
, (daddr64_t
)fs_block
, bsize
, 0, 0, BLK_META
);
691 if (oblock_bp
== NULL
) {
692 printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl
->jdev_name
, fs_block
);
697 // make sure it's the correct size.
698 if (buf_size(oblock_bp
) != bsize
) {
699 buf_brelse(oblock_bp
);
703 // copy the journal data over top of it
704 memcpy((char *)0 + buf_dataptr(oblock_bp
), block_ptr
, bsize
);
706 if ((ret
= VNOP_BWRITE(oblock_bp
)) != 0) {
707 printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl
->jdev_name
, fs_block
,ret
);
711 // and now invalidate it so that if someone else wants to read
712 // it in a different size they'll be able to do it.
713 ret
= buf_meta_bread(jnl
->fsdev
, (daddr64_t
)fs_block
, bsize
, NOCRED
, &oblock_bp
);
715 buf_markinvalid(oblock_bp
);
716 buf_brelse(oblock_bp
);
723 grow_table(struct bucket
**buf_ptr
, int num_buckets
, int new_size
)
725 struct bucket
*newBuf
;
726 int current_size
= num_buckets
, i
;
728 // return if newsize is less than the current size
729 if (new_size
< num_buckets
) {
733 if ((MALLOC(newBuf
, struct bucket
*, new_size
*sizeof(struct bucket
), M_TEMP
, M_WAITOK
)) == NULL
) {
734 printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
738 // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
740 // copy existing elements
741 bcopy(*buf_ptr
, newBuf
, num_buckets
*sizeof(struct bucket
));
743 // initialize the new ones
744 for(i
=num_buckets
; i
< new_size
; i
++) {
745 newBuf
[i
].block_num
= (off_t
)-1;
748 // free the old container
749 FREE(*buf_ptr
, M_TEMP
);
758 lookup_bucket(struct bucket
**buf_ptr
, off_t block_num
, int num_full
)
760 int lo
, hi
, index
, matches
, i
;
763 return 0; // table is empty, so insert at index=0
770 // perform binary search for block_num
772 int mid
= (hi
- lo
)/2 + lo
;
773 off_t this_num
= (*buf_ptr
)[mid
].block_num
;
775 if (block_num
== this_num
) {
780 if (block_num
< this_num
) {
785 if (block_num
> this_num
) {
791 // check if lo and hi converged on the match
792 if (block_num
== (*buf_ptr
)[hi
].block_num
) {
796 // if no existing entry found, find index for new one
798 index
= (block_num
< (*buf_ptr
)[hi
].block_num
) ? hi
: hi
+ 1;
800 // make sure that we return the right-most index in the case of multiple matches
803 while(i
< num_full
&& block_num
== (*buf_ptr
)[i
].block_num
) {
815 insert_block(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t num
, size_t size
, size_t offset
, int32_t cksum
, int *num_buckets_ptr
, int *num_full_ptr
, int overwriting
)
818 // grow the table if we're out of space
819 if (*num_full_ptr
>= *num_buckets_ptr
) {
820 int new_size
= *num_buckets_ptr
* 2;
821 int grow_size
= grow_table(buf_ptr
, *num_buckets_ptr
, new_size
);
823 if (grow_size
< new_size
) {
824 printf("jnl: %s: add_block: grow_table returned an error!\n", jnl
->jdev_name
);
828 *num_buckets_ptr
= grow_size
; //update num_buckets to reflect the new size
831 // if we're not inserting at the end, we need to bcopy
832 if (blk_index
!= *num_full_ptr
) {
833 bcopy( (*buf_ptr
)+(blk_index
), (*buf_ptr
)+(blk_index
+1), (*num_full_ptr
-blk_index
)*sizeof(struct bucket
) );
836 (*num_full_ptr
)++; // increment only if we're not overwriting
839 // sanity check the values we're about to add
840 if (offset
>= jnl
->jhdr
->size
) {
841 offset
= jnl
->jhdr
->jhdr_size
+ (offset
- jnl
->jhdr
->size
);
844 panic("jnl: insert_block: bad size in insert_block (%lu)\n", size
);
847 (*buf_ptr
)[blk_index
].block_num
= num
;
848 (*buf_ptr
)[blk_index
].block_size
= size
;
849 (*buf_ptr
)[blk_index
].jnl_offset
= offset
;
850 (*buf_ptr
)[blk_index
].cksum
= cksum
;
856 do_overlap(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t block_num
, size_t size
, __unused
size_t offset
, int32_t cksum
, int *num_buckets_ptr
, int *num_full_ptr
)
858 int num_to_remove
, index
, i
, overwrite
, err
;
859 size_t jhdr_size
= jnl
->jhdr
->jhdr_size
, new_offset
;
860 off_t overlap
, block_start
, block_end
;
862 block_start
= block_num
*jhdr_size
;
863 block_end
= block_start
+ size
;
864 overwrite
= (block_num
== (*buf_ptr
)[blk_index
].block_num
&& size
>= (*buf_ptr
)[blk_index
].block_size
);
866 // first, eliminate any overlap with the previous entry
867 if (blk_index
!= 0 && !overwrite
) {
868 off_t prev_block_start
= (*buf_ptr
)[blk_index
-1].block_num
*jhdr_size
;
869 off_t prev_block_end
= prev_block_start
+ (*buf_ptr
)[blk_index
-1].block_size
;
870 overlap
= prev_block_end
- block_start
;
872 if (overlap
% jhdr_size
!= 0) {
873 panic("jnl: do_overlap: overlap with previous entry not a multiple of %lu\n", jhdr_size
);
876 // if the previous entry completely overlaps this one, we need to break it into two pieces.
877 if (prev_block_end
> block_end
) {
878 off_t new_num
= block_end
/ jhdr_size
;
879 size_t new_size
= prev_block_end
- block_end
;
881 new_offset
= (*buf_ptr
)[blk_index
-1].jnl_offset
+ (block_end
- prev_block_start
);
883 err
= insert_block(jnl
, buf_ptr
, blk_index
, new_num
, new_size
, new_offset
, cksum
, num_buckets_ptr
, num_full_ptr
, 0);
885 panic("jnl: do_overlap: error inserting during pre-overlap\n");
889 // Regardless, we need to truncate the previous entry to the beginning of the overlap
890 (*buf_ptr
)[blk_index
-1].block_size
= block_start
- prev_block_start
;
891 (*buf_ptr
)[blk_index
-1].cksum
= 0; // have to blow it away because there's no way to check it
895 // then, bail out fast if there's no overlap with the entries that follow
896 if (!overwrite
&& block_end
<= (*buf_ptr
)[blk_index
].block_num
*jhdr_size
) {
897 return 0; // no overlap, no overwrite
898 } else if (overwrite
&& (blk_index
+ 1 >= *num_full_ptr
|| block_end
<= (*buf_ptr
)[blk_index
+1].block_num
*jhdr_size
)) {
900 (*buf_ptr
)[blk_index
].cksum
= cksum
; // update this
901 return 1; // simple overwrite
904 // Otherwise, find all cases of total and partial overlap. We use the special
905 // block_num of -2 to designate entries that are completely overlapped and must
906 // be eliminated. The block_num, size, and jnl_offset of partially overlapped
907 // entries must be adjusted to keep the array consistent.
910 while(index
< *num_full_ptr
&& block_end
> (*buf_ptr
)[index
].block_num
*jhdr_size
) {
911 if (block_end
>= ((*buf_ptr
)[index
].block_num
*jhdr_size
+ (*buf_ptr
)[index
].block_size
)) {
912 (*buf_ptr
)[index
].block_num
= -2; // mark this for deletion
915 overlap
= block_end
- (*buf_ptr
)[index
].block_num
*jhdr_size
;
917 if (overlap
% jhdr_size
!= 0) {
918 panic("jnl: do_overlap: overlap of %lld is not multiple of %lu\n", overlap
, jhdr_size
);
921 // if we partially overlap this entry, adjust its block number, jnl offset, and size
922 (*buf_ptr
)[index
].block_num
+= (overlap
/ jhdr_size
); // make sure overlap is multiple of jhdr_size, or round up
923 (*buf_ptr
)[index
].cksum
= 0;
925 new_offset
= (*buf_ptr
)[index
].jnl_offset
+ overlap
; // check for wrap-around
926 if (new_offset
>= jnl
->jhdr
->size
) {
927 new_offset
= jhdr_size
+ (new_offset
- jnl
->jhdr
->size
);
929 (*buf_ptr
)[index
].jnl_offset
= new_offset
;
931 (*buf_ptr
)[index
].block_size
-= overlap
; // sanity check for negative value
932 if ((*buf_ptr
)[index
].block_size
<= 0) {
933 panic("jnl: do_overlap: after overlap, new block size is invalid (%lu)\n", (*buf_ptr
)[index
].block_size
);
934 // return -1; // if above panic is removed, return -1 for error
943 // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
944 index
--; // start with the last index used within the above loop
945 while(index
>= blk_index
) {
946 if ((*buf_ptr
)[index
].block_num
== -2) {
947 if (index
== *num_full_ptr
-1) {
948 (*buf_ptr
)[index
].block_num
= -1; // it's the last item in the table... just mark as free
950 bcopy( (*buf_ptr
)+(index
+1), (*buf_ptr
)+(index
), (*num_full_ptr
- (index
+ 1)) * sizeof(struct bucket
) );
957 // eliminate any stale entries at the end of the table
958 for(i
=*num_full_ptr
; i
< (*num_full_ptr
+ num_to_remove
); i
++) {
959 (*buf_ptr
)[i
].block_num
= -1;
962 return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
965 // PR-3105942: Coalesce writes to the same block in journal replay
966 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
967 // to be replayed and the corresponding location in the journal which contains
968 // the most recent data for those blocks. The array is "played" once the all the
969 // blocks in the journal have been coalesced. The code for the case of conflicting/
970 // overlapping writes to a single block is the most dense. Because coalescing can
971 // disrupt the existing time-ordering of blocks in the journal playback, care
972 // is taken to catch any overlaps and keep the array consistent.
974 add_block(journal
*jnl
, struct bucket
**buf_ptr
, off_t block_num
, size_t size
, __unused
size_t offset
, int32_t cksum
, int *num_buckets_ptr
, int *num_full_ptr
)
976 int blk_index
, overwriting
;
978 // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
979 // inserted (or the index of the elem to overwrite).
980 blk_index
= lookup_bucket( buf_ptr
, block_num
, *num_full_ptr
);
982 // check if the index is within bounds (if we're adding this block to the end of
983 // the table, blk_index will be equal to num_full)
984 if (blk_index
< 0 || blk_index
> *num_full_ptr
) {
985 //printf("jnl: add_block: trouble adding block to co_buf\n");
987 } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
989 // Determine whether we're overwriting an existing entry by checking for overlap
990 overwriting
= do_overlap(jnl
, buf_ptr
, blk_index
, block_num
, size
, offset
, cksum
, num_buckets_ptr
, num_full_ptr
);
991 if (overwriting
< 0) {
992 return -1; // if we got an error, pass it along
995 // returns the index, or -1 on error
996 blk_index
= insert_block(jnl
, buf_ptr
, blk_index
, block_num
, size
, offset
, cksum
, num_buckets_ptr
, num_full_ptr
, overwriting
);
1002 replay_journal(journal
*jnl
)
1004 int i
, orig_checksum
, checksum
, check_block_checksums
=0, bad_blocks
=0;
1006 size_t max_bsize
= 0; /* protected by block_ptr */
1007 block_list_header
*blhdr
;
1008 off_t offset
, txn_start_offset
=0, blhdr_offset
, orig_jnl_start
;
1009 char *buff
, *block_ptr
=NULL
;
1010 struct bucket
*co_buf
;
1011 int num_buckets
= STARTING_BUCKETS
, num_full
, check_past_jnl_end
= 1, in_uncharted_territory
=0;
1012 uint32_t last_sequence_num
= 0;
1014 // wrap the start ptr if it points to the very end of the journal
1015 if (jnl
->jhdr
->start
== jnl
->jhdr
->size
) {
1016 jnl
->jhdr
->start
= jnl
->jhdr
->jhdr_size
;
1018 if (jnl
->jhdr
->end
== jnl
->jhdr
->size
) {
1019 jnl
->jhdr
->end
= jnl
->jhdr
->jhdr_size
;
1022 if (jnl
->jhdr
->start
== jnl
->jhdr
->end
) {
1026 orig_jnl_start
= jnl
->jhdr
->start
;
1028 // allocate memory for the header_block. we'll read each blhdr into this
1029 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&buff
, jnl
->jhdr
->blhdr_size
)) {
1030 printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n",
1031 jnl
->jdev_name
, jnl
->jhdr
->blhdr_size
);
1035 // allocate memory for the coalesce buffer
1036 if ((MALLOC(co_buf
, struct bucket
*, num_buckets
*sizeof(struct bucket
), M_TEMP
, M_WAITOK
)) == NULL
) {
1037 printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl
->jdev_name
);
1043 // initialize entries
1044 for(i
=0; i
< num_buckets
; i
++) {
1045 co_buf
[i
].block_num
= -1;
1047 num_full
= 0; // empty at first
1050 printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
1051 jnl
->jdev_name
, jnl
->jhdr
->start
, jnl
->jhdr
->end
, jnl
->jdev_offset
);
1053 while(check_past_jnl_end
|| jnl
->jhdr
->start
!= jnl
->jhdr
->end
) {
1054 offset
= blhdr_offset
= jnl
->jhdr
->start
;
1055 ret
= read_journal_data(jnl
, &offset
, buff
, jnl
->jhdr
->blhdr_size
);
1056 if (ret
!= (size_t)jnl
->jhdr
->blhdr_size
) {
1057 printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl
->jdev_name
, offset
);
1059 goto bad_txn_handling
;
1062 blhdr
= (block_list_header
*)buff
;
1064 orig_checksum
= blhdr
->checksum
;
1065 blhdr
->checksum
= 0;
1066 if (jnl
->flags
& JOURNAL_NEED_SWAP
) {
1067 // calculate the checksum based on the unswapped data
1068 // because it is done byte-at-a-time.
1069 orig_checksum
= SWAP32(orig_checksum
);
1070 checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
1071 swap_block_list_header(jnl
, blhdr
);
1073 checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
1078 // XXXdbg - if these checks fail, we should replay as much
1079 // we can in the hopes that it will still leave the
1080 // drive in a better state than if we didn't replay
1083 if (checksum
!= orig_checksum
) {
1084 if (check_past_jnl_end
&& in_uncharted_territory
) {
1086 if (blhdr_offset
!= jnl
->jhdr
->end
) {
1087 printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl
->jdev_name
, blhdr_offset
, blhdr_offset
);
1090 check_past_jnl_end
= 0;
1091 jnl
->jhdr
->end
= blhdr_offset
;
1095 printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
1096 jnl
->jdev_name
, blhdr_offset
, orig_checksum
, checksum
);
1098 if (blhdr_offset
== orig_jnl_start
) {
1099 // if there's nothing in the journal at all, just bail out altogether.
1104 goto bad_txn_handling
;
1107 if ( (last_sequence_num
!= 0)
1108 && (blhdr
->binfo
[0].b
.sequence_num
!= 0)
1109 && (blhdr
->binfo
[0].b
.sequence_num
!= last_sequence_num
)
1110 && (blhdr
->binfo
[0].b
.sequence_num
!= last_sequence_num
+1)) {
1112 txn_start_offset
= jnl
->jhdr
->end
= blhdr_offset
;
1114 if (check_past_jnl_end
) {
1115 check_past_jnl_end
= 0;
1116 printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
1117 jnl
->jdev_name
, blhdr_offset
, blhdr_offset
, blhdr
->binfo
[0].b
.sequence_num
, last_sequence_num
);
1121 printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
1122 jnl
->jdev_name
, blhdr_offset
, blhdr_offset
, blhdr
->binfo
[0].b
.sequence_num
, last_sequence_num
);
1124 goto bad_txn_handling
;
1126 last_sequence_num
= blhdr
->binfo
[0].b
.sequence_num
;
1128 if (blhdr_offset
>= jnl
->jhdr
->end
&& jnl
->jhdr
->start
<= jnl
->jhdr
->end
) {
1129 printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl
->jdev_name
, blhdr_offset
, blhdr_offset
);
1132 if ( blhdr
->max_blocks
<= 0 || blhdr
->max_blocks
> 2048
1133 || blhdr
->num_blocks
<= 0 || blhdr
->num_blocks
> blhdr
->max_blocks
) {
1134 printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n",
1135 jnl
->jdev_name
, blhdr
->max_blocks
, blhdr
->num_blocks
);
1137 goto bad_txn_handling
;
1141 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
1142 if (blhdr
->binfo
[i
].bnum
< 0 && blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
1143 printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl
->jdev_name
, blhdr
->binfo
[i
].bnum
);
1145 goto bad_txn_handling
;
1148 if (blhdr
->binfo
[i
].bsize
> max_bsize
) {
1149 max_bsize
= blhdr
->binfo
[i
].bsize
;
1153 if (blhdr
->flags
& BLHDR_CHECK_CHECKSUMS
) {
1154 check_block_checksums
= 1;
1155 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&block_ptr
, max_bsize
)) {
1162 if (blhdr
->flags
& BLHDR_FIRST_HEADER
) {
1163 txn_start_offset
= blhdr_offset
;
1166 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1167 // blhdr->num_blocks-1, jnl->jhdr->start);
1169 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
1173 size
= blhdr
->binfo
[i
].bsize
;
1174 number
= blhdr
->binfo
[i
].bnum
;
1176 // don't add "killed" blocks
1177 if (number
== (off_t
)-1) {
1178 //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1181 if (check_block_checksums
) {
1185 block_offset
= offset
;
1187 // read the block so we can check the checksum
1188 ret
= read_journal_data(jnl
, &block_offset
, block_ptr
, size
);
1189 if (ret
!= (size_t)size
) {
1190 printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl
->jdev_name
, offset
);
1192 goto bad_txn_handling
;
1195 disk_cksum
= calc_checksum(block_ptr
, size
);
1197 // there is no need to swap the checksum from disk because
1198 // it got swapped when the blhdr was read in.
1199 if (blhdr
->binfo
[i
].b
.cksum
!= 0 && disk_cksum
!= blhdr
->binfo
[i
].b
.cksum
) {
1200 printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
1201 jnl
->jdev_name
, txn_start_offset
, blhdr_offset
, i
, number
, size
, disk_cksum
, blhdr
->binfo
[i
].b
.cksum
);
1202 printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
1203 *(int *)&block_ptr
[0*sizeof(int)], *(int *)&block_ptr
[1*sizeof(int)], *(int *)&block_ptr
[2*sizeof(int)], *(int *)&block_ptr
[3*sizeof(int)],
1204 *(int *)&block_ptr
[4*sizeof(int)], *(int *)&block_ptr
[5*sizeof(int)], *(int *)&block_ptr
[6*sizeof(int)], *(int *)&block_ptr
[7*sizeof(int)]);
1207 goto bad_txn_handling
;
1212 // add this bucket to co_buf, coalescing where possible
1213 // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1214 ret_val
= add_block(jnl
, &co_buf
, number
, size
, (size_t) offset
, blhdr
->binfo
[i
].b
.cksum
, &num_buckets
, &num_full
);
1216 if (ret_val
== -1) {
1217 printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl
->jdev_name
);
1219 } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1225 // check if the last block added puts us off the end of the jnl.
1226 // if so, we need to wrap to the beginning and take any remainder
1229 if (offset
>= jnl
->jhdr
->size
) {
1230 offset
= jnl
->jhdr
->jhdr_size
+ (offset
- jnl
->jhdr
->size
);
1235 kmem_free(kernel_map
, (vm_offset_t
)block_ptr
, max_bsize
);
1241 if (txn_start_offset
== 0) {
1242 printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl
->jdev_name
);
1246 jnl
->jhdr
->start
= orig_jnl_start
;
1247 jnl
->jhdr
->end
= txn_start_offset
;
1248 check_past_jnl_end
= 0;
1249 last_sequence_num
= 0;
1250 printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl
->jdev_name
, jnl
->jhdr
->start
, jnl
->jhdr
->end
);
1251 goto restart_replay
;
1254 jnl
->jhdr
->start
+= blhdr
->bytes_used
;
1255 if (jnl
->jhdr
->start
>= jnl
->jhdr
->size
) {
1256 // wrap around and skip the journal header block
1257 jnl
->jhdr
->start
= (jnl
->jhdr
->start
% jnl
->jhdr
->size
) + jnl
->jhdr
->jhdr_size
;
1260 if (jnl
->jhdr
->start
== jnl
->jhdr
->end
) {
1261 in_uncharted_territory
= 1;
1265 if (jnl
->jhdr
->start
!= jnl
->jhdr
->end
) {
1266 printf("jnl: %s: start %lld != end %lld. resetting end.\n", jnl
->jdev_name
, jnl
->jhdr
->start
, jnl
->jhdr
->end
);
1267 jnl
->jhdr
->end
= jnl
->jhdr
->start
;
1270 //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1273 * make sure it's at least one page in size, so
1274 * start max_bsize at PAGE_SIZE
1276 for (i
= 0, max_bsize
= PAGE_SIZE
; i
< num_full
; i
++) {
1278 if (co_buf
[i
].block_num
== (off_t
)-1)
1281 if (co_buf
[i
].block_size
> max_bsize
)
1282 max_bsize
= co_buf
[i
].block_size
;
1285 * round max_bsize up to the nearest PAGE_SIZE multiple
1287 if (max_bsize
& (PAGE_SIZE
- 1)) {
1288 max_bsize
= (max_bsize
+ PAGE_SIZE
) & ~(PAGE_SIZE
- 1);
1291 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&block_ptr
, max_bsize
)) {
1295 // Replay the coalesced entries in the co-buf
1296 for(i
=0; i
< num_full
; i
++) {
1297 size_t size
= co_buf
[i
].block_size
;
1298 off_t jnl_offset
= (off_t
) co_buf
[i
].jnl_offset
;
1299 off_t number
= co_buf
[i
].block_num
;
1302 // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1303 // co_buf[i].block_size, co_buf[i].jnl_offset);
1305 if (number
== (off_t
)-1) {
1306 // printf("jnl: replay_journal: skipping killed fs block\n");
1309 // do journal read, and set the phys. block
1310 ret
= read_journal_data(jnl
, &jnl_offset
, block_ptr
, size
);
1312 printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl
->jdev_name
, offset
);
1316 if (update_fs_block(jnl
, block_ptr
, number
, size
) != 0) {
1323 // done replaying; update jnl header
1324 if (write_journal_header(jnl
) != 0) {
1328 printf("jnl: %s: journal replay done.\n", jnl
->jdev_name
);
1332 kmem_free(kernel_map
, (vm_offset_t
)block_ptr
, max_bsize
);
1336 // free the coalesce buffer
1337 FREE(co_buf
, M_TEMP
);
1340 kmem_free(kernel_map
, (vm_offset_t
)buff
, jnl
->jhdr
->blhdr_size
);
1345 kmem_free(kernel_map
, (vm_offset_t
)block_ptr
, max_bsize
);
1348 FREE(co_buf
, M_TEMP
);
1350 kmem_free(kernel_map
, (vm_offset_t
)buff
, jnl
->jhdr
->blhdr_size
);
// Transaction-buffer sizing constants. The default per-transaction buffer is
// 128KB (a 256KB variant is left commented out below); MAX_TRANSACTION_BUFFER_SIZE
// is the hard upper bound applied in size_up_tbuffer().
// NOTE(review): extraction re-wrapped statements across lines and embedded the
// original file's line numbers into the text; tokens are reproduced verbatim.
1356 #define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024)
1357 //#define DEFAULT_TRANSACTION_BUFFER_SIZE (256*1024) // better performance but uses more mem
1358 #define MAX_TRANSACTION_BUFFER_SIZE (512*1024)
1360 // XXXdbg - so I can change it in the debugger
// Global default tbuffer size. 0 means "not yet computed"; size_up_tbuffer()
// initializes it once, scaled by machine memory size (see mem_size checks there).
1361 int def_tbuffer_size
= 0;
1365 // This function sets the size of the tbuffer and the
1366 // size of the blhdr. It assumes that jnl->jhdr->size
1367 // and jnl->jhdr->jhdr_size are already valid.
// Computes jnl->tbuffer_size and jnl->jhdr->blhdr_size from the requested
// tbuffer_size (0 = use the machine-memory-scaled default) and the physical
// block size. NOTE(review): this extraction is missing some original lines
// (closing braces etc.); comments annotate only the visible logic.
1370 size_up_tbuffer(journal
*jnl
, int tbuffer_size
, int phys_blksz
)
1373 // one-time initialization based on how much memory
1374 // there is in the machine.
// Lazily pick def_tbuffer_size: 1x/2x/3x/4x DEFAULT_TRANSACTION_BUFFER_SIZE
// for <256MB, <512MB, <1GB, and >=1GB of memory respectively.
1376 if (def_tbuffer_size
== 0) {
1377 if (mem_size
< (256*1024*1024)) {
1378 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
;
1379 } else if (mem_size
< (512*1024*1024)) {
1380 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 2;
1381 } else if (mem_size
< (1024*1024*1024)) {
1382 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 3;
1383 } else if (mem_size
>= (1024*1024*1024)) {
1384 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 4;
1388 // size up the transaction buffer... can't be larger than the number
1389 // of blocks that can fit in a block_list_header block.
1390 if (tbuffer_size
== 0) {
1391 jnl
->tbuffer_size
= def_tbuffer_size
;
1393 // make sure that the specified tbuffer_size isn't too small
// Caller-specified size path: clamp up to at least 2 block-list headers,
// then truncate down to a multiple of the journal-header block size.
1394 if (tbuffer_size
< jnl
->jhdr
->blhdr_size
* 2) {
1395 tbuffer_size
= jnl
->jhdr
->blhdr_size
* 2;
1397 // and make sure it's an even multiple of the block size
1398 if ((tbuffer_size
% jnl
->jhdr
->jhdr_size
) != 0) {
1399 tbuffer_size
-= (tbuffer_size
% jnl
->jhdr
->jhdr_size
);
1402 jnl
->tbuffer_size
= tbuffer_size
;
// Clamp: never more than half the journal, never above the global cap.
1405 if (jnl
->tbuffer_size
> (jnl
->jhdr
->size
/ 2)) {
1406 jnl
->tbuffer_size
= (jnl
->jhdr
->size
/ 2);
1409 if (jnl
->tbuffer_size
> MAX_TRANSACTION_BUFFER_SIZE
) {
1410 jnl
->tbuffer_size
= MAX_TRANSACTION_BUFFER_SIZE
;
// blhdr_size = one block_info entry per journal block the tbuffer can hold,
// then rounded up to a whole number of physical device blocks.
1413 jnl
->jhdr
->blhdr_size
= (jnl
->tbuffer_size
/ jnl
->jhdr
->jhdr_size
) * sizeof(block_info
);
1414 if (jnl
->jhdr
->blhdr_size
< phys_blksz
) {
1415 jnl
->jhdr
->blhdr_size
= phys_blksz
;
1416 } else if ((jnl
->jhdr
->blhdr_size
% phys_blksz
) != 0) {
1417 // have to round up so we're an even multiple of the physical block size
// NOTE(review): this mask trick assumes phys_blksz is a power of two —
// true for disk block sizes, but worth confirming at the call sites.
1418 jnl
->jhdr
->blhdr_size
= (jnl
->jhdr
->blhdr_size
+ (phys_blksz
- 1)) & ~(phys_blksz
- 1);
// Queries the underlying block device (via VNOP_IOCTL) for I/O capabilities
// and records them in the journal: FUA (force-unit-access) support sets
// JOURNAL_DO_FUA_WRITES; max read/write sizes populate jnl->max_read_size
// and jnl->max_write_size, falling back from byte-count to block-count
// ioctls and finally to a 128KB default.
// NOTE(review): extraction dropped some lines (declarations of features/
// readmaxcnt/readblockcnt/writemaxcnt appear to be missing, plus braces);
// comments annotate only the visible logic.
1425 get_io_info(struct vnode
*devvp
, size_t phys_blksz
, journal
*jnl
, struct vfs_context
*context
)
1428 off_t writeblockcnt
;
// If the device advertises FUA, enable write-through journal header writes.
1433 if (VNOP_IOCTL(devvp
, DKIOCGETFEATURES
, (caddr_t
)&features
, 0, context
) == 0) {
1434 if (features
& DK_FEATURE_FORCE_UNIT_ACCESS
) {
1435 const char *name
= vnode_name(devvp
);
1436 jnl
->flags
|= JOURNAL_DO_FUA_WRITES
;
1437 printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name
? name
: "no-name-dev", features
);
// Max read size: prefer byte-count ioctl; if unset, derive from the
// block-count ioctl * phys_blksz; else default to 128KB.
1441 if (VNOP_IOCTL(devvp
, DKIOCGETMAXBYTECOUNTREAD
, (caddr_t
)&readmaxcnt
, 0, context
)) {
1445 if (readmaxcnt
== 0) {
1446 if (VNOP_IOCTL(devvp
, DKIOCGETMAXBLOCKCOUNTREAD
, (caddr_t
)&readblockcnt
, 0, context
)) {
1447 readmaxcnt
= 128 * 1024;
1449 readmaxcnt
= readblockcnt
* phys_blksz
;
// Same fallback chain for the maximum write size.
1454 if (VNOP_IOCTL(devvp
, DKIOCGETMAXBYTECOUNTWRITE
, (caddr_t
)&writemaxcnt
, 0, context
)) {
1458 if (writemaxcnt
== 0) {
1459 if (VNOP_IOCTL(devvp
, DKIOCGETMAXBLOCKCOUNTWRITE
, (caddr_t
)&writeblockcnt
, 0, context
)) {
1460 writemaxcnt
= 128 * 1024;
1462 writemaxcnt
= writeblockcnt
* phys_blksz
;
1466 jnl
->max_read_size
= readmaxcnt
;
1467 jnl
->max_write_size
= writemaxcnt
;
1469 // just in case it's still zero...
1470 if (jnl
->max_read_size
== 0) {
1471 jnl
->max_read_size
= 128 * 1024;
1472 jnl
->max_write_size
= 128 * 1024;
// Returns a reference-counted copy of the journal device vnode's name
// (or "unknown-dev" if it has none). vfs_addname bumps the refcount so the
// caller owns its copy; callers release it with vfs_removename (see the
// cleanup paths in journal_create/journal_open/journal_is_clean).
// NOTE(review): the return statement is not visible in this extraction —
// presumably `return jdev_name;` follows; confirm against the original file.
1478 get_jdev_name(struct vnode
*jvp
)
1480 const char *jdev_name
;
1482 jdev_name
= vnode_name(jvp
);
1483 if (jdev_name
== NULL
) {
1484 jdev_name
= vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0);
1486 // this just bumps the refcount on the name so we have our own copy
1487 jdev_name
= vfs_addname(jdev_name
, strlen(jdev_name
), 0, 0);
// Creates a brand-new journal on the device vnode jvp: validates block sizes,
// allocates and zeroes the journal struct and header buffer, initializes the
// on-disk journal header (magic/endian/start/end/size), sizes the transaction
// buffer, and writes the header to disk. Error paths free the header buffer,
// release the device name, and free the journal zone allocation.
// NOTE(review): extraction dropped lines (parameter list is partially missing,
// as are returns/braces/labels); comments annotate only the visible logic.
1495 journal_create(struct vnode
*jvp
,
1499 size_t min_fs_blksz
,
1501 int32_t tbuffer_size
,
1502 void (*flush
)(void *arg
),
1507 struct vfs_context context
;
1508 const char *jdev_name
;
1510 context
.vc_thread
= current_thread();
1511 context
.vc_ucred
= FSCRED
;
1513 jdev_name
= get_jdev_name(jvp
);
1515 /* Get the real physical block size. */
1516 if (VNOP_IOCTL(jvp
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
, 0, &context
)) {
// Sanity checks: device blocks must not exceed the fs minimum block size,
// and the journal must be a whole number of device blocks.
1520 if (phys_blksz
> min_fs_blksz
) {
1521 printf("jnl: %s: create: error: phys blksize %lu bigger than min fs blksize %lu\n",
1522 jdev_name
, phys_blksz
, min_fs_blksz
);
1526 if ((journal_size
% phys_blksz
) != 0) {
1527 printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%lx\n",
1528 jdev_name
, journal_size
, phys_blksz
);
// Allocate and zero the journal control structure.
1533 MALLOC_ZONE(jnl
, struct journal
*, sizeof(struct journal
), M_JNL_JNL
, M_WAITOK
);
1534 memset(jnl
, 0, sizeof(*jnl
));
1537 jnl
->jdev_offset
= offset
;
1540 jnl
->flush_arg
= arg
;
1541 jnl
->flags
= (flags
& JOURNAL_OPTION_FLAGS_MASK
);
1542 jnl
->jdev_name
= jdev_name
;
1543 lck_mtx_init(&jnl
->old_start_lock
, jnl_mutex_group
, jnl_lock_attr
);
1545 get_io_info(jvp
, phys_blksz
, jnl
, &context
);
// One device block for the in-memory copy of the on-disk journal header.
1547 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&jnl
->header_buf
, phys_blksz
)) {
1548 printf("jnl: %s: create: could not allocate space for header buffer (%lu bytes)\n", jdev_name
, phys_blksz
);
1549 goto bad_kmem_alloc
;
1552 memset(jnl
->header_buf
, 0, phys_blksz
);
// Initialize the fresh on-disk header: empty journal (start == end), one
// header block reserved at the front.
1554 jnl
->jhdr
= (journal_header
*)jnl
->header_buf
;
1555 jnl
->jhdr
->magic
= JOURNAL_HEADER_MAGIC
;
1556 jnl
->jhdr
->endian
= ENDIAN_MAGIC
;
1557 jnl
->jhdr
->start
= phys_blksz
; // start at block #1, block #0 is for the jhdr itself
1558 jnl
->jhdr
->end
= phys_blksz
;
1559 jnl
->jhdr
->size
= journal_size
;
1560 jnl
->jhdr
->jhdr_size
= phys_blksz
;
1561 size_up_tbuffer(jnl
, tbuffer_size
, phys_blksz
);
1563 jnl
->active_start
= jnl
->jhdr
->start
;
1565 // XXXdbg - for testing you can force the journal to wrap around
1566 // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
1567 // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);
// Randomized starting sequence number (masked to 24 bits).
1569 jnl
->jhdr
->sequence_num
= random() & 0x00ffffff;
1571 lck_mtx_init(&jnl
->jlock
, jnl_mutex_group
, jnl_lock_attr
);
1573 if (write_journal_header(jnl
) != 0) {
1574 printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name
);
// Error cleanup: free header buffer, drop the name ref, free the journal.
1582 kmem_free(kernel_map
, (vm_offset_t
)jnl
->header_buf
, phys_blksz
);
1585 vfs_removename(jdev_name
);
1588 FREE_ZONE(jnl
, sizeof(struct journal
), M_JNL_JNL
);
// Opens an existing journal: reads and validates the on-disk journal header
// (byte-swapping it if written by the other endianness), sanity-checks
// start/end/size, replays any committed-but-unapplied transactions (unless
// JOURNAL_RESET is given), and finishes initializing the journal struct.
// NOTE(review): extraction dropped lines (parts of the parameter list, local
// declarations such as orig_blksz, returns, braces and labels); comments
// annotate only the visible logic.
1594 journal_open(struct vnode
*jvp
,
1598 size_t min_fs_blksz
,
1600 int32_t tbuffer_size
,
1601 void (*flush
)(void *arg
),
1607 int orig_checksum
, checksum
;
1608 struct vfs_context context
;
1609 const char *jdev_name
= get_jdev_name(jvp
);
1611 context
.vc_thread
= current_thread();
1612 context
.vc_ucred
= FSCRED
;
1614 /* Get the real physical block size. */
1615 if (VNOP_IOCTL(jvp
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
, 0, &context
)) {
// Same block-size sanity checks as journal_create.
1619 if (phys_blksz
> min_fs_blksz
) {
1620 printf("jnl: %s: open: error: phys blksize %lu bigger than min fs blksize %lu\n",
1621 jdev_name
, phys_blksz
, min_fs_blksz
);
1625 if ((journal_size
% phys_blksz
) != 0) {
1626 printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%lx\n",
1627 jdev_name
, journal_size
, phys_blksz
);
1631 MALLOC_ZONE(jnl
, struct journal
*, sizeof(struct journal
), M_JNL_JNL
, M_WAITOK
);
1632 memset(jnl
, 0, sizeof(*jnl
));
1635 jnl
->jdev_offset
= offset
;
1638 jnl
->flush_arg
= arg
;
1639 jnl
->flags
= (flags
& JOURNAL_OPTION_FLAGS_MASK
);
1640 jnl
->jdev_name
= jdev_name
;
1641 lck_mtx_init(&jnl
->old_start_lock
, jnl_mutex_group
, jnl_lock_attr
);
1643 get_io_info(jvp
, phys_blksz
, jnl
, &context
);
1645 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&jnl
->header_buf
, phys_blksz
)) {
1646 printf("jnl: %s: create: could not allocate space for header buffer (%lu bytes)\n", jdev_name
, phys_blksz
);
1647 goto bad_kmem_alloc
;
1650 jnl
->jhdr
= (journal_header
*)jnl
->header_buf
;
1651 memset(jnl
->jhdr
, 0, sizeof(journal_header
));
1653 // we have to set this up here so that do_journal_io() will work
1654 jnl
->jhdr
->jhdr_size
= phys_blksz
;
// Read the on-disk journal header into the freshly allocated buffer.
1656 if (read_journal_header(jnl
, jnl
->jhdr
, phys_blksz
) != phys_blksz
) {
1657 printf("jnl: %s: open: could not read %lu bytes for the journal header.\n",
1658 jdev_name
, phys_blksz
);
// Checksum verification: the stored checksum field must be zeroed before
// recomputing. If the magic is byte-swapped, the header was written by the
// opposite endianness: checksum the raw (unswapped) bytes first, then swap
// the header and remember JOURNAL_NEED_SWAP for all later journal I/O.
1662 orig_checksum
= jnl
->jhdr
->checksum
;
1663 jnl
->jhdr
->checksum
= 0;
1665 if (jnl
->jhdr
->magic
== SWAP32(JOURNAL_HEADER_MAGIC
)) {
1666 // do this before the swap since it's done byte-at-a-time
1667 orig_checksum
= SWAP32(orig_checksum
);
1668 checksum
= calc_checksum((char *)jnl
->jhdr
, JOURNAL_HEADER_CKSUM_SIZE
);
1669 swap_journal_header(jnl
);
1670 jnl
->flags
|= JOURNAL_NEED_SWAP
;
1672 checksum
= calc_checksum((char *)jnl
->jhdr
, JOURNAL_HEADER_CKSUM_SIZE
);
// Accept either the current or the old magic value; anything else is fatal.
1675 if (jnl
->jhdr
->magic
!= JOURNAL_HEADER_MAGIC
&& jnl
->jhdr
->magic
!= OLD_JOURNAL_HEADER_MAGIC
) {
1676 printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n",
1677 jnl
->jdev_name
, jnl
->jhdr
->magic
, JOURNAL_HEADER_MAGIC
);
1681 // only check if we're the current journal header magic value
1682 if (jnl
->jhdr
->magic
== JOURNAL_HEADER_MAGIC
) {
1684 if (orig_checksum
!= checksum
) {
1685 printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n",
1686 jdev_name
, orig_checksum
, checksum
);
1692 // XXXdbg - convert old style magic numbers to the new one
1693 if (jnl
->jhdr
->magic
== OLD_JOURNAL_HEADER_MAGIC
) {
1694 jnl
->jhdr
->magic
= JOURNAL_HEADER_MAGIC
;
// If the journal was created with a different block size, temporarily
// switch the device to the journal's block size (restored after replay).
1697 if (phys_blksz
!= (size_t)jnl
->jhdr
->jhdr_size
&& jnl
->jhdr
->jhdr_size
!= 0) {
1698 printf("jnl: %s: open: phys_blksz %lu does not match journal header size %d\n",
1699 jdev_name
, phys_blksz
, jnl
->jhdr
->jhdr_size
);
1701 orig_blksz
= phys_blksz
;
1702 phys_blksz
= jnl
->jhdr
->jhdr_size
;
1703 if (VNOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&phys_blksz
, FWRITE
, &context
)) {
1704 printf("jnl: %s: could not set block size to %lu bytes.\n", jdev_name
, phys_blksz
);
1707 // goto bad_journal;
// Range-check start/end/size: within the journal, positive, and under a
// 1GB ceiling used as a "looks corrupt" heuristic.
1710 if ( jnl
->jhdr
->start
<= 0
1711 || jnl
->jhdr
->start
> jnl
->jhdr
->size
1712 || jnl
->jhdr
->start
> 1024*1024*1024) {
1713 printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
1714 jdev_name
, jnl
->jhdr
->start
, jnl
->jhdr
->size
);
1718 if ( jnl
->jhdr
->end
<= 0
1719 || jnl
->jhdr
->end
> jnl
->jhdr
->size
1720 || jnl
->jhdr
->end
> 1024*1024*1024) {
1721 printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
1722 jdev_name
, jnl
->jhdr
->end
, jnl
->jhdr
->size
);
1726 if (jnl
->jhdr
->size
> 1024*1024*1024) {
1727 printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name
, jnl
->jhdr
->size
);
1731 // XXXdbg - can't do these checks because hfs writes all kinds of
1732 // non-uniform sized blocks even on devices that have a block size
1733 // that is larger than 512 bytes (i.e. optical media w/2k blocks).
1734 // therefore these checks will fail and so we just have to punt and
1735 // do more relaxed checking...
1736 // XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
1737 if ((jnl
->jhdr
->start
% 512) != 0) {
1738 printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n",
1739 jdev_name
, jnl
->jhdr
->start
);
1743 //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
1744 if ((jnl
->jhdr
->end
% 512) != 0) {
1745 printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
1746 jdev_name
, jnl
->jhdr
->end
, jnl
->jhdr
->jhdr_size
);
1750 // take care of replaying the journal if necessary
// JOURNAL_RESET discards pending transactions by snapping start to end;
// otherwise replay_journal() applies committed transactions to the fs.
1751 if (flags
& JOURNAL_RESET
) {
1752 printf("jnl: %s: journal start/end pointers reset! (jnl %p; s 0x%llx e 0x%llx)\n",
1753 jdev_name
, jnl
, jnl
->jhdr
->start
, jnl
->jhdr
->end
);
1754 jnl
->jhdr
->start
= jnl
->jhdr
->end
;
1755 } else if (replay_journal(jnl
) != 0) {
1756 printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name
);
// Restore the device's original block size if it was switched for replay.
1760 if (orig_blksz
!= 0) {
1761 VNOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&orig_blksz
, FWRITE
, &context
);
1762 phys_blksz
= orig_blksz
;
1763 if (orig_blksz
< jnl
->jhdr
->jhdr_size
) {
1764 printf("jnl: %s: open: jhdr_size is %d but orig phys blk size is %d. switching.\n",
1765 jdev_name
, jnl
->jhdr
->jhdr_size
, orig_blksz
);
1767 jnl
->jhdr
->jhdr_size
= orig_blksz
;
1771 // make sure this is in sync!
1772 jnl
->active_start
= jnl
->jhdr
->start
;
1774 // set this now, after we've replayed the journal
1775 size_up_tbuffer(jnl
, tbuffer_size
, phys_blksz
);
1777 lck_mtx_init(&jnl
->jlock
, jnl_mutex_group
, jnl_lock_attr
);
// Error cleanup path: restore block size, free header buffer, drop the
// name ref, free the journal struct.
1782 if (orig_blksz
!= 0) {
1783 phys_blksz
= orig_blksz
;
1784 VNOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&orig_blksz
, FWRITE
, &context
);
1786 kmem_free(kernel_map
, (vm_offset_t
)jnl
->header_buf
, phys_blksz
);
1789 vfs_removename(jdev_name
);
1791 FREE_ZONE(jnl
, sizeof(struct journal
), M_JNL_JNL
);
// Checks whether the journal on jvp is "clean" without replaying it: reads
// and validates the on-disk header (magic + checksum, with endian handling
// identical to journal_open), then declares the journal clean iff
// start == end. Uses a stack-allocated `journal` struct (note the `.` member
// access below) rather than a heap allocation.
// NOTE(review): extraction dropped lines (the jnl/offset declarations, the
// return statements for the clean/dirty cases, braces); comments annotate
// only the visible logic.
1797 journal_is_clean(struct vnode
*jvp
,
1801 size_t min_fs_block_size
)
1804 int phys_blksz
, ret
;
1805 int orig_checksum
, checksum
;
1806 struct vfs_context context
;
1807 const char *jdev_name
= get_jdev_name(jvp
);
1809 context
.vc_thread
= current_thread();
1810 context
.vc_ucred
= FSCRED
;
1812 /* Get the real physical block size. */
1813 if (VNOP_IOCTL(jvp
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
, 0, &context
)) {
1814 printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name
);
1818 if (phys_blksz
> (int)min_fs_block_size
) {
1819 printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %lu\n",
1820 jdev_name
, phys_blksz
, min_fs_block_size
);
1824 if ((journal_size
% phys_blksz
) != 0) {
1825 printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1826 jdev_name
, journal_size
, phys_blksz
);
// Zero the stack journal struct and give it a one-block header buffer.
1830 memset(&jnl
, 0, sizeof(jnl
));
1832 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&jnl
.header_buf
, phys_blksz
)) {
1833 printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name
, phys_blksz
);
1837 get_io_info(jvp
, phys_blksz
, &jnl
, &context
);
1839 jnl
.jhdr
= (journal_header
*)jnl
.header_buf
;
1840 memset(jnl
.jhdr
, 0, sizeof(journal_header
));
1843 jnl
.jdev_offset
= offset
;
1846 // we have to set this up here so that do_journal_io() will work
1847 jnl
.jhdr
->jhdr_size
= phys_blksz
;
1849 if (read_journal_header(&jnl
, jnl
.jhdr
, phys_blksz
) != (unsigned)phys_blksz
) {
1850 printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n",
1851 jdev_name
, phys_blksz
);
// Checksum/endian handling mirrors journal_open: zero the stored checksum,
// and if the magic is byte-swapped, checksum the raw bytes before swapping.
1856 orig_checksum
= jnl
.jhdr
->checksum
;
1857 jnl
.jhdr
->checksum
= 0;
1859 if (jnl
.jhdr
->magic
== SWAP32(JOURNAL_HEADER_MAGIC
)) {
1860 // do this before the swap since it's done byte-at-a-time
1861 orig_checksum
= SWAP32(orig_checksum
);
1862 checksum
= calc_checksum((char *)jnl
.jhdr
, JOURNAL_HEADER_CKSUM_SIZE
);
1863 swap_journal_header(&jnl
);
1864 jnl
.flags
|= JOURNAL_NEED_SWAP
;
1866 checksum
= calc_checksum((char *)jnl
.jhdr
, JOURNAL_HEADER_CKSUM_SIZE
);
1869 if (jnl
.jhdr
->magic
!= JOURNAL_HEADER_MAGIC
&& jnl
.jhdr
->magic
!= OLD_JOURNAL_HEADER_MAGIC
) {
1870 printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n",
1871 jdev_name
, jnl
.jhdr
->magic
, JOURNAL_HEADER_MAGIC
);
1876 if (orig_checksum
!= checksum
) {
1877 printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name
, orig_checksum
, checksum
);
1883 // if the start and end are equal then the journal is clean.
1884 // otherwise it's not clean and therefore an error.
1886 if (jnl
.jhdr
->start
== jnl
.jhdr
->end
) {
// Common exit: free the header buffer and drop the device-name reference.
1893 kmem_free(kernel_map
, (vm_offset_t
)jnl
.header_buf
, phys_blksz
);
1895 vfs_removename(jdev_name
);
// Shuts down a journal. For a valid journal: ends/flushes any active or
// buffered transaction, spins (bounded) calling the fs flush callback until
// active_start catches up to jhdr->end, then syncs and writes the final
// header. For an invalid journal: aborts outstanding transactions instead.
// In both cases the header buffer, device name, and journal struct are freed.
// NOTE(review): extraction dropped lines (locking calls, some braces and the
// else separating the valid/invalid paths); comments annotate visible logic.
1905 journal_close(journal
*jnl
)
1907 volatile off_t
*start
, *end
;
1912 // set this before doing anything that would block so that
1913 // we start tearing things down properly.
1915 jnl
->flags
|= JOURNAL_CLOSE_PENDING
;
1917 if (jnl
->owner
!= current_thread()) {
1922 // only write stuff to disk if the journal is still valid
1924 if ((jnl
->flags
& JOURNAL_INVALID
) == 0) {
1926 if (jnl
->active_tr
) {
1927 journal_end_transaction(jnl
);
1930 // flush any buffered transactions
1932 transaction
*tr
= jnl
->cur_tr
;
1935 end_transaction(tr
, 1, NULL
, NULL
); // force it to get flushed
1938 //start = &jnl->jhdr->start;
// Watch active_start (not jhdr->start) vs jhdr->end; volatile pointers so
// each loop iteration re-reads the values updated by the flush machinery.
1939 start
= &jnl
->active_start
;
1940 end
= &jnl
->jhdr
->end
;
// Bounded wait (5000 iterations) for all transactions to drain.
1942 while (*start
!= *end
&& counter
++ < 5000) {
1943 //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
1945 jnl
->flush(jnl
->flush_arg
);
1947 tsleep((caddr_t
)jnl
, PRIBIO
, "jnl_close", 2);
1950 if (*start
!= *end
) {
1951 printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
1952 jnl
->jdev_name
, *start
, *end
);
1955 // make sure this is in sync when we close the journal
1956 jnl
->jhdr
->start
= jnl
->active_start
;
1958 // if this fails there's not much we can do at this point...
1959 write_journal_header(jnl
);
1961 // if we're here the journal isn't valid any more.
1962 // so make sure we don't leave any locked blocks lying around
1963 printf("jnl: %s: close: journal %p, is invalid. aborting outstanding transactions\n", jnl
->jdev_name
, jnl
);
1964 if (jnl
->active_tr
|| jnl
->cur_tr
) {
1966 if (jnl
->active_tr
) {
1967 tr
= jnl
->active_tr
;
1968 jnl
->active_tr
= NULL
;
1974 abort_transaction(jnl
, tr
);
// Invariant: a journal never has both an active and a buffered (cur) tr.
1975 if (jnl
->active_tr
|| jnl
->cur_tr
) {
1976 panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl
->jdev_name
, jnl
);
1981 free_old_stuff(jnl
);
1983 kmem_free(kernel_map
, (vm_offset_t
)jnl
->header_buf
, jnl
->jhdr
->jhdr_size
);
// Poison the pointer so any use-after-close faults recognizably.
1984 jnl
->jhdr
= (void *)0xbeefbabe;
1986 if (jnl
->jdev_name
) {
1987 vfs_removename(jnl
->jdev_name
);
1990 FREE_ZONE(jnl
, sizeof(struct journal
), M_JNL_JNL
);
// Debug helper: prints the journal header fields and the list of completed
// transactions (journal_start..journal_end offsets) to the kernel log.
// NOTE(review): extraction dropped lines (the `ctr` declaration and closing
// braces are not visible); comments annotate only the visible logic.
1994 dump_journal(journal
*jnl
)
1998 printf("journal for dev %s:", jnl
->jdev_name
);
1999 printf(" jdev_offset %.8llx\n", jnl
->jdev_offset
);
2000 printf(" magic: 0x%.8x\n", jnl
->jhdr
->magic
);
2001 printf(" start: 0x%.8llx\n", jnl
->jhdr
->start
);
2002 printf(" end: 0x%.8llx\n", jnl
->jhdr
->end
);
2003 printf(" size: 0x%.8llx\n", jnl
->jhdr
->size
);
2004 printf(" blhdr size: %d\n", jnl
->jhdr
->blhdr_size
);
2005 printf(" jhdr size: %d\n", jnl
->jhdr
->jhdr_size
);
2006 printf(" chksum: 0x%.8x\n", jnl
->jhdr
->checksum
);
2008 printf(" completed transactions:\n");
// Walk the completed-transaction list and print each one's journal span.
2009 for(ctr
=jnl
->completed_trs
; ctr
; ctr
=ctr
->next
) {
2010 printf(" 0x%.8llx - 0x%.8llx\n", ctr
->journal_start
, ctr
->journal_end
);
// Returns the number of free bytes in the circular journal. Three cases:
// start < end  -> free = size - (end - start) - jhdr_size (in-use region is
//                 [start, end), minus the reserved header block);
// start > end  -> the in-use region wraps, so free = start - end;
// start == end -> journal empty: free = size - jhdr_size.
// NOTE(review): extraction dropped the else-branch brace lines; comments
// annotate only the visible logic.
2017 free_space(journal
*jnl
)
2019 off_t free_space_offset
;
2021 if (jnl
->jhdr
->start
< jnl
->jhdr
->end
) {
2022 free_space_offset
= jnl
->jhdr
->size
- (jnl
->jhdr
->end
- jnl
->jhdr
->start
) - jnl
->jhdr
->jhdr_size
;
2023 } else if (jnl
->jhdr
->start
> jnl
->jhdr
->end
) {
2024 free_space_offset
= jnl
->jhdr
->start
- jnl
->jhdr
->end
;
2026 // journal is completely empty
2027 free_space_offset
= jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
;
2030 return free_space_offset
;
2035 // The journal must be locked on entry to this function.
2036 // The "desired_size" is in bytes.
// Waits until the journal has at least desired_size bytes free. It lazily
// advances jhdr->start by consuming completed entries in jnl->old_start[]
// (entries with the high bit set are still flushing), calls the fs flush
// callback, and sleeps between retries. Panics if flushing makes no progress
// after 5000 iterations; gives up (printing a message) after 7500.
// NOTE(review): extraction dropped lines (the enclosing retry loop header,
// lock_oldstart calls, local declarations for i/lcl_counter, returns and
// braces); comments annotate only the visible logic.
2039 check_free_space(journal
*jnl
, int desired_size
)
2044 //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
2045 // desired_size, free_space(jnl));
2048 int old_start_empty
;
// Progress watchdog: panic at exactly 5000 iterations, bail past 7500.
2050 if (counter
++ == 5000) {
2052 panic("jnl: check_free_space: buffer flushing isn't working "
2053 "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl
,
2054 jnl
->jhdr
->start
, jnl
->jhdr
->end
, free_space(jnl
), jnl
->active_start
);
2056 if (counter
> 7500) {
2057 printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl
->jdev_name
);
2061 // make sure there's space in the journal to hold this transaction
// Fast path: enough space and no pending old_start entries to retire.
2062 if (free_space(jnl
) > desired_size
&& jnl
->old_start
[0] == 0) {
2066 // here's where we lazily bump up jnl->jhdr->start. we'll consume
2067 // entries until there is enough space for the next transaction.
2069 old_start_empty
= 1;
2071 for(i
=0; i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]); i
++) {
// High bit set = that transaction is still being flushed: drop the
// old_start lock, kick the fs flush callback, sleep, and re-check
// (bounded by lcl_counter to panic on a stuck flush).
2075 while (jnl
->old_start
[i
] & 0x8000000000000000LL
) {
2076 if (lcl_counter
++ > 100) {
2077 panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
2078 jnl
->old_start
[i
], jnl
);
2081 unlock_oldstart(jnl
);
2083 jnl
->flush(jnl
->flush_arg
);
2085 tsleep((caddr_t
)jnl
, PRIBIO
, "check_free_space1", 1);
2089 if (jnl
->old_start
[i
] == 0) {
// Retire this completed entry: advance jhdr->start to it and clear it.
2093 old_start_empty
= 0;
2094 jnl
->jhdr
->start
= jnl
->old_start
[i
];
2095 jnl
->old_start
[i
] = 0;
2096 if (free_space(jnl
) > desired_size
) {
2097 unlock_oldstart(jnl
);
2098 write_journal_header(jnl
);
2103 unlock_oldstart(jnl
);
2105 // if we bumped the start, loop and try again
2106 if (i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0])) {
2108 } else if (old_start_empty
) {
2110 // if there is nothing in old_start anymore then we can
2111 // bump the jhdr->start to be the same as active_start
2112 // since it is possible there was only one very large
2113 // transaction in the old_start array. if we didn't do
2114 // this then jhdr->start would never get updated and we
2115 // would wind up looping until we hit the panic at the
2116 // start of the loop.
2118 jnl
->jhdr
->start
= jnl
->active_start
;
2119 write_journal_header(jnl
);
2124 // if the file system gave us a flush function, call it to so that
2125 // it can flush some blocks which hopefully will cause some transactions
2126 // to complete and thus free up space in the journal.
2128 jnl
->flush(jnl
->flush_arg
);
2131 // wait for a while to avoid being cpu-bound (this will
2132 // put us to sleep for 10 milliseconds)
2133 tsleep((caddr_t
)jnl
, PRIBIO
, "check_free_space2", 1);
2140 * Allocate a new active transaction.
// Allocates a transaction struct plus its tbuffer, initializes the embedded
// block_list_header (max_blocks, num_blocks, bytes_used, checksum flags),
// assigns the next journal sequence number, and installs the transaction as
// jnl->active_tr. On tbuffer allocation failure the transaction is freed and
// active_tr is cleared.
// NOTE(review): extraction dropped lines (opening comment delimiter, return
// statements, braces); comments annotate only the visible logic.
2143 journal_allocate_transaction(journal
*jnl
)
2147 MALLOC_ZONE(tr
, transaction
*, sizeof(transaction
), M_JNL_TR
, M_WAITOK
);
2148 memset(tr
, 0, sizeof(transaction
));
2150 tr
->tbuffer_size
= jnl
->tbuffer_size
;
2152 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&tr
->tbuffer
, tr
->tbuffer_size
)) {
2153 FREE_ZONE(tr
, sizeof(transaction
), M_JNL_TR
);
2154 jnl
->active_tr
= NULL
;
2158 // journal replay code checksum check depends on this.
// Zero only the checksummed prefix; fill the rest with a visible pattern.
2159 memset(tr
->tbuffer
, 0, BLHDR_CHECKSUM_SIZE
);
2160 // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
2161 memset(tr
->tbuffer
+ BLHDR_CHECKSUM_SIZE
, 0x5a, jnl
->jhdr
->blhdr_size
- BLHDR_CHECKSUM_SIZE
);
// The block-list header lives at the front of the tbuffer. max_blocks is
// the entry capacity of one blhdr (minus 1 for the header entry itself).
2163 tr
->blhdr
= (block_list_header
*)tr
->tbuffer
;
2164 tr
->blhdr
->max_blocks
= (jnl
->jhdr
->blhdr_size
/ sizeof(block_info
)) - 1;
2165 tr
->blhdr
->num_blocks
= 1; // accounts for this header block
2166 tr
->blhdr
->bytes_used
= jnl
->jhdr
->blhdr_size
;
2167 tr
->blhdr
->flags
= BLHDR_CHECK_CHECKSUMS
| BLHDR_FIRST_HEADER
;
// Pre-increment: each new transaction gets a fresh sequence number.
2169 tr
->sequence_num
= ++jnl
->jhdr
->sequence_num
;
2171 tr
->total_bytes
= jnl
->jhdr
->blhdr_size
;
2174 jnl
->active_tr
= tr
;
// Begins a transaction on the journal. Supports nesting: if the calling
// thread already owns the journal it just bumps nested_count. Otherwise it
// takes ownership, retires old completed transactions, waits for sufficient
// journal space (check_free_space), and either reuses the buffered cur_tr or
// allocates a fresh transaction. On failure, ownership is released.
// NOTE(review): extraction dropped lines (the journal lock acquisition,
// return statements, the bad_start label, braces); comments annotate only
// the visible logic.
2180 journal_start_transaction(journal
*jnl
)
// An invalid journal cannot start new transactions.
2186 if (jnl
->flags
& JOURNAL_INVALID
) {
// Nested call from the owning thread: must already have an active tr.
2190 if (jnl
->owner
== current_thread()) {
2191 if (jnl
->active_tr
== NULL
) {
2192 panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
2193 jnl
, jnl
->owner
, current_thread());
2195 jnl
->nested_count
++;
// Fresh acquisition: journal state must be fully idle.
2201 if (jnl
->owner
!= NULL
|| jnl
->nested_count
!= 0 || jnl
->active_tr
!= NULL
) {
2202 panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
2203 jnl
->owner
, jnl
->nested_count
, jnl
->active_tr
, jnl
);
2206 jnl
->owner
= current_thread();
2207 jnl
->nested_count
= 1;
2209 free_old_stuff(jnl
);
2211 // make sure there's room in the journal
2212 if (free_space(jnl
) < jnl
->tbuffer_size
) {
2213 // this is the call that really waits for space to free up
2214 // as well as updating jnl->jhdr->start
2215 if (check_free_space(jnl
, jnl
->tbuffer_size
) != 0) {
2216 printf("jnl: %s: start transaction failed: no space\n", jnl
->jdev_name
);
2222 // if there's a buffered transaction, use it.
2224 jnl
->active_tr
= jnl
->cur_tr
;
2230 ret
= journal_allocate_transaction(jnl
);
2235 // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);
// Failure path: relinquish ownership and unlock before returning.
2241 jnl
->nested_count
= 0;
2242 unlock_journal(jnl
);
2248 journal_modify_block_start(journal
*jnl
, struct buf
*bp
)
2254 if (jnl
->flags
& JOURNAL_INVALID
) {
2258 // XXXdbg - for debugging I want this to be true. later it may
2259 // not be necessary.
2260 if ((buf_flags(bp
) & B_META
) == 0) {
2261 panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp
, jnl
);
2264 tr
= jnl
->active_tr
;
2265 CHECK_TRANSACTION(tr
);
2267 if (jnl
->owner
!= current_thread()) {
2268 panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2269 jnl
, jnl
->owner
, current_thread());
2272 free_old_stuff(jnl
);
2274 //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
2275 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2277 // can't allow blocks that aren't an even multiple of the
2278 // underlying block size.
2279 if ((buf_size(bp
) % jnl
->jhdr
->jhdr_size
) != 0) {
2280 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
2281 buf_size(bp
), jnl
->jhdr
->jhdr_size
);
2285 // make sure that this transaction isn't bigger than the whole journal
2286 if (tr
->total_bytes
+buf_size(bp
) >= (jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
)) {
2287 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
2288 tr
->total_bytes
, (tr
->jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
), buf_size(bp
), tr
, bp
);
2292 // if the block is dirty and not already locked we have to write
2293 // it out before we muck with it because it has data that belongs
2294 // (presumably) to another transaction.
2296 if ((buf_flags(bp
) & (B_DELWRI
| B_LOCKED
)) == B_DELWRI
) {
2298 if (buf_flags(bp
) & B_ASYNC
) {
2299 panic("modify_block_start: bp @ %p has async flag set!\n", bp
);
2302 // this will cause it to not be buf_brelse()'d
2303 buf_setflags(bp
, B_NORELSE
);
2306 buf_setflags(bp
, B_LOCKED
);
2312 journal_modify_block_abort(journal
*jnl
, struct buf
*bp
)
2315 block_list_header
*blhdr
;
2320 tr
= jnl
->active_tr
;
2323 // if there's no active transaction then we just want to
2324 // call buf_brelse() and return since this is just a block
2325 // that happened to be modified as part of another tr.
2332 if (jnl
->flags
& JOURNAL_INVALID
) {
2336 CHECK_TRANSACTION(tr
);
2338 if (jnl
->owner
!= current_thread()) {
2339 panic("jnl: modify_block_abort: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2340 jnl
, jnl
->owner
, current_thread());
2343 free_old_stuff(jnl
);
2345 // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
2347 // first check if it's already part of this transaction
2348 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
2349 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2350 if (bp
== blhdr
->binfo
[i
].b
.bp
) {
2351 if (buf_size(bp
) != blhdr
->binfo
[i
].bsize
) {
2352 panic("jnl: bp @ %p changed size on me! (%d vs. %lu, jnl %p)\n",
2353 bp
, buf_size(bp
), blhdr
->binfo
[i
].bsize
, jnl
);
2359 if (i
< blhdr
->num_blocks
) {
2365 // if blhdr is null, then this block has only had modify_block_start
2366 // called on it as part of the current transaction. that means that
2367 // it is ok to clear the LOCKED bit since it hasn't actually been
2368 // modified. if blhdr is non-null then modify_block_end was called
2369 // on it and so we need to keep it locked in memory.
2371 if (blhdr
== NULL
) {
2372 buf_clearflags(bp
, B_LOCKED
);
2381 journal_modify_block_end(journal
*jnl
, struct buf
*bp
, void (*func
)(struct buf
*bp
, void *arg
), void *arg
)
2384 int tbuffer_offset
=0;
2386 block_list_header
*blhdr
, *prev
=NULL
;
2391 if (jnl
->flags
& JOURNAL_INVALID
) {
2395 tr
= jnl
->active_tr
;
2396 CHECK_TRANSACTION(tr
);
2398 if (jnl
->owner
!= current_thread()) {
2399 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2400 jnl
, jnl
->owner
, current_thread());
2403 free_old_stuff(jnl
);
2405 //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n",
2406 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2408 if ((buf_flags(bp
) & B_LOCKED
) == 0) {
2409 panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp
, jnl
);
2412 // first check if it's already part of this transaction
2413 for(blhdr
=tr
->blhdr
; blhdr
; prev
=blhdr
,blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
2414 tbuffer_offset
= jnl
->jhdr
->blhdr_size
;
2416 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2417 if (bp
== blhdr
->binfo
[i
].b
.bp
) {
2418 if (buf_size(bp
) != blhdr
->binfo
[i
].bsize
) {
2419 panic("jnl: bp @ %p changed size on me! (%d vs. %lu, jnl %p)\n",
2420 bp
, buf_size(bp
), blhdr
->binfo
[i
].bsize
, jnl
);
2424 tbuffer_offset
+= blhdr
->binfo
[i
].bsize
;
2427 if (i
< blhdr
->num_blocks
) {
2434 && (prev
->num_blocks
+1) <= prev
->max_blocks
2435 && (prev
->bytes_used
+buf_size(bp
)) <= (uint32_t)tr
->tbuffer_size
) {
2437 } else if (blhdr
== NULL
) {
2438 block_list_header
*nblhdr
;
2441 panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl
, bp
);
2444 // we got to the end of the list, didn't find the block and there's
2445 // no room in the block_list_header pointed to by prev
2447 // we allocate another tbuffer and link it in at the end of the list
2448 // through prev->binfo[0].bnum. that's a skanky way to do things but
2449 // avoids having yet another linked list of small data structures to manage.
2451 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&nblhdr
, tr
->tbuffer_size
)) {
2452 panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n",
2453 tr
, tr
->total_bytes
);
2456 // journal replay code checksum check depends on this.
2457 memset(nblhdr
, 0, BLHDR_CHECKSUM_SIZE
);
2458 // Fill up the rest of the block with unimportant bytes
2459 memset(nblhdr
+ BLHDR_CHECKSUM_SIZE
, 0x5a, jnl
->jhdr
->blhdr_size
- BLHDR_CHECKSUM_SIZE
);
2461 // initialize the new guy
2462 nblhdr
->max_blocks
= (jnl
->jhdr
->blhdr_size
/ sizeof(block_info
)) - 1;
2463 nblhdr
->num_blocks
= 1; // accounts for this header block
2464 nblhdr
->bytes_used
= jnl
->jhdr
->blhdr_size
;
2465 nblhdr
->flags
= BLHDR_CHECK_CHECKSUMS
;
2468 tr
->total_bytes
+= jnl
->jhdr
->blhdr_size
;
2470 // then link him in at the end
2471 prev
->binfo
[0].bnum
= (off_t
)((long)nblhdr
);
2473 // and finally switch to using the new guy
2475 tbuffer_offset
= jnl
->jhdr
->blhdr_size
;
2480 if ((i
+1) > blhdr
->max_blocks
) {
2481 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i
, blhdr
->max_blocks
);
2484 // if the function pointer is not set then copy the
2485 // block of data now. if the function pointer is set
2486 // the copy will happen after calling the callback in
2487 // end_transaction() just before it goes to disk.
2490 blkptr
= (char *)&((char *)blhdr
)[tbuffer_offset
];
2491 memcpy(blkptr
, (char *)0 + buf_dataptr(bp
), buf_size(bp
));
2494 // if this is true then this is a new block we haven't seen
2495 if (i
>= blhdr
->num_blocks
) {
2501 bsize
= buf_size(bp
);
2503 blhdr
->binfo
[i
].bnum
= (off_t
)(buf_blkno(bp
));
2504 blhdr
->binfo
[i
].bsize
= bsize
;
2505 blhdr
->binfo
[i
].b
.bp
= bp
;
2507 void *old_func
=NULL
, *old_arg
=NULL
;
2509 buf_setfilter(bp
, func
, arg
, &old_func
, &old_arg
);
2510 if (old_func
!= NULL
) {
2511 panic("jnl: modify_block_end: old func %p / arg %p", old_func
, old_arg
);
2515 blhdr
->bytes_used
+= bsize
;
2516 tr
->total_bytes
+= bsize
;
2518 blhdr
->num_blocks
++;
2526 journal_kill_block(journal
*jnl
, struct buf
*bp
)
2530 block_list_header
*blhdr
;
2535 if (jnl
->flags
& JOURNAL_INVALID
) {
2539 tr
= jnl
->active_tr
;
2540 CHECK_TRANSACTION(tr
);
2542 if (jnl
->owner
!= current_thread()) {
2543 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2544 jnl
, jnl
->owner
, current_thread());
2547 free_old_stuff(jnl
);
2549 bflags
= buf_flags(bp
);
2551 if ( !(bflags
& B_LOCKED
))
2552 panic("jnl: modify_block_end: called with bp not B_LOCKED");
2555 * bp must be BL_BUSY and B_LOCKED
2557 // first check if it's already part of this transaction
2558 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
2560 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2561 if (bp
== blhdr
->binfo
[i
].b
.bp
) {
2564 buf_clearflags(bp
, B_LOCKED
);
2566 // this undoes the vnode_ref() in journal_modify_block_end()
2568 vnode_rele_ext(vp
, 0, 1);
2570 // if the block has the DELWRI and FILTER bits sets, then
2571 // things are seriously weird. if it was part of another
2572 // transaction then journal_modify_block_start() should
2573 // have force it to be written.
2575 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
2576 // panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
2578 tr
->num_killed
+= buf_size(bp
);
2580 blhdr
->binfo
[i
].b
.bp
= NULL
;
2581 blhdr
->binfo
[i
].bnum
= (off_t
)-1;
2583 buf_markinvalid(bp
);
2590 if (i
< blhdr
->num_blocks
) {
2600 journal_binfo_cmp(const void *a
, const void *b
)
2602 const block_info
*bi_a
= (const struct block_info
*)a
;
2603 const block_info
*bi_b
= (const struct block_info
*)b
;
2606 if (bi_a
->b
.bp
== NULL
) {
2609 if (bi_b
->b
.bp
== NULL
) {
2613 // don't have to worry about negative block
2614 // numbers so this is ok to do.
2616 res
= (buf_blkno(bi_a
->b
.bp
) - buf_blkno(bi_b
->b
.bp
));
2623 * End a transaction. If the transaction is small enough, and we're not forcing
2624 * a write to disk, the "active" transaction becomes the "current" transaction,
2625 * and will be reused for the next transaction that is started (group commit).
2627 * If the transaction gets written to disk (because force_it is true, or no
2628 * group commit, or the transaction is sufficiently full), the blocks get
2629 * written into the journal first, then they are written asynchronously. When
2630 * those async writes complete, the transaction can be freed and removed from
2633 * An optional callback can be supplied. If given, it is called after
2634 * the blocks have been written to the journal, but before the async writes
2635 * of those blocks to their normal on-disk locations. This is used by
2636 * journal_relocate so that the location of the journal can be changed and
2637 * flushed to disk before the blocks get written to their normal locations.
2638 * Note that the callback is only called if the transaction gets written to
2639 * the journal during this end_transaction call; you probably want to set the
2643 * tr Transaction to add to the journal
2644 * force_it If true, force this transaction to the on-disk journal immediately.
2645 * callback See description above. Pass NULL for no callback.
2646 * callback_arg Argument passed to callback routine.
2650 * -1 An error occurred. The journal is marked invalid.
2653 end_transaction(transaction
*tr
, int force_it
, errno_t (*callback
)(void*), void *callback_arg
)
2658 journal
*jnl
= tr
->jnl
;
2659 struct buf
*bp
, **bparray
;
2660 block_list_header
*blhdr
=NULL
, *next
=NULL
;
2661 size_t tbuffer_offset
;
2664 panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
2665 jnl
, jnl
->cur_tr
, tr
);
2668 // if there weren't any modified blocks in the transaction
2669 // just save off the transaction pointer and return.
2670 if (tr
->total_bytes
== jnl
->jhdr
->blhdr_size
) {
2675 // if our transaction buffer isn't very full, just hang
2676 // on to it and don't actually flush anything. this is
2677 // what is known as "group commit". we will flush the
2678 // transaction buffer if it's full or if we have more than
2679 // one of them so we don't start hogging too much memory.
2682 && (jnl
->flags
& JOURNAL_NO_GROUP_COMMIT
) == 0
2683 && tr
->num_blhdrs
< 3
2684 && (tr
->total_bytes
<= ((tr
->tbuffer_size
*tr
->num_blhdrs
) - tr
->tbuffer_size
/8))) {
2691 // if we're here we're going to flush the transaction buffer to disk.
2692 // make sure there is room in the journal first.
2693 check_free_space(jnl
, tr
->total_bytes
);
2695 // range check the end index
2696 if (jnl
->jhdr
->end
<= 0 || jnl
->jhdr
->end
> jnl
->jhdr
->size
) {
2697 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
2698 jnl
->jhdr
->end
, jnl
->jhdr
->size
);
2701 // this transaction starts where the current journal ends
2702 tr
->journal_start
= jnl
->jhdr
->end
;
2703 end
= jnl
->jhdr
->end
;
2706 // if the first entry in old_start[] isn't free yet, loop calling the
2707 // file system flush routine until it is (or we panic).
2711 while ((jnl
->old_start
[0] & 0x8000000000000000LL
) != 0) {
2713 unlock_oldstart(jnl
);
2716 jnl
->flush(jnl
->flush_arg
);
2719 // yield the cpu so others can get in to clear the lock bit
2720 (void)tsleep((void *)jnl
, PRIBIO
, "jnl-old-start-sleep", 1);
2725 panic("jnl: transaction that started at 0x%llx is not completing! jnl %p\n",
2726 jnl
->old_start
[0] & (~0x8000000000000000LL
), jnl
);
2731 // slide everyone else down and put our latest guy in the last
2732 // entry in the old_start array
2734 memcpy(&jnl
->old_start
[0], &jnl
->old_start
[1], sizeof(jnl
->old_start
)-sizeof(jnl
->old_start
[0]));
2735 jnl
->old_start
[sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]) - 1] = tr
->journal_start
| 0x8000000000000000LL
;
2737 unlock_oldstart(jnl
);
2740 // for each block, make sure that the physical block # is set
2741 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=next
) {
2744 tbuffer_offset
= jnl
->jhdr
->blhdr_size
;
2745 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2750 bp
= blhdr
->binfo
[i
].b
.bp
;
2752 // if this block has a callback function set, call
2753 // it now and then copy the data from the bp into
2756 void (*func
)(struct buf
*, void *);
2759 buf_setfilter(bp
, NULL
, NULL
, (void **)&func
, &arg
);
2762 // acquire the bp here so that we can safely
2763 // mess around with its data. buf_acquire()
2764 // will return EAGAIN if the buffer was busy,
2765 // so loop trying again.
2767 errno
= buf_acquire(bp
, 0, 0, 0);
2768 } while (errno
== EAGAIN
);
2772 // call the hook function and then copy the
2773 // data into the transaction buffer...
2776 blkptr
= (char *)&((char *)blhdr
)[tbuffer_offset
];
2777 memcpy(blkptr
, (char *)buf_dataptr(bp
), buf_size(bp
));
2781 panic("could not acquire bp %p (err %d)\n", bp
, errno
);
2785 } else { // bp == NULL, only true if a block was "killed"
2786 if (blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
2787 panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
2788 blhdr
->binfo
[i
].bnum
, jnl
, tr
);
2791 tbuffer_offset
+= blhdr
->binfo
[i
].bsize
;
2795 tbuffer_offset
+= blhdr
->binfo
[i
].bsize
;
2798 blkno
= buf_blkno(bp
);
2799 lblkno
= buf_lblkno(bp
);
2801 if (vp
== NULL
&& lblkno
== blkno
) {
2802 printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd. aborting the transaction (tr %p jnl %p).\n",
2803 jnl
->jdev_name
, bp
, lblkno
, blkno
, tr
, jnl
);
2807 // if the lblkno is the same as blkno and this bp isn't
2808 // associated with the underlying file system device then
2809 // we need to call bmap() to get the actual physical block.
2811 if ((lblkno
== blkno
) && (vp
!= jnl
->fsdev
)) {
2813 size_t contig_bytes
;
2815 if (VNOP_BLKTOOFF(vp
, lblkno
, &f_offset
)) {
2816 printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl
->jdev_name
, bp
, jnl
);
2819 if (VNOP_BLOCKMAP(vp
, f_offset
, buf_count(bp
), &blkno
, &contig_bytes
, NULL
, 0, NULL
)) {
2820 printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl
->jdev_name
, bp
, jnl
);
2823 if ((uint32_t)contig_bytes
< buf_count(bp
)) {
2824 printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl
->jdev_name
, bp
, jnl
);
2827 buf_setblkno(bp
, blkno
);
2829 // update this so we write out the correct physical block number!
2830 blhdr
->binfo
[i
].bnum
= (off_t
)(blkno
);
2833 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
2838 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
2839 amt
= blhdr
->bytes_used
;
2841 blhdr
->binfo
[0].b
.sequence_num
= tr
->sequence_num
;
2843 blhdr
->checksum
= 0;
2844 blhdr
->checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
2846 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&bparray
, blhdr
->num_blocks
* sizeof(struct buf
*))) {
2847 panic("can't allocate %lu bytes for bparray\n", blhdr
->num_blocks
* sizeof(struct buf
*));
2850 // calculate individual block checksums
2851 tbuffer_offset
= jnl
->jhdr
->blhdr_size
;
2852 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2853 bparray
[i
] = blhdr
->binfo
[i
].b
.bp
;
2855 blhdr
->binfo
[i
].b
.cksum
= calc_checksum(&((char *)blhdr
)[tbuffer_offset
], blhdr
->binfo
[i
].bsize
);
2857 blhdr
->binfo
[i
].b
.cksum
= 0;
2860 tbuffer_offset
+= blhdr
->binfo
[i
].bsize
;
2863 ret
= write_journal_data(jnl
, &end
, blhdr
, amt
);
2865 // always put the bp pointers back
2866 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2867 blhdr
->binfo
[i
].b
.bp
= bparray
[i
];
2870 kmem_free(kernel_map
, (vm_offset_t
)bparray
, blhdr
->num_blocks
* sizeof(struct buf
*));
2873 printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n",
2874 jnl
->jdev_name
, ret
, amt
);
2880 jnl
->jhdr
->end
= end
; // update where the journal now ends
2881 tr
->journal_end
= end
; // the transaction ends here too
2882 if (tr
->journal_start
== 0 || tr
->journal_end
== 0) {
2883 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
2884 tr
->journal_start
, tr
->journal_end
);
2887 if (write_journal_header(jnl
) != 0) {
2892 * If the caller supplied a callback, call it now that the blocks have been
2893 * written to the journal. This is used by journal_relocate so, for example,
2894 * the file system can change its pointer to the new journal.
2896 if (callback
!= NULL
&& callback(callback_arg
) != 0) {
2901 // setup for looping through all the blhdr's. we null out the
2902 // tbuffer and blhdr fields so that they're not used any more.
2908 // the buffer_flushed_callback will only be called for the
2909 // real blocks that get flushed so we have to account for
2910 // the block_list_headers here.
2912 tr
->num_flushed
= tr
->num_blhdrs
* jnl
->jhdr
->blhdr_size
;
2914 // for each block, set the iodone callback and unlock it
2915 for(; blhdr
; blhdr
=next
) {
2917 // we can re-order the buf ptrs because everything is written out already
2918 qsort(&blhdr
->binfo
[1], blhdr
->num_blocks
-1, sizeof(block_info
), journal_binfo_cmp
);
2920 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2921 if (blhdr
->binfo
[i
].b
.bp
== NULL
) {
2925 bp
= blhdr
->binfo
[i
].b
.bp
;
2927 // have to pass BAC_REMOVE here because we're going to bawrite()
2928 // the buffer when we're done
2930 errno
= buf_acquire(bp
, BAC_REMOVE
, 0, 0);
2931 } while (errno
== EAGAIN
);
2934 struct vnode
*save_vp
;
2937 if ((buf_flags(bp
) & (B_LOCKED
|B_DELWRI
)) != (B_LOCKED
|B_DELWRI
)) {
2938 if (jnl
->flags
& JOURNAL_CLOSE_PENDING
) {
2939 buf_clearflags(bp
, B_LOCKED
);
2943 panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp
, buf_flags(bp
));
2946 save_vp
= buf_vnode(bp
);
2948 buf_setfilter(bp
, buffer_flushed_callback
, tr
, &cur_filter
, NULL
);
2951 panic("jnl: bp @ %p (blkno %qd, vp %p) has non-null iodone (%p) buffflushcb %p\n",
2952 bp
, buf_blkno(bp
), save_vp
, cur_filter
, buffer_flushed_callback
);
2954 buf_clearflags(bp
, B_LOCKED
);
2956 // kicking off the write here helps performance
2958 // XXXdbg this is good for testing: buf_bdwrite(bp);
2961 // this undoes the vnode_ref() in journal_modify_block_end()
2962 vnode_rele_ext(save_vp
, 0, 1);
2964 printf("jnl: %s: end_transaction: could not acquire block %p (errno %d)!\n",
2965 jnl
->jdev_name
,bp
, errno
);
2969 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
2971 // we can free blhdr here since we won't need it any more
2972 blhdr
->binfo
[0].bnum
= 0xdeadc0de;
2973 kmem_free(kernel_map
, (vm_offset_t
)blhdr
, tr
->tbuffer_size
);
2976 //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
2977 // tr, tr->journal_start, tr->journal_end);
2982 jnl
->flags
|= JOURNAL_INVALID
;
2983 jnl
->old_start
[sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]) - 1] &= ~0x8000000000000000LL
;
2984 abort_transaction(jnl
, tr
);
2989 abort_transaction(journal
*jnl
, transaction
*tr
)
2993 block_list_header
*blhdr
, *next
;
2995 struct vnode
*save_vp
;
2997 // for each block list header, iterate over the blocks then
2998 // free up the memory associated with the block list.
3000 // for each block, clear the lock bit and release it.
3002 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=next
) {
3004 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
3005 if (blhdr
->binfo
[i
].b
.bp
== NULL
) {
3008 if ( (buf_vnode(blhdr
->binfo
[i
].b
.bp
) == NULL
) ||
3009 !(buf_flags(blhdr
->binfo
[i
].b
.bp
) & B_LOCKED
) ) {
3013 errno
= buf_meta_bread(buf_vnode(blhdr
->binfo
[i
].b
.bp
),
3014 buf_lblkno(blhdr
->binfo
[i
].b
.bp
),
3015 buf_size(blhdr
->binfo
[i
].b
.bp
),
3019 if (bp
!= blhdr
->binfo
[i
].b
.bp
) {
3020 panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n",
3021 bp
, blhdr
->binfo
[i
].b
.bp
, jnl
);
3024 // releasing a bp marked invalid
3025 // also clears the locked and delayed state
3026 buf_markinvalid(bp
);
3027 save_vp
= buf_vnode(bp
);
3031 vnode_rele_ext(save_vp
, 0, 1);
3033 printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n",
3034 jnl
->jdev_name
, blhdr
->binfo
[i
].bnum
, blhdr
->binfo
[i
].b
.bp
);
3041 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
3043 // we can free blhdr here since we won't need it any more
3044 blhdr
->binfo
[0].bnum
= 0xdeadc0de;
3045 kmem_free(kernel_map
, (vm_offset_t
)blhdr
, tr
->tbuffer_size
);
3050 tr
->total_bytes
= 0xdbadc0de;
3051 FREE_ZONE(tr
, sizeof(transaction
), M_JNL_TR
);
3056 journal_end_transaction(journal
*jnl
)
3063 if ((jnl
->flags
& JOURNAL_INVALID
) && jnl
->owner
== NULL
) {
3067 if (jnl
->owner
!= current_thread()) {
3068 panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
3069 jnl
, jnl
->owner
, current_thread());
3072 free_old_stuff(jnl
);
3074 jnl
->nested_count
--;
3075 if (jnl
->nested_count
> 0) {
3077 } else if (jnl
->nested_count
< 0) {
3078 panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl
, jnl
->nested_count
);
3081 if (jnl
->flags
& JOURNAL_INVALID
) {
3082 if (jnl
->active_tr
) {
3083 if (jnl
->cur_tr
!= NULL
) {
3084 panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
3085 jnl
, jnl
->active_tr
, jnl
->cur_tr
);
3088 tr
= jnl
->active_tr
;
3089 jnl
->active_tr
= NULL
;
3090 abort_transaction(jnl
, tr
);
3094 unlock_journal(jnl
);
3099 tr
= jnl
->active_tr
;
3100 CHECK_TRANSACTION(tr
);
3102 // clear this out here so that when check_free_space() calls
3103 // the FS flush function, we don't panic in journal_flush()
3104 // if the FS were to call that. note: check_free_space() is
3105 // called from end_transaction().
3107 jnl
->active_tr
= NULL
;
3108 ret
= end_transaction(tr
, 0, NULL
, NULL
);
3111 unlock_journal(jnl
);
3118 journal_flush(journal
*jnl
)
3120 int need_signal
= 0;
3124 if (jnl
->flags
& JOURNAL_INVALID
) {
3128 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL
, DBG_JOURNAL_FLUSH
))
3129 | DBG_FUNC_START
, 0, 0, 0, 0, 0);
3131 if (jnl
->owner
!= current_thread()) {
3136 free_old_stuff(jnl
);
3138 // if we're not active, flush any buffered transactions
3139 if (jnl
->active_tr
== NULL
&& jnl
->cur_tr
) {
3140 transaction
*tr
= jnl
->cur_tr
;
3143 end_transaction(tr
, 1, NULL
, NULL
); // force it to get flushed
3147 unlock_journal(jnl
);
3150 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL
, DBG_JOURNAL_FLUSH
))
3151 | DBG_FUNC_END
, 0, 0, 0, 0, 0);
3157 journal_active(journal
*jnl
)
3159 if (jnl
->flags
& JOURNAL_INVALID
) {
3163 return (jnl
->active_tr
== NULL
) ? 0 : 1;
3167 journal_owner(journal
*jnl
)
3172 int journal_uses_fua(journal
*jnl
)
3174 if (jnl
->flags
& JOURNAL_DO_FUA_WRITES
)
3180 * Relocate the journal.
3182 * You provide the new starting offset and size for the journal. You may
3183 * optionally provide a new tbuffer_size; passing zero defaults to not
3184 * changing the tbuffer size except as needed to fit within the new journal
3187 * You must have already started a transaction. The transaction may contain
3188 * modified blocks (such as those needed to deallocate the old journal,
3189 * allocate the new journal, and update the location and size of the journal
3190 * in filesystem-private structures). Any transactions prior to the active
3191 * transaction will be flushed to the old journal. The new journal will be
3192 * initialized, and the blocks from the active transaction will be written to
3195 * The caller will need to update the structures that identify the location
3196 * and size of the journal. These updates should be made in the supplied
3197 * callback routine. These updates must NOT go into a transaction. You should
3198 * force these updates to the media before returning from the callback. In the
3199 * event of a crash, either the old journal will be found, with an empty journal,
3200 * or the new journal will be found with the contents of the active transaction.
3202 * Upon return from the callback, the blocks from the active transaction are
3203 * written to their normal locations on disk.
3205 * (Remember that we have to ensure that blocks get committed to the journal
3206 * before being committed to their normal locations. But the blocks don't count
3207 * as committed until the new journal is pointed at.)
3209 * Upon return, there is still an active transaction: newly allocated, and
3210 * with no modified blocks. Call journal_end_transaction as normal. You may
3211 * modify additional blocks before calling journal_end_transaction, and those
3212 * blocks will (eventually) go to the relocated journal.
3215 * jnl The (opened) journal to relocate.
3216 * offset The new journal byte offset (from start of the journal device).
3217 * journal_size The size, in bytes, of the new journal.
3218 * tbuffer_size The new desired transaction buffer size. Pass zero to keep
3219 * the same size as the current journal. The size will be
3220 * modified as needed to fit the new journal.
3221 * callback Routine called after the new journal has been initialized,
3222 * and the active transaction written to the new journal, but
3223 * before the blocks are written to their normal locations.
3224 * Pass NULL for no callback.
3225 * callback_arg An argument passed to the callback routine.
3229 * EINVAL The offset is not block aligned
3230 * EINVAL The journal_size is not a multiple of the block size
3231 * EINVAL The journal is invalid
3232 * (any) An error returned by journal_flush.
3235 int journal_relocate(journal
*jnl
, off_t offset
, off_t journal_size
, int32_t tbuffer_size
,
3236 errno_t (*callback
)(void *), void *callback_arg
)
3242 * Sanity check inputs, and adjust the size of the transaction buffer.
3244 if ((offset
% jnl
->jhdr
->jhdr_size
) != 0) {
3245 printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n",
3246 jnl
->jdev_name
, offset
, jnl
->jhdr
->jhdr_size
);
3249 if ((journal_size
% jnl
->jhdr
->jhdr_size
) != 0) {
3250 printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n",
3251 jnl
->jdev_name
, journal_size
, jnl
->jhdr
->jhdr_size
);
3257 /* Guarantee we own the active transaction. */
3258 if (jnl
->flags
& JOURNAL_INVALID
) {
3261 if (jnl
->owner
!= current_thread()) {
3262 panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n",
3263 jnl
, jnl
->owner
, current_thread());
3266 if (tbuffer_size
== 0)
3267 tbuffer_size
= jnl
->tbuffer_size
;
3268 size_up_tbuffer(jnl
, tbuffer_size
, jnl
->jhdr
->jhdr_size
);
3271 * Flush any non-active transactions. We have to temporarily hide the
3272 * active transaction to make journal_flush flush out non-active but
3273 * current (unwritten) transactions.
3275 tr
= jnl
->active_tr
;
3276 CHECK_TRANSACTION(tr
);
3277 jnl
->active_tr
= NULL
;
3278 ret
= journal_flush(jnl
);
3279 jnl
->active_tr
= tr
;
3284 /* Update the journal's offset and size in memory. */
3285 jnl
->jdev_offset
= offset
;
3286 jnl
->jhdr
->start
= jnl
->jhdr
->end
= jnl
->jhdr
->jhdr_size
;
3287 jnl
->jhdr
->size
= journal_size
;
3288 jnl
->active_start
= jnl
->jhdr
->start
;
3291 * Force the active transaction to be written to the new journal. Call the
3292 * supplied callback after the blocks have been written to the journal, but
3293 * before they get written to their normal on-disk locations.
3295 jnl
->active_tr
= NULL
;
3296 ret
= end_transaction(tr
, 1, callback
, callback_arg
);
3298 printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl
->jdev_name
, ret
);
3303 * Create a new, empty transaction to be the active transaction. This way
3304 * our caller can use journal_end_transaction as usual.
3306 ret
= journal_allocate_transaction(jnl
);
3308 printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl
->jdev_name
, ret
);
3315 jnl
->flags
|= JOURNAL_INVALID
;
3316 abort_transaction(jnl
, tr
);