2 * Copyright (c) 1995-2004 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
31 // This file implements a simple write-ahead journaling layer.
32 // In theory any file system can make use of it by calling these
33 // functions when the fs wants to modify meta-data blocks. See
34 // vfs_journal.h for a more detailed description of the api and
37 // Dominic Giampaolo (dbg@apple.com)
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/file_internal.h>
47 #include <sys/buf_internal.h>
48 #include <sys/proc_internal.h>
49 #include <sys/mount_internal.h>
50 #include <sys/namei.h>
51 #include <sys/vnode_internal.h>
52 #include <sys/ioctl.h>
55 #include <sys/malloc.h>
56 #include <kern/thread.h>
58 #include <miscfs/specfs/specdev.h>
60 extern task_t kernel_task
;
72 #include <sys/types.h>
77 #include "vfs_journal.h"
80 // number of bytes to checksum in a block_list_header
81 // NOTE: this should be enough to clear out the header
82 // fields as well as the first entry of binfo[]
83 #define BLHDR_CHECKSUM_SIZE 32
87 static int end_transaction(transaction
*tr
, int force_it
);
88 static void abort_transaction(journal
*jnl
, transaction
*tr
);
89 static void dump_journal(journal
*jnl
);
91 static __inline__
void lock_journal(journal
*jnl
);
92 static __inline__
void unlock_journal(journal
*jnl
);
93 static __inline__
void lock_oldstart(journal
*jnl
);
94 static __inline__
void unlock_oldstart(journal
*jnl
);
100 // 3105942 - Coalesce writes to the same block on journal replay
103 typedef struct bucket
{
109 #define STARTING_BUCKETS 256
111 static int add_block(journal
*jnl
, struct bucket
**buf_ptr
, off_t block_num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
);
112 static int grow_table(struct bucket
**buf_ptr
, int num_buckets
, int new_size
);
113 static int lookup_bucket(struct bucket
**buf_ptr
, off_t block_num
, int num_full
);
114 static int do_overlap(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t block_num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
);
115 static int insert_block(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
, int overwriting
);
// CHECK_JOURNAL(jnl): sanity-check macro for a journal pointer.
// The visible tests panic (reporting __FILE__/__LINE__) when jdev or fsdev
// is NULL, when jhdr->magic differs from JOURNAL_HEADER_MAGIC, when
// jhdr->start or jhdr->end is <= 0, larger than jhdr->size or larger than
// 1024*1024*1024, or when jhdr->size itself exceeds 1024*1024*1024.
// NOTE(review): several continuation lines (the null test guarding the
// first panic, closing braces) are missing from this extract.
117 #define CHECK_JOURNAL(jnl) \
120 panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
122 if (jnl->jdev == NULL) { \
123 panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
125 if (jnl->fsdev == NULL) { \
126 panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
128 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
129 panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
130 __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\
132 if ( jnl->jhdr->start <= 0 \
133 || jnl->jhdr->start > jnl->jhdr->size\
134 || jnl->jhdr->start > 1024*1024*1024) {\
135 panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
136 __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\
138 if ( jnl->jhdr->end <= 0 \
139 || jnl->jhdr->end > jnl->jhdr->size\
140 || jnl->jhdr->end > 1024*1024*1024) {\
141 panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
142 __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\
144 if (jnl->jhdr->size > 1024*1024*1024) {\
145 panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
146 __FILE__, __LINE__, jnl->jhdr->size);\
// CHECK_TRANSACTION(tr): sanity-check macro for a transaction pointer.
// The visible tests panic when tr->jnl is NULL, when tr->blhdr does not
// alias tr->tbuffer, when total_bytes is negative, when journal_start or
// journal_end is negative or above 1024*1024*1024, or when
// blhdr->max_blocks is <= 0 or exceeds jhdr->size / jhdr->jhdr_size.
// NOTE(review): the blhdr/tbuffer panic prints pointers with 0x%x, which
// would truncate them where pointers are wider than int -- confirm.
// NOTE(review): some continuation lines are missing from this extract.
150 #define CHECK_TRANSACTION(tr) \
153 panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
155 if (tr->jnl == NULL) {\
156 panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
158 if (tr->blhdr != (block_list_header *)tr->tbuffer) {\
159 panic("%s:%d: blhdr (0x%x) != tbuffer (0x%x)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\
161 if (tr->total_bytes < 0) {\
162 panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\
164 if (tr->journal_start < 0 || tr->journal_start > 1024*1024*1024) {\
165 panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\
167 if (tr->journal_end < 0 || tr->journal_end > 1024*1024*1024) {\
168 panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\
170 if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) {\
171 panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\
// calc_checksum: folds `len` bytes starting at `ptr` into a checksum.
//   ptr - first byte of the region to checksum
//   len - number of bytes to process
// NOTE(review): the lines holding the return type, the declarations of
// `i` and `cksum`, and the return statement are missing from this extract;
// confirm the initial value and the returned expression in the full file.
178 // this isn't a great checksum routine but it will do for now.
179 // we use it to checksum the journal header and the block list
180 // headers that are at the start of each transaction.
183 calc_checksum(char *ptr
, int len
)
187 // this is a lame checksum but for now it'll do
// Each byte is read as an unsigned char; the running value is shifted
// left by 8 bits and XORed with (running value + byte).
188 for(i
=0; i
< len
; i
++, ptr
++) {
189 cksum
= (cksum
<< 8) ^ (cksum
+ *(unsigned char *)ptr
);
// Lock attribute / group objects used by the journal mutexes.
// NOTE(review): the header of the function containing the three
// assignments below is missing from this extract.
198 lck_grp_attr_t
* jnl_group_attr
;
199 lck_attr_t
* jnl_lock_attr
;
200 lck_grp_t
* jnl_mutex_group
;
// Allocate the lock attribute, the group attribute, and a lock group
// named "jnl-mutex" that is created with the group attribute.
205 jnl_lock_attr
= lck_attr_alloc_init();
206 jnl_group_attr
= lck_grp_attr_alloc_init();
207 jnl_mutex_group
= lck_grp_alloc_init("jnl-mutex", jnl_group_attr
);
209 /* Turn on lock debugging */
210 //lck_attr_setdebug(jnl_lock_attr);
// lock_journal: takes the journal's `jlock` mutex with lck_mtx_lock().
213 static __inline__
void
214 lock_journal(journal
*jnl
)
216 lck_mtx_lock(&jnl
->jlock
);
// unlock_journal: releases the journal's `jlock` mutex with lck_mtx_unlock().
219 static __inline__
void
220 unlock_journal(journal
*jnl
)
222 lck_mtx_unlock(&jnl
->jlock
);
// lock_oldstart: takes the journal's `old_start_lock` mutex with lck_mtx_lock().
225 static __inline__
void
226 lock_oldstart(journal
*jnl
)
228 lck_mtx_lock(&jnl
->old_start_lock
);
// unlock_oldstart: releases the journal's `old_start_lock` mutex with
// lck_mtx_unlock().
231 static __inline__
void
232 unlock_oldstart(journal
*jnl
)
234 lck_mtx_unlock(&jnl
->old_start_lock
);
// Bit flags for the `direction` argument of do_journal_io():
// JNL_WRITE / JNL_READ select the transfer direction; JNL_HEADER must be
// set for i/o at offset 0 (do_journal_io panics otherwise).
239 #define JNL_WRITE 0x0001
240 #define JNL_READ 0x0002
241 #define JNL_HEADER 0x8000
// do_journal_io: transfers `len` bytes between `data` and the journal
// device, starting at *offset.
//   jnl       - journal whose jdev / jhdr / jdev_offset are used
//   offset    - in/out journal offset; reset to jhdr->jhdr_size when it
//               reaches jhdr->size (wrap-around past the header)
//   data      - caller buffer
//   len       - total number of bytes requested
//   direction - JNL_READ or JNL_WRITE, optionally OR-ed with JNL_HEADER
// NOTE(review): `curlen` and `io_sz` are int while `len` is size_t, and
// the panics print them with %d -- confirm the sizes cannot exceed INT_MAX.
// NOTE(review): this extract omits a number of lines (return type, braces,
// the loop bookkeeping around the strategy call, the return statement).
244 // This function sets up a fake buf and passes it directly to the
245 // journal device strategy routine (so that it won't get cached in
248 // It also handles range checking the i/o so that we don't write
249 // outside the journal boundaries and it will wrap the i/o back
250 // to the beginning if necessary (skipping over the journal header)
253 do_journal_io(journal
*jnl
, off_t
*offset
, void *data
, size_t len
, int direction
)
255 int err
, io_sz
=0, curlen
=len
;
257 int max_iosize
= 128 * 1024;
258 struct vfsioattr ioattr
;
// Reject offsets outside [0, jhdr->size].
260 if (*offset
< 0 || *offset
> jnl
->jhdr
->size
) {
261 panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset
, jnl
->jhdr
->size
);
// Replace the 128K default with the maximum i/o size that the mount of
// the journal device reports for this direction.
263 vfs_ioattr(vnode_mount(jnl
->jdev
), &ioattr
);
265 if (direction
& JNL_WRITE
)
266 max_iosize
= ioattr
.io_maxwritecnt
;
267 else if (direction
& JNL_READ
)
268 max_iosize
= ioattr
.io_maxreadcnt
;
271 bp
= alloc_io_buf(jnl
->jdev
, 1);
// If the request would run past the end of the journal, either wrap the
// offset (when already at the end) or shorten this pass to what remains.
273 if (*offset
+ (off_t
)curlen
> jnl
->jhdr
->size
&& *offset
!= 0 && jnl
->jhdr
->size
!= 0) {
274 if (*offset
== jnl
->jhdr
->size
) {
275 *offset
= jnl
->jhdr
->jhdr_size
;
277 curlen
= (off_t
)jnl
->jhdr
->size
- *offset
;
281 if (curlen
> max_iosize
) {
286 panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %d\n", curlen
, *offset
, len
);
// Offset 0 is the journal header; only callers passing JNL_HEADER may touch it.
289 if (*offset
== 0 && (direction
& JNL_HEADER
) == 0) {
290 panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen
, data
);
293 if (direction
& JNL_READ
)
294 buf_setflags(bp
, B_READ
);
297 * don't have to set any flags
299 vnode_startwrite(jnl
->jdev
);
301 buf_setsize(bp
, curlen
);
302 buf_setcount(bp
, curlen
);
303 buf_setdataptr(bp
, (uintptr_t)data
);
304 buf_setblkno(bp
, (daddr64_t
) ((jnl
->jdev_offset
+ *offset
) / (off_t
)jnl
->jhdr
->jhdr_size
));
305 buf_setlblkno(bp
, (daddr64_t
) ((jnl
->jdev_offset
+ *offset
) / (off_t
)jnl
->jhdr
->jhdr_size
));
307 err
= VNOP_STRATEGY(bp
);
309 err
= (int)buf_biowait(bp
);
314 printf("jnl: do_jnl_io: strategy err 0x%x\n", err
);
321 // handle wrap-around
322 data
= (char *)data
+ curlen
;
323 curlen
= len
- io_sz
;
324 if (*offset
>= jnl
->jhdr
->size
) {
325 *offset
= jnl
->jhdr
->jhdr_size
;
// read_journal_data: thin wrapper that calls do_journal_io() with JNL_READ
// and returns its result unchanged. `offset` is passed through as the
// in/out journal offset.
334 read_journal_data(journal
*jnl
, off_t
*offset
, void *data
, size_t len
)
336 return do_journal_io(jnl
, offset
, data
, len
, JNL_READ
);
// write_journal_data: thin wrapper that calls do_journal_io() with
// JNL_WRITE and returns its result unchanged. `offset` is passed through
// as the in/out journal offset.
340 write_journal_data(journal
*jnl
, off_t
*offset
, void *data
, size_t len
)
342 return do_journal_io(jnl
, offset
, data
, len
, JNL_WRITE
);
// read_journal_header: reads `len` bytes into `data` from journal offset 0
// by calling do_journal_io() with JNL_READ|JNL_HEADER. A local offset
// variable is used, so no caller-visible offset is advanced.
347 read_journal_header(journal
*jnl
, void *data
, size_t len
)
349 off_t hdr_offset
= 0;
351 return do_journal_io(jnl
, &hdr_offset
, data
, len
, JNL_READ
|JNL_HEADER
);
// write_journal_header: flushes the device cache, recomputes the header
// checksum, writes jnl->header_buf at journal offset 0, and flushes again.
// On a short header write it logs and sets JOURNAL_INVALID in jnl->flags.
// NOTE(review): `num_err_prints` is function-static, so the 25-message
// budget is shared by every journal, not kept per device.
// NOTE(review): the checksum is computed over jnl->jhdr while the buffer
// written is jnl->header_buf -- presumably jhdr points into header_buf;
// verify against the journal setup code.
// NOTE(review): the return type, braces, the test on `ret`, and the return
// statements are missing from this extract.
355 write_journal_header(journal
*jnl
)
357 static int num_err_prints
= 0;
359 off_t jhdr_offset
= 0;
360 struct vfs_context context
;
362 context
.vc_proc
= current_proc();
363 context
.vc_ucred
= NOCRED
;
365 // XXXdbg note: this ioctl doesn't seem to do anything on firewire disks.
367 ret
= VNOP_IOCTL(jnl
->jdev
, DKIOCSYNCHRONIZECACHE
, NULL
, FWRITE
, &context
);
370 // Only print this error if it's a different error than the
371 // previous one, or if it's the first time for this device
372 // or if the total number of printfs is less than 25. We
373 // allow for up to 25 printfs to ensure that some make it
374 // into the on-disk syslog. Otherwise if we only printed
375 // one, it's possible it would never make it to the syslog
376 // for the root volume and that makes debugging hard.
378 if ( ret
!= jnl
->last_flush_err
379 || (jnl
->flags
& JOURNAL_FLUSHCACHE_ERR
) == 0
380 || num_err_prints
++ < 25) {
382 printf("jnl: flushing fs disk buffer returned 0x%x\n", ret
);
// Remember that a flush error was seen and which one it was.
384 jnl
->flags
|= JOURNAL_FLUSHCACHE_ERR
;
385 jnl
->last_flush_err
= ret
;
// The checksum field is zeroed first so it does not feed into its own value.
390 jnl
->jhdr
->checksum
= 0;
391 jnl
->jhdr
->checksum
= calc_checksum((char *)jnl
->jhdr
, sizeof(struct journal_header
));
392 if (do_journal_io(jnl
, &jhdr_offset
, jnl
->header_buf
, jnl
->jhdr
->jhdr_size
, JNL_WRITE
|JNL_HEADER
) != jnl
->jhdr
->jhdr_size
) {
393 printf("jnl: write_journal_header: error writing the journal header!\n");
394 jnl
->flags
|= JOURNAL_INVALID
;
398 // Have to flush after writing the journal header so that
399 // a future transaction doesn't sneak out to disk before
400 // the header does and thus overwrite data that the old
401 // journal header refers to. Saw this exact case happen
402 // on an IDE bus analyzer with Larry Barras so while it
403 // may seem obscure, it's not.
// The result of this second flush is not checked.
405 VNOP_IOCTL(jnl
->jdev
, DKIOCSYNCHRONIZECACHE
, NULL
, FWRITE
, &context
);
// free_old_stuff: detaches the journal's tr_freeme list and releases the
// transactions on it with FREE_ZONE(..., M_JNL_TR).
// NOTE(review): the lines that take old_start_lock, capture the list head
// and walk the list are missing from this extract; only the list reset,
// the unlock and the free call are visible.
413 // this is a work function used to free up transactions that
414 // completed. they can't be free'd from buffer_flushed_callback
415 // because it is called from deep within the disk driver stack
416 // and thus can't do something that would potentially cause
417 // paging. it gets called by each of the journal api entry
418 // points so stuff shouldn't hang around for too long.
421 free_old_stuff(journal
*jnl
)
423 transaction
*tr
, *next
;
427 jnl
->tr_freeme
= NULL
;
428 unlock_oldstart(jnl
);
432 FREE_ZONE(tr
, sizeof(transaction
), M_JNL_TR
);
// buffer_flushed_callback: per-buffer completion hook for a transaction.
//   bp  - the buffer that was flushed; its size is added to the count
//   arg - the owning transaction (cast to transaction *)
// Visible flow:
//   1. add the buffer size to tr->num_flushed atomically;
//   2. stop if num_flushed + num_killed has not reached total_bytes;
//   3. claim cleanup by writing the sentinel 0xfbadc0de to total_bytes
//      (a caller that already sees the sentinel unlocks and backs off);
//   4. clear the top bit of the matching jnl->old_start[] entry, panicking
//      if no entry matches tr->journal_start;
//   5. advance jnl->active_start and coalesce with jnl->completed_trs, or
//      insert tr into completed_trs sorted by journal_start;
//   6. a transaction that was merged is pushed on jnl->tr_freeme.
// NOTE(review): many lines (return type, locals, lock acquisition, returns,
// braces) are missing from this extract, so lock pairing cannot be
// verified from here.
// NOTE(review): the panic prints tr and jnl with 0x%x -- truncates where
// pointers are wider than int; confirm.
440 // This is our callback that lets us know when a buffer has been
441 // flushed to disk. It's called from deep within the driver stack
442 // and thus is quite limited in what it can do. Notably, it can
443 // not initiate any new i/o's or allocate/free memory.
446 buffer_flushed_callback(struct buf
*bp
, void *arg
)
450 transaction
*ctr
, *prev
=NULL
, *next
;
454 //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n",
455 // bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg);
457 // snarf out the bits we want
458 bufsize
= buf_size(bp
);
459 tr
= (transaction
*)arg
;
461 // then we've already seen it
466 CHECK_TRANSACTION(tr
);
469 if (jnl
->flags
& JOURNAL_INVALID
) {
475 // update the number of blocks that have been flushed.
476 // this buf may represent more than one block so take
477 // that into account.
478 OSAddAtomic(bufsize
, &tr
->num_flushed
);
481 // if this transaction isn't done yet, just return as
482 // there is nothing to do.
483 if ((tr
->num_flushed
+ tr
->num_killed
) < tr
->total_bytes
) {
487 // this will single thread checking the transaction
490 if (tr
->total_bytes
== 0xfbadc0de) {
491 // then someone beat us to it...
492 unlock_oldstart(jnl
);
496 // mark this so that we're the owner of dealing with the
497 // cleanup for this transaction
498 tr
->total_bytes
= 0xfbadc0de;
500 //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
501 // tr, tr->journal_start, tr->journal_end, jnl);
503 // find this entry in the old_start[] index and mark it completed
504 for(i
=0; i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]); i
++) {
// The top bit of an old_start[] entry is masked off for the comparison
// and cleared on a match.
506 if ((jnl
->old_start
[i
] & ~(0x8000000000000000LL
)) == tr
->journal_start
) {
507 jnl
->old_start
[i
] &= ~(0x8000000000000000LL
);
511 if (i
>= sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0])) {
512 panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr 0x%x, jnl 0x%x)\n",
513 tr
->journal_start
, tr
, jnl
);
515 unlock_oldstart(jnl
);
518 // if we are here then we need to update the journal header
519 // to reflect that this transaction is complete
520 if (tr
->journal_start
== jnl
->active_start
) {
521 jnl
->active_start
= tr
->journal_end
;
// Zeroed start/end marks tr as consumed (tested further down via
// journal_start != 0).
522 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
525 // go through the completed_trs list and try to coalesce
526 // entries, restarting back at the beginning if we have to.
527 for(ctr
=jnl
->completed_trs
; ctr
; prev
=ctr
, ctr
=next
) {
528 if (ctr
->journal_start
== jnl
->active_start
) {
529 jnl
->active_start
= ctr
->journal_end
;
531 prev
->next
= ctr
->next
;
533 if (ctr
== jnl
->completed_trs
) {
534 jnl
->completed_trs
= ctr
->next
;
538 next
= jnl
->completed_trs
; // this starts us over again
539 ctr
->next
= jnl
->tr_freeme
;
540 jnl
->tr_freeme
= ctr
;
542 unlock_oldstart(jnl
);
543 } else if (tr
->journal_end
== ctr
->journal_start
) {
544 ctr
->journal_start
= tr
->journal_start
;
545 next
= jnl
->completed_trs
; // this starts us over again
547 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
548 } else if (tr
->journal_start
== ctr
->journal_end
) {
549 ctr
->journal_end
= tr
->journal_end
;
551 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
557 // if this is true then we didn't merge with anyone
558 // so link ourselves in at the head of the completed
560 if (tr
->journal_start
!= 0) {
561 // put this entry into the correct sorted place
562 // in the list instead of just at the head.
566 for(ctr
=jnl
->completed_trs
; ctr
&& tr
->journal_start
> ctr
->journal_start
; prev
=ctr
, ctr
=ctr
->next
) {
570 if (ctr
== NULL
&& prev
== NULL
) {
571 jnl
->completed_trs
= tr
;
573 } else if (ctr
== jnl
->completed_trs
) {
574 tr
->next
= jnl
->completed_trs
;
575 jnl
->completed_trs
= tr
;
577 tr
->next
= prev
->next
;
581 // if we're here this tr got merged with someone else so
582 // put it on the list to be free'd
584 tr
->next
= jnl
->tr_freeme
;
586 unlock_oldstart(jnl
);
591 #include <libkern/OSByteOrder.h>
// Byte-swap shorthands mapping onto the OSSwapInt16/32/64 routines.
593 #define SWAP16(x) OSSwapInt16(x)
594 #define SWAP32(x) OSSwapInt32(x)
595 #define SWAP64(x) OSSwapInt64(x)
// swap_journal_header: byte-swaps the fields of jnl->jhdr in place.
// magic, endian, blhdr_size, checksum and jhdr_size are swapped as 32-bit
// values; start, end and size are swapped as 64-bit values.
599 swap_journal_header(journal
*jnl
)
601 jnl
->jhdr
->magic
= SWAP32(jnl
->jhdr
->magic
);
602 jnl
->jhdr
->endian
= SWAP32(jnl
->jhdr
->endian
);
603 jnl
->jhdr
->start
= SWAP64(jnl
->jhdr
->start
);
604 jnl
->jhdr
->end
= SWAP64(jnl
->jhdr
->end
);
605 jnl
->jhdr
->size
= SWAP64(jnl
->jhdr
->size
);
606 jnl
->jhdr
->blhdr_size
= SWAP32(jnl
->jhdr
->blhdr_size
);
607 jnl
->jhdr
->checksum
= SWAP32(jnl
->jhdr
->checksum
);
608 jnl
->jhdr
->jhdr_size
= SWAP32(jnl
->jhdr
->jhdr_size
);
// swap_block_list_header: byte-swaps a block list header in place.
//   jnl   - supplies jhdr->blhdr_size for the plausibility check
//   blhdr - header to swap
// The fixed fields are swapped first; the binfo[] entries are swapped only
// after the (already swapped) num_blocks passes a size check against
// blhdr_size.
// NOTE(review): the statement following the "looks suspicious" printf is
// missing from this extract -- presumably an early return; confirm.
612 swap_block_list_header(journal
*jnl
, block_list_header
*blhdr
)
616 blhdr
->max_blocks
= SWAP16(blhdr
->max_blocks
);
617 blhdr
->num_blocks
= SWAP16(blhdr
->num_blocks
);
618 blhdr
->bytes_used
= SWAP32(blhdr
->bytes_used
);
619 blhdr
->checksum
= SWAP32(blhdr
->checksum
);
620 blhdr
->pad
= SWAP32(blhdr
->pad
);
// Guard: the binfo[] array implied by num_blocks must fit in blhdr_size.
622 if (blhdr
->num_blocks
* sizeof(blhdr
->binfo
[0]) > jnl
->jhdr
->blhdr_size
) {
623 printf("jnl: blhdr num blocks looks suspicious (%d). not swapping.\n", blhdr
->num_blocks
);
627 for(i
=0; i
< blhdr
->num_blocks
; i
++) {
628 blhdr
->binfo
[i
].bnum
= SWAP64(blhdr
->binfo
[i
].bnum
);
629 blhdr
->binfo
[i
].bsize
= SWAP32(blhdr
->binfo
[i
].bsize
);
// NOTE(review): the pointer is cast through int and swapped as 32 bits;
// the upper bits would be lost where pointers are wider than 32 bits.
630 blhdr
->binfo
[i
].bp
= (void *)SWAP32((int)blhdr
->binfo
[i
].bp
);
// update_fs_block: writes one replayed journal block to the file system
// device.
//   jnl       - journal; jnl->fsdev is the target vnode
//   block_ptr - source data (bsize bytes)
//   fs_block  - destination block number on fsdev
//   bsize     - block size in bytes
// Visible flow: read the block with buf_meta_bread(); on the failure path
// release it and obtain a fresh buffer with buf_getblk(); check the buffer
// size; memcpy the journal data over it; write it with VNOP_BWRITE(); then
// re-read and mark the buffer invalid before releasing it.
// NOTE(review): return type, the tests on `ret`, return statements and
// braces are missing from this extract, so the exact error returns cannot
// be documented from here.
636 update_fs_block(journal
*jnl
, void *block_ptr
, off_t fs_block
, size_t bsize
)
639 struct buf
*oblock_bp
=NULL
;
641 // first read the block we want.
642 ret
= buf_meta_bread(jnl
->fsdev
, (daddr64_t
)fs_block
, bsize
, NOCRED
, &oblock_bp
);
644 printf("jnl: update_fs_block: error reading fs block # %lld! (ret %d)\n", fs_block
, ret
);
647 buf_brelse(oblock_bp
);
651 // let's try to be aggressive here and just re-write the block
652 oblock_bp
= buf_getblk(jnl
->fsdev
, (daddr64_t
)fs_block
, bsize
, 0, 0, BLK_META
);
653 if (oblock_bp
== NULL
) {
654 printf("jnl: update_fs_block: buf_getblk() for %lld failed! failing update.\n", fs_block
);
659 // make sure it's the correct size.
660 if (buf_size(oblock_bp
) != bsize
) {
661 buf_brelse(oblock_bp
);
665 // copy the journal data over top of it
666 memcpy((void *)buf_dataptr(oblock_bp
), block_ptr
, bsize
);
668 if ((ret
= VNOP_BWRITE(oblock_bp
)) != 0) {
669 printf("jnl: update_fs_block: failed to update block %lld (ret %d)\n", fs_block
,ret
);
673 // and now invalidate it so that if someone else wants to read
674 // it in a different size they'll be able to do it.
675 ret
= buf_meta_bread(jnl
->fsdev
, (daddr64_t
)fs_block
, bsize
, NOCRED
, &oblock_bp
);
677 buf_markinvalid(oblock_bp
);
678 buf_brelse(oblock_bp
);
// grow_table: replaces the bucket array at *buf_ptr with a larger one.
//   buf_ptr     - in/out pointer to the bucket array
//   num_buckets - current number of elements
//   new_size    - requested number of elements
// Visible flow: nothing is grown when new_size < num_buckets; otherwise a
// new array is MALLOC'ed (M_TEMP, M_WAITOK), the old elements are copied,
// the added elements get block_num == -1, and the old array is freed.
// NOTE(review): the return statements and the store of the new array into
// *buf_ptr are missing from this extract; insert_block() treats a result
// smaller than new_size as failure.
685 grow_table(struct bucket
**buf_ptr
, int num_buckets
, int new_size
)
687 struct bucket
*newBuf
;
688 int current_size
= num_buckets
, i
;
690 // return if newsize is less than the current size
691 if (new_size
< num_buckets
) {
695 if ((MALLOC(newBuf
, struct bucket
*, new_size
*sizeof(struct bucket
), M_TEMP
, M_WAITOK
)) == NULL
) {
696 printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
700 // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
702 // copy existing elements
703 bcopy(*buf_ptr
, newBuf
, num_buckets
*sizeof(struct bucket
));
705 // initialize the new ones
706 for(i
=num_buckets
; i
< new_size
; i
++) {
707 newBuf
[i
].block_num
= (off_t
)-1;
710 // free the old container
711 FREE(*buf_ptr
, M_TEMP
);
// lookup_bucket: binary-searches the sorted bucket array for block_num.
//   buf_ptr   - pointer to the bucket array
//   block_num - block number to find
//   num_full  - number of occupied entries
// Returns 0 for an empty table (visible below). Otherwise the visible code
// looks for an existing entry, or else picks `hi` or `hi + 1` as the
// insertion index, and scans rightwards over equal block numbers.
// NOTE(review): the loop header, the updates of lo/hi and the final return
// are missing from this extract; add_block() treats a result outside
// [0, num_full] as an error.
720 lookup_bucket(struct bucket
**buf_ptr
, off_t block_num
, int num_full
)
722 int lo
, hi
, index
, matches
, i
;
725 return 0; // table is empty, so insert at index=0
732 // perform binary search for block_num
// Midpoint computed as (hi - lo)/2 + lo.
734 int mid
= (hi
- lo
)/2 + lo
;
735 off_t this_num
= (*buf_ptr
)[mid
].block_num
;
737 if (block_num
== this_num
) {
742 if (block_num
< this_num
) {
747 if (block_num
> this_num
) {
753 // check if lo and hi converged on the match
754 if (block_num
== (*buf_ptr
)[hi
].block_num
) {
758 // if no existing entry found, find index for new one
760 index
= (block_num
< (*buf_ptr
)[hi
].block_num
) ? hi
: hi
+ 1;
762 // make sure that we return the right-most index in the case of multiple matches
765 while(i
< num_full
&& block_num
== (*buf_ptr
)[i
].block_num
) {
// insert_block: stores (num, size, offset) at index blk_index of the
// bucket array, growing the array and shifting later entries as needed.
//   jnl             - journal; jhdr->size / jhdr_size are used to wrap offset
//   buf_ptr         - in/out pointer to the bucket array (may be reallocated)
//   blk_index       - slot to write
//   num/size/offset - block number, byte size and journal offset to record
//   num_buckets_ptr - in/out capacity of the array
//   num_full_ptr    - in/out number of occupied entries
//   overwriting     - non-zero when an existing entry is being replaced
// add_block() documents the result as "the index, or -1 on error".
// NOTE(review): the grow-failure message says "add_block" although it is
// emitted from insert_block.
// NOTE(review): `size` is size_t but is printed with %d in the panic.
// NOTE(review): the tests on `overwriting` and on `size`, the return
// statements and the braces are missing from this extract.
777 insert_block(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
, int overwriting
)
780 // grow the table if we're out of space
781 if (*num_full_ptr
>= *num_buckets_ptr
) {
// The capacity is doubled on each growth.
782 int new_size
= *num_buckets_ptr
* 2;
783 int grow_size
= grow_table(buf_ptr
, *num_buckets_ptr
, new_size
);
785 if (grow_size
< new_size
) {
786 printf("jnl: add_block: grow_table returned an error!\n");
790 *num_buckets_ptr
= grow_size
; //update num_buckets to reflect the new size
793 // if we're not inserting at the end, we need to bcopy
794 if (blk_index
!= *num_full_ptr
) {
// Source and destination overlap (entries shift right by one slot).
795 bcopy( (*buf_ptr
)+(blk_index
), (*buf_ptr
)+(blk_index
+1), (*num_full_ptr
-blk_index
)*sizeof(struct bucket
) );
798 (*num_full_ptr
)++; // increment only if we're not overwriting
801 // sanity check the values we're about to add
// An offset at or past the end of the journal wraps to just after the
// journal header.
802 if (offset
>= jnl
->jhdr
->size
) {
803 offset
= jnl
->jhdr
->jhdr_size
+ (offset
- jnl
->jhdr
->size
);
806 panic("jnl: insert_block: bad size in insert_block (%d)\n", size
);
809 (*buf_ptr
)[blk_index
].block_num
= num
;
810 (*buf_ptr
)[blk_index
].block_size
= size
;
811 (*buf_ptr
)[blk_index
].jnl_offset
= offset
;
// do_overlap: reconciles a new block (block_num, size, offset) with the
// entries around blk_index before it is inserted into the bucket array.
//   jnl             - journal; jhdr->jhdr_size is the unit for block numbers
//   buf_ptr         - in/out pointer to the bucket array
//   blk_index       - index returned by lookup_bucket() for block_num
//   num_buckets_ptr - in/out capacity (passed on to insert_block)
//   num_full_ptr    - in/out number of occupied entries
// Visible return values: 1 for a simple overwrite of the entry at
// blk_index, 0 when the caller must insert a new entry. add_block() also
// checks for a negative (error) result.
// Byte ranges are derived as block_num * jhdr_size .. + size.
// NOTE(review): the panics print size_t / off_t values with %d in places.
// NOTE(review): several lines (return type, initialisation of `index` and
// `num_to_remove`, the decrements inside the compaction loop, braces) are
// missing from this extract.
817 do_overlap(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t block_num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
)
819 int num_to_remove
, index
, i
, overwrite
, err
;
820 size_t jhdr_size
= jnl
->jhdr
->jhdr_size
, new_offset
;
821 off_t overlap
, block_start
, block_end
;
823 block_start
= block_num
*jhdr_size
;
824 block_end
= block_start
+ size
;
// "overwrite" means: same starting block and the new data is at least as
// large as the existing entry.
825 overwrite
= (block_num
== (*buf_ptr
)[blk_index
].block_num
&& size
>= (*buf_ptr
)[blk_index
].block_size
);
827 // first, eliminate any overlap with the previous entry
828 if (blk_index
!= 0 && !overwrite
) {
829 off_t prev_block_start
= (*buf_ptr
)[blk_index
-1].block_num
*jhdr_size
;
830 off_t prev_block_end
= prev_block_start
+ (*buf_ptr
)[blk_index
-1].block_size
;
831 overlap
= prev_block_end
- block_start
;
833 if (overlap
% jhdr_size
!= 0) {
834 panic("jnl: do_overlap: overlap with previous entry not a multiple of %d\n", jhdr_size
);
837 // if the previous entry completely overlaps this one, we need to break it into two pieces.
838 if (prev_block_end
> block_end
) {
839 off_t new_num
= block_end
/ jhdr_size
;
840 size_t new_size
= prev_block_end
- block_end
;
842 new_offset
= (*buf_ptr
)[blk_index
-1].jnl_offset
+ (block_end
- prev_block_start
);
844 err
= insert_block(jnl
, buf_ptr
, blk_index
, new_num
, new_size
, new_offset
, num_buckets_ptr
, num_full_ptr
, 0);
846 panic("jnl: do_overlap: error inserting during pre-overlap\n");
850 // Regardless, we need to truncate the previous entry to the beginning of the overlap
851 (*buf_ptr
)[blk_index
-1].block_size
= block_start
- prev_block_start
;
855 // then, bail out fast if there's no overlap with the entries that follow
856 if (!overwrite
&& block_end
<= (*buf_ptr
)[blk_index
].block_num
*jhdr_size
) {
857 return 0; // no overlap, no overwrite
858 } else if (overwrite
&& (blk_index
+ 1 >= *num_full_ptr
|| block_end
<= (*buf_ptr
)[blk_index
+1].block_num
*jhdr_size
)) {
859 return 1; // simple overwrite
862 // Otherwise, find all cases of total and partial overlap. We use the special
863 // block_num of -2 to designate entries that are completely overlapped and must
864 // be eliminated. The block_num, size, and jnl_offset of partially overlapped
865 // entries must be adjusted to keep the array consistent.
868 while(index
< *num_full_ptr
&& block_end
> (*buf_ptr
)[index
].block_num
*jhdr_size
) {
869 if (block_end
>= ((*buf_ptr
)[index
].block_num
*jhdr_size
+ (*buf_ptr
)[index
].block_size
)) {
870 (*buf_ptr
)[index
].block_num
= -2; // mark this for deletion
873 overlap
= block_end
- (*buf_ptr
)[index
].block_num
*jhdr_size
;
875 if (overlap
% jhdr_size
!= 0) {
876 panic("jnl: do_overlap: overlap of %lld is not multiple of %d\n", overlap
, jhdr_size
);
879 // if we partially overlap this entry, adjust its block number, jnl offset, and size
880 (*buf_ptr
)[index
].block_num
+= (overlap
/ jhdr_size
); // make sure overlap is multiple of jhdr_size, or round up
882 new_offset
= (*buf_ptr
)[index
].jnl_offset
+ overlap
; // check for wrap-around
883 if (new_offset
>= jnl
->jhdr
->size
) {
884 new_offset
= jhdr_size
+ (new_offset
- jnl
->jhdr
->size
);
886 (*buf_ptr
)[index
].jnl_offset
= new_offset
;
888 (*buf_ptr
)[index
].block_size
-= overlap
; // sanity check for negative value
889 if ((*buf_ptr
)[index
].block_size
<= 0) {
890 panic("jnl: do_overlap: after overlap, new block size is invalid (%d)\n", (*buf_ptr
)[index
].block_size
);
891 // return -1; // if above panic is removed, return -1 for error
900 // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
901 index
--; // start with the last index used within the above loop
902 while(index
>= blk_index
) {
903 if ((*buf_ptr
)[index
].block_num
== -2) {
904 if (index
== *num_full_ptr
-1) {
905 (*buf_ptr
)[index
].block_num
= -1; // it's the last item in the table... just mark as free
907 bcopy( (*buf_ptr
)+(index
+1), (*buf_ptr
)+(index
), (*num_full_ptr
- (index
+ 1)) * sizeof(struct bucket
) );
914 // eliminate any stale entries at the end of the table
915 for(i
=*num_full_ptr
; i
< (*num_full_ptr
+ num_to_remove
); i
++) {
916 (*buf_ptr
)[i
].block_num
= -1;
919 return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
// add_block: records one journal block in the coalesce table.
//   jnl             - journal being replayed
//   buf_ptr         - in/out pointer to the bucket array (may be reallocated)
//   block_num       - file system block number
//   size            - size of the block in bytes
//   offset          - journal offset holding the block's data
//   num_buckets_ptr - in/out capacity of the array
//   num_full_ptr    - in/out number of occupied entries
// Steps: lookup_bucket() finds the slot, do_overlap() resolves overlaps and
// reports whether this is an overwrite, insert_block() stores the entry.
// Returns -1 when do_overlap() reports an error; replay_journal() also
// treats -1 as failure.
// NOTE(review): the statement inside the out-of-bounds check and the final
// return are missing from this extract.
922 // PR-3105942: Coalesce writes to the same block in journal replay
923 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
924 // to be replayed and the corresponding location in the journal which contains
925 // the most recent data for those blocks. The array is "played" once all the
926 // blocks in the journal have been coalesced. The code for the case of conflicting/
927 // overlapping writes to a single block is the most dense. Because coalescing can
928 // disrupt the existing time-ordering of blocks in the journal playback, care
929 // is taken to catch any overlaps and keep the array consistent.
931 add_block(journal
*jnl
, struct bucket
**buf_ptr
, off_t block_num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
)
933 int blk_index
, overwriting
;
935 // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
936 // inserted (or the index of the elem to overwrite).
937 blk_index
= lookup_bucket( buf_ptr
, block_num
, *num_full_ptr
);
939 // check if the index is within bounds (if we're adding this block to the end of
940 // the table, blk_index will be equal to num_full)
941 if (blk_index
< 0 || blk_index
> *num_full_ptr
) {
942 //printf("jnl: add_block: trouble adding block to co_buf\n");
944 } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
946 // Determine whether we're overwriting an existing entry by checking for overlap
947 overwriting
= do_overlap(jnl
, buf_ptr
, blk_index
, block_num
, size
, offset
, num_buckets_ptr
, num_full_ptr
);
948 if (overwriting
< 0) {
949 return -1; // if we got an error, pass it along
952 // returns the index, or -1 on error
953 blk_index
= insert_block(jnl
, buf_ptr
, blk_index
, block_num
, size
, offset
, num_buckets_ptr
, num_full_ptr
, overwriting
);
// replay_journal: replays the transactions between jhdr->start and
// jhdr->end onto the file system device.
// Visible phases:
//   1. normalise start/end when they sit exactly at jhdr->size;
//   2. allocate a blhdr_size buffer and a coalesce table of
//      STARTING_BUCKETS entries, all marked free (block_num == -1);
//   3. walk the journal: read each block list header, verify its checksum
//      over the first BLHDR_CHECKSUM_SIZE bytes (computed before swapping
//      when JOURNAL_NEED_SWAP is set), sanity-check max_blocks/num_blocks
//      and block numbers, and add_block() every entry from index 1 on
//      that is not killed (bnum == -1); then advance jhdr->start by
//      bytes_used, wrapping past the journal header;
//   4. size a data buffer to the largest coalesced block (at least
//      PAGE_SIZE, rounded up to a PAGE_SIZE multiple);
//   5. for each table entry read the data from the journal and write it
//      with update_fs_block();
//   6. write the updated journal header and free the three buffers.
// The two groups of frees at the end look like separate success and
// failure exits.
// NOTE(review): return statements, goto/labels, braces and several
// declarations are missing from this extract, so the exit paths cannot be
// documented precisely.
// NOTE(review): the "Could not read journal entry data" message prints
// `offset`, while the failed read used `jnl_offset` -- confirm which
// value was intended.
959 replay_journal(journal
*jnl
)
961 int i
, ret
, orig_checksum
, checksum
, max_bsize
;
962 block_list_header
*blhdr
;
964 char *buff
, *block_ptr
=NULL
;
965 struct bucket
*co_buf
;
966 int num_buckets
= STARTING_BUCKETS
, num_full
;
968 // wrap the start ptr if it points to the very end of the journal
969 if (jnl
->jhdr
->start
== jnl
->jhdr
->size
) {
970 jnl
->jhdr
->start
= jnl
->jhdr
->jhdr_size
;
972 if (jnl
->jhdr
->end
== jnl
->jhdr
->size
) {
973 jnl
->jhdr
->end
= jnl
->jhdr
->jhdr_size
;
// start == end means there are no transactions to replay.
976 if (jnl
->jhdr
->start
== jnl
->jhdr
->end
) {
980 // allocate memory for the header_block. we'll read each blhdr into this
981 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&buff
, jnl
->jhdr
->blhdr_size
)) {
982 printf("jnl: replay_journal: no memory for block buffer! (%d bytes)\n",
983 jnl
->jhdr
->blhdr_size
);
987 // allocate memory for the coalesce buffer
988 if ((MALLOC(co_buf
, struct bucket
*, num_buckets
*sizeof(struct bucket
), M_TEMP
, M_WAITOK
)) == NULL
) {
989 printf("jnl: replay_journal: no memory for coalesce buffer!\n");
993 // initialize entries
994 for(i
=0; i
< num_buckets
; i
++) {
995 co_buf
[i
].block_num
= -1;
997 num_full
= 0; // empty at first
1000 printf("jnl: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
1001 jnl
->jhdr
->start
, jnl
->jhdr
->end
, jnl
->jdev_offset
);
1003 while(jnl
->jhdr
->start
!= jnl
->jhdr
->end
) {
1004 offset
= jnl
->jhdr
->start
;
1005 ret
= read_journal_data(jnl
, &offset
, buff
, jnl
->jhdr
->blhdr_size
);
1006 if (ret
!= jnl
->jhdr
->blhdr_size
) {
1007 printf("jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset
);
1011 blhdr
= (block_list_header
*)buff
;
// The stored checksum is saved and the field zeroed before recomputing.
1013 orig_checksum
= blhdr
->checksum
;
1014 blhdr
->checksum
= 0;
1015 if (jnl
->flags
& JOURNAL_NEED_SWAP
) {
1016 // calculate the checksum based on the unswapped data
1017 // because it is done byte-at-a-time.
1018 orig_checksum
= SWAP32(orig_checksum
);
1019 checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
1020 swap_block_list_header(jnl
, blhdr
);
1022 checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
1024 if (checksum
!= orig_checksum
) {
1025 printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
1026 offset
, orig_checksum
, checksum
);
1029 if ( blhdr
->max_blocks
<= 0 || blhdr
->max_blocks
> 2048
1030 || blhdr
->num_blocks
<= 0 || blhdr
->num_blocks
> blhdr
->max_blocks
) {
1031 printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
1032 blhdr
->max_blocks
, blhdr
->num_blocks
);
// Both loops below start at index 1, skipping binfo[0].
1036 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
1037 if (blhdr
->binfo
[i
].bnum
< 0 && blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
1038 printf("jnl: replay_journal: bogus block number 0x%llx\n", blhdr
->binfo
[i
].bnum
);
1043 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1044 // blhdr->num_blocks-1, jnl->jhdr->start);
1045 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
1049 size
= blhdr
->binfo
[i
].bsize
;
1050 number
= blhdr
->binfo
[i
].bnum
;
1052 // don't add "killed" blocks
1053 if (number
== (off_t
)-1) {
1054 //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1056 // add this bucket to co_buf, coalescing where possible
1057 // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1058 ret_val
= add_block(jnl
, &co_buf
, number
, size
, (size_t) offset
, &num_buckets
, &num_full
);
1060 if (ret_val
== -1) {
1061 printf("jnl: replay_journal: trouble adding block to co_buf\n");
1063 } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1069 // check if the last block added puts us off the end of the jnl.
1070 // if so, we need to wrap to the beginning and take any remainder
1073 if (offset
>= jnl
->jhdr
->size
) {
1074 offset
= jnl
->jhdr
->jhdr_size
+ (offset
- jnl
->jhdr
->size
);
1079 jnl
->jhdr
->start
+= blhdr
->bytes_used
;
1080 if (jnl
->jhdr
->start
>= jnl
->jhdr
->size
) {
1081 // wrap around and skip the journal header block
1082 jnl
->jhdr
->start
= (jnl
->jhdr
->start
% jnl
->jhdr
->size
) + jnl
->jhdr
->jhdr_size
;
1087 //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1090 * make sure it's at least one page in size, so
1091 * start max_bsize at PAGE_SIZE
1093 for (i
= 0, max_bsize
= PAGE_SIZE
; i
< num_full
; i
++) {
1095 if (co_buf
[i
].block_num
== (off_t
)-1)
1098 if (co_buf
[i
].block_size
> max_bsize
)
1099 max_bsize
= co_buf
[i
].block_size
;
1102 * round max_bsize up to the nearest PAGE_SIZE multiple
1104 if (max_bsize
& (PAGE_SIZE
- 1)) {
1105 max_bsize
= (max_bsize
+ PAGE_SIZE
) & ~(PAGE_SIZE
- 1);
1108 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&block_ptr
, max_bsize
)) {
1112 // Replay the coalesced entries in the co-buf
1113 for(i
=0; i
< num_full
; i
++) {
1114 size_t size
= co_buf
[i
].block_size
;
1115 off_t jnl_offset
= (off_t
) co_buf
[i
].jnl_offset
;
1116 off_t number
= co_buf
[i
].block_num
;
1119 // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1120 // co_buf[i].block_size, co_buf[i].jnl_offset);
1122 if (number
== (off_t
)-1) {
1123 // printf("jnl: replay_journal: skipping killed fs block\n");
1126 // do journal read, and set the phys. block
1127 ret
= read_journal_data(jnl
, &jnl_offset
, block_ptr
, size
);
1129 printf("jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset
);
1133 if (update_fs_block(jnl
, block_ptr
, number
, size
) != 0) {
1140 // done replaying; update jnl header
1141 if (write_journal_header(jnl
) != 0) {
1146 kmem_free(kernel_map
, (vm_offset_t
)block_ptr
, max_bsize
);
1149 // free the coalesce buffer
1150 FREE(co_buf
, M_TEMP
);
1153 kmem_free(kernel_map
, (vm_offset_t
)buff
, jnl
->jhdr
->blhdr_size
);
1158 kmem_free(kernel_map
, (vm_offset_t
)block_ptr
, max_bsize
);
1161 FREE(co_buf
, M_TEMP
);
1163 kmem_free(kernel_map
, (vm_offset_t
)buff
, jnl
->jhdr
->blhdr_size
);
// Base size, in bytes, of a transaction buffer.  size_up_tbuffer() multiplies
// this by 1 to 4 depending on mem_size to obtain the default tbuffer size.
#define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
//#define DEFAULT_TRANSACTION_BUFFER_SIZE  (256*1024)  // better performance but uses more mem
// Upper bound, in bytes, that size_up_tbuffer() clamps jnl->tbuffer_size to.
#define MAX_TRANSACTION_BUFFER_SIZE      (512*1024)

// XXXdbg - so I can change it in the debugger
// Zero means "not computed yet"; size_up_tbuffer() fills it in on first use.
int def_tbuffer_size = 0;
1178 // This function sets the size of the tbuffer and the
1179 // size of the blhdr. It assumes that jnl->jhdr->size
1180 // and jnl->jhdr->jhdr_size are already valid.
1183 size_up_tbuffer(journal
*jnl
, int tbuffer_size
, int phys_blksz
)
1186 // one-time initialization based on how much memory
1187 // there is in the machine.
1189 if (def_tbuffer_size
== 0) {
1190 if (mem_size
< (256*1024*1024)) {
1191 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
;
1192 } else if (mem_size
< (512*1024*1024)) {
1193 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 2;
1194 } else if (mem_size
< (1024*1024*1024)) {
1195 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 3;
1196 } else if (mem_size
>= (1024*1024*1024)) {
1197 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 4;
1201 // size up the transaction buffer... can't be larger than the number
1202 // of blocks that can fit in a block_list_header block.
1203 if (tbuffer_size
== 0) {
1204 jnl
->tbuffer_size
= def_tbuffer_size
;
1206 // make sure that the specified tbuffer_size isn't too small
1207 if (tbuffer_size
< jnl
->jhdr
->blhdr_size
* 2) {
1208 tbuffer_size
= jnl
->jhdr
->blhdr_size
* 2;
1210 // and make sure it's an even multiple of the block size
1211 if ((tbuffer_size
% jnl
->jhdr
->jhdr_size
) != 0) {
1212 tbuffer_size
-= (tbuffer_size
% jnl
->jhdr
->jhdr_size
);
1215 jnl
->tbuffer_size
= tbuffer_size
;
1218 if (jnl
->tbuffer_size
> (jnl
->jhdr
->size
/ 2)) {
1219 jnl
->tbuffer_size
= (jnl
->jhdr
->size
/ 2);
1222 if (jnl
->tbuffer_size
> MAX_TRANSACTION_BUFFER_SIZE
) {
1223 jnl
->tbuffer_size
= MAX_TRANSACTION_BUFFER_SIZE
;
1226 jnl
->jhdr
->blhdr_size
= (jnl
->tbuffer_size
/ jnl
->jhdr
->jhdr_size
) * sizeof(block_info
);
1227 if (jnl
->jhdr
->blhdr_size
< phys_blksz
) {
1228 jnl
->jhdr
->blhdr_size
= phys_blksz
;
1229 } else if ((jnl
->jhdr
->blhdr_size
% phys_blksz
) != 0) {
1230 // have to round up so we're an even multiple of the physical block size
1231 jnl
->jhdr
->blhdr_size
= (jnl
->jhdr
->blhdr_size
+ (phys_blksz
- 1)) & ~(phys_blksz
- 1);
1238 journal_create(struct vnode
*jvp
,
1242 size_t min_fs_blksz
,
1244 int32_t tbuffer_size
,
1245 void (*flush
)(void *arg
),
1250 struct vfs_context context
;
1252 context
.vc_proc
= current_proc();
1253 context
.vc_ucred
= FSCRED
;
1255 /* Get the real physical block size. */
1256 if (VNOP_IOCTL(jvp
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
, 0, &context
)) {
1260 if (phys_blksz
> min_fs_blksz
) {
1261 printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
1262 phys_blksz
, min_fs_blksz
);
1266 if ((journal_size
% phys_blksz
) != 0) {
1267 printf("jnl: create: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1268 journal_size
, phys_blksz
);
1272 MALLOC_ZONE(jnl
, struct journal
*, sizeof(struct journal
), M_JNL_JNL
, M_WAITOK
);
1273 memset(jnl
, 0, sizeof(*jnl
));
1276 jnl
->jdev_offset
= offset
;
1279 jnl
->flush_arg
= arg
;
1280 jnl
->flags
= (flags
& JOURNAL_OPTION_FLAGS_MASK
);
1281 lck_mtx_init(&jnl
->old_start_lock
, jnl_mutex_group
, jnl_lock_attr
);
1283 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&jnl
->header_buf
, phys_blksz
)) {
1284 printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz
);
1285 goto bad_kmem_alloc
;
1288 memset(jnl
->header_buf
, 0, phys_blksz
);
1290 jnl
->jhdr
= (journal_header
*)jnl
->header_buf
;
1291 jnl
->jhdr
->magic
= JOURNAL_HEADER_MAGIC
;
1292 jnl
->jhdr
->endian
= ENDIAN_MAGIC
;
1293 jnl
->jhdr
->start
= phys_blksz
; // start at block #1, block #0 is for the jhdr itself
1294 jnl
->jhdr
->end
= phys_blksz
;
1295 jnl
->jhdr
->size
= journal_size
;
1296 jnl
->jhdr
->jhdr_size
= phys_blksz
;
1297 size_up_tbuffer(jnl
, tbuffer_size
, phys_blksz
);
1299 jnl
->active_start
= jnl
->jhdr
->start
;
1301 // XXXdbg - for testing you can force the journal to wrap around
1302 // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
1303 // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);
1305 lck_mtx_init(&jnl
->jlock
, jnl_mutex_group
, jnl_lock_attr
);
1307 if (write_journal_header(jnl
) != 0) {
1308 printf("jnl: journal_create: failed to write journal header.\n");
1316 kmem_free(kernel_map
, (vm_offset_t
)jnl
->header_buf
, phys_blksz
);
1319 FREE_ZONE(jnl
, sizeof(struct journal
), M_JNL_JNL
);
1325 journal_open(struct vnode
*jvp
,
1329 size_t min_fs_blksz
,
1331 int32_t tbuffer_size
,
1332 void (*flush
)(void *arg
),
1336 int orig_blksz
=0, phys_blksz
;
1337 int orig_checksum
, checksum
;
1338 struct vfs_context context
;
1340 context
.vc_proc
= current_proc();
1341 context
.vc_ucred
= FSCRED
;
1343 /* Get the real physical block size. */
1344 if (VNOP_IOCTL(jvp
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
, 0, &context
)) {
1348 if (phys_blksz
> min_fs_blksz
) {
1349 printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
1350 phys_blksz
, min_fs_blksz
);
1354 if ((journal_size
% phys_blksz
) != 0) {
1355 printf("jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1356 journal_size
, phys_blksz
);
1360 MALLOC_ZONE(jnl
, struct journal
*, sizeof(struct journal
), M_JNL_JNL
, M_WAITOK
);
1361 memset(jnl
, 0, sizeof(*jnl
));
1364 jnl
->jdev_offset
= offset
;
1367 jnl
->flush_arg
= arg
;
1368 jnl
->flags
= (flags
& JOURNAL_OPTION_FLAGS_MASK
);
1369 lck_mtx_init(&jnl
->old_start_lock
, jnl_mutex_group
, jnl_lock_attr
);
1371 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&jnl
->header_buf
, phys_blksz
)) {
1372 printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz
);
1373 goto bad_kmem_alloc
;
1376 jnl
->jhdr
= (journal_header
*)jnl
->header_buf
;
1377 memset(jnl
->jhdr
, 0, sizeof(journal_header
)+4);
1379 // we have to set this up here so that do_journal_io() will work
1380 jnl
->jhdr
->jhdr_size
= phys_blksz
;
1382 if (read_journal_header(jnl
, jnl
->jhdr
, phys_blksz
) != phys_blksz
) {
1383 printf("jnl: open: could not read %d bytes for the journal header.\n",
1388 orig_checksum
= jnl
->jhdr
->checksum
;
1389 jnl
->jhdr
->checksum
= 0;
1391 if (jnl
->jhdr
->magic
== SWAP32(JOURNAL_HEADER_MAGIC
)) {
1392 // do this before the swap since it's done byte-at-a-time
1393 orig_checksum
= SWAP32(orig_checksum
);
1394 checksum
= calc_checksum((char *)jnl
->jhdr
, sizeof(struct journal_header
));
1395 swap_journal_header(jnl
);
1396 jnl
->flags
|= JOURNAL_NEED_SWAP
;
1398 checksum
= calc_checksum((char *)jnl
->jhdr
, sizeof(struct journal_header
));
1401 if (jnl
->jhdr
->magic
!= JOURNAL_HEADER_MAGIC
&& jnl
->jhdr
->magic
!= OLD_JOURNAL_HEADER_MAGIC
) {
1402 printf("jnl: open: journal magic is bad (0x%x != 0x%x)\n",
1403 jnl
->jhdr
->magic
, JOURNAL_HEADER_MAGIC
);
1407 // only check if we're the current journal header magic value
1408 if (jnl
->jhdr
->magic
== JOURNAL_HEADER_MAGIC
) {
1410 if (orig_checksum
!= checksum
) {
1411 printf("jnl: open: journal checksum is bad (0x%x != 0x%x)\n",
1412 orig_checksum
, checksum
);
1418 // XXXdbg - convert old style magic numbers to the new one
1419 if (jnl
->jhdr
->magic
== OLD_JOURNAL_HEADER_MAGIC
) {
1420 jnl
->jhdr
->magic
= JOURNAL_HEADER_MAGIC
;
1423 if (phys_blksz
!= jnl
->jhdr
->jhdr_size
&& jnl
->jhdr
->jhdr_size
!= 0) {
1424 printf("jnl: open: phys_blksz %d does not match journal header size %d\n",
1425 phys_blksz
, jnl
->jhdr
->jhdr_size
);
1427 orig_blksz
= phys_blksz
;
1428 phys_blksz
= jnl
->jhdr
->jhdr_size
;
1429 if (VNOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&phys_blksz
, FWRITE
, &context
)) {
1430 printf("jnl: could not set block size to %d bytes.\n", phys_blksz
);
1433 // goto bad_journal;
1436 if ( jnl
->jhdr
->start
<= 0
1437 || jnl
->jhdr
->start
> jnl
->jhdr
->size
1438 || jnl
->jhdr
->start
> 1024*1024*1024) {
1439 printf("jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
1440 jnl
->jhdr
->start
, jnl
->jhdr
->size
);
1444 if ( jnl
->jhdr
->end
<= 0
1445 || jnl
->jhdr
->end
> jnl
->jhdr
->size
1446 || jnl
->jhdr
->end
> 1024*1024*1024) {
1447 printf("jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
1448 jnl
->jhdr
->end
, jnl
->jhdr
->size
);
1452 if (jnl
->jhdr
->size
> 1024*1024*1024) {
1453 printf("jnl: open: jhdr size looks bad (0x%llx)\n", jnl
->jhdr
->size
);
1457 // XXXdbg - can't do these checks because hfs writes all kinds of
1458 // non-uniform sized blocks even on devices that have a block size
1459 // that is larger than 512 bytes (i.e. optical media w/2k blocks).
1460 // therefore these checks will fail and so we just have to punt and
1461 // do more relaxed checking...
1462 // XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
1463 if ((jnl
->jhdr
->start
% 512) != 0) {
1464 printf("jnl: open: journal start (0x%llx) not a multiple of 512?\n",
1469 //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
1470 if ((jnl
->jhdr
->end
% 512) != 0) {
1471 printf("jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
1472 jnl
->jhdr
->end
, jnl
->jhdr
->jhdr_size
);
1476 // take care of replaying the journal if necessary
1477 if (flags
& JOURNAL_RESET
) {
1478 printf("jnl: journal start/end pointers reset! (jnl 0x%x; s 0x%llx e 0x%llx)\n",
1479 jnl
, jnl
->jhdr
->start
, jnl
->jhdr
->end
);
1480 jnl
->jhdr
->start
= jnl
->jhdr
->end
;
1481 } else if (replay_journal(jnl
) != 0) {
1482 printf("jnl: journal_open: Error replaying the journal!\n");
1486 if (orig_blksz
!= 0) {
1487 VNOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&orig_blksz
, FWRITE
, &context
);
1488 phys_blksz
= orig_blksz
;
1489 if (orig_blksz
< jnl
->jhdr
->jhdr_size
) {
1490 printf("jnl: open: jhdr_size is %d but orig phys blk size is %d. switching.\n",
1491 jnl
->jhdr
->jhdr_size
, orig_blksz
);
1493 jnl
->jhdr
->jhdr_size
= orig_blksz
;
1497 // make sure this is in sync!
1498 jnl
->active_start
= jnl
->jhdr
->start
;
1500 // set this now, after we've replayed the journal
1501 size_up_tbuffer(jnl
, tbuffer_size
, phys_blksz
);
1503 lck_mtx_init(&jnl
->jlock
, jnl_mutex_group
, jnl_lock_attr
);
1508 if (orig_blksz
!= 0) {
1509 phys_blksz
= orig_blksz
;
1510 VNOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&orig_blksz
, FWRITE
, &context
);
1512 kmem_free(kernel_map
, (vm_offset_t
)jnl
->header_buf
, phys_blksz
);
1514 FREE_ZONE(jnl
, sizeof(struct journal
), M_JNL_JNL
);
//
// Check whether the journal on device `jvp` is clean without opening
// or replaying it.  Uses a journal structure on the stack and a
// temporary header buffer that is freed before returning.
//
//   jvp               - vnode of the device holding the journal
//   offset            - stored as the journal's jdev_offset
//   journal_size      - journal size in bytes; must be a multiple of the
//                       device's physical block size
//   fsvp              - vnode of the file system device
//   min_fs_block_size - smallest fs block size; the physical block size
//                       may not exceed it
//
// Returns 0 when the header is valid and start == end (clean), and a
// non-zero errno value otherwise.
//
int
journal_is_clean(struct vnode *jvp,
                 off_t         offset,
                 off_t         journal_size,
                 struct vnode *fsvp,
                 size_t        min_fs_block_size)
{
    journal jnl;
    int phys_blksz, ret;
    int orig_checksum, checksum;
    struct vfs_context context;

    context.vc_proc = current_proc();
    context.vc_ucred = FSCRED;

    /* Get the real physical block size. */
    if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
        printf("jnl: is_clean: failed to get device block size.\n");
        return EINVAL;
    }

    if (phys_blksz > min_fs_block_size) {
        printf("jnl: is_clean: error: phys blksize %d bigger than min fs blksize %d\n",
               phys_blksz, min_fs_block_size);
        return EINVAL;
    }

    if ((journal_size % phys_blksz) != 0) {
        printf("jnl: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
               journal_size, phys_blksz);
        return EINVAL;
    }

    memset(&jnl, 0, sizeof(jnl));

    if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
        printf("jnl: is_clean: could not allocate space for header buffer (%d bytes)\n", phys_blksz);
        return ENOMEM;
    }

    jnl.jhdr = (journal_header *)jnl.header_buf;
    memset(jnl.jhdr, 0, sizeof(journal_header)+4);

    jnl.jdev        = jvp;
    jnl.jdev_offset = offset;
    jnl.fsdev       = fsvp;

    // we have to set this up here so that do_journal_io() will work
    jnl.jhdr->jhdr_size = phys_blksz;

    if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != phys_blksz) {
        printf("jnl: is_clean: could not read %d bytes for the journal header.\n",
               phys_blksz);
        ret = EINVAL;
        goto get_out;
    }

    // the checksum is computed with the checksum field itself zeroed
    orig_checksum = jnl.jhdr->checksum;
    jnl.jhdr->checksum = 0;

    if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
        // do this before the swap since it's done byte-at-a-time
        orig_checksum = SWAP32(orig_checksum);
        checksum = calc_checksum((char *)jnl.jhdr, sizeof(struct journal_header));
        swap_journal_header(&jnl);
        jnl.flags |= JOURNAL_NEED_SWAP;
    } else {
        checksum = calc_checksum((char *)jnl.jhdr, sizeof(struct journal_header));
    }

    if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
        printf("jnl: is_clean: journal magic is bad (0x%x != 0x%x)\n",
               jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
        ret = EINVAL;
        goto get_out;
    }

    // NOTE(review): unlike journal_open(), the checksum is compared here
    // even for OLD_JOURNAL_HEADER_MAGIC headers - confirm this is intended.
    if (orig_checksum != checksum) {
        printf("jnl: is_clean: journal checksum is bad (0x%x != 0x%x)\n", orig_checksum, checksum);
        ret = EINVAL;
        goto get_out;
    }

    //
    // if the start and end are equal then the journal is clean.
    // otherwise it's not clean and therefore an error.
    //
    if (jnl.jhdr->start == jnl.jhdr->end) {
        ret = 0;
    } else {
        ret = EINVAL;
    }

  get_out:
    // single exit point once the header buffer has been allocated
    kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);

    return ret;
}
//
// Shut down a journal and free it.  If the journal is still valid, any
// active or buffered transaction is ended and flushed and the header is
// written back; if it has been marked JOURNAL_INVALID, the outstanding
// transaction is aborted instead.  `jnl` must not be used afterwards.
//
void
journal_close(journal *jnl)
{
    volatile off_t *start, *end;
    int             counter=0;

    CHECK_JOURNAL(jnl);

    // set this before doing anything that would block so that
    // we start tearing things down properly.
    //
    jnl->flags |= JOURNAL_CLOSE_PENDING;

    // the caller may already own the journal (i.e. be inside a transaction)
    if (jnl->owner != current_thread()) {
        lock_journal(jnl);
    }

    //
    // only write stuff to disk if the journal is still valid
    //
    if ((jnl->flags & JOURNAL_INVALID) == 0) {

        if (jnl->active_tr) {
            journal_end_transaction(jnl);
        }

        // flush any buffered transactions
        if (jnl->cur_tr) {
            transaction *tr = jnl->cur_tr;

            jnl->cur_tr = NULL;
            end_transaction(tr, 1);   // force it to get flushed
        }

        //start = &jnl->jhdr->start;
        start = &jnl->active_start;
        end   = &jnl->jhdr->end;

        // Poll (at most 500 rounds, sleeping between them) until the
        // active start has caught up with the end of the journal.  The
        // pointers are volatile because the values are re-read on every
        // pass after sleeping.
        while (*start != *end && counter++ < 500) {
            printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
            if (jnl->flush) {
                jnl->flush(jnl->flush_arg);
            }
            tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 1);
        }

        if (*start != *end) {
            printf("jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
                   *start, *end);
        }

        // make sure this is in sync when we close the journal
        jnl->jhdr->start = jnl->active_start;

        // if this fails there's not much we can do at this point...
        write_journal_header(jnl);
    } else {
        // if we're here the journal isn't valid any more.
        // so make sure we don't leave any locked blocks lying around
        printf("jnl: close: journal 0x%x, is invalid. aborting outstanding transactions\n", jnl);
        if (jnl->active_tr || jnl->cur_tr) {
            transaction *tr;
            if (jnl->active_tr) {
                tr = jnl->active_tr;
                jnl->active_tr = NULL;
            } else {
                tr = jnl->cur_tr;
                jnl->cur_tr = NULL;
            }

            abort_transaction(jnl, tr);
            // only one of the two is expected to be set at a time
            if (jnl->active_tr || jnl->cur_tr) {
                panic("jnl: close: jnl @ 0x%x had both an active and cur tr\n", jnl);
            }
        }
    }

    free_old_stuff(jnl);

    kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size);
    // poison the pointer so a stale use is easy to recognize
    jnl->jhdr = (void *)0xbeefbabe;

    FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
}
//
// Debugging aid: print the journal header fields followed by the
// start/end offsets of every transaction on jnl->completed_trs.
//
static void
dump_journal(journal *jnl)
{
    transaction *ctr;

    printf("journal:");
    printf(" jdev_offset %.8llx\n", jnl->jdev_offset);
    printf(" magic: 0x%.8x\n", jnl->jhdr->magic);
    printf(" start: 0x%.8llx\n", jnl->jhdr->start);
    printf(" end: 0x%.8llx\n", jnl->jhdr->end);
    printf(" size: 0x%.8llx\n", jnl->jhdr->size);
    printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size);
    printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size);
    printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum);

    printf(" completed transactions:\n");
    // walk the singly linked list of completed transactions
    for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) {
        printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
    }
}
1731 free_space(journal
*jnl
)
1735 if (jnl
->jhdr
->start
< jnl
->jhdr
->end
) {
1736 free_space
= jnl
->jhdr
->size
- (jnl
->jhdr
->end
- jnl
->jhdr
->start
) - jnl
->jhdr
->jhdr_size
;
1737 } else if (jnl
->jhdr
->start
> jnl
->jhdr
->end
) {
1738 free_space
= jnl
->jhdr
->start
- jnl
->jhdr
->end
;
1740 // journal is completely empty
1741 free_space
= jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
;
//
// Wait until the journal has more than `desired_size` bytes free,
// advancing jnl->jhdr->start over old transactions as they complete.
//
// The journal must be locked on entry to this function.
// The "desired_size" is in bytes.
//
// Returns 0 once there is enough room.
//
static int
check_free_space(journal *jnl, int desired_size)
{
    int    i, counter=0;

    //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
    //       desired_size, free_space(jnl));

    while (1) {
        int old_start_empty;

        // give up loudly if flushing has made no progress after many rounds
        if (counter++ == 5000) {
            dump_journal(jnl);
            panic("jnl: check_free_space: buffer flushing isn't working "
                  "(jnl @ 0x%x s %lld e %lld f %lld [active start %lld]).\n", jnl,
                  jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
        }
        // NOTE(review): counter hits the panic above at 5000, so this
        // branch looks unreachable unless panic() returns - confirm.
        if (counter > 7500) {
            printf("jnl: check_free_space: giving up waiting for free space.\n");
            return ENOSPC;
        }

        // make sure there's space in the journal to hold this transaction
        if (free_space(jnl) > desired_size) {
            break;
        }

        //
        // here's where we lazily bump up jnl->jhdr->start. we'll consume
        // entries until there is enough space for the next transaction.
        //
        old_start_empty = 1;
        lock_oldstart(jnl);
        for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
            int   counter;   // per-entry retry count; shadows the outer counter

            counter = 0;
            // the top bit marks an entry whose transaction is still in
            // flight; drop the old_start lock while waiting for it
            while (jnl->old_start[i] & 0x8000000000000000LL) {
                if (counter++ > 100) {
                    panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl 0x%x).\n",
                          jnl->old_start[i], jnl);
                }

                unlock_oldstart(jnl);
                if (jnl->flush) {
                    jnl->flush(jnl->flush_arg);
                }
                tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
                lock_oldstart(jnl);
            }

            // empty slot
            if (jnl->old_start[i] == 0) {
                continue;
            }

            // consume this entry: the journal now starts where it started
            old_start_empty   = 0;
            jnl->jhdr->start  = jnl->old_start[i];
            jnl->old_start[i] = 0;
            if (free_space(jnl) > desired_size) {
                // the header write is done without the old_start lock held
                unlock_oldstart(jnl);
                write_journal_header(jnl);
                lock_oldstart(jnl);
                break;
            }
        }
        unlock_oldstart(jnl);

        // if we bumped the start, loop and try again
        if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
            continue;
        } else if (old_start_empty) {
            //
            // if there is nothing in old_start anymore then we can
            // bump the jhdr->start to be the same as active_start
            // since it is possible there was only one very large
            // transaction in the old_start array. if we didn't do
            // this then jhdr->start would never get updated and we
            // would wind up looping until we hit the panic at the
            // start of the loop.
            //
            jnl->jhdr->start = jnl->active_start;
            write_journal_header(jnl);
            continue;
        }

        // if the file system gave us a flush function, call it to so that
        // it can flush some blocks which hopefully will cause some transactions
        // to complete and thus free up space in the journal.
        if (jnl->flush) {
            jnl->flush(jnl->flush_arg);
        }

        // wait for a while to avoid being cpu-bound (this will
        // put us to sleep for 10 milliseconds)
        tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
    }

    return 0;
}
//
// Begin a transaction on `jnl`, or nest inside the one the calling
// thread already has open.  The outermost call takes the journal lock
// and leaves jnl->active_tr set.
//
// Returns 0 on success, EINVAL if the journal is invalid, or an errno
// value if no journal space or tbuffer memory is available (in which
// case ownership and the journal lock are released again).
//
int
journal_start_transaction(journal *jnl)
{
    int ret;
    transaction *tr;

    CHECK_JOURNAL(jnl);

    if (jnl->flags & JOURNAL_INVALID) {
        return EINVAL;
    }

    // nested call from the owning thread: just bump the nesting depth
    if (jnl->owner == current_thread()) {
        if (jnl->active_tr == NULL) {
            panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_thread 0x%x\n",
                  jnl, jnl->owner, current_thread());
        }
        jnl->nested_count++;
        return 0;
    }

    lock_journal(jnl);

    // with the lock held, nobody else may own the journal
    if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
        panic("jnl: start_tr: owner 0x%x, nested count 0x%x, active_tr 0x%x jnl @ 0x%x\n",
              jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
    }

    jnl->owner        = current_thread();
    jnl->nested_count = 1;

    free_old_stuff(jnl);

    // make sure there's room in the journal
    if (check_free_space(jnl, jnl->tbuffer_size) != 0) {
        printf("jnl: start transaction failed: no space\n");
        ret = ENOSPC;
        goto bad_start;
    }

    // if there's a buffered transaction, use it.
    if (jnl->cur_tr) {
        jnl->active_tr = jnl->cur_tr;
        jnl->cur_tr    = NULL;

        return 0;
    }

    // otherwise build a fresh transaction with one tbuffer
    MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
    memset(tr, 0, sizeof(transaction));

    tr->tbuffer_size = jnl->tbuffer_size;

    if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
        FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
        printf("jnl: start transaction failed: no tbuffer mem\n");
        ret = ENOMEM;
        goto bad_start;
    }

    // journal replay code checksum check depends on this.
    memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);

    // the block list header sits at the front of the tbuffer
    tr->blhdr = (block_list_header *)tr->tbuffer;
    tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
    tr->blhdr->num_blocks = 1;      // accounts for this header block
    tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;

    tr->num_blhdrs  = 1;
    tr->total_bytes = jnl->jhdr->blhdr_size;
    tr->jnl         = jnl;

    jnl->active_tr    = tr;

    // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, tr);

    return 0;

  bad_start:
    // undo the ownership taken above before dropping the lock
    jnl->owner        = NULL;
    jnl->nested_count = 0;
    unlock_journal(jnl);
    return ret;
}
1940 journal_modify_block_start(journal
*jnl
, struct buf
*bp
)
1946 if (jnl
->flags
& JOURNAL_INVALID
) {
1950 // XXXdbg - for debugging I want this to be true. later it may
1951 // not be necessary.
1952 if ((buf_flags(bp
) & B_META
) == 0) {
1953 panic("jnl: modify_block_start: bp @ 0x%x is not a meta-data block! (jnl 0x%x)\n", bp
, jnl
);
1956 tr
= jnl
->active_tr
;
1957 CHECK_TRANSACTION(tr
);
1959 if (jnl
->owner
!= current_thread()) {
1960 panic("jnl: modify_block_start: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1961 jnl
, jnl
->owner
, current_thread());
1964 free_old_stuff(jnl
);
1966 //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
1967 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
1969 // can't allow blocks that aren't an even multiple of the
1970 // underlying block size.
1971 if ((buf_size(bp
) % jnl
->jhdr
->jhdr_size
) != 0) {
1972 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
1973 buf_size(bp
), jnl
->jhdr
->jhdr_size
);
1977 // make sure that this transaction isn't bigger than the whole journal
1978 if (tr
->total_bytes
+buf_size(bp
) >= (jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
)) {
1979 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr 0x%x bp 0x%x)\n",
1980 tr
->total_bytes
, (tr
->jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
), buf_size(bp
), tr
, bp
);
1984 // if the block is dirty and not already locked we have to write
1985 // it out before we muck with it because it has data that belongs
1986 // (presumably) to another transaction.
1988 if ((buf_flags(bp
) & (B_DELWRI
| B_LOCKED
)) == B_DELWRI
) {
1990 if (buf_flags(bp
) & B_ASYNC
) {
1991 panic("modify_block_start: bp @ 0x% has async flag set!\n", bp
);
1994 // this will cause it to not be buf_brelse()'d
1995 buf_setflags(bp
, B_NORELSE
);
1998 buf_setflags(bp
, B_LOCKED
);
2004 journal_modify_block_abort(journal
*jnl
, struct buf
*bp
)
2007 block_list_header
*blhdr
;
2012 tr
= jnl
->active_tr
;
2015 // if there's no active transaction then we just want to
2016 // call buf_brelse() and return since this is just a block
2017 // that happened to be modified as part of another tr.
2024 if (jnl
->flags
& JOURNAL_INVALID
) {
2028 CHECK_TRANSACTION(tr
);
2030 if (jnl
->owner
!= current_thread()) {
2031 panic("jnl: modify_block_abort: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
2032 jnl
, jnl
->owner
, current_thread());
2035 free_old_stuff(jnl
);
2037 // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
2039 // first check if it's already part of this transaction
2040 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
2041 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2042 if (bp
== blhdr
->binfo
[i
].bp
) {
2043 if (buf_size(bp
) != blhdr
->binfo
[i
].bsize
) {
2044 panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
2045 bp
, buf_size(bp
), blhdr
->binfo
[i
].bsize
, jnl
);
2051 if (i
< blhdr
->num_blocks
) {
2057 // if blhdr is null, then this block has only had modify_block_start
2058 // called on it as part of the current transaction. that means that
2059 // it is ok to clear the LOCKED bit since it hasn't actually been
2060 // modified. if blhdr is non-null then modify_block_end was called
2061 // on it and so we need to keep it locked in memory.
2063 if (blhdr
== NULL
) {
2064 buf_clearflags(bp
, B_LOCKED
);
//
// Record the current contents of `bp` in the active transaction.  The
// data is copied into the transaction buffer; if the buffer is already
// part of the transaction its earlier copy is overwritten in place,
// otherwise a new block_info slot is filled in (allocating and chaining
// another tbuffer when the last one has no room).
//
// `bp` must be B_LOCKED, i.e. journal_modify_block_start() was called
// on it.  Returns 0 on success and EINVAL if the journal is invalid.
//
int
journal_modify_block_end(journal *jnl, struct buf *bp)
{
    int                i, j, tbuffer_offset;
    char              *blkptr;
    block_list_header *blhdr, *prev=NULL;
    transaction       *tr;

    CHECK_JOURNAL(jnl);

    if (jnl->flags & JOURNAL_INVALID) {
        return EINVAL;
    }

    tr = jnl->active_tr;
    CHECK_TRANSACTION(tr);

    if (jnl->owner != current_thread()) {
        panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
              jnl, jnl->owner, current_thread());
    }

    free_old_stuff(jnl);

    //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n",
    // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);

    if ((buf_flags(bp) & B_LOCKED) == 0) {
        panic("jnl: modify_block_end: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
    }

    // first check if it's already part of this transaction
    // (headers are chained through binfo[0].bnum; `prev` trails `blhdr`
    // so that it names the last header when the search runs off the end)
    for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
        // data for slot 1 starts right after the block list header
        tbuffer_offset = jnl->jhdr->blhdr_size;

        for(i=1; i < blhdr->num_blocks; i++) {
            if (bp == blhdr->binfo[i].bp) {
                if (buf_size(bp) != blhdr->binfo[i].bsize) {
                    panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
                          bp, buf_size(bp), blhdr->binfo[i].bsize, jnl);
                }
                break;
            }
            tbuffer_offset += blhdr->binfo[i].bsize;
        }

        if (i < blhdr->num_blocks) {
            break;
        }
    }

    // Not found.  If the last header still has a free slot and enough
    // bytes left, append there: i and tbuffer_offset were left by the
    // final pass of the loop above at prev's first unused slot/offset.
    if (blhdr == NULL
        && prev
        && (prev->num_blocks+1) <= prev->max_blocks
        && (prev->bytes_used+buf_size(bp)) <= tr->tbuffer_size) {
        blhdr = prev;
    } else if (blhdr == NULL) {
        block_list_header *nblhdr;

        if (prev == NULL) {
            panic("jnl: modify block end: no way man, prev == NULL?!?, jnl 0x%x, bp 0x%x\n", jnl, bp);
        }

        // we got to the end of the list, didn't find the block and there's
        // no room in the block_list_header pointed to by prev

        // we allocate another tbuffer and link it in at the end of the list
        // through prev->binfo[0].bnum. that's a skanky way to do things but
        // avoids having yet another linked list of small data structures to manage.

        if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
            panic("jnl: end_tr: no space for new block tr @ 0x%x (total bytes: %d)!\n",
                  tr, tr->total_bytes);
        }

        // journal replay code checksum check depends on this.
        memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);

        // initialize the new guy
        nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
        nblhdr->num_blocks = 1;      // accounts for this header block
        nblhdr->bytes_used = jnl->jhdr->blhdr_size;

        tr->num_blhdrs++;
        tr->total_bytes += jnl->jhdr->blhdr_size;

        // then link him in at the end
        prev->binfo[0].bnum = (off_t)((long)nblhdr);

        // and finally switch to using the new guy
        blhdr          = nblhdr;
        tbuffer_offset = jnl->jhdr->blhdr_size;
        i              = 1;
    }

    if ((i+1) > blhdr->max_blocks) {
        panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
    }

    // copy the data into the in-memory transaction buffer
    blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
    memcpy(blkptr, buf_dataptr(bp), buf_size(bp));

    // if this is true then this is a new block we haven't seen
    if (i >= blhdr->num_blocks) {
        int     bsize;
        vnode_t vp;

        // hold a reference on the buffer's vnode while the transaction
        // refers to this buffer
        vp = buf_vnode(bp);
        vnode_ref(vp);
        bsize = buf_size(bp);

        blhdr->binfo[i].bnum  = (off_t)(buf_blkno(bp));
        blhdr->binfo[i].bsize = bsize;
        blhdr->binfo[i].bp    = bp;

        blhdr->bytes_used += bsize;
        tr->total_bytes   += bsize;

        blhdr->num_blocks++;
    }
    buf_bdwrite(bp);

    return 0;
}
2200 journal_kill_block(journal
*jnl
, struct buf
*bp
)
2204 block_list_header
*blhdr
;
2209 if (jnl
->flags
& JOURNAL_INVALID
) {
2213 tr
= jnl
->active_tr
;
2214 CHECK_TRANSACTION(tr
);
2216 if (jnl
->owner
!= current_thread()) {
2217 panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
2218 jnl
, jnl
->owner
, current_thread());
2221 free_old_stuff(jnl
);
2223 bflags
= buf_flags(bp
);
2225 if ( !(bflags
& B_LOCKED
))
2226 panic("jnl: modify_block_end: called with bp not B_LOCKED");
2229 * bp must be BL_BUSY and B_LOCKED
2231 // first check if it's already part of this transaction
2232 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
2234 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2235 if (bp
== blhdr
->binfo
[i
].bp
) {
2238 buf_clearflags(bp
, B_LOCKED
);
2240 // this undoes the vnode_ref() in journal_modify_block_end()
2242 vnode_rele_ext(vp
, 0, 1);
2244 // if the block has the DELWRI and FILTER bits sets, then
2245 // things are seriously weird. if it was part of another
2246 // transaction then journal_modify_block_start() should
2247 // have force it to be written.
2249 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
2250 // panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
2252 tr
->num_killed
+= buf_size(bp
);
2254 blhdr
->binfo
[i
].bp
= NULL
;
2255 blhdr
->binfo
[i
].bnum
= (off_t
)-1;
2263 if (i
< blhdr
->num_blocks
) {
2273 journal_binfo_cmp(void *a
, void *b
)
2275 block_info
*bi_a
= (struct block_info
*)a
;
2276 block_info
*bi_b
= (struct block_info
*)b
;
2279 if (bi_a
->bp
== NULL
) {
2282 if (bi_b
->bp
== NULL
) {
2286 // don't have to worry about negative block
2287 // numbers so this is ok to do.
2289 res
= (buf_blkno(bi_a
->bp
) - buf_blkno(bi_b
->bp
));
// end_transaction: finish transaction `tr`, either by holding on to it
// for group commit or by writing it out to the journal.
//
//   tr        the transaction to finish; the journal is taken from tr->jnl.
//   force_it  journal_flush() passes 1 ("force it to get flushed"),
//             journal_end_transaction() passes 0.
//
// Visible flow: a transaction with no modified blocks, or one whose buffer
// isn't very full, is kept around (group commit).  Otherwise room is made
// with check_free_space(), the last slot of jnl->old_start[] is claimed,
// every block gets its physical block number, each block list is written
// with write_journal_data(), the journal header is rewritten, and then each
// buffer is given the buffer_flushed_callback filter and the block list
// headers are freed.  The tail of the function marks the journal
// JOURNAL_INVALID and calls abort_transaction().
//
// NOTE(review): return statements and several branch bodies are not visible
// in this copy of the file -- confirm return values against the full source.
2296 end_transaction(transaction
*tr
, int force_it
)
2301 journal
*jnl
= tr
->jnl
;
2303 block_list_header
*blhdr
=NULL
, *next
=NULL
;
2306 panic("jnl: jnl @ 0x%x already has cur_tr 0x%x, new tr: 0x%x\n",
2307 jnl
, jnl
->cur_tr
, tr
);
2310 // if there weren't any modified blocks in the transaction
2311 // just save off the transaction pointer and return.
2312 if (tr
->total_bytes
== jnl
->jhdr
->blhdr_size
) {
2317 // if our transaction buffer isn't very full, just hang
2318 // on to it and don't actually flush anything. this is
2319 // what is known as "group commit". we will flush the
2320 // transaction buffer if it's full or if we have more than
2321 // one of them so we don't start hogging too much memory.
2324 && (jnl
->flags
& JOURNAL_NO_GROUP_COMMIT
) == 0
2325 && tr
->num_blhdrs
< 3
2326 && (tr
->total_bytes
<= ((tr
->tbuffer_size
*tr
->num_blhdrs
) - tr
->tbuffer_size
/8))) {
2333 // if we're here we're going to flush the transaction buffer to disk.
2334 // make sure there is room in the journal first.
2335 check_free_space(jnl
, tr
->total_bytes
);
2337 // range check the end index
2338 if (jnl
->jhdr
->end
<= 0 || jnl
->jhdr
->end
> jnl
->jhdr
->size
) {
2339 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
2340 jnl
->jhdr
->end
, jnl
->jhdr
->size
);
2343 // this transaction starts where the current journal ends
2344 tr
->journal_start
= jnl
->jhdr
->end
;
2345 end
= jnl
->jhdr
->end
;
2348 // if the first entry in old_start[] isn't free yet, loop calling the
2349 // file system flush routine until it is (or we panic).
// (the top bit of an old_start[] entry is what marks it as not free.)
2353 while ((jnl
->old_start
[0] & 0x8000000000000000LL
) != 0) {
2355 unlock_oldstart(jnl
);
2358 jnl
->flush(jnl
->flush_arg
);
2361 // yield the cpu so others can get in to clear the lock bit
2362 (void)tsleep((void *)jnl
, PRIBIO
, "jnl-old-start-sleep", 1);
2367 panic("jnl: transaction that started at 0x%llx is not completing! jnl 0x%x\n",
2368 jnl
->old_start
[0] & (~0x8000000000000000LL
), jnl
);
2373 // slide everyone else down and put our latest guy in the last
2374 // entry in the old_start array
2376 memcpy(&jnl
->old_start
[0], &jnl
->old_start
[1], sizeof(jnl
->old_start
)-sizeof(jnl
->old_start
[0]));
2377 jnl
->old_start
[sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]) - 1] = tr
->journal_start
| 0x8000000000000000LL
;
2379 unlock_oldstart(jnl
);
2382 // for each block, make sure that the physical block # is set
2383 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=next
) {
// binfo[0] is not a data block (see the link stored in binfo[0].bnum
// below), so the scan starts at index 1.
2385 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2390 bp
= blhdr
->binfo
[i
].bp
;
2391 if (bp
== NULL
) { // only true if a block was "killed"
2392 if (blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
2393 panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ 0x%x, tr 0x%x)\n",
2394 blhdr
->binfo
[i
].bnum
, jnl
, tr
);
2399 blkno
= buf_blkno(bp
);
2400 lblkno
= buf_lblkno(bp
);
2402 if (vp
== NULL
&& lblkno
== blkno
) {
2403 printf("jnl: end_tr: bad news! bp @ 0x%x w/null vp and l/blkno = %qd/%qd. aborting the transaction (tr 0x%x jnl 0x%x).\n",
2404 bp
, lblkno
, blkno
, tr
, jnl
);
2408 // if the lblkno is the same as blkno and this bp isn't
2409 // associated with the underlying file system device then
2410 // we need to call bmap() to get the actual physical block.
2412 if ((lblkno
== blkno
) && (vp
!= jnl
->fsdev
)) {
2414 size_t contig_bytes
;
2416 if (VNOP_BLKTOOFF(vp
, lblkno
, &f_offset
)) {
2417 printf("jnl: end_tr: vnop_blktooff failed @ 0x%x, jnl 0x%x\n", bp
, jnl
);
2420 if (VNOP_BLOCKMAP(vp
, f_offset
, buf_count(bp
), &blkno
, &contig_bytes
, NULL
, 0, NULL
)) {
2421 printf("jnl: end_tr: can't blockmap the bp @ 0x%x, jnl 0x%x\n", bp
, jnl
);
2424 if ((uint32_t)contig_bytes
< buf_count(bp
)) {
2425 printf("jnl: end_tr: blk not physically contiguous on disk@ 0x%x, jnl 0x%x\n", bp
, jnl
);
2428 buf_setblkno(bp
, blkno
);
2430 // update this so we write out the correct physical block number!
2431 blhdr
->binfo
[i
].bnum
= (off_t
)(blkno
);
// binfo[0].bnum doubles as the link to the next block_list_header
2434 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
// write each block list to the journal; the checksum field is zeroed
// before the checksum is computed over the header.
2437 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
2439 amt
= blhdr
->bytes_used
;
2441 blhdr
->checksum
= 0;
2442 blhdr
->checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
2444 ret
= write_journal_data(jnl
, &end
, blhdr
, amt
);
2446 printf("jnl: end_transaction: only wrote %d of %d bytes to the journal!\n",
2453 jnl
->jhdr
->end
= end
; // update where the journal now ends
2454 tr
->journal_end
= end
; // the transaction ends here too
2455 if (tr
->journal_start
== 0 || tr
->journal_end
== 0) {
2456 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
2457 tr
->journal_start
, tr
->journal_end
);
2460 if (write_journal_header(jnl
) != 0) {
2465 // setup for looping through all the blhdr's. we null out the
2466 // tbuffer and blhdr fields so that they're not used any more.
2472 // the buffer_flushed_callback will only be called for the
2473 // real blocks that get flushed so we have to account for
2474 // the block_list_headers here.
2476 tr
->num_flushed
= tr
->num_blhdrs
* jnl
->jhdr
->blhdr_size
;
2478 // for each block, set the iodone callback and unlock it
2479 for(; blhdr
; blhdr
=next
) {
2481 // we can re-order the buf ptrs because everything is written out already
2482 qsort(&blhdr
->binfo
[1], blhdr
->num_blocks
-1, sizeof(block_info
), journal_binfo_cmp
);
2484 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2485 if (blhdr
->binfo
[i
].bp
== NULL
) {
// look the buffer up again with buf_meta_bread(); the bp that comes
// back must be the one recorded in binfo[i] (see the panic below).
2489 errno
= buf_meta_bread(buf_vnode(blhdr
->binfo
[i
].bp
),
2490 buf_lblkno(blhdr
->binfo
[i
].bp
),
2491 buf_size(blhdr
->binfo
[i
].bp
),
2494 if (errno
== 0 && bp
!= NULL
) {
2495 struct vnode
*save_vp
;
2498 if (bp
!= blhdr
->binfo
[i
].bp
) {
2499 panic("jnl: end_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
2500 bp
, blhdr
->binfo
[i
].bp
, jnl
);
2503 if ((buf_flags(bp
) & (B_LOCKED
|B_DELWRI
)) != (B_LOCKED
|B_DELWRI
)) {
2504 if (jnl
->flags
& JOURNAL_CLOSE_PENDING
) {
2505 buf_clearflags(bp
, B_LOCKED
);
2509 panic("jnl: end_tr: !!!DANGER!!! bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp
, buf_flags(bp
));
// remember the vnode now so its reference can be dropped further down
2512 save_vp
= buf_vnode(bp
);
2514 buf_setfilter(bp
, buffer_flushed_callback
, tr
, &cur_filter
, NULL
);
2517 panic("jnl: bp @ 0x%x (blkno %qd, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n",
2518 bp
, buf_blkno(bp
), save_vp
, cur_filter
, buffer_flushed_callback
);
2520 buf_clearflags(bp
, B_LOCKED
);
2522 // kicking off the write here helps performance
2524 // XXXdbg this is good for testing: buf_bdwrite(bp);
2527 // this undoes the vnode_ref() in journal_modify_block_end()
2528 vnode_rele_ext(save_vp
, 0, 1);
2530 printf("jnl: end_transaction: could not find block %Ld vp 0x%x!\n",
2531 blhdr
->binfo
[i
].bnum
, blhdr
->binfo
[i
].bp
);
2533 buf_clearflags(bp
, B_LOCKED
);
// grab the link to the next header before this one is poisoned and freed
2539 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
2541 // we can free blhdr here since we won't need it any more
2542 blhdr
->binfo
[0].bnum
= 0xdeadc0de;
2543 kmem_free(kernel_map
, (vm_offset_t
)blhdr
, tr
->tbuffer_size
);
2546 //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
2547 // tr, tr->journal_start, tr->journal_end);
// failure tail: mark the journal invalid, clear the "not free" (top) bit
// on the newest old_start[] entry, and throw the transaction away.
2552 jnl
->flags
|= JOURNAL_INVALID
;
2553 jnl
->old_start
[sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]) - 1] &= ~0x8000000000000000LL
;
2554 abort_transaction(jnl
, tr
);
// abort_transaction: throw away transaction `tr` of journal `jnl` without
// committing it.
//
// Walks every block list header of the transaction.  For each block that
// is still held, the buffer is looked up again with buf_meta_bread(),
// marked invalid and its vnode reference is dropped.  Each block list
// header is then freed, and finally the transaction itself is freed.
//
// NOTE(review): the buffer release call and the bodies of the "skip this
// entry" branches are not visible in this copy of the file.
2559 abort_transaction(journal
*jnl
, transaction
*tr
)
2563 block_list_header
*blhdr
, *next
;
2565 struct vnode
*save_vp
;
2567 // for each block list header, iterate over the blocks then
2568 // free up the memory associated with the block list.
2570 // for each block, clear the lock bit and release it.
2572 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=next
) {
2574 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2575 if (blhdr
->binfo
[i
].bp
== NULL
) {
// entries with no vnode, or whose buffer is not B_LOCKED, get
// separate treatment (that branch body is not visible here).
2578 if ( (buf_vnode(blhdr
->binfo
[i
].bp
) == NULL
) ||
2579 !(buf_flags(blhdr
->binfo
[i
].bp
) & B_LOCKED
) ) {
2583 errno
= buf_meta_bread(buf_vnode(blhdr
->binfo
[i
].bp
),
2584 buf_lblkno(blhdr
->binfo
[i
].bp
),
2585 buf_size(blhdr
->binfo
[i
].bp
),
2589 if (bp
!= blhdr
->binfo
[i
].bp
) {
2590 panic("jnl: abort_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
2591 bp
, blhdr
->binfo
[i
].bp
, jnl
);
2594 // releasing a bp marked invalid
2595 // also clears the locked and delayed state
2596 buf_markinvalid(bp
);
// save the vnode before the buffer goes away so its reference can be dropped
2597 save_vp
= buf_vnode(bp
);
2601 vnode_rele_ext(save_vp
, 0, 1);
2603 printf("jnl: abort_tr: could not find block %Ld vp 0x%x!\n",
2604 blhdr
->binfo
[i
].bnum
, blhdr
->binfo
[i
].bp
);
// binfo[0].bnum holds the link to the next block_list_header; read it
// before the header is poisoned and freed.
2611 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
2613 // we can free blhdr here since we won't need it any more
2614 blhdr
->binfo
[0].bnum
= 0xdeadc0de;
2615 kmem_free(kernel_map
, (vm_offset_t
)blhdr
, tr
->tbuffer_size
);
// poison the transaction before freeing it
2620 tr
->total_bytes
= 0xdbadc0de;
2621 FREE_ZONE(tr
, sizeof(transaction
), M_JNL_TR
);
// journal_end_transaction: close the calling thread's transaction on `jnl`.
//
// Panics if the caller is not the journal's owner or if the nesting count
// goes negative.  The nesting count is decremented first.  When the journal
// is JOURNAL_INVALID, any active transaction is aborted; otherwise the
// active transaction is detached from the journal and handed to
// end_transaction(tr, 0).
//
// NOTE(review): the return statements (including what happens while
// nested_count is still > 0) are not visible in this copy of the file.
2626 journal_end_transaction(journal
*jnl
)
2633 if ((jnl
->flags
& JOURNAL_INVALID
) && jnl
->owner
== NULL
) {
2637 if (jnl
->owner
!= current_thread()) {
2638 panic("jnl: end_tr: I'm not the owner! jnl 0x%x, owner 0x%x, curact 0x%x\n",
2639 jnl
, jnl
->owner
, current_thread());
2642 free_old_stuff(jnl
);
// one level of nesting is done
2644 jnl
->nested_count
--;
2645 if (jnl
->nested_count
> 0) {
2647 } else if (jnl
->nested_count
< 0) {
2648 panic("jnl: jnl @ 0x%x has negative nested count (%d). bad boy.\n", jnl
, jnl
->nested_count
);
// an invalid journal never commits: abort whatever is active
2651 if (jnl
->flags
& JOURNAL_INVALID
) {
2652 if (jnl
->active_tr
) {
2653 if (jnl
->cur_tr
!= NULL
) {
2654 panic("jnl: journal @ 0x%x has active tr (0x%x) and cur tr (0x%x)\n",
2655 jnl
, jnl
->active_tr
, jnl
->cur_tr
);
2658 tr
= jnl
->active_tr
;
2659 jnl
->active_tr
= NULL
;
2660 abort_transaction(jnl
, tr
);
2664 unlock_journal(jnl
);
2669 tr
= jnl
->active_tr
;
2670 CHECK_TRANSACTION(tr
);
2672 // clear this out here so that when check_free_space() calls
2673 // the FS flush function, we don't panic in journal_flush()
2674 // if the FS were to call that. note: check_free_space() is
2675 // called from end_transaction().
2677 jnl
->active_tr
= NULL
;
2678 ret
= end_transaction(tr
, 0);
2681 unlock_journal(jnl
);
// journal_flush: push any buffered transaction of `jnl` out to the journal.
//
// If no transaction is active and a buffered one (jnl->cur_tr) exists, it
// is passed to end_transaction() with force_it = 1.
//
// NOTE(review): the lock acquisition, the use of need_signal, and the
// return statements are not visible in this copy of the file -- presumably
// the journal is only locked when the caller isn't already the owner;
// confirm against the full source.
2688 journal_flush(journal
*jnl
)
2690 int need_signal
= 0;
2694 if (jnl
->flags
& JOURNAL_INVALID
) {
2698 if (jnl
->owner
!= current_thread()) {
2705 free_old_stuff(jnl
);
2707 // if we're not active, flush any buffered transactions
2708 if (jnl
->active_tr
== NULL
&& jnl
->cur_tr
) {
2709 transaction
*tr
= jnl
->cur_tr
;
2712 end_transaction(tr
, 1); // force it to get flushed
2716 unlock_journal(jnl
);
// journal_active: report whether `jnl` currently has an active transaction.
//
// Returns 1 when jnl->active_tr is set and 0 when it is NULL.  A journal
// flagged JOURNAL_INVALID is handled separately first.
// NOTE(review): the value returned for an invalid journal is not visible
// in this copy of the file.
2723 journal_active(journal
*jnl
)
2725 if (jnl
->flags
& JOURNAL_INVALID
) {
2729 return (jnl
->active_tr
== NULL
) ? 0 : 1;
2733 journal_owner(journal
*jnl
)