2 * Copyright (c) 1995-2002 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
23 * @APPLE_LICENSE_HEADER_END@
26 // This file implements a simple write-ahead journaling layer.
27 // In theory any file system can make use of it by calling these
28 // functions when the fs wants to modify meta-data blocks. See
29 // vfs_journal.h for a more detailed description of the api and
32 // Dominic Giampaolo (dbg@apple.com)
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
44 #include <sys/mount.h>
45 #include <sys/namei.h>
46 #include <sys/vnode.h>
47 #include <sys/ioctl.h>
50 #include <sys/malloc.h>
51 #include <sys/vnode.h>
52 #include <kern/thread_act.h>
54 #include <miscfs/specfs/specdev.h>
56 extern task_t kernel_task
;
68 #include <sys/types.h>
73 #include "vfs_journal.h"
76 // number of bytes to checksum in a block_list_header
77 // NOTE: this should be enough to clear out the header
78 // fields as well as the first entry of binfo[]
// replay_journal() passes this as the length to calc_checksum() when it
// verifies a block list header read back from the journal.
79 #define BLHDR_CHECKSUM_SIZE 32
// Forward declarations of file-local (static) helpers.
83 static int end_transaction(transaction
*tr
, int force_it
);
84 static void abort_transaction(journal
*jnl
, transaction
*tr
);
85 static void dump_journal(journal
*jnl
);
// CHECK_JOURNAL(jnl): sanity-check a journal and panic if anything looks
// wrong. The checks cover a null jdev or fsdev, a header magic that is not
// JOURNAL_HEADER_MAGIC, a start or end that is <= 0, greater than the
// journal size or greater than 128MB, and a journal size greater than 128MB.
88 #define CHECK_JOURNAL(jnl) \
91 panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
93 if (jnl->jdev == NULL) { \
94 panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
96 if (jnl->fsdev == NULL) { \
97 panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
99 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
100 panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
101 __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\
103 if ( jnl->jhdr->start <= 0 \
104 || jnl->jhdr->start > jnl->jhdr->size\
105 || jnl->jhdr->start > 128*1024*1024) {\
106 panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
107 __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\
109 if ( jnl->jhdr->end <= 0 \
110 || jnl->jhdr->end > jnl->jhdr->size\
111 || jnl->jhdr->end > 128*1024*1024) {\
112 panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
113 __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\
115 if (jnl->jhdr->size > 128*1024*1024) {\
116 panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
117 __FILE__, __LINE__, jnl->jhdr->size);\
// CHECK_TRANSACTION(tr): sanity-check a transaction and panic if anything
// looks wrong. The checks cover a null tr->jnl, a blhdr that does not point
// at tbuffer, a negative total_bytes, a journal_start or journal_end outside
// [0, 128MB], and a blhdr max_blocks outside [1, 2048].
121 #define CHECK_TRANSACTION(tr) \
124 panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
126 if (tr->jnl == NULL) {\
127 panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
129 if (tr->blhdr != (block_list_header *)tr->tbuffer) {\
130 panic("%s:%d: blhdr (0x%x) != tbuffer (0x%x)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\
132 if (tr->total_bytes < 0) {\
133 panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\
135 if (tr->journal_start < 0 || tr->journal_start > 128*1024*1024) {\
136 panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\
138 if (tr->journal_end < 0 || tr->journal_end > 128*1024*1024) {\
139 panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\
141 if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > 2048)) {\
142 panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\
149 // this isn't a great checksum routine but it will do for now.
150 // we use it to checksum the journal header and the block list
151 // headers that are at the start of each transaction.
//
// ptr - first byte to checksum
// len - number of bytes to fold into the checksum
154 calc_checksum(char *ptr
, int len
)
158 // this is a lame checksum but for now it'll do
// each byte is read as an unsigned char and mixed in as
// cksum = (cksum << 8) ^ (cksum + byte)
159 for(i
=0; i
< len
; i
++, ptr
++) {
160 cksum
= (cksum
<< 8) ^ (cksum
+ *(unsigned char *)ptr
);
171 // This function sets up a fake buf and passes it directly to the
172 // journal device strategy routine (so that it won't get cached in
175 // It also handles range checking the i/o so that we don't write
176 // outside the journal boundaries and it will wrap the i/o back
177 // to the beginning if necessary (skipping over the journal header)
//
// jnl       - journal to read from or write to
// offset    - in/out: byte offset within the journal; it is reset to
//             jhdr->jhdr_size when the i/o reaches the end of the journal
// data      - buffer to transfer to/from
// len       - number of bytes requested
// direction - JNL_READ or JNL_WRITE
//
// read_journal_data() and write_journal_data() return this function's
// result, and their callers compare it against the requested byte count.
180 do_journal_io(journal
*jnl
, off_t
*offset
, void *data
, size_t len
, int direction
)
182 int err
, io_sz
=0, curlen
=len
;
184 int max_iosize
=0, max_vectors
;
// refuse to do i/o at an offset outside [0, journal size]
186 if (*offset
< 0 || *offset
> jnl
->jhdr
->size
) {
187 panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset
, jnl
->jhdr
->size
);
191 bp
= alloc_io_buf(jnl
->jdev
, 1);
// writes bump the journal device's v_numoutput count; both directions
// ask the device for its maximum i/o size
193 if (direction
== JNL_WRITE
) {
194 bp
->b_flags
|= 0; // don't have to set any flags (was: B_WRITEINPROG)
195 jnl
->jdev
->v_numoutput
++;
196 vfs_io_attributes(jnl
->jdev
, B_WRITE
, &max_iosize
, &max_vectors
);
197 } else if (direction
== JNL_READ
) {
198 bp
->b_flags
|= B_READ
;
199 vfs_io_attributes(jnl
->jdev
, B_READ
, &max_iosize
, &max_vectors
);
// fall back to 128K if no maximum i/o size was reported
202 if (max_iosize
== 0) {
203 max_iosize
= 128 * 1024;
// if this request would run past the end of the journal, wrap the offset
// when it sits exactly at the end and shorten this pass so that it stops
// at the journal boundary
206 if (*offset
+ (off_t
)curlen
> jnl
->jhdr
->size
&& *offset
!= 0 && jnl
->jhdr
->size
!= 0) {
207 if (*offset
== jnl
->jhdr
->size
) {
208 *offset
= jnl
->jhdr
->jhdr_size
;
210 curlen
= (off_t
)jnl
->jhdr
->size
- *offset
;
214 if (curlen
> max_iosize
) {
219 panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %d\n", curlen
, *offset
, len
);
222 bp
->b_bufsize
= curlen
;
223 bp
->b_bcount
= curlen
;
// the device block number is the absolute byte position
// (jdev_offset + *offset) divided by jhdr_size
225 bp
->b_blkno
= (daddr_t
) ((jnl
->jdev_offset
+ *offset
) / (off_t
)jnl
->jhdr
->jhdr_size
);
226 bp
->b_lblkno
= (daddr_t
) ((jnl
->jdev_offset
+ *offset
) / (off_t
)jnl
->jhdr
->jhdr_size
);
228 err
= VOP_STRATEGY(bp
);
// clear the size and block number fields of the buf after the i/o
234 bp
->b_bufsize
= bp
->b_bcount
= 0;
235 bp
->b_blkno
= bp
->b_lblkno
= -1;
240 printf("jnl: do_jnl_io: strategy err 0x%x\n", err
);
247 // handle wrap-around
// advance the data pointer past what was just transferred, recompute the
// remaining length, and restart just past the journal header if we hit
// the end of the journal
248 data
= (char *)data
+ curlen
;
249 curlen
= len
- io_sz
;
250 if (*offset
>= jnl
->jhdr
->size
) {
251 *offset
= jnl
->jhdr
->jhdr_size
;
// Read len bytes from the journal at *offset into data.
// Thin wrapper: calls do_journal_io() with JNL_READ and returns its result.
260 read_journal_data(journal
*jnl
, off_t
*offset
, void *data
, size_t len
)
262 return do_journal_io(jnl
, offset
, data
, len
, JNL_READ
);
// Write len bytes from data into the journal at *offset.
// Thin wrapper: calls do_journal_io() with JNL_WRITE and returns its result.
266 write_journal_data(journal
*jnl
, off_t
*offset
, void *data
, size_t len
)
268 return do_journal_io(jnl
, offset
, data
, len
, JNL_WRITE
);
// Recompute the journal header checksum and write the header buffer to
// offset 0 of the journal. If the write does not transfer jhdr_size bytes
// the journal is flagged JOURNAL_INVALID. Callers in this file treat a
// nonzero return value as failure.
273 write_journal_header(journal
*jnl
)
276 off_t jhdr_offset
= 0;
279 // XXXdbg note: this ioctl doesn't seem to do anything on firewire disks.
// issue DKIOCSYNCHRONIZECACHE on the journal device before writing the header
281 ret
= VOP_IOCTL(jnl
->jdev
, DKIOCSYNCHRONIZECACHE
, NULL
, FWRITE
, NOCRED
, current_proc());
283 printf("jnl: flushing fs disk buffer returned 0x%x\n", ret
);
// the checksum field must be zero while the checksum is being computed
287 jnl
->jhdr
->checksum
= 0;
288 jnl
->jhdr
->checksum
= calc_checksum((char *)jnl
->jhdr
, sizeof(struct journal_header
));
289 if (write_journal_data(jnl
, &jhdr_offset
, jnl
->header_buf
, jnl
->jhdr
->jhdr_size
) != jnl
->jhdr
->jhdr_size
) {
290 printf("jnl: write_journal_header: error writing the journal header!\n");
291 jnl
->flags
|= JOURNAL_INVALID
;
301 // this is a work function used to free up transactions that
302 // completed. they can't be free'd from buffer_flushed_callback
303 // because it is called from deep within the disk driver stack
304 // and thus can't do something that would potentially cause
305 // paging. it gets called by each of the journal api entry
306 // points so stuff shouldn't hang around for too long.
309 free_old_stuff(journal
*jnl
)
311 transaction
*tr
, *next
;
// walk the jnl->tr_freeme list and release each transaction
313 for(tr
=jnl
->tr_freeme
; tr
; tr
=next
) {
315 FREE_ZONE(tr
, sizeof(transaction
), M_JNL_TR
);
// the list is now empty
318 jnl
->tr_freeme
= NULL
;
324 // This is our callback that lets us know when a buffer has been
325 // flushed to disk. It's called from deep within the driver stack
326 // and thus is quite limited in what it can do. Notably, it can
327 // not initiate any new i/o's or allocate/free memory.
//
// bp - the buffer that was flushed; bp->b_transaction identifies the
//      transaction it belongs to.
330 buffer_flushed_callback(struct buf
*bp
)
334 transaction
*ctr
, *prev
=NULL
, *next
;
338 //printf("jnl: buf flush: bp @ 0x%x l/blkno %d/%d vp 0x%x tr @ 0x%x\n",
339 // bp, bp->b_lblkno, bp->b_blkno, bp->b_vp, bp->b_transaction);
341 // snarf out the bits we want
342 bufsize
= bp
->b_bufsize
;
343 tr
= bp
->b_transaction
;
345 bp
->b_iodone
= NULL
; // don't call us for this guy again
346 bp
->b_transaction
= NULL
;
349 // This is what biodone() would do if it didn't call us.
350 // NOTE: THIS CODE *HAS* TO BE HERE!
352 if (ISSET(bp
->b_flags
, B_ASYNC
)) { /* if async, release it */
354 } else { /* or just wakeup the buffer */
355 CLR(bp
->b_flags
, B_WANTED
);
359 // NOTE: from here on out we do *NOT* touch bp anymore.
362 // then we've already seen it
367 CHECK_TRANSACTION(tr
);
370 if (jnl
->flags
& JOURNAL_INVALID
) {
376 // update the number of bytes that have been flushed.
377 // this buf may represent more than one block so take
378 // that into account.
379 tr
->num_flushed
+= bufsize
;
382 // if this transaction isn't done yet, just return as
383 // there is nothing to do.
// (done means flushed + killed bytes have reached total_bytes)
384 if ((tr
->num_flushed
+ tr
->num_killed
) < tr
->total_bytes
) {
388 //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
389 // tr, tr->journal_start, tr->journal_end, jnl);
391 // find this entry in the old_start[] index and mark it completed
// (the entry is matched with its top bit masked off, and marking it
// completed clears that top bit)
392 simple_lock(&jnl
->old_start_lock
);
393 for(i
=0; i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]); i
++) {
395 if ((jnl
->old_start
[i
] & ~(0x8000000000000000LL
)) == tr
->journal_start
) {
396 jnl
->old_start
[i
] &= ~(0x8000000000000000LL
);
// running off the end of old_start[] means the transaction was never recorded
400 if (i
>= sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0])) {
401 panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr 0x%x, jnl 0x%x)\n",
402 tr
->journal_start
, tr
, jnl
);
404 simple_unlock(&jnl
->old_start_lock
);
407 // if we are here then we need to update the journal header
408 // to reflect that this transaction is complete
// when this transaction begins exactly at active_start, active_start
// moves to its end and its range is zeroed to mark it as consumed
409 if (tr
->journal_start
== jnl
->active_start
) {
410 jnl
->active_start
= tr
->journal_end
;
411 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
414 // go through the completed_trs list and try to coalesce
415 // entries, restarting back at the beginning if we have to.
416 for(ctr
=jnl
->completed_trs
; ctr
; prev
=ctr
, ctr
=next
) {
// case 1: ctr begins at active_start, so advance active_start past it,
// unlink it from completed_trs and push it onto tr_freeme
417 if (ctr
->journal_start
== jnl
->active_start
) {
418 jnl
->active_start
= ctr
->journal_end
;
420 prev
->next
= ctr
->next
;
422 if (ctr
== jnl
->completed_trs
) {
423 jnl
->completed_trs
= ctr
->next
;
426 next
= jnl
->completed_trs
; // this starts us over again
427 ctr
->next
= jnl
->tr_freeme
;
428 jnl
->tr_freeme
= ctr
;
// case 2: tr ends where ctr begins, so extend ctr backwards to cover tr
430 } else if (tr
->journal_end
== ctr
->journal_start
) {
431 ctr
->journal_start
= tr
->journal_start
;
432 next
= jnl
->completed_trs
; // this starts us over again
434 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
// case 3: tr begins where ctr ends, so extend ctr forwards to cover tr
435 } else if (tr
->journal_start
== ctr
->journal_end
) {
436 ctr
->journal_end
= tr
->journal_end
;
438 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
444 // at this point no one should be using this guy anymore
// (total_bytes is overwritten with a recognizable poison value)
445 tr
->total_bytes
= 0xfbadc0de;
447 // if this is true then we didn't merge with anyone
448 // so link ourselves in at the head of the completed
450 if (tr
->journal_start
!= 0) {
451 // put this entry into the correct sorted place
452 // in the list instead of just at the head.
// (the scan stops at the first entry whose journal_start is not below tr's)
456 for(ctr
=jnl
->completed_trs
; ctr
&& tr
->journal_start
> ctr
->journal_start
; prev
=ctr
, ctr
=ctr
->next
) {
460 if (ctr
== NULL
&& prev
== NULL
) {
461 jnl
->completed_trs
= tr
;
463 } else if (ctr
== jnl
->completed_trs
) {
464 tr
->next
= jnl
->completed_trs
;
465 jnl
->completed_trs
= tr
;
467 tr
->next
= prev
->next
;
471 // if we're here this tr got merged with someone else so
472 // put it on the list to be free'd
473 tr
->next
= jnl
->tr_freeme
;
// Copy one block of journal data over the corresponding block on the
// file system device (jnl->fsdev).
//
// block_ptr - bsize bytes of block contents taken from the journal
// fs_block  - block number on the file system device
// bsize     - size of the block in bytes
//
// replay_journal() treats a nonzero return value as failure.
479 update_fs_block(journal
*jnl
, void *block_ptr
, off_t fs_block
, size_t bsize
)
482 struct buf
*oblock_bp
=NULL
;
484 // first read the block we want.
485 ret
= meta_bread(jnl
->fsdev
, (daddr_t
)fs_block
, bsize
, NOCRED
, &oblock_bp
);
487 printf("jnl: update_fs_block: error reading fs block # %lld! (ret %d)\n", fs_block
, ret
);
494 // let's try to be aggressive here and just re-write the block
495 oblock_bp
= getblk(jnl
->fsdev
, (daddr_t
)fs_block
, bsize
, 0, 0, BLK_META
);
496 if (oblock_bp
== NULL
) {
497 printf("jnl: update_fs_block: getblk() for %lld failed! failing update.\n", fs_block
);
502 // make sure it's the correct size.
503 if (oblock_bp
->b_bufsize
!= bsize
) {
508 // copy the journal data over top of it
509 memcpy(oblock_bp
->b_data
, block_ptr
, bsize
);
// write the updated buffer out with VOP_BWRITE
511 if ((ret
= VOP_BWRITE(oblock_bp
)) != 0) {
512 printf("jnl: update_fs_block: failed to update block %lld (ret %d)\n", fs_block
,ret
);
516 // and now invalidate it so that if someone else wants to read
517 // it in a different size they'll be able to do it.
518 ret
= meta_bread(jnl
->fsdev
, (daddr_t
)fs_block
, bsize
, NOCRED
, &oblock_bp
);
520 oblock_bp
->b_flags
|= B_INVAL
;
// Replay the transactions recorded in the journal between jhdr->start and
// jhdr->end: each block list header is read and validated, then every
// block it describes is read from the journal and written to the file
// system device with update_fs_block(). journal_open() treats a nonzero
// return value as failure.
529 replay_journal(journal
*jnl
)
531 int i
, ret
, checksum
, max_bsize
;
532 struct buf
*oblock_bp
;
533 block_list_header
*blhdr
;
535 char *buf
, *block_ptr
=NULL
;
537 // wrap the start ptr if it points to the very end of the journal
538 if (jnl
->jhdr
->start
== jnl
->jhdr
->size
) {
539 jnl
->jhdr
->start
= jnl
->jhdr
->jhdr_size
;
// same wrap for the end pointer
541 if (jnl
->jhdr
->end
== jnl
->jhdr
->size
) {
542 jnl
->jhdr
->end
= jnl
->jhdr
->jhdr_size
;
// start == end means the journal holds nothing to replay
545 if (jnl
->jhdr
->start
== jnl
->jhdr
->end
) {
549 // allocate memory for the header_block. we'll read each blhdr into this
550 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&buf
, jnl
->jhdr
->blhdr_size
)) {
551 printf("jnl: replay_journal: no memory for block buffer! (%d bytes)\n",
552 jnl
->jhdr
->blhdr_size
);
557 printf("jnl: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
558 jnl
->jhdr
->start
, jnl
->jhdr
->end
, jnl
->jdev_offset
);
// one pass of this loop handles one block list header and its blocks
560 while(jnl
->jhdr
->start
!= jnl
->jhdr
->end
) {
561 offset
= jnl
->jhdr
->start
;
562 ret
= read_journal_data(jnl
, &offset
, buf
, jnl
->jhdr
->blhdr_size
);
563 if (ret
!= jnl
->jhdr
->blhdr_size
) {
564 printf("jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset
);
568 blhdr
= (block_list_header
*)buf
;
569 checksum
= blhdr
->checksum
;
// verify the stored checksum over the first BLHDR_CHECKSUM_SIZE bytes
571 if (checksum
!= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
)) {
572 printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
573 offset
, checksum
, calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
));
// max_blocks must be in [1, 2048] and num_blocks in [1, max_blocks]
576 if ( blhdr
->max_blocks
<= 0 || blhdr
->max_blocks
> 2048
577 || blhdr
->num_blocks
<= 0 || blhdr
->num_blocks
> blhdr
->max_blocks
) {
578 printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
579 blhdr
->max_blocks
, blhdr
->num_blocks
);
// first pass over binfo[1..num_blocks-1]: reject negative block numbers
// other than -1 and find the largest block size
583 for(i
=1,max_bsize
=0; i
< blhdr
->num_blocks
; i
++) {
584 if (blhdr
->binfo
[i
].bnum
< 0 && blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
585 printf("jnl: replay_journal: bogus block number 0x%llx\n", blhdr
->binfo
[i
].bnum
);
588 if (blhdr
->binfo
[i
].bsize
> max_bsize
) {
589 max_bsize
= blhdr
->binfo
[i
].bsize
;
593 // make sure it's at least one page in size.
// (a size that is not page-aligned is rounded up to the next page multiple)
594 if (max_bsize
& (PAGE_SIZE
- 1)) {
595 max_bsize
= (max_bsize
+ PAGE_SIZE
) & ~(PAGE_SIZE
- 1);
598 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&block_ptr
, max_bsize
)) {
602 //printf("jnl: replay_journal: %d blocks in journal entry @ 0x%llx\n", blhdr->num_blocks-1,
603 // jnl->jhdr->start);
// second pass: read each block's data from the journal and write it to
// its home location
604 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
607 size
= blhdr
->binfo
[i
].bsize
;
609 ret
= read_journal_data(jnl
, &offset
, block_ptr
, size
);
611 printf("jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset
);
615 // don't replay "killed" blocks
616 if (blhdr
->binfo
[i
].bnum
== (off_t
)-1) {
617 // printf("jnl: replay_journal: skipping killed fs block (slot %d)\n", i);
619 //printf("jnl: replay_journal: fixing fs block # %lld (%d)\n",
620 // blhdr->binfo[i].bnum, blhdr->binfo[i].bsize);
622 if (update_fs_block(jnl
, block_ptr
, blhdr
->binfo
[i
].bnum
, blhdr
->binfo
[i
].bsize
) != 0) {
627 // check if we need to wrap offset back to the beginning
628 // (which is just past the journal header)
630 if (offset
>= jnl
->jhdr
->size
) {
631 offset
= jnl
->jhdr
->jhdr_size
;
635 kmem_free(kernel_map
, (vm_offset_t
)block_ptr
, max_bsize
);
// advance start past the bytes consumed by this block list
638 jnl
->jhdr
->start
+= blhdr
->bytes_used
;
639 if (jnl
->jhdr
->start
>= jnl
->jhdr
->size
) {
640 // wrap around and skip the journal header block
641 jnl
->jhdr
->start
= (jnl
->jhdr
->start
% jnl
->jhdr
->size
) + jnl
->jhdr
->jhdr_size
;
644 // only update the on-disk journal header if we've reached the
645 // last chunk of updates from this transaction. if binfo[0].bnum
646 // is zero then we know we're at the end.
647 if (blhdr
->binfo
[0].bnum
== 0) {
648 if (write_journal_header(jnl
) != 0) {
// release the block list header buffer
654 kmem_free(kernel_map
, (vm_offset_t
)buf
, jnl
->jhdr
->blhdr_size
);
// release both the block data buffer and the block list header buffer
659 kmem_free(kernel_map
, (vm_offset_t
)block_ptr
, max_bsize
);
661 kmem_free(kernel_map
, (vm_offset_t
)buf
, jnl
->jhdr
->blhdr_size
);
// Transaction buffer sizing. size_up_tbuffer() scales
// DEFAULT_TRANSACTION_BUFFER_SIZE by a factor of 1 to 4 based on mem_size
// and never lets a journal's tbuffer exceed MAX_TRANSACTION_BUFFER_SIZE.
666 #define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024)
667 //#define DEFAULT_TRANSACTION_BUFFER_SIZE (256*1024) // better performance but uses more mem
668 #define MAX_TRANSACTION_BUFFER_SIZE (512*1024)
670 // XXXdbg - so I can change it in the debugger
// 0 means "not yet computed"; size_up_tbuffer() fills it in the first
// time it runs with the value still 0.
671 int def_tbuffer_size
= 0;
675 // This function sets the size of the tbuffer and the
676 // size of the blhdr. It assumes that jnl->jhdr->size
677 // and jnl->jhdr->jhdr_size are already valid.
//
// tbuffer_size - requested transaction buffer size in bytes; 0 selects
//                the memory-based default (def_tbuffer_size)
// phys_blksz   - lower bound applied to jnl->jhdr->blhdr_size
680 size_up_tbuffer(journal
*jnl
, int tbuffer_size
, int phys_blksz
)
683 // one-time initialization based on how much memory
684 // there is in the machine.
// under 256MB gives 1x the default, under 512MB 2x, under 1GB 3x,
// and 1GB or more 4x
686 if (def_tbuffer_size
== 0) {
687 if (mem_size
< (256*1024*1024)) {
688 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
;
689 } else if (mem_size
< (512*1024*1024)) {
690 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 2;
691 } else if (mem_size
< (1024*1024*1024)) {
692 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 3;
693 } else if (mem_size
>= (1024*1024*1024)) {
694 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 4;
698 // size up the transaction buffer... can't be larger than the number
699 // of blocks that can fit in a block_list_header block.
700 if (tbuffer_size
== 0) {
701 jnl
->tbuffer_size
= def_tbuffer_size
;
703 // make sure that the specified tbuffer_size isn't too small
// (the minimum is twice the current blhdr_size)
704 if (tbuffer_size
< jnl
->jhdr
->blhdr_size
* 2) {
705 tbuffer_size
= jnl
->jhdr
->blhdr_size
* 2;
707 // and make sure it's an even multiple of the block size
// (rounds down to a multiple of jhdr_size)
708 if ((tbuffer_size
% jnl
->jhdr
->jhdr_size
) != 0) {
709 tbuffer_size
-= (tbuffer_size
% jnl
->jhdr
->jhdr_size
);
712 jnl
->tbuffer_size
= tbuffer_size
;
// never use more than half of the journal for one transaction buffer
715 if (jnl
->tbuffer_size
> (jnl
->jhdr
->size
/ 2)) {
716 jnl
->tbuffer_size
= (jnl
->jhdr
->size
/ 2);
// and never exceed the hard upper bound
719 if (jnl
->tbuffer_size
> MAX_TRANSACTION_BUFFER_SIZE
) {
720 jnl
->tbuffer_size
= MAX_TRANSACTION_BUFFER_SIZE
;
// blhdr_size is one block_info per jhdr_size-sized block of the tbuffer,
// but not less than phys_blksz
723 jnl
->jhdr
->blhdr_size
= (jnl
->tbuffer_size
/ jnl
->jhdr
->jhdr_size
) * sizeof(block_info
);
724 if (jnl
->jhdr
->blhdr_size
< phys_blksz
) {
725 jnl
->jhdr
->blhdr_size
= phys_blksz
;
// Set up a brand new journal on the device jvp: validate the block size
// and journal size, allocate the journal structure and its header buffer,
// fill in an empty header (start == end == the first block after the
// header) and write that header to disk.
732 journal_create(struct vnode
*jvp
,
738 int32_t tbuffer_size
,
739 void (*flush
)(void *arg
),
745 /* Get the real physical block size. */
746 if (VOP_IOCTL(jvp
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
, 0, FSCRED
, NULL
)) {
// the device block size must not exceed the file system's minimum block size
750 if (phys_blksz
> min_fs_blksz
) {
751 printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
752 phys_blksz
, min_fs_blksz
);
// the journal must be a whole number of device blocks
756 if ((journal_size
% phys_blksz
) != 0) {
757 printf("jnl: create: journal size 0x%llx is not an even multiple of block size 0x%x\n",
758 journal_size
, phys_blksz
);
// allocate and zero the in-memory journal structure
762 MALLOC_ZONE(jnl
, struct journal
*, sizeof(struct journal
), M_JNL_JNL
, M_WAITOK
);
763 memset(jnl
, 0, sizeof(*jnl
));
766 jnl
->jdev_offset
= offset
;
769 jnl
->flush_arg
= arg
;
// keep only the option bits of the caller's flags
770 jnl
->flags
= (flags
& JOURNAL_OPTION_FLAGS_MASK
);
771 simple_lock_init(&jnl
->old_start_lock
);
// the on-disk header lives in a buffer of one physical block
773 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&jnl
->header_buf
, phys_blksz
)) {
774 printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz
);
778 memset(jnl
->header_buf
, 0, phys_blksz
);
780 jnl
->jhdr
= (journal_header
*)jnl
->header_buf
;
781 jnl
->jhdr
->magic
= JOURNAL_HEADER_MAGIC
;
782 jnl
->jhdr
->endian
= ENDIAN_MAGIC
;
783 jnl
->jhdr
->start
= phys_blksz
; // start at block #1, block #0 is for the jhdr itself
784 jnl
->jhdr
->end
= phys_blksz
;
785 jnl
->jhdr
->size
= journal_size
;
786 jnl
->jhdr
->jhdr_size
= phys_blksz
;
787 size_up_tbuffer(jnl
, tbuffer_size
, phys_blksz
);
789 jnl
->active_start
= jnl
->jhdr
->start
;
791 // XXXdbg - for testing you can force the journal to wrap around
792 // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
793 // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);
// the journal semaphore is created with an initial value of 1
795 if (semaphore_create(kernel_task
, &jnl
->jsem
, SYNC_POLICY_FIFO
, 1) != 0) {
796 printf("jnl: journal_create: failed to create journal semaphore..\n");
// push the freshly initialized header out to disk
800 if (write_journal_header(jnl
) != 0) {
801 printf("jnl: journal_create: failed to write journal header.\n");
// release the semaphore, the header buffer and the journal structure
809 semaphore_destroy(kernel_task
, jnl
->jsem
);
811 kmem_free(kernel_map
, (vm_offset_t
)jnl
->header_buf
, phys_blksz
);
814 FREE_ZONE(jnl
, sizeof(struct journal
), M_JNL_JNL
);
// Open an existing journal on the device jvp: read and validate the
// on-disk header, then either reset the start/end pointers (JOURNAL_RESET)
// or replay the journal, and finish setting up the in-memory journal.
820 journal_open(struct vnode
*jvp
,
826 int32_t tbuffer_size
,
827 void (*flush
)(void *arg
),
// orig_blksz stays 0 unless the device block size has to be changed to
// match the block size recorded in the journal header
831 int orig_blksz
=0, phys_blksz
, blhdr_size
;
834 /* Get the real physical block size. */
835 if (VOP_IOCTL(jvp
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
, 0, FSCRED
, NULL
)) {
// NOTE(review): the message below says "create" although this is journal_open
839 if (phys_blksz
> min_fs_blksz
) {
840 printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
841 phys_blksz
, min_fs_blksz
);
845 if ((journal_size
% phys_blksz
) != 0) {
846 printf("jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
847 journal_size
, phys_blksz
);
// allocate and zero the in-memory journal structure
851 MALLOC_ZONE(jnl
, struct journal
*, sizeof(struct journal
), M_JNL_JNL
, M_WAITOK
);
852 memset(jnl
, 0, sizeof(*jnl
));
855 jnl
->jdev_offset
= offset
;
858 jnl
->flush_arg
= arg
;
859 jnl
->flags
= (flags
& JOURNAL_OPTION_FLAGS_MASK
);
860 simple_lock_init(&jnl
->old_start_lock
);
// NOTE(review): the message below says "create" although this is journal_open
862 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&jnl
->header_buf
, phys_blksz
)) {
863 printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz
);
867 jnl
->jhdr
= (journal_header
*)jnl
->header_buf
;
868 memset(jnl
->jhdr
, 0, sizeof(journal_header
)+4);
870 // we have to set this up here so that do_journal_io() will work
871 jnl
->jhdr
->jhdr_size
= phys_blksz
;
// read one physical block into the header buffer
873 if (read_journal_data(jnl
, &hdr_offset
, jnl
->jhdr
, phys_blksz
) != phys_blksz
) {
874 printf("jnl: open: could not read %d bytes for the journal header.\n",
// accept either the current or the old header magic
879 if (jnl
->jhdr
->magic
!= JOURNAL_HEADER_MAGIC
&& jnl
->jhdr
->magic
!= OLD_JOURNAL_HEADER_MAGIC
) {
880 printf("jnl: open: journal magic is bad (0x%x != 0x%x)\n",
881 jnl
->jhdr
->magic
, JOURNAL_HEADER_MAGIC
);
885 // only check if we're the current journal header magic value
886 if (jnl
->jhdr
->magic
== JOURNAL_HEADER_MAGIC
) {
887 int orig_checksum
= jnl
->jhdr
->checksum
;
// the checksum field is zeroed before the checksum is recomputed
889 jnl
->jhdr
->checksum
= 0;
890 if (orig_checksum
!= calc_checksum((char *)jnl
->jhdr
, sizeof(struct journal_header
))) {
891 printf("jnl: open: journal checksum is bad (0x%x != 0x%x)\n", orig_checksum
,
892 calc_checksum((char *)jnl
->jhdr
, sizeof(struct journal_header
)));
897 // XXXdbg - convert old style magic numbers to the new one
898 if (jnl
->jhdr
->magic
== OLD_JOURNAL_HEADER_MAGIC
) {
899 jnl
->jhdr
->magic
= JOURNAL_HEADER_MAGIC
;
// if the header records a different (non-zero) block size, remember the
// device's block size in orig_blksz and switch the device to the size
// recorded in the header
902 if (phys_blksz
!= jnl
->jhdr
->jhdr_size
&& jnl
->jhdr
->jhdr_size
!= 0) {
903 printf("jnl: open: phys_blksz %d does not match journal header size %d\n",
904 phys_blksz
, jnl
->jhdr
->jhdr_size
);
906 orig_blksz
= phys_blksz
;
907 phys_blksz
= jnl
->jhdr
->jhdr_size
;
908 if (VOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&phys_blksz
, FWRITE
, FSCRED
, NULL
)) {
909 printf("jnl: could not set block size to %d bytes.\n", phys_blksz
);
// range checks on start, end and size (the same limits CHECK_JOURNAL
// uses, but reported with printf instead of panic)
915 if ( jnl
->jhdr
->start
<= 0
916 || jnl
->jhdr
->start
> jnl
->jhdr
->size
917 || jnl
->jhdr
->start
> 128*1024*1024) {
918 printf("jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
919 jnl
->jhdr
->start
, jnl
->jhdr
->size
);
923 if ( jnl
->jhdr
->end
<= 0
924 || jnl
->jhdr
->end
> jnl
->jhdr
->size
925 || jnl
->jhdr
->end
> 128*1024*1024) {
926 printf("jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
927 jnl
->jhdr
->end
, jnl
->jhdr
->size
);
931 if (jnl
->jhdr
->size
> 128*1024*1024) {
932 printf("jnl: open: jhdr size looks bad (0x%llx)\n", jnl
->jhdr
->size
);
936 // XXXdbg - can't do these checks because hfs writes all kinds of
937 // non-uniform sized blocks even on devices that have a block size
938 // that is larger than 512 bytes (i.e. optical media w/2k blocks).
939 // therefore these checks will fail and so we just have to punt and
940 // do more relaxed checking...
941 // XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
942 if ((jnl
->jhdr
->start
% 512) != 0) {
943 printf("jnl: open: journal start (0x%llx) not a multiple of 512?\n",
948 //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
// NOTE(review): the check is against 512 but the message reports jhdr_size
949 if ((jnl
->jhdr
->end
% 512) != 0) {
950 printf("jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
951 jnl
->jhdr
->end
, jnl
->jhdr
->jhdr_size
);
955 // take care of replaying the journal if necessary
// (JOURNAL_RESET skips the replay and just sets start = end)
956 if (flags
& JOURNAL_RESET
) {
957 printf("jnl: journal start/end pointers reset! (jnl 0x%x; s 0x%llx e 0x%llx)\n",
958 jnl
, jnl
->jhdr
->start
, jnl
->jhdr
->end
);
959 jnl
->jhdr
->start
= jnl
->jhdr
->end
;
960 } else if (replay_journal(jnl
) != 0) {
961 printf("jnl: journal_open: Error replaying the journal!\n");
// put the device block size back if it was changed above
965 if (orig_blksz
!= 0) {
966 VOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&orig_blksz
, FWRITE
, FSCRED
, NULL
);
967 phys_blksz
= orig_blksz
;
970 // make sure this is in sync!
971 jnl
->active_start
= jnl
->jhdr
->start
;
973 // set this now, after we've replayed the journal
974 size_up_tbuffer(jnl
, tbuffer_size
, phys_blksz
);
// NOTE(review): the message below says "journal_create" although this is journal_open
976 if (semaphore_create(kernel_task
, &jnl
->jsem
, SYNC_POLICY_FIFO
, 1) != 0) {
977 printf("jnl: journal_create: failed to create journal semaphore..\n");
// restore the original device block size if it was changed, then release
// the header buffer and the journal structure
984 if (orig_blksz
!= 0) {
985 phys_blksz
= orig_blksz
;
986 VOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&orig_blksz
, FWRITE
, FSCRED
, NULL
);
988 kmem_free(kernel_map
, (vm_offset_t
)jnl
->header_buf
, phys_blksz
);
990 FREE_ZONE(jnl
, sizeof(struct journal
), M_JNL_JNL
);
// Shut down a journal. If the journal is still valid, any active or
// buffered transaction is ended, the flush callback is called until
// active_start catches up with jhdr->end (or the retry limit is reached),
// and the header is written. If the journal is invalid, outstanding
// transactions are aborted instead. Finally all journal resources are freed.
995 journal_close(journal
*jnl
)
// volatile: these are re-read on every pass of the flush wait loop below
997 volatile off_t
*start
, *end
;
1002 // set this before doing anything that would block so that
1003 // we start tearing things down properly.
1005 jnl
->flags
|= JOURNAL_CLOSE_PENDING
;
// take the journal semaphore unless this thread already owns the journal
1007 if (jnl
->owner
!= current_act()) {
1010 while ((ret
= semaphore_wait(jnl
->jsem
)) == KERN_ABORTED
) {
1011 // just keep trying if we've been ^C'ed
1014 printf("jnl: close: sem wait failed.\n");
1020 // only write stuff to disk if the journal is still valid
1022 if ((jnl
->flags
& JOURNAL_INVALID
) == 0) {
1024 if (jnl
->active_tr
) {
1025 journal_end_transaction(jnl
);
1028 // flush any buffered transactions
1030 transaction
*tr
= jnl
->cur_tr
;
1033 end_transaction(tr
, 1); // force it to get flushed
1036 //start = &jnl->jhdr->start;
1037 start
= &jnl
->active_start
;
1038 end
= &jnl
->jhdr
->end
;
// call the flush callback and sleep until everything has been flushed,
// giving up after 500 iterations
1040 while (*start
!= *end
&& counter
++ < 500) {
1041 printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start
, *end
);
1043 jnl
->flush(jnl
->flush_arg
);
1045 tsleep((caddr_t
)jnl
, PRIBIO
, "jnl_close", 1);
1048 if (*start
!= *end
) {
1049 printf("jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
1053 // make sure this is in sync when we close the journal
1054 jnl
->jhdr
->start
= jnl
->active_start
;
1056 // if this fails there's not much we can do at this point...
1057 write_journal_header(jnl
);
1059 // if we're here the journal isn't valid any more.
1060 // so make sure we don't leave any locked blocks lying around
1061 printf("jnl: close: journal 0x%x, is invalid. aborting outstanding transactions\n", jnl
);
1062 if (jnl
->active_tr
|| jnl
->cur_tr
) {
1064 if (jnl
->active_tr
) {
1065 tr
= jnl
->active_tr
;
1066 jnl
->active_tr
= NULL
;
1072 abort_transaction(jnl
, tr
);
// after aborting one transaction, neither pointer may still be set
1073 if (jnl
->active_tr
|| jnl
->cur_tr
) {
1074 panic("jnl: close: jnl @ 0x%x had both an active and cur tr\n", jnl
);
// release any completed transactions still waiting on the tr_freeme list
1079 free_old_stuff(jnl
);
1081 kmem_free(kernel_map
, (vm_offset_t
)jnl
->header_buf
, jnl
->jhdr
->jhdr_size
);
// overwrite the stale header pointer with a recognizable poison value
1082 jnl
->jhdr
= (void *)0xbeefbabe;
1084 semaphore_destroy(kernel_task
, jnl
->jsem
);
1085 FREE_ZONE(jnl
, sizeof(struct journal
), M_JNL_JNL
);
// Debugging aid: print the journal's device offset, the fields of its
// header, and the start/end range of every transaction on the
// completed_trs list.
1089 dump_journal(journal
*jnl
)
1094 printf(" jdev_offset %.8llx\n", jnl
->jdev_offset
);
1095 printf(" magic: 0x%.8x\n", jnl
->jhdr
->magic
);
1096 printf(" start: 0x%.8llx\n", jnl
->jhdr
->start
);
1097 printf(" end: 0x%.8llx\n", jnl
->jhdr
->end
);
1098 printf(" size: 0x%.8llx\n", jnl
->jhdr
->size
);
1099 printf(" blhdr size: %d\n", jnl
->jhdr
->blhdr_size
);
1100 printf(" jhdr size: %d\n", jnl
->jhdr
->jhdr_size
);
1101 printf(" chksum: 0x%.8x\n", jnl
->jhdr
->checksum
);
1103 printf(" completed transactions:\n");
1104 for(ctr
=jnl
->completed_trs
; ctr
; ctr
=ctr
->next
) {
1105 printf(" 0x%.8llx - 0x%.8llx\n", ctr
->journal_start
, ctr
->journal_end
);
// Compute the number of free bytes in the journal from the header's
// start and end pointers. The journal header block itself (jhdr_size
// bytes) is never counted as free.
1112 free_space(journal
*jnl
)
// start < end: the used region is [start, end), so the free space is
// everything else minus the header block
1116 if (jnl
->jhdr
->start
< jnl
->jhdr
->end
) {
1117 free_space
= jnl
->jhdr
->size
- (jnl
->jhdr
->end
- jnl
->jhdr
->start
) - jnl
->jhdr
->jhdr_size
;
// start > end: the used region wraps around, so the free space is the
// gap between end and start
1118 } else if (jnl
->jhdr
->start
> jnl
->jhdr
->end
) {
1119 free_space
= jnl
->jhdr
->start
- jnl
->jhdr
->end
;
1121 // journal is completely empty
1122 free_space
= jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
;
1130 // The journal must be locked on entry to this function.
1131 // The "desired_size" is in bytes.
//
// Waits until the journal has more than desired_size bytes free. It
// advances jhdr->start over old_start[] entries, calls the file system's
// flush callback, and sleeps between attempts. journal_start_transaction()
// treats a nonzero return value as "no space".
1134 check_free_space(journal
*jnl
, int desired_size
)
1138 //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
1139 // desired_size, free_space(jnl));
1142 int old_start_empty
;
// counter limits how long we keep trying: dump/panic when it hits 5000,
// give up once it exceeds 7500
1144 if (counter
++ == 5000) {
1146 panic("jnl: check_free_space: buffer flushing isn't working "
1147 "(jnl @ 0x%x s %lld e %lld f %lld [active start %lld]).\n", jnl
,
1148 jnl
->jhdr
->start
, jnl
->jhdr
->end
, free_space(jnl
), jnl
->active_start
);
1150 if (counter
> 7500) {
1151 printf("jnl: check_free_space: giving up waiting for free space.\n");
1155 // make sure there's space in the journal to hold this transaction
1156 if (free_space(jnl
) > desired_size
) {
1161 // here's where we lazily bump up jnl->jhdr->start. we'll consume
1162 // entries until there is enough space for the next transaction.
1164 old_start_empty
= 1;
1165 simple_lock(&jnl
->old_start_lock
);
1166 for(i
=0; i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]); i
++) {
// an entry with its top bit set has not finished flushing
// (buffer_flushed_callback clears the bit); drop the lock, call the
// flush callback and sleep until the bit clears
1170 while (jnl
->old_start
[i
] & 0x8000000000000000LL
) {
1171 if (counter
++ > 100) {
1172 panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl 0x%x).\n",
1173 jnl
->old_start
[i
], jnl
);
1176 simple_unlock(&jnl
->old_start_lock
);
1178 jnl
->flush(jnl
->flush_arg
);
1180 tsleep((caddr_t
)jnl
, PRIBIO
, "check_free_space1", 1);
1181 simple_lock(&jnl
->old_start_lock
);
// a zero entry is an unused slot
1184 if (jnl
->old_start
[i
] == 0) {
// consume this entry: move jhdr->start up to it and clear the slot
1188 old_start_empty
= 0;
1189 jnl
->jhdr
->start
= jnl
->old_start
[i
];
1190 jnl
->old_start
[i
] = 0;
1191 if (free_space(jnl
) > desired_size
) {
1192 write_journal_header(jnl
);
1196 simple_unlock(&jnl
->old_start_lock
);
1198 // if we bumped the start, loop and try again
1199 if (i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0])) {
1201 } else if (old_start_empty
) {
1203 // if there is nothing in old_start anymore then we can
1204 // bump the jhdr->start to be the same as active_start
1205 // since it is possible there was only one very large
1206 // transaction in the old_start array. if we didn't do
1207 // this then jhdr->start would never get updated and we
1208 // would wind up looping until we hit the panic at the
1209 // start of the loop.
1211 jnl
->jhdr
->start
= jnl
->active_start
;
1212 write_journal_header(jnl
);
1217 // if the file system gave us a flush function, call it so that
1218 // it can flush some blocks which hopefully will cause some transactions
1219 // to complete and thus free up space in the journal.
1221 jnl
->flush(jnl
->flush_arg
);
1224 // wait for a while to avoid being cpu-bound (this will
1225 // put us to sleep for 10 milliseconds)
1226 tsleep((caddr_t
)jnl
, PRIBIO
, "check_free_space2", 1);
// journal_start_transaction
//
// Begins a transaction on "jnl", or nests into the one the calling
// activation already owns.  A nested call only increments
// jnl->nested_count.  Otherwise the caller waits on jnl->jsem, becomes
// the owner, makes sure the journal has room for jnl->tbuffer_size
// bytes, and installs either the buffered jnl->cur_tr or a freshly
// allocated transaction as jnl->active_tr.
1233 journal_start_transaction(journal
*jnl
)
1240 if (jnl
->flags
& JOURNAL_INVALID
) {
// re-entrant case: the current activation already holds the journal
1244 if (jnl
->owner
== current_act()) {
1245 if (jnl
->active_tr
== NULL
) {
1246 panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_act 0x%x\n",
1247 jnl
, jnl
->owner
, current_act());
1249 jnl
->nested_count
++;
1253 while ((ret
= semaphore_wait(jnl
->jsem
)) == KERN_ABORTED
) {
1254 // just keep looping if we've been ^C'ed
1257 printf("jnl: start_tr: sem wait failed.\n");
// having just acquired the semaphore, nobody else may own the journal
1261 if (jnl
->owner
!= NULL
|| jnl
->nested_count
!= 0 || jnl
->active_tr
!= NULL
) {
1262 panic("jnl: start_tr: owner 0x%x, nested count 0x%x, active_tr 0x%x jnl @ 0x%x\n",
1263 jnl
->owner
, jnl
->nested_count
, jnl
->active_tr
, jnl
);
1266 jnl
->owner
= current_act();
1267 jnl
->nested_count
= 1;
1269 free_old_stuff(jnl
);
1271 // make sure there's room in the journal
1272 if (check_free_space(jnl
, jnl
->tbuffer_size
) != 0) {
1273 printf("jnl: start transaction failed: no space\n");
1278 // if there's a buffered transaction, use it.
1280 jnl
->active_tr
= jnl
->cur_tr
;
// no buffered transaction: build a new one with its own tbuffer
1286 MALLOC_ZONE(tr
, transaction
*, sizeof(transaction
), M_JNL_TR
, M_WAITOK
);
1287 memset(tr
, 0, sizeof(transaction
));
1289 tr
->tbuffer_size
= jnl
->tbuffer_size
;
1290 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&tr
->tbuffer
, tr
->tbuffer_size
)) {
1291 FREE_ZONE(tr
, sizeof(transaction
), M_JNL_TR
);
1292 printf("jnl: start transaction failed: no tbuffer mem\n");
1297 // journal replay code checksum check depends on this.
1298 memset(tr
->tbuffer
, 0, BLHDR_CHECKSUM_SIZE
);
// the first block_list_header lives at the start of the tbuffer
1300 tr
->blhdr
= (block_list_header
*)tr
->tbuffer
;
1301 tr
->blhdr
->max_blocks
= (jnl
->jhdr
->blhdr_size
/ sizeof(block_info
)) - 1;
1302 tr
->blhdr
->num_blocks
= 1; // accounts for this header block
1303 tr
->blhdr
->bytes_used
= jnl
->jhdr
->blhdr_size
;
1306 tr
->total_bytes
= jnl
->jhdr
->blhdr_size
;
1309 jnl
->active_tr
= tr
;
1311 // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, tr);
// NOTE(review): the lines below reset the nesting count and release the
// semaphore; presumably this is the shared failure exit -- the
// surrounding label/return lines are not visible here, confirm.
1317 jnl
->nested_count
= 0;
1318 semaphore_signal(jnl
->jsem
);
// journal_modify_block_start
//
// Called on buffer "bp" before it is modified as part of the active
// transaction.  Panics unless bp is a B_META buffer, the caller owns
// the journal, bp's size is a multiple of jhdr->jhdr_size and the
// transaction stays smaller than the journal.  On the normal path the
// buffer is marked B_LOCKED.
1324 journal_modify_block_start(journal
*jnl
, struct buf
*bp
)
1330 if (jnl
->flags
& JOURNAL_INVALID
) {
1334 // XXXdbg - for debugging I want this to be true. later it may
1335 // not be necessary.
1336 if ((bp
->b_flags
& B_META
) == 0) {
1337 panic("jnl: modify_block_start: bp @ 0x%x is not a meta-data block! (jnl 0x%x)\n", bp
, jnl
);
1340 tr
= jnl
->active_tr
;
1341 CHECK_TRANSACTION(tr
);
1343 if (jnl
->owner
!= current_act()) {
1344 panic("jnl: modify_block_start: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1345 jnl
, jnl
->owner
, current_act());
1348 free_old_stuff(jnl
);
1350 //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d; total bytes %d)\n",
1351 // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes);
1353 // can't allow blocks that aren't an even multiple of the
1354 // underlying block size.
1355 if ((bp
->b_bufsize
% jnl
->jhdr
->jhdr_size
) != 0) {
1356 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
1357 bp
->b_bufsize
, jnl
->jhdr
->jhdr_size
);
1361 // make sure that this transaction isn't bigger than the whole journal
1362 if (tr
->total_bytes
+bp
->b_bufsize
>= (jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
)) {
1363 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr 0x%x bp 0x%x)\n",
1364 tr
->total_bytes
, (tr
->jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
), bp
->b_bufsize
, tr
, bp
);
1368 // if the block is dirty and not already locked we have to write
1369 // it out before we muck with it because it has data that belongs
1370 // (presumably) to another transaction.
1372 if ((bp
->b_flags
& B_DELWRI
) && (bp
->b_flags
& B_LOCKED
) == 0) {
1374 // this will cause it to not be brelse()'d
1375 bp
->b_flags
|= B_NORELSE
;
// journal_modify_block_end() and journal_kill_block() panic when they
// are handed a buffer without B_LOCKED, so set it here.
1379 bp
->b_flags
|= B_LOCKED
;
// journal_modify_block_abort
//
// Backs out of a journal_modify_block_start() on "bp".  The active
// transaction's block lists are searched for bp; if it is not found
// (the walk ends with blhdr == NULL) the B_LOCKED flag is cleared,
// otherwise the buffer stays locked.
1385 journal_modify_block_abort(journal
*jnl
, struct buf
*bp
)
1388 block_list_header
*blhdr
;
1393 tr
= jnl
->active_tr
;
1396 // if there's no active transaction then we just want to
1397 // call brelse() and return since this is just a block
1398 // that happened to be modified as part of another tr.
1405 if (jnl
->flags
& JOURNAL_INVALID
) {
1409 CHECK_TRANSACTION(tr
);
1411 if (jnl
->owner
!= current_act()) {
1412 panic("jnl: modify_block_abort: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1413 jnl
, jnl
->owner
, current_act());
1416 free_old_stuff(jnl
);
1418 // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
1420 // first check if it's already part of this transaction
// block_list_headers are chained through binfo[0].bnum, which holds
// the address of the next header (0 terminates the walk).
1421 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
// slot 0 is the header itself, so real entries start at index 1
1422 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
1423 if (bp
== blhdr
->binfo
[i
].bp
) {
1424 if (bp
->b_bufsize
!= blhdr
->binfo
[i
].bsize
) {
1425 panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
1426 bp
, bp
->b_bufsize
, blhdr
->binfo
[i
].bsize
, jnl
);
1432 if (i
< blhdr
->num_blocks
) {
1438 // if blhdr is null, then this block has only had modify_block_start
1439 // called on it as part of the current transaction. that means that
1440 // it is ok to clear the LOCKED bit since it hasn't actually been
1441 // modified. if blhdr is non-null then modify_block_end was called
1442 // on it and so we need to keep it locked in memory.
1444 if (blhdr
== NULL
) {
1445 bp
->b_flags
&= ~(B_LOCKED
);
// journal_modify_block_end
//
// Records the current contents of "bp" in the active transaction's
// in-memory buffer.  The block lists are searched for an existing
// entry for bp; if none exists and the last block_list_header is
// full, another tbuffer is allocated and chained on.  bp->b_data is
// then copied at the computed offset and, for a block not seen
// before, a binfo entry is filled in and the byte/block counters are
// advanced.
1454 journal_modify_block_end(journal
*jnl
, struct buf
*bp
)
1456 int i
, j
, tbuffer_offset
;
1458 block_list_header
*blhdr
, *prev
=NULL
;
1463 if (jnl
->flags
& JOURNAL_INVALID
) {
1467 tr
= jnl
->active_tr
;
1468 CHECK_TRANSACTION(tr
);
1470 if (jnl
->owner
!= current_act()) {
1471 panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1472 jnl
, jnl
->owner
, current_act());
1475 free_old_stuff(jnl
);
1477 //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d, total bytes %d)\n",
1478 // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes);
1480 if ((bp
->b_flags
& B_LOCKED
) == 0) {
1481 panic("jnl: modify_block_end: bp 0x%x not locked! jnl @ 0x%x\n", bp
, jnl
);
1482 bp
->b_flags
|= B_LOCKED
;
1485 // first check if it's already part of this transaction
// "prev" trails "blhdr" so that the last header is still reachable
// once the walk runs off the end of the chain.
1486 for(blhdr
=tr
->blhdr
; blhdr
; prev
=blhdr
,blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
// data for entry i sits after the header area and after the data of
// entries 1..i-1, so accumulate sizes while scanning
1487 tbuffer_offset
= jnl
->jhdr
->blhdr_size
;
1489 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
1490 if (bp
== blhdr
->binfo
[i
].bp
) {
1491 if (bp
->b_bufsize
!= blhdr
->binfo
[i
].bsize
) {
1492 panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
1493 bp
, bp
->b_bufsize
, blhdr
->binfo
[i
].bsize
, jnl
);
1497 tbuffer_offset
+= blhdr
->binfo
[i
].bsize
;
1500 if (i
< blhdr
->num_blocks
) {
1507 && (prev
->num_blocks
+1) <= prev
->max_blocks
1508 && (prev
->bytes_used
+bp
->b_bufsize
) <= tr
->tbuffer_size
) {
1510 } else if (blhdr
== NULL
) {
1511 block_list_header
*nblhdr
;
1514 panic("jnl: modify block end: no way man, prev == NULL?!?, jnl 0x%x, bp 0x%x\n", jnl
, bp
);
1517 // we got to the end of the list, didn't find the block and there's
1518 // no room in the block_list_header pointed to by prev
1520 // we allocate another tbuffer and link it in at the end of the list
1521 // through prev->binfo[0].bnum. that's a skanky way to do things but
1522 // avoids having yet another linked list of small data structures to manage.
1524 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&nblhdr
, tr
->tbuffer_size
)) {
1525 panic("jnl: end_tr: no space for new block tr @ 0x%x (total bytes: %d)!\n",
1526 tr
, tr
->total_bytes
);
1529 // journal replay code checksum check depends on this.
1530 memset(nblhdr
, 0, BLHDR_CHECKSUM_SIZE
);
1532 // initialize the new guy
1533 nblhdr
->max_blocks
= (jnl
->jhdr
->blhdr_size
/ sizeof(block_info
)) - 1;
1534 nblhdr
->num_blocks
= 1; // accounts for this header block
1535 nblhdr
->bytes_used
= jnl
->jhdr
->blhdr_size
;
1538 tr
->total_bytes
+= jnl
->jhdr
->blhdr_size
;
1540 // then link him in at the end
1541 prev
->binfo
[0].bnum
= (off_t
)((long)nblhdr
);
1543 // and finally switch to using the new guy
1545 tbuffer_offset
= jnl
->jhdr
->blhdr_size
;
1550 if ((i
+1) > blhdr
->max_blocks
) {
1551 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i
, blhdr
->max_blocks
);
1554 // copy the data into the in-memory transaction buffer
1555 blkptr
= (char *)&((char *)blhdr
)[tbuffer_offset
];
1556 memcpy(blkptr
, bp
->b_data
, bp
->b_bufsize
);
1558 // if this is true then this is a new block we haven't seen
1559 if (i
>= blhdr
->num_blocks
) {
// take a reference on the buffer's vnode; other functions in this
// file carry "this undoes the vget()" comments for the matching release
1560 vget(bp
->b_vp
, 0, current_proc());
1562 blhdr
->binfo
[i
].bnum
= bp
->b_blkno
;
1563 blhdr
->binfo
[i
].bsize
= bp
->b_bufsize
;
1564 blhdr
->binfo
[i
].bp
= bp
;
1566 blhdr
->bytes_used
+= bp
->b_bufsize
;
1567 tr
->total_bytes
+= bp
->b_bufsize
;
1569 blhdr
->num_blocks
++;
// journal_kill_block
//
// Drops buffer "bp" from the active transaction.  When bp is found in
// one of the transaction's block lists its B_LOCKED flag is cleared,
// its size is added to tr->num_killed and its binfo slot is marked
// dead (bp = NULL, bnum = -1), which is the pattern end_transaction()
// checks for on "killed" blocks.
1578 journal_kill_block(journal
*jnl
, struct buf
*bp
)
1581 block_list_header
*blhdr
;
1586 if (jnl
->flags
& JOURNAL_INVALID
) {
1590 tr
= jnl
->active_tr
;
1591 CHECK_TRANSACTION(tr
);
1593 if (jnl
->owner
!= current_act()) {
// NOTE(review): the message below says "modify_block_end" although
// this is journal_kill_block -- looks like a copy/paste leftover.
1594 panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1595 jnl
, jnl
->owner
, current_act());
1598 free_old_stuff(jnl
);
1600 if ((bp
->b_flags
& B_LOCKED
) == 0) {
1601 panic("jnl: kill block: bp 0x%x not locked! jnl @ 0x%x\n", bp
, jnl
);
1604 // first check if it's already part of this transaction
1605 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
1607 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
1608 if (bp
== blhdr
->binfo
[i
].bp
) {
1609 bp
->b_flags
&= ~B_LOCKED
;
1611 // this undoes the vget() in journal_modify_block_end()
1614 // if the block has the DELWRI and CALL bits set, then
1615 // things are seriously weird. if it was part of another
1616 // transaction then journal_modify_block_start() should
1617 // have forced it to be written.
1619 if ((bp
->b_flags
& B_DELWRI
) && (bp
->b_flags
& B_CALL
)) {
1620 panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp
);
// num_killed accumulates bytes (b_bufsize), not a block count
1622 tr
->num_killed
+= bp
->b_bufsize
;
1625 if (bp
->b_flags
& B_BUSY
) {
1629 blhdr
->binfo
[i
].bp
= NULL
;
1630 blhdr
->binfo
[i
].bnum
= (off_t
)-1;
1635 if (i
< blhdr
->num_blocks
) {
// journal_binfo_cmp
//
// Comparison callback handed to qsort() by end_transaction() to order
// block_info entries.  Entries whose bp is NULL are special-cased
// first (the values returned for them are not visible here);
// otherwise the result is the difference of the two buffers' b_blkno.
1645 journal_binfo_cmp(void *a
, void *b
)
1647 block_info
*bi_a
= (struct block_info
*)a
,
1648 *bi_b
= (struct block_info
*)b
;
1651 if (bi_a
->bp
== NULL
) {
1654 if (bi_b
->bp
== NULL
) {
1658 // don't have to worry about negative block
1659 // numbers so this is ok to do.
1661 res
= (bi_a
->bp
->b_blkno
- bi_b
->bp
->b_blkno
);
// end_transaction
//
// Commits transaction "tr" to the journal.  A transaction with no
// modified blocks (total_bytes == blhdr_size) is handled up front, and
// a lightly filled one may be kept around for group commit.
// Otherwise the steps are: wait for journal space, claim the last
// slot of jnl->old_start[], fix up each block's physical block
// number, checksum and write every block_list_header through
// write_journal_data(), advance jhdr->end and write the journal
// header, then attach buffer_flushed_callback to each buffer and free
// the block lists.
//
// "force_it" is passed as 1 by journal_flush() ("force it to get
// flushed") and as 0 by journal_end_transaction().
1668 end_transaction(transaction
*tr
, int force_it
)
1672 journal
*jnl
= tr
->jnl
;
1674 block_list_header
*blhdr
=NULL
, *next
=NULL
;
1677 panic("jnl: jnl @ 0x%x already has cur_tr 0x%x, new tr: 0x%x\n",
1678 jnl
, jnl
->cur_tr
, tr
);
1681 // if there weren't any modified blocks in the transaction
1682 // just save off the transaction pointer and return.
1683 if (tr
->total_bytes
== jnl
->jhdr
->blhdr_size
) {
1688 // if our transaction buffer isn't very full, just hang
1689 // on to it and don't actually flush anything. this is
1690 // what is known as "group commit". we will flush the
1691 // transaction buffer if it's full or if we have more than
1692 // one of them so we don't start hogging too much memory.
1695 && (jnl
->flags
& JOURNAL_NO_GROUP_COMMIT
) == 0
1696 && tr
->num_blhdrs
< 3
1697 && (tr
->total_bytes
<= ((tr
->tbuffer_size
*tr
->num_blhdrs
) - tr
->tbuffer_size
/8))) {
1704 // if we're here we're going to flush the transaction buffer to disk.
1705 // make sure there is room in the journal first.
// NOTE(review): the return value of check_free_space() is not examined
// on this line -- confirm that is intentional.
1706 check_free_space(jnl
, tr
->total_bytes
);
1708 // range check the end index
1709 if (jnl
->jhdr
->end
<= 0 || jnl
->jhdr
->end
> jnl
->jhdr
->size
) {
1710 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
1711 jnl
->jhdr
->end
, jnl
->jhdr
->size
);
1714 // this transaction starts where the current journal ends
1715 tr
->journal_start
= jnl
->jhdr
->end
;
1716 end
= jnl
->jhdr
->end
;
1719 // if the first entry in old_start[] isn't free yet, loop calling the
1720 // file system flush routine until it is (or we panic).
1723 simple_lock(&jnl
->old_start_lock
);
1724 while ((jnl
->old_start
[0] & 0x8000000000000000LL
) != 0) {
1726 simple_unlock(&jnl
->old_start_lock
);
1729 jnl
->flush(jnl
->flush_arg
);
1732 // yield the cpu so others can get in to clear the lock bit
1733 (void)tsleep((void *)jnl
, PRIBIO
, "jnl-old-start-sleep", 1);
1735 simple_lock(&jnl
->old_start_lock
);
1738 panic("jnl: transaction that started at 0x%llx is not completing! jnl 0x%x\n",
1739 jnl
->old_start
[0] & (~0x8000000000000000LL
), jnl
);
1744 // slide everyone else down and put our latest guy in the last
1745 // entry in the old_start array
1747 memcpy(&jnl
->old_start
[0], &jnl
->old_start
[1], sizeof(jnl
->old_start
)-sizeof(jnl
->old_start
[0]));
// the top bit tags the new entry; check_free_space() will not consume
// an entry while that bit is set
1748 jnl
->old_start
[sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]) - 1] = tr
->journal_start
| 0x8000000000000000LL
;
1750 simple_unlock(&jnl
->old_start_lock
);
1753 // for each block, make sure that the physical block # is set
1754 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=next
) {
1756 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
1758 bp
= blhdr
->binfo
[i
].bp
;
1759 if (bp
== NULL
) { // only true if a block was "killed"
1760 if (blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
1761 panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ 0x%x, tr 0x%x)\n",
1762 blhdr
->binfo
[i
].bnum
, jnl
, tr
);
1767 if (bp
->b_vp
== NULL
&& bp
->b_lblkno
== bp
->b_blkno
) {
1768 panic("jnl: end_tr: DANGER! bp @ 0x%x w/null vp and l/blkno = %d/%d\n",
1769 bp
, bp
->b_lblkno
, bp
->b_blkno
);
1772 // if the lblkno is the same as blkno and this bp isn't
1773 // associated with the underlying file system device then
1774 // we need to call bmap() to get the actual physical block.
1776 if ((bp
->b_lblkno
== bp
->b_blkno
) && (bp
->b_vp
!= jnl
->fsdev
)) {
1777 if (VOP_BMAP(bp
->b_vp
, bp
->b_lblkno
, NULL
, &bp
->b_blkno
, NULL
) != 0) {
1778 printf("jnl: end_tr: can't bmap the bp @ 0x%x, jnl 0x%x\n", bp
, jnl
);
1783 // update this so we write out the correct physical block number!
1784 blhdr
->binfo
[i
].bnum
= bp
->b_blkno
;
1787 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
// second pass: checksum each block_list_header and append it (header
// plus copied block data, bytes_used in total) to the journal
1790 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
1792 amt
= blhdr
->bytes_used
;
// the checksum field is zeroed first so it does not feed into its own
// checksum
1794 blhdr
->checksum
= 0;
1795 blhdr
->checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
1797 ret
= write_journal_data(jnl
, &end
, blhdr
, amt
);
1799 printf("jnl: end_transaction: only wrote %d of %d bytes to the journal!\n",
1806 jnl
->jhdr
->end
= end
; // update where the journal now ends
1807 tr
->journal_end
= end
; // the transaction ends here too
1808 if (tr
->journal_start
== 0 || tr
->journal_end
== 0) {
1809 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
1810 tr
->journal_start
, tr
->journal_end
);
1813 if (write_journal_header(jnl
) != 0) {
1818 // setup for looping through all the blhdr's. we null out the
1819 // tbuffer and blhdr fields so that they're not used any more.
1825 // the buffer_flushed_callback will only be called for the
1826 // real blocks that get flushed so we have to account for
1827 // the block_list_headers here.
1829 tr
->num_flushed
= tr
->num_blhdrs
* jnl
->jhdr
->blhdr_size
;
1831 // for each block, set the iodone callback and unlock it
1832 for(; blhdr
; blhdr
=next
) {
1834 // we can re-order the buf ptrs because everything is written out already
1835 qsort(&blhdr
->binfo
[1], blhdr
->num_blocks
-1, sizeof(block_info
), journal_binfo_cmp
);
1837 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
1838 if (blhdr
->binfo
[i
].bp
== NULL
) {
// look the buffer up again through meta_bread(); the result is
// expected to be the very same bp recorded in binfo[i]
1842 ret
= meta_bread(blhdr
->binfo
[i
].bp
->b_vp
,
1843 (daddr_t
)blhdr
->binfo
[i
].bp
->b_lblkno
,
1844 blhdr
->binfo
[i
].bp
->b_bufsize
,
1847 if (ret
== 0 && bp
!= NULL
) {
1848 struct vnode
*save_vp
;
1850 if (bp
!= blhdr
->binfo
[i
].bp
) {
1851 panic("jnl: end_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
1852 bp
, blhdr
->binfo
[i
].bp
, jnl
);
1855 if ((bp
->b_flags
& (B_LOCKED
|B_DELWRI
)) != (B_LOCKED
|B_DELWRI
)) {
1856 if (jnl
->flags
& JOURNAL_CLOSE_PENDING
) {
1860 panic("jnl: end_tr: !!!DANGER!!! bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp
, bp
->b_flags
);
1864 if (bp
->b_iodone
!= NULL
) {
1865 panic("jnl: bp @ 0x%x (blkno %d, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n",
1866 bp
, bp
->b_blkno
, bp
->b_vp
, bp
->b_iodone
, buffer_flushed_callback
);
// hand the buffer over: install the completion callback, remember the
// owning transaction, and clear B_LOCKED
1871 bp
->b_iodone
= buffer_flushed_callback
;
1872 bp
->b_transaction
= tr
;
1873 bp
->b_flags
|= B_CALL
;
1874 bp
->b_flags
&= ~(B_LOCKED
);
1876 // kicking off the write here helps performance
1878 // XXXdbg this is good for testing: bdwrite(bp);
1881 // this undoes the vget() in journal_modify_block_end()
1885 printf("jnl: end_transaction: could not find block %Ld vp 0x%x!\n",
1886 blhdr
->binfo
[i
].bnum
, blhdr
->binfo
[i
].bp
);
// fetch the link to the next header before this one is poisoned and freed
1893 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
1895 // we can free blhdr here since we won't need it any more
1896 blhdr
->binfo
[0].bnum
= 0xdeadc0de;
1897 kmem_free(kernel_map
, (vm_offset_t
)blhdr
, tr
->tbuffer_size
);
1900 //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
1901 // tr, tr->journal_start, tr->journal_end);
// NOTE(review): the lines below mark the journal invalid and abort the
// transaction; presumably this is the error exit reached when a
// journal write fails -- the label/return lines are not visible here.
1906 jnl
->flags
|= JOURNAL_INVALID
;
1907 abort_transaction(jnl
, tr
);
// abort_transaction
//
// Discards transaction "tr".  For every live entry in every block
// list the buffer is looked up again with meta_bread(), its B_LOCKED
// and B_DELWRI flags are cleared and B_INVAL is set so that it does
// not go to disk.  Each block list is then released with kmem_free()
// and finally the transaction structure itself is freed.
1912 abort_transaction(journal
*jnl
, transaction
*tr
)
1915 block_list_header
*blhdr
, *next
;
1918 // for each block list header, iterate over the blocks then
1919 // free up the memory associated with the block list.
1921 // for each block, clear the lock bit and release it.
1923 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=next
) {
1925 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
// a NULL bp marks an entry removed by journal_kill_block()
1926 if (blhdr
->binfo
[i
].bp
== NULL
) {
1930 ret
= meta_bread(blhdr
->binfo
[i
].bp
->b_vp
,
1931 (daddr_t
)blhdr
->binfo
[i
].bp
->b_lblkno
,
1932 blhdr
->binfo
[i
].bp
->b_bufsize
,
1936 if (bp
!= blhdr
->binfo
[i
].bp
) {
1937 panic("jnl: abort_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
1938 bp
, blhdr
->binfo
[i
].bp
, jnl
);
1941 // clear the locked bit and the delayed-write bit. we
1942 // don't want these blocks going to disk.
1943 bp
->b_flags
&= ~(B_LOCKED
|B_DELWRI
);
1944 bp
->b_flags
|= B_INVAL
;
1949 printf("jnl: abort_tr: could not find block %Ld vp 0x%x!\n",
1950 blhdr
->binfo
[i
].bnum
, blhdr
->binfo
[i
].bp
);
// fetch the link to the next header before this one is poisoned and freed
1957 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
1959 // we can free blhdr here since we won't need it any more
1960 blhdr
->binfo
[0].bnum
= 0xdeadc0de;
1961 kmem_free(kernel_map
, (vm_offset_t
)blhdr
, tr
->tbuffer_size
);
// scribble a recognizable value into the structure before freeing it
// (presumably to make use-after-free easier to spot)
1966 tr
->total_bytes
= 0xdbadc0de;
1967 FREE_ZONE(tr
, sizeof(transaction
), M_JNL_TR
);
// journal_end_transaction
//
// Ends the calling activation's (possibly nested) transaction on
// "jnl".  The caller must be the journal owner.  nested_count is
// decremented; a count still above zero is handled separately (that
// branch's body is not visible here -- presumably an early return)
// and a negative count panics.  At the outermost level, an invalid
// journal has its active_tr aborted, otherwise active_tr is handed to
// end_transaction(tr, 0).  Both paths signal jnl->jsem.
1972 journal_end_transaction(journal
*jnl
)
1979 if ((jnl
->flags
& JOURNAL_INVALID
) && jnl
->owner
== NULL
) {
1983 if (jnl
->owner
!= current_act()) {
1984 panic("jnl: end_tr: I'm not the owner! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1985 jnl
, jnl
->owner
, current_act());
1988 free_old_stuff(jnl
);
1990 jnl
->nested_count
--;
1991 if (jnl
->nested_count
> 0) {
1993 } else if (jnl
->nested_count
< 0) {
1994 panic("jnl: jnl @ 0x%x has negative nested count (%d). bad boy.\n", jnl
, jnl
->nested_count
);
// journal went bad while we held it: throw the active transaction away
1997 if (jnl
->flags
& JOURNAL_INVALID
) {
1998 if (jnl
->active_tr
) {
2001 if (jnl
->cur_tr
!= NULL
) {
2002 panic("jnl: journal @ 0x%x has active tr (0x%x) and cur tr (0x%x)\n",
2003 jnl
, jnl
->active_tr
, jnl
->cur_tr
);
2006 tr
= jnl
->active_tr
;
2007 jnl
->active_tr
= NULL
;
2008 abort_transaction(jnl
, tr
);
2012 semaphore_signal(jnl
->jsem
);
2017 tr
= jnl
->active_tr
;
2018 CHECK_TRANSACTION(tr
);
2020 // clear this out here so that when check_free_space() calls
2021 // the FS flush function, we don't panic in journal_flush()
2022 // if the FS were to call that. note: check_free_space() is
2023 // called from end_transaction().
2025 jnl
->active_tr
= NULL
;
2026 ret
= end_transaction(tr
, 0);
2029 semaphore_signal(jnl
->jsem
);
// journal_flush
//
// Forces any buffered transaction (jnl->cur_tr) out through
// end_transaction(tr, 1), provided no transaction is currently
// active.  A caller that is not the journal owner first waits on
// jnl->jsem; the semaphore is signalled again near the end
// (need_signal presumably records whether it was taken here --
// the assignments are not visible).
2036 journal_flush(journal
*jnl
)
2038 int need_signal
= 0;
2042 if (jnl
->flags
& JOURNAL_INVALID
) {
2046 if (jnl
->owner
!= current_act()) {
2049 while ((ret
= semaphore_wait(jnl
->jsem
)) == KERN_ABORTED
) {
2050 // just keep looping if we've been ^C'ed
2053 printf("jnl: flush: sem wait failed.\n");
2059 free_old_stuff(jnl
);
2061 // if we're not active, flush any buffered transactions
2062 if (jnl
->active_tr
== NULL
&& jnl
->cur_tr
) {
2063 transaction
*tr
= jnl
->cur_tr
;
2066 end_transaction(tr
, 1); // force it to get flushed
2070 semaphore_signal(jnl
->jsem
);
// journal_active
//
// Reports whether "jnl" currently has an active transaction: yields 1
// when jnl->active_tr is non-NULL and 0 otherwise.  The
// JOURNAL_INVALID flag is tested first; what that branch returns is
// not visible here.
2077 journal_active(journal
*jnl
)
2079 if (jnl
->flags
& JOURNAL_INVALID
) {
2083 return (jnl
->active_tr
== NULL
) ? 0 : 1;