2 * Copyright (c) 1995-2002 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
//
// This file implements a simple write-ahead journaling layer.
// In theory any file system can make use of it by calling these
// functions when the fs wants to modify meta-data blocks.  See
// vfs_journal.h for a more detailed description of the api and
// data structures.
//
// Dominic Giampaolo (dbg@apple.com)
//
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
41 #include <sys/mount.h>
42 #include <sys/namei.h>
43 #include <sys/vnode.h>
44 #include <sys/ioctl.h>
47 #include <sys/malloc.h>
48 #include <sys/vnode.h>
49 #include <kern/thread_act.h>
51 #include <miscfs/specfs/specdev.h>
53 extern task_t kernel_task
;
65 #include <sys/types.h>
70 #include "vfs_journal.h"
73 // number of bytes to checksum in a block_list_header
74 // NOTE: this should be enough to clear out the header
75 // fields as well as the first entry of binfo[]
76 #define BLHDR_CHECKSUM_SIZE 32
80 static int end_transaction(transaction
*tr
, int force_it
);
81 static void abort_transaction(journal
*jnl
, transaction
*tr
);
82 static void dump_journal(journal
*jnl
);
86 // 3105942 - Coalesce writes to the same block on journal replay
89 typedef struct bucket
{
95 #define STARTING_BUCKETS 256
97 static int add_block(journal
*jnl
, struct bucket
**buf_ptr
, off_t block_num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
);
98 static int grow_table(struct bucket
**buf_ptr
, int num_buckets
, int new_size
);
99 static int lookup_bucket(struct bucket
**buf_ptr
, off_t block_num
, int num_full
);
100 static int do_overlap(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t block_num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
);
101 static int insert_block(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
, int overwriting
);
//
// Sanity-check a journal pointer and the header it refers to; panics on
// a null pointer, a null device vnode, a bad magic number, or start/end/
// size values that are non-positive, beyond the journal size, or larger
// than 1GB.
//
// NOTE(review): the "do {", null test, closing-brace and "while(0)" lines
// were absent from the damaged listing and are restored here -- confirm.
//
#define CHECK_JOURNAL(jnl) \
    do { \
    if ((jnl) == NULL) {\
        panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
    }\
    if ((jnl)->jdev == NULL) { \
        panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
    } \
    if ((jnl)->fsdev == NULL) { \
        panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
    } \
    if ((jnl)->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
        panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
        __FILE__, __LINE__, (jnl)->jhdr->magic, JOURNAL_HEADER_MAGIC);\
    }\
    if (   (jnl)->jhdr->start <= 0 \
        || (jnl)->jhdr->start > (jnl)->jhdr->size\
        || (jnl)->jhdr->start > 1024*1024*1024) {\
        panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
        __FILE__, __LINE__, (jnl)->jhdr->start, (jnl)->jhdr->size);\
    }\
    if (   (jnl)->jhdr->end <= 0 \
        || (jnl)->jhdr->end > (jnl)->jhdr->size\
        || (jnl)->jhdr->end > 1024*1024*1024) {\
        panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
        __FILE__, __LINE__, (jnl)->jhdr->end, (jnl)->jhdr->size);\
    }\
    if ((jnl)->jhdr->size > 1024*1024*1024) {\
        panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
        __FILE__, __LINE__, (jnl)->jhdr->size);\
    } \
    } while(0)
//
// Sanity-check a transaction; panics on a null pointer, a null journal
// back-pointer, a blhdr that is not the start of tbuffer, a negative
// byte count, journal start/end values outside 0..1GB, or an implausible
// max_blocks in the block list header.
//
// NOTE(review): the "do {", null test, closing-brace and "while(0)" lines
// were absent from the damaged listing and are restored here -- confirm.
//
#define CHECK_TRANSACTION(tr) \
    do {\
    if ((tr) == NULL) {\
        panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
    }\
    if ((tr)->jnl == NULL) {\
        panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
    }\
    if ((tr)->blhdr != (block_list_header *)(tr)->tbuffer) {\
        panic("%s:%d: blhdr (0x%x) != tbuffer (0x%x)\n", __FILE__, __LINE__, (tr)->blhdr, (tr)->tbuffer);\
    }\
    if ((tr)->total_bytes < 0) {\
        panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, (tr)->total_bytes);\
    }\
    if ((tr)->journal_start < 0 || (tr)->journal_start > 1024*1024*1024) {\
        panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, (tr)->journal_start);\
    }\
    if ((tr)->journal_end < 0 || (tr)->journal_end > 1024*1024*1024) {\
        panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, (tr)->journal_end);\
    }\
    if ((tr)->blhdr && ((tr)->blhdr->max_blocks <= 0 || (tr)->blhdr->max_blocks > ((tr)->jnl->jhdr->size/(tr)->jnl->jhdr->jhdr_size))) {\
        panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, (tr)->blhdr->max_blocks);\
    }\
    } while(0)
//
// this isn't a great checksum routine but it will do for now.
// we use it to checksum the journal header and the block list
// headers that are at the start of each transaction.
//
// ptr/len describe the bytes to fold; each byte is mixed in with
// (sum << 8) ^ (sum + byte) and the complement of the final sum is
// returned.
//
// NOTE(review): the return-type, accumulator declaration and return
// statement were absent from the damaged listing; the complemented
// result is restored from the upstream file -- confirm.
//
static int
calc_checksum(char *ptr, int len)
{
    // accumulate unsigned so the shift and add wrap instead of
    // overflowing a signed int (which is undefined behaviour)
    unsigned int cksum = 0;
    int          i;

    // this is a lame checksum but for now it'll do
    for (i = 0; i < len; i++, ptr++) {
        cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
    }

    return (int)(~cksum);
}
182 #define JNL_WRITE 0x0001
183 #define JNL_READ 0x0002
184 #define JNL_HEADER 0x8000
187 // This function sets up a fake buf and passes it directly to the
188 // journal device strategy routine (so that it won't get cached in
191 // It also handles range checking the i/o so that we don't write
192 // outside the journal boundaries and it will wrap the i/o back
193 // to the beginning if necessary (skipping over the journal header)
196 do_journal_io(journal
*jnl
, off_t
*offset
, void *data
, size_t len
, int direction
)
198 int err
, io_sz
=0, curlen
=len
;
200 int max_iosize
=0, max_vectors
;
202 if (*offset
< 0 || *offset
> jnl
->jhdr
->size
) {
203 panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset
, jnl
->jhdr
->size
);
207 bp
= alloc_io_buf(jnl
->jdev
, 1);
209 if (direction
& JNL_WRITE
) {
210 bp
->b_flags
|= 0; // don't have to set any flags (was: B_WRITEINPROG)
211 jnl
->jdev
->v_numoutput
++;
212 vfs_io_attributes(jnl
->jdev
, B_WRITE
, &max_iosize
, &max_vectors
);
213 } else if (direction
& JNL_READ
) {
214 bp
->b_flags
|= B_READ
;
215 vfs_io_attributes(jnl
->jdev
, B_READ
, &max_iosize
, &max_vectors
);
218 if (max_iosize
== 0) {
219 max_iosize
= 128 * 1024;
222 if (*offset
+ (off_t
)curlen
> jnl
->jhdr
->size
&& *offset
!= 0 && jnl
->jhdr
->size
!= 0) {
223 if (*offset
== jnl
->jhdr
->size
) {
224 *offset
= jnl
->jhdr
->jhdr_size
;
226 curlen
= (off_t
)jnl
->jhdr
->size
- *offset
;
230 if (curlen
> max_iosize
) {
235 panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %d\n", curlen
, *offset
, len
);
238 if (*offset
== 0 && (direction
& JNL_HEADER
) == 0) {
239 panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen
, data
);
242 bp
->b_bufsize
= curlen
;
243 bp
->b_bcount
= curlen
;
245 bp
->b_blkno
= (daddr_t
) ((jnl
->jdev_offset
+ *offset
) / (off_t
)jnl
->jhdr
->jhdr_size
);
246 bp
->b_lblkno
= (daddr_t
) ((jnl
->jdev_offset
+ *offset
) / (off_t
)jnl
->jhdr
->jhdr_size
);
248 err
= VOP_STRATEGY(bp
);
254 bp
->b_bufsize
= bp
->b_bcount
= 0;
255 bp
->b_blkno
= bp
->b_lblkno
= -1;
260 printf("jnl: do_jnl_io: strategy err 0x%x\n", err
);
267 // handle wrap-around
268 data
= (char *)data
+ curlen
;
269 curlen
= len
- io_sz
;
270 if (*offset
>= jnl
->jhdr
->size
) {
271 *offset
= jnl
->jhdr
->jhdr_size
;
280 read_journal_data(journal
*jnl
, off_t
*offset
, void *data
, size_t len
)
282 return do_journal_io(jnl
, offset
, data
, len
, JNL_READ
);
286 write_journal_data(journal
*jnl
, off_t
*offset
, void *data
, size_t len
)
288 return do_journal_io(jnl
, offset
, data
, len
, JNL_WRITE
);
293 read_journal_header(journal
*jnl
, void *data
, size_t len
)
295 off_t hdr_offset
= 0;
297 return do_journal_io(jnl
, &hdr_offset
, data
, len
, JNL_READ
|JNL_HEADER
);
301 write_journal_header(journal
*jnl
)
303 static int num_err_prints
= 0;
305 off_t jhdr_offset
= 0;
308 // XXXdbg note: this ioctl doesn't seem to do anything on firewire disks.
310 ret
= VOP_IOCTL(jnl
->jdev
, DKIOCSYNCHRONIZECACHE
, NULL
, FWRITE
, NOCRED
, current_proc());
313 // Only print this error if it's a different error than the
314 // previous one, or if it's the first time for this device
315 // or if the total number of printfs is less than 25. We
316 // allow for up to 25 printfs to insure that some make it
317 // into the on-disk syslog. Otherwise if we only printed
318 // one, it's possible it would never make it to the syslog
319 // for the root volume and that makes debugging hard.
321 if ( ret
!= jnl
->last_flush_err
322 || (jnl
->flags
& JOURNAL_FLUSHCACHE_ERR
) == 0
323 || num_err_prints
++ < 25) {
325 printf("jnl: flushing fs disk buffer returned 0x%x\n", ret
);
327 jnl
->flags
|= JOURNAL_FLUSHCACHE_ERR
;
328 jnl
->last_flush_err
= ret
;
333 jnl
->jhdr
->checksum
= 0;
334 jnl
->jhdr
->checksum
= calc_checksum((char *)jnl
->jhdr
, sizeof(struct journal_header
));
335 if (do_journal_io(jnl
, &jhdr_offset
, jnl
->header_buf
, jnl
->jhdr
->jhdr_size
, JNL_WRITE
|JNL_HEADER
) != jnl
->jhdr
->jhdr_size
) {
336 printf("jnl: write_journal_header: error writing the journal header!\n");
337 jnl
->flags
|= JOURNAL_INVALID
;
341 // Have to flush after writing the journal header so that
342 // a future transaction doesn't sneak out to disk before
343 // the header does and thus overwrite data that the old
344 // journal header refers to. Saw this exact case happen
345 // on an IDE bus analyzer with Larry Barras so while it
346 // may seem obscure, it's not.
348 VOP_IOCTL(jnl
->jdev
, DKIOCSYNCHRONIZECACHE
, NULL
, FWRITE
, NOCRED
, current_proc());
356 // this is a work function used to free up transactions that
357 // completed. they can't be free'd from buffer_flushed_callback
358 // because it is called from deep with the disk driver stack
359 // and thus can't do something that would potentially cause
360 // paging. it gets called by each of the journal api entry
361 // points so stuff shouldn't hang around for too long.
364 free_old_stuff(journal
*jnl
)
366 transaction
*tr
, *next
;
368 for(tr
=jnl
->tr_freeme
; tr
; tr
=next
) {
370 FREE_ZONE(tr
, sizeof(transaction
), M_JNL_TR
);
373 jnl
->tr_freeme
= NULL
;
379 // This is our callback that lets us know when a buffer has been
380 // flushed to disk. It's called from deep within the driver stack
381 // and thus is quite limited in what it can do. Notably, it can
382 // not initiate any new i/o's or allocate/free memory.
385 buffer_flushed_callback(struct buf
*bp
)
389 transaction
*ctr
, *prev
=NULL
, *next
;
393 //printf("jnl: buf flush: bp @ 0x%x l/blkno %d/%d vp 0x%x tr @ 0x%x\n",
394 // bp, bp->b_lblkno, bp->b_blkno, bp->b_vp, bp->b_transaction);
396 // snarf out the bits we want
397 bufsize
= bp
->b_bufsize
;
398 tr
= bp
->b_transaction
;
400 bp
->b_iodone
= NULL
; // don't call us for this guy again
401 bp
->b_transaction
= NULL
;
404 // This is what biodone() would do if it didn't call us.
405 // NOTE: THIS CODE *HAS* TO BE HERE!
407 if (ISSET(bp
->b_flags
, B_ASYNC
)) { /* if async, release it */
409 } else { /* or just wakeup the buffer */
410 CLR(bp
->b_flags
, B_WANTED
);
414 // NOTE: from here on out we do *NOT* touch bp anymore.
417 // then we've already seen it
422 CHECK_TRANSACTION(tr
);
425 if (jnl
->flags
& JOURNAL_INVALID
) {
431 // update the number of blocks that have been flushed.
432 // this buf may represent more than one block so take
433 // that into account.
434 tr
->num_flushed
+= bufsize
;
437 // if this transaction isn't done yet, just return as
438 // there is nothing to do.
439 if ((tr
->num_flushed
+ tr
->num_killed
) < tr
->total_bytes
) {
443 //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
444 // tr, tr->journal_start, tr->journal_end, jnl);
446 // find this entry in the old_start[] index and mark it completed
447 simple_lock(&jnl
->old_start_lock
);
448 for(i
=0; i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]); i
++) {
450 if ((jnl
->old_start
[i
] & ~(0x8000000000000000LL
)) == tr
->journal_start
) {
451 jnl
->old_start
[i
] &= ~(0x8000000000000000LL
);
455 if (i
>= sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0])) {
456 panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr 0x%x, jnl 0x%x)\n",
457 tr
->journal_start
, tr
, jnl
);
459 simple_unlock(&jnl
->old_start_lock
);
462 // if we are here then we need to update the journal header
463 // to reflect that this transaction is complete
464 if (tr
->journal_start
== jnl
->active_start
) {
465 jnl
->active_start
= tr
->journal_end
;
466 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
469 // go through the completed_trs list and try to coalesce
470 // entries, restarting back at the beginning if we have to.
471 for(ctr
=jnl
->completed_trs
; ctr
; prev
=ctr
, ctr
=next
) {
472 if (ctr
->journal_start
== jnl
->active_start
) {
473 jnl
->active_start
= ctr
->journal_end
;
475 prev
->next
= ctr
->next
;
477 if (ctr
== jnl
->completed_trs
) {
478 jnl
->completed_trs
= ctr
->next
;
481 next
= jnl
->completed_trs
; // this starts us over again
482 ctr
->next
= jnl
->tr_freeme
;
483 jnl
->tr_freeme
= ctr
;
485 } else if (tr
->journal_end
== ctr
->journal_start
) {
486 ctr
->journal_start
= tr
->journal_start
;
487 next
= jnl
->completed_trs
; // this starts us over again
489 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
490 } else if (tr
->journal_start
== ctr
->journal_end
) {
491 ctr
->journal_end
= tr
->journal_end
;
493 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
499 // at this point no one should be using this guy anymore
500 tr
->total_bytes
= 0xfbadc0de;
502 // if this is true then we didn't merge with anyone
503 // so link ourselves in at the head of the completed
505 if (tr
->journal_start
!= 0) {
506 // put this entry into the correct sorted place
507 // in the list instead of just at the head.
511 for(ctr
=jnl
->completed_trs
; ctr
&& tr
->journal_start
> ctr
->journal_start
; prev
=ctr
, ctr
=ctr
->next
) {
515 if (ctr
== NULL
&& prev
== NULL
) {
516 jnl
->completed_trs
= tr
;
518 } else if (ctr
== jnl
->completed_trs
) {
519 tr
->next
= jnl
->completed_trs
;
520 jnl
->completed_trs
= tr
;
522 tr
->next
= prev
->next
;
526 // if we're here this tr got merged with someone else so
527 // put it on the list to be free'd
528 tr
->next
= jnl
->tr_freeme
;
534 #include <libkern/OSByteOrder.h>
536 #define SWAP16(x) OSSwapInt16(x)
537 #define SWAP32(x) OSSwapInt32(x)
538 #define SWAP64(x) OSSwapInt64(x)
542 swap_journal_header(journal
*jnl
)
544 jnl
->jhdr
->magic
= SWAP32(jnl
->jhdr
->magic
);
545 jnl
->jhdr
->endian
= SWAP32(jnl
->jhdr
->endian
);
546 jnl
->jhdr
->start
= SWAP64(jnl
->jhdr
->start
);
547 jnl
->jhdr
->end
= SWAP64(jnl
->jhdr
->end
);
548 jnl
->jhdr
->size
= SWAP64(jnl
->jhdr
->size
);
549 jnl
->jhdr
->blhdr_size
= SWAP32(jnl
->jhdr
->blhdr_size
);
550 jnl
->jhdr
->checksum
= SWAP32(jnl
->jhdr
->checksum
);
551 jnl
->jhdr
->jhdr_size
= SWAP32(jnl
->jhdr
->jhdr_size
);
555 swap_block_list_header(journal
*jnl
, block_list_header
*blhdr
)
559 blhdr
->max_blocks
= SWAP16(blhdr
->max_blocks
);
560 blhdr
->num_blocks
= SWAP16(blhdr
->num_blocks
);
561 blhdr
->bytes_used
= SWAP32(blhdr
->bytes_used
);
562 blhdr
->checksum
= SWAP32(blhdr
->checksum
);
563 blhdr
->pad
= SWAP32(blhdr
->pad
);
565 if (blhdr
->num_blocks
* sizeof(blhdr
->binfo
[0]) > jnl
->jhdr
->blhdr_size
) {
566 printf("jnl: blhdr num blocks looks suspicious (%d). not swapping.\n", blhdr
->num_blocks
);
570 for(i
=0; i
< blhdr
->num_blocks
; i
++) {
571 blhdr
->binfo
[i
].bnum
= SWAP64(blhdr
->binfo
[i
].bnum
);
572 blhdr
->binfo
[i
].bsize
= SWAP32(blhdr
->binfo
[i
].bsize
);
573 blhdr
->binfo
[i
].bp
= (void *)SWAP32((int)blhdr
->binfo
[i
].bp
);
579 update_fs_block(journal
*jnl
, void *block_ptr
, off_t fs_block
, size_t bsize
)
582 struct buf
*oblock_bp
=NULL
;
584 // first read the block we want.
585 ret
= meta_bread(jnl
->fsdev
, (daddr_t
)fs_block
, bsize
, NOCRED
, &oblock_bp
);
587 printf("jnl: update_fs_block: error reading fs block # %lld! (ret %d)\n", fs_block
, ret
);
594 // let's try to be aggressive here and just re-write the block
595 oblock_bp
= getblk(jnl
->fsdev
, (daddr_t
)fs_block
, bsize
, 0, 0, BLK_META
);
596 if (oblock_bp
== NULL
) {
597 printf("jnl: update_fs_block: getblk() for %lld failed! failing update.\n", fs_block
);
602 // make sure it's the correct size.
603 if (oblock_bp
->b_bufsize
!= bsize
) {
608 // copy the journal data over top of it
609 memcpy(oblock_bp
->b_data
, block_ptr
, bsize
);
611 if ((ret
= VOP_BWRITE(oblock_bp
)) != 0) {
612 printf("jnl: update_fs_block: failed to update block %lld (ret %d)\n", fs_block
,ret
);
616 // and now invalidate it so that if someone else wants to read
617 // it in a different size they'll be able to do it.
618 ret
= meta_bread(jnl
->fsdev
, (daddr_t
)fs_block
, bsize
, NOCRED
, &oblock_bp
);
620 oblock_bp
->b_flags
|= B_INVAL
;
628 grow_table(struct bucket
**buf_ptr
, int num_buckets
, int new_size
)
630 struct bucket
*newBuf
;
631 int current_size
= num_buckets
, i
;
633 // return if newsize is less than the current size
634 if (new_size
< num_buckets
) {
638 if ((MALLOC(newBuf
, struct bucket
*, new_size
*sizeof(struct bucket
), M_TEMP
, M_WAITOK
)) == NULL
) {
639 printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
643 // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
645 // copy existing elements
646 bcopy(*buf_ptr
, newBuf
, num_buckets
*sizeof(struct bucket
));
648 // initialize the new ones
649 for(i
=num_buckets
; i
< new_size
; i
++) {
650 newBuf
[i
].block_num
= (off_t
)-1;
653 // free the old container
654 FREE(*buf_ptr
, M_TEMP
);
663 lookup_bucket(struct bucket
**buf_ptr
, off_t block_num
, int num_full
)
665 int lo
, hi
, index
, matches
, i
;
668 return 0; // table is empty, so insert at index=0
675 // perform binary search for block_num
677 int mid
= (hi
- lo
)/2 + lo
;
678 off_t this_num
= (*buf_ptr
)[mid
].block_num
;
680 if (block_num
== this_num
) {
685 if (block_num
< this_num
) {
690 if (block_num
> this_num
) {
696 // check if lo and hi converged on the match
697 if (block_num
== (*buf_ptr
)[hi
].block_num
) {
701 // if no existing entry found, find index for new one
703 index
= (block_num
< (*buf_ptr
)[hi
].block_num
) ? hi
: hi
+ 1;
705 // make sure that we return the right-most index in the case of multiple matches
708 while(i
< num_full
&& block_num
== (*buf_ptr
)[i
].block_num
) {
720 insert_block(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
, int overwriting
)
723 // grow the table if we're out of space
724 if (*num_full_ptr
>= *num_buckets_ptr
) {
725 int new_size
= *num_buckets_ptr
* 2;
726 int grow_size
= grow_table(buf_ptr
, *num_buckets_ptr
, new_size
);
728 if (grow_size
< new_size
) {
729 printf("jnl: add_block: grow_table returned an error!\n");
733 *num_buckets_ptr
= grow_size
; //update num_buckets to reflect the new size
736 // if we're not inserting at the end, we need to bcopy
737 if (blk_index
!= *num_full_ptr
) {
738 bcopy( (*buf_ptr
)+(blk_index
), (*buf_ptr
)+(blk_index
+1), (*num_full_ptr
-blk_index
)*sizeof(struct bucket
) );
741 (*num_full_ptr
)++; // increment only if we're not overwriting
744 // sanity check the values we're about to add
745 if (offset
>= jnl
->jhdr
->size
) {
746 offset
= jnl
->jhdr
->jhdr_size
+ (offset
- jnl
->jhdr
->size
);
749 panic("jnl: insert_block: bad size in insert_block (%d)\n", size
);
752 (*buf_ptr
)[blk_index
].block_num
= num
;
753 (*buf_ptr
)[blk_index
].block_size
= size
;
754 (*buf_ptr
)[blk_index
].jnl_offset
= offset
;
760 do_overlap(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t block_num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
)
762 int num_to_remove
, index
, i
, overwrite
, err
;
763 size_t jhdr_size
= jnl
->jhdr
->jhdr_size
, new_offset
;
764 off_t overlap
, block_start
, block_end
;
766 block_start
= block_num
*jhdr_size
;
767 block_end
= block_start
+ size
;
768 overwrite
= (block_num
== (*buf_ptr
)[blk_index
].block_num
&& size
>= (*buf_ptr
)[blk_index
].block_size
);
770 // first, eliminate any overlap with the previous entry
771 if (blk_index
!= 0 && !overwrite
) {
772 off_t prev_block_start
= (*buf_ptr
)[blk_index
-1].block_num
*jhdr_size
;
773 off_t prev_block_end
= prev_block_start
+ (*buf_ptr
)[blk_index
-1].block_size
;
774 overlap
= prev_block_end
- block_start
;
776 if (overlap
% jhdr_size
!= 0) {
777 panic("jnl: do_overlap: overlap with previous entry not a multiple of %d\n", jhdr_size
);
780 // if the previous entry completely overlaps this one, we need to break it into two pieces.
781 if (prev_block_end
> block_end
) {
782 off_t new_num
= block_end
/ jhdr_size
;
783 size_t new_size
= prev_block_end
- block_end
;
784 size_t new_offset
= (*buf_ptr
)[blk_index
-1].jnl_offset
+ (block_end
- prev_block_start
);
786 err
= insert_block(jnl
, buf_ptr
, blk_index
, new_num
, new_size
, new_offset
, num_buckets_ptr
, num_full_ptr
, 0);
788 panic("jnl: do_overlap: error inserting during pre-overlap\n");
792 // Regardless, we need to truncate the previous entry to the beginning of the overlap
793 (*buf_ptr
)[blk_index
-1].block_size
= block_start
- prev_block_start
;
797 // then, bail out fast if there's no overlap with the entries that follow
798 if (!overwrite
&& block_end
<= (*buf_ptr
)[blk_index
].block_num
*jhdr_size
) {
799 return 0; // no overlap, no overwrite
800 } else if (overwrite
&& (blk_index
+ 1 >= *num_full_ptr
|| block_end
<= (*buf_ptr
)[blk_index
+1].block_num
*jhdr_size
)) {
801 return 1; // simple overwrite
804 // Otherwise, find all cases of total and partial overlap. We use the special
805 // block_num of -2 to designate entries that are completely overlapped and must
806 // be eliminated. The block_num, size, and jnl_offset of partially overlapped
807 // entries must be adjusted to keep the array consistent.
810 while(index
< *num_full_ptr
&& block_end
> (*buf_ptr
)[index
].block_num
*jhdr_size
) {
811 if (block_end
>= ((*buf_ptr
)[index
].block_num
*jhdr_size
+ (*buf_ptr
)[index
].block_size
)) {
812 (*buf_ptr
)[index
].block_num
= -2; // mark this for deletion
815 overlap
= block_end
- (*buf_ptr
)[index
].block_num
*jhdr_size
;
817 if (overlap
% jhdr_size
!= 0) {
818 panic("jnl: do_overlap: overlap of %d is not multiple of %d\n", overlap
, jhdr_size
);
821 // if we partially overlap this entry, adjust its block number, jnl offset, and size
822 (*buf_ptr
)[index
].block_num
+= (overlap
/ jhdr_size
); // make sure overlap is multiple of jhdr_size, or round up
824 new_offset
= (*buf_ptr
)[index
].jnl_offset
+ overlap
; // check for wrap-around
825 if (new_offset
>= jnl
->jhdr
->size
) {
826 new_offset
= jhdr_size
+ (new_offset
- jnl
->jhdr
->size
);
828 (*buf_ptr
)[index
].jnl_offset
= new_offset
;
830 (*buf_ptr
)[index
].block_size
-= overlap
; // sanity check for negative value
831 if ((*buf_ptr
)[index
].block_size
<= 0) {
832 panic("jnl: do_overlap: after overlap, new block size is invalid (%d)\n", (*buf_ptr
)[index
].block_size
);
833 // return -1; // if above panic is removed, return -1 for error
842 // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
843 index
--; // start with the last index used within the above loop
844 while(index
>= blk_index
) {
845 if ((*buf_ptr
)[index
].block_num
== -2) {
846 if (index
== *num_full_ptr
-1) {
847 (*buf_ptr
)[index
].block_num
= -1; // it's the last item in the table... just mark as free
849 bcopy( (*buf_ptr
)+(index
+1), (*buf_ptr
)+(index
), (*num_full_ptr
- (index
+ 1)) * sizeof(struct bucket
) );
856 // eliminate any stale entries at the end of the table
857 for(i
=*num_full_ptr
; i
< (*num_full_ptr
+ num_to_remove
); i
++) {
858 (*buf_ptr
)[i
].block_num
= -1;
861 return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
864 // PR-3105942: Coalesce writes to the same block in journal replay
865 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
866 // to be replayed and the corresponding location in the journal which contains
867 // the most recent data for those blocks. The array is "played" once the all the
868 // blocks in the journal have been coalesced. The code for the case of conflicting/
869 // overlapping writes to a single block is the most dense. Because coalescing can
870 // disrupt the existing time-ordering of blocks in the journal playback, care
871 // is taken to catch any overlaps and keep the array consistent.
873 add_block(journal
*jnl
, struct bucket
**buf_ptr
, off_t block_num
, size_t size
, size_t offset
, int *num_buckets_ptr
, int *num_full_ptr
)
875 int blk_index
, overwriting
;
876 size_t jhdr_size
= jnl
->jhdr
->jhdr_size
;
878 // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
879 // inserted (or the index of the elem to overwrite).
880 blk_index
= lookup_bucket( buf_ptr
, block_num
, *num_full_ptr
);
882 // check if the index is within bounds (if we're adding this block to the end of
883 // the table, blk_index will be equal to num_full)
884 if (blk_index
< 0 || blk_index
> *num_full_ptr
) {
885 //printf("jnl: add_block: trouble adding block to co_buf\n");
887 } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
889 // Determine whether we're overwriting an existing entry by checking for overlap
890 overwriting
= do_overlap(jnl
, buf_ptr
, blk_index
, block_num
, size
, offset
, num_buckets_ptr
, num_full_ptr
);
891 if (overwriting
< 0) {
892 return -1; // if we got an error, pass it along
895 // returns the index, or -1 on error
896 blk_index
= insert_block(jnl
, buf_ptr
, blk_index
, block_num
, size
, offset
, num_buckets_ptr
, num_full_ptr
, overwriting
);
902 replay_journal(journal
*jnl
)
904 int i
, ret
, orig_checksum
, checksum
, max_bsize
;
905 struct buf
*oblock_bp
;
906 block_list_header
*blhdr
;
908 char *buf
, *block_ptr
=NULL
;
909 struct bucket
*co_buf
;
910 int num_buckets
= STARTING_BUCKETS
, num_full
;
912 // wrap the start ptr if it points to the very end of the journal
913 if (jnl
->jhdr
->start
== jnl
->jhdr
->size
) {
914 jnl
->jhdr
->start
= jnl
->jhdr
->jhdr_size
;
916 if (jnl
->jhdr
->end
== jnl
->jhdr
->size
) {
917 jnl
->jhdr
->end
= jnl
->jhdr
->jhdr_size
;
920 if (jnl
->jhdr
->start
== jnl
->jhdr
->end
) {
924 // allocate memory for the header_block. we'll read each blhdr into this
925 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&buf
, jnl
->jhdr
->blhdr_size
)) {
926 printf("jnl: replay_journal: no memory for block buffer! (%d bytes)\n",
927 jnl
->jhdr
->blhdr_size
);
931 // allocate memory for the coalesce buffer
932 if ((MALLOC(co_buf
, struct bucket
*, num_buckets
*sizeof(struct bucket
), M_TEMP
, M_WAITOK
)) == NULL
) {
933 printf("jnl: replay_journal: no memory for coalesce buffer!\n");
937 // initialize entries
938 for(i
=0; i
< num_buckets
; i
++) {
939 co_buf
[i
].block_num
= -1;
941 num_full
= 0; // empty at first
944 printf("jnl: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
945 jnl
->jhdr
->start
, jnl
->jhdr
->end
, jnl
->jdev_offset
);
947 while(jnl
->jhdr
->start
!= jnl
->jhdr
->end
) {
948 offset
= jnl
->jhdr
->start
;
949 ret
= read_journal_data(jnl
, &offset
, buf
, jnl
->jhdr
->blhdr_size
);
950 if (ret
!= jnl
->jhdr
->blhdr_size
) {
951 printf("jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset
);
955 blhdr
= (block_list_header
*)buf
;
957 orig_checksum
= blhdr
->checksum
;
959 if (jnl
->flags
& JOURNAL_NEED_SWAP
) {
960 // calculate the checksum based on the unswapped data
961 // because it is done byte-at-a-time.
962 orig_checksum
= SWAP32(orig_checksum
);
963 checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
964 swap_block_list_header(jnl
, blhdr
);
966 checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
968 if (checksum
!= orig_checksum
) {
969 printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
970 offset
, orig_checksum
, checksum
);
973 if ( blhdr
->max_blocks
<= 0 || blhdr
->max_blocks
> 2048
974 || blhdr
->num_blocks
<= 0 || blhdr
->num_blocks
> blhdr
->max_blocks
) {
975 printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
976 blhdr
->max_blocks
, blhdr
->num_blocks
);
980 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
981 if (blhdr
->binfo
[i
].bnum
< 0 && blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
982 printf("jnl: replay_journal: bogus block number 0x%llx\n", blhdr
->binfo
[i
].bnum
);
987 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
988 // blhdr->num_blocks-1, jnl->jhdr->start);
989 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
993 size
= blhdr
->binfo
[i
].bsize
;
994 number
= blhdr
->binfo
[i
].bnum
;
996 // don't add "killed" blocks
997 if (number
== (off_t
)-1) {
998 //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1000 // add this bucket to co_buf, coalescing where possible
1001 // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1002 ret_val
= add_block(jnl
, &co_buf
, number
, size
, (size_t) offset
, &num_buckets
, &num_full
);
1004 if (ret_val
== -1) {
1005 printf("jnl: replay_journal: trouble adding block to co_buf\n");
1007 } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1013 // check if the last block added puts us off the end of the jnl.
1014 // if so, we need to wrap to the beginning and take any remainder
1017 if (offset
>= jnl
->jhdr
->size
) {
1018 offset
= jnl
->jhdr
->jhdr_size
+ (offset
- jnl
->jhdr
->size
);
1023 jnl
->jhdr
->start
+= blhdr
->bytes_used
;
1024 if (jnl
->jhdr
->start
>= jnl
->jhdr
->size
) {
1025 // wrap around and skip the journal header block
1026 jnl
->jhdr
->start
= (jnl
->jhdr
->start
% jnl
->jhdr
->size
) + jnl
->jhdr
->jhdr_size
;
1031 //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1034 * make sure it's at least one page in size, so
1035 * start max_bsize at PAGE_SIZE
1037 for (i
= 0, max_bsize
= PAGE_SIZE
; i
< num_full
; i
++) {
1039 if (co_buf
[i
].block_num
== (off_t
)-1)
1042 if (co_buf
[i
].block_size
> max_bsize
)
1043 max_bsize
= co_buf
[i
].block_size
;
1046 * round max_bsize up to the nearest PAGE_SIZE multiple
1048 if (max_bsize
& (PAGE_SIZE
- 1)) {
1049 max_bsize
= (max_bsize
+ PAGE_SIZE
) & ~(PAGE_SIZE
- 1);
1052 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&block_ptr
, max_bsize
)) {
1056 // Replay the coalesced entries in the co-buf
1057 for(i
=0; i
< num_full
; i
++) {
1058 size_t size
= co_buf
[i
].block_size
;
1059 off_t jnl_offset
= (off_t
) co_buf
[i
].jnl_offset
;
1060 off_t number
= co_buf
[i
].block_num
;
1063 // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1064 // co_buf[i].block_size, co_buf[i].jnl_offset);
1066 if (number
== (off_t
)-1) {
1067 // printf("jnl: replay_journal: skipping killed fs block\n");
1070 // do journal read, and set the phys. block
1071 ret
= read_journal_data(jnl
, &jnl_offset
, block_ptr
, size
);
1073 printf("jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset
);
1077 if (update_fs_block(jnl
, block_ptr
, number
, size
) != 0) {
1084 // done replaying; update jnl header
1085 if (write_journal_header(jnl
) != 0) {
1090 kmem_free(kernel_map
, (vm_offset_t
)block_ptr
, max_bsize
);
1093 // free the coalesce buffer
1094 FREE(co_buf
, M_TEMP
);
1097 kmem_free(kernel_map
, (vm_offset_t
)buf
, jnl
->jhdr
->blhdr_size
);
1102 kmem_free(kernel_map
, (vm_offset_t
)block_ptr
, max_bsize
);
1105 FREE(co_buf
, M_TEMP
);
1107 kmem_free(kernel_map
, (vm_offset_t
)buf
, jnl
->jhdr
->blhdr_size
);
// Base size, in bytes, of a transaction buffer (tbuffer). size_up_tbuffer()
// multiplies this by 1, 2, 3 or 4 depending on mem_size the first time it runs.
1113 #define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024)
1114 //#define DEFAULT_TRANSACTION_BUFFER_SIZE (256*1024) // better performance but uses more mem
// Upper bound, in bytes, that size_up_tbuffer() applies to jnl->tbuffer_size.
1115 #define MAX_TRANSACTION_BUFFER_SIZE (512*1024)
1117 // XXXdbg - so I can change it in the debugger
// Default tbuffer size used when a caller passes tbuffer_size == 0.
// A value of 0 here means "not chosen yet": size_up_tbuffer() fills it in.
1118 int def_tbuffer_size
= 0;
1122 // This function sets the size of the tbuffer and the
1123 // size of the blhdr. It assumes that jnl->jhdr->size
1124 // and jnl->jhdr->jhdr_size are already valid.
//
// jnl          - journal whose tbuffer_size and jhdr->blhdr_size get set
// tbuffer_size - requested tbuffer size in bytes; 0 selects def_tbuffer_size
// phys_blksz   - block size used as the floor and rounding unit for blhdr_size
//
// Side effect: initializes the global def_tbuffer_size on first use.
1127 size_up_tbuffer(journal
*jnl
, int tbuffer_size
, int phys_blksz
)
1130 // one-time initialization based on how much memory
1131 // there is in the machine.
1133 if (def_tbuffer_size
== 0) {
1134 if (mem_size
< (256*1024*1024)) {
1135 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
;
1136 } else if (mem_size
< (512*1024*1024)) {
1137 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 2;
1138 } else if (mem_size
< (1024*1024*1024)) {
1139 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 3;
1140 } else if (mem_size
>= (1024*1024*1024)) {
1141 def_tbuffer_size
= DEFAULT_TRANSACTION_BUFFER_SIZE
* 4;
1145 // size up the transaction buffer... can't be larger than the number
1146 // of blocks that can fit in a block_list_header block.
1147 if (tbuffer_size
== 0) {
1148 jnl
->tbuffer_size
= def_tbuffer_size
;
1150 // make sure that the specified tbuffer_size isn't too small
// NOTE(review): jhdr->blhdr_size is read here but only assigned further
// down in this function, so this floor depends on whatever value the
// caller left in the header -- confirm that is intended.
1151 if (tbuffer_size
< jnl
->jhdr
->blhdr_size
* 2) {
1152 tbuffer_size
= jnl
->jhdr
->blhdr_size
* 2;
1154 // and make sure it's an even multiple of the block size
// (the remainder is subtracted, i.e. the size is rounded down)
1155 if ((tbuffer_size
% jnl
->jhdr
->jhdr_size
) != 0) {
1156 tbuffer_size
-= (tbuffer_size
% jnl
->jhdr
->jhdr_size
);
1159 jnl
->tbuffer_size
= tbuffer_size
;
// never let one tbuffer exceed half of the journal
1162 if (jnl
->tbuffer_size
> (jnl
->jhdr
->size
/ 2)) {
1163 jnl
->tbuffer_size
= (jnl
->jhdr
->size
/ 2);
// and never let it exceed the global maximum
1166 if (jnl
->tbuffer_size
> MAX_TRANSACTION_BUFFER_SIZE
) {
1167 jnl
->tbuffer_size
= MAX_TRANSACTION_BUFFER_SIZE
;
// one block_info slot for every jhdr_size-sized block that fits in the tbuffer
1170 jnl
->jhdr
->blhdr_size
= (jnl
->tbuffer_size
/ jnl
->jhdr
->jhdr_size
) * sizeof(block_info
);
1171 if (jnl
->jhdr
->blhdr_size
< phys_blksz
) {
1172 jnl
->jhdr
->blhdr_size
= phys_blksz
;
1173 } else if ((jnl
->jhdr
->blhdr_size
% phys_blksz
) != 0) {
1174 // have to round up so we're an even multiple of the physical block size
// NOTE(review): the mask form below only rounds correctly when phys_blksz
// is a power of two -- confirm callers guarantee that.
1175 jnl
->jhdr
->blhdr_size
= (jnl
->jhdr
->blhdr_size
+ (phys_blksz
- 1)) & ~(phys_blksz
- 1);
// journal_create: build a brand-new journal on the device behind jvp.
//
// Queries the device block size with DKIOCGETBLOCKSIZE, validates it against
// min_fs_blksz and journal_size, allocates a zeroed struct journal and a
// phys_blksz-sized header buffer, fills in a fresh journal header (start and
// end both at phys_blksz, i.e. an empty journal), sizes the transaction
// buffer via size_up_tbuffer(), initializes the journal lock and writes the
// header with write_journal_header().
//
// jvp          - vnode the block-size ioctl is issued on
// min_fs_blksz - the physical block size may not be larger than this
// tbuffer_size - passed through to size_up_tbuffer()
// flush        - callback taking one void * argument
1182 journal_create(struct vnode
*jvp
,
1186 size_t min_fs_blksz
,
1188 int32_t tbuffer_size
,
1189 void (*flush
)(void *arg
),
1193 int ret
, phys_blksz
;
1195 /* Get the real physical block size. */
1196 if (VOP_IOCTL(jvp
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
, 0, FSCRED
, NULL
)) {
// the device block size must not exceed the smallest file system block
1200 if (phys_blksz
> min_fs_blksz
) {
1201 printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
1202 phys_blksz
, min_fs_blksz
);
// the journal must be a whole number of device blocks
1206 if ((journal_size
% phys_blksz
) != 0) {
1207 printf("jnl: create: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1208 journal_size
, phys_blksz
);
1212 MALLOC_ZONE(jnl
, struct journal
*, sizeof(struct journal
), M_JNL_JNL
, M_WAITOK
);
1213 memset(jnl
, 0, sizeof(*jnl
));
1216 jnl
->jdev_offset
= offset
;
1219 jnl
->flush_arg
= arg
;
// keep only the option bits covered by JOURNAL_OPTION_FLAGS_MASK
1220 jnl
->flags
= (flags
& JOURNAL_OPTION_FLAGS_MASK
);
1221 simple_lock_init(&jnl
->old_start_lock
);
// the in-memory journal header lives in its own phys_blksz-sized buffer
1223 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&jnl
->header_buf
, phys_blksz
)) {
1224 printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz
);
1225 goto bad_kmem_alloc
;
1228 memset(jnl
->header_buf
, 0, phys_blksz
);
1230 jnl
->jhdr
= (journal_header
*)jnl
->header_buf
;
1231 jnl
->jhdr
->magic
= JOURNAL_HEADER_MAGIC
;
1232 jnl
->jhdr
->endian
= ENDIAN_MAGIC
;
1233 jnl
->jhdr
->start
= phys_blksz
; // start at block #1, block #0 is for the jhdr itself
1234 jnl
->jhdr
->end
= phys_blksz
;
1235 jnl
->jhdr
->size
= journal_size
;
1236 jnl
->jhdr
->jhdr_size
= phys_blksz
;
// size and jhdr_size are valid now, which is what size_up_tbuffer() requires
1237 size_up_tbuffer(jnl
, tbuffer_size
, phys_blksz
);
1239 jnl
->active_start
= jnl
->jhdr
->start
;
1241 // XXXdbg - for testing you can force the journal to wrap around
1242 // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
1243 // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);
1245 lockinit(&jnl
->jlock
, PINOD
, "journal", 0, 0);
1247 if (write_journal_header(jnl
) != 0) {
1248 printf("jnl: journal_create: failed to write journal header.\n");
// failure cleanup: release the header buffer, then the journal structure
1256 kmem_free(kernel_map
, (vm_offset_t
)jnl
->header_buf
, phys_blksz
);
1259 FREE_ZONE(jnl
, sizeof(struct journal
), M_JNL_JNL
);
// journal_open: attach to an existing on-disk journal.
//
// Queries the device block size, allocates a struct journal plus header
// buffer, reads the on-disk header, validates it (magic, checksum for the
// current magic value, start/end/size ranges and 512-byte alignment),
// byte-swaps it when the magic shows up swapped, and then either resets the
// start/end pointers (JOURNAL_RESET) or replays the journal.  If the on-disk
// jhdr_size differs from the device block size, the device block size is
// switched for the duration of the open and restored afterwards.
//
// jvp          - vnode the block-size ioctls are issued on
// min_fs_blksz - the physical block size may not be larger than this
// tbuffer_size - passed through to size_up_tbuffer() once replay is done
// flush        - callback taking one void * argument
//
// All diagnostics emitted here are prefixed "jnl: open:" so that they can be
// told apart from the journal_create() messages.
1265 journal_open(struct vnode
*jvp
,
1269 size_t min_fs_blksz
,
1271 int32_t tbuffer_size
,
1272 void (*flush
)(void *arg
),
1276 int orig_blksz
=0, phys_blksz
, blhdr_size
;
1277 int orig_checksum
, checksum
;
1279 /* Get the real physical block size. */
1280 if (VOP_IOCTL(jvp
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
, 0, FSCRED
, NULL
)) {
1284 if (phys_blksz
> min_fs_blksz
) {
1285 printf("jnl: open: error: phys blksize %d bigger than min fs blksize %d\n",
1286 phys_blksz
, min_fs_blksz
);
1290 if ((journal_size
% phys_blksz
) != 0) {
1291 printf("jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1292 journal_size
, phys_blksz
);
1296 MALLOC_ZONE(jnl
, struct journal
*, sizeof(struct journal
), M_JNL_JNL
, M_WAITOK
);
1297 memset(jnl
, 0, sizeof(*jnl
));
1300 jnl
->jdev_offset
= offset
;
1303 jnl
->flush_arg
= arg
;
1304 jnl
->flags
= (flags
& JOURNAL_OPTION_FLAGS_MASK
);
1305 simple_lock_init(&jnl
->old_start_lock
);
1307 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&jnl
->header_buf
, phys_blksz
)) {
1308 printf("jnl: open: could not allocate space for header buffer (%d bytes)\n", phys_blksz
);
1309 goto bad_kmem_alloc
;
1312 jnl
->jhdr
= (journal_header
*)jnl
->header_buf
;
1313 memset(jnl
->jhdr
, 0, sizeof(journal_header
)+4);
1315 // we have to set this up here so that do_journal_io() will work
1316 jnl
->jhdr
->jhdr_size
= phys_blksz
;
1318 if (read_journal_header(jnl
, jnl
->jhdr
, phys_blksz
) != phys_blksz
) {
1319 printf("jnl: open: could not read %d bytes for the journal header.\n",
1324 orig_checksum
= jnl
->jhdr
->checksum
;
1325 jnl
->jhdr
->checksum
= 0;
// a byte-swapped magic means the header was written with the other
// endianness: checksum the raw bytes first, then swap the header fields
1327 if (jnl
->jhdr
->magic
== SWAP32(JOURNAL_HEADER_MAGIC
)) {
1328 // do this before the swap since it's done byte-at-a-time
1329 orig_checksum
= SWAP32(orig_checksum
);
1330 checksum
= calc_checksum((char *)jnl
->jhdr
, sizeof(struct journal_header
));
1331 swap_journal_header(jnl
);
1332 jnl
->flags
|= JOURNAL_NEED_SWAP
;
1334 checksum
= calc_checksum((char *)jnl
->jhdr
, sizeof(struct journal_header
));
1337 if (jnl
->jhdr
->magic
!= JOURNAL_HEADER_MAGIC
&& jnl
->jhdr
->magic
!= OLD_JOURNAL_HEADER_MAGIC
) {
1338 printf("jnl: open: journal magic is bad (0x%x != 0x%x)\n",
1339 jnl
->jhdr
->magic
, JOURNAL_HEADER_MAGIC
);
1343 // only check if we're the current journal header magic value
1344 if (jnl
->jhdr
->magic
== JOURNAL_HEADER_MAGIC
) {
1346 if (orig_checksum
!= checksum
) {
1347 printf("jnl: open: journal checksum is bad (0x%x != 0x%x)\n",
1348 orig_checksum
, checksum
);
1354 // XXXdbg - convert old style magic numbers to the new one
1355 if (jnl
->jhdr
->magic
== OLD_JOURNAL_HEADER_MAGIC
) {
1356 jnl
->jhdr
->magic
= JOURNAL_HEADER_MAGIC
;
// the journal was written with a different block size than the device
// currently reports: remember the device's value in orig_blksz and
// switch the device to the journal's block size
1359 if (phys_blksz
!= jnl
->jhdr
->jhdr_size
&& jnl
->jhdr
->jhdr_size
!= 0) {
1360 printf("jnl: open: phys_blksz %d does not match journal header size %d\n",
1361 phys_blksz
, jnl
->jhdr
->jhdr_size
);
1363 orig_blksz
= phys_blksz
;
1364 phys_blksz
= jnl
->jhdr
->jhdr_size
;
1365 if (VOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&phys_blksz
, FWRITE
, FSCRED
, NULL
)) {
1366 printf("jnl: could not set block size to %d bytes.\n", phys_blksz
);
1369 // goto bad_journal;
// sanity-check the start/end/size fields read from disk (nothing may be
// non-positive, beyond the journal size, or larger than 1GB)
1372 if ( jnl
->jhdr
->start
<= 0
1373 || jnl
->jhdr
->start
> jnl
->jhdr
->size
1374 || jnl
->jhdr
->start
> 1024*1024*1024) {
1375 printf("jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
1376 jnl
->jhdr
->start
, jnl
->jhdr
->size
);
1380 if ( jnl
->jhdr
->end
<= 0
1381 || jnl
->jhdr
->end
> jnl
->jhdr
->size
1382 || jnl
->jhdr
->end
> 1024*1024*1024) {
1383 printf("jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
1384 jnl
->jhdr
->end
, jnl
->jhdr
->size
);
1388 if (jnl
->jhdr
->size
> 1024*1024*1024) {
1389 printf("jnl: open: jhdr size looks bad (0x%llx)\n", jnl
->jhdr
->size
);
1393 // XXXdbg - can't do these checks because hfs writes all kinds of
1394 // non-uniform sized blocks even on devices that have a block size
1395 // that is larger than 512 bytes (i.e. optical media w/2k blocks).
1396 // therefore these checks will fail and so we just have to punt and
1397 // do more relaxed checking...
1398 // XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
1399 if ((jnl
->jhdr
->start
% 512) != 0) {
1400 printf("jnl: open: journal start (0x%llx) not a multiple of 512?\n",
1405 //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
1406 if ((jnl
->jhdr
->end
% 512) != 0) {
1407 printf("jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
1408 jnl
->jhdr
->end
, jnl
->jhdr
->jhdr_size
);
1412 // take care of replaying the journal if necessary
1413 if (flags
& JOURNAL_RESET
) {
1414 printf("jnl: journal start/end pointers reset! (jnl 0x%x; s 0x%llx e 0x%llx)\n",
1415 jnl
, jnl
->jhdr
->start
, jnl
->jhdr
->end
);
1416 jnl
->jhdr
->start
= jnl
->jhdr
->end
;
1417 } else if (replay_journal(jnl
) != 0) {
1418 printf("jnl: journal_open: Error replaying the journal!\n");
// replay is finished: put the device back to the block size it had before
1422 if (orig_blksz
!= 0) {
1423 VOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&orig_blksz
, FWRITE
, FSCRED
, NULL
);
1424 phys_blksz
= orig_blksz
;
1425 if (orig_blksz
< jnl
->jhdr
->jhdr_size
) {
1426 printf("jnl: open: jhdr_size is %d but orig phys blk size is %d. switching.\n",
1427 jnl
->jhdr
->jhdr_size
, orig_blksz
);
1429 jnl
->jhdr
->jhdr_size
= orig_blksz
;
1433 // make sure this is in sync!
1434 jnl
->active_start
= jnl
->jhdr
->start
;
1436 // set this now, after we've replayed the journal
1437 size_up_tbuffer(jnl
, tbuffer_size
, phys_blksz
);
1439 lockinit(&jnl
->jlock
, PINOD
, "journal", 0, 0);
// failure cleanup: restore the device block size if it was changed, then
// release the header buffer and the journal structure
1444 if (orig_blksz
!= 0) {
1445 phys_blksz
= orig_blksz
;
1446 VOP_IOCTL(jvp
, DKIOCSETBLOCKSIZE
, (caddr_t
)&orig_blksz
, FWRITE
, FSCRED
, NULL
);
1448 kmem_free(kernel_map
, (vm_offset_t
)jnl
->header_buf
, phys_blksz
);
1450 FREE_ZONE(jnl
, sizeof(struct journal
), M_JNL_JNL
);
// journal_close: shut a journal down and free it.
//
// Sets JOURNAL_CLOSE_PENDING, takes the journal lock unless the calling
// thread already owns the journal, and then:
//   - if the journal is still valid: ends the active transaction,
//     force-flushes the buffered one, polls (at most 500 rounds of
//     jnl->flush() + tsleep()) until active_start equals jhdr->end, copies
//     active_start into jhdr->start and writes the journal header;
//   - if the journal is invalid: aborts the outstanding transaction.
// Finally releases old transactions, the header buffer and the journal
// structure itself.
//
// jnl - journal to close; it must not be used after this call.
1455 journal_close(journal
*jnl
)
1457 volatile off_t
*start
, *end
;
1462 // set this before doing anything that would block so that
1463 // we start tearing things down properly.
1465 jnl
->flags
|= JOURNAL_CLOSE_PENDING
;
// only take the lock when this thread is not already the owner
1467 if (jnl
->owner
!= current_act()) {
1470 ret
= lockmgr(&jnl
->jlock
, LK_EXCLUSIVE
|LK_RETRY
, NULL
, current_proc());
1472 printf("jnl: close: locking the journal (0x%x) failed %d.\n", jnl
, ret
);
1478 // only write stuff to disk if the journal is still valid
1480 if ((jnl
->flags
& JOURNAL_INVALID
) == 0) {
1482 if (jnl
->active_tr
) {
1483 journal_end_transaction(jnl
);
1486 // flush any buffered transactions
1488 transaction
*tr
= jnl
->cur_tr
;
1491 end_transaction(tr
, 1); // force it to get flushed
1494 //start = &jnl->jhdr->start;
1495 start
= &jnl
->active_start
;
1496 end
= &jnl
->jhdr
->end
;
// wait for everything in the journal to reach its final location;
// the pointers are volatile because they are re-read on every pass
1498 while (*start
!= *end
&& counter
++ < 500) {
1499 printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start
, *end
);
1501 jnl
->flush(jnl
->flush_arg
);
1503 tsleep((caddr_t
)jnl
, PRIBIO
, "jnl_close", 1);
1506 if (*start
!= *end
) {
1507 printf("jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
1511 // make sure this is in sync when we close the journal
1512 jnl
->jhdr
->start
= jnl
->active_start
;
1514 // if this fails there's not much we can do at this point...
1515 write_journal_header(jnl
);
1517 // if we're here the journal isn't valid any more.
1518 // so make sure we don't leave any locked blocks lying around
1519 printf("jnl: close: journal 0x%x, is invalid. aborting outstanding transactions\n", jnl
);
1520 if (jnl
->active_tr
|| jnl
->cur_tr
) {
1522 if (jnl
->active_tr
) {
1523 tr
= jnl
->active_tr
;
1524 jnl
->active_tr
= NULL
;
1530 abort_transaction(jnl
, tr
);
// after detaching one transaction nothing may remain attached
1531 if (jnl
->active_tr
|| jnl
->cur_tr
) {
1532 panic("jnl: close: jnl @ 0x%x had both an active and cur tr\n", jnl
);
1537 free_old_stuff(jnl
);
// NOTE(review): the buffer is freed with jhdr->jhdr_size as its length while
// journal_create()/journal_open() allocate it with phys_blksz -- confirm the
// two always agree (journal_open() can leave jhdr_size at the on-disk value).
1539 kmem_free(kernel_map
, (vm_offset_t
)jnl
->header_buf
, jnl
->jhdr
->jhdr_size
);
// poison the pointer so that a stale use is easy to recognize
1540 jnl
->jhdr
= (void *)0xbeefbabe;
1542 FREE_ZONE(jnl
, sizeof(struct journal
), M_JNL_JNL
);
// dump_journal: debugging aid that printf()s the state of a journal: its
// device offset, every journal header field (magic, start, end, size, blhdr
// size, jhdr size, checksum) and the journal_start/journal_end range of each
// transaction on the completed_trs list.
//
// jnl - journal to print; it is only read.
1546 dump_journal(journal
*jnl
)
1551 printf(" jdev_offset %.8llx\n", jnl
->jdev_offset
);
1552 printf(" magic: 0x%.8x\n", jnl
->jhdr
->magic
);
1553 printf(" start: 0x%.8llx\n", jnl
->jhdr
->start
);
1554 printf(" end: 0x%.8llx\n", jnl
->jhdr
->end
);
1555 printf(" size: 0x%.8llx\n", jnl
->jhdr
->size
);
1556 printf(" blhdr size: %d\n", jnl
->jhdr
->blhdr_size
);
1557 printf(" jhdr size: %d\n", jnl
->jhdr
->jhdr_size
);
1558 printf(" chksum: 0x%.8x\n", jnl
->jhdr
->checksum
);
1560 printf(" completed transactions:\n");
// walk the singly linked list of completed transactions
1561 for(ctr
=jnl
->completed_trs
; ctr
; ctr
=ctr
->next
) {
1562 printf(" 0x%.8llx - 0x%.8llx\n", ctr
->journal_start
, ctr
->journal_end
);
// free_space: compute how many bytes of the journal are currently unused,
// based on jhdr->start, jhdr->end, jhdr->size and jhdr->jhdr_size.
//
// The journal is a circular region whose first jhdr_size bytes hold the
// header, so that part is never counted as free:
//   start <  end : the used span is [start, end); free = size - used - jhdr_size
//   start >  end : the used span wraps around; free = start - end
//   start == end : the journal is empty; free = size - jhdr_size
//
// jnl - journal to inspect; it is only read.
1569 free_space(journal
*jnl
)
1573 if (jnl
->jhdr
->start
< jnl
->jhdr
->end
) {
1574 free_space
= jnl
->jhdr
->size
- (jnl
->jhdr
->end
- jnl
->jhdr
->start
) - jnl
->jhdr
->jhdr_size
;
1575 } else if (jnl
->jhdr
->start
> jnl
->jhdr
->end
) {
1576 free_space
= jnl
->jhdr
->start
- jnl
->jhdr
->end
;
1578 // journal is completely empty
1579 free_space
= jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
;
1587 // The journal must be locked on entry to this function.
1588 // The "desired_size" is in bytes.
//
// check_free_space: wait until the journal has more than desired_size bytes
// free.  Each pass first tests free_space(); if that is not enough it walks
// jnl->old_start[], advancing jhdr->start to recorded transaction starts
// (waiting for entries whose top bit is still set), falls back to
// active_start when the array is empty, and otherwise calls the file
// system's flush callback and sleeps briefly before trying again.
// Panics after 5000 passes without success.
1591 check_free_space(journal
*jnl
, int desired_size
)
1595 //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
1596 // desired_size, free_space(jnl));
1599 int old_start_empty
;
1601 if (counter
++ == 5000) {
1603 panic("jnl: check_free_space: buffer flushing isn't working "
1604 "(jnl @ 0x%x s %lld e %lld f %lld [active start %lld]).\n", jnl
,
1605 jnl
->jhdr
->start
, jnl
->jhdr
->end
, free_space(jnl
), jnl
->active_start
);
1607 if (counter
> 7500) {
1608 printf("jnl: check_free_space: giving up waiting for free space.\n");
1612 // make sure there's space in the journal to hold this transaction
1613 if (free_space(jnl
) > desired_size
) {
1618 // here's where we lazily bump up jnl->jhdr->start. we'll consume
1619 // entries until there is enough space for the next transaction.
1621 old_start_empty
= 1;
1622 simple_lock(&jnl
->old_start_lock
);
1623 for(i
=0; i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]); i
++) {
// an entry with its top bit set is not usable yet (end_transaction() stores
// new entries with that bit set): drop the lock, ask the file system to
// flush, sleep, and look again
1627 while (jnl
->old_start
[i
] & 0x8000000000000000LL
) {
1628 if (counter
++ > 100) {
1629 panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl 0x%x).\n",
1630 jnl
->old_start
[i
], jnl
);
1633 simple_unlock(&jnl
->old_start_lock
);
1635 jnl
->flush(jnl
->flush_arg
);
1637 tsleep((caddr_t
)jnl
, PRIBIO
, "check_free_space1", 1);
1638 simple_lock(&jnl
->old_start_lock
);
// a zero entry is an unused slot
1641 if (jnl
->old_start
[i
] == 0) {
// consume this entry: the journal now starts where that transaction began
1645 old_start_empty
= 0;
1646 jnl
->jhdr
->start
= jnl
->old_start
[i
];
1647 jnl
->old_start
[i
] = 0;
1648 if (free_space(jnl
) > desired_size
) {
1649 write_journal_header(jnl
);
1653 simple_unlock(&jnl
->old_start_lock
);
1655 // if we bumped the start, loop and try again
1656 if (i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0])) {
1658 } else if (old_start_empty
) {
1660 // if there is nothing in old_start anymore then we can
1661 // bump the jhdr->start to be the same as active_start
1662 // since it is possible there was only one very large
1663 // transaction in the old_start array. if we didn't do
1664 // this then jhdr->start would never get updated and we
1665 // would wind up looping until we hit the panic at the
1666 // start of the loop.
1668 jnl
->jhdr
->start
= jnl
->active_start
;
1669 write_journal_header(jnl
);
1674 // if the file system gave us a flush function, call it so that
1675 // it can flush some blocks which hopefully will cause some transactions
1676 // to complete and thus free up space in the journal.
1678 jnl
->flush(jnl
->flush_arg
);
1681 // wait for a while to avoid being cpu-bound (this will
1682 // put us to sleep for 10 milliseconds)
1683 tsleep((caddr_t
)jnl
, PRIBIO
, "check_free_space2", 1);
// journal_start_transaction: begin (or nest into) a transaction on jnl.
//
// If the calling thread already owns the journal the call just increments
// nested_count.  Otherwise it takes the journal lock, records the caller as
// owner, makes sure tbuffer_size bytes are free in the journal, and then
// either re-activates the buffered transaction (jnl->cur_tr) or allocates a
// new transaction with a fresh tbuffer whose first bytes form an empty
// block_list_header.
//
// jnl - journal to start the transaction on.
1690 journal_start_transaction(journal
*jnl
)
1698 if (jnl
->flags
& JOURNAL_INVALID
) {
// nested start by the thread that already owns the journal
1702 if (jnl
->owner
== current_act()) {
1703 if (jnl
->active_tr
== NULL
) {
1704 panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_act 0x%x\n",
1705 jnl
, jnl
->owner
, current_act());
1707 jnl
->nested_count
++;
1711 ret
= lockmgr(&jnl
->jlock
, LK_EXCLUSIVE
|LK_RETRY
, NULL
, current_proc());
1713 printf("jnl: start_tr: locking the journal (0x%x) failed %d.\n", jnl
, ret
);
// with the lock held there must be no owner, nesting or active transaction
1717 if (jnl
->owner
!= NULL
|| jnl
->nested_count
!= 0 || jnl
->active_tr
!= NULL
) {
1718 panic("jnl: start_tr: owner 0x%x, nested count 0x%x, active_tr 0x%x jnl @ 0x%x\n",
1719 jnl
->owner
, jnl
->nested_count
, jnl
->active_tr
, jnl
);
1722 jnl
->owner
= current_act();
1723 jnl
->nested_count
= 1;
1725 free_old_stuff(jnl
);
1727 // make sure there's room in the journal
1728 if (check_free_space(jnl
, jnl
->tbuffer_size
) != 0) {
1729 printf("jnl: start transaction failed: no space\n");
1734 // if there's a buffered transaction, use it.
1736 jnl
->active_tr
= jnl
->cur_tr
;
// no buffered transaction: build a new one
1742 MALLOC_ZONE(tr
, transaction
*, sizeof(transaction
), M_JNL_TR
, M_WAITOK
);
1743 memset(tr
, 0, sizeof(transaction
));
1745 tr
->tbuffer_size
= jnl
->tbuffer_size
;
// the thread's wired state is switched on around the allocation and the
// previous setting (prev_priv) is put back afterwards on both paths
1746 thread_wire_internal(host_priv_self(), current_act(), TRUE
, &prev_priv
);
1747 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&tr
->tbuffer
, tr
->tbuffer_size
)) {
1748 FREE_ZONE(tr
, sizeof(transaction
), M_JNL_TR
);
1749 printf("jnl: start transaction failed: no tbuffer mem\n");
1751 thread_wire_internal(host_priv_self(), current_act(), prev_priv
, NULL
);
1754 thread_wire_internal(host_priv_self(), current_act(), prev_priv
, NULL
);
1756 // journal replay code checksum check depends on this.
1757 memset(tr
->tbuffer
, 0, BLHDR_CHECKSUM_SIZE
);
// the block_list_header sits at the very start of the tbuffer
1759 tr
->blhdr
= (block_list_header
*)tr
->tbuffer
;
1760 tr
->blhdr
->max_blocks
= (jnl
->jhdr
->blhdr_size
/ sizeof(block_info
)) - 1;
1761 tr
->blhdr
->num_blocks
= 1; // accounts for this header block
1762 tr
->blhdr
->bytes_used
= jnl
->jhdr
->blhdr_size
;
1765 tr
->total_bytes
= jnl
->jhdr
->blhdr_size
;
1768 jnl
->active_tr
= tr
;
1770 // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, tr);
// failure exit: undo the ownership bookkeeping and release the journal lock
1776 jnl
->nested_count
= 0;
1777 lockmgr(&jnl
->jlock
, LK_RELEASE
, NULL
, current_proc());
// journal_modify_block_start: announce that bp is about to be modified as
// part of the active transaction.
//
// Panics when bp is not a meta-data buffer (B_META), when the caller does not
// own the journal, when b_bufsize is not a multiple of jhdr_size, or when
// adding bp would make the transaction as large as the usable journal
// (size - jhdr_size).  A buffer that is dirty (B_DELWRI) but not yet
// B_LOCKED is treated as belonging to another transaction; B_NORELSE is set
// on it for the write-out described in the comments below.  Finally bp is
// marked B_LOCKED.
//
// jnl - journal with an active transaction owned by the calling thread
// bp  - buffer that is going to be modified
1783 journal_modify_block_start(journal
*jnl
, struct buf
*bp
)
1789 if (jnl
->flags
& JOURNAL_INVALID
) {
1793 // XXXdbg - for debugging I want this to be true. later it may
1794 // not be necessary.
1795 if ((bp
->b_flags
& B_META
) == 0) {
1796 panic("jnl: modify_block_start: bp @ 0x%x is not a meta-data block! (jnl 0x%x)\n", bp
, jnl
);
1799 tr
= jnl
->active_tr
;
1800 CHECK_TRANSACTION(tr
);
1802 if (jnl
->owner
!= current_act()) {
1803 panic("jnl: modify_block_start: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1804 jnl
, jnl
->owner
, current_act());
1807 free_old_stuff(jnl
);
1809 //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d; total bytes %d)\n",
1810 // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes);
1812 // can't allow blocks that aren't an even multiple of the
1813 // underlying block size.
1814 if ((bp
->b_bufsize
% jnl
->jhdr
->jhdr_size
) != 0) {
1815 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
1816 bp
->b_bufsize
, jnl
->jhdr
->jhdr_size
);
1820 // make sure that this transaction isn't bigger than the whole journal
1821 if (tr
->total_bytes
+bp
->b_bufsize
>= (jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
)) {
1822 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr 0x%x bp 0x%x)\n",
1823 tr
->total_bytes
, (tr
->jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
), bp
->b_bufsize
, tr
, bp
);
1827 // if the block is dirty and not already locked we have to write
1828 // it out before we muck with it because it has data that belongs
1829 // (presumably) to another transaction.
1831 if ((bp
->b_flags
& B_DELWRI
) && (bp
->b_flags
& B_LOCKED
) == 0) {
1833 // this will cause it to not be brelse()'d
1834 bp
->b_flags
|= B_NORELSE
;
// from here on the buffer is locked on behalf of this transaction
1838 bp
->b_flags
|= B_LOCKED
;
// journal_modify_block_abort: give up on a modification of bp that was
// announced with journal_modify_block_start().
//
// Searches every block_list_header of the active transaction for bp,
// panicking if bp's size no longer matches the size recorded for it.  When
// bp is not found in any header (blhdr ends up NULL) it has only seen
// modify_block_start, so its B_LOCKED bit is cleared; otherwise it stays
// locked because journal_modify_block_end() already recorded it.
//
// jnl - journal the buffer was started on
// bp  - buffer whose modification is being abandoned
1844 journal_modify_block_abort(journal
*jnl
, struct buf
*bp
)
1847 block_list_header
*blhdr
;
1852 tr
= jnl
->active_tr
;
1855 // if there's no active transaction then we just want to
1856 // call brelse() and return since this is just a block
1857 // that happened to be modified as part of another tr.
1864 if (jnl
->flags
& JOURNAL_INVALID
) {
1868 CHECK_TRANSACTION(tr
);
1870 if (jnl
->owner
!= current_act()) {
1871 panic("jnl: modify_block_abort: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1872 jnl
, jnl
->owner
, current_act());
1875 free_old_stuff(jnl
);
1877 // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
1879 // first check if it's already part of this transaction
// (headers are chained through binfo[0].bnum; entry 0 is the header itself,
// so the scan of real blocks starts at index 1)
1880 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
1881 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
1882 if (bp
== blhdr
->binfo
[i
].bp
) {
1883 if (bp
->b_bufsize
!= blhdr
->binfo
[i
].bsize
) {
1884 panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
1885 bp
, bp
->b_bufsize
, blhdr
->binfo
[i
].bsize
, jnl
);
// i stopped short of num_blocks: bp was found in this header
1891 if (i
< blhdr
->num_blocks
) {
1897 // if blhdr is null, then this block has only had modify_block_start
1898 // called on it as part of the current transaction. that means that
1899 // it is ok to clear the LOCKED bit since it hasn't actually been
1900 // modified. if blhdr is non-null then modify_block_end was called
1901 // on it and so we need to keep it locked in memory.
1903 if (blhdr
== NULL
) {
1904 bp
->b_flags
&= ~(B_LOCKED
);
// journal_modify_block_end: record the current contents of bp in the active
// transaction.
//
// Looks for bp in the transaction's chain of block_list_headers while
// tracking, in tbuffer_offset, where its data lives inside that header's
// buffer.  If bp is new, it is appended to the last header when that header
// still has a free slot and enough bytes; otherwise a new tbuffer_size-sized
// header is allocated and linked in through prev->binfo[0].bnum.  The buffer
// data is then copied into the transaction buffer and, for a new block, a
// vnode reference is taken and the binfo entry and byte/block counters are
// updated.
//
// jnl - journal with an active transaction owned by the calling thread
// bp  - modified buffer; must already be B_LOCKED
1913 journal_modify_block_end(journal
*jnl
, struct buf
*bp
)
1915 int i
, j
, tbuffer_offset
;
1917 block_list_header
*blhdr
, *prev
=NULL
;
1922 if (jnl
->flags
& JOURNAL_INVALID
) {
1926 tr
= jnl
->active_tr
;
1927 CHECK_TRANSACTION(tr
);
1929 if (jnl
->owner
!= current_act()) {
1930 panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1931 jnl
, jnl
->owner
, current_act());
1934 free_old_stuff(jnl
);
1936 //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d, total bytes %d)\n",
1937 // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes);
1939 if ((bp
->b_flags
& B_LOCKED
) == 0) {
1940 panic("jnl: modify_block_end: bp 0x%x not locked! jnl @ 0x%x\n", bp
, jnl
);
1941 bp
->b_flags
|= B_LOCKED
;
1944 // first check if it's already part of this transaction
// (prev trails blhdr so that it names the last header once the walk ends)
1945 for(blhdr
=tr
->blhdr
; blhdr
; prev
=blhdr
,blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
// block data starts right after the blhdr_size bytes of header
1946 tbuffer_offset
= jnl
->jhdr
->blhdr_size
;
1948 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
1949 if (bp
== blhdr
->binfo
[i
].bp
) {
1950 if (bp
->b_bufsize
!= blhdr
->binfo
[i
].bsize
) {
1951 panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
1952 bp
, bp
->b_bufsize
, blhdr
->binfo
[i
].bsize
, jnl
);
// skip over the data of every block stored before the one we want
1956 tbuffer_offset
+= blhdr
->binfo
[i
].bsize
;
1959 if (i
< blhdr
->num_blocks
) {
1966 && (prev
->num_blocks
+1) <= prev
->max_blocks
1967 && (prev
->bytes_used
+bp
->b_bufsize
) <= tr
->tbuffer_size
) {
1969 } else if (blhdr
== NULL
) {
1970 block_list_header
*nblhdr
;
1974 panic("jnl: modify block end: no way man, prev == NULL?!?, jnl 0x%x, bp 0x%x\n", jnl
, bp
);
1977 // we got to the end of the list, didn't find the block and there's
1978 // no room in the block_list_header pointed to by prev
1980 // we allocate another tbuffer and link it in at the end of the list
1981 // through prev->binfo[0].bnum. that's a skanky way to do things but
1982 // avoids having yet another linked list of small data structures to manage.
1984 thread_wire_internal(host_priv_self(), current_act(), TRUE
, &prev_priv
);
1985 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&nblhdr
, tr
->tbuffer_size
)) {
1986 panic("jnl: end_tr: no space for new block tr @ 0x%x (total bytes: %d)!\n",
1987 tr
, tr
->total_bytes
);
1989 thread_wire_internal(host_priv_self(), current_act(), prev_priv
, NULL
);
1991 // journal replay code checksum check depends on this.
1992 memset(nblhdr
, 0, BLHDR_CHECKSUM_SIZE
);
1994 // initialize the new guy
1995 nblhdr
->max_blocks
= (jnl
->jhdr
->blhdr_size
/ sizeof(block_info
)) - 1;
1996 nblhdr
->num_blocks
= 1; // accounts for this header block
1997 nblhdr
->bytes_used
= jnl
->jhdr
->blhdr_size
;
2000 tr
->total_bytes
+= jnl
->jhdr
->blhdr_size
;
2002 // then link him in at the end
2003 prev
->binfo
[0].bnum
= (off_t
)((long)nblhdr
);
2005 // and finally switch to using the new guy
2007 tbuffer_offset
= jnl
->jhdr
->blhdr_size
;
2012 if ((i
+1) > blhdr
->max_blocks
) {
2013 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i
, blhdr
->max_blocks
);
2016 // copy the data into the in-memory transaction buffer
2017 blkptr
= (char *)&((char *)blhdr
)[tbuffer_offset
];
2018 memcpy(blkptr
, bp
->b_data
, bp
->b_bufsize
);
2020 // if this is true then this is a new block we haven't seen
2021 if (i
>= blhdr
->num_blocks
) {
// take a reference on the buffer's vnode for as long as the block is listed
2022 vget(bp
->b_vp
, 0, current_proc());
2024 blhdr
->binfo
[i
].bnum
= (off_t
)((unsigned)bp
->b_blkno
);
2025 blhdr
->binfo
[i
].bsize
= bp
->b_bufsize
;
2026 blhdr
->binfo
[i
].bp
= bp
;
2028 blhdr
->bytes_used
+= bp
->b_bufsize
;
2029 tr
->total_bytes
+= bp
->b_bufsize
;
2031 blhdr
->num_blocks
++;
// journal_kill_block: drop bp from the active transaction.
//
// Searches the transaction's block_list_headers for bp.  When it is found the
// buffer's B_LOCKED bit is cleared, tr->num_killed grows by the buffer size
// and the binfo entry is turned into a "killed" entry (bp == NULL,
// bnum == -1).  Panics when the caller does not own the journal, when bp is
// not B_LOCKED, or when bp has both B_DELWRI and B_CALL set.
//
// jnl - journal with an active transaction owned by the calling thread
// bp  - buffer to remove from the transaction
2040 journal_kill_block(journal
*jnl
, struct buf
*bp
)
2043 block_list_header
*blhdr
;
2048 if (jnl
->flags
& JOURNAL_INVALID
) {
2052 tr
= jnl
->active_tr
;
2053 CHECK_TRANSACTION(tr
);
2055 if (jnl
->owner
!= current_act()) {
2056 panic("jnl: kill block: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
2057 jnl
, jnl
->owner
, current_act());
2060 free_old_stuff(jnl
);
2062 if ((bp
->b_flags
& B_LOCKED
) == 0) {
2063 panic("jnl: kill block: bp 0x%x not locked! jnl @ 0x%x\n", bp
, jnl
);
2066 // first check if it's already part of this transaction
2067 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
2069 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2070 if (bp
== blhdr
->binfo
[i
].bp
) {
2071 bp
->b_flags
&= ~B_LOCKED
;
2073 // this undoes the vget() in journal_modify_block_end()
2076 // if the block has the DELWRI and CALL bits set, then
2077 // things are seriously weird. if it was part of another
2078 // transaction then journal_modify_block_start() should
2079 // have forced it to be written.
2081 if ((bp
->b_flags
& B_DELWRI
) && (bp
->b_flags
& B_CALL
)) {
2082 panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp
);
2084 tr
->num_killed
+= bp
->b_bufsize
;
2087 if (bp
->b_flags
& B_BUSY
) {
// mark the slot as killed: a NULL bp together with a block number of -1
2091 blhdr
->binfo
[i
].bp
= NULL
;
2092 blhdr
->binfo
[i
].bnum
= (off_t
)-1;
// i stopped short of num_blocks: bp was found in this header
2097 if (i
< blhdr
->num_blocks
) {
// journal_binfo_cmp: comparison callback for sorting block_info entries
// (end_transaction() passes it to qsort() over binfo[1..]).
//
// Entries whose bp is NULL are checked for first, on either side; all other
// entries are ordered by the difference of their buffers' b_blkno values.
//
// a, b - pointers to the two block_info entries being compared
2107 journal_binfo_cmp(void *a
, void *b
)
2109 block_info
*bi_a
= (struct block_info
*)a
,
2110 *bi_b
= (struct block_info
*)b
;
2113 if (bi_a
->bp
== NULL
) {
2116 if (bi_b
->bp
== NULL
) {
2120 // don't have to worry about negative block
2121 // numbers so this is ok to do.
2123 res
= (bi_a
->bp
->b_blkno
- bi_b
->bp
->b_blkno
);
2130 end_transaction(transaction
*tr
, int force_it
)
2134 journal
*jnl
= tr
->jnl
;
2136 block_list_header
*blhdr
=NULL
, *next
=NULL
;
2139 panic("jnl: jnl @ 0x%x already has cur_tr 0x%x, new tr: 0x%x\n",
2140 jnl
, jnl
->cur_tr
, tr
);
2143 // if there weren't any modified blocks in the transaction
2144 // just save off the transaction pointer and return.
2145 if (tr
->total_bytes
== jnl
->jhdr
->blhdr_size
) {
2150 // if our transaction buffer isn't very full, just hang
2151 // on to it and don't actually flush anything. this is
2152 // what is known as "group commit". we will flush the
2153 // transaction buffer if it's full or if we have more than
2154 // one of them so we don't start hogging too much memory.
2157 && (jnl
->flags
& JOURNAL_NO_GROUP_COMMIT
) == 0
2158 && tr
->num_blhdrs
< 3
2159 && (tr
->total_bytes
<= ((tr
->tbuffer_size
*tr
->num_blhdrs
) - tr
->tbuffer_size
/8))) {
2166 // if we're here we're going to flush the transaction buffer to disk.
2167 // make sure there is room in the journal first.
2168 check_free_space(jnl
, tr
->total_bytes
);
2170 // range check the end index
2171 if (jnl
->jhdr
->end
<= 0 || jnl
->jhdr
->end
> jnl
->jhdr
->size
) {
2172 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
2173 jnl
->jhdr
->end
, jnl
->jhdr
->size
);
2176 // this transaction starts where the current journal ends
2177 tr
->journal_start
= jnl
->jhdr
->end
;
2178 end
= jnl
->jhdr
->end
;
2181 // if the first entry in old_start[] isn't free yet, loop calling the
2182 // file system flush routine until it is (or we panic).
2185 simple_lock(&jnl
->old_start_lock
);
2186 while ((jnl
->old_start
[0] & 0x8000000000000000LL
) != 0) {
2188 simple_unlock(&jnl
->old_start_lock
);
2191 jnl
->flush(jnl
->flush_arg
);
2194 // yield the cpu so others can get in to clear the lock bit
2195 (void)tsleep((void *)jnl
, PRIBIO
, "jnl-old-start-sleep", 1);
2197 simple_lock(&jnl
->old_start_lock
);
2200 panic("jnl: transaction that started at 0x%llx is not completing! jnl 0x%x\n",
2201 jnl
->old_start
[0] & (~0x8000000000000000LL
), jnl
);
2206 // slide everyone else down and put our latest guy in the last
2207 // entry in the old_start array
2209 memcpy(&jnl
->old_start
[0], &jnl
->old_start
[1], sizeof(jnl
->old_start
)-sizeof(jnl
->old_start
[0]));
2210 jnl
->old_start
[sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]) - 1] = tr
->journal_start
| 0x8000000000000000LL
;
2212 simple_unlock(&jnl
->old_start_lock
);
2215 // for each block, make sure that the physical block # is set
2216 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=next
) {
2218 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2220 bp
= blhdr
->binfo
[i
].bp
;
2221 if (bp
== NULL
) { // only true if a block was "killed"
2222 if (blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
2223 panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ 0x%x, tr 0x%x)\n",
2224 blhdr
->binfo
[i
].bnum
, jnl
, tr
);
2229 if (bp
->b_vp
== NULL
&& bp
->b_lblkno
== bp
->b_blkno
) {
2230 panic("jnl: end_tr: DANGER! bp @ 0x%x w/null vp and l/blkno = %d/%d\n",
2231 bp
, bp
->b_lblkno
, bp
->b_blkno
);
2234 // if the lblkno is the same as blkno and this bp isn't
2235 // associated with the underlying file system device then
2236 // we need to call bmap() to get the actual physical block.
2238 if ((bp
->b_lblkno
== bp
->b_blkno
) && (bp
->b_vp
!= jnl
->fsdev
)) {
2239 if (VOP_BMAP(bp
->b_vp
, bp
->b_lblkno
, NULL
, &bp
->b_blkno
, NULL
) != 0) {
2240 printf("jnl: end_tr: can't bmap the bp @ 0x%x, jnl 0x%x\n", bp
, jnl
);
2245 // update this so we write out the correct physical block number!
2246 blhdr
->binfo
[i
].bnum
= (off_t
)((unsigned)bp
->b_blkno
);
2249 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
2252 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=(block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
2254 amt
= blhdr
->bytes_used
;
2256 blhdr
->checksum
= 0;
2257 blhdr
->checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
2259 ret
= write_journal_data(jnl
, &end
, blhdr
, amt
);
2261 printf("jnl: end_transaction: only wrote %d of %d bytes to the journal!\n",
2268 jnl
->jhdr
->end
= end
; // update where the journal now ends
2269 tr
->journal_end
= end
; // the transaction ends here too
2270 if (tr
->journal_start
== 0 || tr
->journal_end
== 0) {
2271 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
2272 tr
->journal_start
, tr
->journal_end
);
2275 if (write_journal_header(jnl
) != 0) {
2280 // setup for looping through all the blhdr's. we null out the
2281 // tbuffer and blhdr fields so that they're not used any more.
2287 // the buffer_flushed_callback will only be called for the
2288 // real blocks that get flushed so we have to account for
2289 // the block_list_headers here.
2291 tr
->num_flushed
= tr
->num_blhdrs
* jnl
->jhdr
->blhdr_size
;
2293 // for each block, set the iodone callback and unlock it
2294 for(; blhdr
; blhdr
=next
) {
2296 // we can re-order the buf ptrs because everything is written out already
2297 qsort(&blhdr
->binfo
[1], blhdr
->num_blocks
-1, sizeof(block_info
), journal_binfo_cmp
);
2299 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
2300 if (blhdr
->binfo
[i
].bp
== NULL
) {
2304 ret
= meta_bread(blhdr
->binfo
[i
].bp
->b_vp
,
2305 (daddr_t
)blhdr
->binfo
[i
].bp
->b_lblkno
,
2306 blhdr
->binfo
[i
].bp
->b_bufsize
,
2309 if (ret
== 0 && bp
!= NULL
) {
2310 struct vnode
*save_vp
;
2312 if (bp
!= blhdr
->binfo
[i
].bp
) {
2313 panic("jnl: end_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
2314 bp
, blhdr
->binfo
[i
].bp
, jnl
);
2317 if ((bp
->b_flags
& (B_LOCKED
|B_DELWRI
)) != (B_LOCKED
|B_DELWRI
)) {
2318 if (jnl
->flags
& JOURNAL_CLOSE_PENDING
) {
2322 panic("jnl: end_tr: !!!DANGER!!! bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp
, bp
->b_flags
);
2326 if (bp
->b_iodone
!= NULL
) {
2327 panic("jnl: bp @ 0x%x (blkno %d, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n",
2328 bp
, bp
->b_blkno
, bp
->b_vp
, bp
->b_iodone
, buffer_flushed_callback
);
2333 bp
->b_iodone
= buffer_flushed_callback
;
2334 bp
->b_transaction
= tr
;
2335 bp
->b_flags
|= B_CALL
;
2336 bp
->b_flags
&= ~(B_LOCKED
);
2338 // kicking off the write here helps performance
2340 // XXXdbg this is good for testing: bdwrite(bp);
2343 // this undoes the vget() in journal_modify_block_end()
2347 printf("jnl: end_transaction: could not find block %Ld vp 0x%x!\n",
2348 blhdr
->binfo
[i
].bnum
, blhdr
->binfo
[i
].bp
);
2355 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
2357 // we can free blhdr here since we won't need it any more
2358 blhdr
->binfo
[0].bnum
= 0xdeadc0de;
2359 kmem_free(kernel_map
, (vm_offset_t
)blhdr
, tr
->tbuffer_size
);
2362 //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
2363 // tr, tr->journal_start, tr->journal_end);
2368 jnl
->flags
|= JOURNAL_INVALID
;
2369 abort_transaction(jnl
, tr
);
// abort_transaction: throw away transaction `tr` of journal `jnl`.
//
// Walks the chain of block_list_headers that starts at tr->blhdr.  For
// every entry from index 1 up to num_blocks that still has a buffer
// pointer, the buffer is fetched again with meta_bread(), its B_LOCKED
// and B_DELWRI flags are cleared and B_INVAL is set.  Each header is then
// released with kmem_free() and, once the chain is exhausted, the
// transaction structure itself is released with FREE_ZONE().
//
// NOTE(review): several statements of this function are not visible in
// this view (remaining meta_bread() arguments, the test that selects
// between the success path and the "could not find block" message, and
// whatever releases the buffer); confirm against the full file.
2374 abort_transaction(journal
*jnl
, transaction
*tr
)
2377 block_list_header
*blhdr
, *next
;
2379 struct vnode
*save_vp
;
2381 // for each block list header, iterate over the blocks then
2382 // free up the memory associated with the block list.
2384 // for each block, clear the lock bit and release it.
// `next` is assigned inside the loop body, before the current header is freed.
2386 for(blhdr
=tr
->blhdr
; blhdr
; blhdr
=next
) {
// entries are scanned from index 1; binfo[0].bnum is used below as the
// link to the next header rather than as a block entry.
2388 for(i
=1; i
< blhdr
->num_blocks
; i
++) {
// entries without a buffer pointer get special handling (branch body not
// visible here; presumably they are skipped -- TODO confirm).
2389 if (blhdr
->binfo
[i
].bp
== NULL
) {
// fetch the buffer again using the vnode, logical block number and
// buffer size recorded on the saved bp.
2393 ret
= meta_bread(blhdr
->binfo
[i
].bp
->b_vp
,
2394 (daddr_t
)blhdr
->binfo
[i
].bp
->b_lblkno
,
2395 blhdr
->binfo
[i
].bp
->b_bufsize
,
// sanity check: the buffer handed back must be the very one recorded in
// binfo[i]; anything else is treated as fatal.
2399 if (bp
!= blhdr
->binfo
[i
].bp
) {
2400 panic("jnl: abort_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
2401 bp
, blhdr
->binfo
[i
].bp
, jnl
);
2404 // clear the locked bit and the delayed-write bit. we
2405 // don't want these blocks going to disk.
2406 bp
->b_flags
&= ~(B_LOCKED
|B_DELWRI
);
// additionally flag the buffer with B_INVAL.
2407 bp
->b_flags
|= B_INVAL
;
// diagnostic for a block that could not be obtained (the condition that
// leads here is not visible in this view).
2415 printf("jnl: abort_tr: could not find block %Ld vp 0x%x!\n",
2416 blhdr
->binfo
[i
].bnum
, blhdr
->binfo
[i
].bp
);
// the pointer to the next header is stored in binfo[0].bnum; read it out
// now because the current header is freed just below.
2423 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
2425 // we can free blhdr here since we won't need it any more
// overwrite the link first (0xdeadc0de is presumably a marker that makes
// any later use of the freed header easy to spot).
2426 blhdr
->binfo
[0].bnum
= 0xdeadc0de;
// every header is released with the size tr->tbuffer_size.
2427 kmem_free(kernel_map
, (vm_offset_t
)blhdr
, tr
->tbuffer_size
);
// stamp total_bytes with a marker value (presumably for the same
// debugging purpose) and release the transaction structure.
2432 tr
->total_bytes
= 0xdbadc0de;
2433 FREE_ZONE(tr
, sizeof(transaction
), M_JNL_TR
);
// journal_end_transaction: close one nesting level of the transaction
// that the calling thread has open on journal `jnl`.
//
// The caller must be the journal's owner (jnl->owner == current_act());
// otherwise the function panics.  jnl->nested_count is decremented; once
// neither the "> 0" nor the "< 0" test applies, the active transaction is
// either aborted (journal flagged JOURNAL_INVALID) or handed to
// end_transaction(tr, 0), and jnl->jlock is released.
//
// NOTE(review): the return statements and some branch bodies are not
// visible in this view, so the exact return values are not documented
// here; confirm against the full file.
2438 journal_end_transaction(journal
*jnl
)
// an invalid journal that has no owner is handled up front (branch body
// not visible here).
2445 if ((jnl
->flags
& JOURNAL_INVALID
) && jnl
->owner
== NULL
) {
// only the thread recorded as owner may end the transaction.
2449 if (jnl
->owner
!= current_act()) {
2450 panic("jnl: end_tr: I'm not the owner! jnl 0x%x, owner 0x%x, curact 0x%x\n",
2451 jnl
, jnl
->owner
, current_act());
2454 free_old_stuff(jnl
);
// drop one nesting level; a count that is still positive takes the first
// branch (body not visible; presumably an early return -- TODO confirm),
// a negative count is a fatal inconsistency.
2456 jnl
->nested_count
--;
2457 if (jnl
->nested_count
> 0) {
2459 } else if (jnl
->nested_count
< 0) {
2460 panic("jnl: jnl @ 0x%x has negative nested count (%d). bad boy.\n", jnl
, jnl
->nested_count
);
// the journal has been marked invalid: discard the active transaction
// instead of writing it out.
2463 if (jnl
->flags
& JOURNAL_INVALID
) {
2464 if (jnl
->active_tr
) {
// an active transaction together with a non-NULL cur_tr is treated as a
// fatal inconsistency.
2467 if (jnl
->cur_tr
!= NULL
) {
2468 panic("jnl: journal @ 0x%x has active tr (0x%x) and cur tr (0x%x)\n",
2469 jnl
, jnl
->active_tr
, jnl
->cur_tr
);
// detach the transaction from the journal before aborting it.
2472 tr
= jnl
->active_tr
;
2473 jnl
->active_tr
= NULL
;
2474 abort_transaction(jnl
, tr
);
// release the journal lock on this path as well.
2478 lockmgr(&jnl
->jlock
, LK_RELEASE
, NULL
, current_proc());
// normal path: pick up the active transaction and validate it.
2483 tr
= jnl
->active_tr
;
2484 CHECK_TRANSACTION(tr
);
2486 // clear this out here so that when check_free_space() calls
2487 // the FS flush function, we don't panic in journal_flush()
2488 // if the FS were to call that. note: check_free_space() is
2489 // called from end_transaction().
2491 jnl
->active_tr
= NULL
;
// second argument 0: the flush is not forced (compare the call with 1 in
// journal_flush()).
2492 ret
= end_transaction(tr
, 0);
// release the journal lock before leaving.
2495 lockmgr(&jnl
->jlock
, LK_RELEASE
, NULL
, current_proc());
// journal_flush: force any buffered transaction of journal `jnl` out.
//
// If the calling thread is not the journal's owner, jnl->jlock is taken
// exclusively first.  When no transaction is active but jnl->cur_tr is
// set, that transaction is passed to end_transaction() with the force
// flag set.  The lock is released again at the end.
//
// NOTE(review): the return statements, the failure handling after
// lockmgr() and the places where need_signal is updated/tested are not
// visible in this view; confirm against the full file.
2502 journal_flush(journal
*jnl
)
// starts out 0; presumably records whether this call took the lock itself
// and therefore has to release it -- TODO confirm.
2504 int need_signal
= 0;
// a journal flagged invalid is handled up front (branch body not visible
// here).
2508 if (jnl
->flags
& JOURNAL_INVALID
) {
// a caller that is not the current owner acquires the journal lock.
2512 if (jnl
->owner
!= current_act()) {
2515 ret
= lockmgr(&jnl
->jlock
, LK_EXCLUSIVE
|LK_RETRY
, NULL
, current_proc());
// diagnostic printed when taking the lock fails (the test on `ret` is not
// visible in this view).
2517 printf("jnl: flush: locking the journal (0x%x) failed %d.\n", jnl
, ret
);
2523 free_old_stuff(jnl
);
2525 // if we're not active, flush any buffered transactions
2526 if (jnl
->active_tr
== NULL
&& jnl
->cur_tr
) {
2527 transaction
*tr
= jnl
->cur_tr
;
2530 end_transaction(tr
, 1); // force it to get flushed
// release the journal lock (presumably only when it was taken above --
// the guarding condition is not visible here).
2534 lockmgr(&jnl
->jlock
, LK_RELEASE
, NULL
, current_proc());
// journal_active: report whether journal `jnl` currently has an active
// transaction.
//
// Returns 1 when jnl->active_tr is set and 0 when it is NULL.  A journal
// flagged JOURNAL_INVALID is handled separately before that test.
//
// NOTE(review): the body of the JOURNAL_INVALID branch is not visible in
// this view, so its result is not documented here; confirm against the
// full file.
2541 journal_active(journal
*jnl
)
2543 if (jnl
->flags
& JOURNAL_INVALID
) {
// map the pointer to a plain 0/1 answer.
2547 return (jnl
->active_tr
== NULL
) ? 0 : 1;