/* bsd/vfs/vfs_journal.c -- from Apple xnu-792 */
1 /*
2 * Copyright (c) 1995-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 //
23 // This file implements a simple write-ahead journaling layer.
24 // In theory any file system can make use of it by calling these
25 // functions when the fs wants to modify meta-data blocks. See
26 // vfs_journal.h for a more detailed description of the api and
27 // data structures.
28 //
29 // Dominic Giampaolo (dbg@apple.com)
30 //
31
32 #ifdef KERNEL
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/file_internal.h>
38 #include <sys/stat.h>
39 #include <sys/buf_internal.h>
40 #include <sys/proc_internal.h>
41 #include <sys/mount_internal.h>
42 #include <sys/namei.h>
43 #include <sys/vnode_internal.h>
44 #include <sys/ioctl.h>
45 #include <sys/tty.h>
46 #include <sys/ubc.h>
47 #include <sys/malloc.h>
48 #include <kern/thread.h>
49 #include <sys/disk.h>
50 #include <miscfs/specfs/specdev.h>
51
52 extern task_t kernel_task;
53
54 #else
55
56 #include <stdio.h>
57 #include <stdlib.h>
58 #include <string.h>
59 #include <limits.h>
60 #include <errno.h>
61 #include <fcntl.h>
62 #include <unistd.h>
63 #include <stdarg.h>
64 #include <sys/types.h>
65 #include "compat.h"
66
67 #endif /* KERNEL */
68
69 #include "vfs_journal.h"
70
71
72 // number of bytes to checksum in a block_list_header
73 // NOTE: this should be enough to clear out the header
74 // fields as well as the first entry of binfo[]
75 #define BLHDR_CHECKSUM_SIZE 32
76
77
78
79 static int end_transaction(transaction *tr, int force_it);
80 static void abort_transaction(journal *jnl, transaction *tr);
81 static void dump_journal(journal *jnl);
82
83 static __inline__ void lock_journal(journal *jnl);
84 static __inline__ void unlock_journal(journal *jnl);
85 static __inline__ void lock_oldstart(journal *jnl);
86 static __inline__ void unlock_oldstart(journal *jnl);
87
88
89
90
91 //
92 // 3105942 - Coalesce writes to the same block on journal replay
93 //
94
// One entry of the replay coalescing table: records a run of disk
// blocks to be replayed and where the freshest copy of their data
// lives inside the journal.
typedef struct bucket {
    off_t     block_num;     // starting block number on the fs device
    size_t    jnl_offset;    // byte offset of the data within the journal
    size_t    block_size;    // length of the run, in bytes
} bucket;
100
101 #define STARTING_BUCKETS 256
102
103 static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr);
104 static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size);
105 static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full);
106 static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr);
107 static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr, int overwriting);
108
//
// Validate a journal's in-memory state, panicking on anything that
// looks corrupt: null journal/device pointers, bad header magic, or
// start/end/size fields that are out of range.  The 1024*1024*1024
// comparisons are a 1GB sanity cap on plausible journal sizes.
//
#define CHECK_JOURNAL(jnl) \
    do { \
    if (jnl == NULL) {\
	panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
    }\
    if (jnl->jdev == NULL) { \
	panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
    } \
    if (jnl->fsdev == NULL) { \
	panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
    } \
    if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
	panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
	__FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\
    }\
    if (   jnl->jhdr->start <= 0 \
	|| jnl->jhdr->start > jnl->jhdr->size\
	|| jnl->jhdr->start > 1024*1024*1024) {\
	panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
	__FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\
    }\
    if (   jnl->jhdr->end <= 0 \
	|| jnl->jhdr->end > jnl->jhdr->size\
	|| jnl->jhdr->end > 1024*1024*1024) {\
	panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
	__FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\
    }\
    if (jnl->jhdr->size > 1024*1024*1024) {\
	panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
	__FILE__, __LINE__, jnl->jhdr->size);\
    } \
    } while(0)
141
//
// Validate a transaction's invariants, panicking on corruption: null
// pointers, a block list header that doesn't sit at the start of the
// transaction buffer, negative byte counts, journal offsets outside
// the 1GB sanity cap, or a max_blocks value larger than the journal
// itself could hold.
//
#define CHECK_TRANSACTION(tr) \
    do {\
    if (tr == NULL) {\
	panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
    }\
    if (tr->jnl == NULL) {\
	panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
    }\
    if (tr->blhdr != (block_list_header *)tr->tbuffer) {\
	panic("%s:%d: blhdr (0x%x) != tbuffer (0x%x)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\
    }\
    if (tr->total_bytes < 0) {\
	panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\
    }\
    if (tr->journal_start < 0 || tr->journal_start > 1024*1024*1024) {\
	panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\
    }\
    if (tr->journal_end < 0 || tr->journal_end > 1024*1024*1024) {\
	panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\
    }\
    if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) {\
	panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\
    }\
    } while(0)
166
167
168
169 //
170 // this isn't a great checksum routine but it will do for now.
171 // we use it to checksum the journal header and the block list
172 // headers that are at the start of each transaction.
173 //
//
// Lightweight rolling checksum used for the journal header and the
// block list headers at the start of each transaction.  Not a strong
// hash -- just good enough to catch torn or garbled on-disk data.
// Returns the bitwise complement of the accumulated sum.
//
static int
calc_checksum(char *ptr, int len)
{
	int cksum = 0;
	int i;
	const unsigned char *p = (const unsigned char *)ptr;

	// shifting mixes earlier bytes upward while the addition keeps
	// every byte influencing the low bits
	for (i = 0; i < len; i++) {
		cksum = (cksum << 8) ^ (cksum + p[i]);
	}

	return (~cksum);
}
186
187 //
188 // Journal Locking
189 //
190 lck_grp_attr_t * jnl_group_attr;
191 lck_attr_t * jnl_lock_attr;
192 lck_grp_t * jnl_mutex_group;
193
194 void
195 journal_init()
196 {
197 jnl_lock_attr = lck_attr_alloc_init();
198 jnl_group_attr = lck_grp_attr_alloc_init();
199 jnl_mutex_group = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
200
201 /* Turn on lock debugging */
202 //lck_attr_setdebug(jnl_lock_attr);
203 }
204
// acquire the main journal mutex (jnl->jlock); serializes the journal
// api entry points against each other
static __inline__ void
lock_journal(journal *jnl)
{
	lck_mtx_lock(&jnl->jlock);
}

// release the main journal mutex
static __inline__ void
unlock_journal(journal *jnl)
{
	lck_mtx_unlock(&jnl->jlock);
}

// acquire the mutex protecting jnl->old_start[] and the tr_freeme list
// (see buffer_flushed_callback / free_old_stuff)
static __inline__ void
lock_oldstart(journal *jnl)
{
	lck_mtx_lock(&jnl->old_start_lock);
}

// release the old_start / tr_freeme mutex
static __inline__ void
unlock_oldstart(journal *jnl)
{
	lck_mtx_unlock(&jnl->old_start_lock);
}
228
229
230
231 #define JNL_WRITE 0x0001
232 #define JNL_READ 0x0002
233 #define JNL_HEADER 0x8000
234
235 //
236 // This function sets up a fake buf and passes it directly to the
237 // journal device strategy routine (so that it won't get cached in
238 // the block cache.
239 //
240 // It also handles range checking the i/o so that we don't write
241 // outside the journal boundaries and it will wrap the i/o back
242 // to the beginning if necessary (skipping over the journal header)
243 //
// Perform a read or write of len bytes at *offset within the journal,
// bypassing the block cache (a private buf is handed straight to the
// device strategy routine).  On return *offset has been advanced past
// the i/o, wrapping back to just after the journal header if it hits
// the end.  Returns the number of bytes transferred, or 0 on error.
static size_t
do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
{
	int err, io_sz=0, curlen=len;
	buf_t bp;
	int max_iosize = 128 * 1024;
	struct vfsioattr ioattr;

	// the starting offset must lie within the journal region
	if (*offset < 0 || *offset > jnl->jhdr->size) {
		panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
	}
	vfs_ioattr(vnode_mount(jnl->jdev), &ioattr);

	// honor the device's max transfer size for the chosen direction
	if (direction & JNL_WRITE)
		max_iosize = ioattr.io_maxwritecnt;
	else if (direction & JNL_READ)
		max_iosize = ioattr.io_maxreadcnt;

  again:
	bp = alloc_io_buf(jnl->jdev, 1);

	// clip this chunk at the end of the journal; the wrap back to the
	// start (skipping the header) happens after the chunk completes
	if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
		if (*offset == jnl->jhdr->size) {
			*offset = jnl->jhdr->jhdr_size;
		} else {
			curlen = (off_t)jnl->jhdr->size - *offset;
		}
	}

	if (curlen > max_iosize) {
		curlen = max_iosize;
	}

	if (curlen <= 0) {
		panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %d\n", curlen, *offset, len);
	}

	// offset 0 is the journal header; refuse to touch it unless the
	// caller explicitly passed JNL_HEADER
	if (*offset == 0 && (direction & JNL_HEADER) == 0) {
		panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
	}

	if (direction & JNL_READ)
		buf_setflags(bp, B_READ);
	else {
		/*
		 * don't have to set any flags
		 */
		vnode_startwrite(jnl->jdev);
	}
	buf_setsize(bp, curlen);
	buf_setcount(bp, curlen);
	buf_setdataptr(bp, (uintptr_t)data);
	// device block numbers are expressed in units of jhdr_size
	buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
	buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));

	err = VNOP_STRATEGY(bp);
	if (!err) {
		err = (int)buf_biowait(bp);
	}
	free_io_buf(bp);

	if (err) {
		printf("jnl: do_jnl_io: strategy err 0x%x\n", err);
		return 0;
	}

	*offset += curlen;
	io_sz   += curlen;
	if (io_sz != len) {
		// handle wrap-around
		data    = (char *)data + curlen;
		curlen  = len - io_sz;
		if (*offset >= jnl->jhdr->size) {
			*offset = jnl->jhdr->jhdr_size;
		}
		goto again;
	}

	return io_sz;
}
324
// read len bytes from the journal at *offset (wrapping as needed);
// returns the number of bytes actually read, 0 on error
static size_t
read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
{
	return do_journal_io(jnl, offset, data, len, JNL_READ);
}
330
// write len bytes to the journal at *offset (wrapping as needed);
// returns the number of bytes actually written, 0 on error
static size_t
write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
{
	return do_journal_io(jnl, offset, data, len, JNL_WRITE);
}
336
337
// read the first len bytes of the journal (the journal header lives
// at offset 0); returns the byte count from do_journal_io
static int
read_journal_header(journal *jnl, void *data, size_t len)
{
	off_t hdr_offset = 0;

	return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER);
}
345
//
// Flush the device's write cache, write out the journal header with a
// freshly computed checksum, then flush again so later transaction
// writes cannot be reordered ahead of the header.  Returns 0 on
// success; -1 if the header write failed (the journal is then marked
// JOURNAL_INVALID).
//
static int
write_journal_header(journal *jnl)
{
	static int num_err_prints = 0;
	int ret;
	off_t jhdr_offset = 0;
	struct vfs_context context;

	context.vc_proc = current_proc();
	context.vc_ucred = NOCRED;
	//
	// XXXdbg note: this ioctl doesn't seem to do anything on firewire disks.
	//
	ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
	if (ret != 0) {
		//
		// Only print this error if it's a different error than the
		// previous one, or if it's the first time for this device
		// or if the total number of printfs is less than 25.  We
		// allow for up to 25 printfs to insure that some make it
		// into the on-disk syslog.  Otherwise if we only printed
		// one, it's possible it would never make it to the syslog
		// for the root volume and that makes debugging hard.
		//
		if (   ret != jnl->last_flush_err
		    || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
		    || num_err_prints++ < 25) {

			printf("jnl: flushing fs disk buffer returned 0x%x\n", ret);

			jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
			jnl->last_flush_err = ret;
		}
	}

	// the checksum is computed over the header with its checksum
	// field zeroed out
	jnl->jhdr->checksum = 0;
	jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header));
	if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != jnl->jhdr->jhdr_size) {
		printf("jnl: write_journal_header: error writing the journal header!\n");
		jnl->flags |= JOURNAL_INVALID;
		return -1;
	}

	// Have to flush after writing the journal header so that
	// a future transaction doesn't sneak out to disk before
	// the header does and thus overwrite data that the old
	// journal header refers to.  Saw this exact case happen
	// on an IDE bus analyzer with Larry Barras so while it
	// may seem obscure, it's not.
	//
	VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);

	return 0;
}
401
402
403
404 //
405 // this is a work function used to free up transactions that
406 // completed. they can't be free'd from buffer_flushed_callback
407 // because it is called from deep with the disk driver stack
408 // and thus can't do something that would potentially cause
409 // paging. it gets called by each of the journal api entry
410 // points so stuff shouldn't hang around for too long.
411 //
412 static void
413 free_old_stuff(journal *jnl)
414 {
415 transaction *tr, *next;
416
417 lock_oldstart(jnl);
418 tr = jnl->tr_freeme;
419 jnl->tr_freeme = NULL;
420 unlock_oldstart(jnl);
421
422 for(; tr; tr=next) {
423 next = tr->next;
424 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
425 }
426
427 }
428
429
430
431 //
432 // This is our callback that lets us know when a buffer has been
433 // flushed to disk. It's called from deep within the driver stack
434 // and thus is quite limited in what it can do. Notably, it can
435 // not initiate any new i/o's or allocate/free memory.
436 //
// Completion callback invoked when a transaction's buffer reaches the
// disk.  Tallies flushed bytes; once the whole transaction is on disk,
// exactly one caller (guarded by the 0xfbadc0de sentinel) marks the
// transaction's slot in old_start[] complete, advances active_start if
// possible, coalesces with neighboring completed transactions, and
// queues fully-merged transactions on tr_freeme (freeing here is not
// allowed -- we're deep in the driver stack).
static void
buffer_flushed_callback(struct buf *bp, void *arg)
{
	transaction  *tr;
	journal      *jnl;
	transaction  *ctr, *prev=NULL, *next;
	int           i, bufsize;


	//printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n",
	//	   bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg);

	// snarf out the bits we want
	bufsize = buf_size(bp);
	tr      = (transaction *)arg;

	// then we've already seen it
	if (tr == NULL) {
		return;
	}

	CHECK_TRANSACTION(tr);

	jnl = tr->jnl;
	if (jnl->flags & JOURNAL_INVALID) {
		return;
	}

	CHECK_JOURNAL(jnl);

	// update the number of blocks that have been flushed.
	// this buf may represent more than one block so take
	// that into account.
	OSAddAtomic(bufsize, &tr->num_flushed);


	// if this transaction isn't done yet, just return as
	// there is nothing to do.
	if ((tr->num_flushed + tr->num_killed) < tr->total_bytes) {
		return;
	}

	// this will single thread checking the transaction
	lock_oldstart(jnl);

	if (tr->total_bytes == 0xfbadc0de) {
		// then someone beat us to it...
		unlock_oldstart(jnl);
		return;
	}

	// mark this so that we're the owner of dealing with the
	// cleanup for this transaction
	tr->total_bytes = 0xfbadc0de;

	//printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
	//   tr, tr->journal_start, tr->journal_end, jnl);

	// find this entry in the old_start[] index and mark it completed
	// (the high bit of an old_start[] entry flags an in-flight tr)
	for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {

		if ((jnl->old_start[i] & ~(0x8000000000000000LL)) == tr->journal_start) {
			jnl->old_start[i] &= ~(0x8000000000000000LL);
			break;
		}
	}
	if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
		panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr 0x%x, jnl 0x%x)\n",
			  tr->journal_start, tr, jnl);
	}
	unlock_oldstart(jnl);


	// if we are here then we need to update the journal header
	// to reflect that this transaction is complete
	if (tr->journal_start == jnl->active_start) {
		jnl->active_start = tr->journal_end;
		tr->journal_start = tr->journal_end = (off_t)0;
	}

	// go through the completed_trs list and try to coalesce
	// entries, restarting back at the beginning if we have to.
	for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) {
		if (ctr->journal_start == jnl->active_start) {
			jnl->active_start = ctr->journal_end;
			if (prev) {
				prev->next = ctr->next;
			}
			if (ctr == jnl->completed_trs) {
				jnl->completed_trs = ctr->next;
			}

			// ctr is now fully reclaimed; queue it for free_old_stuff
			lock_oldstart(jnl);
			next           = jnl->completed_trs;   // this starts us over again
			ctr->next      = jnl->tr_freeme;
			jnl->tr_freeme = ctr;
			ctr            = NULL;
			unlock_oldstart(jnl);
		} else if (tr->journal_end == ctr->journal_start) {
			ctr->journal_start = tr->journal_start;
			next               = jnl->completed_trs;  // this starts us over again
			ctr                = NULL;
			// zeroing journal_start signals below that tr was merged
			tr->journal_start  = tr->journal_end = (off_t)0;
		} else if (tr->journal_start == ctr->journal_end) {
			ctr->journal_end  = tr->journal_end;
			next              = ctr->next;
			tr->journal_start = tr->journal_end = (off_t)0;
		} else {
			next = ctr->next;
		}
	}

	// if this is true then we didn't merge with anyone
	// so link ourselves in at the head of the completed
	// transaction list.
	if (tr->journal_start != 0) {
		// put this entry into the correct sorted place
		// in the list instead of just at the head.
		//

		prev = NULL;
		for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
			// just keep looping
		}

		if (ctr == NULL && prev == NULL) {
			jnl->completed_trs = tr;
			tr->next = NULL;
		} else if (ctr == jnl->completed_trs) {
			tr->next = jnl->completed_trs;
			jnl->completed_trs = tr;
		} else {
			tr->next = prev->next;
			prev->next = tr;
		}
	} else {
		// if we're here this tr got merged with someone else so
		// put it on the list to be free'd
		lock_oldstart(jnl);
		tr->next       = jnl->tr_freeme;
		jnl->tr_freeme = tr;
		unlock_oldstart(jnl);
	}
}
581
582
583 #include <libkern/OSByteOrder.h>
584
585 #define SWAP16(x) OSSwapInt16(x)
586 #define SWAP32(x) OSSwapInt32(x)
587 #define SWAP64(x) OSSwapInt64(x)
588
589
590 static void
591 swap_journal_header(journal *jnl)
592 {
593 jnl->jhdr->magic = SWAP32(jnl->jhdr->magic);
594 jnl->jhdr->endian = SWAP32(jnl->jhdr->endian);
595 jnl->jhdr->start = SWAP64(jnl->jhdr->start);
596 jnl->jhdr->end = SWAP64(jnl->jhdr->end);
597 jnl->jhdr->size = SWAP64(jnl->jhdr->size);
598 jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
599 jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum);
600 jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size);
601 }
602
// Byte-swap a block_list_header and its binfo[] entries in place for
// opposite-endian journal replay.  If num_blocks looks implausible
// after swapping, the binfo[] entries are left untouched so we don't
// run off the end of the buffer.
static void
swap_block_list_header(journal *jnl, block_list_header *blhdr)
{
	int i;

	blhdr->max_blocks = SWAP16(blhdr->max_blocks);
	blhdr->num_blocks = SWAP16(blhdr->num_blocks);
	blhdr->bytes_used = SWAP32(blhdr->bytes_used);
	blhdr->checksum = SWAP32(blhdr->checksum);
	blhdr->pad = SWAP32(blhdr->pad);

	// guard against a garbage num_blocks before touching binfo[]
	if (blhdr->num_blocks * sizeof(blhdr->binfo[0]) > jnl->jhdr->blhdr_size) {
		printf("jnl: blhdr num blocks looks suspicious (%d). not swapping.\n", blhdr->num_blocks);
		return;
	}

	for(i=0; i < blhdr->num_blocks; i++) {
		blhdr->binfo[i].bnum = SWAP64(blhdr->binfo[i].bnum);
		blhdr->binfo[i].bsize = SWAP32(blhdr->binfo[i].bsize);
		// NOTE(review): bp is swapped through a 32-bit int cast --
		// this appears to assume 32-bit pointers; confirm before
		// reusing on a 64-bit build
		blhdr->binfo[i].bp = (void *)SWAP32((int)blhdr->binfo[i].bp);
	}
}
625
626
// Replay helper: copy bsize bytes of journaled data (block_ptr) over
// fs block fs_block on the fs device and write it out synchronously.
// Returns 0 on success, -1 or the VNOP_BWRITE error on failure.
static int
update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
{
	int		ret;
	struct buf *oblock_bp=NULL;

	// first read the block we want.
	ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
	if (ret != 0) {
		printf("jnl: update_fs_block: error reading fs block # %lld! (ret %d)\n", fs_block, ret);

		if (oblock_bp) {
			buf_brelse(oblock_bp);
			oblock_bp = NULL;
		}

		// let's try to be aggressive here and just re-write the block
		oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META);
		if (oblock_bp == NULL) {
			printf("jnl: update_fs_block: buf_getblk() for %lld failed! failing update.\n", fs_block);
			return -1;
		}
	}

	// make sure it's the correct size.
	if (buf_size(oblock_bp) != bsize) {
		buf_brelse(oblock_bp);
		return -1;
	}

	// copy the journal data over top of it
	memcpy((void *)buf_dataptr(oblock_bp), block_ptr, bsize);

	if ((ret = VNOP_BWRITE(oblock_bp)) != 0) {
		printf("jnl: update_fs_block: failed to update block %lld (ret %d)\n", fs_block,ret);
		return ret;
	}

	// and now invalidate it so that if someone else wants to read
	// it in a different size they'll be able to do it.
	// NOTE(review): the return value of this re-read is deliberately
	// not acted on -- if it fails there is simply nothing to invalidate
	ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
	if (oblock_bp) {
		buf_markinvalid(oblock_bp);
		buf_brelse(oblock_bp);
	}

	return 0;
}
675
676 static int
677 grow_table(struct bucket **buf_ptr, int num_buckets, int new_size)
678 {
679 struct bucket *newBuf;
680 int current_size = num_buckets, i;
681
682 // return if newsize is less than the current size
683 if (new_size < num_buckets) {
684 return current_size;
685 }
686
687 if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
688 printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
689 return -1;
690 }
691
692 // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
693
694 // copy existing elements
695 bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));
696
697 // initialize the new ones
698 for(i=num_buckets; i < new_size; i++) {
699 newBuf[i].block_num = (off_t)-1;
700 }
701
702 // free the old container
703 FREE(*buf_ptr, M_TEMP);
704
705 // reset the buf_ptr
706 *buf_ptr = newBuf;
707
708 return new_size;
709 }
710
//
// Binary-search the sorted coalescing table for block_num.  Returns
// the index of the matching entry (the right-most one when there are
// duplicates), or the index at which a new entry for block_num should
// be inserted -- which may equal num_full, meaning "append at end".
//
static int
lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full)
{
	int lo, hi, index, matches, i;

	if (num_full == 0) {
		return 0; // table is empty, so insert at index=0
	}

	lo = 0;
	hi = num_full - 1;
	index = -1;

	// perform binary search for block_num
	do {
		int mid = (hi - lo)/2 + lo;
		off_t this_num = (*buf_ptr)[mid].block_num;

		if (block_num == this_num) {
			index = mid;
			break;
		}

		if (block_num < this_num) {
			// hi is deliberately left at mid (not mid-1) so the loop
			// can exit with lo == hi; that slot is re-checked below
			hi = mid;
			continue;
		}

		if (block_num > this_num) {
			lo = mid + 1;
			continue;
		}
	} while(lo < hi);

	// check if lo and hi converged on the match
	if (block_num == (*buf_ptr)[hi].block_num) {
		index = hi;
	}

	// if no existing entry found, find index for new one
	if (index == -1) {
		index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
	} else {
		// make sure that we return the right-most index in the case of multiple matches
		matches = 0;
		i = index + 1;
		while(i < num_full && block_num == (*buf_ptr)[i].block_num) {
			matches++;
			i++;
		}

		index += matches;
	}

	return index;
}
767
//
// Store the (num, size, offset) triple into slot blk_index of the
// coalescing table.  When overwriting is false, the table is grown if
// full and entries at/after blk_index are shifted right to make room.
// Returns blk_index on success, -1 if the table could not be grown.
//
static int
insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr, int overwriting)
{
	if (!overwriting) {
		// grow the table if we're out of space
		if (*num_full_ptr >= *num_buckets_ptr) {
			int new_size = *num_buckets_ptr * 2;
			int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);

			if (grow_size < new_size) {
				printf("jnl: add_block: grow_table returned an error!\n");
				return -1;
			}

			*num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
		}

		// if we're not inserting at the end, we need to bcopy
		if (blk_index != *num_full_ptr) {
			bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
		}

		(*num_full_ptr)++; // increment only if we're not overwriting
	}

	// sanity check the values we're about to add
	// (a journal offset past the end wraps, skipping over the header)
	if (offset >= jnl->jhdr->size) {
		offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
	}
	// NOTE(review): size is a size_t, so "<= 0" only catches size == 0
	if (size <= 0) {
		panic("jnl: insert_block: bad size in insert_block (%d)\n", size);
	}

	(*buf_ptr)[blk_index].block_num = num;
	(*buf_ptr)[blk_index].block_size = size;
	(*buf_ptr)[blk_index].jnl_offset = offset;

	return blk_index;
}
807
//
// Resolve overlap between a new journal write covering the byte range
// [block_num*jhdr_size, block_num*jhdr_size + size) and the existing
// entries of the coalescing table near blk_index.  The previous entry
// is trimmed (and split in two if it completely spans the new range);
// following entries that are fully covered are removed; partially
// covered ones have their block_num/jnl_offset/block_size advanced
// past the overlap.  Returns 1 if the caller can simply overwrite the
// entry at blk_index, 0 if a new entry must be inserted.
//
static int
do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr)
{
	int num_to_remove, index, i, overwrite, err;
	size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset;
	off_t overlap, block_start, block_end;

	block_start = block_num*jhdr_size;
	block_end = block_start + size;
	overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);

	// first, eliminate any overlap with the previous entry
	if (blk_index != 0 && !overwrite) {
		off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
		off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
		overlap = prev_block_end - block_start;
		if (overlap > 0) {
			if (overlap % jhdr_size != 0) {
				panic("jnl: do_overlap: overlap with previous entry not a multiple of %d\n", jhdr_size);
			}

			// if the previous entry completely overlaps this one, we need to break it into two pieces.
			if (prev_block_end > block_end) {
				off_t new_num = block_end / jhdr_size;
				size_t new_size = prev_block_end - block_end;

				new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);

				err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, num_buckets_ptr, num_full_ptr, 0);
				if (err < 0) {
					panic("jnl: do_overlap: error inserting during pre-overlap\n");
				}
			}

			// Regardless, we need to truncate the previous entry to the beginning of the overlap
			(*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start;
		}
	}

	// then, bail out fast if there's no overlap with the entries that follow
	if (!overwrite && block_end <= (*buf_ptr)[blk_index].block_num*jhdr_size) {
		return 0; // no overlap, no overwrite
	} else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (*buf_ptr)[blk_index+1].block_num*jhdr_size)) {
		return 1; // simple overwrite
	}

	// Otherwise, find all cases of total and partial overlap. We use the special
	// block_num of -2 to designate entries that are completely overlapped and must
	// be eliminated. The block_num, size, and jnl_offset of partially overlapped
	// entries must be adjusted to keep the array consistent.
	index = blk_index;
	num_to_remove = 0;
	while(index < *num_full_ptr && block_end > (*buf_ptr)[index].block_num*jhdr_size) {
		if (block_end >= ((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size)) {
			(*buf_ptr)[index].block_num = -2; // mark this for deletion
			num_to_remove++;
		} else {
			overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
			if (overlap > 0) {
				if (overlap % jhdr_size != 0) {
					panic("jnl: do_overlap: overlap of %lld is not multiple of %d\n", overlap, jhdr_size);
				}

				// if we partially overlap this entry, adjust its block number, jnl offset, and size
				(*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up

				new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
				if (new_offset >= jnl->jhdr->size) {
					new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
				}
				(*buf_ptr)[index].jnl_offset = new_offset;

				(*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
				if ((*buf_ptr)[index].block_size <= 0) {
					panic("jnl: do_overlap: after overlap, new block size is invalid (%d)\n", (*buf_ptr)[index].block_size);
					// return -1; // if above panic is removed, return -1 for error
				}
			}

		}

		index++;
	}

	// bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
	index--; // start with the last index used within the above loop
	while(index >= blk_index) {
		if ((*buf_ptr)[index].block_num == -2) {
			if (index == *num_full_ptr-1) {
				(*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
			} else {
				bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
			}
			(*num_full_ptr)--;
		}
		index--;
	}

	// eliminate any stale entries at the end of the table
	for(i=*num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
		(*buf_ptr)[i].block_num = -1;
	}

	return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
}
913
914 // PR-3105942: Coalesce writes to the same block in journal replay
915 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
916 // to be replayed and the corresponding location in the journal which contains
917 // the most recent data for those blocks. The array is "played" once the all the
918 // blocks in the journal have been coalesced. The code for the case of conflicting/
919 // overlapping writes to a single block is the most dense. Because coalescing can
920 // disrupt the existing time-ordering of blocks in the journal playback, care
921 // is taken to catch any overlaps and keep the array consistent.
922 static int
923 add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr)
924 {
925 int blk_index, overwriting;
926
927 // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
928 // inserted (or the index of the elem to overwrite).
929 blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);
930
931 // check if the index is within bounds (if we're adding this block to the end of
932 // the table, blk_index will be equal to num_full)
933 if (blk_index < 0 || blk_index > *num_full_ptr) {
934 //printf("jnl: add_block: trouble adding block to co_buf\n");
935 return -1;
936 } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
937
938 // Determine whether we're overwriting an existing entry by checking for overlap
939 overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, num_buckets_ptr, num_full_ptr);
940 if (overwriting < 0) {
941 return -1; // if we got an error, pass it along
942 }
943
944 // returns the index, or -1 on error
945 blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, num_buckets_ptr, num_full_ptr, overwriting);
946
947 return blk_index;
948 }
949
950 static int
951 replay_journal(journal *jnl)
952 {
953 int i, ret, orig_checksum, checksum, max_bsize;
954 block_list_header *blhdr;
955 off_t offset;
956 char *buff, *block_ptr=NULL;
957 struct bucket *co_buf;
958 int num_buckets = STARTING_BUCKETS, num_full;
959
960 // wrap the start ptr if it points to the very end of the journal
961 if (jnl->jhdr->start == jnl->jhdr->size) {
962 jnl->jhdr->start = jnl->jhdr->jhdr_size;
963 }
964 if (jnl->jhdr->end == jnl->jhdr->size) {
965 jnl->jhdr->end = jnl->jhdr->jhdr_size;
966 }
967
968 if (jnl->jhdr->start == jnl->jhdr->end) {
969 return 0;
970 }
971
972 // allocate memory for the header_block. we'll read each blhdr into this
973 if (kmem_alloc(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) {
974 printf("jnl: replay_journal: no memory for block buffer! (%d bytes)\n",
975 jnl->jhdr->blhdr_size);
976 return -1;
977 }
978
979 // allocate memory for the coalesce buffer
980 if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
981 printf("jnl: replay_journal: no memory for coalesce buffer!\n");
982 return -1;
983 }
984
985 // initialize entries
986 for(i=0; i < num_buckets; i++) {
987 co_buf[i].block_num = -1;
988 }
989 num_full = 0; // empty at first
990
991
992 printf("jnl: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
993 jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);
994
995 while(jnl->jhdr->start != jnl->jhdr->end) {
996 offset = jnl->jhdr->start;
997 ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
998 if (ret != jnl->jhdr->blhdr_size) {
999 printf("jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset);
1000 goto bad_replay;
1001 }
1002
1003 blhdr = (block_list_header *)buff;
1004
1005 orig_checksum = blhdr->checksum;
1006 blhdr->checksum = 0;
1007 if (jnl->flags & JOURNAL_NEED_SWAP) {
1008 // calculate the checksum based on the unswapped data
1009 // because it is done byte-at-a-time.
1010 orig_checksum = SWAP32(orig_checksum);
1011 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1012 swap_block_list_header(jnl, blhdr);
1013 } else {
1014 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1015 }
1016 if (checksum != orig_checksum) {
1017 printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
1018 offset, orig_checksum, checksum);
1019 goto bad_replay;
1020 }
1021 if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048
1022 || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
1023 printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
1024 blhdr->max_blocks, blhdr->num_blocks);
1025 goto bad_replay;
1026 }
1027
1028 for(i=1; i < blhdr->num_blocks; i++) {
1029 if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
1030 printf("jnl: replay_journal: bogus block number 0x%llx\n", blhdr->binfo[i].bnum);
1031 goto bad_replay;
1032 }
1033 }
1034
1035 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1036 // blhdr->num_blocks-1, jnl->jhdr->start);
1037 for(i=1; i < blhdr->num_blocks; i++) {
1038 int size, ret_val;
1039 off_t number;
1040
1041 size = blhdr->binfo[i].bsize;
1042 number = blhdr->binfo[i].bnum;
1043
1044 // don't add "killed" blocks
1045 if (number == (off_t)-1) {
1046 //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1047 } else {
1048 // add this bucket to co_buf, coalescing where possible
1049 // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1050 ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, &num_buckets, &num_full);
1051
1052 if (ret_val == -1) {
1053 printf("jnl: replay_journal: trouble adding block to co_buf\n");
1054 goto bad_replay;
1055 } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1056 }
1057
1058 // increment offset
1059 offset += size;
1060
1061 // check if the last block added puts us off the end of the jnl.
1062 // if so, we need to wrap to the beginning and take any remainder
1063 // into account
1064 //
1065 if (offset >= jnl->jhdr->size) {
1066 offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
1067 }
1068 }
1069
1070
1071 jnl->jhdr->start += blhdr->bytes_used;
1072 if (jnl->jhdr->start >= jnl->jhdr->size) {
1073 // wrap around and skip the journal header block
1074 jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
1075 }
1076 }
1077
1078
1079 //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1080
1081 /*
1082 * make sure it's at least one page in size, so
1083 * start max_bsize at PAGE_SIZE
1084 */
1085 for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {
1086
1087 if (co_buf[i].block_num == (off_t)-1)
1088 continue;
1089
1090 if (co_buf[i].block_size > max_bsize)
1091 max_bsize = co_buf[i].block_size;
1092 }
1093 /*
1094 * round max_bsize up to the nearest PAGE_SIZE multiple
1095 */
1096 if (max_bsize & (PAGE_SIZE - 1)) {
1097 max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1098 }
1099
1100 if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1101 goto bad_replay;
1102 }
1103
1104 // Replay the coalesced entries in the co-buf
1105 for(i=0; i < num_full; i++) {
1106 size_t size = co_buf[i].block_size;
1107 off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
1108 off_t number = co_buf[i].block_num;
1109
1110
1111 // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1112 // co_buf[i].block_size, co_buf[i].jnl_offset);
1113
1114 if (number == (off_t)-1) {
1115 // printf("jnl: replay_journal: skipping killed fs block\n");
1116 } else {
1117
1118 // do journal read, and set the phys. block
1119 ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
1120 if (ret != size) {
1121 printf("jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset);
1122 goto bad_replay;
1123 }
1124
1125 if (update_fs_block(jnl, block_ptr, number, size) != 0) {
1126 goto bad_replay;
1127 }
1128 }
1129 }
1130
1131
1132 // done replaying; update jnl header
1133 if (write_journal_header(jnl) != 0) {
1134 goto bad_replay;
1135 }
1136
1137 // free block_ptr
1138 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1139 block_ptr = NULL;
1140
1141 // free the coalesce buffer
1142 FREE(co_buf, M_TEMP);
1143 co_buf = NULL;
1144
1145 kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1146 return 0;
1147
1148 bad_replay:
1149 if (block_ptr) {
1150 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1151 }
1152 if (co_buf) {
1153 FREE(co_buf, M_TEMP);
1154 }
1155 kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1156
1157 return -1;
1158 }
1159
1160
1161 #define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024)
1162 //#define DEFAULT_TRANSACTION_BUFFER_SIZE (256*1024) // better performance but uses more mem
1163 #define MAX_TRANSACTION_BUFFER_SIZE (512*1024)
1164
1165 // XXXdbg - so I can change it in the debugger
1166 int def_tbuffer_size = 0;
1167
1168
1169 //
1170 // This function sets the size of the tbuffer and the
1171 // size of the blhdr. It assumes that jnl->jhdr->size
1172 // and jnl->jhdr->jhdr_size are already valid.
1173 //
1174 static void
1175 size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
1176 {
1177 //
1178 // one-time initialization based on how much memory
1179 // there is in the machine.
1180 //
1181 if (def_tbuffer_size == 0) {
1182 if (mem_size < (256*1024*1024)) {
1183 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
1184 } else if (mem_size < (512*1024*1024)) {
1185 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
1186 } else if (mem_size < (1024*1024*1024)) {
1187 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
1188 } else if (mem_size >= (1024*1024*1024)) {
1189 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 4;
1190 }
1191 }
1192
1193 // size up the transaction buffer... can't be larger than the number
1194 // of blocks that can fit in a block_list_header block.
1195 if (tbuffer_size == 0) {
1196 jnl->tbuffer_size = def_tbuffer_size;
1197 } else {
1198 // make sure that the specified tbuffer_size isn't too small
1199 if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
1200 tbuffer_size = jnl->jhdr->blhdr_size * 2;
1201 }
1202 // and make sure it's an even multiple of the block size
1203 if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
1204 tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
1205 }
1206
1207 jnl->tbuffer_size = tbuffer_size;
1208 }
1209
1210 if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
1211 jnl->tbuffer_size = (jnl->jhdr->size / 2);
1212 }
1213
1214 if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
1215 jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
1216 }
1217
1218 jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
1219 if (jnl->jhdr->blhdr_size < phys_blksz) {
1220 jnl->jhdr->blhdr_size = phys_blksz;
1221 } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
1222 // have to round up so we're an even multiple of the physical block size
1223 jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
1224 }
1225 }
1226
1227
1228
//
// Create a brand-new, empty journal on device `jvp' starting at byte
// position `offset', `journal_size' bytes long.  Initializes the
// on-disk journal header and writes it out.  Returns an opaque journal
// pointer on success, NULL on any failure.
//
// `flush'/`arg' are an optional callback the journal layer invokes when
// it needs the fs to push buffers (see check_free_space/journal_close).
//
journal *
journal_create(struct vnode *jvp,
	       off_t offset,
	       off_t journal_size,
	       struct vnode *fsvp,
	       size_t min_fs_blksz,
	       int32_t flags,
	       int32_t tbuffer_size,
	       void (*flush)(void *arg),
	       void *arg)
{
	journal *jnl;
	int phys_blksz;
	struct vfs_context context;

	context.vc_proc = current_proc();
	context.vc_ucred = FSCRED;

	/* Get the real physical block size. */
	if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
		return NULL;
	}

	// journal i/o is done in fs-block-sized chunks, so the device block
	// must not be bigger than the smallest fs block
	if (phys_blksz > min_fs_blksz) {
		printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
			   phys_blksz, min_fs_blksz);
		return NULL;
	}

	if ((journal_size % phys_blksz) != 0) {
		printf("jnl: create: journal size 0x%llx is not an even multiple of block size 0x%x\n",
			   journal_size, phys_blksz);
		return NULL;
	}

	// M_WAITOK: this allocation blocks rather than returning NULL
	MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
	memset(jnl, 0, sizeof(*jnl));

	jnl->jdev = jvp;
	jnl->jdev_offset = offset;
	jnl->fsdev = fsvp;
	jnl->flush = flush;
	jnl->flush_arg = arg;
	jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
	lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);

	// the header buffer doubles as the in-memory copy of the on-disk header
	if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
		printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz);
		goto bad_kmem_alloc;
	}

	memset(jnl->header_buf, 0, phys_blksz);

	jnl->jhdr = (journal_header *)jnl->header_buf;
	jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
	jnl->jhdr->endian = ENDIAN_MAGIC;
	jnl->jhdr->start = phys_blksz;    // start at block #1, block #0 is for the jhdr itself
	jnl->jhdr->end = phys_blksz;
	jnl->jhdr->size = journal_size;
	jnl->jhdr->jhdr_size = phys_blksz;
	size_up_tbuffer(jnl, tbuffer_size, phys_blksz);

	jnl->active_start = jnl->jhdr->start;

	// XXXdbg - for testing you can force the journal to wrap around
	// jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
	// jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);

	lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);

	// persist the freshly-initialized (empty) header
	if (write_journal_header(jnl) != 0) {
		printf("jnl: journal_create: failed to write journal header.\n");
		goto bad_write;
	}

	return jnl;


  bad_write:
	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
  bad_kmem_alloc:
	jnl->jhdr = NULL;
	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
	return NULL;
}
1314
1315
//
// Open an existing journal on device `jvp', validate its header
// (including byte-order and checksum), replay any committed
// transactions into the file system, and return an opaque journal
// pointer.  Returns NULL on any validation or replay failure.
//
// NOTE(review): several error messages below say "jnl: create:" --
// apparently copy-pasted from journal_create; the code path is open.
//
journal *
journal_open(struct vnode *jvp,
	     off_t offset,
	     off_t journal_size,
	     struct vnode *fsvp,
	     size_t min_fs_blksz,
	     int32_t flags,
	     int32_t tbuffer_size,
	     void (*flush)(void *arg),
	     void *arg)
{
	journal *jnl;
	int orig_blksz=0, phys_blksz;
	int orig_checksum, checksum;
	struct vfs_context context;

	context.vc_proc = current_proc();
	context.vc_ucred = FSCRED;

	/* Get the real physical block size. */
	if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
		return NULL;
	}

	if (phys_blksz > min_fs_blksz) {
		printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
			   phys_blksz, min_fs_blksz);
		return NULL;
	}

	if ((journal_size % phys_blksz) != 0) {
		printf("jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
			   journal_size, phys_blksz);
		return NULL;
	}

	// M_WAITOK: blocks rather than returning NULL
	MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
	memset(jnl, 0, sizeof(*jnl));

	jnl->jdev = jvp;
	jnl->jdev_offset = offset;
	jnl->fsdev = fsvp;
	jnl->flush = flush;
	jnl->flush_arg = arg;
	jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
	lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);

	if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
		printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz);
		goto bad_kmem_alloc;
	}

	jnl->jhdr = (journal_header *)jnl->header_buf;
	memset(jnl->jhdr, 0, sizeof(journal_header)+4);

	// we have to set this up here so that do_journal_io() will work
	jnl->jhdr->jhdr_size = phys_blksz;

	// pull the on-disk header into jnl->jhdr
	if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
		printf("jnl: open: could not read %d bytes for the journal header.\n",
			   phys_blksz);
		goto bad_journal;
	}

	// checksum is computed with the checksum field zeroed, so save & clear it
	orig_checksum = jnl->jhdr->checksum;
	jnl->jhdr->checksum = 0;

	if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
		// journal was written on the other endianness; remember that
		// so block-list headers get swapped too during replay.
		// do this before the swap since it's done byte-at-a-time
		orig_checksum = SWAP32(orig_checksum);
		checksum = calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header));
		swap_journal_header(jnl);
		jnl->flags |= JOURNAL_NEED_SWAP;
	} else {
		checksum = calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header));
	}

	if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
		printf("jnl: open: journal magic is bad (0x%x != 0x%x)\n",
			   jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
		goto bad_journal;
	}

	// only check if we're the current journal header magic value
	if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {

		if (orig_checksum != checksum) {
			printf("jnl: open: journal checksum is bad (0x%x != 0x%x)\n",
				   orig_checksum, checksum);

			// NOTE: checksum mismatch is logged but deliberately NOT fatal here
			//goto bad_journal;
		}
	}

	// XXXdbg - convert old style magic numbers to the new one
	if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
		jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
	}

	// if the journal was written with a different device block size,
	// temporarily switch the device to that size for the replay;
	// orig_blksz != 0 marks that we must switch back afterwards.
	if (phys_blksz != jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
		printf("jnl: open: phys_blksz %d does not match journal header size %d\n",
			   phys_blksz, jnl->jhdr->jhdr_size);

		orig_blksz = phys_blksz;
		phys_blksz = jnl->jhdr->jhdr_size;
		if (VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context)) {
			printf("jnl: could not set block size to %d bytes.\n", phys_blksz);
			goto bad_journal;
		}
		// goto bad_journal;
	}

	// sanity-check start/end/size (1GB is an arbitrary upper sanity bound)
	if (   jnl->jhdr->start <= 0
		|| jnl->jhdr->start > jnl->jhdr->size
		|| jnl->jhdr->start > 1024*1024*1024) {
		printf("jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
			   jnl->jhdr->start, jnl->jhdr->size);
		goto bad_journal;
	}

	if (   jnl->jhdr->end <= 0
		|| jnl->jhdr->end > jnl->jhdr->size
		|| jnl->jhdr->end > 1024*1024*1024) {
		printf("jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
			   jnl->jhdr->end, jnl->jhdr->size);
		goto bad_journal;
	}

	if (jnl->jhdr->size > 1024*1024*1024) {
		printf("jnl: open: jhdr size looks bad (0x%llx)\n", jnl->jhdr->size);
		goto bad_journal;
	}

	// XXXdbg - can't do these checks because hfs writes all kinds of
	//          non-uniform sized blocks even on devices that have a block size
	//          that is larger than 512 bytes (i.e. optical media w/2k blocks).
	//          therefore these checks will fail and so we just have to punt and
	//          do more relaxed checking...
	// XXXdbg    if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
	if ((jnl->jhdr->start % 512) != 0) {
		printf("jnl: open: journal start (0x%llx) not a multiple of 512?\n",
			   jnl->jhdr->start);
		goto bad_journal;
	}

	//XXXdbg	if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
	if ((jnl->jhdr->end % 512) != 0) {
		printf("jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
			   jnl->jhdr->end, jnl->jhdr->jhdr_size);
		goto bad_journal;
	}

	// take care of replaying the journal if necessary
	if (flags & JOURNAL_RESET) {
		// caller asked to discard pending transactions instead of replaying
		printf("jnl: journal start/end pointers reset! (jnl 0x%x; s 0x%llx e 0x%llx)\n",
			   jnl, jnl->jhdr->start, jnl->jhdr->end);
		jnl->jhdr->start = jnl->jhdr->end;
	} else if (replay_journal(jnl) != 0) {
		printf("jnl: journal_open: Error replaying the journal!\n");
		goto bad_journal;
	}

	// restore the device's original block size if we changed it above
	if (orig_blksz != 0) {
		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
		phys_blksz = orig_blksz;
		if (orig_blksz < jnl->jhdr->jhdr_size) {
			printf("jnl: open: jhdr_size is %d but orig phys blk size is %d.  switching.\n",
				   jnl->jhdr->jhdr_size, orig_blksz);
				   
			jnl->jhdr->jhdr_size = orig_blksz;
		}
	}

	// make sure this is in sync!
	jnl->active_start = jnl->jhdr->start;

	// set this now, after we've replayed the journal
	size_up_tbuffer(jnl, tbuffer_size, phys_blksz);

	lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);

	return jnl;

  bad_journal:
	// undo a device block-size switch before freeing anything
	if (orig_blksz != 0) {
		phys_blksz = orig_blksz;
		VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
	}
	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
  bad_kmem_alloc:
	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
	return NULL;
}
1509
//
// Shut down a journal: finish/flush any in-flight transactions (or
// abort them if the journal has gone invalid), persist the final
// header, and free all resources.  The journal pointer is invalid
// after this returns.
//
void
journal_close(journal *jnl)
{
	volatile off_t *start, *end;
	int counter=0;

	CHECK_JOURNAL(jnl);

	// set this before doing anything that would block so that
	// we start tearing things down properly.
	//
	jnl->flags |= JOURNAL_CLOSE_PENDING;

	// take the journal lock unless this thread already holds it
	// (i.e. we're being closed from within an active transaction)
	if (jnl->owner != current_thread()) {
		lock_journal(jnl);
	}

	//
	// only write stuff to disk if the journal is still valid
	//
	if ((jnl->flags & JOURNAL_INVALID) == 0) {

		if (jnl->active_tr) {
			journal_end_transaction(jnl);
		}
		
		// flush any buffered transactions
		if (jnl->cur_tr) {
			transaction *tr = jnl->cur_tr;

			jnl->cur_tr = NULL;
			end_transaction(tr, 1);   // force it to get flushed
		}

		//start = &jnl->jhdr->start;
		start = &jnl->active_start;
		end = &jnl->jhdr->end;

		// spin (bounded at 500 iterations, sleeping 1 tick each) until
		// the fs flush callback has drained everything in the journal
		while (*start != *end && counter++ < 500) {
			printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
			if (jnl->flush) {
				jnl->flush(jnl->flush_arg);
			}
			tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 1);
		}

		if (*start != *end) {
			printf("jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
				   *start, *end);
		}

		// make sure this is in sync when we close the journal
		jnl->jhdr->start = jnl->active_start;

		// if this fails there's not much we can do at this point...
		write_journal_header(jnl);
	} else {
		// if we're here the journal isn't valid any more.
		// so make sure we don't leave any locked blocks lying around
		printf("jnl: close: journal 0x%x, is invalid.  aborting outstanding transactions\n", jnl);
		if (jnl->active_tr || jnl->cur_tr) {
			transaction *tr;
			if (jnl->active_tr) {
				tr = jnl->active_tr;
				jnl->active_tr = NULL;
			} else {
				tr = jnl->cur_tr;
				jnl->cur_tr = NULL;
			}

			abort_transaction(jnl, tr);
			if (jnl->active_tr || jnl->cur_tr) {
				panic("jnl: close: jnl @ 0x%x had both an active and cur tr\n", jnl);
			}
		}
	}

	// release any transactions waiting in the old_start table
	free_old_stuff(jnl);

	kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size);
	jnl->jhdr = (void *)0xbeefbabe;    // poison pointer to catch use-after-close

	FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
}
1594
1595 static void
1596 dump_journal(journal *jnl)
1597 {
1598 transaction *ctr;
1599
1600 printf("journal:");
1601 printf(" jdev_offset %.8llx\n", jnl->jdev_offset);
1602 printf(" magic: 0x%.8x\n", jnl->jhdr->magic);
1603 printf(" start: 0x%.8llx\n", jnl->jhdr->start);
1604 printf(" end: 0x%.8llx\n", jnl->jhdr->end);
1605 printf(" size: 0x%.8llx\n", jnl->jhdr->size);
1606 printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size);
1607 printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size);
1608 printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum);
1609
1610 printf(" completed transactions:\n");
1611 for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) {
1612 printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
1613 }
1614 }
1615
1616
1617
1618 static off_t
1619 free_space(journal *jnl)
1620 {
1621 off_t free_space;
1622
1623 if (jnl->jhdr->start < jnl->jhdr->end) {
1624 free_space = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
1625 } else if (jnl->jhdr->start > jnl->jhdr->end) {
1626 free_space = jnl->jhdr->start - jnl->jhdr->end;
1627 } else {
1628 // journal is completely empty
1629 free_space = jnl->jhdr->size - jnl->jhdr->jhdr_size;
1630 }
1631
1632 return free_space;
1633 }
1634
1635
1636 //
1637 // The journal must be locked on entry to this function.
1638 // The "desired_size" is in bytes.
1639 //
//
// The journal must be locked on entry to this function.
// The "desired_size" is in bytes.
//
// Block until at least desired_size bytes are free in the journal,
// lazily advancing jhdr->start past completed transactions recorded
// in jnl->old_start[].  Returns 0 on success, ENOSPC if space never
// frees up after ~7500 iterations (panics with a journal dump at 5000).
//
static int
check_free_space(journal *jnl, int desired_size)
{
	int i, counter=0;

	//printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
	//	   desired_size, free_space(jnl));
	
	while (1) {
		int old_start_empty;
		
		if (counter++ == 5000) {
			dump_journal(jnl);
			panic("jnl: check_free_space: buffer flushing isn't working "
				  "(jnl @ 0x%x s %lld e %lld f %lld [active start %lld]).\n", jnl,
				  jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
		}
		if (counter > 7500) {
			printf("jnl: check_free_space: giving up waiting for free space.\n");
			return ENOSPC;
		}

		// make sure there's space in the journal to hold this transaction
		if (free_space(jnl) > desired_size) {
			break;
		}

		//
		// here's where we lazily bump up jnl->jhdr->start.  we'll consume
		// entries until there is enough space for the next transaction.
		//
		old_start_empty = 1;
		lock_oldstart(jnl);
		for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
			// NOTE: this inner `counter' intentionally shadows the outer one
			int counter;

			counter = 0;
			// the high bit apparently marks a transaction whose flush is
			// still in progress -- wait (dropping the lock) until it clears.
			// TODO(review): confirm marker semantics against free_old_stuff.
			while (jnl->old_start[i] & 0x8000000000000000LL) {
				if (counter++ > 100) {
					panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl 0x%x).\n",
						  jnl->old_start[i], jnl);
				}
				
				unlock_oldstart(jnl);
				if (jnl->flush) {
					jnl->flush(jnl->flush_arg);
				}
				tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
				lock_oldstart(jnl);
			}

			if (jnl->old_start[i] == 0) {
				continue;
			}

			// reclaim this completed transaction's space by moving
			// jhdr->start forward to where it began
			old_start_empty = 0;
			jnl->jhdr->start = jnl->old_start[i];
			jnl->old_start[i] = 0;
			if (free_space(jnl) > desired_size) {
				// got enough -- persist the new start before returning
				// (header write is done without holding the oldstart lock)
				unlock_oldstart(jnl);
				write_journal_header(jnl);
				lock_oldstart(jnl);
				break;
			}
		}
		unlock_oldstart(jnl);
		
		// if we bumped the start, loop and try again
		if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
			continue;
		} else if (old_start_empty) {
			//
			// if there is nothing in old_start anymore then we can
			// bump the jhdr->start to be the same as active_start
			// since it is possible there was only one very large
			// transaction in the old_start array.  if we didn't do
			// this then jhdr->start would never get updated and we
			// would wind up looping until we hit the panic at the
			// start of the loop.
			//
			jnl->jhdr->start = jnl->active_start;
			write_journal_header(jnl);
			continue;
		}


		// if the file system gave us a flush function, call it to so that
		// it can flush some blocks which hopefully will cause some transactions
		// to complete and thus free up space in the journal.
		if (jnl->flush) {
			jnl->flush(jnl->flush_arg);
		}

		// wait for a while to avoid being cpu-bound (this will
		// put us to sleep for 10 milliseconds)
		tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
	}

	return 0;
}
1740
//
// Begin a transaction.  Nested calls from the thread that already owns
// the journal just bump a nesting count; otherwise this takes the
// journal lock, reserves space, and sets up a transaction buffer
// (reusing a previously buffered one if available).
//
// Returns 0 on success, EINVAL if the journal is invalid, ENOSPC if
// space can't be freed, ENOMEM if the tbuffer allocation fails.
//
int
journal_start_transaction(journal *jnl)
{
	int ret;
	transaction *tr;

	CHECK_JOURNAL(jnl);

	if (jnl->flags & JOURNAL_INVALID) {
		return EINVAL;
	}

	// nested transaction: same thread already owns the journal
	if (jnl->owner == current_thread()) {
		if (jnl->active_tr == NULL) {
			panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_thread 0x%x\n",
				  jnl, jnl->owner, current_thread());
		}
		jnl->nested_count++;
		return 0;
	}

	lock_journal(jnl);

	// holding the lock, the journal must be in a clean idle state
	if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
		panic("jnl: start_tr: owner 0x%x, nested count 0x%x, active_tr 0x%x jnl @ 0x%x\n",
			  jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
	}

	jnl->owner = current_thread();
	jnl->nested_count = 1;

	free_old_stuff(jnl);

	// make sure there's room in the journal
	if (check_free_space(jnl, jnl->tbuffer_size) != 0) {
		printf("jnl: start transaction failed: no space\n");
		ret = ENOSPC;
		goto bad_start;
	}

	// if there's a buffered transaction, use it.
	// NOTE: returns with the journal lock still held -- the matching
	// journal_end_transaction releases it.
	if (jnl->cur_tr) {
		jnl->active_tr = jnl->cur_tr;
		jnl->cur_tr = NULL;

		return 0;
	}

	// otherwise build a fresh transaction (M_WAITOK: blocks, never NULL)
	MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
	memset(tr, 0, sizeof(transaction));

	tr->tbuffer_size = jnl->tbuffer_size;

	if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
		FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
		printf("jnl: start transaction failed: no tbuffer mem\n");
		ret = ENOMEM;
		goto bad_start;
	}

	// journal replay code checksum check depends on this.
	memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);

	// the block-list header lives at the front of the tbuffer
	tr->blhdr = (block_list_header *)tr->tbuffer;
	tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
	tr->blhdr->num_blocks = 1;      // accounts for this header block
	tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;

	tr->num_blhdrs = 1;
	tr->total_bytes = jnl->jhdr->blhdr_size;
	tr->jnl = jnl;

	jnl->active_tr = tr;

	// printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, tr);

	return 0;

  bad_start:
	jnl->owner = NULL;
	jnl->nested_count = 0;
	unlock_journal(jnl);
	return ret;
}
1825
1826
1827 int
1828 journal_modify_block_start(journal *jnl, struct buf *bp)
1829 {
1830 transaction *tr;
1831
1832 CHECK_JOURNAL(jnl);
1833
1834 if (jnl->flags & JOURNAL_INVALID) {
1835 return EINVAL;
1836 }
1837
1838 // XXXdbg - for debugging I want this to be true. later it may
1839 // not be necessary.
1840 if ((buf_flags(bp) & B_META) == 0) {
1841 panic("jnl: modify_block_start: bp @ 0x%x is not a meta-data block! (jnl 0x%x)\n", bp, jnl);
1842 }
1843
1844 tr = jnl->active_tr;
1845 CHECK_TRANSACTION(tr);
1846
1847 if (jnl->owner != current_thread()) {
1848 panic("jnl: modify_block_start: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1849 jnl, jnl->owner, current_thread());
1850 }
1851
1852 free_old_stuff(jnl);
1853
1854 //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
1855 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
1856
1857 // can't allow blocks that aren't an even multiple of the
1858 // underlying block size.
1859 if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) {
1860 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
1861 buf_size(bp), jnl->jhdr->jhdr_size);
1862 return -1;
1863 }
1864
1865 // make sure that this transaction isn't bigger than the whole journal
1866 if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
1867 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr 0x%x bp 0x%x)\n",
1868 tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp);
1869 return -1;
1870 }
1871
1872 // if the block is dirty and not already locked we have to write
1873 // it out before we muck with it because it has data that belongs
1874 // (presumably) to another transaction.
1875 //
1876 if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) {
1877
1878 if (buf_flags(bp) & B_ASYNC) {
1879 panic("modify_block_start: bp @ 0x% has async flag set!\n", bp);
1880 }
1881
1882 // this will cause it to not be buf_brelse()'d
1883 buf_setflags(bp, B_NORELSE);
1884 VNOP_BWRITE(bp);
1885 }
1886 buf_setflags(bp, B_LOCKED);
1887
1888 return 0;
1889 }
1890
1891 int
1892 journal_modify_block_abort(journal *jnl, struct buf *bp)
1893 {
1894 transaction *tr;
1895 block_list_header *blhdr;
1896 int i, j;
1897
1898 CHECK_JOURNAL(jnl);
1899
1900 tr = jnl->active_tr;
1901
1902 //
1903 // if there's no active transaction then we just want to
1904 // call buf_brelse() and return since this is just a block
1905 // that happened to be modified as part of another tr.
1906 //
1907 if (tr == NULL) {
1908 buf_brelse(bp);
1909 return 0;
1910 }
1911
1912 if (jnl->flags & JOURNAL_INVALID) {
1913 return EINVAL;
1914 }
1915
1916 CHECK_TRANSACTION(tr);
1917
1918 if (jnl->owner != current_thread()) {
1919 panic("jnl: modify_block_abort: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1920 jnl, jnl->owner, current_thread());
1921 }
1922
1923 free_old_stuff(jnl);
1924
1925 // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
1926
1927 // first check if it's already part of this transaction
1928 for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
1929 for(i=1; i < blhdr->num_blocks; i++) {
1930 if (bp == blhdr->binfo[i].bp) {
1931 if (buf_size(bp) != blhdr->binfo[i].bsize) {
1932 panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
1933 bp, buf_size(bp), blhdr->binfo[i].bsize, jnl);
1934 }
1935 break;
1936 }
1937 }
1938
1939 if (i < blhdr->num_blocks) {
1940 break;
1941 }
1942 }
1943
1944 //
1945 // if blhdr is null, then this block has only had modify_block_start
1946 // called on it as part of the current transaction. that means that
1947 // it is ok to clear the LOCKED bit since it hasn't actually been
1948 // modified. if blhdr is non-null then modify_block_end was called
1949 // on it and so we need to keep it locked in memory.
1950 //
1951 if (blhdr == NULL) {
1952 buf_clearflags(bp, B_LOCKED);
1953 }
1954
1955 buf_brelse(bp);
1956 return 0;
1957 }
1958
1959
1960 int
1961 journal_modify_block_end(journal *jnl, struct buf *bp)
1962 {
1963 int i, j, tbuffer_offset;
1964 char *blkptr;
1965 block_list_header *blhdr, *prev=NULL;
1966 transaction *tr;
1967
1968 CHECK_JOURNAL(jnl);
1969
1970 if (jnl->flags & JOURNAL_INVALID) {
1971 return EINVAL;
1972 }
1973
1974 tr = jnl->active_tr;
1975 CHECK_TRANSACTION(tr);
1976
1977 if (jnl->owner != current_thread()) {
1978 panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1979 jnl, jnl->owner, current_thread());
1980 }
1981
1982 free_old_stuff(jnl);
1983
1984 //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n",
1985 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
1986
1987 if ((buf_flags(bp) & B_LOCKED) == 0) {
1988 panic("jnl: modify_block_end: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
1989 }
1990
1991 // first check if it's already part of this transaction
1992 for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
1993 tbuffer_offset = jnl->jhdr->blhdr_size;
1994
1995 for(i=1; i < blhdr->num_blocks; i++) {
1996 if (bp == blhdr->binfo[i].bp) {
1997 if (buf_size(bp) != blhdr->binfo[i].bsize) {
1998 panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
1999 bp, buf_size(bp), blhdr->binfo[i].bsize, jnl);
2000 }
2001 break;
2002 }
2003 tbuffer_offset += blhdr->binfo[i].bsize;
2004 }
2005
2006 if (i < blhdr->num_blocks) {
2007 break;
2008 }
2009 }
2010
2011 if (blhdr == NULL
2012 && prev
2013 && (prev->num_blocks+1) <= prev->max_blocks
2014 && (prev->bytes_used+buf_size(bp)) <= tr->tbuffer_size) {
2015 blhdr = prev;
2016 } else if (blhdr == NULL) {
2017 block_list_header *nblhdr;
2018
2019 if (prev == NULL) {
2020 panic("jnl: modify block end: no way man, prev == NULL?!?, jnl 0x%x, bp 0x%x\n", jnl, bp);
2021 }
2022
2023 // we got to the end of the list, didn't find the block and there's
2024 // no room in the block_list_header pointed to by prev
2025
2026 // we allocate another tbuffer and link it in at the end of the list
2027 // through prev->binfo[0].bnum. that's a skanky way to do things but
2028 // avoids having yet another linked list of small data structures to manage.
2029
2030 if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
2031 panic("jnl: end_tr: no space for new block tr @ 0x%x (total bytes: %d)!\n",
2032 tr, tr->total_bytes);
2033 }
2034
2035 // journal replay code checksum check depends on this.
2036 memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
2037
2038 // initialize the new guy
2039 nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2040 nblhdr->num_blocks = 1; // accounts for this header block
2041 nblhdr->bytes_used = jnl->jhdr->blhdr_size;
2042
2043 tr->num_blhdrs++;
2044 tr->total_bytes += jnl->jhdr->blhdr_size;
2045
2046 // then link him in at the end
2047 prev->binfo[0].bnum = (off_t)((long)nblhdr);
2048
2049 // and finally switch to using the new guy
2050 blhdr = nblhdr;
2051 tbuffer_offset = jnl->jhdr->blhdr_size;
2052 i = 1;
2053 }
2054
2055
2056 if ((i+1) > blhdr->max_blocks) {
2057 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
2058 }
2059
2060 // copy the data into the in-memory transaction buffer
2061 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
2062 memcpy(blkptr, buf_dataptr(bp), buf_size(bp));
2063
2064 // if this is true then this is a new block we haven't seen
2065 if (i >= blhdr->num_blocks) {
2066 int bsize;
2067 vnode_t vp;
2068
2069 vp = buf_vnode(bp);
2070 vnode_ref(vp);
2071 bsize = buf_size(bp);
2072
2073 blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp));
2074 blhdr->binfo[i].bsize = bsize;
2075 blhdr->binfo[i].bp = bp;
2076
2077 blhdr->bytes_used += bsize;
2078 tr->total_bytes += bsize;
2079
2080 blhdr->num_blocks++;
2081 }
2082 buf_bdwrite(bp);
2083
2084 return 0;
2085 }
2086
2087 int
2088 journal_kill_block(journal *jnl, struct buf *bp)
2089 {
2090 int i;
2091 int bflags;
2092 block_list_header *blhdr;
2093 transaction *tr;
2094
2095 CHECK_JOURNAL(jnl);
2096
2097 if (jnl->flags & JOURNAL_INVALID) {
2098 return EINVAL;
2099 }
2100
2101 tr = jnl->active_tr;
2102 CHECK_TRANSACTION(tr);
2103
2104 if (jnl->owner != current_thread()) {
2105 panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
2106 jnl, jnl->owner, current_thread());
2107 }
2108
2109 free_old_stuff(jnl);
2110
2111 bflags = buf_flags(bp);
2112
2113 if ( !(bflags & B_LOCKED))
2114 panic("jnl: modify_block_end: called with bp not B_LOCKED");
2115
2116 /*
2117 * bp must be BL_BUSY and B_LOCKED
2118 */
2119 // first check if it's already part of this transaction
2120 for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2121
2122 for(i=1; i < blhdr->num_blocks; i++) {
2123 if (bp == blhdr->binfo[i].bp) {
2124 vnode_t vp;
2125
2126 buf_clearflags(bp, B_LOCKED);
2127
2128 // this undoes the vnode_ref() in journal_modify_block_end()
2129 vp = buf_vnode(bp);
2130 vnode_rele_ext(vp, 0, 1);
2131
2132 // if the block has the DELWRI and FILTER bits sets, then
2133 // things are seriously weird. if it was part of another
2134 // transaction then journal_modify_block_start() should
2135 // have force it to be written.
2136 //
2137 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
2138 // panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
2139 //} else {
2140 tr->num_killed += buf_size(bp);
2141 //}
2142 blhdr->binfo[i].bp = NULL;
2143 blhdr->binfo[i].bnum = (off_t)-1;
2144
2145 buf_brelse(bp);
2146
2147 break;
2148 }
2149 }
2150
2151 if (i < blhdr->num_blocks) {
2152 break;
2153 }
2154 }
2155
2156 return 0;
2157 }
2158
2159
2160 static int
2161 journal_binfo_cmp(void *a, void *b)
2162 {
2163 block_info *bi_a = (struct block_info *)a;
2164 block_info *bi_b = (struct block_info *)b;
2165 daddr64_t res;
2166
2167 if (bi_a->bp == NULL) {
2168 return 1;
2169 }
2170 if (bi_b->bp == NULL) {
2171 return -1;
2172 }
2173
2174 // don't have to worry about negative block
2175 // numbers so this is ok to do.
2176 //
2177 res = (buf_blkno(bi_a->bp) - buf_blkno(bi_b->bp));
2178
2179 return (int)res;
2180 }
2181
2182
/*
 * Commit (or buffer) a transaction.
 *
 * If the transaction is empty it is parked in jnl->cur_tr.  If it is
 * small and force_it == 0 it is likewise parked ("group commit") so a
 * later transaction can piggy-back on it.  Otherwise the transaction's
 * block list is written to the journal, the journal header is updated,
 * and every dirty block is handed to the buf layer with an iodone
 * filter (buffer_flushed_callback) so the journal space can be
 * reclaimed once the real blocks hit disk.
 *
 * Returns 0 on success; -1 after marking the journal invalid and
 * aborting the transaction on any write failure.
 *
 * NOTE(review): local `j` is declared but never used here.
 */
static int
end_transaction(transaction *tr, int force_it)
{
    int    i, j, ret, amt;
    errno_t errno;
    off_t  end;
    journal *jnl = tr->jnl;
    struct buf *bp;
    block_list_header  *blhdr=NULL, *next=NULL;

    if (jnl->cur_tr) {
	panic("jnl: jnl @ 0x%x already has cur_tr 0x%x, new tr: 0x%x\n",
	      jnl, jnl->cur_tr, tr);
    }

    // if there weren't any modified blocks in the transaction
    // just save off the transaction pointer and return.
    if (tr->total_bytes == jnl->jhdr->blhdr_size) {
	jnl->cur_tr = tr;
	return 0;
    }

    // if our transaction buffer isn't very full, just hang
    // on to it and don't actually flush anything.  this is
    // what is known as "group commit".  we will flush the
    // transaction buffer if it's full or if we have more than
    // one of them so we don't start hogging too much memory.
    //
    if (   force_it == 0
	   && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0 
	   && tr->num_blhdrs < 3
	   && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) {

	jnl->cur_tr = tr;
	return 0;
    }


    // if we're here we're going to flush the transaction buffer to disk.
    // make sure there is room in the journal first.
    check_free_space(jnl, tr->total_bytes);

    // range check the end index
    if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
	panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
	      jnl->jhdr->end, jnl->jhdr->size);
    }

    // this transaction starts where the current journal ends
    tr->journal_start = jnl->jhdr->end;
    end               = jnl->jhdr->end;

	//
	// if the first entry in old_start[] isn't free yet, loop calling the
	// file system flush routine until it is (or we panic).
	//
	// (the high bit of an old_start entry marks a transaction whose
	// buffers have not all been flushed yet; see buffer_flushed_callback.)
	//
	i = 0;
	lock_oldstart(jnl);
	while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
		if (jnl->flush) {
			unlock_oldstart(jnl);

			if (jnl->flush) {
				jnl->flush(jnl->flush_arg);
			}

			// yield the cpu so others can get in to clear the lock bit
			(void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);

			lock_oldstart(jnl);
		}
		if (i++ >= 500) {
			panic("jnl: transaction that started at 0x%llx is not completing! jnl 0x%x\n",
			      jnl->old_start[0] & (~0x8000000000000000LL), jnl);
		}
	}

	//
	// slide everyone else down and put our latest guy in the last
	// entry in the old_start array
	//
	memcpy(&jnl->old_start[0], &jnl->old_start[1], sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
	jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;

	unlock_oldstart(jnl);


    // for each block, make sure that the physical block # is set
    for(blhdr=tr->blhdr; blhdr; blhdr=next) {

	for(i=1; i < blhdr->num_blocks; i++) {
	    daddr64_t blkno;
	    daddr64_t lblkno;
	    struct vnode *vp;

	    bp = blhdr->binfo[i].bp;
	    if (bp == NULL) {   // only true if a block was "killed" 
		if (blhdr->binfo[i].bnum != (off_t)-1) {
		    panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ 0x%x, tr 0x%x)\n",
			  blhdr->binfo[i].bnum, jnl, tr);
		}
		continue;
	    }
	    vp = buf_vnode(bp);
	    blkno = buf_blkno(bp);
	    lblkno = buf_lblkno(bp);

	    if (vp == NULL && lblkno == blkno) {
		printf("jnl: end_tr: bad news! bp @ 0x%x w/null vp and l/blkno = %qd/%qd.  aborting the transaction (tr 0x%x jnl 0x%x).\n",
		       bp, lblkno, blkno, tr, jnl);
		goto bad_journal;
	    }
	    
	    // if the lblkno is the same as blkno and this bp isn't
	    // associated with the underlying file system device then
	    // we need to call bmap() to get the actual physical block.
	    //
	    if ((lblkno == blkno) && (vp != jnl->fsdev)) {
	        off_t	f_offset;
		size_t 	contig_bytes;

		if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) {
			printf("jnl: end_tr: vnop_blktooff failed @ 0x%x, jnl 0x%x\n", bp, jnl);
			goto bad_journal;
		}
		if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) {
			printf("jnl: end_tr: can't blockmap the bp @ 0x%x, jnl 0x%x\n", bp, jnl);
			goto bad_journal;
		}
		// the journal replays whole blocks; a block that is not
		// physically contiguous on disk can't be represented.
		if ((uint32_t)contig_bytes < buf_count(bp)) {
			printf("jnl: end_tr: blk not physically contiguous on disk@ 0x%x, jnl 0x%x\n", bp, jnl);
			goto bad_journal;
		}
		buf_setblkno(bp, blkno);
	    }
	    // update this so we write out the correct physical block number!
	    blhdr->binfo[i].bnum = (off_t)(blkno);
	}

	next = (block_list_header *)((long)blhdr->binfo[0].bnum);
    }
    
    // second pass: write each block_list_header (with its trailing block
    // data) to the journal, checksumming the header first.
    for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {

	amt = blhdr->bytes_used;

	blhdr->checksum = 0;
	blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);

	ret = write_journal_data(jnl, &end, blhdr, amt);
	if (ret != amt) {
	    printf("jnl: end_transaction: only wrote %d of %d bytes to the journal!\n",
		   ret, amt);

	    goto bad_journal;
	}
    }

    jnl->jhdr->end  = end;    // update where the journal now ends
    tr->journal_end = end;    // the transaction ends here too
    if (tr->journal_start == 0 || tr->journal_end == 0) {
	panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
	      tr->journal_start, tr->journal_end);
    }

    if (write_journal_header(jnl) != 0) {
	goto bad_journal;
    }

    //
    // setup for looping through all the blhdr's.  we null out the
    // tbuffer and blhdr fields so that they're not used any more.
    //
    blhdr       = tr->blhdr;
    tr->tbuffer = NULL;
    tr->blhdr   = NULL;

    // the buffer_flushed_callback will only be called for the 
    // real blocks that get flushed so we have to account for 
    // the block_list_headers here.
    //
    tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;

    // for each block, set the iodone callback and unlock it
    for(; blhdr; blhdr=next) {

	// we can re-order the buf ptrs because everything is written out already
	qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp);

	for(i=1; i < blhdr->num_blocks; i++) {
	    if (blhdr->binfo[i].bp == NULL) {
		continue;
	    }

	    // re-look-up the buf from the cache; it should come back as
	    // the exact same (still locked) buffer we recorded earlier.
	    errno = buf_meta_bread(buf_vnode(blhdr->binfo[i].bp),
			           buf_lblkno(blhdr->binfo[i].bp),
			           buf_size(blhdr->binfo[i].bp),
			           NOCRED,
			           &bp);
	    if (errno == 0 && bp != NULL) {
		struct vnode *save_vp;
		void *cur_filter;

		if (bp != blhdr->binfo[i].bp) {
		    panic("jnl: end_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
			  bp, blhdr->binfo[i].bp, jnl);
		}

		if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
		    if (jnl->flags & JOURNAL_CLOSE_PENDING) {
		        buf_clearflags(bp, B_LOCKED);
			buf_brelse(bp);
			continue;
		    } else {
			panic("jnl: end_tr: !!!DANGER!!! bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp));
		    }
		}
		save_vp = buf_vnode(bp);

		// the filter fires from the iodone path once the real
		// block reaches disk, letting the journal space for this
		// transaction be reclaimed (see buffer_flushed_callback).
		buf_setfilter(bp, buffer_flushed_callback, tr, &cur_filter, NULL);

		if (cur_filter) {
		    panic("jnl: bp @ 0x%x (blkno %qd, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n",
			  bp, buf_blkno(bp), save_vp, cur_filter, buffer_flushed_callback);
		}
		buf_clearflags(bp, B_LOCKED);

		// kicking off the write here helps performance
		buf_bawrite(bp);
		// XXXdbg this is good for testing: buf_bdwrite(bp);
		//buf_bdwrite(bp);
		
		// this undoes the vnode_ref() in journal_modify_block_end()
		vnode_rele_ext(save_vp, 0, 1);
	    } else {
		printf("jnl: end_transaction: could not find block %Ld vp 0x%x!\n",
		       blhdr->binfo[i].bnum, blhdr->binfo[i].bp);
		if (bp) {
		    buf_clearflags(bp, B_LOCKED);
		    buf_brelse(bp);
		}
	    }
	}

	next = (block_list_header *)((long)blhdr->binfo[0].bnum);

	// we can free blhdr here since we won't need it any more
	blhdr->binfo[0].bnum = 0xdeadc0de;
	kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
    }

    //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
    //   tr, tr->journal_start, tr->journal_end);
    return 0;


  bad_journal:
    // mark the journal dead, un-busy our old_start slot, and tear down
    // the transaction (releases bufs and vnode refs).
    jnl->flags |= JOURNAL_INVALID;
    jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
    abort_transaction(jnl, tr);
    return -1;
}
2445
/*
 * Tear down a transaction without writing it to the journal.
 *
 * Each tracked buf is re-looked-up from the buffer cache, marked
 * invalid (which also clears its locked/delayed state on release),
 * released, and its vnode reference (taken in
 * journal_modify_block_end()) is dropped.  Every tbuffer is then freed
 * and the transaction struct itself is returned to its zone.
 */
static void
abort_transaction(journal *jnl, transaction *tr)
{
    int i;
    errno_t errno;
    block_list_header *blhdr, *next;
    struct buf *bp;
    struct vnode *save_vp;

    // for each block list header, iterate over the blocks then
    // free up the memory associated with the block list.
    //
    // for each block, clear the lock bit and release it.
    //
    for(blhdr=tr->blhdr; blhdr; blhdr=next) {

	for(i=1; i < blhdr->num_blocks; i++) {
	    if (blhdr->binfo[i].bp == NULL) {
		continue;
	    }
	    // skip entries whose buf is already vnode-less or unlocked;
	    // there is nothing of ours left to undo on them.
	    if ( (buf_vnode(blhdr->binfo[i].bp) == NULL) ||
		 !(buf_flags(blhdr->binfo[i].bp) & B_LOCKED) ) {
	        continue;
	    }

	    errno = buf_meta_bread(buf_vnode(blhdr->binfo[i].bp),
			           buf_lblkno(blhdr->binfo[i].bp),
			           buf_size(blhdr->binfo[i].bp),
			           NOCRED,
			           &bp);
	    if (errno == 0) {
		if (bp != blhdr->binfo[i].bp) {
		    panic("jnl: abort_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
			  bp, blhdr->binfo[i].bp, jnl);
		}

		// releasing a bp marked invalid
		// also clears the locked and delayed state
		buf_markinvalid(bp);
		save_vp = buf_vnode(bp);

		buf_brelse(bp);

		vnode_rele_ext(save_vp, 0, 1);
	    } else {
		printf("jnl: abort_tr: could not find block %Ld vp 0x%x!\n",
		       blhdr->binfo[i].bnum, blhdr->binfo[i].bp);
		if (bp) {
		    buf_brelse(bp);
		}
	    }
	}

	next = (block_list_header *)((long)blhdr->binfo[0].bnum);

	// we can free blhdr here since we won't need it any more
	blhdr->binfo[0].bnum = 0xdeadc0de;
	kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
    }

    // poison the fields so stale users of this transaction fail loudly
    tr->tbuffer     = NULL;
    tr->blhdr       = NULL;
    tr->total_bytes = 0xdbadc0de;
    FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
}
2511
2512
/*
 * End the current transaction for this journal.
 *
 * Decrements the nesting count and only actually commits (via
 * end_transaction()) when the outermost journal_end_transaction()
 * is reached.  On an invalid journal, any active transaction is
 * aborted and EINVAL is returned.  The caller must be the thread
 * that started the transaction; the journal lock is dropped here.
 *
 * Returns 0 on success, EINVAL if the journal is invalid, or the
 * result of end_transaction().
 */
int
journal_end_transaction(journal *jnl)
{
    int ret;
    transaction *tr;
    
    CHECK_JOURNAL(jnl);

    // an invalid journal with no owner means there is nothing to
    // unwind -- journal_start_transaction() never got going.
    if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
	return 0;
    }

    if (jnl->owner != current_thread()) {
	panic("jnl: end_tr: I'm not the owner! jnl 0x%x, owner 0x%x, curact 0x%x\n",
	      jnl, jnl->owner, current_thread());
    }

    free_old_stuff(jnl);

    jnl->nested_count--;
    if (jnl->nested_count > 0) {
	return 0;
    } else if (jnl->nested_count < 0) {
	panic("jnl: jnl @ 0x%x has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
    }
    
    if (jnl->flags & JOURNAL_INVALID) {
	if (jnl->active_tr) {
	    if (jnl->cur_tr != NULL) {
		panic("jnl: journal @ 0x%x has active tr (0x%x) and cur tr (0x%x)\n",
		      jnl, jnl->active_tr, jnl->cur_tr);
	    }
	    
	    tr             = jnl->active_tr;
	    jnl->active_tr = NULL;
	    abort_transaction(jnl, tr);
	}

	jnl->owner = NULL;
	unlock_journal(jnl);

	return EINVAL;
    }

    tr = jnl->active_tr;
    CHECK_TRANSACTION(tr);

    // clear this out here so that when check_free_space() calls
    // the FS flush function, we don't panic in journal_flush()
    // if the FS were to call that.  note: check_free_space() is
    // called from end_transaction().
    // 
    jnl->active_tr = NULL;
    ret = end_transaction(tr, 0);

    jnl->owner = NULL;
    unlock_journal(jnl);

    return ret;
}
2573
2574
2575 int
2576 journal_flush(journal *jnl)
2577 {
2578 int need_signal = 0;
2579
2580 CHECK_JOURNAL(jnl);
2581
2582 if (jnl->flags & JOURNAL_INVALID) {
2583 return -1;
2584 }
2585
2586 if (jnl->owner != current_thread()) {
2587 int ret;
2588
2589 lock_journal(jnl);
2590 need_signal = 1;
2591 }
2592
2593 free_old_stuff(jnl);
2594
2595 // if we're not active, flush any buffered transactions
2596 if (jnl->active_tr == NULL && jnl->cur_tr) {
2597 transaction *tr = jnl->cur_tr;
2598
2599 jnl->cur_tr = NULL;
2600 end_transaction(tr, 1); // force it to get flushed
2601 }
2602
2603 if (need_signal) {
2604 unlock_journal(jnl);
2605 }
2606
2607 return 0;
2608 }
2609
2610 int
2611 journal_active(journal *jnl)
2612 {
2613 if (jnl->flags & JOURNAL_INVALID) {
2614 return -1;
2615 }
2616
2617 return (jnl->active_tr == NULL) ? 0 : 1;
2618 }
2619
2620 void *
2621 journal_owner(journal *jnl)
2622 {
2623 return jnl->owner;
2624 }