]> git.saurik.com Git - apple/xnu.git/blame - bsd/vfs/vfs_journal.c
xnu-517.3.15.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_journal.c
CommitLineData
b4c24cb9
A
1/*
2 * Copyright (c) 1995-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
43866e37 6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
b4c24cb9 7 *
43866e37
A
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
b4c24cb9
A
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
43866e37
A
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
b4c24cb9
A
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25//
26// This file implements a simple write-ahead journaling layer.
27// In theory any file system can make use of it by calling these
28// functions when the fs wants to modify meta-data blocks. See
29// vfs_journal.h for a more detailed description of the api and
30// data structures.
31//
32// Dominic Giampaolo (dbg@apple.com)
33//
34
35#ifdef KERNEL
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/file.h>
41#include <sys/stat.h>
42#include <sys/buf.h>
43#include <sys/proc.h>
44#include <sys/mount.h>
45#include <sys/namei.h>
46#include <sys/vnode.h>
47#include <sys/ioctl.h>
48#include <sys/tty.h>
49#include <sys/ubc.h>
50#include <sys/malloc.h>
51#include <sys/vnode.h>
52#include <kern/thread_act.h>
53#include <sys/disk.h>
54#include <miscfs/specfs/specdev.h>
55
56extern task_t kernel_task;
57
58#else
59
60#include <stdio.h>
61#include <stdlib.h>
62#include <string.h>
63#include <limits.h>
64#include <errno.h>
65#include <fcntl.h>
66#include <unistd.h>
67#include <stdarg.h>
68#include <sys/types.h>
69#include "compat.h"
70
71#endif /* KERNEL */
72
73#include "vfs_journal.h"
74
75
76// Number of leading bytes of a block_list_header that are covered by
77// its checksum (see the calc_checksum() calls in replay_journal()).
78// NOTE: this should be enough to cover the header fields as well as
// the first entry of binfo[].
79#define BLHDR_CHECKSUM_SIZE 32
80
81
82
83static int end_transaction(transaction *tr, int force_it);
84static void abort_transaction(journal *jnl, transaction *tr);
85static void dump_journal(journal *jnl);
86
87
55e303ae
A
88//
89// 3105942 - Coalesce writes to the same block on journal replay
90//
91
// One entry of the sorted replay-coalescing table.  Entries are kept in
// ascending block_num order (lookup_bucket() binary-searches them).
92typedef struct bucket {
93 off_t block_num;   // block number, in units of jhdr_size; -1 marks a free slot, -2 is used by do_overlap() to mark an entry for deletion
94 size_t jnl_offset; // offset within the journal of the data for this block
95 size_t block_size; // number of bytes described by this entry
96} bucket;
97
98#define STARTING_BUCKETS 256
99
100static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr);
101static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size);
102static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full);
103static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr);
104static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr, int overwriting);
105
b4c24cb9
A
//
// CHECK_JOURNAL(jnl): panic unless the journal looks sane.
//
// Verifies that the journal pointer and its jdev/fsdev pointers are
// non-NULL, that the in-memory header carries JOURNAL_HEADER_MAGIC, and
// that the header's start/end/size fields are positive, no larger than
// the journal size and no larger than 1GB.
//
// The argument is parenthesized at every use so that any pointer
// expression can be passed safely.  It is evaluated more than once, so
// it must not have side effects.
//
#define CHECK_JOURNAL(jnl) \
    do { \
	if ((jnl) == NULL) {\
	    panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
	}\
	if ((jnl)->jdev == NULL) { \
	    panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
	} \
	if ((jnl)->fsdev == NULL) { \
	    panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
	} \
	if ((jnl)->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
	    panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
	    __FILE__, __LINE__, (jnl)->jhdr->magic, JOURNAL_HEADER_MAGIC);\
	}\
	if (   (jnl)->jhdr->start <= 0 \
	    || (jnl)->jhdr->start > (jnl)->jhdr->size\
	    || (jnl)->jhdr->start > 1024*1024*1024) {\
	    panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
	    __FILE__, __LINE__, (jnl)->jhdr->start, (jnl)->jhdr->size);\
	}\
	if (   (jnl)->jhdr->end <= 0 \
	    || (jnl)->jhdr->end > (jnl)->jhdr->size\
	    || (jnl)->jhdr->end > 1024*1024*1024) {\
	    panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
	    __FILE__, __LINE__, (jnl)->jhdr->end, (jnl)->jhdr->size);\
	}\
	if ((jnl)->jhdr->size > 1024*1024*1024) {\
	    panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
	    __FILE__, __LINE__, (jnl)->jhdr->size);\
	} \
    } while(0)
138
//
// CHECK_TRANSACTION(tr): panic unless the transaction looks sane.
//
// Verifies that tr and tr->jnl are non-NULL, that tr->blhdr aliases
// tr->tbuffer, that total_bytes is non-negative, that journal_start and
// journal_end lie in [0, 1GB], and that blhdr->max_blocks is positive
// and no larger than (journal size / jhdr_size).
//
// The argument is parenthesized at every use so that any pointer
// expression can be passed safely.  It is evaluated more than once, so
// it must not have side effects.
//
#define CHECK_TRANSACTION(tr) \
    do {\
	if ((tr) == NULL) {\
	    panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
	}\
	if ((tr)->jnl == NULL) {\
	    panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
	}\
	if ((tr)->blhdr != (block_list_header *)(tr)->tbuffer) {\
	    panic("%s:%d: blhdr (0x%x) != tbuffer (0x%x)\n", __FILE__, __LINE__, (tr)->blhdr, (tr)->tbuffer);\
	}\
	if ((tr)->total_bytes < 0) {\
	    panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, (tr)->total_bytes);\
	}\
	if ((tr)->journal_start < 0 || (tr)->journal_start > 1024*1024*1024) {\
	    panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, (tr)->journal_start);\
	}\
	if ((tr)->journal_end < 0 || (tr)->journal_end > 1024*1024*1024) {\
	    panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, (tr)->journal_end);\
	}\
	if ((tr)->blhdr && ((tr)->blhdr->max_blocks <= 0 || (tr)->blhdr->max_blocks > ((tr)->jnl->jhdr->size/(tr)->jnl->jhdr->jhdr_size))) {\
	    panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, (tr)->blhdr->max_blocks);\
	}\
    } while(0)
163
164
165
//
// this isn't a great checksum routine but it will do for now.
// we use it to checksum the journal header and the block list
// headers that are at the start of each transaction.
//
// ptr - first byte to checksum (each byte is treated as unsigned)
// len - number of bytes to checksum; len <= 0 checksums nothing
//
// Returns the bitwise complement of the rolling value
//     cksum = (cksum << 8) ^ (cksum + byte)
//
// The accumulation is done in unsigned arithmetic: left-shifting a
// signed int whose high bits are set is undefined behaviour, while
// unsigned arithmetic wraps modulo 2^N and yields the same bit pattern,
// so checksums stored on disk stay compatible.
//
static int
calc_checksum(char *ptr, int len)
{
    const unsigned char *bytes = (const unsigned char *)ptr;
    unsigned int cksum = 0;
    int i;

    // this is a lame checksum but for now it'll do
    for (i = 0; i < len; i++) {
	cksum = (cksum << 8) ^ (cksum + bytes[i]);
    }

    return (int)(~cksum);
}
183
184
55e303ae
A
// Flag bits for the "direction" argument of do_journal_io().
// JNL_WRITE / JNL_READ select the transfer direction; JNL_HEADER must
// also be set for any i/o that starts at journal offset 0 (the journal
// header), otherwise do_journal_io() panics.
185#define JNL_WRITE 0x0001
186#define JNL_READ 0x0002
187#define JNL_HEADER 0x8000
b4c24cb9
A
188
189//
190// This function sets up a fake buf and passes it directly to the
191// journal device strategy routine (so that it won't get cached in
192// the block cache).
193//
194// It also handles range checking the i/o so that we don't write
195// outside the journal boundaries and it will wrap the i/o back
196// to the beginning if necessary (skipping over the journal header)
197//
// jnl       - journal whose device (jnl->jdev) is the i/o target
// offset    - in/out: byte offset within the journal; advanced past
//             each chunk transferred and wrapped to jhdr_size when it
//             reaches jhdr->size
// data      - caller's buffer
// len       - total number of bytes to transfer
// direction - JNL_READ or JNL_WRITE, optionally or'ed with JNL_HEADER
//
// Returns len when everything was transferred, or 0 if the strategy
// routine or biowait() reported an error.  On error *offset has still
// been advanced past any chunks that completed earlier.
// Panics on an out-of-range offset, a non-positive chunk length, or
// i/o at offset 0 without JNL_HEADER.
//
198static size_t
199do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
200{
201 int err, io_sz=0, curlen=len;
202 struct buf *bp;
203 int max_iosize=0, max_vectors;
204
205 if (*offset < 0 || *offset > jnl->jhdr->size) {
206 panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
207 }
208
 // one pass through the code below is made per chunk; a fresh io buf
 // is allocated and freed for each one.
209 again:
210 bp = alloc_io_buf(jnl->jdev, 1);
211
55e303ae 212 if (direction & JNL_WRITE) {
b4c24cb9
A
213 bp->b_flags |= 0; // don't have to set any flags (was: B_WRITEINPROG)
214 jnl->jdev->v_numoutput++;
215 vfs_io_attributes(jnl->jdev, B_WRITE, &max_iosize, &max_vectors);
55e303ae 216 } else if (direction & JNL_READ) {
b4c24cb9
A
217 bp->b_flags |= B_READ;
218 vfs_io_attributes(jnl->jdev, B_READ, &max_iosize, &max_vectors);
219 }
220
 // fall back to 128k chunks if no maximum i/o size was reported
221 if (max_iosize == 0) {
222 max_iosize = 128 * 1024;
223 }
224
 // if this chunk would run past the end of the journal, either wrap
 // the offset (when already exactly at the end) or trim the chunk so
 // that it stops at the end; the remainder is picked up by the next
 // pass.  header i/o (offset 0) is exempt.
225 if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
226 if (*offset == jnl->jhdr->size) {
227 *offset = jnl->jhdr->jhdr_size;
228 } else {
229 curlen = (off_t)jnl->jhdr->size - *offset;
230 }
231 }
232
233 if (curlen > max_iosize) {
234 curlen = max_iosize;
235 }
236
237 if (curlen <= 0) {
238 panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %d\n", curlen, *offset, len);
239 }
240
55e303ae
A
241 if (*offset == 0 && (direction & JNL_HEADER) == 0) {
242 panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
243 }
244
b4c24cb9
A
245 bp->b_bufsize = curlen;
246 bp->b_bcount = curlen;
247 bp->b_data = data;
 // device block numbers are expressed in units of jhdr_size
248 bp->b_blkno = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size);
249 bp->b_lblkno = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size);
250
251 err = VOP_STRATEGY(bp);
252 if (!err) {
253 err = biowait(bp);
254 }
255
 // detach the caller's buffer from the buf before giving it back
256 bp->b_data = NULL;
257 bp->b_bufsize = bp->b_bcount = 0;
258 bp->b_blkno = bp->b_lblkno = -1;
259
260 free_io_buf(bp);
261
262 if (err) {
263 printf("jnl: do_jnl_io: strategy err 0x%x\n", err);
264 return 0;
265 }
266
267 *offset += curlen;
268 io_sz += curlen;
269 if (io_sz != len) {
270 // handle wrap-around
271 data = (char *)data + curlen;
272 curlen = len - io_sz;
273 if (*offset >= jnl->jhdr->size) {
274 *offset = jnl->jhdr->jhdr_size;
275 }
276 goto again;
277 }
278
279 return io_sz;
280}
281
282static size_t
283read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
284{
285 return do_journal_io(jnl, offset, data, len, JNL_READ);
286}
287
288static size_t
289write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
290{
291 return do_journal_io(jnl, offset, data, len, JNL_WRITE);
292}
293
294
55e303ae
A
295static int
296read_journal_header(journal *jnl, void *data, size_t len)
297{
298 off_t hdr_offset = 0;
299
300 return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER);
301}
302
b4c24cb9
A
303static int
304write_journal_header(journal *jnl)
305{
55e303ae 306 static int num_err_prints = 0;
b4c24cb9
A
307 int ret;
308 off_t jhdr_offset = 0;
309
310 //
311 // XXXdbg note: this ioctl doesn't seem to do anything on firewire disks.
312 //
313 ret = VOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NOCRED, current_proc());
314 if (ret != 0) {
55e303ae
A
315 //
316 // Only print this error if it's a different error than the
317 // previous one, or if it's the first time for this device
318 // or if the total number of printfs is less than 25. We
319 // allow for up to 25 printfs to insure that some make it
320 // into the on-disk syslog. Otherwise if we only printed
321 // one, it's possible it would never make it to the syslog
322 // for the root volume and that makes debugging hard.
323 //
324 if ( ret != jnl->last_flush_err
325 || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
326 || num_err_prints++ < 25) {
327
328 printf("jnl: flushing fs disk buffer returned 0x%x\n", ret);
329
330 jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
331 jnl->last_flush_err = ret;
332 }
b4c24cb9
A
333 }
334
55e303ae 335
b4c24cb9
A
336 jnl->jhdr->checksum = 0;
337 jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header));
55e303ae
A
338 if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != jnl->jhdr->jhdr_size) {
339 printf("jnl: write_journal_header: error writing the journal header!\n");
340 jnl->flags |= JOURNAL_INVALID;
341 return -1;
b4c24cb9
A
342 }
343
55e303ae
A
344 // Have to flush after writing the journal header so that
345 // a future transaction doesn't sneak out to disk before
346 // the header does and thus overwrite data that the old
347 // journal header refers to. Saw this exact case happen
348 // on an IDE bus analyzer with Larry Barras so while it
349 // may seem obscure, it's not.
350 //
351 VOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NOCRED, current_proc());
352
b4c24cb9
A
353 return 0;
354}
355
356
357
358//
359// this is a work function used to free up transactions that
360// completed. they can't be free'd from buffer_flushed_callback
361// because it is called from deep with the disk driver stack
362// and thus can't do something that would potentially cause
363// paging. it gets called by each of the journal api entry
364// points so stuff shouldn't hang around for too long.
365//
366static void
367free_old_stuff(journal *jnl)
368{
369 transaction *tr, *next;
370
371 for(tr=jnl->tr_freeme; tr; tr=next) {
372 next = tr->next;
d7e50217 373 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
b4c24cb9
A
374 }
375
376 jnl->tr_freeme = NULL;
377}
378
379
380
381//
382// This is our callback that lets us know when a buffer has been
383// flushed to disk. It's called from deep within the driver stack
384// and thus is quite limited in what it can do. Notably, it can
385// not initiate any new i/o's or allocate/free memory.
386//
// bp - the buf that finished; its b_transaction field identifies the
//      transaction the buf belongs to (NULL if already accounted for).
//
// Once every byte of a transaction has been flushed or killed, the
// transaction's old_start[] entry is marked completed, active_start is
// advanced where possible, and the transaction is either merged into a
// neighbour on the completed_trs list, inserted into that list in
// journal_start order, or queued on tr_freeme for free_old_stuff().
//
387static void
388buffer_flushed_callback(struct buf *bp)
389{
390 transaction *tr;
391 journal *jnl;
392 transaction *ctr, *prev=NULL, *next;
393 int i, bufsize;
394
395
396 //printf("jnl: buf flush: bp @ 0x%x l/blkno %d/%d vp 0x%x tr @ 0x%x\n",
397 // bp, bp->b_lblkno, bp->b_blkno, bp->b_vp, bp->b_transaction);
398
399 // snarf out the bits we want
400 bufsize = bp->b_bufsize;
401 tr = bp->b_transaction;
402
403 bp->b_iodone = NULL; // don't call us for this guy again
404 bp->b_transaction = NULL;
405
406 //
407 // This is what biodone() would do if it didn't call us.
408 // NOTE: THIS CODE *HAS* TO BE HERE!
409 //
410 if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
411 brelse(bp);
412 } else { /* or just wakeup the buffer */
413 CLR(bp->b_flags, B_WANTED);
414 wakeup(bp);
415 }
416
417 // NOTE: from here on out we do *NOT* touch bp anymore.
418
419
420 // then we've already seen it
421 if (tr == NULL) {
422 return;
423 }
424
425 CHECK_TRANSACTION(tr);
426
427 jnl = tr->jnl;
428 if (jnl->flags & JOURNAL_INVALID) {
429 return;
430 }
431
432 CHECK_JOURNAL(jnl);
433
434 // update the number of bytes that have been flushed.
435 // this buf may represent more than one block so take
436 // that into account.
437 tr->num_flushed += bufsize;
438
439
440 // if this transaction isn't done yet, just return as
441 // there is nothing to do.
442 if ((tr->num_flushed + tr->num_killed) < tr->total_bytes) {
443 return;
444 }
445
446 //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
447 // tr, tr->journal_start, tr->journal_end, jnl);
448
449 // find this entry in the old_start[] index and mark it completed
 // (the top bit of an old_start[] entry is masked off for the
 // comparison and cleared in the matching entry)
450 simple_lock(&jnl->old_start_lock);
451 for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
452
453 if ((jnl->old_start[i] & ~(0x8000000000000000LL)) == tr->journal_start) {
454 jnl->old_start[i] &= ~(0x8000000000000000LL);
455 break;
456 }
457 }
458 if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
459 panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr 0x%x, jnl 0x%x)\n",
460 tr->journal_start, tr, jnl);
461 }
462 simple_unlock(&jnl->old_start_lock);
463
464
465 // if we are here then we need to update the journal header
466 // to reflect that this transaction is complete
 // (a zeroed journal_start/journal_end marks tr as absorbed; this
 // is tested further down to decide whether tr goes on tr_freeme)
467 if (tr->journal_start == jnl->active_start) {
468 jnl->active_start = tr->journal_end;
469 tr->journal_start = tr->journal_end = (off_t)0;
470 }
471
472 // go through the completed_trs list and try to coalesce
473 // entries, restarting back at the beginning if we have to.
474 for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) {
475 if (ctr->journal_start == jnl->active_start) {
 // ctr begins exactly at active_start: advance active_start
 // past it, unlink it and queue it to be freed
476 jnl->active_start = ctr->journal_end;
477 if (prev) {
478 prev->next = ctr->next;
479 }
480 if (ctr == jnl->completed_trs) {
481 jnl->completed_trs = ctr->next;
482 }
483
484 next = jnl->completed_trs; // this starts us over again
485 ctr->next = jnl->tr_freeme;
486 jnl->tr_freeme = ctr;
487 ctr = NULL;
488 } else if (tr->journal_end == ctr->journal_start) {
 // tr ends where ctr begins: extend ctr backwards over tr
489 ctr->journal_start = tr->journal_start;
490 next = jnl->completed_trs; // this starts us over again
491 ctr = NULL;
492 tr->journal_start = tr->journal_end = (off_t)0;
493 } else if (tr->journal_start == ctr->journal_end) {
 // tr begins where ctr ends: extend ctr forwards over tr
494 ctr->journal_end = tr->journal_end;
495 next = ctr->next;
496 tr->journal_start = tr->journal_end = (off_t)0;
497 } else {
498 next = ctr->next;
499 }
500 }
501
502 // at this point no one should be using this guy anymore
503 tr->total_bytes = 0xfbadc0de;
504
505 // if this is true then we didn't merge with anyone
506 // so link ourselves into the completed transaction
507 // list.
508 if (tr->journal_start != 0) {
509 // put this entry into the correct sorted place
510 // in the list instead of just at the head.
511 //
512
513 prev = NULL;
514 for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
515 // just keep looping
516 }
517
518 if (ctr == NULL && prev == NULL) {
519 jnl->completed_trs = tr;
520 tr->next = NULL;
521 } else if (ctr == jnl->completed_trs) {
522 tr->next = jnl->completed_trs;
523 jnl->completed_trs = tr;
524 } else {
525 tr->next = prev->next;
526 prev->next = tr;
527 }
528 } else {
529 // if we're here this tr got merged with someone else so
530 // put it on the list to be free'd
531 tr->next = jnl->tr_freeme;
532 jnl->tr_freeme = tr;
533 }
534}
535
55e303ae
A
536
537#include <libkern/OSByteOrder.h>
538
// Short aliases for the OSSwapInt16/32/64 routines; used below to
// convert journal and block-list headers between byte orders.
539#define SWAP16(x) OSSwapInt16(x)
540#define SWAP32(x) OSSwapInt32(x)
541#define SWAP64(x) OSSwapInt64(x)
542
543
544static void
545swap_journal_header(journal *jnl)
546{
547 jnl->jhdr->magic = SWAP32(jnl->jhdr->magic);
548 jnl->jhdr->endian = SWAP32(jnl->jhdr->endian);
549 jnl->jhdr->start = SWAP64(jnl->jhdr->start);
550 jnl->jhdr->end = SWAP64(jnl->jhdr->end);
551 jnl->jhdr->size = SWAP64(jnl->jhdr->size);
552 jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
553 jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum);
554 jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size);
555}
556
557static void
558swap_block_list_header(journal *jnl, block_list_header *blhdr)
559{
560 int i;
561
562 blhdr->max_blocks = SWAP16(blhdr->max_blocks);
563 blhdr->num_blocks = SWAP16(blhdr->num_blocks);
564 blhdr->bytes_used = SWAP32(blhdr->bytes_used);
565 blhdr->checksum = SWAP32(blhdr->checksum);
566 blhdr->pad = SWAP32(blhdr->pad);
567
568 if (blhdr->num_blocks * sizeof(blhdr->binfo[0]) > jnl->jhdr->blhdr_size) {
569 printf("jnl: blhdr num blocks looks suspicious (%d). not swapping.\n", blhdr->num_blocks);
570 return;
571 }
572
573 for(i=0; i < blhdr->num_blocks; i++) {
574 blhdr->binfo[i].bnum = SWAP64(blhdr->binfo[i].bnum);
575 blhdr->binfo[i].bsize = SWAP32(blhdr->binfo[i].bsize);
576 blhdr->binfo[i].bp = (void *)SWAP32((int)blhdr->binfo[i].bp);
577 }
578}
579
580
b4c24cb9
A
581static int
582update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
583{
584 int ret;
585 struct buf *oblock_bp=NULL;
586
587 // first read the block we want.
588 ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp);
589 if (ret != 0) {
590 printf("jnl: update_fs_block: error reading fs block # %lld! (ret %d)\n", fs_block, ret);
591
592 if (oblock_bp) {
593 brelse(oblock_bp);
594 oblock_bp = NULL;
595 }
596
597 // let's try to be aggressive here and just re-write the block
598 oblock_bp = getblk(jnl->fsdev, (daddr_t)fs_block, bsize, 0, 0, BLK_META);
599 if (oblock_bp == NULL) {
600 printf("jnl: update_fs_block: getblk() for %lld failed! failing update.\n", fs_block);
601 return -1;
602 }
603 }
604
605 // make sure it's the correct size.
606 if (oblock_bp->b_bufsize != bsize) {
607 brelse(oblock_bp);
608 return -1;
609 }
610
611 // copy the journal data over top of it
612 memcpy(oblock_bp->b_data, block_ptr, bsize);
613
614 if ((ret = VOP_BWRITE(oblock_bp)) != 0) {
615 printf("jnl: update_fs_block: failed to update block %lld (ret %d)\n", fs_block,ret);
b4c24cb9
A
616 return ret;
617 }
618
619 // and now invalidate it so that if someone else wants to read
620 // it in a different size they'll be able to do it.
621 ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp);
622 if (oblock_bp) {
623 oblock_bp->b_flags |= B_INVAL;
624 brelse(oblock_bp);
625 }
626
627 return 0;
628}
629
55e303ae
A
630static int
631grow_table(struct bucket **buf_ptr, int num_buckets, int new_size)
632{
633 struct bucket *newBuf;
634 int current_size = num_buckets, i;
635
636 // return if newsize is less than the current size
637 if (new_size < num_buckets) {
638 return current_size;
639 }
640
641 if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
642 printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
643 return -1;
644 }
645
646 // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
647
648 // copy existing elements
649 bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));
650
651 // initialize the new ones
652 for(i=num_buckets; i < new_size; i++) {
653 newBuf[i].block_num = (off_t)-1;
654 }
655
656 // free the old container
657 FREE(*buf_ptr, M_TEMP);
658
659 // reset the buf_ptr
660 *buf_ptr = newBuf;
661
662 return new_size;
663}
664
665static int
666lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full)
667{
668 int lo, hi, index, matches, i;
669
670 if (num_full == 0) {
671 return 0; // table is empty, so insert at index=0
672 }
673
674 lo = 0;
675 hi = num_full - 1;
676 index = -1;
677
678 // perform binary search for block_num
679 do {
680 int mid = (hi - lo)/2 + lo;
681 off_t this_num = (*buf_ptr)[mid].block_num;
682
683 if (block_num == this_num) {
684 index = mid;
685 break;
686 }
687
688 if (block_num < this_num) {
689 hi = mid;
690 continue;
691 }
692
693 if (block_num > this_num) {
694 lo = mid + 1;
695 continue;
696 }
697 } while(lo < hi);
698
699 // check if lo and hi converged on the match
700 if (block_num == (*buf_ptr)[hi].block_num) {
701 index = hi;
702 }
703
704 // if no existing entry found, find index for new one
705 if (index == -1) {
706 index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
707 } else {
708 // make sure that we return the right-most index in the case of multiple matches
709 matches = 0;
710 i = index + 1;
711 while(i < num_full && block_num == (*buf_ptr)[i].block_num) {
712 matches++;
713 i++;
714 }
715
716 index += matches;
717 }
718
719 return index;
720}
721
722static int
723insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr, int overwriting)
724{
725 if (!overwriting) {
726 // grow the table if we're out of space
727 if (*num_full_ptr >= *num_buckets_ptr) {
728 int new_size = *num_buckets_ptr * 2;
729 int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);
730
731 if (grow_size < new_size) {
732 printf("jnl: add_block: grow_table returned an error!\n");
733 return -1;
734 }
735
736 *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
737 }
738
739 // if we're not inserting at the end, we need to bcopy
740 if (blk_index != *num_full_ptr) {
741 bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
742 }
743
744 (*num_full_ptr)++; // increment only if we're not overwriting
745 }
746
747 // sanity check the values we're about to add
748 if (offset >= jnl->jhdr->size) {
749 offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
750 }
751 if (size <= 0) {
752 panic("jnl: insert_block: bad size in insert_block (%d)\n", size);
753 }
754
755 (*buf_ptr)[blk_index].block_num = num;
756 (*buf_ptr)[blk_index].block_size = size;
757 (*buf_ptr)[blk_index].jnl_offset = offset;
758
759 return blk_index;
760}
761
// Resolve overlaps between a block about to be added to the coalesce
// table and the entries already in it.
//
// blk_index is the slot returned by lookup_bucket() for block_num;
// block_num/size/offset describe the new block (block numbers are in
// units of jhdr_size, size in bytes).  The previous entry is truncated
// (and split in two if it extends past the new block), following
// entries that are completely covered are removed, and a partially
// covered following entry has its block_num, jnl_offset and block_size
// moved past the overlap.
//
// Returns 1 when the new block can simply overwrite the entry at
// blk_index, 0 when it must be inserted as a new entry.  Panics if an
// overlap is not a multiple of jhdr_size.
762static int
763do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr)
764{
765 int num_to_remove, index, i, overwrite, err;
766 size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset;
767 off_t overlap, block_start, block_end;
768
 // byte range covered by the new block
769 block_start = block_num*jhdr_size;
770 block_end = block_start + size;
 // same starting block and at least as large as the existing entry
771 overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);
772
773 // first, eliminate any overlap with the previous entry
774 if (blk_index != 0 && !overwrite) {
775 off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
776 off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
777 overlap = prev_block_end - block_start;
778 if (overlap > 0) {
779 if (overlap % jhdr_size != 0) {
780 panic("jnl: do_overlap: overlap with previous entry not a multiple of %d\n", jhdr_size);
781 }
782
783 // if the previous entry completely overlaps this one, we need to break it into two pieces.
784 if (prev_block_end > block_end) {
 // the piece of the previous entry that lies beyond the new block
785 off_t new_num = block_end / jhdr_size;
786 size_t new_size = prev_block_end - block_end;
787 size_t new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);
788
789 err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, num_buckets_ptr, num_full_ptr, 0);
790 if (err < 0) {
791 panic("jnl: do_overlap: error inserting during pre-overlap\n");
792 }
793 }
794
795 // Regardless, we need to truncate the previous entry to the beginning of the overlap
796 (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start;
797 }
798 }
799
800 // then, bail out fast if there's no overlap with the entries that follow
801 if (!overwrite && block_end <= (*buf_ptr)[blk_index].block_num*jhdr_size) {
802 return 0; // no overlap, no overwrite
803 } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (*buf_ptr)[blk_index+1].block_num*jhdr_size)) {
804 return 1; // simple overwrite
805 }
806
807 // Otherwise, find all cases of total and partial overlap. We use the special
808 // block_num of -2 to designate entries that are completely overlapped and must
809 // be eliminated. The block_num, size, and jnl_offset of partially overlapped
810 // entries must be adjusted to keep the array consistent.
811 index = blk_index;
812 num_to_remove = 0;
813 while(index < *num_full_ptr && block_end > (*buf_ptr)[index].block_num*jhdr_size) {
814 if (block_end >= ((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size)) {
815 (*buf_ptr)[index].block_num = -2; // mark this for deletion
816 num_to_remove++;
817 } else {
818 overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
819 if (overlap > 0) {
820 if (overlap % jhdr_size != 0) {
821 panic("jnl: do_overlap: overlap of %d is not multiple of %d\n", overlap, jhdr_size);
822 }
823
824 // if we partially overlap this entry, adjust its block number, jnl offset, and size
825 (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
826
827 new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
828 if (new_offset >= jnl->jhdr->size) {
829 new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
830 }
831 (*buf_ptr)[index].jnl_offset = new_offset;
832
833 (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
834 if ((*buf_ptr)[index].block_size <= 0) {
835 panic("jnl: do_overlap: after overlap, new block size is invalid (%d)\n", (*buf_ptr)[index].block_size);
836 // return -1; // if above panic is removed, return -1 for error
837 }
838 }
839
840 }
841
842 index++;
843 }
844
845 // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
846 index--; // start with the last index used within the above loop
847 while(index >= blk_index) {
848 if ((*buf_ptr)[index].block_num == -2) {
849 if (index == *num_full_ptr-1) {
850 (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
851 } else {
 // slide the entries after this one down over it
852 bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
853 }
854 (*num_full_ptr)--;
855 }
856 index--;
857 }
858
859 // eliminate any stale entries at the end of the table
860 for(i=*num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
861 (*buf_ptr)[i].block_num = -1;
862 }
863
864 return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
865}
866
867// PR-3105942: Coalesce writes to the same block in journal replay
868// We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
869// to be replayed and the corresponding location in the journal which contains
870// the most recent data for those blocks. The array is "played" once the all the
871// blocks in the journal have been coalesced. The code for the case of conflicting/
872// overlapping writes to a single block is the most dense. Because coalescing can
873// disrupt the existing time-ordering of blocks in the journal playback, care
874// is taken to catch any overlaps and keep the array consistent.
875static int
876add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr)
877{
878 int blk_index, overwriting;
879 size_t jhdr_size = jnl->jhdr->jhdr_size;
880
881 // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
882 // inserted (or the index of the elem to overwrite).
883 blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);
884
885 // check if the index is within bounds (if we're adding this block to the end of
886 // the table, blk_index will be equal to num_full)
887 if (blk_index < 0 || blk_index > *num_full_ptr) {
888 //printf("jnl: add_block: trouble adding block to co_buf\n");
889 return -1;
890 } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
891
892 // Determine whether we're overwriting an existing entry by checking for overlap
893 overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, num_buckets_ptr, num_full_ptr);
894 if (overwriting < 0) {
895 return -1; // if we got an error, pass it along
896 }
897
898 // returns the index, or -1 on error
899 blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, num_buckets_ptr, num_full_ptr, overwriting);
900
901 return blk_index;
902}
b4c24cb9
A
903
904static int
905replay_journal(journal *jnl)
906{
55e303ae 907 int i, ret, orig_checksum, checksum, max_bsize;
b4c24cb9
A
908 struct buf *oblock_bp;
909 block_list_header *blhdr;
910 off_t offset;
911 char *buf, *block_ptr=NULL;
55e303ae
A
912 struct bucket *co_buf;
913 int num_buckets = STARTING_BUCKETS, num_full;
914
b4c24cb9
A
915 // wrap the start ptr if it points to the very end of the journal
916 if (jnl->jhdr->start == jnl->jhdr->size) {
917 jnl->jhdr->start = jnl->jhdr->jhdr_size;
918 }
919 if (jnl->jhdr->end == jnl->jhdr->size) {
920 jnl->jhdr->end = jnl->jhdr->jhdr_size;
921 }
922
923 if (jnl->jhdr->start == jnl->jhdr->end) {
924 return 0;
925 }
926
927 // allocate memory for the header_block. we'll read each blhdr into this
928 if (kmem_alloc(kernel_map, (vm_offset_t *)&buf, jnl->jhdr->blhdr_size)) {
929 printf("jnl: replay_journal: no memory for block buffer! (%d bytes)\n",
930 jnl->jhdr->blhdr_size);
931 return -1;
932 }
55e303ae
A
933
934 // allocate memory for the coalesce buffer
935 if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
936 printf("jnl: replay_journal: no memory for coalesce buffer!\n");
937 return -1;
938 }
939
940 // initialize entries
941 for(i=0; i < num_buckets; i++) {
942 co_buf[i].block_num = -1;
943 }
944 num_full = 0; // empty at first
945
b4c24cb9
A
946
947 printf("jnl: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
948 jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);
949
950 while(jnl->jhdr->start != jnl->jhdr->end) {
951 offset = jnl->jhdr->start;
952 ret = read_journal_data(jnl, &offset, buf, jnl->jhdr->blhdr_size);
953 if (ret != jnl->jhdr->blhdr_size) {
954 printf("jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset);
955 goto bad_replay;
956 }
957
958 blhdr = (block_list_header *)buf;
55e303ae
A
959
960 orig_checksum = blhdr->checksum;
b4c24cb9 961 blhdr->checksum = 0;
55e303ae
A
962 if (jnl->flags & JOURNAL_NEED_SWAP) {
963 // calculate the checksum based on the unswapped data
964 // because it is done byte-at-a-time.
965 orig_checksum = SWAP32(orig_checksum);
966 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
967 swap_block_list_header(jnl, blhdr);
968 } else {
969 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
b4c24cb9 970 }
55e303ae
A
971 if (checksum != orig_checksum) {
972 printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
973 offset, orig_checksum, checksum);
974 goto bad_replay;
975 }
b4c24cb9
A
976 if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048
977 || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
978 printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
979 blhdr->max_blocks, blhdr->num_blocks);
980 goto bad_replay;
981 }
982
983 for(i=1,max_bsize=0; i < blhdr->num_blocks; i++) {
984 if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
985 printf("jnl: replay_journal: bogus block number 0x%llx\n", blhdr->binfo[i].bnum);
986 goto bad_replay;
987 }
988 if (blhdr->binfo[i].bsize > max_bsize) {
989 max_bsize = blhdr->binfo[i].bsize;
990 }
991 }
992
993 // make sure it's at least one page in size.
994 if (max_bsize & (PAGE_SIZE - 1)) {
995 max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
996 }
997
b4c24cb9 998
55e303ae
A
999 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1000 // blhdr->num_blocks-1, jnl->jhdr->start);
b4c24cb9 1001 for(i=1; i < blhdr->num_blocks; i++) {
55e303ae
A
1002 int size, ret_val;
1003 off_t number;
b4c24cb9
A
1004
1005 size = blhdr->binfo[i].bsize;
55e303ae
A
1006 number = blhdr->binfo[i].bnum;
1007
1008 // don't add "killed" blocks
1009 if (number == (off_t)-1) {
1010 //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
b4c24cb9 1011 } else {
55e303ae
A
1012 // add this bucket to co_buf, coalescing where possible
1013 // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1014 ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, &num_buckets, &num_full);
1015
1016 if (ret_val == -1) {
1017 printf("jnl: replay_journal: trouble adding block to co_buf\n");
1018 goto bad_replay;
1019 } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
b4c24cb9 1020 }
55e303ae
A
1021
1022 // increment offset
1023 offset += size;
1024
1025 // check if the last block added puts us off the end of the jnl.
1026 // if so, we need to wrap to the beginning and take any remainder
1027 // into account
b4c24cb9
A
1028 //
1029 if (offset >= jnl->jhdr->size) {
55e303ae 1030 offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
b4c24cb9
A
1031 }
1032 }
1033
55e303ae 1034
b4c24cb9
A
1035 jnl->jhdr->start += blhdr->bytes_used;
1036 if (jnl->jhdr->start >= jnl->jhdr->size) {
1037 // wrap around and skip the journal header block
1038 jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
1039 }
55e303ae 1040 }
b4c24cb9 1041
55e303ae
A
1042
1043 //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1044
1045 if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1046 goto bad_replay;
1047 }
1048
1049 // Replay the coalesced entries in the co-buf
1050 for(i=0; i < num_full; i++) {
1051 size_t size = co_buf[i].block_size;
1052 off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
1053 off_t number = co_buf[i].block_num;
1054
1055
1056 // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1057 // co_buf[i].block_size, co_buf[i].jnl_offset);
1058
1059 if (number == (off_t)-1) {
1060 // printf("jnl: replay_journal: skipping killed fs block\n");
1061 } else {
1062
1063 // do journal read, and set the phys. block
1064 ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
1065 if (ret != size) {
1066 printf("jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset);
1067 goto bad_replay;
1068 }
1069
1070 if (update_fs_block(jnl, block_ptr, number, size) != 0) {
1071 goto bad_replay;
1072 }
1073 }
b4c24cb9 1074 }
55e303ae 1075
b4c24cb9 1076
55e303ae
A
1077 // done replaying; update jnl header
1078 if (write_journal_header(jnl) != 0) {
1079 goto bad_replay;
1080 }
1081
1082 // free block_ptr
1083 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1084 block_ptr = NULL;
1085
1086 // free the coalesce buffer
1087 FREE(co_buf, M_TEMP);
1088 co_buf = NULL;
1089
b4c24cb9
A
1090 kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size);
1091 return 0;
1092
1093 bad_replay:
1094 if (block_ptr) {
1095 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1096 }
55e303ae
A
1097 if (co_buf) {
1098 FREE(co_buf, M_TEMP);
1099 }
b4c24cb9 1100 kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size);
55e303ae 1101
b4c24cb9
A
1102 return -1;
1103}
1104
1105
// Baseline transaction buffer size, in bytes.
// (256 * 1024 gives better performance but uses more memory.)
#define DEFAULT_TRANSACTION_BUFFER_SIZE  (128 * 1024)

// Upper bound on the transaction buffer size, in bytes.
#define MAX_TRANSACTION_BUFFER_SIZE      (512 * 1024)

// XXXdbg - a variable rather than a constant so that it can be
// changed in the debugger.  Zero means "not chosen yet".
int def_tbuffer_size = 0;
1112
1113
1114//
1115// This function sets the size of the tbuffer and the
1116// size of the blhdr. It assumes that jnl->jhdr->size
1117// and jnl->jhdr->jhdr_size are already valid.
1118//
1119static void
1120size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
1121{
1122 //
1123 // one-time initialization based on how much memory
1124 // there is in the machine.
1125 //
1126 if (def_tbuffer_size == 0) {
1127 if (mem_size < (256*1024*1024)) {
1128 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
1129 } else if (mem_size < (512*1024*1024)) {
1130 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
1131 } else if (mem_size < (1024*1024*1024)) {
1132 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
1133 } else if (mem_size >= (1024*1024*1024)) {
1134 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 4;
1135 }
1136 }
1137
1138 // size up the transaction buffer... can't be larger than the number
1139 // of blocks that can fit in a block_list_header block.
1140 if (tbuffer_size == 0) {
1141 jnl->tbuffer_size = def_tbuffer_size;
1142 } else {
1143 // make sure that the specified tbuffer_size isn't too small
1144 if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
1145 tbuffer_size = jnl->jhdr->blhdr_size * 2;
1146 }
1147 // and make sure it's an even multiple of the block size
1148 if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
1149 tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
1150 }
1151
1152 jnl->tbuffer_size = tbuffer_size;
1153 }
1154
1155 if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
1156 jnl->tbuffer_size = (jnl->jhdr->size / 2);
1157 }
1158
1159 if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
1160 jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
1161 }
1162
1163 jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
55e303ae
A
1164 if (jnl->jhdr->blhdr_size < phys_blksz) {
1165 jnl->jhdr->blhdr_size = phys_blksz;
1166 } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
1167 // have to round up so we're an even multiple of the physical block size
1168 jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
1169 }
b4c24cb9
A
1170}
1171
1172
1173
1174journal *
1175journal_create(struct vnode *jvp,
1176 off_t offset,
1177 off_t journal_size,
1178 struct vnode *fsvp,
1179 size_t min_fs_blksz,
1180 int32_t flags,
1181 int32_t tbuffer_size,
1182 void (*flush)(void *arg),
1183 void *arg)
1184{
1185 journal *jnl;
1186 int ret, phys_blksz;
1187
1188 /* Get the real physical block size. */
1189 if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) {
1190 return NULL;
1191 }
1192
1193 if (phys_blksz > min_fs_blksz) {
1194 printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
1195 phys_blksz, min_fs_blksz);
1196 return NULL;
1197 }
1198
1199 if ((journal_size % phys_blksz) != 0) {
1200 printf("jnl: create: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1201 journal_size, phys_blksz);
1202 return NULL;
1203 }
1204
d7e50217 1205 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
b4c24cb9
A
1206 memset(jnl, 0, sizeof(*jnl));
1207
1208 jnl->jdev = jvp;
1209 jnl->jdev_offset = offset;
1210 jnl->fsdev = fsvp;
1211 jnl->flush = flush;
1212 jnl->flush_arg = arg;
1213 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
1214 simple_lock_init(&jnl->old_start_lock);
1215
1216 if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1217 printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz);
1218 goto bad_kmem_alloc;
1219 }
1220
1221 memset(jnl->header_buf, 0, phys_blksz);
1222
1223 jnl->jhdr = (journal_header *)jnl->header_buf;
1224 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
1225 jnl->jhdr->endian = ENDIAN_MAGIC;
1226 jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself
1227 jnl->jhdr->end = phys_blksz;
1228 jnl->jhdr->size = journal_size;
1229 jnl->jhdr->jhdr_size = phys_blksz;
1230 size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1231
1232 jnl->active_start = jnl->jhdr->start;
1233
1234 // XXXdbg - for testing you can force the journal to wrap around
1235 // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
1236 // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);
1237
55e303ae 1238 lockinit(&jnl->jlock, PINOD, "journal", 0, 0);
b4c24cb9
A
1239
1240 if (write_journal_header(jnl) != 0) {
1241 printf("jnl: journal_create: failed to write journal header.\n");
1242 goto bad_write;
1243 }
1244
1245 return jnl;
1246
1247
1248 bad_write:
b4c24cb9
A
1249 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1250 bad_kmem_alloc:
1251 jnl->jhdr = NULL;
d7e50217 1252 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
b4c24cb9
A
1253 return NULL;
1254}
1255
1256
1257journal *
1258journal_open(struct vnode *jvp,
1259 off_t offset,
1260 off_t journal_size,
1261 struct vnode *fsvp,
1262 size_t min_fs_blksz,
1263 int32_t flags,
1264 int32_t tbuffer_size,
1265 void (*flush)(void *arg),
1266 void *arg)
1267{
1268 journal *jnl;
1269 int orig_blksz=0, phys_blksz, blhdr_size;
55e303ae 1270 int orig_checksum, checksum;
b4c24cb9
A
1271
1272 /* Get the real physical block size. */
1273 if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) {
1274 return NULL;
1275 }
1276
1277 if (phys_blksz > min_fs_blksz) {
1278 printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
1279 phys_blksz, min_fs_blksz);
1280 return NULL;
1281 }
1282
1283 if ((journal_size % phys_blksz) != 0) {
1284 printf("jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1285 journal_size, phys_blksz);
1286 return NULL;
1287 }
1288
d7e50217 1289 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
b4c24cb9
A
1290 memset(jnl, 0, sizeof(*jnl));
1291
1292 jnl->jdev = jvp;
1293 jnl->jdev_offset = offset;
1294 jnl->fsdev = fsvp;
1295 jnl->flush = flush;
1296 jnl->flush_arg = arg;
1297 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
1298 simple_lock_init(&jnl->old_start_lock);
1299
1300 if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1301 printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz);
1302 goto bad_kmem_alloc;
1303 }
1304
1305 jnl->jhdr = (journal_header *)jnl->header_buf;
1306 memset(jnl->jhdr, 0, sizeof(journal_header)+4);
1307
1308 // we have to set this up here so that do_journal_io() will work
1309 jnl->jhdr->jhdr_size = phys_blksz;
1310
55e303ae 1311 if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
b4c24cb9
A
1312 printf("jnl: open: could not read %d bytes for the journal header.\n",
1313 phys_blksz);
1314 goto bad_journal;
1315 }
1316
55e303ae
A
1317 orig_checksum = jnl->jhdr->checksum;
1318 jnl->jhdr->checksum = 0;
1319
1320 if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1321 // do this before the swap since it's done byte-at-a-time
1322 orig_checksum = SWAP32(orig_checksum);
1323 checksum = calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header));
1324 swap_journal_header(jnl);
1325 jnl->flags |= JOURNAL_NEED_SWAP;
1326 } else {
1327 checksum = calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header));
1328 }
1329
b4c24cb9
A
1330 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1331 printf("jnl: open: journal magic is bad (0x%x != 0x%x)\n",
1332 jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
1333 goto bad_journal;
1334 }
1335
1336 // only check if we're the current journal header magic value
1337 if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
b4c24cb9 1338
55e303ae
A
1339 if (orig_checksum != checksum) {
1340 printf("jnl: open: journal checksum is bad (0x%x != 0x%x)\n",
1341 orig_checksum, checksum);
1342
b4c24cb9
A
1343 //goto bad_journal;
1344 }
1345 }
1346
1347 // XXXdbg - convert old style magic numbers to the new one
1348 if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
1349 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
1350 }
1351
1352 if (phys_blksz != jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
1353 printf("jnl: open: phys_blksz %d does not match journal header size %d\n",
1354 phys_blksz, jnl->jhdr->jhdr_size);
1355
1356 orig_blksz = phys_blksz;
1357 phys_blksz = jnl->jhdr->jhdr_size;
1358 if (VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, FSCRED, NULL)) {
1359 printf("jnl: could not set block size to %d bytes.\n", phys_blksz);
1360 goto bad_journal;
1361 }
1362// goto bad_journal;
1363 }
1364
1365 if ( jnl->jhdr->start <= 0
1366 || jnl->jhdr->start > jnl->jhdr->size
55e303ae 1367 || jnl->jhdr->start > 1024*1024*1024) {
b4c24cb9
A
1368 printf("jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
1369 jnl->jhdr->start, jnl->jhdr->size);
1370 goto bad_journal;
1371 }
1372
1373 if ( jnl->jhdr->end <= 0
1374 || jnl->jhdr->end > jnl->jhdr->size
55e303ae 1375 || jnl->jhdr->end > 1024*1024*1024) {
b4c24cb9
A
1376 printf("jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
1377 jnl->jhdr->end, jnl->jhdr->size);
1378 goto bad_journal;
1379 }
1380
55e303ae 1381 if (jnl->jhdr->size > 1024*1024*1024) {
b4c24cb9
A
1382 printf("jnl: open: jhdr size looks bad (0x%llx)\n", jnl->jhdr->size);
1383 goto bad_journal;
1384 }
1385
1386// XXXdbg - can't do these checks because hfs writes all kinds of
1387// non-uniform sized blocks even on devices that have a block size
1388// that is larger than 512 bytes (i.e. optical media w/2k blocks).
1389// therefore these checks will fail and so we just have to punt and
1390// do more relaxed checking...
1391// XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
1392 if ((jnl->jhdr->start % 512) != 0) {
1393 printf("jnl: open: journal start (0x%llx) not a multiple of 512?\n",
1394 jnl->jhdr->start);
1395 goto bad_journal;
1396 }
1397
1398//XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
1399 if ((jnl->jhdr->end % 512) != 0) {
1400 printf("jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
1401 jnl->jhdr->end, jnl->jhdr->jhdr_size);
1402 goto bad_journal;
1403 }
1404
1405 // take care of replaying the journal if necessary
1406 if (flags & JOURNAL_RESET) {
1407 printf("jnl: journal start/end pointers reset! (jnl 0x%x; s 0x%llx e 0x%llx)\n",
1408 jnl, jnl->jhdr->start, jnl->jhdr->end);
1409 jnl->jhdr->start = jnl->jhdr->end;
1410 } else if (replay_journal(jnl) != 0) {
1411 printf("jnl: journal_open: Error replaying the journal!\n");
1412 goto bad_journal;
1413 }
1414
1415 if (orig_blksz != 0) {
1416 VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL);
1417 phys_blksz = orig_blksz;
55e303ae
A
1418 if (orig_blksz < jnl->jhdr->jhdr_size) {
1419 printf("jnl: open: jhdr_size is %d but orig phys blk size is %d. switching.\n",
1420 jnl->jhdr->jhdr_size, orig_blksz);
1421
1422 jnl->jhdr->jhdr_size = orig_blksz;
1423 }
b4c24cb9
A
1424 }
1425
1426 // make sure this is in sync!
1427 jnl->active_start = jnl->jhdr->start;
1428
1429 // set this now, after we've replayed the journal
1430 size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1431
55e303ae 1432 lockinit(&jnl->jlock, PINOD, "journal", 0, 0);
b4c24cb9
A
1433
1434 return jnl;
1435
1436 bad_journal:
1437 if (orig_blksz != 0) {
1438 phys_blksz = orig_blksz;
1439 VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL);
1440 }
1441 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1442 bad_kmem_alloc:
d7e50217 1443 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
b4c24cb9
A
1444 return NULL;
1445}
1446
1447void
1448journal_close(journal *jnl)
1449{
1450 volatile off_t *start, *end;
1451 int counter=0;
1452
1453 CHECK_JOURNAL(jnl);
1454
1455 // set this before doing anything that would block so that
1456 // we start tearing things down properly.
1457 //
1458 jnl->flags |= JOURNAL_CLOSE_PENDING;
1459
1460 if (jnl->owner != current_act()) {
1461 int ret;
1462
55e303ae 1463 ret = lockmgr(&jnl->jlock, LK_EXCLUSIVE|LK_RETRY, NULL, current_proc());
b4c24cb9 1464 if (ret != 0) {
55e303ae 1465 printf("jnl: close: locking the journal (0x%x) failed %d.\n", jnl, ret);
b4c24cb9
A
1466 return;
1467 }
1468 }
1469
1470 //
1471 // only write stuff to disk if the journal is still valid
1472 //
1473 if ((jnl->flags & JOURNAL_INVALID) == 0) {
1474
1475 if (jnl->active_tr) {
1476 journal_end_transaction(jnl);
1477 }
1478
1479 // flush any buffered transactions
1480 if (jnl->cur_tr) {
1481 transaction *tr = jnl->cur_tr;
1482
1483 jnl->cur_tr = NULL;
1484 end_transaction(tr, 1); // force it to get flushed
1485 }
1486
1487 //start = &jnl->jhdr->start;
1488 start = &jnl->active_start;
1489 end = &jnl->jhdr->end;
1490
1491 while (*start != *end && counter++ < 500) {
1492 printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
1493 if (jnl->flush) {
1494 jnl->flush(jnl->flush_arg);
1495 }
d7e50217 1496 tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 1);
b4c24cb9
A
1497 }
1498
1499 if (*start != *end) {
1500 printf("jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
1501 *start, *end);
1502 }
1503
1504 // make sure this is in sync when we close the journal
1505 jnl->jhdr->start = jnl->active_start;
1506
1507 // if this fails there's not much we can do at this point...
1508 write_journal_header(jnl);
1509 } else {
1510 // if we're here the journal isn't valid any more.
1511 // so make sure we don't leave any locked blocks lying around
1512 printf("jnl: close: journal 0x%x, is invalid. aborting outstanding transactions\n", jnl);
1513 if (jnl->active_tr || jnl->cur_tr) {
1514 transaction *tr;
1515 if (jnl->active_tr) {
1516 tr = jnl->active_tr;
1517 jnl->active_tr = NULL;
1518 } else {
1519 tr = jnl->cur_tr;
1520 jnl->cur_tr = NULL;
1521 }
1522
1523 abort_transaction(jnl, tr);
1524 if (jnl->active_tr || jnl->cur_tr) {
1525 panic("jnl: close: jnl @ 0x%x had both an active and cur tr\n", jnl);
1526 }
1527 }
1528 }
1529
1530 free_old_stuff(jnl);
1531
1532 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size);
1533 jnl->jhdr = (void *)0xbeefbabe;
1534
d7e50217 1535 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
b4c24cb9
A
1536}
1537
1538static void
1539dump_journal(journal *jnl)
1540{
1541 transaction *ctr;
1542
1543 printf("journal:");
1544 printf(" jdev_offset %.8llx\n", jnl->jdev_offset);
1545 printf(" magic: 0x%.8x\n", jnl->jhdr->magic);
1546 printf(" start: 0x%.8llx\n", jnl->jhdr->start);
1547 printf(" end: 0x%.8llx\n", jnl->jhdr->end);
1548 printf(" size: 0x%.8llx\n", jnl->jhdr->size);
1549 printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size);
1550 printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size);
1551 printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum);
1552
1553 printf(" completed transactions:\n");
1554 for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) {
1555 printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
1556 }
1557}
1558
1559
1560
1561static off_t
1562free_space(journal *jnl)
1563{
1564 off_t free_space;
1565
1566 if (jnl->jhdr->start < jnl->jhdr->end) {
1567 free_space = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
1568 } else if (jnl->jhdr->start > jnl->jhdr->end) {
1569 free_space = jnl->jhdr->start - jnl->jhdr->end;
1570 } else {
1571 // journal is completely empty
1572 free_space = jnl->jhdr->size - jnl->jhdr->jhdr_size;
1573 }
1574
1575 return free_space;
1576}
1577
1578
1579//
1580// The journal must be locked on entry to this function.
1581// The "desired_size" is in bytes.
1582//
1583static int
1584check_free_space(journal *jnl, int desired_size)
1585{
1586 int i, counter=0;
1587
1588 //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
1589// desired_size, free_space(jnl));
1590
1591 while (1) {
55e303ae
A
1592 int old_start_empty;
1593
b4c24cb9
A
1594 if (counter++ == 5000) {
1595 dump_journal(jnl);
1596 panic("jnl: check_free_space: buffer flushing isn't working "
1597 "(jnl @ 0x%x s %lld e %lld f %lld [active start %lld]).\n", jnl,
1598 jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
1599 }
1600 if (counter > 7500) {
1601 printf("jnl: check_free_space: giving up waiting for free space.\n");
1602 return ENOSPC;
1603 }
1604
1605 // make sure there's space in the journal to hold this transaction
1606 if (free_space(jnl) > desired_size) {
1607 break;
1608 }
1609
1610 //
1611 // here's where we lazily bump up jnl->jhdr->start. we'll consume
1612 // entries until there is enough space for the next transaction.
1613 //
55e303ae 1614 old_start_empty = 1;
b4c24cb9
A
1615 simple_lock(&jnl->old_start_lock);
1616 for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
1617 int counter;
1618
1619 counter = 0;
1620 while (jnl->old_start[i] & 0x8000000000000000LL) {
1621 if (counter++ > 100) {
1622 panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl 0x%x).\n",
1623 jnl->old_start[i], jnl);
1624 }
1625
1626 simple_unlock(&jnl->old_start_lock);
1627 if (jnl->flush) {
1628 jnl->flush(jnl->flush_arg);
1629 }
1630 tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
1631 simple_lock(&jnl->old_start_lock);
1632 }
1633
1634 if (jnl->old_start[i] == 0) {
1635 continue;
1636 }
1637
55e303ae 1638 old_start_empty = 0;
b4c24cb9
A
1639 jnl->jhdr->start = jnl->old_start[i];
1640 jnl->old_start[i] = 0;
1641 if (free_space(jnl) > desired_size) {
1642 write_journal_header(jnl);
1643 break;
1644 }
1645 }
1646 simple_unlock(&jnl->old_start_lock);
1647
1648 // if we bumped the start, loop and try again
1649 if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
1650 continue;
55e303ae
A
1651 } else if (old_start_empty) {
1652 //
1653 // if there is nothing in old_start anymore then we can
1654 // bump the jhdr->start to be the same as active_start
1655 // since it is possible there was only one very large
1656 // transaction in the old_start array. if we didn't do
1657 // this then jhdr->start would never get updated and we
1658 // would wind up looping until we hit the panic at the
1659 // start of the loop.
1660 //
1661 jnl->jhdr->start = jnl->active_start;
1662 write_journal_header(jnl);
1663 continue;
b4c24cb9
A
1664 }
1665
1666
1667 // if the file system gave us a flush function, call it to so that
1668 // it can flush some blocks which hopefully will cause some transactions
1669 // to complete and thus free up space in the journal.
1670 if (jnl->flush) {
1671 jnl->flush(jnl->flush_arg);
1672 }
1673
1674 // wait for a while to avoid being cpu-bound (this will
1675 // put us to sleep for 10 milliseconds)
1676 tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
1677 }
1678
1679 return 0;
1680}
1681
1682int
1683journal_start_transaction(journal *jnl)
1684{
1685 int ret;
1686 transaction *tr;
55e303ae 1687 int prev_priv;
b4c24cb9
A
1688
1689 CHECK_JOURNAL(jnl);
1690
1691 if (jnl->flags & JOURNAL_INVALID) {
1692 return EINVAL;
1693 }
1694
1695 if (jnl->owner == current_act()) {
1696 if (jnl->active_tr == NULL) {
1697 panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_act 0x%x\n",
1698 jnl, jnl->owner, current_act());
1699 }
1700 jnl->nested_count++;
1701 return 0;
1702 }
1703
55e303ae 1704 ret = lockmgr(&jnl->jlock, LK_EXCLUSIVE|LK_RETRY, NULL, current_proc());
b4c24cb9 1705 if (ret != 0) {
55e303ae 1706 printf("jnl: start_tr: locking the journal (0x%x) failed %d.\n", jnl, ret);
b4c24cb9
A
1707 return EINVAL;
1708 }
1709
1710 if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
1711 panic("jnl: start_tr: owner 0x%x, nested count 0x%x, active_tr 0x%x jnl @ 0x%x\n",
1712 jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
1713 }
1714
1715 jnl->owner = current_act();
1716 jnl->nested_count = 1;
1717
1718 free_old_stuff(jnl);
1719
1720 // make sure there's room in the journal
1721 if (check_free_space(jnl, jnl->tbuffer_size) != 0) {
1722 printf("jnl: start transaction failed: no space\n");
1723 ret = ENOSPC;
1724 goto bad_start;
1725 }
1726
1727 // if there's a buffered transaction, use it.
1728 if (jnl->cur_tr) {
1729 jnl->active_tr = jnl->cur_tr;
1730 jnl->cur_tr = NULL;
1731
1732 return 0;
1733 }
1734
d7e50217 1735 MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
b4c24cb9
A
1736 memset(tr, 0, sizeof(transaction));
1737
1738 tr->tbuffer_size = jnl->tbuffer_size;
55e303ae 1739 thread_wire_internal(host_priv_self(), current_act(), TRUE, &prev_priv);
b4c24cb9 1740 if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
d7e50217 1741 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
b4c24cb9
A
1742 printf("jnl: start transaction failed: no tbuffer mem\n");
1743 ret = ENOMEM;
55e303ae 1744 thread_wire_internal(host_priv_self(), current_act(), prev_priv, NULL);
b4c24cb9
A
1745 goto bad_start;
1746 }
55e303ae 1747 thread_wire_internal(host_priv_self(), current_act(), prev_priv, NULL);
b4c24cb9
A
1748
1749 // journal replay code checksum check depends on this.
1750 memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
1751
1752 tr->blhdr = (block_list_header *)tr->tbuffer;
1753 tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
1754 tr->blhdr->num_blocks = 1; // accounts for this header block
1755 tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
1756
1757 tr->num_blhdrs = 1;
1758 tr->total_bytes = jnl->jhdr->blhdr_size;
1759 tr->jnl = jnl;
1760
1761 jnl->active_tr = tr;
1762
1763 // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, tr);
1764
1765 return 0;
1766
1767 bad_start:
1768 jnl->owner = NULL;
1769 jnl->nested_count = 0;
55e303ae 1770 lockmgr(&jnl->jlock, LK_RELEASE, NULL, current_proc());
b4c24cb9
A
1771 return ret;
1772}
1773
1774
1775int
1776journal_modify_block_start(journal *jnl, struct buf *bp)
1777{
1778 transaction *tr;
1779
1780 CHECK_JOURNAL(jnl);
1781
1782 if (jnl->flags & JOURNAL_INVALID) {
1783 return EINVAL;
1784 }
1785
1786 // XXXdbg - for debugging I want this to be true. later it may
1787 // not be necessary.
1788 if ((bp->b_flags & B_META) == 0) {
1789 panic("jnl: modify_block_start: bp @ 0x%x is not a meta-data block! (jnl 0x%x)\n", bp, jnl);
1790 }
1791
1792 tr = jnl->active_tr;
1793 CHECK_TRANSACTION(tr);
1794
1795 if (jnl->owner != current_act()) {
1796 panic("jnl: modify_block_start: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1797 jnl, jnl->owner, current_act());
1798 }
1799
1800 free_old_stuff(jnl);
1801
1802 //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d; total bytes %d)\n",
1803 // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes);
1804
1805 // can't allow blocks that aren't an even multiple of the
1806 // underlying block size.
1807 if ((bp->b_bufsize % jnl->jhdr->jhdr_size) != 0) {
1808 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
1809 bp->b_bufsize, jnl->jhdr->jhdr_size);
1810 return -1;
1811 }
1812
1813 // make sure that this transaction isn't bigger than the whole journal
1814 if (tr->total_bytes+bp->b_bufsize >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
1815 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr 0x%x bp 0x%x)\n",
1816 tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), bp->b_bufsize, tr, bp);
1817 return -1;
1818 }
1819
1820 // if the block is dirty and not already locked we have to write
1821 // it out before we muck with it because it has data that belongs
1822 // (presumably) to another transaction.
1823 //
1824 if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) {
1825
1826 // this will cause it to not be brelse()'d
1827 bp->b_flags |= B_NORELSE;
1828 VOP_BWRITE(bp);
1829 }
1830
1831 bp->b_flags |= B_LOCKED;
1832
1833 return 0;
1834}
1835
1836int
1837journal_modify_block_abort(journal *jnl, struct buf *bp)
1838{
1839 transaction *tr;
1840 block_list_header *blhdr;
1841 int i, j;
1842
1843 CHECK_JOURNAL(jnl);
1844
1845 tr = jnl->active_tr;
1846
1847 //
1848 // if there's no active transaction then we just want to
1849 // call brelse() and return since this is just a block
1850 // that happened to be modified as part of another tr.
1851 //
1852 if (tr == NULL) {
1853 brelse(bp);
1854 return 0;
1855 }
1856
1857 if (jnl->flags & JOURNAL_INVALID) {
1858 return EINVAL;
1859 }
1860
1861 CHECK_TRANSACTION(tr);
1862
1863 if (jnl->owner != current_act()) {
1864 panic("jnl: modify_block_abort: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1865 jnl, jnl->owner, current_act());
1866 }
1867
1868 free_old_stuff(jnl);
1869
1870 // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
1871
1872 // first check if it's already part of this transaction
1873 for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
1874 for(i=1; i < blhdr->num_blocks; i++) {
1875 if (bp == blhdr->binfo[i].bp) {
1876 if (bp->b_bufsize != blhdr->binfo[i].bsize) {
1877 panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
1878 bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl);
1879 }
1880 break;
1881 }
1882 }
1883
1884 if (i < blhdr->num_blocks) {
1885 break;
1886 }
1887 }
1888
1889 //
1890 // if blhdr is null, then this block has only had modify_block_start
1891 // called on it as part of the current transaction. that means that
1892 // it is ok to clear the LOCKED bit since it hasn't actually been
1893 // modified. if blhdr is non-null then modify_block_end was called
1894 // on it and so we need to keep it locked in memory.
1895 //
1896 if (blhdr == NULL) {
1897 bp->b_flags &= ~(B_LOCKED);
1898 }
1899
1900 brelse(bp);
1901 return 0;
1902}
1903
1904
1905int
1906journal_modify_block_end(journal *jnl, struct buf *bp)
1907{
1908 int i, j, tbuffer_offset;
1909 char *blkptr;
1910 block_list_header *blhdr, *prev=NULL;
1911 transaction *tr;
1912
1913 CHECK_JOURNAL(jnl);
1914
1915 if (jnl->flags & JOURNAL_INVALID) {
1916 return EINVAL;
1917 }
1918
1919 tr = jnl->active_tr;
1920 CHECK_TRANSACTION(tr);
1921
1922 if (jnl->owner != current_act()) {
1923 panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
1924 jnl, jnl->owner, current_act());
1925 }
1926
1927 free_old_stuff(jnl);
1928
1929 //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d, total bytes %d)\n",
1930 // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes);
1931
1932 if ((bp->b_flags & B_LOCKED) == 0) {
1933 panic("jnl: modify_block_end: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
1934 bp->b_flags |= B_LOCKED;
1935 }
1936
1937 // first check if it's already part of this transaction
1938 for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
1939 tbuffer_offset = jnl->jhdr->blhdr_size;
1940
1941 for(i=1; i < blhdr->num_blocks; i++) {
1942 if (bp == blhdr->binfo[i].bp) {
1943 if (bp->b_bufsize != blhdr->binfo[i].bsize) {
1944 panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
1945 bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl);
1946 }
1947 break;
1948 }
1949 tbuffer_offset += blhdr->binfo[i].bsize;
1950 }
1951
1952 if (i < blhdr->num_blocks) {
1953 break;
1954 }
1955 }
1956
1957 if (blhdr == NULL
1958 && prev
1959 && (prev->num_blocks+1) <= prev->max_blocks
1960 && (prev->bytes_used+bp->b_bufsize) <= tr->tbuffer_size) {
1961 blhdr = prev;
1962 } else if (blhdr == NULL) {
1963 block_list_header *nblhdr;
55e303ae 1964 int prev_priv;
b4c24cb9
A
1965
1966 if (prev == NULL) {
1967 panic("jnl: modify block end: no way man, prev == NULL?!?, jnl 0x%x, bp 0x%x\n", jnl, bp);
1968 }
1969
1970 // we got to the end of the list, didn't find the block and there's
1971 // no room in the block_list_header pointed to by prev
1972
1973 // we allocate another tbuffer and link it in at the end of the list
1974 // through prev->binfo[0].bnum. that's a skanky way to do things but
1975 // avoids having yet another linked list of small data structures to manage.
1976
55e303ae 1977 thread_wire_internal(host_priv_self(), current_act(), TRUE, &prev_priv);
b4c24cb9
A
1978 if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
1979 panic("jnl: end_tr: no space for new block tr @ 0x%x (total bytes: %d)!\n",
1980 tr, tr->total_bytes);
1981 }
55e303ae 1982 thread_wire_internal(host_priv_self(), current_act(), prev_priv, NULL);
b4c24cb9
A
1983
1984 // journal replay code checksum check depends on this.
1985 memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
1986
1987 // initialize the new guy
1988 nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
1989 nblhdr->num_blocks = 1; // accounts for this header block
1990 nblhdr->bytes_used = jnl->jhdr->blhdr_size;
1991
1992 tr->num_blhdrs++;
1993 tr->total_bytes += jnl->jhdr->blhdr_size;
1994
1995 // then link him in at the end
1996 prev->binfo[0].bnum = (off_t)((long)nblhdr);
1997
1998 // and finally switch to using the new guy
1999 blhdr = nblhdr;
2000 tbuffer_offset = jnl->jhdr->blhdr_size;
2001 i = 1;
2002 }
2003
2004
2005 if ((i+1) > blhdr->max_blocks) {
2006 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
2007 }
2008
2009 // copy the data into the in-memory transaction buffer
2010 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
2011 memcpy(blkptr, bp->b_data, bp->b_bufsize);
2012
2013 // if this is true then this is a new block we haven't seen
2014 if (i >= blhdr->num_blocks) {
2015 vget(bp->b_vp, 0, current_proc());
2016
55e303ae 2017 blhdr->binfo[i].bnum = (off_t)((unsigned)bp->b_blkno);
b4c24cb9
A
2018 blhdr->binfo[i].bsize = bp->b_bufsize;
2019 blhdr->binfo[i].bp = bp;
2020
2021 blhdr->bytes_used += bp->b_bufsize;
2022 tr->total_bytes += bp->b_bufsize;
2023
2024 blhdr->num_blocks++;
2025 }
2026
2027 bdwrite(bp);
2028
2029 return 0;
2030}
2031
2032int
2033journal_kill_block(journal *jnl, struct buf *bp)
2034{
2035 int i;
2036 block_list_header *blhdr;
2037 transaction *tr;
2038
2039 CHECK_JOURNAL(jnl);
2040
2041 if (jnl->flags & JOURNAL_INVALID) {
2042 return EINVAL;
2043 }
2044
2045 tr = jnl->active_tr;
2046 CHECK_TRANSACTION(tr);
2047
2048 if (jnl->owner != current_act()) {
2049 panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
2050 jnl, jnl->owner, current_act());
2051 }
2052
2053 free_old_stuff(jnl);
2054
2055 if ((bp->b_flags & B_LOCKED) == 0) {
2056 panic("jnl: kill block: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
2057 }
2058
2059 // first check if it's already part of this transaction
2060 for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2061
2062 for(i=1; i < blhdr->num_blocks; i++) {
2063 if (bp == blhdr->binfo[i].bp) {
2064 bp->b_flags &= ~B_LOCKED;
2065
2066 // this undoes the vget() in journal_modify_block_end()
2067 vrele(bp->b_vp);
2068
2069 // if the block has the DELWRI and CALL bits sets, then
2070 // things are seriously weird. if it was part of another
2071 // transaction then journal_modify_block_start() should
2072 // have force it to be written.
2073 //
2074 if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_CALL)) {
2075 panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
2076 } else {
2077 tr->num_killed += bp->b_bufsize;
2078 }
2079
2080 if (bp->b_flags & B_BUSY) {
2081 brelse(bp);
2082 }
2083
2084 blhdr->binfo[i].bp = NULL;
2085 blhdr->binfo[i].bnum = (off_t)-1;
2086 break;
2087 }
2088 }
2089
2090 if (i < blhdr->num_blocks) {
2091 break;
2092 }
2093 }
2094
2095 return 0;
2096}
2097
2098
2099static int
2100journal_binfo_cmp(void *a, void *b)
2101{
2102 block_info *bi_a = (struct block_info *)a,
2103 *bi_b = (struct block_info *)b;
2104 daddr_t res;
2105
2106 if (bi_a->bp == NULL) {
2107 return 1;
2108 }
2109 if (bi_b->bp == NULL) {
2110 return -1;
2111 }
2112
2113 // don't have to worry about negative block
2114 // numbers so this is ok to do.
2115 //
2116 res = (bi_a->bp->b_blkno - bi_b->bp->b_blkno);
2117
2118 return (int)res;
2119}
2120
2121
2122static int
2123end_transaction(transaction *tr, int force_it)
2124{
2125 int i, j, ret, amt;
2126 off_t end;
2127 journal *jnl = tr->jnl;
2128 struct buf *bp;
2129 block_list_header *blhdr=NULL, *next=NULL;
2130
2131 if (jnl->cur_tr) {
2132 panic("jnl: jnl @ 0x%x already has cur_tr 0x%x, new tr: 0x%x\n",
2133 jnl, jnl->cur_tr, tr);
2134 }
2135
2136 // if there weren't any modified blocks in the transaction
2137 // just save off the transaction pointer and return.
2138 if (tr->total_bytes == jnl->jhdr->blhdr_size) {
2139 jnl->cur_tr = tr;
2140 return;
2141 }
2142
2143 // if our transaction buffer isn't very full, just hang
2144 // on to it and don't actually flush anything. this is
2145 // what is known as "group commit". we will flush the
2146 // transaction buffer if it's full or if we have more than
2147 // one of them so we don't start hogging too much memory.
2148 //
2149 if ( force_it == 0
2150 && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
2151 && tr->num_blhdrs < 3
2152 && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) {
2153
2154 jnl->cur_tr = tr;
2155 return;
2156 }
2157
2158
2159 // if we're here we're going to flush the transaction buffer to disk.
2160 // make sure there is room in the journal first.
2161 check_free_space(jnl, tr->total_bytes);
2162
2163 // range check the end index
2164 if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
2165 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
2166 jnl->jhdr->end, jnl->jhdr->size);
2167 }
2168
2169 // this transaction starts where the current journal ends
2170 tr->journal_start = jnl->jhdr->end;
2171 end = jnl->jhdr->end;
2172
2173 //
2174 // if the first entry in old_start[] isn't free yet, loop calling the
2175 // file system flush routine until it is (or we panic).
2176 //
2177 i = 0;
2178 simple_lock(&jnl->old_start_lock);
2179 while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
2180 if (jnl->flush) {
2181 simple_unlock(&jnl->old_start_lock);
2182
2183 if (jnl->flush) {
2184 jnl->flush(jnl->flush_arg);
2185 }
2186
2187 // yield the cpu so others can get in to clear the lock bit
2188 (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);
2189
2190 simple_lock(&jnl->old_start_lock);
2191 }
2192 if (i++ >= 100) {
2193 panic("jnl: transaction that started at 0x%llx is not completing! jnl 0x%x\n",
2194 jnl->old_start[0] & (~0x8000000000000000LL), jnl);
2195 }
2196 }
2197
2198 //
2199 // slide everyone else down and put our latest guy in the last
2200 // entry in the old_start array
2201 //
2202 memcpy(&jnl->old_start[0], &jnl->old_start[1], sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
2203 jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
2204
2205 simple_unlock(&jnl->old_start_lock);
2206
2207
2208 // for each block, make sure that the physical block # is set
2209 for(blhdr=tr->blhdr; blhdr; blhdr=next) {
2210
2211 for(i=1; i < blhdr->num_blocks; i++) {
2212
2213 bp = blhdr->binfo[i].bp;
2214 if (bp == NULL) { // only true if a block was "killed"
2215 if (blhdr->binfo[i].bnum != (off_t)-1) {
2216 panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ 0x%x, tr 0x%x)\n",
2217 blhdr->binfo[i].bnum, jnl, tr);
2218 }
2219 continue;
2220 }
2221
2222 if (bp->b_vp == NULL && bp->b_lblkno == bp->b_blkno) {
2223 panic("jnl: end_tr: DANGER! bp @ 0x%x w/null vp and l/blkno = %d/%d\n",
2224 bp, bp->b_lblkno, bp->b_blkno);
2225 }
2226
2227 // if the lblkno is the same as blkno and this bp isn't
2228 // associated with the underlying file system device then
2229 // we need to call bmap() to get the actual physical block.
2230 //
2231 if ((bp->b_lblkno == bp->b_blkno) && (bp->b_vp != jnl->fsdev)) {
2232 if (VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL) != 0) {
2233 printf("jnl: end_tr: can't bmap the bp @ 0x%x, jnl 0x%x\n", bp, jnl);
2234 goto bad_journal;
2235 }
2236 }
2237
2238 // update this so we write out the correct physical block number!
55e303ae 2239 blhdr->binfo[i].bnum = (off_t)((unsigned)bp->b_blkno);
b4c24cb9
A
2240 }
2241
2242 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2243 }
2244
2245 for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
2246
2247 amt = blhdr->bytes_used;
2248
2249 blhdr->checksum = 0;
2250 blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
2251
2252 ret = write_journal_data(jnl, &end, blhdr, amt);
2253 if (ret != amt) {
2254 printf("jnl: end_transaction: only wrote %d of %d bytes to the journal!\n",
2255 ret, amt);
2256
2257 goto bad_journal;
2258 }
2259 }
2260
2261 jnl->jhdr->end = end; // update where the journal now ends
2262 tr->journal_end = end; // the transaction ends here too
2263 if (tr->journal_start == 0 || tr->journal_end == 0) {
2264 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
2265 tr->journal_start, tr->journal_end);
2266 }
2267
2268 if (write_journal_header(jnl) != 0) {
2269 goto bad_journal;
2270 }
2271
2272 //
2273 // setup for looping through all the blhdr's. we null out the
2274 // tbuffer and blhdr fields so that they're not used any more.
2275 //
2276 blhdr = tr->blhdr;
2277 tr->tbuffer = NULL;
2278 tr->blhdr = NULL;
2279
2280 // the buffer_flushed_callback will only be called for the
2281 // real blocks that get flushed so we have to account for
2282 // the block_list_headers here.
2283 //
2284 tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
2285
2286 // for each block, set the iodone callback and unlock it
2287 for(; blhdr; blhdr=next) {
2288
2289 // we can re-order the buf ptrs because everything is written out already
2290 qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp);
2291
2292 for(i=1; i < blhdr->num_blocks; i++) {
2293 if (blhdr->binfo[i].bp == NULL) {
2294 continue;
2295 }
2296
2297 ret = meta_bread(blhdr->binfo[i].bp->b_vp,
2298 (daddr_t)blhdr->binfo[i].bp->b_lblkno,
2299 blhdr->binfo[i].bp->b_bufsize,
2300 NOCRED,
2301 &bp);
2302 if (ret == 0 && bp != NULL) {
2303 struct vnode *save_vp;
2304
2305 if (bp != blhdr->binfo[i].bp) {
2306 panic("jnl: end_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
2307 bp, blhdr->binfo[i].bp, jnl);
2308 }
2309
2310 if ((bp->b_flags & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
2311 if (jnl->flags & JOURNAL_CLOSE_PENDING) {
2312 brelse(bp);
2313 continue;
2314 } else {
2315 panic("jnl: end_tr: !!!DANGER!!! bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp, bp->b_flags);
2316 }
2317 }
2318
2319 if (bp->b_iodone != NULL) {
2320 panic("jnl: bp @ 0x%x (blkno %d, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n",
2321 bp, bp->b_blkno, bp->b_vp, bp->b_iodone, buffer_flushed_callback);
2322 }
2323
2324 save_vp = bp->b_vp;
2325
2326 bp->b_iodone = buffer_flushed_callback;
2327 bp->b_transaction = tr;
2328 bp->b_flags |= B_CALL;
2329 bp->b_flags &= ~(B_LOCKED);
2330
2331 // kicking off the write here helps performance
2332 bawrite(bp);
2333 // XXXdbg this is good for testing: bdwrite(bp);
2334 //bdwrite(bp);
2335
2336 // this undoes the vget() in journal_modify_block_end()
2337 vrele(save_vp);
2338
2339 } else {
2340 printf("jnl: end_transaction: could not find block %Ld vp 0x%x!\n",
2341 blhdr->binfo[i].bnum, blhdr->binfo[i].bp);
d7e50217
A
2342 if (bp) {
2343 brelse(bp);
2344 }
b4c24cb9
A
2345 }
2346 }
2347
2348 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2349
2350 // we can free blhdr here since we won't need it any more
2351 blhdr->binfo[0].bnum = 0xdeadc0de;
2352 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
2353 }
2354
2355 //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
2356 // tr, tr->journal_start, tr->journal_end);
2357 return 0;
2358
2359
2360 bad_journal:
2361 jnl->flags |= JOURNAL_INVALID;
2362 abort_transaction(jnl, tr);
2363 return -1;
2364}
2365
2366static void
2367abort_transaction(journal *jnl, transaction *tr)
2368{
2369 int i, ret;
2370 block_list_header *blhdr, *next;
2371 struct buf *bp;
55e303ae 2372 struct vnode *save_vp;
b4c24cb9
A
2373
2374 // for each block list header, iterate over the blocks then
2375 // free up the memory associated with the block list.
2376 //
2377 // for each block, clear the lock bit and release it.
2378 //
2379 for(blhdr=tr->blhdr; blhdr; blhdr=next) {
2380
2381 for(i=1; i < blhdr->num_blocks; i++) {
2382 if (blhdr->binfo[i].bp == NULL) {
2383 continue;
2384 }
2385
2386 ret = meta_bread(blhdr->binfo[i].bp->b_vp,
2387 (daddr_t)blhdr->binfo[i].bp->b_lblkno,
2388 blhdr->binfo[i].bp->b_bufsize,
2389 NOCRED,
2390 &bp);
d7e50217 2391 if (ret == 0) {
b4c24cb9
A
2392 if (bp != blhdr->binfo[i].bp) {
2393 panic("jnl: abort_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
2394 bp, blhdr->binfo[i].bp, jnl);
2395 }
2396
2397 // clear the locked bit and the delayed-write bit. we
2398 // don't want these blocks going to disk.
2399 bp->b_flags &= ~(B_LOCKED|B_DELWRI);
2400 bp->b_flags |= B_INVAL;
55e303ae 2401 save_vp = bp->b_vp;
b4c24cb9
A
2402
2403 brelse(bp);
2404
55e303ae
A
2405 vrele(save_vp);
2406
b4c24cb9
A
2407 } else {
2408 printf("jnl: abort_tr: could not find block %Ld vp 0x%x!\n",
2409 blhdr->binfo[i].bnum, blhdr->binfo[i].bp);
d7e50217
A
2410 if (bp) {
2411 brelse(bp);
2412 }
b4c24cb9
A
2413 }
2414 }
2415
2416 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
2417
2418 // we can free blhdr here since we won't need it any more
2419 blhdr->binfo[0].bnum = 0xdeadc0de;
2420 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
2421 }
2422
2423 tr->tbuffer = NULL;
2424 tr->blhdr = NULL;
2425 tr->total_bytes = 0xdbadc0de;
d7e50217 2426 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
b4c24cb9
A
2427}
2428
2429
2430int
2431journal_end_transaction(journal *jnl)
2432{
2433 int ret;
2434 transaction *tr;
2435
2436 CHECK_JOURNAL(jnl);
2437
2438 if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
2439 return 0;
2440 }
2441
2442 if (jnl->owner != current_act()) {
2443 panic("jnl: end_tr: I'm not the owner! jnl 0x%x, owner 0x%x, curact 0x%x\n",
2444 jnl, jnl->owner, current_act());
2445 }
2446
2447 free_old_stuff(jnl);
2448
2449 jnl->nested_count--;
2450 if (jnl->nested_count > 0) {
2451 return 0;
2452 } else if (jnl->nested_count < 0) {
2453 panic("jnl: jnl @ 0x%x has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
2454 }
2455
2456 if (jnl->flags & JOURNAL_INVALID) {
2457 if (jnl->active_tr) {
2458 transaction *tr;
2459
2460 if (jnl->cur_tr != NULL) {
2461 panic("jnl: journal @ 0x%x has active tr (0x%x) and cur tr (0x%x)\n",
2462 jnl, jnl->active_tr, jnl->cur_tr);
2463 }
2464
2465 tr = jnl->active_tr;
2466 jnl->active_tr = NULL;
2467 abort_transaction(jnl, tr);
2468 }
2469
2470 jnl->owner = NULL;
55e303ae 2471 lockmgr(&jnl->jlock, LK_RELEASE, NULL, current_proc());
b4c24cb9
A
2472
2473 return EINVAL;
2474 }
2475
2476 tr = jnl->active_tr;
2477 CHECK_TRANSACTION(tr);
2478
2479 // clear this out here so that when check_free_space() calls
2480 // the FS flush function, we don't panic in journal_flush()
2481 // if the FS were to call that. note: check_free_space() is
2482 // called from end_transaction().
2483 //
2484 jnl->active_tr = NULL;
2485 ret = end_transaction(tr, 0);
2486
2487 jnl->owner = NULL;
55e303ae 2488 lockmgr(&jnl->jlock, LK_RELEASE, NULL, current_proc());
b4c24cb9
A
2489
2490 return ret;
2491}
2492
2493
2494int
2495journal_flush(journal *jnl)
2496{
2497 int need_signal = 0;
2498
2499 CHECK_JOURNAL(jnl);
2500
2501 if (jnl->flags & JOURNAL_INVALID) {
2502 return -1;
2503 }
2504
2505 if (jnl->owner != current_act()) {
2506 int ret;
2507
55e303ae 2508 ret = lockmgr(&jnl->jlock, LK_EXCLUSIVE|LK_RETRY, NULL, current_proc());
b4c24cb9 2509 if (ret != 0) {
55e303ae 2510 printf("jnl: flush: locking the journal (0x%x) failed %d.\n", jnl, ret);
b4c24cb9
A
2511 return -1;
2512 }
2513 need_signal = 1;
2514 }
2515
2516 free_old_stuff(jnl);
2517
2518 // if we're not active, flush any buffered transactions
2519 if (jnl->active_tr == NULL && jnl->cur_tr) {
2520 transaction *tr = jnl->cur_tr;
2521
2522 jnl->cur_tr = NULL;
2523 end_transaction(tr, 1); // force it to get flushed
2524 }
2525
2526 if (need_signal) {
55e303ae 2527 lockmgr(&jnl->jlock, LK_RELEASE, NULL, current_proc());
b4c24cb9
A
2528 }
2529
2530 return 0;
2531}
2532
2533int
2534journal_active(journal *jnl)
2535{
2536 if (jnl->flags & JOURNAL_INVALID) {
2537 return -1;
2538 }
2539
2540 return (jnl->active_tr == NULL) ? 0 : 1;
2541}