1 /*
2 * Copyright (c) 2002-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 //
29 // This file implements a simple write-ahead journaling layer.
30 // In theory any file system can make use of it by calling these
31 // functions when the fs wants to modify meta-data blocks. See
32 // vfs_journal.h for a more detailed description of the api and
33 // data structures.
34 //
35 // Dominic Giampaolo (dbg@apple.com)
36 //
37
38 #ifdef KERNEL
39
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/file_internal.h>
44 #include <sys/stat.h>
45 #include <sys/buf_internal.h>
46 #include <sys/proc_internal.h>
47 #include <sys/mount_internal.h>
48 #include <sys/namei.h>
49 #include <sys/vnode_internal.h>
50 #include <sys/ioctl.h>
51 #include <sys/tty.h>
52 #include <sys/ubc.h>
53 #include <sys/malloc.h>
54 #include <kern/task.h>
55 #include <kern/thread.h>
56 #include <kern/kalloc.h>
57 #include <sys/disk.h>
58 #include <sys/kdebug.h>
59 #include <miscfs/specfs/specdev.h>
60 #include <libkern/OSAtomic.h> /* OSAddAtomic */
61
62 kern_return_t thread_terminate(thread_t);
63
64 /*
65 * Set sysctl vfs.generic.jnl.kdebug.trim=1 to enable KERNEL_DEBUG_CONSTANT
66 * logging of trim-related calls within the journal. (They're
67 * disabled by default because there can be a lot of these events,
68 * and we don't want to overwhelm the kernel debug buffer. If you
69 * want to watch these events in particular, just set the sysctl.)
70 */
71 static int jnl_kdebug = 0;
72 SYSCTL_DECL(_vfs_generic);
73 SYSCTL_NODE(_vfs_generic, OID_AUTO, jnl, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal");
74 SYSCTL_NODE(_vfs_generic_jnl, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal kdebug");
75 SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &jnl_kdebug, 0, "Enable kdebug logging for journal TRIM");
76
77 #define DBG_JOURNAL_FLUSH FSDBG_CODE(DBG_JOURNAL, 1)
78 #define DBG_JOURNAL_TRIM_ADD FSDBG_CODE(DBG_JOURNAL, 2)
79 #define DBG_JOURNAL_TRIM_REMOVE FSDBG_CODE(DBG_JOURNAL, 3)
80 #define DBG_JOURNAL_TRIM_REMOVE_PENDING FSDBG_CODE(DBG_JOURNAL, 4)
81 #define DBG_JOURNAL_TRIM_REALLOC FSDBG_CODE(DBG_JOURNAL, 5)
82 #define DBG_JOURNAL_TRIM_FLUSH FSDBG_CODE(DBG_JOURNAL, 6)
83 #define DBG_JOURNAL_TRIM_UNMAP FSDBG_CODE(DBG_JOURNAL, 7)
84
85 /*
86 * Cap the journal max size to 2GB. On HFS, it will attempt to occupy
87 * a full allocation block if the current size is smaller than the allocation
88 * block on which it resides. Once we hit the exabyte filesystem range, then
89 * it will use 2GB allocation blocks. As a result, make the cap 2GB.
90 */
91 #define MAX_JOURNAL_SIZE 0x80000000U
92
93 #include <sys/sdt.h> /* DTRACE_IO1 */
94 #else
95
96 #include <stdio.h>
97 #include <stdlib.h>
98 #include <string.h>
99 #include <limits.h>
100 #include <errno.h>
101 #include <fcntl.h>
102 #include <unistd.h>
103 #include <stdarg.h>
104 #include <sys/types.h>
105 #include "compat.h"
106
107 #endif /* KERNEL */
108
109 #include "vfs_journal.h"
110
111 #include <sys/kdebug.h>
112
113 #if 0
114 #undef KERNEL_DEBUG
115 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
116 #endif
117
118
119 #ifndef CONFIG_HFS_TRIM
120 #define CONFIG_HFS_TRIM 0
121 #endif
122
123
124 #if JOURNALING
125
126 //
127 // By default, we grow the list of extents to trim by one page at a time.
128 // We'll opt to flush a transaction if it contains at least
129 // JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number
130 // of modified blocks is small).
131 //
132 enum {
133 JOURNAL_DEFAULT_TRIM_BYTES = PAGE_SIZE,
134 JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t),
135 JOURNAL_FLUSH_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16
136 };
137
138 unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS;
139 SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush");
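
//
// Illustrative arithmetic (an editor's sketch, not part of the original
// source), assuming PAGE_SIZE == 4096 and sizeof(dk_extent_t) == 16:
//
//     JOURNAL_DEFAULT_TRIM_EXTENTS = 4096 / 16     = 256
//     JOURNAL_FLUSH_TRIM_EXTENTS   = 256 * 15 / 16 = 240
//
// i.e. a transaction that accumulates 240 or more extents to trim will
// be flushed even if it has dirtied only a few metadata blocks.
//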
140
141 /* XXX next prototype should be from <libsa/stdlib.h> but conflicts with libkern */
142 __private_extern__ void qsort(
143 void * array,
144 size_t nmembers,
145 size_t member_size,
146 int (*)(const void *, const void *));
147
148
149
150 // number of bytes to checksum in a block_list_header
151 // NOTE: this should be enough to clear out the header
152 // fields as well as the first entry of binfo[]
153 #define BLHDR_CHECKSUM_SIZE 32
154
155 static void lock_condition(journal *jnl, boolean_t *condition, const char *condition_name);
156 static void wait_condition(journal *jnl, boolean_t *condition, const char *condition_name);
157 static void unlock_condition(journal *jnl, boolean_t *condition);
158 static void finish_end_thread(transaction *tr);
159 static void write_header_thread(journal *jnl);
160 static int finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg);
161 static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait);
162 static void abort_transaction(journal *jnl, transaction *tr);
163 static void dump_journal(journal *jnl);
164
165 static __inline__ void lock_journal(journal *jnl);
166 static __inline__ void unlock_journal(journal *jnl);
167 static __inline__ void lock_oldstart(journal *jnl);
168 static __inline__ void unlock_oldstart(journal *jnl);
169 static __inline__ void lock_flush(journal *jnl);
170 static __inline__ void unlock_flush(journal *jnl);
171
172
173 //
174 // 3105942 - Coalesce writes to the same block on journal replay
175 //
176
177 typedef struct bucket {
178 off_t block_num;
179 uint32_t jnl_offset;
180 uint32_t block_size;
181 int32_t cksum;
182 } bucket;
183
184 #define STARTING_BUCKETS 256
185
186 static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
187 static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size);
188 static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full);
189 static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
190 static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting);
191
192 #define CHECK_JOURNAL(jnl) \
193 do { \
194 if (jnl == NULL) { \
195 panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \
196 } \
197 if (jnl->jdev == NULL) { \
198 panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \
199 } \
200 if (jnl->fsdev == NULL) { \
201 panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \
202 } \
203 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) { \
204 panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \
205 __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \
206 } \
207 if ( jnl->jhdr->start <= 0 \
208 || jnl->jhdr->start > jnl->jhdr->size) { \
209 panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
210 __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \
211 } \
212 if ( jnl->jhdr->end <= 0 \
213 || jnl->jhdr->end > jnl->jhdr->size) { \
214 panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
215 __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \
216 } \
217 } while(0)
218
219 #define CHECK_TRANSACTION(tr) \
220 do { \
221 if (tr == NULL) { \
222 panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \
223 } \
224 if (tr->jnl == NULL) { \
225 panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \
226 } \
227 if (tr->blhdr != (block_list_header *)tr->tbuffer) { \
228 panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \
229 } \
230 if (tr->total_bytes < 0) { \
231 panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \
232 } \
233 if (tr->journal_start < 0) { \
234 panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \
235 } \
236 if (tr->journal_end < 0) { \
237 panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \
238 } \
239 if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) { \
240 panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \
241 } \
242 } while(0)
243
244
245
246 //
247 // this isn't a great checksum routine but it will do for now.
248 // we use it to checksum the journal header and the block list
249 // headers that are at the start of each transaction.
250 //
251 static unsigned int
252 calc_checksum(char *ptr, int len)
253 {
254 int i;
255 unsigned int cksum=0;
256
257 // this is a lame checksum but for now it'll do
258 for(i = 0; i < len; i++, ptr++) {
259 cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
260 }
261
262 return (~cksum);
263 }
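
//
// Sketch (editor's illustration, not part of the original source) of the
// verify-on-read pattern used with calc_checksum() during replay: the
// stored checksum field is zeroed before recomputing, because it was
// zero when the on-disk checksum was originally calculated.
//
#if 0
static int
blhdr_checksum_ok(block_list_header *blhdr)
{
	int32_t stored = blhdr->checksum;
	int32_t fresh;

	blhdr->checksum = 0;
	fresh = (int32_t)calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
	blhdr->checksum = stored;

	return (fresh == stored);
}
#endif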
264
265 //
266 // Journal Locking
267 //
268 lck_grp_attr_t * jnl_group_attr;
269 lck_attr_t * jnl_lock_attr;
270 lck_grp_t * jnl_mutex_group;
271
272 void
273 journal_init(void)
274 {
275 jnl_lock_attr = lck_attr_alloc_init();
276 jnl_group_attr = lck_grp_attr_alloc_init();
277 jnl_mutex_group = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
278 }
279
280 static __inline__ void
281 lock_journal(journal *jnl)
282 {
283 lck_mtx_lock(&jnl->jlock);
284 }
285
286 static __inline__ void
287 unlock_journal(journal *jnl)
288 {
289 lck_mtx_unlock(&jnl->jlock);
290 }
291
292 static __inline__ void
293 lock_flush(journal *jnl)
294 {
295 lck_mtx_lock(&jnl->flock);
296 }
297
298 static __inline__ void
299 unlock_flush(journal *jnl)
300 {
301 lck_mtx_unlock(&jnl->flock);
302 }
303
304 static __inline__ void
305 lock_oldstart(journal *jnl)
306 {
307 lck_mtx_lock(&jnl->old_start_lock);
308 }
309
310 static __inline__ void
311 unlock_oldstart(journal *jnl)
312 {
313 lck_mtx_unlock(&jnl->old_start_lock);
314 }
315
316
317
318 #define JNL_WRITE 0x0001
319 #define JNL_READ 0x0002
320 #define JNL_HEADER 0x8000
321
322 //
323 // This function sets up a fake buf and passes it directly to the
324 // journal device strategy routine (so that it won't get cached in
325 // the block cache).
326 //
327 // It also handles range checking the i/o so that we don't write
328 // outside the journal boundaries and it will wrap the i/o back
329 // to the beginning if necessary (skipping over the journal header)
330 //
331 static size_t
332 do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
333 {
334 int err, curlen=len;
335 size_t io_sz = 0;
336 buf_t bp;
337 off_t max_iosize;
338
339 if (*offset < 0 || *offset > jnl->jhdr->size) {
340 panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
341 }
342
343 if (direction & JNL_WRITE)
344 max_iosize = jnl->max_write_size;
345 else if (direction & JNL_READ)
346 max_iosize = jnl->max_read_size;
347 else
348 max_iosize = 128 * 1024;
349
350 again:
351 bp = alloc_io_buf(jnl->jdev, 1);
352
353 if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
354 if (*offset == jnl->jhdr->size) {
355 *offset = jnl->jhdr->jhdr_size;
356 } else {
357 curlen = (off_t)jnl->jhdr->size - *offset;
358 }
359 }
360
361 if (curlen > max_iosize) {
362 curlen = max_iosize;
363 }
364
365 if (curlen <= 0) {
366 panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %zd\n", curlen, *offset, len);
367 }
368
369 if (*offset == 0 && (direction & JNL_HEADER) == 0) {
370 panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
371 }
372
373 if (direction & JNL_READ)
374 buf_setflags(bp, B_READ);
375 else {
376 /*
377 * don't have to set any flags
378 */
379 vnode_startwrite(jnl->jdev);
380 }
381 buf_setsize(bp, curlen);
382 buf_setcount(bp, curlen);
383 buf_setdataptr(bp, (uintptr_t)data);
384 buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
385 buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
386
387 if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) {
388 buf_markfua(bp);
389 }
390
391 DTRACE_IO1(journal__start, buf_t, bp);
392 err = VNOP_STRATEGY(bp);
393 if (!err) {
394 err = (int)buf_biowait(bp);
395 }
396 DTRACE_IO1(journal__done, buf_t, bp);
397 free_io_buf(bp);
398
399 if (err) {
400 printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err);
401 return 0;
402 }
403
404 *offset += curlen;
405 io_sz += curlen;
406
407 if (io_sz != len) {
408 // handle wrap-around
409 data = (char *)data + curlen;
410 curlen = len - io_sz;
411 if (*offset >= jnl->jhdr->size) {
412 *offset = jnl->jhdr->jhdr_size;
413 }
414 goto again;
415 }
416
417 return io_sz;
418 }
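
//
// Sketch (editor's illustration, not part of the original source) of the
// wrap-around rule do_journal_io() implements: valid journal offsets live
// in [jhdr_size, size), and an offset that runs off the end wraps back to
// just past the journal header, never to offset 0.
//
#if 0
static off_t
wrap_journal_offset(journal *jnl, off_t offset)
{
	if (offset >= jnl->jhdr->size) {
		offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
	}
	return offset;
}
#endif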
419
420 static size_t
421 read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
422 {
423 return do_journal_io(jnl, offset, data, len, JNL_READ);
424 }
425
426 static size_t
427 write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
428 {
429 return do_journal_io(jnl, offset, data, len, JNL_WRITE);
430 }
431
432
433 static size_t
434 read_journal_header(journal *jnl, void *data, size_t len)
435 {
436 off_t hdr_offset = 0;
437
438 return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER);
439 }
440
441 static int
442 write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num)
443 {
444 static int num_err_prints = 0;
445 int ret=0;
446 off_t jhdr_offset = 0;
447 struct vfs_context context;
448
449 context.vc_thread = current_thread();
450 context.vc_ucred = NOCRED;
451 //
452 // Flush the track cache if we're not doing force-unit-access
453 // writes.
454 //
455 if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
456 ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
457 }
458 if (ret != 0) {
459 //
460 // Only print this error if it's a different error than the
461 // previous one, or if it's the first time for this device
462 // or if the total number of printfs is less than 25. We
463 // allow for up to 25 printfs to ensure that some make it
464 // into the on-disk syslog. Otherwise if we only printed
465 // one, it's possible it would never make it to the syslog
466 // for the root volume and that makes debugging hard.
467 //
468 if ( ret != jnl->last_flush_err
469 || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
470 || num_err_prints++ < 25) {
471
472 printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret);
473
474 jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
475 jnl->last_flush_err = ret;
476 }
477 }
478
479 jnl->jhdr->sequence_num = sequence_num;
480 jnl->jhdr->checksum = 0;
481 jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
482
483 if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) {
484 printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name);
485 jnl->flags |= JOURNAL_INVALID;
486 return -1;
487 }
488
489 // If we're not doing force-unit-access writes, then we
490 // have to flush after writing the journal header so that
491 // a future transaction doesn't sneak out to disk before
492 // the header does and thus overwrite data that the old
493 // journal header refers to. Saw this exact case happen
494 // on an IDE bus analyzer with Larry Barras so while it
495 // may seem obscure, it's not.
496 //
497 if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
498 VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
499 }
500
501 return 0;
502 }
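
//
// Editor's summary (not part of the original source) of the ordering
// write_journal_header() enforces when FUA writes are unavailable:
//
//     1. DKIOCSYNCHRONIZECACHE -- push prior transaction writes to media
//        (skipped when we are only moving the start pointer forward)
//     2. write the header block itself
//     3. DKIOCSYNCHRONIZECACHE -- when updating "start", make sure the
//        header lands before any future transaction can overwrite space
//        the old header still referenced
//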
503
504
505
506 //
507 // this is a work function used to free up transactions that
508 // completed. they can't be free'd from buffer_flushed_callback
509 // because it is called from deep within the disk driver stack
510 // and thus can't do something that would potentially cause
511 // paging. it gets called by each of the journal api entry
512 // points so stuff shouldn't hang around for too long.
513 //
514 static void
515 free_old_stuff(journal *jnl)
516 {
517 transaction *tr, *next;
518 block_list_header *blhdr=NULL, *next_blhdr=NULL;
519
520 if (jnl->tr_freeme == NULL)
521 return;
522
523 lock_oldstart(jnl);
524 tr = jnl->tr_freeme;
525 jnl->tr_freeme = NULL;
526 unlock_oldstart(jnl);
527
528 for(; tr; tr=next) {
529 for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) {
530 next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum);
531 blhdr->binfo[0].bnum = 0xdeadc0de;
532
533 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
534
535 KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0);
536 }
537 next = tr->next;
538 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
539 }
540 }
541
542
543
544 //
545 // This is our callback that lets us know when a buffer has been
546 // flushed to disk. It's called from deep within the driver stack
547 // and thus is quite limited in what it can do. Notably, it can
548 // not initiate any new i/o's or allocate/free memory.
549 //
550 static void
551 buffer_flushed_callback(struct buf *bp, void *arg)
552 {
553 transaction *tr;
554 journal *jnl;
555 transaction *ctr, *prev=NULL, *next;
556 size_t i;
557 int bufsize, amt_flushed, total_bytes;
558
559
560 //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n",
561 // bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg);
562
563 // snarf out the bits we want
564 bufsize = buf_size(bp);
565 tr = (transaction *)arg;
566
567 // if tr is null, then we've already seen this buffer
568 if (tr == NULL) {
569 return;
570 }
571
572 CHECK_TRANSACTION(tr);
573
574 jnl = tr->jnl;
575 if (jnl->flags & JOURNAL_INVALID) {
576 return;
577 }
578
579 CHECK_JOURNAL(jnl);
580
581 amt_flushed = tr->num_killed;
582 total_bytes = tr->total_bytes;
583
584 // update the number of blocks that have been flushed.
585 // this buf may represent more than one block so take
586 // that into account.
587 //
588 // OSAddAtomic() returns the value of tr->num_flushed before the add
589 //
590 amt_flushed += OSAddAtomic(bufsize, &tr->num_flushed);
591
592
593 // if this transaction isn't done yet, just return as
594 // there is nothing to do.
595 //
596 // NOTE: we are careful to not reference anything through
597 // the tr pointer after doing the OSAddAtomic(). if
598 // the test below fails then we are the last one
599 // and it is then safe to dereference "tr".
600 //
601 if ((amt_flushed + bufsize) < total_bytes) {
602 return;
603 }
604
605 // this will single thread checking the transaction
606 lock_oldstart(jnl);
607
608 if (tr->total_bytes == (int)0xfbadc0de) {
609 // then someone beat us to it...
610 unlock_oldstart(jnl);
611 return;
612 }
613
614 // mark this so that we take ownership of the
615 // cleanup for this transaction
616 tr->total_bytes = 0xfbadc0de;
617
618 //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
619 // tr, tr->journal_start, tr->journal_end, jnl);
620
621 // find this entry in the old_start[] index and mark it completed
622 for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
623
624 if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) {
625 jnl->old_start[i] &= ~(0x8000000000000000ULL);
626 break;
627 }
628 }
629
630 if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
631 panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
632 tr->journal_start, tr, jnl);
633 }
634
635
636 // if we are here then we need to update the journal header
637 // to reflect that this transaction is complete
638 if (tr->journal_start == jnl->active_start) {
639 jnl->active_start = tr->journal_end;
640 tr->journal_start = tr->journal_end = (off_t)0;
641 }
642
643 // go through the completed_trs list and try to coalesce
644 // entries, restarting back at the beginning if we have to.
645 for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) {
646 if (ctr->journal_start == jnl->active_start) {
647 jnl->active_start = ctr->journal_end;
648 if (prev) {
649 prev->next = ctr->next;
650 }
651 if (ctr == jnl->completed_trs) {
652 jnl->completed_trs = ctr->next;
653 }
654
655 next = jnl->completed_trs; // this starts us over again
656 ctr->next = jnl->tr_freeme;
657 jnl->tr_freeme = ctr;
658 ctr = NULL;
659 } else if (tr->journal_end == ctr->journal_start) {
660 ctr->journal_start = tr->journal_start;
661 next = jnl->completed_trs; // this starts us over again
662 ctr = NULL;
663 tr->journal_start = tr->journal_end = (off_t)0;
664 } else if (tr->journal_start == ctr->journal_end) {
665 ctr->journal_end = tr->journal_end;
666 next = ctr->next;
667 tr->journal_start = tr->journal_end = (off_t)0;
668 } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) {
669 // coalesce the next entry with this one and link the next
670 // entry in at the head of the tr_freeme list
671 next = ctr->next; // temporarily use the "next" variable
672 ctr->journal_end = next->journal_end;
673 ctr->next = next->next;
674 next->next = jnl->tr_freeme; // link in the next guy at the head of the tr_freeme list
675 jnl->tr_freeme = next;
676
677 next = jnl->completed_trs; // this starts us over again
678 ctr = NULL;
679 } else {
680 next = ctr->next;
681 }
682 }
683
684 // if this is true then we didn't merge with anyone
685 // so link ourselves into the list of completed
686 // transactions (in sorted order, see below).
687 if (tr->journal_start != 0) {
688 // put this entry into the correct sorted place
689 // in the list instead of just at the head.
690 //
691
692 prev = NULL;
693 for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
694 // just keep looping
695 }
696
697 if (ctr == NULL && prev == NULL) {
698 jnl->completed_trs = tr;
699 tr->next = NULL;
700 } else if (ctr == jnl->completed_trs) {
701 tr->next = jnl->completed_trs;
702 jnl->completed_trs = tr;
703 } else {
704 tr->next = prev->next;
705 prev->next = tr;
706 }
707 } else {
708 // if we're here this tr got merged with someone else so
709 // put it on the list to be free'd
710 tr->next = jnl->tr_freeme;
711 jnl->tr_freeme = tr;
712 }
713 unlock_oldstart(jnl);
714
715 unlock_condition(jnl, &jnl->asyncIO);
716 }
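
//
// Sketch (editor's illustration, not part of the original source) of the
// old_start[] convention used in buffer_flushed_callback(): each slot
// holds a transaction's start offset, with the high bit set while the
// transaction's buffers are still in flight and cleared (as in the loop
// above) once the transaction has fully flushed.
//
#if 0
#define OLD_START_INFLIGHT	0x8000000000000000ULL

static void
old_start_mark_inflight(journal *jnl, int i, off_t start)
{
	jnl->old_start[i] = (off_t)((u_int64_t)start | OLD_START_INFLIGHT);
}

static void
old_start_mark_done(journal *jnl, int i)
{
	jnl->old_start[i] &= ~((off_t)OLD_START_INFLIGHT);
}
#endif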
717
718
719 #include <libkern/OSByteOrder.h>
720
721 #define SWAP16(x) OSSwapInt16(x)
722 #define SWAP32(x) OSSwapInt32(x)
723 #define SWAP64(x) OSSwapInt64(x)
724
725
726 static void
727 swap_journal_header(journal *jnl)
728 {
729 jnl->jhdr->magic = SWAP32(jnl->jhdr->magic);
730 jnl->jhdr->endian = SWAP32(jnl->jhdr->endian);
731 jnl->jhdr->start = SWAP64(jnl->jhdr->start);
732 jnl->jhdr->end = SWAP64(jnl->jhdr->end);
733 jnl->jhdr->size = SWAP64(jnl->jhdr->size);
734 jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
735 jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum);
736 jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size);
737 jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num);
738 }
739
740 static void
741 swap_block_list_header(journal *jnl, block_list_header *blhdr)
742 {
743 int i;
744
745 blhdr->max_blocks = SWAP16(blhdr->max_blocks);
746 blhdr->num_blocks = SWAP16(blhdr->num_blocks);
747 blhdr->bytes_used = SWAP32(blhdr->bytes_used);
748 blhdr->checksum = SWAP32(blhdr->checksum);
749 blhdr->flags = SWAP32(blhdr->flags);
750
751 if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) {
752 printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d). not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size);
753 return;
754 }
755
756 for(i = 0; i < blhdr->num_blocks; i++) {
757 blhdr->binfo[i].bnum = SWAP64(blhdr->binfo[i].bnum);
758 blhdr->binfo[i].u.bi.bsize = SWAP32(blhdr->binfo[i].u.bi.bsize);
759 blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum);
760 }
761 }
762
763
764 static int
765 update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
766 {
767 int ret;
768 struct buf *oblock_bp=NULL;
769
770 // first read the block we want.
771 ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
772 if (ret != 0) {
773 printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret);
774
775 if (oblock_bp) {
776 buf_brelse(oblock_bp);
777 oblock_bp = NULL;
778 }
779
780 // let's try to be aggressive here and just re-write the block
781 oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META);
782 if (oblock_bp == NULL) {
783 printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block);
784 return -1;
785 }
786 }
787
788 // make sure it's the correct size.
789 if (buf_size(oblock_bp) != bsize) {
790 buf_brelse(oblock_bp);
791 return -1;
792 }
793
794 // copy the journal data over top of it
795 memcpy((char *)buf_dataptr(oblock_bp), block_ptr, bsize);
796
797 if ((ret = VNOP_BWRITE(oblock_bp)) != 0) {
798 printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret);
799 return ret;
800 }
801
802 // and now invalidate it so that if someone else wants to read
803 // it in a different size they'll be able to do it.
804 ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
805 if (oblock_bp) {
806 buf_markinvalid(oblock_bp);
807 buf_brelse(oblock_bp);
808 }
809
810 return 0;
811 }
812
813 static int
814 grow_table(struct bucket **buf_ptr, int num_buckets, int new_size)
815 {
816 struct bucket *newBuf;
817 int current_size = num_buckets, i;
818
819 // just return the current size if new_size is smaller
820 if (new_size < num_buckets) {
821 return current_size;
822 }
823
824 if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
825 printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
826 return -1;
827 }
828
829 // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
830
831 // copy existing elements
832 bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));
833
834 // initialize the new ones
835 for(i = num_buckets; i < new_size; i++) {
836 newBuf[i].block_num = (off_t)-1;
837 }
838
839 // free the old container
840 FREE(*buf_ptr, M_TEMP);
841
842 // reset the buf_ptr
843 *buf_ptr = newBuf;
844
845 return new_size;
846 }
847
848 static int
849 lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full)
850 {
851 int lo, hi, index, matches, i;
852
853 if (num_full == 0) {
854 return 0; // table is empty, so insert at index=0
855 }
856
857 lo = 0;
858 hi = num_full - 1;
859 index = -1;
860
861 // perform binary search for block_num
862 do {
863 int mid = (hi - lo)/2 + lo;
864 off_t this_num = (*buf_ptr)[mid].block_num;
865
866 if (block_num == this_num) {
867 index = mid;
868 break;
869 }
870
871 if (block_num < this_num) {
872 hi = mid;
873 continue;
874 }
875
876 if (block_num > this_num) {
877 lo = mid + 1;
878 continue;
879 }
880 } while (lo < hi);
881
882 // check if lo and hi converged on the match
883 if (block_num == (*buf_ptr)[hi].block_num) {
884 index = hi;
885 }
886
887 // if no existing entry found, find index for new one
888 if (index == -1) {
889 index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
890 } else {
891 // make sure that we return the right-most index in the case of multiple matches
892 matches = 0;
893 i = index + 1;
894 while (i < num_full && block_num == (*buf_ptr)[i].block_num) {
895 matches++;
896 i++;
897 }
898
899 index += matches;
900 }
901
902 return index;
903 }
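
//
// Worked example (editor's illustration, not part of the original
// source): for a table whose block_nums are { 4, 8, 8, 15 } with
// num_full == 4, lookup_bucket() returns
//
//     lookup_bucket(&buf, 8, 4)  == 2   (right-most of the two matches)
//     lookup_bucket(&buf, 10, 4) == 3   (insertion point before 15)
//     lookup_bucket(&buf, 99, 4) == 4   (insertion point at the end)
//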
904
905 static int
906 insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting)
907 {
908 if (!overwriting) {
909 // grow the table if we're out of space
910 if (*num_full_ptr >= *num_buckets_ptr) {
911 int new_size = *num_buckets_ptr * 2;
912 int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);
913
914 if (grow_size < new_size) {
915 printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name);
916 return -1;
917 }
918
919 *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
920 }
921
922 // if we're not inserting at the end, we need to bcopy
923 if (blk_index != *num_full_ptr) {
924 bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
925 }
926
927 (*num_full_ptr)++; // increment only if we're not overwriting
928 }
929
930 // sanity check the values we're about to add
931 if ((off_t)offset >= jnl->jhdr->size) {
932 offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
933 }
934 if (size <= 0) {
935 panic("jnl: insert_block: bad size in insert_block (%zd)\n", size);
936 }
937
938 (*buf_ptr)[blk_index].block_num = num;
939 (*buf_ptr)[blk_index].block_size = size;
940 (*buf_ptr)[blk_index].jnl_offset = offset;
941 (*buf_ptr)[blk_index].cksum = cksum;
942
943 return blk_index;
944 }
945
946 static int
947 do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
948 {
949 int num_to_remove, index, i, overwrite, err;
950 size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset;
951 off_t overlap, block_start, block_end;
952
953 block_start = block_num*jhdr_size;
954 block_end = block_start + size;
955 overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);
956
957 // first, eliminate any overlap with the previous entry
958 if (blk_index != 0 && !overwrite) {
959 off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
960 off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
961 overlap = prev_block_end - block_start;
962 if (overlap > 0) {
963 if (overlap % jhdr_size != 0) {
964 panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size);
965 }
966
967 // if the previous entry completely overlaps this one, we need to break it into two pieces.
968 if (prev_block_end > block_end) {
969 off_t new_num = block_end / jhdr_size;
970 size_t new_size = prev_block_end - block_end;
971
972 new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);
973
974 err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0);
975 if (err < 0) {
976 panic("jnl: do_overlap: error inserting during pre-overlap\n");
977 }
978 }
979
980 // Regardless, we need to truncate the previous entry to the beginning of the overlap
981 (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start;
982 (*buf_ptr)[blk_index-1].cksum = 0; // have to blow it away because there's no way to check it
983 }
984 }
985
986 // then, bail out fast if there's no overlap with the entries that follow
987 if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) {
988 return 0; // no overlap, no overwrite
989 } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) {
990
991 (*buf_ptr)[blk_index].cksum = cksum; // update this
992 return 1; // simple overwrite
993 }
994
995 // Otherwise, find all cases of total and partial overlap. We use the special
996 // block_num of -2 to designate entries that are completely overlapped and must
997 // be eliminated. The block_num, size, and jnl_offset of partially overlapped
998 // entries must be adjusted to keep the array consistent.
999 index = blk_index;
1000 num_to_remove = 0;
1001 while (index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) {
1002 if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) {
1003 (*buf_ptr)[index].block_num = -2; // mark this for deletion
1004 num_to_remove++;
1005 } else {
1006 overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
1007 if (overlap > 0) {
1008 if (overlap % jhdr_size != 0) {
1009 panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size);
1010 }
1011
1012 // if we partially overlap this entry, adjust its block number, jnl offset, and size
1013 (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
1014 (*buf_ptr)[index].cksum = 0;
1015
1016 new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
1017 if ((off_t)new_offset >= jnl->jhdr->size) {
1018 new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
1019 }
1020 (*buf_ptr)[index].jnl_offset = new_offset;
1021
1022 (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
1023 if ((*buf_ptr)[index].block_size <= 0) {
1024 panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size);
1025 // return -1; // if above panic is removed, return -1 for error
1026 }
1027 }
1028
1029 }
1030
1031 index++;
1032 }
1033
1034 // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
1035 index--; // start with the last index used within the above loop
1036 while (index >= blk_index) {
1037 if ((*buf_ptr)[index].block_num == -2) {
1038 if (index == *num_full_ptr-1) {
1039 (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
1040 } else {
1041 bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
1042 }
1043 (*num_full_ptr)--;
1044 }
1045 index--;
1046 }
1047
1048 // eliminate any stale entries at the end of the table
1049 for(i = *num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
1050 (*buf_ptr)[i].block_num = -1;
1051 }
1052
1053 return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
1054 }
1055
1056 // PR-3105942: Coalesce writes to the same block in journal replay
1057 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
1058 // to be replayed and the corresponding location in the journal which contains
1059 // the most recent data for those blocks. The array is "played" once all the
1060 // blocks in the journal have been coalesced. The code for the case of conflicting/
1061 // overlapping writes to a single block is the most dense. Because coalescing can
1062 // disrupt the existing time-ordering of blocks in the journal playback, care
1063 // is taken to catch any overlaps and keep the array consistent.
1064 static int
1065 add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
1066 {
1067 int blk_index, overwriting;
1068
1069 // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
1070 // inserted (or the index of the elem to overwrite).
1071 blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);
1072
1073 // check if the index is within bounds (if we're adding this block to the end of
1074 // the table, blk_index will be equal to num_full)
1075 if (blk_index < 0 || blk_index > *num_full_ptr) {
1076 //printf("jnl: add_block: trouble adding block to co_buf\n");
1077 return -1;
1078 } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
1079
1080 // Determine whether we're overwriting an existing entry by checking for overlap
1081 overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr);
1082 if (overwriting < 0) {
1083 return -1; // if we got an error, pass it along
1084 }
1085
1086 // returns the index, or -1 on error
1087 blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting);
1088
1089 return blk_index;
1090 }
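
//
// Worked example (editor's illustration, not part of the original
// source): two journaled writes touch overlapping sector ranges, with
// jhdr_size == 512:
//
//     add_block(..., block_num=100, size=2048, ...)   // sectors 100-103
//     add_block(..., block_num=102, size=1024, ...)   // sectors 102-103
//
// The second call lands on the tail of the first entry: do_overlap()
// truncates the existing entry to 1024 bytes (sectors 100-101, with its
// cksum blown away), and insert_block() then adds the newer data for
// sectors 102-103.  Replay writes each sector once, from its most
// recent copy in the journal.
//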
1091
1092 static int
1093 replay_journal(journal *jnl)
1094 {
1095 int i, bad_blocks=0;
1096 unsigned int orig_checksum, checksum, check_block_checksums = 0;
1097 size_t ret;
1098 size_t max_bsize = 0; /* protected by block_ptr */
1099 block_list_header *blhdr;
1100 off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start;
1101 char *buff, *block_ptr=NULL;
1102 struct bucket *co_buf;
1103 int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
1104 uint32_t last_sequence_num = 0;
1105 int replay_retry_count = 0;
1106
1107 // wrap the start ptr if it points to the very end of the journal
1108 if (jnl->jhdr->start == jnl->jhdr->size) {
1109 jnl->jhdr->start = jnl->jhdr->jhdr_size;
1110 }
1111 if (jnl->jhdr->end == jnl->jhdr->size) {
1112 jnl->jhdr->end = jnl->jhdr->jhdr_size;
1113 }
1114
1115 if (jnl->jhdr->start == jnl->jhdr->end) {
1116 return 0;
1117 }
1118
1119 orig_jnl_start = jnl->jhdr->start;
1120
1121 // allocate memory for the header_block. we'll read each blhdr into this
1122 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) {
1123 printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n",
1124 jnl->jdev_name, jnl->jhdr->blhdr_size);
1125 return -1;
1126 }
1127
1128 // allocate memory for the coalesce buffer
1129 if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
1130 printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name);
1131 return -1;
1132 }
1133
1134 restart_replay:
1135
1136 // initialize entries
1137 for(i = 0; i < num_buckets; i++) {
1138 co_buf[i].block_num = -1;
1139 }
1140 num_full = 0; // empty at first
1141
1142
1143 printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
1144 jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);
1145
1146 while (check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) {
1147 offset = blhdr_offset = jnl->jhdr->start;
1148 ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
1149 if (ret != (size_t)jnl->jhdr->blhdr_size) {
1150 printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset);
1151 bad_blocks = 1;
1152 goto bad_txn_handling;
1153 }
1154
1155 blhdr = (block_list_header *)buff;
1156
1157 orig_checksum = blhdr->checksum;
1158 blhdr->checksum = 0;
1159 if (jnl->flags & JOURNAL_NEED_SWAP) {
1160 // calculate the checksum based on the unswapped data
1161 // because it is done byte-at-a-time.
1162 orig_checksum = (unsigned int)SWAP32(orig_checksum);
1163 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1164 swap_block_list_header(jnl, blhdr);
1165 } else {
1166 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1167 }
1168
1169
1170 //
1171 // XXXdbg - if these checks fail, we should replay as much
1172 // as we can in the hopes that it will still leave the
1173 // drive in a better state than if we didn't replay
1174 // anything
1175 //
1176 if (checksum != orig_checksum) {
1177 if (check_past_jnl_end && in_uncharted_territory) {
1178
1179 if (blhdr_offset != jnl->jhdr->end) {
1180 printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
1181 }
1182
1183 check_past_jnl_end = 0;
1184 jnl->jhdr->end = blhdr_offset;
1185 continue;
1186 }
1187
1188 printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
1189 jnl->jdev_name, blhdr_offset, orig_checksum, checksum);
1190
1191 if (blhdr_offset == orig_jnl_start) {
1192 // if there's nothing in the journal at all, just bail out altogether.
1193 goto bad_replay;
1194 }
1195
1196 bad_blocks = 1;
1197 goto bad_txn_handling;
1198 }
1199
1200 if ( (last_sequence_num != 0)
1201 && (blhdr->binfo[0].u.bi.b.sequence_num != 0)
1202 && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num)
1203 && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) {
1204
1205 txn_start_offset = jnl->jhdr->end = blhdr_offset;
1206
1207 if (check_past_jnl_end) {
1208 check_past_jnl_end = 0;
1209 printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
1210 jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
1211 continue;
1212 }
1213
1214 printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
1215 jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
1216 bad_blocks = 1;
1217 goto bad_txn_handling;
1218 }
1219 last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num;
1220
1221 if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) {
1222 if (last_sequence_num == 0) {
1223 check_past_jnl_end = 0;
1224 printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n",
1225 jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1226 if (jnl->jhdr->start != jnl->jhdr->end) {
1227 jnl->jhdr->start = jnl->jhdr->end;
1228 }
1229 continue;
1230 }
1231 printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
1232 }
1233
1234 if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size)
1235 || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
1236 printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n",
1237 jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks);
1238 bad_blocks = 1;
1239 goto bad_txn_handling;
1240 }
1241
1242 max_bsize = 0;
1243 for (i = 1; i < blhdr->num_blocks; i++) {
1244 if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
1245 printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum);
1246 bad_blocks = 1;
1247 goto bad_txn_handling;
1248 }
1249
1250 if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) {
1251 max_bsize = blhdr->binfo[i].u.bi.bsize;
1252 }
1253 }
1254
1255 if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) {
1256 check_block_checksums = 1;
1257 if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1258 goto bad_replay;
1259 }
1260 } else {
1261 block_ptr = NULL;
1262 }
1263
1264 if (blhdr->flags & BLHDR_FIRST_HEADER) {
1265 txn_start_offset = blhdr_offset;
1266 }
1267
1268 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1269 // blhdr->num_blocks-1, jnl->jhdr->start);
1270 bad_blocks = 0;
1271 for (i = 1; i < blhdr->num_blocks; i++) {
1272 int size, ret_val;
1273 off_t number;
1274
1275 size = blhdr->binfo[i].u.bi.bsize;
1276 number = blhdr->binfo[i].bnum;
1277
1278 // don't add "killed" blocks
1279 if (number == (off_t)-1) {
1280 //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1281 } else {
1282
1283 if (check_block_checksums) {
1284 int32_t disk_cksum;
1285 off_t block_offset;
1286
1287 block_offset = offset;
1288
1289 // read the block so we can check the checksum
1290 ret = read_journal_data(jnl, &block_offset, block_ptr, size);
1291 if (ret != (size_t)size) {
1292 printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
1293 bad_blocks = 1;
1294 goto bad_txn_handling;
1295 }
1296
1297 disk_cksum = calc_checksum(block_ptr, size);
1298
1299 // there is no need to swap the checksum from disk because
1300 // it got swapped when the blhdr was read in.
1301 if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) {
1302 printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
1303 jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum);
1304 printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
1305 *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)],
1306 *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]);
1307
1308 bad_blocks = 1;
1309 goto bad_txn_handling;
1310 }
1311 }
1312
1313
1314 // add this bucket to co_buf, coalescing where possible
1315 // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1316 ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full);
1317
1318 if (ret_val == -1) {
1319 printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name);
1320 goto bad_replay;
1321 } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1322 }
1323
1324 // increment offset
1325 offset += size;
1326
1327 // check if the last block added puts us off the end of the jnl.
1328 // if so, we need to wrap to the beginning and take any remainder
1329 // into account
1330 //
1331 if (offset >= jnl->jhdr->size) {
1332 offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
1333 }
1334 }
1335
1336 if (block_ptr) {
1337 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1338 block_ptr = NULL;
1339 }
1340
1341 bad_txn_handling:
1342 if (bad_blocks) {
1343 /* Journal replay got an error before it found any valid
1344 * transactions; abort replay */
1345 if (txn_start_offset == 0) {
1346 printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
1347 goto bad_replay;
1348 }
1349
1350 /* Repeated error during journal replay, abort replay */
1351 if (replay_retry_count == 3) {
1352 printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name);
1353 goto bad_replay;
1354 }
1355 replay_retry_count++;
1356
1357 /* There was an error replaying the journal (possibly
1358 * EIO/ENXIO from the device). So retry replaying all
1359 * the good transactions that we found before getting
1360 * the error.
1361 */
1362 jnl->jhdr->start = orig_jnl_start;
1363 jnl->jhdr->end = txn_start_offset;
1364 check_past_jnl_end = 0;
1365 last_sequence_num = 0;
1366 printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1367 goto restart_replay;
1368 }
1369
1370 jnl->jhdr->start += blhdr->bytes_used;
1371 if (jnl->jhdr->start >= jnl->jhdr->size) {
1372 // wrap around and skip the journal header block
1373 jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
1374 }
1375
1376 if (jnl->jhdr->start == jnl->jhdr->end) {
1377 in_uncharted_territory = 1;
1378 }
1379 }
1380
1381 if (jnl->jhdr->start != jnl->jhdr->end) {
1382 printf("jnl: %s: start %lld != end %lld. resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
1383 jnl->jhdr->end = jnl->jhdr->start;
1384 }
1385
1386 //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1387
1388 /*
1389 * make sure it's at least one page in size, so
1390 * start max_bsize at PAGE_SIZE
1391 */
1392 for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {
1393
1394 if (co_buf[i].block_num == (off_t)-1)
1395 continue;
1396
1397 if (co_buf[i].block_size > max_bsize)
1398 max_bsize = co_buf[i].block_size;
1399 }
1400 /*
1401 * round max_bsize up to the nearest PAGE_SIZE multiple
1402 */
1403 if (max_bsize & (PAGE_SIZE - 1)) {
1404 max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1405 }
1406
1407 if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
1408 goto bad_replay;
1409 }
1410
1411 // Replay the coalesced entries in the co-buf
1412 for(i = 0; i < num_full; i++) {
1413 size_t size = co_buf[i].block_size;
1414 off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
1415 off_t number = co_buf[i].block_num;
1416
1417
1418 // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1419 // co_buf[i].block_size, co_buf[i].jnl_offset);
1420
1421 if (number == (off_t)-1) {
1422 // printf("jnl: replay_journal: skipping killed fs block\n");
1423 } else {
1424
1425 // do journal read, and set the phys. block
1426 ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
1427 if (ret != size) {
1428 printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
1429 goto bad_replay;
1430 }
1431
1432 if (update_fs_block(jnl, block_ptr, number, size) != 0) {
1433 goto bad_replay;
1434 }
1435 }
1436 }
1437
1438
1439 // done replaying; update jnl header
1440 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
1441 goto bad_replay;
1442 }
1443
1444 printf("jnl: %s: journal replay done.\n", jnl->jdev_name);
1445
1446 // free block_ptr
1447 if (block_ptr) {
1448 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1449 block_ptr = NULL;
1450 }
1451
1452 // free the coalesce buffer
1453 FREE(co_buf, M_TEMP);
1454 co_buf = NULL;
1455
1456 kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1457 return 0;
1458
1459 bad_replay:
1460 if (block_ptr) {
1461 kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
1462 }
1463 if (co_buf) {
1464 FREE(co_buf, M_TEMP);
1465 }
1466 kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
1467
1468 return -1;
1469 }
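
//
// Editor's outline (not part of the original source) of the replay
// pipeline implemented above:
//
//     1. walk transactions from jhdr->start, validating each
//        block_list_header's checksum and sequence number
//     2. add_block() every payload block into co_buf, coalescing
//        overlaps so each disk block appears at most once
//     3. read each surviving co_buf entry's newest copy out of the
//        journal and write it home via update_fs_block()
//     4. mark the journal empty by rewriting the header with
//        start == end
//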
1470
1471
1472 #define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024)
1473 #define MAX_TRANSACTION_BUFFER_SIZE (2048*1024)
1474
1475 // XXXdbg - so I can change it in the debugger
1476 int def_tbuffer_size = 0;
1477
1478
1479 //
1480 // This function sets the size of the tbuffer and the
1481 // size of the blhdr. It assumes that jnl->jhdr->size
1482 // and jnl->jhdr->jhdr_size are already valid.
1483 //
1484 static void
1485 size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
1486 {
1487 //
1488 // one-time initialization based on how much memory
1489 // there is in the machine.
1490 //
1491 if (def_tbuffer_size == 0) {
1492 if (mem_size < (256*1024*1024)) {
1493 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
1494 } else if (mem_size < (512*1024*1024)) {
1495 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
1496 } else if (mem_size < (1024*1024*1024)) {
1497 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
1498 } else {
1499 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (mem_size / (256*1024*1024));
1500 }
1501 }
1502
1503 // size up the transaction buffer... can't be larger than the number
1504 // of blocks that can fit in a block_list_header block.
1505 if (tbuffer_size == 0) {
1506 jnl->tbuffer_size = def_tbuffer_size;
1507 } else {
1508 // make sure that the specified tbuffer_size isn't too small
1509 if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
1510 tbuffer_size = jnl->jhdr->blhdr_size * 2;
1511 }
1512 // and make sure it's an even multiple of the block size
1513 if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
1514 tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
1515 }
1516
1517 jnl->tbuffer_size = tbuffer_size;
1518 }
1519
1520 if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
1521 jnl->tbuffer_size = (jnl->jhdr->size / 2);
1522 }
1523
1524 if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
1525 jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
1526 }
1527
1528 jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
1529 if (jnl->jhdr->blhdr_size < phys_blksz) {
1530 jnl->jhdr->blhdr_size = phys_blksz;
1531 } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
1532 // have to round up so we're an even multiple of the physical block size
1533 jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
1534 }
1535 }
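
//
// Illustrative arithmetic (editor's sketch, not part of the original
// source), assuming a 512-byte journal block size, the default 128K
// tbuffer, and sizeof(block_info) == 16:
//
//     blhdr_size = (128K / 512) * 16 = 4096 bytes
//
// i.e. one 4K block list header provides 256 block_info slots (the
// first is reserved for bookkeeping) -- more than enough to describe
// the 512-byte blocks a full tbuffer can carry, since the tbuffer also
// holds the header itself.  A physical block size larger than that
// would bump blhdr_size up to one block, per the code above.
//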
1536
1537
1538
1539 static void
1540 get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context)
1541 {
1542 off_t readblockcnt;
1543 off_t writeblockcnt;
1544 off_t readmaxcnt=0, tmp_readmaxcnt;
1545 off_t writemaxcnt=0, tmp_writemaxcnt;
1546 off_t readsegcnt, writesegcnt;
1547 int32_t features;
1548
1549 if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
1550 if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
1551 const char *name = vnode_getname_printable(devvp);
1552 jnl->flags |= JOURNAL_DO_FUA_WRITES;
1553 printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features);
1554 vnode_putname_printable(name);
1555 }
1556 if (features & DK_FEATURE_UNMAP) {
1557 jnl->flags |= JOURNAL_USE_UNMAP;
1558 }
1559 }
1560
1561 //
1562 // First check the max read size via several different mechanisms...
1563 //
1564 VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context);
1565
1566 if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context) == 0) {
1567 tmp_readmaxcnt = readblockcnt * phys_blksz;
1568 if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) {
1569 readmaxcnt = tmp_readmaxcnt;
1570 }
1571 }
1572
1573 if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt, 0, context)) {
1574 readsegcnt = 0;
1575 }
1576
1577 if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) {
1578 readmaxcnt = readsegcnt * PAGE_SIZE;
1579 }
1580
1581 if (readmaxcnt == 0) {
1582 readmaxcnt = 128 * 1024;
1583 } else if (readmaxcnt > UINT32_MAX) {
1584 readmaxcnt = UINT32_MAX;
1585 }
1586
1587
1588 //
1589 // Now check the max writes size via several different mechanisms...
1590 //
1591 VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context);
1592
1593 if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context) == 0) {
1594 tmp_writemaxcnt = writeblockcnt * phys_blksz;
1595 if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) {
1596 writemaxcnt = tmp_writemaxcnt;
1597 }
1598 }
1599
1600 if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt, 0, context)) {
1601 writesegcnt = 0;
1602 }
1603
1604 if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) {
1605 writemaxcnt = writesegcnt * PAGE_SIZE;
1606 }
1607
1608 if (writemaxcnt == 0) {
1609 writemaxcnt = 128 * 1024;
1610 } else if (writemaxcnt > UINT32_MAX) {
1611 writemaxcnt = UINT32_MAX;
1612 }
1613
1614 jnl->max_read_size = readmaxcnt;
1615 jnl->max_write_size = writemaxcnt;
1616 // printf("jnl: %s: max read/write: %lld k / %lld k\n",
1617 // jnl->jdev_name ? jnl->jdev_name : "unknown",
1618 // jnl->max_read_size/1024, jnl->max_write_size/1024);
1619 }
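
//
// Worked example (editor's illustration, not part of the original
// source), assuming 512-byte physical blocks and a 4K PAGE_SIZE: if a
// device reports DKIOCGETMAXBYTECOUNTREAD = 1MB,
// DKIOCGETMAXBLOCKCOUNTREAD = 512 (=> 256K), and
// DKIOCGETMAXSEGMENTCOUNTREAD = 32 (=> 128K), the code above settles
// on the most restrictive limit: max_read_size = 128K.  A device that
// reports nothing at all falls back to 128K as well.
//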
1620
1621
1622 journal *
1623 journal_create(struct vnode *jvp,
1624 off_t offset,
1625 off_t journal_size,
1626 struct vnode *fsvp,
1627 size_t min_fs_blksz,
1628 int32_t flags,
1629 int32_t tbuffer_size,
1630 void (*flush)(void *arg),
1631 void *arg,
1632 struct mount *fsmount)
1633 {
1634 journal *jnl;
1635 uint32_t phys_blksz, new_txn_base;
1636 u_int32_t min_size;
1637 struct vfs_context context;
1638 const char *jdev_name;
1639 /*
1640 * Cap the journal max size to 2GB. On HFS, it will attempt to occupy
1641 * a full allocation block if the current size is smaller than the allocation
1642 * block on which it resides. Once we hit the exabyte filesystem range, then
1643 * it will use 2GB allocation blocks. As a result, make the cap 2GB.
1644 */
1645 context.vc_thread = current_thread();
1646 context.vc_ucred = FSCRED;
1647
1648 jdev_name = vnode_getname_printable(jvp);
1649
1650 /* Get the real physical block size. */
1651 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1652 goto cleanup_jdev_name;
1653 }
1654
1655 if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
1656 printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size);
1657 goto cleanup_jdev_name;
1658 }
1659
1660 min_size = phys_blksz * (phys_blksz / sizeof(block_info));
1661 /* Reject journals that are too small given the sector size of the device */
1662 if (journal_size < min_size) {
1663 printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n",
1664 jdev_name, journal_size, phys_blksz);
1665 goto cleanup_jdev_name;
1666 }
1667
1668 if (phys_blksz > min_fs_blksz) {
1669 printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n",
1670 jdev_name, phys_blksz, min_fs_blksz);
1671 goto cleanup_jdev_name;
1672 }
1673
1674 if ((journal_size % phys_blksz) != 0) {
1675 printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
1676 jdev_name, journal_size, phys_blksz);
1677 goto cleanup_jdev_name;
1678 }
1679
1680
1681 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1682 memset(jnl, 0, sizeof(*jnl));
1683
1684 jnl->jdev = jvp;
1685 jnl->jdev_offset = offset;
1686 jnl->fsdev = fsvp;
1687 jnl->flush = flush;
1688 jnl->flush_arg = arg;
1689 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
1690 jnl->jdev_name = jdev_name;
1691 lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1692
1693 // Keep a pointer to the mount around for use in IO throttling.
1694 jnl->fsmount = fsmount;
1695 // XXX: This lock discipline looks correct based on dounmount(), but it
1696 // doesn't seem to be documented anywhere.
1697 mount_ref(fsmount, 0);
1698
1699 get_io_info(jvp, phys_blksz, jnl, &context);
1700
1701 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1702 printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz);
1703 goto bad_kmem_alloc;
1704 }
1705 jnl->header_buf_size = phys_blksz;
1706
1707 jnl->jhdr = (journal_header *)jnl->header_buf;
1708 memset(jnl->jhdr, 0, sizeof(journal_header));
1709
1710 // we have to set this up here so that do_journal_io() will work
1711 jnl->jhdr->jhdr_size = phys_blksz;
1712
1713 //
1714 // We try to read the journal header to see if there is already one
1715 // out there. If there is, it's possible that it has transactions
1716 // we might mistakenly replay later, if three things line up: we pick
1717 // a sequence number a little less than the old one, there is a crash,
1718 // and the last txn we wrote ends right at the start of a txn from the
1719 // previous incarnation of this file system. If all that happens we
1720 // would replay the transactions from the old file system and that
1721 // would destroy your disk. Although it is extremely unlikely for all
1722 // those conditions to happen, the probability is non-zero and the
1723 // result is severe - you lose your file system. Therefore if we find
1724 // a valid journal header and the sequence number is non-zero, we pick
1725 // a new sequence base well past anything the old journal could hold.
1726 // (The alternative - writing junk over the entire journal - is slow,
1727 // and is disabled below.)
1728 //
1729 if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz
1730 && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC
1731 && jnl->jhdr->sequence_num != 0) {
1732
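// The arithmetic here: new base = old sequence number, plus one full
// journal's worth of transaction headers (journal_size / phys_blksz),
// plus a random skew of up to 16383, all folded into the 24-bit
// sequence space by the 0x00ffffff mask.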
1733 new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff;
1734 printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base);
1735
1736 #if 0
1737 int i;
1738 off_t pos=0;
1739
1740 for(i = 1; i < journal_size / phys_blksz; i++) {
1741 pos = i*phys_blksz;
1742
1743 // we don't really care what data we write just so long
1744 // as it's not a valid transaction header. since we have
1745 // the header_buf sitting around we'll use that.
1746 write_journal_data(jnl, &pos, jnl->header_buf, phys_blksz);
1747 }
1748 printf("jnl: create: done clearing journal (i=%d)\n", i);
1749 #endif
1750 } else {
1751 new_txn_base = random() & 0x00ffffff;
1752 }
1753
1754 memset(jnl->header_buf, 0, phys_blksz);
1755
1756 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
1757 jnl->jhdr->endian = ENDIAN_MAGIC;
1758 jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself
1759 jnl->jhdr->end = phys_blksz;
1760 jnl->jhdr->size = journal_size;
1761 jnl->jhdr->jhdr_size = phys_blksz;
1762 size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1763
1764 jnl->active_start = jnl->jhdr->start;
1765
1766 // XXXdbg - for testing you can force the journal to wrap around
1767 // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
1768 // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);
1769
1770 jnl->jhdr->sequence_num = new_txn_base;
1771
1772 lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
1773 lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
1774 lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
1775
1776
1777 jnl->flushing = FALSE;
1778 jnl->asyncIO = FALSE;
1779 jnl->flush_aborted = FALSE;
1780 jnl->writing_header = FALSE;
1781 jnl->async_trim = NULL;
1782 jnl->sequence_num = jnl->jhdr->sequence_num;
1783
1784 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
1785 printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name);
1786 goto bad_write;
1787 }
1788
1789 goto journal_create_complete;
1790
1791
1792 bad_write:
1793 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1794 bad_kmem_alloc:
1795 jnl->jhdr = NULL;
1796 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1797 mount_drop(fsmount, 0);
1798 cleanup_jdev_name:
1799 vnode_putname_printable(jdev_name);
1800 jnl = NULL;
1801 journal_create_complete:
1802 return jnl;
1803 }
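/*
 * Example use of journal_create() from a file system's mount path.
 * This is a minimal sketch, not taken from any particular client;
 * "jvp", "fsvp", "my_flush", "my_flush_arg" and the sizes below are
 * hypothetical.
 *
 *	journal *jnl = journal_create(jvp,	// journal device vnode
 *			0,			// journal begins at offset 0
 *			8 * 1024 * 1024,	// 8 MB journal
 *			fsvp,			// file system device vnode
 *			4096,			// min fs block size
 *			0,			// no option flags
 *			0,			// 0 = default tbuffer size
 *			my_flush, my_flush_arg,	// fs buffer-flush callback
 *			mp);			// struct mount, for IO throttling
 *	if (jnl == NULL)
 *		return EIO;	// failed; the cause was printf'd above
 */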
1804
1805
1806 journal *
1807 journal_open(struct vnode *jvp,
1808 off_t offset,
1809 off_t journal_size,
1810 struct vnode *fsvp,
1811 size_t min_fs_blksz,
1812 int32_t flags,
1813 int32_t tbuffer_size,
1814 void (*flush)(void *arg),
1815 void *arg,
1816 struct mount *fsmount)
1817 {
1818 journal *jnl;
1819 uint32_t orig_blksz=0;
1820 uint32_t phys_blksz;
1821 u_int32_t min_size = 0;
1822 int orig_checksum, checksum;
1823 struct vfs_context context;
1824 const char *jdev_name = vnode_getname_printable(jvp);
1825
1826 context.vc_thread = current_thread();
1827 context.vc_ucred = FSCRED;
1828
1829 /* Get the real physical block size. */
1830 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1831 goto cleanup_jdev_name;
1832 }
1833
1834 if (phys_blksz > min_fs_blksz) {
1835 printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n",
1836 jdev_name, phys_blksz, min_fs_blksz);
1837 goto cleanup_jdev_name;
1838 }
1839
1840 if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) {
1841 printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size);
1842 goto cleanup_jdev_name;
1843 }
1844
1845 min_size = phys_blksz * (phys_blksz / sizeof(block_info));
1846 /* Reject journals that are too small given the sector size of the device */
1847 if (journal_size < min_size) {
1848 printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n",
1849 jdev_name, journal_size, phys_blksz);
1850 goto cleanup_jdev_name;
1851 }
1852
1853 if ((journal_size % phys_blksz) != 0) {
1854 printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1855 jdev_name, journal_size, phys_blksz);
1856 goto cleanup_jdev_name;
1857 }
1858
1859 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1860 memset(jnl, 0, sizeof(*jnl));
1861
1862 jnl->jdev = jvp;
1863 jnl->jdev_offset = offset;
1864 jnl->fsdev = fsvp;
1865 jnl->flush = flush;
1866 jnl->flush_arg = arg;
1867 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
1868 jnl->jdev_name = jdev_name;
1869 lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1870
1871 /* We need a reference to the mount to later pass to the throttling code for
1872 * IO accounting.
1873 */
1874 jnl->fsmount = fsmount;
1875 mount_ref(fsmount, 0);
1876
1877 get_io_info(jvp, phys_blksz, jnl, &context);
1878
1879 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
1880 printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz);
1881 goto bad_kmem_alloc;
1882 }
1883 jnl->header_buf_size = phys_blksz;
1884
1885 jnl->jhdr = (journal_header *)jnl->header_buf;
1886 memset(jnl->jhdr, 0, sizeof(journal_header));
1887
1888 // we have to set this up here so that do_journal_io() will work
1889 jnl->jhdr->jhdr_size = phys_blksz;
1890
1891 if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
1892 printf("jnl: %s: open: could not read %u bytes for the journal header.\n",
1893 jdev_name, phys_blksz);
1894 goto bad_journal;
1895 }
1896
1897 orig_checksum = jnl->jhdr->checksum;
1898 jnl->jhdr->checksum = 0;
1899
1900 if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1901 // do this before the swap since it's done byte-at-a-time
1902 orig_checksum = SWAP32(orig_checksum);
1903 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1904 swap_journal_header(jnl);
1905 jnl->flags |= JOURNAL_NEED_SWAP;
1906 } else {
1907 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1908 }
1909
1910 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1911 printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n",
1912 jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
1913 goto bad_journal;
1914 }
1915
1916 // only check the checksum if the magic matches the current journal header magic value
1917 if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
1918
1919 if (orig_checksum != checksum) {
1920 printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n",
1921 jdev_name, orig_checksum, checksum);
1922
1923 //goto bad_journal;
1924 }
1925 }
1926
1927 // XXXdbg - convert old style magic numbers to the new one
1928 if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
1929 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
1930 }
1931
1932 if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
1933 /*
1934 * The volume has probably been resized (such that we had to adjust the
1935 * logical sector size), or copied to media with a different logical
1936 * sector size.
1937 *
1938 * Temporarily change the device's logical block size to match the
1939 * journal's header size. This will allow us to replay the journal
1940 * safely. If the replay succeeds, we will update the journal's header
1941 * size (later in this function).
1942 */
1943 orig_blksz = phys_blksz;
1944 phys_blksz = jnl->jhdr->jhdr_size;
1945 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
1946 printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
1947 jdev_name, orig_blksz, phys_blksz);
1948 }
1949
1950 if ( jnl->jhdr->start <= 0
1951 || jnl->jhdr->start > jnl->jhdr->size
1952 || jnl->jhdr->start > 1024*1024*1024) {
1953 printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
1954 jdev_name, jnl->jhdr->start, jnl->jhdr->size);
1955 goto bad_journal;
1956 }
1957
1958 if ( jnl->jhdr->end <= 0
1959 || jnl->jhdr->end > jnl->jhdr->size
1960 || jnl->jhdr->end > 1024*1024*1024) {
1961 printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
1962 jdev_name, jnl->jhdr->end, jnl->jhdr->size);
1963 goto bad_journal;
1964 }
1965
1966 if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) {
1967 printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size);
1968 goto bad_journal;
1969 }
1970
1971 // XXXdbg - can't do these checks because hfs writes all kinds of
1972 // non-uniform sized blocks even on devices that have a block size
1973 // that is larger than 512 bytes (i.e. optical media w/2k blocks).
1974 // therefore these checks will fail and so we just have to punt and
1975 // do more relaxed checking...
1976 // XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
1977 if ((jnl->jhdr->start % 512) != 0) {
1978 printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n",
1979 jdev_name, jnl->jhdr->start);
1980 goto bad_journal;
1981 }
1982
1983 //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
1984 if ((jnl->jhdr->end % 512) != 0) {
1985 printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
1986 jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size);
1987 goto bad_journal;
1988 }
1989
1990 // take care of replaying the journal if necessary
1991 if (flags & JOURNAL_RESET) {
1992 printf("jnl: %s: journal start/end pointers reset! (jnl %p; s 0x%llx e 0x%llx)\n",
1993 jdev_name, jnl, jnl->jhdr->start, jnl->jhdr->end);
1994 jnl->jhdr->start = jnl->jhdr->end;
1995 } else if (replay_journal(jnl) != 0) {
1996 printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
1997 goto bad_journal;
1998 }
1999
2000 /*
2001 * When we get here, we know that the journal is empty (jnl->jhdr->start ==
2002 * jnl->jhdr->end). If the device's logical block size was different from
2003 * the journal's header size, then we can now restore the device's logical
2004 * block size and update the journal's header size to match.
2005 *
2006 * Note that we also adjust the journal's start and end so that they will
2007 * be aligned on the new block size. We pick a new sequence number to
2008 * avoid any problems if a replay found previous transactions using the old
2009 * journal header size. (See the comments in journal_create(), above.)
2010 */
2011
2012 if (orig_blksz != 0) {
2013 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
2014 phys_blksz = orig_blksz;
2015
2016 orig_blksz = 0;
2017
2018 jnl->jhdr->jhdr_size = phys_blksz;
2019 jnl->jhdr->start = phys_blksz;
2020 jnl->jhdr->end = phys_blksz;
2021 jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
2022 (journal_size / phys_blksz) +
2023 (random() % 16384)) & 0x00ffffff;
2024
2025 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
2026 printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
2027 goto bad_journal;
2028 }
2029 }
2030
2031 // make sure this is in sync!
2032 jnl->active_start = jnl->jhdr->start;
2033 jnl->sequence_num = jnl->jhdr->sequence_num;
2034
2035 // set this now, after we've replayed the journal
2036 size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
2037
2038 // TODO: Does this need to change if the device's logical block size changed?
2039 if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
2040 printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
2041 jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
2042 goto bad_journal;
2043 }
2044
2045 lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
2046 lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
2047 lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
2048
2049 goto journal_open_complete;
2050
2051 bad_journal:
2052 if (orig_blksz != 0) {
2053 phys_blksz = orig_blksz;
2054 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
2055 printf("jnl: %s: open: restored block size after error\n", jdev_name);
2056 }
2057 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
2058 bad_kmem_alloc:
2059 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
2060 mount_drop(fsmount, 0);
2061 cleanup_jdev_name:
2062 vnode_putname_printable(jdev_name);
2063 jnl = NULL;
2064 journal_open_complete:
2065 return jnl;
2066 }
2067
2068
2069 int
2070 journal_is_clean(struct vnode *jvp,
2071 off_t offset,
2072 off_t journal_size,
2073 struct vnode *fsvp,
2074 size_t min_fs_block_size)
2075 {
2076 journal jnl;
2077 uint32_t phys_blksz;
2078 int ret;
2079 int orig_checksum, checksum;
2080 struct vfs_context context;
2081 const char *jdev_name = vnode_getname_printable(jvp);
2082
2083 context.vc_thread = current_thread();
2084 context.vc_ucred = FSCRED;
2085
2086 /* Get the real physical block size. */
2087 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
2088 printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
2089 ret = EINVAL;
2090 goto cleanup_jdev_name;
2091 }
2092
2093 if (phys_blksz > (uint32_t)min_fs_block_size) {
2094 printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
2095 jdev_name, phys_blksz, min_fs_block_size);
2096 ret = EINVAL;
2097 goto cleanup_jdev_name;
2098 }
2099
2100 if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
2101 printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size);
2102 ret = EINVAL;
2103 goto cleanup_jdev_name;
2104 }
2105
2106 if ((journal_size % phys_blksz) != 0) {
2107 printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
2108 jdev_name, journal_size, phys_blksz);
2109 ret = EINVAL;
2110 goto cleanup_jdev_name;
2111 }
2112
2113 memset(&jnl, 0, sizeof(jnl));
2114
2115 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) {
2116 printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
2117 ret = ENOMEM;
2118 goto cleanup_jdev_name;
2119 }
2120 jnl.header_buf_size = phys_blksz;
2121
2122 get_io_info(jvp, phys_blksz, &jnl, &context);
2123
2124 jnl.jhdr = (journal_header *)jnl.header_buf;
2125 memset(jnl.jhdr, 0, sizeof(journal_header));
2126
2127 jnl.jdev = jvp;
2128 jnl.jdev_offset = offset;
2129 jnl.fsdev = fsvp;
2130
2131 // we have to set this up here so that do_journal_io() will work
2132 jnl.jhdr->jhdr_size = phys_blksz;
2133
2134 if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
2135 printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n",
2136 jdev_name, phys_blksz);
2137 ret = EINVAL;
2138 goto get_out;
2139 }
2140
2141 orig_checksum = jnl.jhdr->checksum;
2142 jnl.jhdr->checksum = 0;
2143
2144 if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
2145 // do this before the swap since it's done byte-at-a-time
2146 orig_checksum = SWAP32(orig_checksum);
2147 checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
2148 swap_journal_header(&jnl);
2149 jnl.flags |= JOURNAL_NEED_SWAP;
2150 } else {
2151 checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
2152 }
2153
2154 if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
2155 printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n",
2156 jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
2157 ret = EINVAL;
2158 goto get_out;
2159 }
2160
2161 if (orig_checksum != checksum) {
2162 printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum);
2163 ret = EINVAL;
2164 goto get_out;
2165 }
2166
2167 //
2168 // if the start and end are equal then the journal is clean.
2169 // otherwise it's not clean and therefore an error.
2170 //
2171 if (jnl.jhdr->start == jnl.jhdr->end) {
2172 ret = 0;
2173 } else {
2174 ret = EBUSY; // so the caller can differentiate an invalid journal from a "busy" one
2175 }
2176
2177 get_out:
2178 kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
2179 cleanup_jdev_name:
2180 vnode_putname_printable(jdev_name);
2181 return ret;
2182 }
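/*
 * Typical mount-time use of journal_is_clean(), as a sketch (the exact
 * policy is the file system's call):
 *
 *	switch (journal_is_clean(jvp, 0, jsize, fsvp, blksz)) {
 *	case 0:		// start == end: nothing to replay
 *		break;
 *	case EBUSY:	// transactions present; journal_open() will replay
 *		break;
 *	default:	// EINVAL/ENOMEM: bad journal; mount read-only or fail
 *		break;
 *	}
 */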
2183
2184
2185 void
2186 journal_close(journal *jnl)
2187 {
2188 volatile off_t *start, *end;
2189 int counter=0;
2190
2191 CHECK_JOURNAL(jnl);
2192
2193 // set this before doing anything that would block so that
2194 // we start tearing things down properly.
2195 //
2196 jnl->flags |= JOURNAL_CLOSE_PENDING;
2197
2198 if (jnl->owner != current_thread()) {
2199 lock_journal(jnl);
2200 }
2201
2202 wait_condition(jnl, &jnl->flushing, "journal_close");
2203
2204 //
2205 // only write stuff to disk if the journal is still valid
2206 //
2207 if ((jnl->flags & JOURNAL_INVALID) == 0) {
2208
2209 if (jnl->active_tr) {
2210 /*
2211 * "journal_end_transaction" will fire the flush asynchronously
2212 */
2213 journal_end_transaction(jnl);
2214 }
2215
2216 // flush any buffered transactions
2217 if (jnl->cur_tr) {
2218 transaction *tr = jnl->cur_tr;
2219
2220 jnl->cur_tr = NULL;
2221 /*
2222 * "end_transaction" will wait for any in-progress flush to complete
2223 * before flushing "cur_tr" synchronously ("must_wait" == TRUE)
2224 */
2225 end_transaction(tr, 1, NULL, NULL, FALSE, TRUE);
2226 }
2227 /*
2228 * if there was an "active_tr", make sure we wait for
2229 * it to flush if there was no "cur_tr" to process
2230 */
2231 wait_condition(jnl, &jnl->flushing, "journal_close");
2232
2233 //start = &jnl->jhdr->start;
2234 start = &jnl->active_start;
2235 end = &jnl->jhdr->end;
2236
2237 while (*start != *end && counter++ < 5000) {
2238 //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
2239 if (jnl->flush) {
2240 jnl->flush(jnl->flush_arg);
2241 }
2242 tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 2);
2243 }
2244
2245 if (*start != *end) {
2246 printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
2247 jnl->jdev_name, *start, *end);
2248 }
2249
2250 // make sure this is in sync when we close the journal
2251 jnl->jhdr->start = jnl->active_start;
2252
2253 // if this fails there's not much we can do at this point...
2254 write_journal_header(jnl, 1, jnl->sequence_num);
2255 } else {
2256 // if we're here the journal isn't valid any more.
2257 // so make sure we don't leave any locked blocks lying around
2258 printf("jnl: %s: close: journal %p, is invalid. aborting outstanding transactions\n", jnl->jdev_name, jnl);
2259
2260 if (jnl->active_tr || jnl->cur_tr) {
2261 transaction *tr;
2262
2263 if (jnl->active_tr) {
2264 tr = jnl->active_tr;
2265 jnl->active_tr = NULL;
2266 } else {
2267 tr = jnl->cur_tr;
2268 jnl->cur_tr = NULL;
2269 }
2270 abort_transaction(jnl, tr);
2271
2272 if (jnl->active_tr || jnl->cur_tr) {
2273 panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl);
2274 }
2275 }
2276 }
2277
2278 free_old_stuff(jnl);
2279
2280 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
2281 jnl->jhdr = (void *)0xbeefbabe;
2282
2283 // Release reference on the mount
2284 if (jnl->fsmount)
2285 mount_drop(jnl->fsmount, 0);
2286
2287 vnode_putname_printable(jnl->jdev_name);
2288
2289 unlock_journal(jnl);
2290 lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group);
2291 lck_mtx_destroy(&jnl->jlock, jnl_mutex_group);
2292 lck_mtx_destroy(&jnl->flock, jnl_mutex_group);
2293 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
2294 }
2295
2296 static void
2297 dump_journal(journal *jnl)
2298 {
2299 transaction *ctr;
2300
2301 printf("journal for dev %s:", jnl->jdev_name);
2302 printf(" jdev_offset %.8llx\n", jnl->jdev_offset);
2303 printf(" magic: 0x%.8x\n", jnl->jhdr->magic);
2304 printf(" start: 0x%.8llx\n", jnl->jhdr->start);
2305 printf(" end: 0x%.8llx\n", jnl->jhdr->end);
2306 printf(" size: 0x%.8llx\n", jnl->jhdr->size);
2307 printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size);
2308 printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size);
2309 printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum);
2310
2311 printf(" completed transactions:\n");
2312 for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) {
2313 printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
2314 }
2315 }
2316
2317
2318
2319 static off_t
2320 free_space(journal *jnl)
2321 {
2322 off_t free_space_offset;
2323
2324 if (jnl->jhdr->start < jnl->jhdr->end) {
2325 free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
2326 } else if (jnl->jhdr->start > jnl->jhdr->end) {
2327 free_space_offset = jnl->jhdr->start - jnl->jhdr->end;
2328 } else {
2329 // journal is completely empty
2330 free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size;
2331 }
2332
2333 return free_space_offset;
2334 }
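//
// Example (illustrative numbers): with size = 0x100000, jhdr_size =
// 0x1000, start = 0x4000 and end = 0x9000, start < end so
// free = 0x100000 - (0x9000 - 0x4000) - 0x1000 = 0xfa000 bytes. Once
// the journal wraps (start > end), the free region is just the gap
// between end and start. The jhdr_size of slack in the first case
// keeps a completely full journal from becoming indistinguishable from
// a completely empty one (start == end always means empty).
//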
2335
2336
2337 //
2338 // The journal must be locked on entry to this function.
2339 // The "desired_size" is in bytes.
2340 //
2341 static int
2342 check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write, uint32_t sequence_num)
2343 {
2344 size_t i;
2345 int counter=0;
2346
2347 //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
2348 // desired_size, free_space(jnl));
2349
2350 if (delayed_header_write)
2351 *delayed_header_write = FALSE;
2352
2353 while (1) {
2354 int old_start_empty;
2355
2356 // make sure there's space in the journal to hold this transaction
2357 if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) {
2358 break;
2359 }
2360 if (counter++ == 5000) {
2361 dump_journal(jnl);
2362 panic("jnl: check_free_space: buffer flushing isn't working "
2363 "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl,
2364 jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
2365 }
2366 if (counter > 7500) {
2367 printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name);
2368 return ENOSPC;
2369 }
2370
2371 //
2372 // here's where we lazily bump up jnl->jhdr->start. we'll consume
2373 // entries until there is enough space for the next transaction.
2374 //
2375 old_start_empty = 1;
2376 lock_oldstart(jnl);
2377
2378 for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
2379 int lcl_counter;
2380
2381 lcl_counter = 0;
2382 while (jnl->old_start[i] & 0x8000000000000000LL) {
2383 if (lcl_counter++ > 10000) {
2384 panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
2385 jnl->old_start[i], jnl);
2386 }
2387
2388 unlock_oldstart(jnl);
2389 if (jnl->flush) {
2390 jnl->flush(jnl->flush_arg);
2391 }
2392 tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
2393 lock_oldstart(jnl);
2394 }
2395
2396 if (jnl->old_start[i] == 0) {
2397 continue;
2398 }
2399
2400 old_start_empty = 0;
2401 jnl->jhdr->start = jnl->old_start[i];
2402 jnl->old_start[i] = 0;
2403
2404 if (free_space(jnl) > desired_size) {
2405
2406 if (delayed_header_write)
2407 *delayed_header_write = TRUE;
2408 else {
2409 unlock_oldstart(jnl);
2410 write_journal_header(jnl, 1, sequence_num);
2411 lock_oldstart(jnl);
2412 }
2413 break;
2414 }
2415 }
2416 unlock_oldstart(jnl);
2417
2418 // if we bumped the start, loop and try again
2419 if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
2420 continue;
2421 } else if (old_start_empty) {
2422 //
2423 // if there is nothing in old_start anymore then we can
2424 // bump the jhdr->start to be the same as active_start
2425 // since it is possible there was only one very large
2426 // transaction in the old_start array. if we didn't do
2427 // this then jhdr->start would never get updated and we
2428 // would wind up looping until we hit the panic at the
2429 // start of the loop.
2430 //
2431 jnl->jhdr->start = jnl->active_start;
2432
2433 if (delayed_header_write)
2434 *delayed_header_write = TRUE;
2435 else
2436 write_journal_header(jnl, 1, sequence_num);
2437 continue;
2438 }
2439
2440
2441 // if the file system gave us a flush function, call it so that
2442 // it can flush some blocks which hopefully will cause some transactions
2443 // to complete and thus free up space in the journal.
2444 if (jnl->flush) {
2445 jnl->flush(jnl->flush_arg);
2446 }
2447
2448 // wait for a while to avoid being cpu-bound (this will
2449 // put us to sleep for 10 milliseconds)
2450 tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
2451 }
2452
2453 return 0;
2454 }
2455
2456 /*
2457 * Allocate a new active transaction.
2458 */
2459 static errno_t
2460 journal_allocate_transaction(journal *jnl)
2461 {
2462 transaction *tr;
2463
2464 MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
2465 memset(tr, 0, sizeof(transaction));
2466
2467 tr->tbuffer_size = jnl->tbuffer_size;
2468
2469 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
2470 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
2471 jnl->active_tr = NULL;
2472 return ENOMEM;
2473 }
2474
2475 // journal replay code checksum check depends on this.
2476 memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
2477 // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
2478 memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2479
2480 tr->blhdr = (block_list_header *)tr->tbuffer;
2481 tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2482 tr->blhdr->num_blocks = 1; // accounts for this header block
2483 tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
2484 tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER;
2485
2486 tr->sequence_num = ++jnl->sequence_num;
2487 tr->num_blhdrs = 1;
2488 tr->total_bytes = jnl->jhdr->blhdr_size;
2489 tr->jnl = jnl;
2490
2491 jnl->active_tr = tr;
2492
2493 return 0;
2494 }
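//
// Rough layout of the tbuffer set up above (a sketch, not to scale;
// the actual size comes from size_up_tbuffer()):
//
//	+---------------------+------------------------------------------+
//	| block_list_header   | data of the journaled blocks lands here  |
//	| (blhdr_size bytes,  | when the transaction is written out;     |
//	|  binfo[0..max])     | binfo[1..n] track the buf_t's until then |
//	+---------------------+------------------------------------------+
//
// binfo[0] is reserved: its bnum field is used to chain additional
// block_list_headers onto a transaction (see journal_modify_block_end).
//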
2495
2496 int
2497 journal_start_transaction(journal *jnl)
2498 {
2499 int ret;
2500
2501 CHECK_JOURNAL(jnl);
2502
2503 free_old_stuff(jnl);
2504
2505 if (jnl->flags & JOURNAL_INVALID) {
2506 return EINVAL;
2507 }
2508 if (jnl->owner == current_thread()) {
2509 if (jnl->active_tr == NULL) {
2510 panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
2511 jnl, jnl->owner, current_thread());
2512 }
2513 jnl->nested_count++;
2514 return 0;
2515 }
2516 lock_journal(jnl);
2517
2518 if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
2519 panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
2520 jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
2521 }
2522
2523 jnl->owner = current_thread();
2524 jnl->nested_count = 1;
2525
2526 #if JOE
2527 // make sure there's room in the journal
2528 if (free_space(jnl) < jnl->tbuffer_size) {
2529
2530 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0);
2531
2532 // this is the call that really waits for space to free up
2533 // as well as updating jnl->jhdr->start
2534 if (check_free_space(jnl, jnl->tbuffer_size, NULL, jnl->sequence_num) != 0) {
2535 printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name);
2536 ret = ENOSPC;
2537 goto bad_start;
2538 }
2539 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, 0, 0, 0, 0);
2540 }
2541 #endif
2542
2543 // if there's a buffered transaction, use it.
2544 if (jnl->cur_tr) {
2545 jnl->active_tr = jnl->cur_tr;
2546 jnl->cur_tr = NULL;
2547
2548 return 0;
2549 }
2550
2551 ret = journal_allocate_transaction(jnl);
2552 if (ret) {
2553 goto bad_start;
2554 }
2555
2556 // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);
2557
2558 return 0;
2559
2560 bad_start:
2561 jnl->owner = NULL;
2562 jnl->nested_count = 0;
2563 unlock_journal(jnl);
2564
2565 return ret;
2566 }
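/*
 * The canonical life cycle of a transaction, as a sketch (error
 * handling elided; "bp" stands for a meta-data buffer the fs is
 * changing):
 *
 *	journal_start_transaction(jnl);
 *	journal_modify_block_start(jnl, bp);	// declare intent, lock bp
 *	... modify the contents of bp ...
 *	journal_modify_block_end(jnl, bp, NULL, NULL);	// queue it
 *	journal_end_transaction(jnl);		// commit, flush is async
 *
 * A nested journal_start_transaction() from the owning thread only
 * bumps nested_count; the transaction really ends when the outermost
 * journal_end_transaction() is reached.
 */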
2567
2568
2569 int
2570 journal_modify_block_start(journal *jnl, struct buf *bp)
2571 {
2572 transaction *tr;
2573
2574 CHECK_JOURNAL(jnl);
2575
2576
2577 free_old_stuff(jnl);
2578
2579 if (jnl->flags & JOURNAL_INVALID) {
2580 return EINVAL;
2581 }
2582
2583 // XXXdbg - for debugging I want this to be true. later it may
2584 // not be necessary.
2585 if ((buf_flags(bp) & B_META) == 0) {
2586 panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl);
2587 }
2588
2589 tr = jnl->active_tr;
2590 CHECK_TRANSACTION(tr);
2591
2592 if (jnl->owner != current_thread()) {
2593 panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2594 jnl, jnl->owner, current_thread());
2595 }
2596
2597 //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
2598 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2599
2600 // can't allow blocks that aren't an even multiple of the
2601 // underlying block size.
2602 if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) {
2603 uint32_t phys_blksz, bad=0;
2604
2605 if (VNOP_IOCTL(jnl->jdev, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) {
2606 bad = 1;
2607 } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) {
2608 if (phys_blksz < 512) {
2609 panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n",
2610 phys_blksz, buf_size(bp), jnl->jhdr->jhdr_size);
2611 }
2612
2613 if ((buf_size(bp) % phys_blksz) != 0) {
2614 bad = 1;
2615 } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) {
2616 jnl->jhdr->jhdr_size = phys_blksz;
2617 } else {
2618 // the phys_blksz is now larger... need to realloc the jhdr
2619 char *new_header_buf;
2620
2621 printf("jnl: %s: phys blksz got bigger (was: %d/%d now %d)\n",
2622 jnl->jdev_name, jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz);
2623 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&new_header_buf, phys_blksz)) {
2624 printf("jnl: modify_block_start: %s: create: phys blksz change (was %d, now %d) but could not allocate space for new header\n",
2625 jnl->jdev_name, jnl->jhdr->jhdr_size, phys_blksz);
2626 bad = 1;
2627 } else {
2628 memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size);
2629 memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size));
2630 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
2631 jnl->header_buf = new_header_buf;
2632 jnl->header_buf_size = phys_blksz;
2633
2634 jnl->jhdr = (journal_header *)jnl->header_buf;
2635 jnl->jhdr->jhdr_size = phys_blksz;
2636 }
2637 }
2638 } else {
2639 bad = 1;
2640 }
2641
2642 if (bad) {
2643 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
2644 buf_size(bp), jnl->jhdr->jhdr_size);
2645 return -1;
2646 }
2647 }
2648
2649 // make sure that this transaction isn't bigger than the whole journal
2650 if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
2651 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
2652 tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp);
2653 return -1;
2654 }
2655
2656 // if the block is dirty and not already locked we have to write
2657 // it out before we muck with it because it has data that belongs
2658 // (presumably) to another transaction.
2659 //
2660 if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) {
2661
2662 if (buf_flags(bp) & B_ASYNC) {
2663 panic("modify_block_start: bp @ %p has async flag set!\n", bp);
2664 }
2665 if (bp->b_shadow_ref)
2666 panic("modify_block_start: dirty bp @ %p has shadows!\n", bp);
2667
2668 // this will cause it to not be buf_brelse()'d
2669 buf_setflags(bp, B_NORELSE);
2670 VNOP_BWRITE(bp);
2671 }
2672 buf_setflags(bp, B_LOCKED);
2673
2674 return 0;
2675 }
2676
2677 int
2678 journal_modify_block_abort(journal *jnl, struct buf *bp)
2679 {
2680 transaction *tr;
2681 block_list_header *blhdr;
2682 int i;
2683
2684 CHECK_JOURNAL(jnl);
2685
2686 free_old_stuff(jnl);
2687
2688 tr = jnl->active_tr;
2689
2690 //
2691 // if there's no active transaction then we just want to
2692 // call buf_brelse() and return since this is just a block
2693 // that happened to be modified as part of another tr.
2694 //
2695 if (tr == NULL) {
2696 buf_brelse(bp);
2697 return 0;
2698 }
2699
2700 if (jnl->flags & JOURNAL_INVALID) {
2701 /* Still need to buf_brelse(). Callers assume we consume the bp. */
2702 buf_brelse(bp);
2703 return EINVAL;
2704 }
2705
2706 CHECK_TRANSACTION(tr);
2707
2708 if (jnl->owner != current_thread()) {
2709 panic("jnl: modify_block_abort: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2710 jnl, jnl->owner, current_thread());
2711 }
2712
2713 // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
2714
2715 // first check if it's already part of this transaction
2716 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
2717 for (i = 1; i < blhdr->num_blocks; i++) {
2718 if (bp == blhdr->binfo[i].u.bp) {
2719 break;
2720 }
2721 }
2722
2723 if (i < blhdr->num_blocks) {
2724 break;
2725 }
2726 }
2727
2728 //
2729 // if blhdr is null, then this block has only had modify_block_start
2730 // called on it as part of the current transaction. that means that
2731 // it is ok to clear the LOCKED bit since it hasn't actually been
2732 // modified. if blhdr is non-null then modify_block_end was called
2733 // on it and so we need to keep it locked in memory.
2734 //
2735 if (blhdr == NULL) {
2736 buf_clearflags(bp, B_LOCKED);
2737 }
2738
2739 buf_brelse(bp);
2740 return 0;
2741 }
2742
2743
2744 int
2745 journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, void *arg), void *arg)
2746 {
2747 int i = 1;
2748 int tbuffer_offset=0;
2749 block_list_header *blhdr, *prev=NULL;
2750 transaction *tr;
2751
2752 CHECK_JOURNAL(jnl);
2753
2754 free_old_stuff(jnl);
2755
2756 if (jnl->flags & JOURNAL_INVALID) {
2757 /* Still need to buf_brelse(). Callers assume we consume the bp. */
2758 buf_brelse(bp);
2759 return EINVAL;
2760 }
2761
2762 tr = jnl->active_tr;
2763 CHECK_TRANSACTION(tr);
2764
2765 if (jnl->owner != current_thread()) {
2766 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2767 jnl, jnl->owner, current_thread());
2768 }
2769
2770 //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n",
2771 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2772
2773 if ((buf_flags(bp) & B_LOCKED) == 0) {
2774 panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp, jnl);
2775 }
2776
2777 // first check if it's already part of this transaction
2778 for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
2779 tbuffer_offset = jnl->jhdr->blhdr_size;
2780
2781 for (i = 1; i < blhdr->num_blocks; i++) {
2782 if (bp == blhdr->binfo[i].u.bp) {
2783 break;
2784 }
2785 if (blhdr->binfo[i].bnum != (off_t)-1) {
2786 tbuffer_offset += buf_size(blhdr->binfo[i].u.bp);
2787 } else {
2788 tbuffer_offset += blhdr->binfo[i].u.bi.bsize;
2789 }
2790 }
2791
2792 if (i < blhdr->num_blocks) {
2793 break;
2794 }
2795 }
2796
2797 if (blhdr == NULL
2798 && prev
2799 && (prev->num_blocks+1) <= prev->max_blocks
2800 && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) {
2801 blhdr = prev;
2802
2803 } else if (blhdr == NULL) {
2804 block_list_header *nblhdr;
2805 if (prev == NULL) {
2806 panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp);
2807 }
2808
2809 // we got to the end of the list, didn't find the block and there's
2810 // no room in the block_list_header pointed to by prev
2811
2812 // we allocate another tbuffer and link it in at the end of the list
2813 // through prev->binfo[0].bnum. that's a skanky way to do things but
2814 // avoids having yet another linked list of small data structures to manage.
2815
2816 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
2817 panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n",
2818 tr, tr->total_bytes);
2819 }
2820
2821 // journal replay code checksum check depends on this.
2822 memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
2823 // Fill up the rest of the block with unimportant bytes (byte-wise offset via the char * cast)
2824 memset((char *)nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2825
2826 // initialize the new guy
2827 nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2828 nblhdr->num_blocks = 1; // accounts for this header block
2829 nblhdr->bytes_used = jnl->jhdr->blhdr_size;
2830 nblhdr->flags = BLHDR_CHECK_CHECKSUMS;
2831
2832 tr->num_blhdrs++;
2833 tr->total_bytes += jnl->jhdr->blhdr_size;
2834
2835 // then link him in at the end
2836 prev->binfo[0].bnum = (off_t)((long)nblhdr);
2837
2838 // and finally switch to using the new guy
2839 blhdr = nblhdr;
2840 tbuffer_offset = jnl->jhdr->blhdr_size;
2841 i = 1;
2842 }
2843
2844
2845 if ((i+1) > blhdr->max_blocks) {
2846 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
2847 }
2848
2849 // if this is true then this is a new block we haven't seen
2850 if (i >= blhdr->num_blocks) {
2851 int bsize;
2852 vnode_t vp;
2853
2854 vp = buf_vnode(bp);
2855 vnode_ref(vp);
2856 bsize = buf_size(bp);
2857
2858 blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp));
2859 blhdr->binfo[i].u.bp = bp;
2860
2861 KERNEL_DEBUG_CONSTANT(0x3018004, vp, blhdr->binfo[i].bnum, bsize, 0, 0);
2862
2863 if (func) {
2864 void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL;
2865
2866 buf_setfilter(bp, func, arg, &old_func, &old_arg);
2867 if (old_func != NULL && old_func != func) {
2868 panic("jnl: modify_block_end: old func %p / arg %p (func %p)", old_func, old_arg, func);
2869 }
2870 }
2871
2872 blhdr->bytes_used += bsize;
2873 tr->total_bytes += bsize;
2874
2875 blhdr->num_blocks++;
2876 }
2877 buf_bdwrite(bp);
2878
2879 return 0;
2880 }
2881
2882 int
2883 journal_kill_block(journal *jnl, struct buf *bp)
2884 {
2885 int i;
2886 int bflags;
2887 block_list_header *blhdr;
2888 transaction *tr;
2889
2890 CHECK_JOURNAL(jnl);
2891
2892 free_old_stuff(jnl);
2893
2894 if (jnl->flags & JOURNAL_INVALID) {
2895 return EINVAL;
2896 }
2897
2898 tr = jnl->active_tr;
2899 CHECK_TRANSACTION(tr);
2900
2901 if (jnl->owner != current_thread()) {
2902 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2903 jnl, jnl->owner, current_thread());
2904 }
2905
2906 bflags = buf_flags(bp);
2907
2908 if ( !(bflags & B_LOCKED))
2909 panic("jnl: modify_block_end: called with bp not B_LOCKED");
2910
2911 /*
2912 * bp must be BL_BUSY and B_LOCKED
2913 * first check if it's already part of this transaction
2914 */
2915 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
2916
2917 for (i = 1; i < blhdr->num_blocks; i++) {
2918 if (bp == blhdr->binfo[i].u.bp) {
2919 vnode_t vp;
2920
2921 buf_clearflags(bp, B_LOCKED);
2922
2923 // this undoes the vnode_ref() in journal_modify_block_end()
2924 vp = buf_vnode(bp);
2925 vnode_rele_ext(vp, 0, 1);
2926
2927 // if the block has the DELWRI and FILTER bits set, then
2928 // things are seriously weird. if it was part of another
2929 // transaction then journal_modify_block_start() should
2930 // have forced it to be written.
2931 //
2932 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
2933 // panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
2934 //} else {
2935 tr->num_killed += buf_size(bp);
2936 //}
2937 blhdr->binfo[i].bnum = (off_t)-1;
2938 blhdr->binfo[i].u.bp = NULL;
2939 blhdr->binfo[i].u.bi.bsize = buf_size(bp);
2940
2941 buf_markinvalid(bp);
2942 buf_brelse(bp);
2943
2944 break;
2945 }
2946 }
2947
2948 if (i < blhdr->num_blocks) {
2949 break;
2950 }
2951 }
2952
2953 return 0;
2954 }
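//
// journal_kill_block() is the undo for journal_modify_block_end(): if the
// fs modifies a meta-data block inside a transaction and then frees that
// block before the transaction commits, there is no point journaling the
// stale contents. The slot is tombstoned (bnum = -1, bsize recorded)
// rather than removed, so the binfo array never has to be compacted in
// the middle of a transaction.
//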
2955
2956 /*
2957 ;________________________________________________________________________________
2958 ;
2959 ; Routine: journal_trim_set_callback
2960 ;
2961 ; Function: Provide the journal with a routine to be called back when a
2962 ; TRIM has (or would have) been issued to the device. That
2963 ; is, the transaction has been flushed to the device, and the
2964 ; blocks freed by the transaction are now safe for reuse.
2965 ;
2966 ; CAUTION: If the journal becomes invalid (eg., due to an I/O
2967 ; error when trying to write to the journal), this callback
2968 ; will stop getting called, even if extents got freed before
2969 ; the journal became invalid!
2970 ;
2971 ; Input Arguments:
2972 ; jnl - The journal structure for the filesystem.
2973 ; callback - The function to call when the TRIM is complete.
2974 ; arg - An argument to be passed to callback.
2975 ;________________________________________________________________________________
2976 */
2977 __private_extern__ void
2978 journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg)
2979 {
2980 jnl->trim_callback = callback;
2981 jnl->trim_callback_arg = arg;
2982 }
2983
2984
2985 /*
2986 ;________________________________________________________________________________
2987 ;
2988 ; Routine: journal_trim_realloc
2989 ;
2990 ; Function: Increase the amount of memory allocated for the list of extents
2991 ; to be unmapped (trimmed). This routine will be called when
2992 ; adding an extent to the list, and the list already occupies
2993 ; all of the space allocated to it. This routine returns ENOMEM
2994 ; if unable to allocate more space, or 0 if the extent list was
2995 ; grown successfully.
2996 ;
2997 ; Input Arguments:
2998 ; trim - The trim list to be resized.
2999 ;
3000 ; Output:
3001 ; (result) - ENOMEM or 0.
3002 ;
3003 ; Side effects:
3004 ; The allocated_count and extents fields of tr->trim are updated
3005 ; if the function returned 0.
3006 ;________________________________________________________________________________
3007 */
3008 static int
3009 trim_realloc(struct jnl_trim_list *trim)
3010 {
3011 void *new_extents;
3012 uint32_t new_allocated_count;
3013
3014 if (jnl_kdebug)
3015 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, trim, 0, trim->allocated_count, trim->extent_count, 0);
3016
3017 new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS;
3018 new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t));
3019 if (new_extents == NULL) {
3020 printf("jnl: trim_realloc: unable to grow extent list!\n");
3021 /*
3022 * Since we could be called when allocating space previously marked
3023 * to be trimmed, we need to empty out the list to be safe.
3024 */
3025 trim->extent_count = 0;
3026 if (jnl_kdebug)
3027 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, ENOMEM, 0, trim->allocated_count, 0, 0);
3028 return ENOMEM;
3029 }
3030
3031 /* Copy the old extent list to the newly allocated list. */
3032 if (trim->extents != NULL) {
3033 memmove(new_extents,
3034 trim->extents,
3035 trim->allocated_count * sizeof(dk_extent_t));
3036 kfree(trim->extents,
3037 trim->allocated_count * sizeof(dk_extent_t));
3038 }
3039
3040 trim->allocated_count = new_allocated_count;
3041 trim->extents = new_extents;
3042
3043 if (jnl_kdebug)
3044 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, 0, 0, new_allocated_count, trim->extent_count, 0);
3045
3046 return 0;
3047 }
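//
// Growth is linear, not doubling: each successful call adds
// JOURNAL_DEFAULT_TRIM_EXTENTS entries to allocated_count. Note the
// failure tradeoff above: dropping the pending extents is safe (they
// simply won't be trimmed), whereas trimming from a stale or partial
// list would not be.
//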
3048
3049 /*
3050 ;________________________________________________________________________________
3051 ;
3052 ; Routine: trim_search_extent
3053 ;
3054 ; Function: Search the given extent list to see if any of its extents
3055 ; overlap the given extent.
3056 ;
3057 ; Input Arguments:
3058 ; trim - The trim list to be searched.
3059 ; offset - The first byte of the range to be searched for.
3060 ; length - The number of bytes of the extent being searched for.
3061 ; overlap_start - start of the overlapping extent (output)
3062 ; overlap_len - length of the overlapping extent (output)
3063 ;
3064 ; Output:
3065 ; (result) - TRUE if one or more extents overlap, FALSE otherwise.
3066 ;________________________________________________________________________________
3067 */
3068 static int
3069 trim_search_extent(struct jnl_trim_list *trim, uint64_t offset,
3070 uint64_t length, uint64_t *overlap_start, uint64_t *overlap_len)
3071 {
3072 uint64_t end = offset + length;
3073 uint32_t lower = 0; /* Lowest index to search */
3074 uint32_t upper = trim->extent_count; /* Highest index to search + 1 */
3075 uint32_t middle;
3076
3077 /* A binary search over the extent list. */
3078 while (lower < upper) {
3079 middle = (lower + upper) / 2;
3080
3081 if (trim->extents[middle].offset >= end)
3082 upper = middle;
3083 else if (trim->extents[middle].offset + trim->extents[middle].length <= offset)
3084 lower = middle + 1;
3085 else {
3086 if (overlap_start) {
3087 *overlap_start = trim->extents[middle].offset;
3088 }
3089 if (overlap_len) {
3090 *overlap_len = trim->extents[middle].length;
3091 }
3092 return TRUE;
3093 }
3094 }
3095
3096 return FALSE;
3097 }
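//
// The binary search relies on the extent list being sorted by offset
// with no overlapping entries (journal_trim_add_extent maintains that
// invariant). Example (illustrative): with extents {0x1000, 0x1000}
// and {0x5000, 0x2000}, searching offset 0x6000, length 0x100 probes
// the second extent, whose range [0x5000, 0x7000) covers the request,
// so TRUE is returned with overlap_start = 0x5000, overlap_len = 0x2000.
//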
3098
3099
3100 /*
3101 ;________________________________________________________________________________
3102 ;
3103 ; Routine: journal_trim_add_extent
3104 ;
3105 ; Function: Keep track of extents that have been freed as part of this
3106 ; transaction. If the underlying device supports TRIM (UNMAP),
3107 ; then those extents will be trimmed/unmapped once the
3108 ; transaction has been written to the journal. (For example,
3109 ; SSDs can support trim/unmap and avoid having to recopy those
3110 ; blocks when doing wear leveling, and may reuse the same
3111 ; physical blocks for different logical blocks.)
3112 ;
3113 ; HFS also uses this, in combination with journal_trim_set_callback,
3114 ; to add recently freed extents to its free extent cache, but
3115 ; only after the transaction that freed them is committed to
3116 ; disk. (This reduces the chance of overwriting live data in
3117 ; a way that causes data loss if a transaction never gets
3118 ; written to the journal.)
3119 ;
3120 ; Input Arguments:
3121 ; jnl - The journal for the volume containing the byte range.
3122 ; offset - The first byte of the range to be trimmed.
3123 ; length - The number of bytes of the extent being trimmed.
3124 ;________________________________________________________________________________
3125 */
3126 __private_extern__ int
3127 journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
3128 {
3129 uint64_t end;
3130 transaction *tr;
3131 dk_extent_t *extent;
3132 uint32_t insert_index;
3133 uint32_t replace_count;
3134
3135 CHECK_JOURNAL(jnl);
3136
3137 /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */
3138 if (jnl->flags & JOURNAL_INVALID) {
3139 return EINVAL;
3140 }
3141
3142 tr = jnl->active_tr;
3143 CHECK_TRANSACTION(tr);
3144
3145 if (jnl_kdebug)
3146 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
3147
3148 if (jnl->owner != current_thread()) {
3149 panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
3150 jnl, jnl->owner, current_thread());
3151 }
3152
3153 free_old_stuff(jnl);
3154
3155 end = offset + length;
3156
3157 /*
3158 * Find the range of existing extents that can be combined with the
3159 * input extent. We start by counting the number of extents that end
3160 * strictly before the input extent, then count the number of extents
3161 * that overlap or are contiguous with the input extent.
3162 */
3163 extent = tr->trim.extents;
3164 insert_index = 0;
3165 while (insert_index < tr->trim.extent_count && extent->offset + extent->length < offset) {
3166 ++insert_index;
3167 ++extent;
3168 }
3169 replace_count = 0;
3170 while (insert_index + replace_count < tr->trim.extent_count && extent->offset <= end) {
3171 ++replace_count;
3172 ++extent;
3173 }
3174
3175 /*
3176 * If none of the existing extents can be combined with the input extent,
3177 * then just insert it in the list (before item number insert_index).
3178 */
3179 if (replace_count == 0) {
3180 /* If the list was already full, we need to grow it. */
3181 if (tr->trim.extent_count == tr->trim.allocated_count) {
3182 if (trim_realloc(&tr->trim) != 0) {
3183 printf("jnl: trim_add_extent: out of memory!");
3184 if (jnl_kdebug)
3185 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0);
3186 return ENOMEM;
3187 }
3188 }
3189
3190 /* Shift any existing extents with larger offsets. */
3191 if (insert_index < tr->trim.extent_count) {
3192 memmove(&tr->trim.extents[insert_index+1],
3193 &tr->trim.extents[insert_index],
3194 (tr->trim.extent_count - insert_index) * sizeof(dk_extent_t));
3195 }
3196 tr->trim.extent_count++;
3197
3198 /* Store the new extent in the list. */
3199 tr->trim.extents[insert_index].offset = offset;
3200 tr->trim.extents[insert_index].length = length;
3201
3202 /* We're done. */
3203 if (jnl_kdebug)
3204 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0);
3205 return 0;
3206 }
3207
3208 /*
3209 * Update extent number insert_index to be the union of the input extent
3210 * and all of the replaced extents.
3211 */
3212 if (tr->trim.extents[insert_index].offset < offset)
3213 offset = tr->trim.extents[insert_index].offset;
3214 extent = &tr->trim.extents[insert_index + replace_count - 1];
3215 if (extent->offset + extent->length > end)
3216 end = extent->offset + extent->length;
3217 tr->trim.extents[insert_index].offset = offset;
3218 tr->trim.extents[insert_index].length = end - offset;
3219
3220 /*
3221 * If we were replacing more than one existing extent, then shift any
3222 * extents with larger offsets, and update the count of extents.
3223 *
3224 * We're going to leave extent #insert_index alone since it was just updated, above.
3225 * We need to move extents from index (insert_index + replace_count) through the end of
3226 * the list by (replace_count - 1) positions so that they overwrite extent #(insert_index + 1).
3227 */
3228 if (replace_count > 1 && (insert_index + replace_count) < tr->trim.extent_count) {
3229 memmove(&tr->trim.extents[insert_index + 1],
3230 &tr->trim.extents[insert_index + replace_count],
3231 (tr->trim.extent_count - insert_index - replace_count) * sizeof(dk_extent_t));
3232 }
3233 tr->trim.extent_count -= replace_count - 1;
3234
3235 if (jnl_kdebug)
3236 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0);
3237 return 0;
3238 }
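/*
 * Coalescing example (illustrative): if the sorted list holds
 * {0x1000, 0x1000} and {0x3000, 0x1000} and we add {0x2000, 0x1000},
 * the first loop leaves insert_index = 0 (the first extent ends
 * exactly at 0x2000, contiguous rather than strictly before), the
 * second loop counts replace_count = 2 (both extents touch
 * [0x2000, 0x3000]), and the union step collapses everything into the
 * single extent {0x1000, 0x3000}, covering 0x1000 through 0x4000.
 */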
3239
3240 /*
3241 * journal_trim_extent_overlap
3242 *
3243 * Return 1 if there are any pending TRIMs that overlap with the given offset and length
3244 * Return 0 otherwise.
3245 */
3246
3247 int journal_trim_extent_overlap (journal *jnl, uint64_t offset, uint64_t length, uint64_t *end) {
3248 transaction *tr = NULL;
3249 int overlap = 0;
3250
3251 uint64_t overlap_start;
3252 uint64_t overlap_len;
3253 tr = jnl->active_tr;
3254 CHECK_TRANSACTION(tr);
3255
3256 /*
3257 * There are two lists that need to be examined for potential overlaps:
3258 *
3259 * The first is the current transaction. Since this function requires that
3260 * a transaction be active when this is called, this is the "active_tr"
3261 * pointer in the journal struct. This has a trimlist pointer which needs
3262 * to be searched.
3263 */
3264 overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len);
3265 if (overlap == 0) {
3266 /*
3267 * The second is the async trim list, which is only done if the current
3268 * transaction group (active transaction) did not overlap with our target
3269 * extent. This async trim list is the set of all previously
3270 * committed transaction groups whose I/Os are now in-flight. We need to hold the
3271 * trim lock in order to search this list. If we grab the list before the
3272 * TRIM has completed, then we will compare it. If it is grabbed AFTER the
3273 * TRIM has completed, then the pointer will be zeroed out and we won't have
3274 * to check anything.
3275 */
3276 lck_rw_lock_shared (&jnl->trim_lock);
3277 if (jnl->async_trim != NULL) {
3278 overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len);
3279 }
3280 lck_rw_unlock_shared (&jnl->trim_lock);
3281 }
3282
3283 if (overlap) {
3284 /* compute the end (min) of the overlapping range */
3285 if ( (overlap_start + overlap_len) < (offset + length)) {
3286 *end = (overlap_start + overlap_len);
3287 }
3288 else {
3289 *end = (offset + length);
3290 }
3291 }
3292
3293
3294 return overlap;
3295 }
3296
3297 /*
3298 * journal_request_immediate_flush
3299 *
3300 * FS requests that the journal flush immediately upon the
3301 * active transaction's completion.
3302 *
3303 * Returns 0 if operation succeeds
3304 * Returns EPERM if we failed to leave hint
3305 */
3306 int
3307 journal_request_immediate_flush (journal *jnl) {
3308
3309 transaction *tr = NULL;
3310 /*
3311 * Is a transaction still in process? You must do
3312 * this while there are txns open
3313 */
3314 tr = jnl->active_tr;
3315 if (tr != NULL) {
3316 CHECK_TRANSACTION(tr);
3317 tr->flush_on_completion = TRUE;
3318 }
3319 else {
3320 return EPERM;
3321 }
3322 return 0;
3323 }
3324
3325
3326
3327 /*
3328 ;________________________________________________________________________________
3329 ;
3330 ; Routine: trim_remove_extent
3331 ;
3332 ; Function: Indicate that a range of bytes, some of which may have previously
3333 ; been passed to journal_trim_add_extent, is now allocated.
3334 ; Any overlapping ranges currently in the journal's trim list will
3335 ; be removed. If the underlying device supports TRIM (UNMAP), then
3336 ; these extents will not be trimmed/unmapped when the transaction
3337 ; is written to the journal.
3338 ;
3339 ; HFS also uses this to prevent newly allocated space from being
3340 ; added to its free extent cache (if some portion of the newly
3341 ; allocated space was recently freed).
3342 ;
3343 ; Input Arguments:
3344 ; trim - The trim list to update.
3345 ; offset - The first byte of the range to remove from the trim list.
3346 ; length - The number of bytes in that range.
3347 ;________________________________________________________________________________
3348 */
3349 static int
3350 trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
3351 {
3352 u_int64_t end;
3353 dk_extent_t *extent;
3354 u_int32_t keep_before;
3355 u_int32_t keep_after;
3356
3357 end = offset + length;
3358
3359 /*
3360 * Find any existing extents that start before or end after the input
3361 * extent. These extents will be modified if they overlap the input
3362 * extent. Other extents between them will be deleted.
3363 */
3364 extent = trim->extents;
3365 keep_before = 0;
3366 while (keep_before < trim->extent_count && extent->offset < offset) {
3367 ++keep_before;
3368 ++extent;
3369 }
3370 keep_after = keep_before;
3371 if (keep_after > 0) {
3372 /* See if previous extent extends beyond both ends of input extent. */
3373 --keep_after;
3374 --extent;
3375 }
3376 while (keep_after < trim->extent_count && (extent->offset + extent->length) <= end) {
3377 ++keep_after;
3378 ++extent;
3379 }
3380
3381 /*
3382 * When we get here, the first keep_before extents (0 .. keep_before-1)
3383 * start before the input extent, and extents (keep_after .. extent_count-1)
3384 * end after the input extent. We'll need to keep all of those extents,
3385 * but possibly modify #(keep_before-1) and #keep_after to remove the portion
3386 * that overlaps with the input extent.
3387 */
3388
3389 /*
3390 * Does the input extent start after and end before the same existing
3391 * extent? If so, we have to "punch a hole" in that extent and convert
3392 * it to two separate extents.
3393 */
3394 if (keep_before > keep_after) {
3395 /* If the list was already full, we need to grow it. */
3396 if (trim->extent_count == trim->allocated_count) {
3397 if (trim_realloc(trim) != 0) {
3398 printf("jnl: trim_remove_extent: out of memory!");
3399 return ENOMEM;
3400 }
3401 }
3402
3403 /*
3404 * Make room for a new extent by shifting extents #keep_after and later
3405 * down by one extent. When we're done, extents #keep_before and
3406 * #keep_after will be identical, and we can fall through to removing
3407 * the portion that overlaps the input extent.
3408 */
3409 memmove(&trim->extents[keep_before],
3410 &trim->extents[keep_after],
3411 (trim->extent_count - keep_after) * sizeof(dk_extent_t));
3412 ++trim->extent_count;
3413 ++keep_after;
3414
3415 /*
3416 * Fall through. We now have the case where the length of extent
3417 * #(keep_before - 1) needs to be updated, and the start of extent
3418 * #(keep_after) needs to be updated.
3419 */
3420 }
3421
3422 /*
3423 * May need to truncate the end of extent #(keep_before - 1) if it overlaps
3424 * the input extent.
3425 */
3426 if (keep_before > 0) {
3427 extent = &trim->extents[keep_before - 1];
3428 if (extent->offset + extent->length > offset) {
3429 extent->length = offset - extent->offset;
3430 }
3431 }
3432
3433 /*
3434 * May need to update the start of extent #(keep_after) if it overlaps the
3435 * input extent.
3436 */
3437 if (keep_after < trim->extent_count) {
3438 extent = &trim->extents[keep_after];
3439 if (extent->offset < end) {
3440 extent->length = extent->offset + extent->length - end;
3441 extent->offset = end;
3442 }
3443 }
3444
3445 /*
3446 * If there were whole extents that overlapped the input extent, get rid
3447 * of them by shifting any following extents, and updating the count.
3448 */
3449 if (keep_after > keep_before && keep_after < trim->extent_count) {
3450 memmove(&trim->extents[keep_before],
3451 &trim->extents[keep_after],
3452 (trim->extent_count - keep_after) * sizeof(dk_extent_t));
3453 }
3454 trim->extent_count -= keep_after - keep_before;
3455
3456 return 0;
3457 }
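/*
 * Worked example (illustrative numbers, not from the original source):
 * suppose the trim list holds the single extent {offset 100, length 50}
 * and we remove offset 120, length 10 (so end == 130). keep_before ends
 * up as 1 and keep_after as 0, so we punch a hole: the extent is
 * duplicated (extent_count becomes 2, keep_after becomes 1), the first
 * copy is truncated to {100, 20}, and the second is advanced to
 * {130, 20}. Since keep_after - keep_before == 0 at the end, the final
 * count stays at 2.
 */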
3458
3459 /*
3460 ;________________________________________________________________________________
3461 ;
3462 ; Routine: journal_trim_remove_extent
3463 ;
3464 ; Function: Make note that a range of bytes, some of which may have previously
3465 ; been passed to journal_trim_add_extent, is now in use on the
3466 ; volume. The given bytes will not be trimmed as part of this
3467 ; transaction, nor as part of a pending trim of a transaction
3468 ; being asynchronously flushed.
3469 ;
3470 ; Input Arguments:
3471 ; jnl - The journal for the volume containing the byte range.
3472 ; offset - The first byte of the range that is now in use.
3473 ; length - The number of bytes in that range.
3474 ;________________________________________________________________________________
3475 */
3476 __private_extern__ int
3477 journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
3478 {
3479 int error = 0;
3480 transaction *tr;
3481
3482 CHECK_JOURNAL(jnl);
3483
3484 /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */
3485 if (jnl->flags & JOURNAL_INVALID) {
3486 return EINVAL;
3487 }
3488
3489 tr = jnl->active_tr;
3490 CHECK_TRANSACTION(tr);
3491
3492 if (jnl_kdebug)
3493 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
3494
3495 if (jnl->owner != current_thread()) {
3496 panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
3497 jnl, jnl->owner, current_thread());
3498 }
3499
3500 free_old_stuff(jnl);
3501
3502 error = trim_remove_extent(&tr->trim, offset, length);
3503 if (error == 0) {
3504 int found = FALSE;
3505
3506 /*
3507 * See if a pending trim has any extents that overlap with the
3508 * one we were given.
3509 */
3510 lck_rw_lock_shared(&jnl->trim_lock);
3511 if (jnl->async_trim != NULL)
3512 found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL);
3513 lck_rw_unlock_shared(&jnl->trim_lock);
3514
3515 if (found) {
3516 /*
3517 * There was an overlap, so avoid trimming the extent we
3518 * just allocated. (Otherwise, it might get trimmed after
3519 * we've written to it, which will cause that data to be
3520 * corrupted.)
3521 */
3522 uint32_t async_extent_count = 0;
3523
3524 if (jnl_kdebug)
3525 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, jnl, offset, length, 0, 0);
3526 lck_rw_lock_exclusive(&jnl->trim_lock);
3527 if (jnl->async_trim != NULL) {
3528 error = trim_remove_extent(jnl->async_trim, offset, length);
3529 async_extent_count = jnl->async_trim->extent_count;
3530 }
3531 lck_rw_unlock_exclusive(&jnl->trim_lock);
3532 if (jnl_kdebug)
3533 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_END, error, 0, 0, async_extent_count, 0);
3534 }
3535 }
3536
3537 if (jnl_kdebug)
3538 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_END, error, 0, 0, tr->trim.extent_count, 0);
3539 return error;
3540 }
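/*
 * Usage sketch (illustrative only; fs_block_size and new_block are
 * hypothetical names): an allocator handing out blocks that may have been
 * freed earlier, within its open transaction, cancels any pending trim
 * before the blocks are written to:
 *
 *	uint64_t offset = (uint64_t)new_block * fs_block_size;
 *
 *	if (journal_trim_remove_extent(jnl, offset, fs_block_size) != 0) {
 *		// ENOMEM growing a trim list; fail the allocation
 *	}
 */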
3541
3542
3543 static int
3544 journal_trim_flush(journal *jnl, transaction *tr)
3545 {
3546 int errno = 0;
3547
3548 if (jnl_kdebug)
3549 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0);
3550
3551 lck_rw_lock_shared(&jnl->trim_lock);
3552 if (tr->trim.extent_count > 0) {
3553 dk_unmap_t unmap;
3554
3555 bzero(&unmap, sizeof(unmap));
3556 if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) {
3557 unmap.extents = tr->trim.extents;
3558 unmap.extentsCount = tr->trim.extent_count;
3559 if (jnl_kdebug)
3560 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0);
3561 errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel());
3562 if (jnl_kdebug)
3563 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0);
3564 }
3565
3566 /*
3567 * Call back into the file system to tell them that we have
3568 * trimmed some extents and that they can now be reused.
3569 *
3570 * CAUTION: If the journal becomes invalid (e.g., due to an I/O
3571 * error when trying to write to the journal), this callback
3572 * will stop getting called, even if extents got freed before
3573 * the journal became invalid!
3574 */
3575 if (jnl->trim_callback)
3576 jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents);
3577 }
3578 lck_rw_unlock_shared(&jnl->trim_lock);
3579
3580 /*
3581 * If the transaction we're flushing was the async transaction, then
3582 * tell the current transaction that there is no pending trim
3583 * any more.
3584 *
3585 * NOTE: Since we released the lock, another thread could have
3586 * removed one or more extents from our list. That's not a
3587 * problem since any writes to the re-allocated blocks
3588 * would get sent to the device after the DKIOCUNMAP.
3589 */
3590 lck_rw_lock_exclusive(&jnl->trim_lock);
3591 if (jnl->async_trim == &tr->trim)
3592 jnl->async_trim = NULL;
3593 lck_rw_unlock_exclusive(&jnl->trim_lock);
3594
3595 /*
3596 * By the time we get here, no other thread can discover the address
3597 * of "tr", so it is safe for us to manipulate tr->trim without
3598 * holding any locks.
3599 */
3600 if (tr->trim.extents) {
3601 kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
3602 tr->trim.allocated_count = 0;
3603 tr->trim.extent_count = 0;
3604 tr->trim.extents = NULL;
3605 }
3606
3607 if (jnl_kdebug)
3608 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_END, errno, 0, 0, 0, 0);
3609
3610 return errno;
3611 }
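/*
 * Callback sketch (illustrative only; the fs_* names are hypothetical):
 * the function stored in jnl->trim_callback is invoked as at the call
 * site above, with the registered argument and the extents that were just
 * unmapped. A file system might use it to return freed space to its
 * free-extent cache:
 *
 *	static void
 *	fs_trim_callback(void *arg, uint32_t extent_count, dk_extent_t *extents)
 *	{
 *		struct fs_mount *mp = arg;	// hypothetical FS mount type
 *		uint32_t i;
 *
 *		for (i = 0; i < extent_count; i++)
 *			fs_free_extent_cache_add(mp, extents[i].offset, extents[i].length);
 *	}
 */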
3612
3613 static int
3614 journal_binfo_cmp(const void *a, const void *b)
3615 {
3616 const block_info *bi_a = (const struct block_info *)a;
3617 const block_info *bi_b = (const struct block_info *)b;
3618 daddr64_t res;
3619
3620 if (bi_a->bnum == (off_t)-1) {
3621 return 1;
3622 }
3623 if (bi_b->bnum == (off_t)-1) {
3624 return -1;
3625 }
3626
3627 // don't have to worry about negative block
3628 // numbers so this is ok to do.
3629 //
3630 res = (buf_blkno(bi_a->u.bp) - buf_blkno(bi_b->u.bp));
3631
3632 return (int)res;
3633 }
3634
3635
3636 /*
3637 * End a transaction. If the transaction is small enough, and we're not forcing
3638 * a write to disk, the "active" transaction becomes the "current" transaction,
3639 * and will be reused for the next transaction that is started (group commit).
3640 *
3641 * If the transaction gets written to disk (because force_it is true, or no
3642 * group commit, or the transaction is sufficiently full), the blocks get
3643 * written into the journal first, then they are written asynchronously. When
3644 * those async writes complete, the transaction can be freed and removed from
3645 * the journal.
3646 *
3647 * An optional callback can be supplied. If given, it is called after
3648 * the blocks have been written to the journal, but before the async writes
3649 * of those blocks to their normal on-disk locations. This is used by
3650 * journal_relocate so that the location of the journal can be changed and
3651 * flushed to disk before the blocks get written to their normal locations.
3652 * Note that the callback is only called if the transaction gets written to
3653 * the journal during this end_transaction call; you probably want to set the
3654 * force_it flag.
3655 *
3656 * Inputs:
3657 * tr Transaction to add to the journal
3658 * force_it If true, force this transaction to the on-disk journal immediately.
3659 * callback See description above. Pass NULL for no callback.
3660 * callback_arg Argument passed to callback routine.
3661 *
3662 * Result
3663 * 0 No errors
3664 * -1 An error occurred. The journal is marked invalid.
3665 */
3666 static int
3667 end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait)
3668 {
3669 block_list_header *blhdr=NULL, *next=NULL;
3670 int i, ret_val = 0;
3671 errno_t errno;
3672 journal *jnl = tr->jnl;
3673 struct buf *bp;
3674 size_t tbuffer_offset;
3675 boolean_t drop_lock_early;
3676
3677 if (jnl->cur_tr) {
3678 panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
3679 jnl, jnl->cur_tr, tr);
3680 }
3681
3682 // if there weren't any modified blocks in the transaction
3683 // just save off the transaction pointer and return.
3684 if (tr->total_bytes == jnl->jhdr->blhdr_size) {
3685 jnl->cur_tr = tr;
3686 goto done;
3687 }
3688
3689 // if our transaction buffer isn't very full, just hang
3690 // on to it and don't actually flush anything. this is
3691 // what is known as "group commit". we will flush the
3692 // transaction buffer if it's full or if we have more than
3693 // one of them so we don't start hogging too much memory.
3694 //
3695 // We also check the device supports UNMAP/TRIM, and if so,
3696 // the number of extents waiting to be trimmed. If it is
3697 // small enough, then keep accumulating more (so we can
3698 // reduce the overhead of trimming). If there was a prior
3699 // trim error, then we stop issuing trims for this
3700 // volume, so we can also coalesce transactions.
3701 //
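// For example (illustrative numbers only): with a 128 KiB tbuffer and a
// single block list header, the transaction keeps accumulating until
// total_bytes exceeds 128K - 128K/8 = 112 KiB; once a third blhdr is
// needed (num_blhdrs == 3), it is flushed regardless.
//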
3702 if ( force_it == 0
3703 && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
3704 && tr->num_blhdrs < 3
3705 && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))
3706 && (!(jnl->flags & JOURNAL_USE_UNMAP) || (tr->trim.extent_count < jnl_trim_flush_limit))) {
3707
3708 jnl->cur_tr = tr;
3709 goto done;
3710 }
3711
3712 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_START, jnl, tr, drop_lock, must_wait, 0);
3713
3714 lock_condition(jnl, &jnl->flushing, "end_transaction");
3715
3716 /*
3717 * if the previous 'finish_end_transaction' was being run
3718 * asynchronously, it could have encountered a condition
3719 * that caused it to mark the journal invalid... if that
3720 * occurred while we were waiting for it to finish, we
3721 * need to notice and abort the current transaction
3722 */
3723 if ((jnl->flags & JOURNAL_INVALID) || jnl->flush_aborted == TRUE) {
3724 unlock_condition(jnl, &jnl->flushing);
3725
3726 abort_transaction(jnl, tr);
3727 ret_val = -1;
3728 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
3729 goto done;
3730 }
3731
3732 /*
3733 * Store a pointer to this transaction's trim list so that
3734 * future transactions can find it.
3735 *
3736 * Note: if there are no extents in the trim list, then don't
3737 * bother saving the pointer since nothing can add new extents
3738 * to the list (and other threads/transactions only care if
3739 * there is a trim pending).
3740 */
3741 lck_rw_lock_exclusive(&jnl->trim_lock);
3742 if (jnl->async_trim != NULL)
3743 panic("jnl: end_transaction: async_trim already non-NULL!");
3744 if (tr->trim.extent_count > 0)
3745 jnl->async_trim = &tr->trim;
3746 lck_rw_unlock_exclusive(&jnl->trim_lock);
3747
3748 /*
3749 * snapshot the transaction sequence number while we are still behind
3750 * the journal lock since it will be bumped upon the start of the
3751 * next transaction group which may overlap the current journal flush...
3752 * we pass the snapshot into write_journal_header during the journal
3753 * flush so that it can write the correct version in the header...
3754 * because we hold the 'flushing' condition variable for the duration
3755 * of the journal flush, 'saved_sequence_num' remains stable
3756 */
3757 jnl->saved_sequence_num = jnl->sequence_num;
3758
3759 /*
3760 * if we're here we're going to flush the transaction buffer to disk.
3761 * 'check_free_space' will not return until there is enough free
3762 * space for this transaction in the journal and jnl->old_start[0]
3763 * is available for use.
3764 */
3765 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0);
3766
3767 check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num);
3768
3769 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, tr->delayed_header_write, 0, 0, 0);
3770
3771 // range check the end index
3772 if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
3773 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
3774 jnl->jhdr->end, jnl->jhdr->size);
3775 }
3776 if (tr->delayed_header_write == TRUE) {
3777 thread_t thread = THREAD_NULL;
3778
3779 lock_condition(jnl, &jnl->writing_header, "end_transaction");
3780 /*
3781 * fire up a thread to write the journal header
3782 * asynchronously... when it finishes, it will call
3783 * unlock_condition... we can overlap the preparation of
3784 * the log and buffers during this time
3785 */
3786 kernel_thread_start((thread_continue_t)write_header_thread, jnl, &thread);
3787 } else
3788 jnl->write_header_failed = FALSE;
3789
3790
3791 // this transaction starts where the current journal ends
3792 tr->journal_start = jnl->jhdr->end;
3793
3794 lock_oldstart(jnl);
3795 /*
3796 * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy.
3797 * Slide everyone else down and put our latest guy in the last
3798 * entry in the old_start array.
3799 */
3800 memcpy(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void *), sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
3801 jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
3802
3803 unlock_oldstart(jnl);
3804
3805
3806 for (blhdr = tr->blhdr; blhdr; blhdr = next) {
3807 char *blkptr;
3808 buf_t sbp;
3809 int32_t bsize;
3810
3811 tbuffer_offset = jnl->jhdr->blhdr_size;
3812
3813 for (i = 1; i < blhdr->num_blocks; i++) {
3814
3815 if (blhdr->binfo[i].bnum != (off_t)-1) {
3816 void (*func)(buf_t, void *);
3817 void *arg;
3818
3819 bp = blhdr->binfo[i].u.bp;
3820
3821 if (bp == NULL) {
3822 panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
3823 blhdr->binfo[i].bnum, jnl, tr);
3824 }
3825 /*
3826 * acquire the bp here so that we can safely
3827 * mess around with its data. buf_acquire()
3828 * will return EAGAIN if the buffer was busy,
3829 * so loop trying again.
3830 */
3831 do {
3832 errno = buf_acquire(bp, BAC_REMOVE, 0, 0);
3833 } while (errno == EAGAIN);
3834
3835 if (errno)
3836 panic("could not acquire bp %p (err %d)\n", bp, errno);
3837
3838 if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
3839 if (jnl->flags & JOURNAL_CLOSE_PENDING) {
3840 buf_clearflags(bp, B_LOCKED);
3841 buf_brelse(bp);
3842
3843 /*
3844 * this is an odd case that appears to happen occasionally.
3845 * make sure we mark this block as no longer valid
3846 * so that we don't process it in "finish_end_transaction", since
3847 * the bp that is recorded in our array no longer belongs
3848 * to us (normally we substitute a shadow bp to be processed)...
3849 * issuing a 'buf_bawrite' on a stale buf_t pointer leads
3850 * to all kinds of problems.
3851 */
3852 blhdr->binfo[i].bnum = (off_t)-1;
3853 continue;
3854 } else {
3855 panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp));
3856 }
3857 }
3858 bsize = buf_size(bp);
3859
3860 buf_setfilter(bp, NULL, NULL, &func, &arg);
3861
3862 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
3863
3864 sbp = buf_create_shadow_priv(bp, FALSE, (uintptr_t)blkptr, 0, 0);
3865
3866 if (sbp == NULL)
3867 panic("jnl: buf_create_shadow returned NULL");
3868
3869 /*
3870 * copy the data into the transaction buffer...
3871 */
3872 memcpy(blkptr, (char *)buf_dataptr(bp), bsize);
3873
3874 buf_clearflags(bp, B_LOCKED);
3875 buf_markclean(bp);
3876 buf_drop(bp);
3877
3878 /*
3879 * adopt the shadow buffer for this block
3880 */
3881 if (func) {
3882 /*
3883 * transfer FS hook function to the
3884 * shadow buffer... it will get called
3885 * in finish_end_transaction
3886 */
3887 buf_setfilter(sbp, func, arg, NULL, NULL);
3888 }
3889 blhdr->binfo[i].u.bp = sbp;
3890
3891 } else {
3892 // bnum == -1, only true if a block was "killed"
3893 bsize = blhdr->binfo[i].u.bi.bsize;
3894 }
3895 tbuffer_offset += bsize;
3896 }
3897 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
3898 }
3899 /*
3900 * if callback != NULL, we don't want to drop the journal
3901 * lock, or complete end_transaction asynchronously, since
3902 * the caller is expecting the callback to run in the calling
3903 * context
3904 *
3905 * if drop_lock == FALSE, we can't complete end_transaction
3906 * asynchronously
3907 */
3908 if (callback)
3909 drop_lock_early = FALSE;
3910 else
3911 drop_lock_early = drop_lock;
3912
3913 if (drop_lock_early == FALSE)
3914 must_wait = TRUE;
3915
3916 if (drop_lock_early == TRUE) {
3917 jnl->owner = NULL;
3918 unlock_journal(jnl);
3919 drop_lock = FALSE;
3920 }
3921 if (must_wait == TRUE)
3922 ret_val = finish_end_transaction(tr, callback, callback_arg);
3923 else {
3924 thread_t thread = THREAD_NULL;
3925
3926 /*
3927 * fire up a thread to complete processing this transaction
3928 * asynchronously... when it finishes, it will call
3929 * unlock_condition
3930 */
3931 kernel_thread_start((thread_continue_t)finish_end_thread, tr, &thread);
3932 }
3933 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
3934 done:
3935 if (drop_lock == TRUE) {
3936 jnl->owner = NULL;
3937 unlock_journal(jnl);
3938 }
3939 return (ret_val);
3940 }
3941
3942
3943 static void
3944 finish_end_thread(transaction *tr)
3945 {
3946 proc_set_task_policy(current_task(), current_thread(),
3947 TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
3948
3949 finish_end_transaction(tr, NULL, NULL);
3950
3951 thread_deallocate(current_thread());
3952 thread_terminate(current_thread());
3953 }
3954
3955 static void
3956 write_header_thread(journal *jnl)
3957 {
3958 proc_set_task_policy(current_task(), current_thread(),
3959 TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
3960
3961 if (write_journal_header(jnl, 1, jnl->saved_sequence_num))
3962 jnl->write_header_failed = TRUE;
3963 else
3964 jnl->write_header_failed = FALSE;
3965 unlock_condition(jnl, &jnl->writing_header);
3966
3967 thread_deallocate(current_thread());
3968 thread_terminate(current_thread());
3969 }
3970
3971 static int
3972 finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg)
3973 {
3974 int i, amt;
3975 int ret = 0;
3976 off_t end;
3977 journal *jnl = tr->jnl;
3978 buf_t bp, *bparray;
3979 vnode_t vp;
3980 block_list_header *blhdr=NULL, *next=NULL;
3981 size_t tbuffer_offset;
3982 int bufs_written = 0;
3983 int ret_val = 0;
3984
3985 KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_START, jnl, tr, 0, 0, 0);
3986
3987 end = jnl->jhdr->end;
3988
3989 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
3990
3991 amt = blhdr->bytes_used;
3992
3993 blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num;
3994
3995 blhdr->checksum = 0;
3996 blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
3997
3998 if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) {
3999 panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *));
4000 }
4001 tbuffer_offset = jnl->jhdr->blhdr_size;
4002
4003 for (i = 1; i < blhdr->num_blocks; i++) {
4004 void (*func)(buf_t, void *);
4005 void *arg;
4006 int32_t bsize;
4007
4008 /*
4009 * finish preparing the shadow buf_t before
4010 * calculating the individual block checksums
4011 */
4012 if (blhdr->binfo[i].bnum != (off_t)-1) {
4013 daddr64_t blkno;
4014 daddr64_t lblkno;
4015
4016 bp = blhdr->binfo[i].u.bp;
4017
4018 vp = buf_vnode(bp);
4019 blkno = buf_blkno(bp);
4020 lblkno = buf_lblkno(bp);
4021
4022 if (vp == NULL && lblkno == blkno) {
4023 printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd. aborting the transaction (tr %p jnl %p).\n",
4024 jnl->jdev_name, bp, lblkno, blkno, tr, jnl);
4025 ret_val = -1;
4026 goto bad_journal;
4027 }
4028
4029 // if the lblkno is the same as blkno and this bp isn't
4030 // associated with the underlying file system device then
4031 // we need to call bmap() to get the actual physical block.
4032 //
4033 if ((lblkno == blkno) && (vp != jnl->fsdev)) {
4034 off_t f_offset;
4035 size_t contig_bytes;
4036
4037 if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) {
4038 printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
4039 ret_val = -1;
4040 goto bad_journal;
4041 }
4042 if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) {
4043 printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
4044 ret_val = -1;
4045 goto bad_journal;
4046 }
4047 if ((uint32_t)contig_bytes < buf_count(bp)) {
4048 printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl->jdev_name, bp, jnl);
4049 ret_val = -1;
4050 goto bad_journal;
4051 }
4052 buf_setblkno(bp, blkno);
4053 }
4054 // update this so we write out the correct physical block number!
4055 blhdr->binfo[i].bnum = (off_t)(blkno);
4056
4057 /*
4058 * pick up the FS hook function (if any) and prepare
4059 * to fire this buffer off in the next pass
4060 */
4061 buf_setfilter(bp, buffer_flushed_callback, tr, &func, &arg);
4062
4063 if (func) {
4064 /*
4065 * call the hook function supplied by the filesystem...
4066 * this needs to happen BEFORE cacl_checksum in case
4067 * the FS morphs the data in the buffer
4068 */
4069 func(bp, arg);
4070 }
4071 bparray[i] = bp;
4072 bsize = buf_size(bp);
4073 blhdr->binfo[i].u.bi.bsize = bsize;
4074 blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], bsize);
4075 } else {
4076 bparray[i] = NULL;
4077 bsize = blhdr->binfo[i].u.bi.bsize;
4078 blhdr->binfo[i].u.bi.b.cksum = 0;
4079 }
4080 tbuffer_offset += bsize;
4081 }
4082 /*
4083 * if we fired off the journal_write_header asynchronously in
4084 * 'end_transaction', we need to wait for its completion
4085 * before writing the actual journal data
4086 */
4087 wait_condition(jnl, &jnl->writing_header, "finish_end_transaction");
4088
4089 if (jnl->write_header_failed == FALSE)
4090 ret = write_journal_data(jnl, &end, blhdr, amt);
4091 else
4092 ret_val = -1;
4093 /*
4094 * put the bp pointers back so that we can
4095 * make the final pass on them
4096 */
4097 for (i = 1; i < blhdr->num_blocks; i++)
4098 blhdr->binfo[i].u.bp = bparray[i];
4099
4100 kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *));
4101
4102 if (ret_val == -1)
4103 goto bad_journal;
4104
4105 if (ret != amt) {
4106 printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n",
4107 jnl->jdev_name, ret, amt);
4108
4109 ret_val = -1;
4110 goto bad_journal;
4111 }
4112 }
4113 jnl->jhdr->end = end; // update where the journal now ends
4114 tr->journal_end = end; // the transaction ends here too
4115
4116 if (tr->journal_start == 0 || tr->journal_end == 0) {
4117 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
4118 tr->journal_start, tr->journal_end);
4119 }
4120
4121 if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) {
4122 ret_val = -1;
4123 goto bad_journal;
4124 }
4125 /*
4126 * If the caller supplied a callback, call it now that the blocks have been
4127 * written to the journal. This is used by journal_relocate so, for example,
4128 * the file system can change its pointer to the new journal.
4129 */
4130 if (callback != NULL && callback(callback_arg) != 0) {
4131 ret_val = -1;
4132 goto bad_journal;
4133 }
4134
4135 //
4136 // Send a DKIOCUNMAP for the extents trimmed by this transaction, and
4137 // free up the extent list.
4138 //
4139 journal_trim_flush(jnl, tr);
4140
4141 // the buffer_flushed_callback will only be called for the
4142 // real blocks that get flushed so we have to account for
4143 // the block_list_headers here.
4144 //
4145 tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
4146
4147 lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction");
4148
4149 //
4150 // setup for looping through all the blhdr's.
4151 //
4152 for (blhdr = tr->blhdr; blhdr; blhdr = next) {
4153 uint16_t num_blocks;
4154
4155 /*
4156 * grab this info ahead of issuing the buf_bawrites...
4157 * once the last one goes out, it's possible for blhdr
4158 * to be freed (especially if we get preempted) before
4159 * we do the last check of num_blocks or
4160 * grab the next blhdr pointer...
4161 */
4162 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
4163 num_blocks = blhdr->num_blocks;
4164
4165 /*
4166 * we can re-order the buf ptrs because everything is written out already
4167 */
4168 qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp);
4169
4170 /*
4171 * need to make sure that the loop issuing the buf_bawrite's
4172 * does not touch blhdr once the last buf_bawrite has been
4173 * issued... at that point, we no longer have a legitimate
4174 * reference on the associated storage since it will be
4175 * released upon the completion of that last buf_bawrite
4176 */
4177 for (i = num_blocks-1; i >= 1; i--) {
4178 if (blhdr->binfo[i].bnum != (off_t)-1)
4179 break;
4180 num_blocks--;
4181 }
4182 for (i = 1; i < num_blocks; i++) {
4183
4184 if ((bp = blhdr->binfo[i].u.bp)) {
4185 vp = buf_vnode(bp);
4186
4187 buf_bawrite(bp);
4188
4189 // this undoes the vnode_ref() in journal_modify_block_end()
4190 vnode_rele_ext(vp, 0, 1);
4191
4192 bufs_written++;
4193 }
4194 }
4195 }
4196 if (bufs_written == 0) {
4197 /*
4198 * since we didn't issue any buf_bawrite's, there is no
4199 * async trigger to cause the memory associated with this
4200 * transaction to be freed... so, move it to the garbage
4201 * list now
4202 */
4203 lock_oldstart(jnl);
4204
4205 tr->next = jnl->tr_freeme;
4206 jnl->tr_freeme = tr;
4207
4208 unlock_oldstart(jnl);
4209
4210 unlock_condition(jnl, &jnl->asyncIO);
4211 }
4212
4213 //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
4214 // tr, tr->journal_start, tr->journal_end);
4215
4216 bad_journal:
4217 if (ret_val == -1) {
4218 /*
4219 * 'flush_aborted' is protected by the flushing condition... we need to
4220 * set it before dropping the condition so that it will be
4221 * noticed in 'end_transaction'... we add this additional
4222 * aborted condition so that we can drop the 'flushing' condition
4223 * before grabbing the journal lock... this avoids a deadlock
4224 * in 'end_transaction' which is holding the journal lock while
4225 * waiting for the 'flushing' condition to clear...
4226 * everyone else will notice the JOURNAL_INVALID flag
4227 */
4228 jnl->flush_aborted = TRUE;
4229
4230 unlock_condition(jnl, &jnl->flushing);
4231 lock_journal(jnl);
4232
4233 jnl->flags |= JOURNAL_INVALID;
4234 jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
4235 abort_transaction(jnl, tr); // cleans up list of extents to be trimmed
4236
4237 unlock_journal(jnl);
4238 } else
4239 unlock_condition(jnl, &jnl->flushing);
4240
4241 KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_END, jnl, tr, bufs_written, ret_val, 0);
4242
4243 return (ret_val);
4244 }
4245
4246
4247 static void
4248 lock_condition(journal *jnl, boolean_t *condition, const char *condition_name)
4249 {
4250
4251 KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_START, jnl, condition, 0, 0, 0);
4252
4253 lock_flush(jnl);
4254
4255 while (*condition == TRUE)
4256 msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL);
4257
4258 *condition = TRUE;
4259 unlock_flush(jnl);
4260
4261 KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_END, jnl, condition, 0, 0, 0);
4262 }
4263
4264 static void
4265 wait_condition(journal *jnl, boolean_t *condition, const char *condition_name)
4266 {
4267
4268 if (*condition == FALSE)
4269 return;
4270
4271 KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_START, jnl, condition, 0, 0, 0);
4272
4273 lock_flush(jnl);
4274
4275 while (*condition == TRUE)
4276 msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL);
4277
4278 unlock_flush(jnl);
4279
4280 KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_END, jnl, condition, 0, 0, 0);
4281 }
4282
4283 static void
4284 unlock_condition(journal *jnl, boolean_t *condition)
4285 {
4286 lock_flush(jnl);
4287
4288 *condition = FALSE;
4289 wakeup(condition);
4290
4291 unlock_flush(jnl);
4292 }
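/*
 * These three helpers implement a simple event flag protected by the
 * flush lock: lock_condition() waits for the flag to clear and then
 * claims it, wait_condition() only waits for it to clear, and
 * unlock_condition() clears it and wakes all waiters. A minimal usage
 * sketch (illustrative only):
 *
 *	lock_condition(jnl, &jnl->flushing, "example");	// become the flusher
 *	... do the flush ...
 *	unlock_condition(jnl, &jnl->flushing);		// wake any waiters
 */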
4293
4294 static void
4295 abort_transaction(journal *jnl, transaction *tr)
4296 {
4297 block_list_header *blhdr, *next;
4298
4299 // for each block list header, iterate over the blocks then
4300 // free up the memory associated with the block list.
4301 //
4302 // find each of the primary blocks (i.e. the list could
4303 // contain a mix of shadowed and real buf_t's depending
4304 // on when the abort condition was detected) and mark them
4305 // clean and locked in the cache... this at least allows
4306 // the FS a consistent view between its incore data structures
4307 // and the meta-data held in the cache
4308 //
4309 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_START, jnl, tr, 0, 0, 0);
4310
4311 for (blhdr = tr->blhdr; blhdr; blhdr = next) {
4312 int i;
4313
4314 for (i = 1; i < blhdr->num_blocks; i++) {
4315 buf_t bp, tbp, sbp;
4316 vnode_t bp_vp;
4317 errno_t errno;
4318
4319 if (blhdr->binfo[i].bnum == (off_t)-1)
4320 continue;
4321
4322 tbp = blhdr->binfo[i].u.bp;
4323
4324 bp_vp = buf_vnode(tbp);
4325
4326 buf_setfilter(tbp, NULL, NULL, NULL, NULL);
4327
4328 if (buf_shadow(tbp))
4329 sbp = tbp;
4330 else
4331 sbp = NULL;
4332
4333 if (bp_vp) {
4334 errno = buf_meta_bread(bp_vp,
4335 buf_lblkno(tbp),
4336 buf_size(tbp),
4337 NOCRED,
4338 &bp);
4339 if (errno == 0) {
4340 if (sbp == NULL && bp != tbp && (buf_flags(tbp) & B_LOCKED)) {
4341 panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n",
4342 bp, tbp, jnl);
4343 }
4344 /*
4345 * once the journal has been marked INVALID and aborted,
4346 * NO meta data can be written back to the disk, so
4347 * mark the buf_t clean and make sure it's locked in the cache
4348 * note: if we found a shadow, the real buf_t needs to be relocked
4349 */
4350 buf_setflags(bp, B_LOCKED);
4351 buf_markclean(bp);
4352 buf_brelse(bp);
4353
4354 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_NONE, jnl, tr, bp, 0, 0);
4355
4356 /*
4357 * this undoes the vnode_ref() in journal_modify_block_end()
4358 */
4359 vnode_rele_ext(bp_vp, 0, 1);
4360 } else {
4361 printf("jnl: %s: abort_tr: could not find block %lld vp %p!\n",
4362 jnl->jdev_name, blhdr->binfo[i].bnum, tbp);
4363 if (bp) {
4364 buf_brelse(bp);
4365 }
4366 }
4367 }
4368 if (sbp)
4369 buf_brelse(sbp);
4370 }
4371 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
4372
4373 // we can free blhdr here since we won't need it any more
4374 blhdr->binfo[0].bnum = 0xdeadc0de;
4375 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
4376 }
4377
4378 /*
4379 * If the transaction we're aborting was the async transaction, then
4380 * tell the current transaction that there is no pending trim
4381 * any more.
4382 */
4383 lck_rw_lock_exclusive(&jnl->trim_lock);
4384 if (jnl->async_trim == &tr->trim)
4385 jnl->async_trim = NULL;
4386 lck_rw_unlock_exclusive(&jnl->trim_lock);
4387
4388
4389 if (tr->trim.extents) {
4390 kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
4391 }
4392 tr->trim.allocated_count = 0;
4393 tr->trim.extent_count = 0;
4394 tr->trim.extents = NULL;
4395 tr->tbuffer = NULL;
4396 tr->blhdr = NULL;
4397 tr->total_bytes = 0xdbadc0de;
4398 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
4399
4400 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_END, jnl, tr, 0, 0, 0);
4401 }
4402
4403
4404 int
4405 journal_end_transaction(journal *jnl)
4406 {
4407 int ret;
4408 transaction *tr;
4409
4410 CHECK_JOURNAL(jnl);
4411
4412 free_old_stuff(jnl);
4413
4414 if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
4415 return 0;
4416 }
4417
4418 if (jnl->owner != current_thread()) {
4419 panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
4420 jnl, jnl->owner, current_thread());
4421 }
4422 jnl->nested_count--;
4423
4424 if (jnl->nested_count > 0) {
4425 return 0;
4426 } else if (jnl->nested_count < 0) {
4427 panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
4428 }
4429
4430 if (jnl->flags & JOURNAL_INVALID) {
4431 if (jnl->active_tr) {
4432 if (jnl->cur_tr != NULL) {
4433 panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
4434 jnl, jnl->active_tr, jnl->cur_tr);
4435 }
4436 tr = jnl->active_tr;
4437 jnl->active_tr = NULL;
4438
4439 abort_transaction(jnl, tr);
4440 }
4441 jnl->owner = NULL;
4442 unlock_journal(jnl);
4443
4444 return EINVAL;
4445 }
4446
4447 tr = jnl->active_tr;
4448 CHECK_TRANSACTION(tr);
4449
4450 // clear this out here so that when check_free_space() calls
4451 // the FS flush function, we don't panic in journal_flush()
4452 // if the FS were to call that. note: check_free_space() is
4453 // called from end_transaction().
4454 //
4455 jnl->active_tr = NULL;
4456
4457 /* Examine the force-journal-flush state in the active txn */
4458 if (tr->flush_on_completion == TRUE) {
4459 /*
4460 * If the FS requested it, disallow group commit and force the
4461 * transaction out to disk immediately.
4462 */
4463 ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE);
4464 }
4465 else {
4466 /* in the common path we can simply use the double-buffered journal */
4467 ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
4468 }
4469
4470 return ret;
4471 }
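/*
 * Usage sketch (illustrative only; bp is a metadata buf_t obtained by the
 * file system): the normal client pattern brackets metadata updates in a
 * transaction, journaling each modified block:
 *
 *	if (journal_start_transaction(jnl) == 0) {
 *		journal_modify_block_start(jnl, bp);
 *		... update the block's contents ...
 *		journal_modify_block_end(jnl, bp, NULL, NULL);
 *		ret = journal_end_transaction(jnl);
 *	}
 */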
4472
4473
4474 /*
4475 * Flush the contents of the journal to the disk.
4476 *
4477 * Input:
4478 * wait_for_IO -
4479 * If TRUE, wait to write in-memory journal to the disk
4480 * consistently, and also wait to write all asynchronous
4481 * metadata blocks to their corresponding locations
4482 * consistently on the disk. This means that the journal
4483 * is empty at this point and does not contain any
4484 * transactions. This is overkill in normal scenarios
4485 * but is useful whenever the metadata blocks are required
4486 * to be consistent on-disk instead of just the journal
4487 * being consistent, such as before live verification
4488 * and live volume resizing.
4489 *
4490 * If FALSE, only wait to write in-memory journal to the
4491 * disk consistently. This means that the journal still
4492 * contains uncommitted transactions and the file system
4493 * metadata blocks in the journal transactions might be
4494 * written asynchronously to the disk. But there is no
4495 * guarantee that they are written to the disk before
4496 * returning to the caller. Note that this option is
4497 * sufficient for file system data integrity as it
4498 * guarantees consistent journal content on the disk.
4499 */
4500 int
4501 journal_flush(journal *jnl, boolean_t wait_for_IO)
4502 {
4503 boolean_t drop_lock = FALSE;
4504
4505 CHECK_JOURNAL(jnl);
4506
4507 free_old_stuff(jnl);
4508
4509 if (jnl->flags & JOURNAL_INVALID) {
4510 return -1;
4511 }
4512
4513 KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl, 0, 0, 0, 0);
4514
4515 if (jnl->owner != current_thread()) {
4516 lock_journal(jnl);
4517 drop_lock = TRUE;
4518 }
4519
4520 // if we're not active, flush any buffered transactions
4521 if (jnl->active_tr == NULL && jnl->cur_tr) {
4522 transaction *tr = jnl->cur_tr;
4523
4524 jnl->cur_tr = NULL;
4525
4526 if (wait_for_IO) {
4527 wait_condition(jnl, &jnl->flushing, "journal_flush");
4528 wait_condition(jnl, &jnl->asyncIO, "journal_flush");
4529 }
4530 /*
4531 * "end_transaction" will wait for any current async flush
4532 * to complete, before flushing "cur_tr"... because we've
4533 * specified the 'must_wait' arg as TRUE, it will then
4534 * synchronously flush the "cur_tr"
4535 */
4536 end_transaction(tr, 1, NULL, NULL, drop_lock, TRUE); // force it to get flushed
4537
4538 } else {
4539 if (drop_lock == TRUE) {
4540 unlock_journal(jnl);
4541 }
4542
4543 /* Because of the pipelined journal, journal transactions
4544 * might be in the process of being flushed on another thread.
4545 * If there is nothing to flush currently, we should
4546 * synchronize ourselves with the pipelined journal thread
4547 * to ensure that all inflight transactions, if any, are
4548 * flushed before we return success to caller.
4549 */
4550 wait_condition(jnl, &jnl->flushing, "journal_flush");
4551 }
4552 if (wait_for_IO) {
4553 wait_condition(jnl, &jnl->asyncIO, "journal_flush");
4554 }
4555
4556 KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_END, jnl, 0, 0, 0, 0);
4557
4558 return 0;
4559 }
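/*
 * Usage sketch (illustrative only): callers that need the metadata itself,
 * and not just the journal, to be consistent on disk (e.g. before live
 * volume resizing) pass wait_for_IO = TRUE:
 *
 *	if (journal_flush(jnl, TRUE) != 0) {
 *		// journal is invalid; fail the operation
 *	}
 *
 * Ordinary durability of committed transactions only requires
 * wait_for_IO = FALSE.
 */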
4560
4561 int
4562 journal_active(journal *jnl)
4563 {
4564 if (jnl->flags & JOURNAL_INVALID) {
4565 return -1;
4566 }
4567
4568 return (jnl->active_tr == NULL) ? 0 : 1;
4569 }
4570
4571 void *
4572 journal_owner(journal *jnl)
4573 {
4574 return jnl->owner;
4575 }
4576
4577 int journal_uses_fua(journal *jnl)
4578 {
4579 if (jnl->flags & JOURNAL_DO_FUA_WRITES)
4580 return 1;
4581 return 0;
4582 }
4583
4584 /*
4585 * Relocate the journal.
4586 *
4587 * You provide the new starting offset and size for the journal. You may
4588 * optionally provide a new tbuffer_size; passing zero defaults to not
4589 * changing the tbuffer size except as needed to fit within the new journal
4590 * size.
4591 *
4592 * You must have already started a transaction. The transaction may contain
4593 * modified blocks (such as those needed to deallocate the old journal,
4594 * allocate the new journal, and update the location and size of the journal
4595 * in filesystem-private structures). Any transactions prior to the active
4596 * transaction will be flushed to the old journal. The new journal will be
4597 * initialized, and the blocks from the active transaction will be written to
4598 * the new journal.
4599 *
4600 * The caller will need to update the structures that identify the location
4601 * and size of the journal. These updates should be made in the supplied
4602 * callback routine. These updates must NOT go into a transaction. You should
4603 * force these updates to the media before returning from the callback. In the
4604 * event of a crash, either the old journal will be found (and it will be empty),
4605 * or the new journal will be found with the contents of the active transaction.
4606 *
4607 * Upon return from the callback, the blocks from the active transaction are
4608 * written to their normal locations on disk.
4609 *
4610 * (Remember that we have to ensure that blocks get committed to the journal
4611 * before being committed to their normal locations. But the blocks don't count
4612 * as committed until the new journal is pointed at.)
4613 *
4614 * Upon return, there is still an active transaction: newly allocated, and
4615 * with no modified blocks. Call journal_end_transaction as normal. You may
4616 * modifiy additional blocks before calling journal_end_transaction, and those
4617 * blocks will (eventually) go to the relocated journal.
4618 *
4619 * Inputs:
4620 * jnl The (opened) journal to relocate.
4621 * offset The new journal byte offset (from start of the journal device).
4622 * journal_size The size, in bytes, of the new journal.
4623 * tbuffer_size The new desired transaction buffer size. Pass zero to keep
4624 * the same size as the current journal. The size will be
4625 * modified as needed to fit the new journal.
4626 * callback Routine called after the new journal has been initialized,
4627 * and the active transaction written to the new journal, but
4628 * before the blocks are written to their normal locations.
4629 * Pass NULL for no callback.
4630 * callback_arg An argument passed to the callback routine.
4631 *
4632 * Result:
4633 * 0 No errors
4634 * EINVAL The offset is not block aligned
4635 * EINVAL The journal_size is not a multiple of the block size
4636 * EINVAL The journal is invalid
4637 * (any) An error returned by journal_flush.
4638 *
4639 */
4640 int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size,
4641 errno_t (*callback)(void *), void *callback_arg)
4642 {
4643 int ret;
4644 transaction *tr;
4645 size_t i = 0;
4646
4647 /*
4648 * Sanity check inputs, and adjust the size of the transaction buffer.
4649 */
4650 if ((offset % jnl->jhdr->jhdr_size) != 0) {
4651 printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n",
4652 jnl->jdev_name, offset, jnl->jhdr->jhdr_size);
4653 return EINVAL;
4654 }
4655 if ((journal_size % jnl->jhdr->jhdr_size) != 0) {
4656 printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n",
4657 jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size);
4658 return EINVAL;
4659 }
4660
4661 CHECK_JOURNAL(jnl);
4662
4663 /* Guarantee we own the active transaction. */
4664 if (jnl->flags & JOURNAL_INVALID) {
4665 return EINVAL;
4666 }
4667 if (jnl->owner != current_thread()) {
4668 panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n",
4669 jnl, jnl->owner, current_thread());
4670 }
4671
4672 if (tbuffer_size == 0)
4673 tbuffer_size = jnl->tbuffer_size;
4674 size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size);
4675
4676 /*
4677 * Flush any non-active transactions. We have to temporarily hide the
4678 * active transaction to make journal_flush flush out non-active but
4679 * current (unwritten) transactions.
4680 */
4681 tr = jnl->active_tr;
4682 CHECK_TRANSACTION(tr);
4683 jnl->active_tr = NULL;
4684 ret = journal_flush(jnl, TRUE);
4685 jnl->active_tr = tr;
4686
4687 if (ret) {
4688 return ret;
4689 }
4690 wait_condition(jnl, &jnl->flushing, "end_transaction");
4691
4692 /*
4693 * At this point, we have completely flushed the contents of the current
4694 * journal to disk (and have asynchronously written all of the txns to
4695 * their actual desired locations). As a result, we can (and must) clear
4696 * out the old_start array. If we do not, then if the last written transaction
4697 * started at the beginning of the journal (starting 1 block into the
4698 * journal file) it could confuse the buffer_flushed callback. This is
4699 * because we're about to reset the start/end pointers of the journal header
4700 * below.
4701 */
4702 lock_oldstart(jnl);
4703 for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) {
4704 jnl->old_start[i] = 0;
4705 }
4706 unlock_oldstart(jnl);
4707
4708 /* Update the journal's offset and size in memory. */
4709 jnl->jdev_offset = offset;
4710 jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
4711 jnl->jhdr->size = journal_size;
4712 jnl->active_start = jnl->jhdr->start;
4713
4714 /*
4715 * Force the active transaction to be written to the new journal. Call the
4716 * supplied callback after the blocks have been written to the journal, but
4717 * before they get written to their normal on-disk locations.
4718 */
4719 jnl->active_tr = NULL;
4720 ret = end_transaction(tr, 1, callback, callback_arg, FALSE, TRUE);
4721 if (ret) {
4722 printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret);
4723 goto bad_journal;
4724 }
4725
4726 /*
4727 * Create a new, empty transaction to be the active transaction. This way
4728 * our caller can use journal_end_transaction as usual.
4729 */
4730 ret = journal_allocate_transaction(jnl);
4731 if (ret) {
4732 printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl->jdev_name, ret);
4733 goto bad_journal;
4734 }
4735
4736 return 0;
4737
4738 bad_journal:
4739 jnl->flags |= JOURNAL_INVALID;
4740 abort_transaction(jnl, tr);
4741 return ret;
4742 }
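/*
 * Usage sketch (illustrative only; the fs_* names and struct are
 * hypothetical): the callback durably records the journal's new location
 * without journaling that update, per the rules described above:
 *
 *	static errno_t
 *	fs_update_journal_location(void *arg)
 *	{
 *		struct fs_mount *mp = arg;
 *
 *		// write and synchronously flush the on-disk structures that
 *		// record where the journal lives; must NOT be journaled
 *		return fs_write_journal_location(mp);
 *	}
 *
 *	journal_start_transaction(jnl);
 *	... journal the blocks that allocate the new journal space ...
 *	error = journal_relocate(jnl, new_offset, new_size, 0,
 *	                         fs_update_journal_location, mp);
 *	if (error == 0)
 *		error = journal_end_transaction(jnl);
 */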
4743
4744
4745 #else // !JOURNALING - so provide stub functions
4746
4747 int journal_uses_fua(__unused journal *jnl)
4748 {
4749 return 0;
4750 }
4751
4752 journal *
4753 journal_create(__unused struct vnode *jvp,
4754 __unused off_t offset,
4755 __unused off_t journal_size,
4756 __unused struct vnode *fsvp,
4757 __unused size_t min_fs_blksz,
4758 __unused int32_t flags,
4759 __unused int32_t tbuffer_size,
4760 __unused void (*flush)(void *arg),
4761 __unused void *arg,
4762 __unused struct mount *fsmount)
4763 {
4764 return NULL;
4765 }
4766
4767 journal *
4768 journal_open(__unused struct vnode *jvp,
4769 __unused off_t offset,
4770 __unused off_t journal_size,
4771 __unused struct vnode *fsvp,
4772 __unused size_t min_fs_blksz,
4773 __unused int32_t flags,
4774 __unused int32_t tbuffer_size,
4775 __unused void (*flush)(void *arg),
4776 __unused void *arg,
4777 __unused struct mount *fsmount)
4778 {
4779 return NULL;
4780 }
4781
4782
4783 int
4784 journal_modify_block_start(__unused journal *jnl, __unused struct buf *bp)
4785 {
4786 return EINVAL;
4787 }
4788
4789 int
4790 journal_modify_block_end(__unused journal *jnl,
4791 __unused struct buf *bp,
4792 __unused void (*func)(struct buf *bp, void *arg),
4793 __unused void *arg)
4794 {
4795 return EINVAL;
4796 }
4797
4798 int
4799 journal_kill_block(__unused journal *jnl, __unused struct buf *bp)
4800 {
4801 return EINVAL;
4802 }
4803
4804 int journal_relocate(__unused journal *jnl,
4805 __unused off_t offset,
4806 __unused off_t journal_size,
4807 __unused int32_t tbuffer_size,
4808 __unused errno_t (*callback)(void *),
4809 __unused void *callback_arg)
4810 {
4811 return EINVAL;
4812 }
4813
4814 void
4815 journal_close(__unused journal *jnl)
4816 {
4817 }
4818
4819 int
4820 journal_start_transaction(__unused journal *jnl)
4821 {
4822 return EINVAL;
4823 }
4824
4825 int
4826 journal_end_transaction(__unused journal *jnl)
4827 {
4828 return EINVAL;
4829 }
4830
4831 int
4832 journal_flush(__unused journal *jnl, __unused boolean_t wait_for_IO)
4833 {
4834 return EINVAL;
4835 }
4836
4837 int
4838 journal_is_clean(__unused struct vnode *jvp,
4839 __unused off_t offset,
4840 __unused off_t journal_size,
4841 __unused struct vnode *fsvp,
4842 __unused size_t min_fs_block_size)
4843 {
4844 return 0;
4845 }
4846
4847
4848 void *
4849 journal_owner(__unused journal *jnl)
4850 {
4851 return NULL;
4852 }
4853 #endif // !JOURNALING