/*
 * Copyright (c) 2002-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
//
// This file implements a simple write-ahead journaling layer.
// In theory any file system can make use of it by calling these
// functions when the fs wants to modify meta-data blocks. See
// vfs_journal.h for a more detailed description of the api and
// data structures.
//
// Dominic Giampaolo (dbg@apple.com)
//

#ifdef KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file_internal.h>
#include <sys/stat.h>
#include <sys/buf_internal.h>
#include <sys/proc_internal.h>
#include <sys/mount_internal.h>
#include <sys/namei.h>
#include <sys/vnode_internal.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/ubc.h>
#include <sys/malloc.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/kalloc.h>
#include <sys/disk.h>
#include <sys/kdebug.h>
#include <miscfs/specfs/specdev.h>
#include <libkern/OSAtomic.h>	/* OSAddAtomic */

kern_return_t	thread_terminate(thread_t);

/*
 * Set sysctl vfs.generic.jnl.kdebug.trim=1 to enable KERNEL_DEBUG_CONSTANT
 * logging of trim-related calls within the journal.  (They're
 * disabled by default because there can be a lot of these events,
 * and we don't want to overwhelm the kernel debug buffer.  If you
 * want to watch these events in particular, just set the sysctl.)
 */
static int jnl_kdebug = 0;
SYSCTL_DECL(_vfs_generic);
SYSCTL_NODE(_vfs_generic, OID_AUTO, jnl, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal");
SYSCTL_NODE(_vfs_generic_jnl, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal kdebug");
SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &jnl_kdebug, 0, "Enable kdebug logging for journal TRIM");

#define DBG_JOURNAL_FLUSH			FSDBG_CODE(DBG_JOURNAL, 1)
#define DBG_JOURNAL_TRIM_ADD			FSDBG_CODE(DBG_JOURNAL, 2)
#define DBG_JOURNAL_TRIM_REMOVE			FSDBG_CODE(DBG_JOURNAL, 3)
#define DBG_JOURNAL_TRIM_REMOVE_PENDING		FSDBG_CODE(DBG_JOURNAL, 4)
#define DBG_JOURNAL_TRIM_REALLOC		FSDBG_CODE(DBG_JOURNAL, 5)
#define DBG_JOURNAL_TRIM_FLUSH			FSDBG_CODE(DBG_JOURNAL, 6)
#define DBG_JOURNAL_TRIM_UNMAP			FSDBG_CODE(DBG_JOURNAL, 7)

/*
 * Cap the journal max size to 2GB.  On HFS, it will attempt to occupy
 * a full allocation block if the current size is smaller than the allocation
 * block on which it resides.  Once we hit the exabyte filesystem range, then
 * it will use 2GB allocation blocks.  As a result, make the cap 2GB.
 */
#define MAX_JOURNAL_SIZE 0x80000000U

#include <sys/sdt.h> /* DTRACE_IO1 */
#else

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdarg.h>
#include <sys/types.h>
#include "compat.h"

#endif   /* KERNEL */

#include "vfs_journal.h"

#include <sys/kdebug.h>

#if 0
#undef KERNEL_DEBUG
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif


#ifndef CONFIG_HFS_TRIM
#define CONFIG_HFS_TRIM 0
#endif


#if JOURNALING

//
// By default, we grow the list of extents to trim by 4K at a time.
// We'll opt to flush a transaction if it contains at least
// JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number
// of modified blocks is small).
//
enum {
	JOURNAL_DEFAULT_TRIM_BYTES   = 4096,
	JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t),
	JOURNAL_FLUSH_TRIM_EXTENTS   = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16
};
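// Worked example (illustrative): with dk_extent_t being a pair of 64-bit
// fields (offset, length), i.e. 16 bytes, the defaults above work out to
// 4096 / 16 = 256 extents per trim list, and a flush is triggered once a
// transaction accumulates 256 * 15 / 16 = 240 extents to be trimmed.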

unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS;
SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush");

/* XXX the next prototype should come from <libsa/stdlib.h>, but that conflicts with libkern */
__private_extern__ void qsort(
	void * array,
	size_t nmembers,
	size_t member_size,
	int (*)(const void *, const void *));


// number of bytes to checksum in a block_list_header
// NOTE: this should be enough to cover the header
//       fields as well as the first entry of binfo[]
#define BLHDR_CHECKSUM_SIZE 32

static void lock_condition(journal *jnl, boolean_t *condition, const char *condition_name);
static void wait_condition(journal *jnl, boolean_t *condition, const char *condition_name);
static void unlock_condition(journal *jnl, boolean_t *condition);
static void finish_end_thread(transaction *tr);
static void write_header_thread(journal *jnl);
static int finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg);
static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait);
static void abort_transaction(journal *jnl, transaction *tr);
static void dump_journal(journal *jnl);

static __inline__ void lock_oldstart(journal *jnl);
static __inline__ void unlock_oldstart(journal *jnl);
static __inline__ void lock_flush(journal *jnl);
static __inline__ void unlock_flush(journal *jnl);


//
// 3105942 - Coalesce writes to the same block on journal replay
//

typedef struct bucket {
	off_t	 block_num;
	uint32_t jnl_offset;
	uint32_t block_size;
	int32_t	 cksum;
} bucket;

#define STARTING_BUCKETS 256

static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size);
static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full);
static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr);
static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting);

#define CHECK_JOURNAL(jnl) \
	do { \
	if (jnl == NULL) { \
		panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \
	} \
	if (jnl->jdev == NULL) { \
		panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \
	} \
	if (jnl->fsdev == NULL) { \
		panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \
	} \
	if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) { \
		panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \
		__FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \
	} \
	if (   jnl->jhdr->start <= 0 \
	    || jnl->jhdr->start > jnl->jhdr->size) { \
		panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
		__FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \
	} \
	if (   jnl->jhdr->end <= 0 \
	    || jnl->jhdr->end > jnl->jhdr->size) { \
		panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
		__FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \
	} \
	} while(0)

#define CHECK_TRANSACTION(tr) \
	do { \
	if (tr == NULL) { \
		panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \
	} \
	if (tr->jnl == NULL) { \
		panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \
	} \
	if (tr->blhdr != (block_list_header *)tr->tbuffer) { \
		panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \
	} \
	if (tr->total_bytes < 0) { \
		panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \
	} \
	if (tr->journal_start < 0) { \
		panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \
	} \
	if (tr->journal_end < 0) { \
		panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \
	} \
	if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) { \
		panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \
	} \
	} while(0)



//
// this isn't a great checksum routine but it will do for now.
// we use it to checksum the journal header and the block list
// headers that are at the start of each transaction.
//
static unsigned int
calc_checksum(char *ptr, int len)
{
	int i;
	unsigned int cksum=0;

	// this is a lame checksum but for now it'll do
	for(i = 0; i < len; i++, ptr++) {
		cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
	}

	return (~cksum);
}
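//
// Usage note (illustrative): callers always zero the stored checksum field
// before recomputing, so the stored value never feeds back into itself.
// write_journal_header() below does exactly this:
//
//	jnl->jhdr->checksum = 0;
//	jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
//
// Verification at replay time follows the same pattern: save the on-disk
// value, zero the field, recompute, and compare (see replay_journal()).
//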

//
// Journal Locking
//
lck_grp_attr_t *jnl_group_attr;
lck_attr_t     *jnl_lock_attr;
lck_grp_t      *jnl_mutex_group;

void
journal_init(void)
{
	jnl_lock_attr   = lck_attr_alloc_init();
	jnl_group_attr  = lck_grp_attr_alloc_init();
	jnl_mutex_group = lck_grp_alloc_init("jnl-mutex", jnl_group_attr);
}

__inline__ void
journal_lock(journal *jnl)
{
	lck_mtx_lock(&jnl->jlock);
	if (jnl->owner) {
		panic ("jnl: owner is %p, expected NULL\n", jnl->owner);
	}
	jnl->owner = current_thread();
}

__inline__ void
journal_unlock(journal *jnl)
{
	jnl->owner = NULL;
	lck_mtx_unlock(&jnl->jlock);
}

static __inline__ void
lock_flush(journal *jnl)
{
	lck_mtx_lock(&jnl->flock);
}

static __inline__ void
unlock_flush(journal *jnl)
{
	lck_mtx_unlock(&jnl->flock);
}

static __inline__ void
lock_oldstart(journal *jnl)
{
	lck_mtx_lock(&jnl->old_start_lock);
}

static __inline__ void
unlock_oldstart(journal *jnl)
{
	lck_mtx_unlock(&jnl->old_start_lock);
}



#define JNL_WRITE    0x0001
#define JNL_READ     0x0002
#define JNL_HEADER   0x8000
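//
// These flags form the "direction" argument to do_journal_io() below.
// JNL_WRITE/JNL_READ select the transfer direction; JNL_HEADER must be set
// explicitly for any i/o that touches offset 0 (the journal header), or
// do_journal_io() panics. For example, read_journal_header() passes
// JNL_READ|JNL_HEADER while ordinary transaction i/o passes just JNL_READ
// or JNL_WRITE.
//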

//
// This function sets up a fake buf and passes it directly to the
// journal device strategy routine (so that it won't get cached in
// the block cache).
//
// It also handles range checking the i/o so that we don't write
// outside the journal boundaries and it will wrap the i/o back
// to the beginning if necessary (skipping over the journal header)
//
static size_t
do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
{
	int		err, curlen=len;
	size_t		io_sz = 0;
	buf_t		bp;
	off_t		max_iosize;
	struct bufattr	*bap;
	boolean_t	was_vm_privileged = FALSE;
	boolean_t	need_vm_privilege = FALSE;

	if (jnl->fsmount) {
		if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT)
			need_vm_privilege = TRUE;
	}

	if (*offset < 0 || *offset > jnl->jhdr->size) {
		panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
	}

	if (direction & JNL_WRITE)
		max_iosize = jnl->max_write_size;
	else if (direction & JNL_READ)
		max_iosize = jnl->max_read_size;
	else
		max_iosize = 128 * 1024;

again:
	bp = alloc_io_buf(jnl->jdev, 1);

	if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
		if (*offset == jnl->jhdr->size) {
			*offset = jnl->jhdr->jhdr_size;
		} else {
			curlen = (off_t)jnl->jhdr->size - *offset;
		}
	}

	if (curlen > max_iosize) {
		curlen = max_iosize;
	}

	if (curlen <= 0) {
		panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %zd\n", curlen, *offset, len);
	}

	if (*offset == 0 && (direction & JNL_HEADER) == 0) {
		panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data);
	}

	/*
	 * As alluded to in the block comment at the top of the function, we use a "fake" iobuf
	 * here and issue directly to the disk device that the journal protects since we don't
	 * want this to enter the block cache.  As a result, we lose the ability to mark it
	 * as a metadata buf_t for the layers below us that may care.  If we were to
	 * simply attach the B_META flag into the b_flags this may confuse things further
	 * since this is an iobuf, not a metadata buffer.
	 *
	 * To address this, we use the extended bufattr struct embedded in the bp.
	 * Explicitly mark the buf here as a metadata buffer in its bufattr flags.
	 */
	bap = &bp->b_attr;
	bap->ba_flags |= BA_META;

	if (direction & JNL_READ)
		buf_setflags(bp, B_READ);
	else {
		/*
		 * don't have to set any flags
		 */
		vnode_startwrite(jnl->jdev);
	}
	buf_setsize(bp, curlen);
	buf_setcount(bp, curlen);
	buf_setdataptr(bp, (uintptr_t)data);
	buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));
	buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size));

	if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) {
		buf_markfua(bp);
	}

	if (need_vm_privilege == TRUE) {
		/*
		 * if we block waiting for memory, and there is enough pressure to
		 * cause us to try and create a new swap file, we may end up deadlocking
		 * due to waiting for the journal on the swap file creation path...
		 * by making ourselves vm_privileged, we give ourselves the best chance
		 * of not blocking
		 */
		was_vm_privileged = set_vm_privilege(TRUE);
	}
	DTRACE_IO1(journal__start, buf_t, bp);
	err = VNOP_STRATEGY(bp);
	if (!err) {
		err = (int)buf_biowait(bp);
	}
	DTRACE_IO1(journal__done, buf_t, bp);

	if (need_vm_privilege == TRUE && was_vm_privileged == FALSE)
		set_vm_privilege(FALSE);

	free_io_buf(bp);

	if (err) {
		printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err);
		return 0;
	}

	*offset += curlen;
	io_sz   += curlen;

	if (io_sz != len) {
		// handle wrap-around
		data    = (char *)data + curlen;
		curlen  = len - io_sz;
		if (*offset >= jnl->jhdr->size) {
			*offset = jnl->jhdr->jhdr_size;
		}
		goto again;
	}

	return io_sz;
}
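//
// Worked example of the wrap-around above (illustrative numbers): with a
// 1 MB journal (jhdr->size = 0x100000), a 4K header (jhdr_size = 0x1000),
// and a 32K write starting at offset 0xFC000, the first pass is clipped to
// 16K (curlen = size - offset), the offset then wraps to jhdr_size (0x1000,
// skipping over the header block), and a second pass writes the remaining
// 16K.
//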

static size_t
read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
{
	return do_journal_io(jnl, offset, data, len, JNL_READ);
}

static size_t
write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
{
	return do_journal_io(jnl, offset, data, len, JNL_WRITE);
}


static size_t
read_journal_header(journal *jnl, void *data, size_t len)
{
	off_t hdr_offset = 0;

	return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER);
}

static int
write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num)
{
	static int num_err_prints = 0;
	int ret=0;
	off_t jhdr_offset = 0;
	struct vfs_context context;

	context.vc_thread = current_thread();
	context.vc_ucred = NOCRED;
	//
	// Flush the track cache if we're not doing force-unit-access
	// writes.
	//
	if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {

		dk_synchronize_t sync_request = {
			.options = DK_SYNCHRONIZE_OPTION_BARRIER,
		};

		/*
		 * If device doesn't support barrier-only flush, or
		 * the journal is on a different device, use full flush.
		 */
		if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) {
			sync_request.options = 0;
			jnl->flush_counter++;
		}

		ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, &context);
	}
	if (ret != 0) {
		//
		// Only print this error if it's a different error than the
		// previous one, or if it's the first time for this device
		// or if the total number of printfs is less than 25.  We
		// allow for up to 25 printfs to ensure that some make it
		// into the on-disk syslog.  Otherwise if we only printed
		// one, it's possible it would never make it to the syslog
		// for the root volume and that makes debugging hard.
		//
		if (   ret != jnl->last_flush_err
		    || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
		    || num_err_prints++ < 25) {

			printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret);

			jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
			jnl->last_flush_err = ret;
		}
	}

	jnl->jhdr->sequence_num = sequence_num;
	jnl->jhdr->checksum = 0;
	jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);

	if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) {
		printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name);
		jnl->flags |= JOURNAL_INVALID;
		return -1;
	}

	// If we're not doing force-unit-access writes, then we
	// have to flush after writing the journal header so that
	// a future transaction doesn't sneak out to disk before
	// the header does and thus overwrite data that the old
	// journal header refers to.  Saw this exact case happen
	// on an IDE bus analyzer with Larry Barras so while it
	// may seem obscure, it's not.
	//
	if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {

		dk_synchronize_t sync_request = {
			.options = DK_SYNCHRONIZE_OPTION_BARRIER,
		};

		/*
		 * If device doesn't support barrier-only flush, or
		 * the journal is on a different device, use full flush.
		 */
		if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) {
			sync_request.options = 0;
			jnl->flush_counter++;
		}

		VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, &context);
	}

	return 0;
}



//
// this is a work function used to free up transactions that
// completed. they can't be free'd from buffer_flushed_callback
// because it is called from deep within the disk driver stack
// and thus can't do something that would potentially cause
// paging.  it gets called by each of the journal api entry
// points so stuff shouldn't hang around for too long.
//
static void
free_old_stuff(journal *jnl)
{
	transaction *tr, *next;
	block_list_header *blhdr=NULL, *next_blhdr=NULL;

	if (jnl->tr_freeme == NULL)
		return;

	lock_oldstart(jnl);
	tr = jnl->tr_freeme;
	jnl->tr_freeme = NULL;
	unlock_oldstart(jnl);

	for(; tr; tr=next) {
		for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) {
			next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum);
			blhdr->binfo[0].bnum = 0xdeadc0de;

			kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);

			KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0);
		}
		next = tr->next;
		FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
	}
}
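//
// Note on the loop above: a transaction's block_list_headers form a
// singly-linked chain, with the next header's address stashed in the first
// binfo entry (binfo[0].bnum) of the previous one.  The 0xdeadc0de poison
// value written back before freeing is a debugging aid, so a use-after-free
// of the chain is easy to spot in a kernel core dump.
//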



//
// This is our callback that lets us know when a buffer has been
// flushed to disk.  It's called from deep within the driver stack
// and thus is quite limited in what it can do.  Notably, it can
// not initiate any new i/o's or allocate/free memory.
//
static void
buffer_flushed_callback(struct buf *bp, void *arg)
{
	transaction	*tr;
	journal		*jnl;
	transaction	*ctr, *prev=NULL, *next;
	size_t		i;
	int		bufsize, amt_flushed, total_bytes;


	//printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n",
	//	   bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg);

	// snarf out the bits we want
	bufsize = buf_size(bp);
	tr      = (transaction *)arg;

	// a NULL transaction means we've already seen (and handled) this one
	if (tr == NULL) {
		return;
	}

	CHECK_TRANSACTION(tr);

	jnl = tr->jnl;

	CHECK_JOURNAL(jnl);

	amt_flushed = tr->num_killed;
	total_bytes = tr->total_bytes;

	// update the number of blocks that have been flushed.
	// this buf may represent more than one block so take
	// that into account.
	//
	// OSAddAtomic() returns the value of tr->num_flushed before the add
	//
	amt_flushed += OSAddAtomic(bufsize, &tr->num_flushed);


	// if this transaction isn't done yet, just return as
	// there is nothing to do.
	//
	// NOTE: we are careful to not reference anything through
	//       the tr pointer after doing the OSAddAtomic().  if
	//       this if statement fails then we are the last one
	//       and then it's ok to dereference "tr".
	//
	if ((amt_flushed + bufsize) < total_bytes) {
		return;
	}

	// this will single thread checking the transaction
	lock_oldstart(jnl);

	if (tr->total_bytes == (int)0xfbadc0de) {
		// then someone beat us to it...
		unlock_oldstart(jnl);
		return;
	}

	// mark this so that we're the owner of dealing with the
	// cleanup for this transaction
	tr->total_bytes = 0xfbadc0de;

	if (jnl->flags & JOURNAL_INVALID)
		goto transaction_done;

	//printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
	//   tr, tr->journal_start, tr->journal_end, jnl);

	// find this entry in the old_start[] index and mark it completed
	for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {

		if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) {
			jnl->old_start[i] &= ~(0x8000000000000000ULL);
			break;
		}
	}

	if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
		panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
		      tr->journal_start, tr, jnl);
	}


	// if we are here then we need to update the journal header
	// to reflect that this transaction is complete
	if (tr->journal_start == jnl->active_start) {
		jnl->active_start = tr->journal_end;
		tr->journal_start = tr->journal_end = (off_t)0;
	}

	// go through the completed_trs list and try to coalesce
	// entries, restarting back at the beginning if we have to.
	for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) {
		if (ctr->journal_start == jnl->active_start) {
			jnl->active_start = ctr->journal_end;
			if (prev) {
				prev->next = ctr->next;
			}
			if (ctr == jnl->completed_trs) {
				jnl->completed_trs = ctr->next;
			}

			next           = jnl->completed_trs;   // this starts us over again
			ctr->next      = jnl->tr_freeme;
			jnl->tr_freeme = ctr;
			ctr            = NULL;
		} else if (tr->journal_end == ctr->journal_start) {
			ctr->journal_start = tr->journal_start;
			next               = jnl->completed_trs;  // this starts us over again
			ctr                = NULL;
			tr->journal_start  = tr->journal_end = (off_t)0;
		} else if (tr->journal_start == ctr->journal_end) {
			ctr->journal_end  = tr->journal_end;
			next              = ctr->next;
			tr->journal_start = tr->journal_end = (off_t)0;
		} else if (ctr->next && ctr->journal_end == ctr->next->journal_start) {
			// coalesce the next entry with this one and link the next
			// entry in at the head of the tr_freeme list
			next              = ctr->next;           // temporarily use the "next" variable
			ctr->journal_end  = next->journal_end;
			ctr->next         = next->next;
			next->next        = jnl->tr_freeme;      // link in the next guy at the head of the tr_freeme list
			jnl->tr_freeme    = next;

			next              = jnl->completed_trs;  // this starts us over again
			ctr               = NULL;
		} else {
			next = ctr->next;
		}
	}

	// if this is true then we didn't merge with anyone
	// so link ourselves in at the head of the completed
	// transaction list.
	if (tr->journal_start != 0) {
		// put this entry into the correct sorted place
		// in the list instead of just at the head.
		//

		prev = NULL;
		for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
			// just keep looping
		}

		if (ctr == NULL && prev == NULL) {
			jnl->completed_trs = tr;
			tr->next           = NULL;
		} else if (ctr == jnl->completed_trs) {
			tr->next           = jnl->completed_trs;
			jnl->completed_trs = tr;
		} else {
			tr->next   = prev->next;
			prev->next = tr;
		}
	} else {
		// if we're here this tr got merged with someone else so
		// put it on the list to be free'd
		tr->next       = jnl->tr_freeme;
		jnl->tr_freeme = tr;
	}
transaction_done:
	unlock_oldstart(jnl);

	unlock_condition(jnl, &jnl->asyncIO);
}
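//
// Note on old_start[] (illustrative): each slot records the journal start
// offset of a recently-ended transaction, with the top bit
// (0x8000000000000000ULL) set while that transaction's buffers are still
// in flight.  The loop above clears the top bit once everything has hit
// the platter, which is what later allows the journal header's start
// pointer to advance past that transaction.
//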


#include <libkern/OSByteOrder.h>

#define SWAP16(x) OSSwapInt16(x)
#define SWAP32(x) OSSwapInt32(x)
#define SWAP64(x) OSSwapInt64(x)


static void
swap_journal_header(journal *jnl)
{
	jnl->jhdr->magic      = SWAP32(jnl->jhdr->magic);
	jnl->jhdr->endian     = SWAP32(jnl->jhdr->endian);
	jnl->jhdr->start      = SWAP64(jnl->jhdr->start);
	jnl->jhdr->end        = SWAP64(jnl->jhdr->end);
	jnl->jhdr->size       = SWAP64(jnl->jhdr->size);
	jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
	jnl->jhdr->checksum   = SWAP32(jnl->jhdr->checksum);
	jnl->jhdr->jhdr_size  = SWAP32(jnl->jhdr->jhdr_size);
	jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num);
}

static void
swap_block_list_header(journal *jnl, block_list_header *blhdr)
{
	int i;

	blhdr->max_blocks = SWAP16(blhdr->max_blocks);
	blhdr->num_blocks = SWAP16(blhdr->num_blocks);
	blhdr->bytes_used = SWAP32(blhdr->bytes_used);
	blhdr->checksum   = SWAP32(blhdr->checksum);
	blhdr->flags      = SWAP32(blhdr->flags);

	if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) {
		printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d).  not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size);
		return;
	}

	for(i = 0; i < blhdr->num_blocks; i++) {
		blhdr->binfo[i].bnum         = SWAP64(blhdr->binfo[i].bnum);
		blhdr->binfo[i].u.bi.bsize   = SWAP32(blhdr->binfo[i].u.bi.bsize);
		blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum);
	}
}


static int
update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
{
	int		ret;
	struct buf	*oblock_bp=NULL;
	boolean_t	was_vm_privileged = FALSE;


	// first read the block we want.
	ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
	if (ret != 0) {
		printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret);

		if (oblock_bp) {
			buf_brelse(oblock_bp);
			oblock_bp = NULL;
		}

		// let's try to be aggressive here and just re-write the block
		oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META);
		if (oblock_bp == NULL) {
			printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block);
			return -1;
		}
	}

	// make sure it's the correct size.
	if (buf_size(oblock_bp) != bsize) {
		buf_brelse(oblock_bp);
		return -1;
	}

	// copy the journal data over top of it
	memcpy((char *)buf_dataptr(oblock_bp), block_ptr, bsize);

	if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
		/*
		 * if we block waiting for memory, and there is enough pressure to
		 * cause us to try and create a new swap file, we may end up deadlocking
		 * due to waiting for the journal on the swap file creation path...
		 * by making ourselves vm_privileged, we give ourselves the best chance
		 * of not blocking
		 */
		was_vm_privileged = set_vm_privilege(TRUE);
	}
	ret = VNOP_BWRITE(oblock_bp);

	if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
		set_vm_privilege(FALSE);

	if (ret != 0) {
		printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret);
		return ret;
	}
	// and now invalidate it so that if someone else wants to read
	// it in a different size they'll be able to do it.
	ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp);
	if (oblock_bp) {
		buf_markinvalid(oblock_bp);
		buf_brelse(oblock_bp);
	}

	return 0;
}

static int
grow_table(struct bucket **buf_ptr, int num_buckets, int new_size)
{
	struct bucket *newBuf;
	int current_size = num_buckets, i;

	// return if new_size is less than the current size
	if (new_size < num_buckets) {
		return current_size;
	}

	if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
		printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
		return -1;
	}

	//  printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);

	// copy existing elements
	bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));

	// initialize the new ones
	for(i = num_buckets; i < new_size; i++) {
		newBuf[i].block_num = (off_t)-1;
	}

	// free the old container
	FREE(*buf_ptr, M_TEMP);

	// reset the buf_ptr
	*buf_ptr = newBuf;

	return new_size;
}
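//
// Sizing note: the only caller in this file, insert_block(), invokes
// grow_table() with new_size = 2 * num_buckets whenever the coalesce table
// fills up, so the table grows geometrically from STARTING_BUCKETS (256)
// entries and the amortized cost of an insertion stays constant.
//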

static int
lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full)
{
	int lo, hi, index, matches, i;

	if (num_full == 0) {
		return 0; // table is empty, so insert at index=0
	}

	lo = 0;
	hi = num_full - 1;
	index = -1;

	// perform binary search for block_num
	do {
		int mid = (hi - lo)/2 + lo;
		off_t this_num = (*buf_ptr)[mid].block_num;

		if (block_num == this_num) {
			index = mid;
			break;
		}

		if (block_num < this_num) {
			hi = mid;
			continue;
		}

		if (block_num > this_num) {
			lo = mid + 1;
			continue;
		}
	} while (lo < hi);

	// check if lo and hi converged on the match
	if (block_num == (*buf_ptr)[hi].block_num) {
		index = hi;
	}

	// if no existing entry found, find index for new one
	if (index == -1) {
		index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
	} else {
		// make sure that we return the right-most index in the case of multiple matches
		matches = 0;
		i = index + 1;
		while (i < num_full && block_num == (*buf_ptr)[i].block_num) {
			matches++;
			i++;
		}

		index += matches;
	}

	return index;
}
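//
// Worked example (illustrative): with block_nums [5, 7, 7, 9] and
// num_full == 4, lookup_bucket(buf, 7, 4) first hits the 7 at index 1,
// then walks right past the duplicate and returns 2 (the right-most 7).
// Searching for 8 finds no match and returns 3, the insertion point that
// keeps the table sorted.
//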

static int
insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting)
{
	if (!overwriting) {
		// grow the table if we're out of space
		if (*num_full_ptr >= *num_buckets_ptr) {
			int new_size = *num_buckets_ptr * 2;
			int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);

			if (grow_size < new_size) {
				printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name);
				return -1;
			}

			*num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
		}

		// if we're not inserting at the end, we need to bcopy
		if (blk_index != *num_full_ptr) {
			bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
		}

		(*num_full_ptr)++; // increment only if we're not overwriting
	}

	// sanity check the values we're about to add
	if ((off_t)offset >= jnl->jhdr->size) {
		offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
	}
	if (size <= 0) {
		panic("jnl: insert_block: bad size in insert_block (%zd)\n", size);
	}

	(*buf_ptr)[blk_index].block_num = num;
	(*buf_ptr)[blk_index].block_size = size;
	(*buf_ptr)[blk_index].jnl_offset = offset;
	(*buf_ptr)[blk_index].cksum = cksum;

	return blk_index;
}

static int
do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
{
	int	num_to_remove, index, i, overwrite, err;
	size_t	jhdr_size = jnl->jhdr->jhdr_size, new_offset;
	off_t	overlap, block_start, block_end;

	block_start = block_num*jhdr_size;
	block_end = block_start + size;
	overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);

	// first, eliminate any overlap with the previous entry
	if (blk_index != 0 && !overwrite) {
		off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
		off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
		overlap = prev_block_end - block_start;
		if (overlap > 0) {
			if (overlap % jhdr_size != 0) {
				panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size);
			}

			// if the previous entry completely overlaps this one, we need to break it into two pieces.
			if (prev_block_end > block_end) {
				off_t new_num = block_end / jhdr_size;
				size_t new_size = prev_block_end - block_end;

				new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);

				err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0);
				if (err < 0) {
					panic("jnl: do_overlap: error inserting during pre-overlap\n");
				}
			}

			// Regardless, we need to truncate the previous entry to the beginning of the overlap
			(*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start;
			(*buf_ptr)[blk_index-1].cksum = 0;   // have to blow it away because there's no way to check it
		}
	}

	// then, bail out fast if there's no overlap with the entries that follow
	if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) {
		return 0; // no overlap, no overwrite
	} else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) {

		(*buf_ptr)[blk_index].cksum = cksum;   // update this
		return 1; // simple overwrite
	}

	// Otherwise, find all cases of total and partial overlap. We use the special
	// block_num of -2 to designate entries that are completely overlapped and must
	// be eliminated. The block_num, size, and jnl_offset of partially overlapped
	// entries must be adjusted to keep the array consistent.
	index = blk_index;
	num_to_remove = 0;
	while (index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) {
		if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) {
			(*buf_ptr)[index].block_num = -2; // mark this for deletion
			num_to_remove++;
		} else {
			overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
			if (overlap > 0) {
				if (overlap % jhdr_size != 0) {
					panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size);
				}

				// if we partially overlap this entry, adjust its block number, jnl offset, and size
				(*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
				(*buf_ptr)[index].cksum = 0;

				new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
				if ((off_t)new_offset >= jnl->jhdr->size) {
					new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
				}
				(*buf_ptr)[index].jnl_offset = new_offset;

				(*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
				if ((*buf_ptr)[index].block_size <= 0) {
					panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size);
					// return -1; // if above panic is removed, return -1 for error
				}
			}

		}

		index++;
	}

	// bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
	index--; // start with the last index used within the above loop
	while (index >= blk_index) {
		if ((*buf_ptr)[index].block_num == -2) {
			if (index == *num_full_ptr-1) {
				(*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
			} else {
				bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
			}
			(*num_full_ptr)--;
		}
		index--;
	}

	// eliminate any stale entries at the end of the table
	for(i = *num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
		(*buf_ptr)[i].block_num = -1;
	}

	return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
}
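//
// Worked example for the "split" case above (illustrative, using
// jhdr_size == 1 for simplicity): suppose an existing entry covers blocks
// [10, 14) and a new write covers [11, 13). The existing entry is split in
// two: a tail piece for [13, 14) is inserted with its jnl_offset advanced
// by 3 (the distance from block 10 to block 13), and the original entry is
// truncated to cover just [10, 11). The new write then lands between them,
// so replay still applies exactly one journal copy to every block.
//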

// PR-3105942: Coalesce writes to the same block in journal replay
// We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
// to be replayed and the corresponding location in the journal which contains
// the most recent data for those blocks. The array is "played" once all the
// blocks in the journal have been coalesced. The code for the case of conflicting/
// overlapping writes to a single block is the most dense. Because coalescing can
// disrupt the existing time-ordering of blocks in the journal playback, care
// is taken to catch any overlaps and keep the array consistent.
static int
add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr)
{
	int	blk_index, overwriting;

	// on return from lookup_bucket(), blk_index is the index into the table where block_num should be
	// inserted (or the index of the elem to overwrite).
	blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);

	// check if the index is within bounds (if we're adding this block to the end of
	// the table, blk_index will be equal to num_full)
	if (blk_index < 0 || blk_index > *num_full_ptr) {
		//printf("jnl: add_block: trouble adding block to co_buf\n");
		return -1;
	} // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);

	// Determine whether we're overwriting an existing entry by checking for overlap
	overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr);
	if (overwriting < 0) {
		return -1; // if we got an error, pass it along
	}

	// returns the index, or -1 on error
	blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting);

	return blk_index;
}
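//
// Replay sketch (summarizing the function below): replay proceeds in three
// phases. First, walk the transactions from jhdr->start, validating each
// block_list_header checksum and sequence number, and feed every block into
// the coalescing table via add_block(). Second, once the whole journal has
// been scanned, "play" the table: read each surviving extent out of the
// journal and write it to its home location with update_fs_block(). Third,
// mark the journal empty by rewriting the journal header. On a read error
// mid-scan, replay restarts and replays just the transactions known to be
// good (up to three retries).
//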
static int
replay_journal(journal *jnl)
{
	int		i, bad_blocks=0;
	unsigned int	orig_checksum, checksum, check_block_checksums = 0;
	size_t		ret;
	size_t		max_bsize = 0;		/* protected by block_ptr */
	block_list_header *blhdr;
	off_t		offset, txn_start_offset=0, blhdr_offset, orig_jnl_start;
	char		*buff, *block_ptr=NULL;
	struct bucket	*co_buf;
	int		num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0;
	uint32_t	last_sequence_num = 0;
	int		replay_retry_count = 0;

	// wrap the start ptr if it points to the very end of the journal
	if (jnl->jhdr->start == jnl->jhdr->size) {
		jnl->jhdr->start = jnl->jhdr->jhdr_size;
	}
	if (jnl->jhdr->end == jnl->jhdr->size) {
		jnl->jhdr->end = jnl->jhdr->jhdr_size;
	}

	if (jnl->jhdr->start == jnl->jhdr->end) {
		return 0;
	}

	orig_jnl_start = jnl->jhdr->start;

	// allocate memory for the header_block.  we'll read each blhdr into this
	if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size, VM_KERN_MEMORY_FILE)) {
		printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n",
		       jnl->jdev_name, jnl->jhdr->blhdr_size);
		return -1;
	}

	// allocate memory for the coalesce buffer
	if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
		printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name);
		return -1;
	}

restart_replay:

	// initialize entries
	for(i = 0; i < num_buckets; i++) {
		co_buf[i].block_num = -1;
	}
	num_full = 0; // empty at first


	printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
	       jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);

	while (check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) {
		offset = blhdr_offset = jnl->jhdr->start;
		ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
		if (ret != (size_t)jnl->jhdr->blhdr_size) {
			printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset);
			bad_blocks = 1;
			goto bad_txn_handling;
		}

		blhdr = (block_list_header *)buff;

		orig_checksum = blhdr->checksum;
		blhdr->checksum = 0;
		if (jnl->flags & JOURNAL_NEED_SWAP) {
			// calculate the checksum based on the unswapped data
			// because it is done byte-at-a-time.
			orig_checksum = (unsigned int)SWAP32(orig_checksum);
			checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
			swap_block_list_header(jnl, blhdr);
		} else {
			checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
		}


		//
		// XXXdbg - if these checks fail, we should replay as much
		// as we can in the hopes that it will still leave the
		// drive in a better state than if we didn't replay
		// anything
		//
		if (checksum != orig_checksum) {
			if (check_past_jnl_end && in_uncharted_territory) {

				if (blhdr_offset != jnl->jhdr->end) {
					printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
				}

				check_past_jnl_end = 0;
				jnl->jhdr->end = blhdr_offset;
				continue;
			}

			printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
			       jnl->jdev_name, blhdr_offset, orig_checksum, checksum);

			if (blhdr_offset == orig_jnl_start) {
				// if there's nothing in the journal at all, just bail out altogether.
				goto bad_replay;
			}

			bad_blocks = 1;
			goto bad_txn_handling;
		}

		if (   (last_sequence_num != 0)
		    && (blhdr->binfo[0].u.bi.b.sequence_num != 0)
		    && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num)
		    && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) {

			txn_start_offset = jnl->jhdr->end = blhdr_offset;

			if (check_past_jnl_end) {
				check_past_jnl_end = 0;
				printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
				       jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
				continue;
			}

			printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
			       jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
			bad_blocks = 1;
			goto bad_txn_handling;
		}
		last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num;

		if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) {
			if (last_sequence_num == 0) {
				check_past_jnl_end = 0;
				printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n",
				       jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
				if (jnl->jhdr->start != jnl->jhdr->end) {
					jnl->jhdr->start = jnl->jhdr->end;
				}
				continue;
			}
			printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset);
		}

		if (   blhdr->max_blocks <= 0 || blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size)
		    || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
			printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n",
			       jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks);
			bad_blocks = 1;
			goto bad_txn_handling;
		}

		max_bsize = 0;
		for (i = 1; i < blhdr->num_blocks; i++) {
			if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
				printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum);
				bad_blocks = 1;
				goto bad_txn_handling;
			}

			if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) {
				max_bsize = blhdr->binfo[i].u.bi.bsize;
			}
		}

		if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) {
			check_block_checksums = 1;
			if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize, VM_KERN_MEMORY_FILE)) {
				goto bad_replay;
			}
		} else {
			block_ptr = NULL;
		}

		if (blhdr->flags & BLHDR_FIRST_HEADER) {
			txn_start_offset = blhdr_offset;
		}

		//printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
		//       blhdr->num_blocks-1, jnl->jhdr->start);
		bad_blocks = 0;
		for (i = 1; i < blhdr->num_blocks; i++) {
			int size, ret_val;
			off_t number;

			size = blhdr->binfo[i].u.bi.bsize;
			number = blhdr->binfo[i].bnum;

			// don't add "killed" blocks
			if (number == (off_t)-1) {
				//printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
			} else {

				if (check_block_checksums) {
					int32_t disk_cksum;
					off_t block_offset;

					block_offset = offset;

					// read the block so we can check the checksum
					ret = read_journal_data(jnl, &block_offset, block_ptr, size);
					if (ret != (size_t)size) {
						printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
						bad_blocks = 1;
						goto bad_txn_handling;
					}

					disk_cksum = calc_checksum(block_ptr, size);

					// there is no need to swap the checksum from disk because
					// it got swapped when the blhdr was read in.
					if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) {
						printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
						       jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum);
						printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x  0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
						       *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)],
						       *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]);

						bad_blocks = 1;
						goto bad_txn_handling;
					}
				}


				// add this bucket to co_buf, coalescing where possible
				// printf("jnl: replay_journal: adding block 0x%llx\n", number);
				ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full);

				if (ret_val == -1) {
					printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name);
					goto bad_replay;
				} // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
			}

			// increment offset
			offset += size;

			// check if the last block added puts us off the end of the jnl.
			// if so, we need to wrap to the beginning and take any remainder
			// into account
			//
			if (offset >= jnl->jhdr->size) {
				offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
			}
		}

		if (block_ptr) {
			kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
			block_ptr = NULL;
		}

bad_txn_handling:
		if (bad_blocks) {
			/* Journal replay got an error before it found any valid
			 * transactions; abort replay */
			if (txn_start_offset == 0) {
				printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name);
				goto bad_replay;
			}

			/* Repeated error during journal replay, abort replay */
			if (replay_retry_count == 3) {
				printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name);
				goto bad_replay;
			}
			replay_retry_count++;

			/* There was an error replaying the journal (possibly
			 * EIO/ENXIO from the device).  So retry replaying all
			 * the good transactions that we found before getting
			 * the error.
			 */
			jnl->jhdr->start = orig_jnl_start;
			jnl->jhdr->end = txn_start_offset;
			check_past_jnl_end = 0;
			last_sequence_num = 0;
			printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
			goto restart_replay;
		}

		jnl->jhdr->start += blhdr->bytes_used;
		if (jnl->jhdr->start >= jnl->jhdr->size) {
			// wrap around and skip the journal header block
			jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
		}

		if (jnl->jhdr->start == jnl->jhdr->end) {
			in_uncharted_territory = 1;
		}
	}

	if (jnl->jhdr->start != jnl->jhdr->end) {
		printf("jnl: %s: start %lld != end %lld.  resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end);
		jnl->jhdr->end = jnl->jhdr->start;
	}

	//printf("jnl: replay_journal: replaying %d blocks\n", num_full);

	/*
	 * make sure it's at least one page in size, so
	 * start max_bsize at PAGE_SIZE
	 */
	for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {

		if (co_buf[i].block_num == (off_t)-1)
			continue;

		if (co_buf[i].block_size > max_bsize)
			max_bsize = co_buf[i].block_size;
	}
	/*
	 * round max_bsize up to the nearest PAGE_SIZE multiple
	 */
	if (max_bsize & (PAGE_SIZE - 1)) {
		max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
	}

	if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize, VM_KERN_MEMORY_FILE)) {
		goto bad_replay;
	}

	// Replay the coalesced entries in the co-buf
	for(i = 0; i < num_full; i++) {
		size_t size = co_buf[i].block_size;
		off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
		off_t number = co_buf[i].block_num;


		// printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
		//	  co_buf[i].block_size, co_buf[i].jnl_offset);

		if (number == (off_t)-1) {
			// printf("jnl: replay_journal: skipping killed fs block\n");
		} else {

			// do journal read, and set the phys. block
			ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
			if (ret != size) {
				printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset);
				goto bad_replay;
			}

			if (update_fs_block(jnl, block_ptr, number, size) != 0) {
				goto bad_replay;
			}
		}
	}


	// done replaying; update jnl header
	if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
		goto bad_replay;
	}

	printf("jnl: %s: journal replay done.\n", jnl->jdev_name);

	// free block_ptr
	if (block_ptr) {
		kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
		block_ptr = NULL;
	}

	// free the coalesce buffer
	FREE(co_buf, M_TEMP);
	co_buf = NULL;

	kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);
	return 0;

bad_replay:
	if (block_ptr) {
		kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
	}
	if (co_buf) {
		FREE(co_buf, M_TEMP);
	}
	kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size);

	return -1;
}


#define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
#define MAX_TRANSACTION_BUFFER_SIZE      (3072*1024)

// XXXdbg - so I can change it in the debugger
int def_tbuffer_size = 0;


//
// This function sets the size of the tbuffer and the
// size of the blhdr.  It assumes that jnl->jhdr->size
// and jnl->jhdr->jhdr_size are already valid.
//
static void
size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
{
	//
	// one-time initialization based on how much memory
	// there is in the machine.
	//
	if (def_tbuffer_size == 0) {
		if (max_mem < (256*1024*1024)) {
			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
		} else if (max_mem < (512*1024*1024)) {
			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
		} else if (max_mem < (1024*1024*1024)) {
			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
		} else {
			def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (max_mem / (256*1024*1024));
		}
	}

	// size up the transaction buffer... can't be larger than the number
	// of blocks that can fit in a block_list_header block.
	if (tbuffer_size == 0) {
		jnl->tbuffer_size = def_tbuffer_size;
	} else {
		// make sure that the specified tbuffer_size isn't too small
		if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
			tbuffer_size = jnl->jhdr->blhdr_size * 2;
		}
		// and make sure it's an even multiple of the block size
		if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
			tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
		}

		jnl->tbuffer_size = tbuffer_size;
	}

	if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
		jnl->tbuffer_size = (jnl->jhdr->size / 2);
	}

	if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
		jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
	}

	jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
	if (jnl->jhdr->blhdr_size < phys_blksz) {
		jnl->jhdr->blhdr_size = phys_blksz;
	} else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
		// have to round up so we're an even multiple of the physical block size
		jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
	}
}
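//
// Worked example for the blhdr sizing above (illustrative, and assuming a
// 16-byte block_info): with jhdr_size = 4096 and a 1 MB tbuffer, the
// tbuffer holds 1 MB / 4096 = 256 journal blocks, so blhdr_size comes out
// to 256 * 16 = 4096 bytes, exactly one block's worth of block_info
// entries. The phys_blksz rounding below then leaves it unchanged.
//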
1620
1621 static void
1622 get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context)
1623 {
1624 off_t readblockcnt;
1625 off_t writeblockcnt;
1626 off_t readmaxcnt=0, tmp_readmaxcnt;
1627 off_t writemaxcnt=0, tmp_writemaxcnt;
1628 off_t readsegcnt, writesegcnt;
1629 int32_t features;
1630
1631 if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) {
1632 if (features & DK_FEATURE_FORCE_UNIT_ACCESS) {
1633 const char *name = vnode_getname_printable(devvp);
1634 jnl->flags |= JOURNAL_DO_FUA_WRITES;
1635 printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features);
1636 vnode_putname_printable(name);
1637 }
1638 if (features & DK_FEATURE_UNMAP) {
1639 jnl->flags |= JOURNAL_USE_UNMAP;
1640 }
1641
1642 if (features & DK_FEATURE_BARRIER) {
1643 jnl->flags |= JOURNAL_FEATURE_BARRIER;
1644 }
1645 }
1646
1647 //
1648 // First check the max read size via several different mechanisms...
1649 //
1650 VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context);
1651
1652 if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context) == 0) {
1653 tmp_readmaxcnt = readblockcnt * phys_blksz;
1654 if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) {
1655 readmaxcnt = tmp_readmaxcnt;
1656 }
1657 }
1658
1659 if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt, 0, context)) {
1660 readsegcnt = 0;
1661 }
1662
1663 if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) {
1664 readmaxcnt = readsegcnt * PAGE_SIZE;
1665 }
1666
1667 if (readmaxcnt == 0) {
1668 readmaxcnt = 128 * 1024;
1669 } else if (readmaxcnt > UINT32_MAX) {
1670 readmaxcnt = UINT32_MAX;
1671 }
1672
1673
1674 //
1675 // Now check the max write size via several different mechanisms...
1676 //
1677 VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context);
1678
1679 if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context) == 0) {
1680 tmp_writemaxcnt = writeblockcnt * phys_blksz;
1681 if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) {
1682 writemaxcnt = tmp_writemaxcnt;
1683 }
1684 }
1685
1686 if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt, 0, context)) {
1687 writesegcnt = 0;
1688 }
1689
1690 if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) {
1691 writemaxcnt = writesegcnt * PAGE_SIZE;
1692 }
1693
1694 if (writemaxcnt == 0) {
1695 writemaxcnt = 128 * 1024;
1696 } else if (writemaxcnt > UINT32_MAX) {
1697 writemaxcnt = UINT32_MAX;
1698 }
1699
1700 jnl->max_read_size = readmaxcnt;
1701 jnl->max_write_size = writemaxcnt;
1702 // printf("jnl: %s: max read/write: %lld k / %lld k\n",
1703 // jnl->jdev_name ? jnl->jdev_name : "unknown",
1704 // jnl->max_read_size/1024, jnl->max_write_size/1024);
1705 }
1706
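//
// Illustrative sketch (hypothetical numbers, not from the original source):
// how the capping logic above combines the ioctls. Suppose a device with
// 512-byte blocks that does not answer DKIOCGETMAXBYTECOUNTREAD, but
// reports DKIOCGETMAXBLOCKCOUNTREAD = 2048 and
// DKIOCGETMAXSEGMENTCOUNTREAD = 32 (assuming 4 KB pages):
//
//     readmaxcnt = 2048 * 512     = 1048576    // from the block count
//     readmaxcnt = 32 * PAGE_SIZE = 131072     // segment count is smaller, it wins
//
// A device that reports nothing at all falls back to the 128 KB default.
//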
1707
1708 journal *
1709 journal_create(struct vnode *jvp,
1710 off_t offset,
1711 off_t journal_size,
1712 struct vnode *fsvp,
1713 size_t min_fs_blksz,
1714 int32_t flags,
1715 int32_t tbuffer_size,
1716 void (*flush)(void *arg),
1717 void *arg,
1718 struct mount *fsmount)
1719 {
1720 journal *jnl;
1721 uint32_t phys_blksz, new_txn_base;
1722 u_int32_t min_size;
1723 struct vfs_context context;
1724 const char *jdev_name;
1725 /*
1726 * Cap the journal max size to 2GB. On HFS, the journal will attempt to
1727 * occupy a full allocation block if its current size is smaller than the
1728 * allocation block on which it resides. Once we hit the exabyte filesystem
1729 * range, allocation blocks grow to 2GB. As a result, make the cap 2GB.
1730 */
1731 context.vc_thread = current_thread();
1732 context.vc_ucred = FSCRED;
1733
1734 jdev_name = vnode_getname_printable(jvp);
1735
1736 /* Get the real physical block size. */
1737 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1738 goto cleanup_jdev_name;
1739 }
1740
1741 if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
1742 printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size);
1743 goto cleanup_jdev_name;
1744 }
1745
1746 min_size = phys_blksz * (phys_blksz / sizeof(block_info));
1747 /* Reject journals that are too small given the sector size of the device */
1748 if (journal_size < min_size) {
1749 printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n",
1750 jdev_name, journal_size, phys_blksz);
1751 goto cleanup_jdev_name;
1752 }
1753
1754 if (phys_blksz > min_fs_blksz) {
1755 printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n",
1756 jdev_name, phys_blksz, min_fs_blksz);
1757 goto cleanup_jdev_name;
1758 }
1759
1760 if ((journal_size % phys_blksz) != 0) {
1761 printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
1762 jdev_name, journal_size, phys_blksz);
1763 goto cleanup_jdev_name;
1764 }
1765
1766
1767 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1768 memset(jnl, 0, sizeof(*jnl));
1769
1770 jnl->jdev = jvp;
1771 jnl->jdev_offset = offset;
1772 jnl->fsdev = fsvp;
1773 jnl->flush = flush;
1774 jnl->flush_arg = arg;
1775 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
1776 jnl->jdev_name = jdev_name;
1777 lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1778
1779 // Keep a pointer to the mount around for use in IO throttling.
1780 jnl->fsmount = fsmount;
1781 // XXX: This lock discipline looks correct based on dounmount(), but it
1782 // doesn't seem to be documented anywhere.
1783 mount_ref(fsmount, 0);
1784
1785 get_io_info(jvp, phys_blksz, jnl, &context);
1786
1787 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) {
1788 printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz);
1789 goto bad_kmem_alloc;
1790 }
1791 jnl->header_buf_size = phys_blksz;
1792
1793 jnl->jhdr = (journal_header *)jnl->header_buf;
1794 memset(jnl->jhdr, 0, sizeof(journal_header));
1795
1796 // we have to set this up here so that do_journal_io() will work
1797 jnl->jhdr->jhdr_size = phys_blksz;
1798
1799 //
1800 // We try to read the journal header to see if there is already one
1801 // out there. If there is, it may contain transactions that we could
1802 // mistakenly replay. That would require that we pick a sequence number
1803 // a little less than the old one, that there is then a crash, and that
1804 // the last txn written ends right at the start of a txn from the previous
1805 // incarnation of this file system. If all that happens we would
1806 // replay the transactions from the old file system and that would
1807 // destroy your disk. Although it is extremely unlikely for all those
1808 // conditions to happen, the probability is non-zero and the result is
1809 // severe - you lose your file system. Therefore if we find a valid
1810 // journal header and the sequence number is non-zero we write junk
1811 // over the entire journal so that there is no way we will encounter
1812 // any old transactions. This is slow but should be a rare event
1813 // since most tools erase the journal.
1814 //
1815 if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz
1816 && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC
1817 && jnl->jhdr->sequence_num != 0) {
1818
1819 new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff;
1820 printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base);
1821
1822 #if 0
1823 int i;
1824 off_t pos=0;
1825
1826 for(i = 1; i < journal_size / phys_blksz; i++) {
1827 pos = i*phys_blksz;
1828
1829 // we don't really care what data we write just so long
1830 // as it's not a valid transaction header. since we have
1831 // the header_buf sitting around we'll use that.
1832 write_journal_data(jnl, &pos, jnl->header_buf, phys_blksz);
1833 }
1834 printf("jnl: create: done clearing journal (i=%d)\n", i);
1835 #endif
1836 } else {
1837 new_txn_base = random() & 0x00ffffff;
1838 }
1839
1840 memset(jnl->header_buf, 0, phys_blksz);
1841
1842 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
1843 jnl->jhdr->endian = ENDIAN_MAGIC;
1844 jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself
1845 jnl->jhdr->end = phys_blksz;
1846 jnl->jhdr->size = journal_size;
1847 jnl->jhdr->jhdr_size = phys_blksz;
1848 size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
1849
1850 jnl->active_start = jnl->jhdr->start;
1851
1852 // XXXdbg - for testing you can force the journal to wrap around
1853 // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
1854 // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);
1855
1856 jnl->jhdr->sequence_num = new_txn_base;
1857
1858 lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
1859 lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
1860 lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
1861
1862
1863 jnl->flushing = FALSE;
1864 jnl->asyncIO = FALSE;
1865 jnl->flush_aborted = FALSE;
1866 jnl->writing_header = FALSE;
1867 jnl->async_trim = NULL;
1868 jnl->sequence_num = jnl->jhdr->sequence_num;
1869
1870 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
1871 printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name);
1872 goto bad_write;
1873 }
1874
1875 goto journal_create_complete;
1876
1877
1878 bad_write:
1879 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
1880 bad_kmem_alloc:
1881 jnl->jhdr = NULL;
1882 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
1883 mount_drop(fsmount, 0);
1884 cleanup_jdev_name:
1885 vnode_putname_printable(jdev_name);
1886 jnl = NULL;
1887 journal_create_complete:
1888 return jnl;
1889 }
1890
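//
// Illustrative usage sketch (hypothetical caller and names, not from the
// original source): a file system would typically create a fresh journal
// at newfs/first-mount time along these lines:
//
//     jnl = journal_create(jvp,                  // journal device vnode
//                          jnl_offset,           // byte offset on that device
//                          8 * 1024 * 1024,      // e.g. an 8 MB journal
//                          fsvp,                 // file system device vnode
//                          fs_block_size,        // min fs block size
//                          0,                    // option flags
//                          0,                    // 0 == default tbuffer size
//                          my_fs_flush,          // hypothetical flush callback
//                          my_fs,                // argument for the callback
//                          mp);                  // mount, for IO throttling
//     if (jnl == NULL)
//             ; // creation failed; run non-journaled or fail the mount
//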
1891
1892 journal *
1893 journal_open(struct vnode *jvp,
1894 off_t offset,
1895 off_t journal_size,
1896 struct vnode *fsvp,
1897 size_t min_fs_blksz,
1898 int32_t flags,
1899 int32_t tbuffer_size,
1900 void (*flush)(void *arg),
1901 void *arg,
1902 struct mount *fsmount)
1903 {
1904 journal *jnl;
1905 uint32_t orig_blksz=0;
1906 uint32_t phys_blksz;
1907 u_int32_t min_size = 0;
1908 int orig_checksum, checksum;
1909 struct vfs_context context;
1910 const char *jdev_name = vnode_getname_printable(jvp);
1911
1912 context.vc_thread = current_thread();
1913 context.vc_ucred = FSCRED;
1914
1915 /* Get the real physical block size. */
1916 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
1917 goto cleanup_jdev_name;
1918 }
1919
1920 if (phys_blksz > min_fs_blksz) {
1921 printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n",
1922 jdev_name, phys_blksz, min_fs_blksz);
1923 goto cleanup_jdev_name;
1924 }
1925
1926 if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) {
1927 printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size);
1928 goto cleanup_jdev_name;
1929 }
1930
1931 min_size = phys_blksz * (phys_blksz / sizeof(block_info));
1932 /* Reject journals that are too small given the sector size of the device */
1933 if (journal_size < min_size) {
1934 printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n",
1935 jdev_name, journal_size, phys_blksz);
1936 goto cleanup_jdev_name;
1937 }
1938
1939 if ((journal_size % phys_blksz) != 0) {
1940 printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
1941 jdev_name, journal_size, phys_blksz);
1942 goto cleanup_jdev_name;
1943 }
1944
1945 MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK);
1946 memset(jnl, 0, sizeof(*jnl));
1947
1948 jnl->jdev = jvp;
1949 jnl->jdev_offset = offset;
1950 jnl->fsdev = fsvp;
1951 jnl->flush = flush;
1952 jnl->flush_arg = arg;
1953 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
1954 jnl->jdev_name = jdev_name;
1955 lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr);
1956
1957 /* We need a reference to the mount to later pass to the throttling code for
1958 * IO accounting.
1959 */
1960 jnl->fsmount = fsmount;
1961 mount_ref(fsmount, 0);
1962
1963 get_io_info(jvp, phys_blksz, jnl, &context);
1964
1965 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) {
1966 printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz);
1967 goto bad_kmem_alloc;
1968 }
1969 jnl->header_buf_size = phys_blksz;
1970
1971 jnl->jhdr = (journal_header *)jnl->header_buf;
1972 memset(jnl->jhdr, 0, sizeof(journal_header));
1973
1974 // we have to set this up here so that do_journal_io() will work
1975 jnl->jhdr->jhdr_size = phys_blksz;
1976
1977 if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
1978 printf("jnl: %s: open: could not read %u bytes for the journal header.\n",
1979 jdev_name, phys_blksz);
1980 goto bad_journal;
1981 }
1982
1983 orig_checksum = jnl->jhdr->checksum;
1984 jnl->jhdr->checksum = 0;
1985
1986 if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
1987 // do this before the swap since it's done byte-at-a-time
1988 orig_checksum = SWAP32(orig_checksum);
1989 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1990 swap_journal_header(jnl);
1991 jnl->flags |= JOURNAL_NEED_SWAP;
1992 } else {
1993 checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
1994 }
1995
1996 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
1997 printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n",
1998 jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
1999 goto bad_journal;
2000 }
2001
2002 // only check if we're the current journal header magic value
2003 if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
2004
2005 if (orig_checksum != checksum) {
2006 printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n",
2007 jdev_name, orig_checksum, checksum);
2008
2009 //goto bad_journal;
2010 }
2011 }
2012
2013 // XXXdbg - convert old style magic numbers to the new one
2014 if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
2015 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
2016 }
2017
2018 if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
2019 /*
2020 * The volume has probably been resized (such that we had to adjust the
2021 * logical sector size), or copied to media with a different logical
2022 * sector size.
2023 *
2024 * Temporarily change the device's logical block size to match the
2025 * journal's header size. This will allow us to replay the journal
2026 * safely. If the replay succeeds, we will update the journal's header
2027 * size (later in this function).
2028 */
2029 orig_blksz = phys_blksz;
2030 phys_blksz = jnl->jhdr->jhdr_size;
2031 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context);
2032 printf("jnl: %s: open: temporarily switched block size from %u to %u\n",
2033 jdev_name, orig_blksz, phys_blksz);
2034 }
2035
2036 if ( jnl->jhdr->start <= 0
2037 || jnl->jhdr->start > jnl->jhdr->size
2038 || jnl->jhdr->start > 1024*1024*1024) {
2039 printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
2040 jdev_name, jnl->jhdr->start, jnl->jhdr->size);
2041 goto bad_journal;
2042 }
2043
2044 if ( jnl->jhdr->end <= 0
2045 || jnl->jhdr->end > jnl->jhdr->size
2046 || jnl->jhdr->end > 1024*1024*1024) {
2047 printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
2048 jdev_name, jnl->jhdr->end, jnl->jhdr->size);
2049 goto bad_journal;
2050 }
2051
2052 if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) {
2053 printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size);
2054 goto bad_journal;
2055 }
2056
2057 // XXXdbg - can't do these checks because hfs writes all kinds of
2058 // non-uniform sized blocks even on devices that have a block size
2059 // that is larger than 512 bytes (i.e. optical media w/2k blocks).
2060 // therefore these checks will fail and so we just have to punt and
2061 // do more relaxed checking...
2062 // XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
2063 if ((jnl->jhdr->start % 512) != 0) {
2064 printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n",
2065 jdev_name, jnl->jhdr->start);
2066 goto bad_journal;
2067 }
2068
2069 //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
2070 if ((jnl->jhdr->end % 512) != 0) {
2071 printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
2072 jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size);
2073 goto bad_journal;
2074 }
2075
2076 // take care of replaying the journal if necessary
2077 if (flags & JOURNAL_RESET) {
2078 printf("jnl: %s: journal start/end pointers reset! (s 0x%llx e 0x%llx)\n",
2079 jdev_name, jnl->jhdr->start, jnl->jhdr->end);
2080 jnl->jhdr->start = jnl->jhdr->end;
2081 } else if (replay_journal(jnl) != 0) {
2082 printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name);
2083 goto bad_journal;
2084 }
2085
2086 /*
2087 * When we get here, we know that the journal is empty (jnl->jhdr->start ==
2088 * jnl->jhdr->end). If the device's logical block size was different from
2089 * the journal's header size, then we can now restore the device's logical
2090 * block size and update the journal's header size to match.
2091 *
2092 * Note that we also adjust the journal's start and end so that they will
2093 * be aligned on the new block size. We pick a new sequence number to
2094 * avoid any problems if a replay found previous transactions using the old
2095 * journal header size. (See the comments in journal_create(), above.)
2096 */
2097
2098 if (orig_blksz != 0) {
2099 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
2100 phys_blksz = orig_blksz;
2101
2102 orig_blksz = 0;
2103
2104 jnl->jhdr->jhdr_size = phys_blksz;
2105 jnl->jhdr->start = phys_blksz;
2106 jnl->jhdr->end = phys_blksz;
2107 jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
2108 (journal_size / phys_blksz) +
2109 (random() % 16384)) & 0x00ffffff;
2110
2111 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
2112 printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
2113 goto bad_journal;
2114 }
2115 }
2116
2117 // make sure this is in sync!
2118 jnl->active_start = jnl->jhdr->start;
2119 jnl->sequence_num = jnl->jhdr->sequence_num;
2120
2121 // set this now, after we've replayed the journal
2122 size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
2123
2124 // TODO: Does this need to change if the device's logical block size changed?
2125 if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
2126 printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size,
2127 jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
2128 goto bad_journal;
2129 }
2130
2131 lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr);
2132 lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr);
2133 lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr);
2134
2135 goto journal_open_complete;
2136
2137 bad_journal:
2138 if (orig_blksz != 0) {
2139 phys_blksz = orig_blksz;
2140 VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context);
2141 printf("jnl: %s: open: restored block size after error\n", jdev_name);
2142 }
2143 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
2144 bad_kmem_alloc:
2145 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
2146 mount_drop(fsmount, 0);
2147 cleanup_jdev_name:
2148 vnode_putname_printable(jdev_name);
2149 jnl = NULL;
2150 journal_open_complete:
2151 return jnl;
2152 }
2153
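//
// Illustrative note (not from the original source): journal_open() is the
// counterpart used on subsequent mounts. Unlike journal_create() it keeps
// the on-disk header, replays any committed transactions, and only then
// sizes the tbuffer. A mount path might look like (hypothetical names):
//
//     jnl = journal_open(jvp, jnl_offset, jnl_size, fsvp, fs_block_size,
//                        0, 0, my_fs_flush, my_fs, mp);
//     if (jnl == NULL)
//             ; // bad header or failed replay; caller decides what to do
//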
2154
2155 int
2156 journal_is_clean(struct vnode *jvp,
2157 off_t offset,
2158 off_t journal_size,
2159 struct vnode *fsvp,
2160 size_t min_fs_block_size)
2161 {
2162 journal jnl;
2163 uint32_t phys_blksz;
2164 int ret;
2165 int orig_checksum, checksum;
2166 struct vfs_context context;
2167 const char *jdev_name = vnode_getname_printable(jvp);
2168
2169 context.vc_thread = current_thread();
2170 context.vc_ucred = FSCRED;
2171
2172 /* Get the real physical block size. */
2173 if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) {
2174 printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name);
2175 ret = EINVAL;
2176 goto cleanup_jdev_name;
2177 }
2178
2179 if (phys_blksz > (uint32_t)min_fs_block_size) {
2180 printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
2181 jdev_name, phys_blksz, min_fs_block_size);
2182 ret = EINVAL;
2183 goto cleanup_jdev_name;
2184 }
2185
2186 if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
2187 printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size);
2188 ret = EINVAL;
2189 goto cleanup_jdev_name;
2190 }
2191
2192 if ((journal_size % phys_blksz) != 0) {
2193 printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
2194 jdev_name, journal_size, phys_blksz);
2195 ret = EINVAL;
2196 goto cleanup_jdev_name;
2197 }
2198
2199 memset(&jnl, 0, sizeof(jnl));
2200
2201 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) {
2202 printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz);
2203 ret = ENOMEM;
2204 goto cleanup_jdev_name;
2205 }
2206 jnl.header_buf_size = phys_blksz;
2207
2208 get_io_info(jvp, phys_blksz, &jnl, &context);
2209
2210 jnl.jhdr = (journal_header *)jnl.header_buf;
2211 memset(jnl.jhdr, 0, sizeof(journal_header));
2212
2213 jnl.jdev = jvp;
2214 jnl.jdev_offset = offset;
2215 jnl.fsdev = fsvp;
2216
2217 // we have to set this up here so that do_journal_io() will work
2218 jnl.jhdr->jhdr_size = phys_blksz;
2219
2220 if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
2221 printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n",
2222 jdev_name, phys_blksz);
2223 ret = EINVAL;
2224 goto get_out;
2225 }
2226
2227 orig_checksum = jnl.jhdr->checksum;
2228 jnl.jhdr->checksum = 0;
2229
2230 if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
2231 // do this before the swap since it's done byte-at-a-time
2232 orig_checksum = SWAP32(orig_checksum);
2233 checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
2234 swap_journal_header(&jnl);
2235 jnl.flags |= JOURNAL_NEED_SWAP;
2236 } else {
2237 checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
2238 }
2239
2240 if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
2241 printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n",
2242 jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
2243 ret = EINVAL;
2244 goto get_out;
2245 }
2246
2247 if (orig_checksum != checksum) {
2248 printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum);
2249 ret = EINVAL;
2250 goto get_out;
2251 }
2252
2253 //
2254 // if the start and end are equal then the journal is clean.
2255 // otherwise it's not clean and therefore an error.
2256 //
2257 if (jnl.jhdr->start == jnl.jhdr->end) {
2258 ret = 0;
2259 } else {
2260 ret = EBUSY; // so the caller can differentiate an invalid journal from a "busy" one
2261 }
2262
2263 get_out:
2264 kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz);
2265 cleanup_jdev_name:
2266 vnode_putname_printable(jdev_name);
2267 return ret;
2268 }
2269
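//
// Illustrative sketch (hypothetical caller, not from the original source):
// a read-only mount can use journal_is_clean() to decide whether the
// volume can be mounted without replaying:
//
//     switch (journal_is_clean(jvp, jnl_offset, jnl_size, fsvp, fs_block_size)) {
//     case 0:     // start == end: nothing to replay
//             break;
//     case EBUSY: // valid journal, but it has pending transactions
//             break;
//     default:    // EINVAL/ENOMEM: not a usable journal
//             break;
//     }
//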
2270
2271 void
2272 journal_close(journal *jnl)
2273 {
2274 volatile off_t *start, *end;
2275 int counter=0;
2276
2277 CHECK_JOURNAL(jnl);
2278
2279 // set this before doing anything that would block so that
2280 // we start tearing things down properly.
2281 //
2282 jnl->flags |= JOURNAL_CLOSE_PENDING;
2283
2284 if (jnl->owner != current_thread()) {
2285 journal_lock(jnl);
2286 }
2287
2288 wait_condition(jnl, &jnl->flushing, "journal_close");
2289
2290 //
2291 // only write stuff to disk if the journal is still valid
2292 //
2293 if ((jnl->flags & JOURNAL_INVALID) == 0) {
2294
2295 if (jnl->active_tr) {
2296 /*
2297 * "journal_end_transaction" will fire the flush asynchronously
2298 */
2299 journal_end_transaction(jnl);
2300 }
2301
2302 // flush any buffered transactions
2303 if (jnl->cur_tr) {
2304 transaction *tr = jnl->cur_tr;
2305
2306 jnl->cur_tr = NULL;
2307 /*
2308 * "end_transaction" will wait for any in-progress flush to complete
2309 * before flushing "cur_tr" synchronously("must_wait" == TRUE)
2310 */
2311 end_transaction(tr, 1, NULL, NULL, FALSE, TRUE);
2312 }
2313 /*
2314 * if there was an "active_tr", make sure we wait for
2315 * it to flush if there was no "cur_tr" to process
2316 */
2317 wait_condition(jnl, &jnl->flushing, "journal_close");
2318
2319 //start = &jnl->jhdr->start;
2320 start = &jnl->active_start;
2321 end = &jnl->jhdr->end;
2322
2323 while (*start != *end && counter++ < 5000) {
2324 //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
2325 if (jnl->flush) {
2326 jnl->flush(jnl->flush_arg);
2327 }
2328 tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 2);
2329 }
2330
2331 if (*start != *end) {
2332 printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
2333 jnl->jdev_name, *start, *end);
2334 }
2335
2336 // make sure this is in sync when we close the journal
2337 jnl->jhdr->start = jnl->active_start;
2338
2339 // if this fails there's not much we can do at this point...
2340 write_journal_header(jnl, 1, jnl->sequence_num);
2341 } else {
2342 // if we're here the journal isn't valid any more.
2343 // so make sure we don't leave any locked blocks lying around
2344 printf("jnl: %s: close: journal is invalid. aborting outstanding transactions\n", jnl->jdev_name);
2345 if (jnl->active_tr || jnl->cur_tr) {
2346 transaction *tr;
2347
2348 if (jnl->active_tr) {
2349 tr = jnl->active_tr;
2350 jnl->active_tr = NULL;
2351 } else {
2352 tr = jnl->cur_tr;
2353 jnl->cur_tr = NULL;
2354 }
2355 abort_transaction(jnl, tr);
2356
2357 if (jnl->active_tr || jnl->cur_tr) {
2358 panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl);
2359 }
2360 }
2361 }
2362 wait_condition(jnl, &jnl->asyncIO, "journal_close");
2363
2364 free_old_stuff(jnl);
2365
2366 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
2367 jnl->jhdr = (void *)0xbeefbabe;
2368
2369 // Release reference on the mount
2370 if (jnl->fsmount)
2371 mount_drop(jnl->fsmount, 0);
2372
2373 vnode_putname_printable(jnl->jdev_name);
2374
2375 journal_unlock(jnl);
2376 lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group);
2377 lck_mtx_destroy(&jnl->jlock, jnl_mutex_group);
2378 lck_mtx_destroy(&jnl->flock, jnl_mutex_group);
2379 FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL);
2380 }
2381
2382 static void
2383 dump_journal(journal *jnl)
2384 {
2385 transaction *ctr;
2386
2387 printf("journal for dev %s:", jnl->jdev_name);
2388 printf(" jdev_offset %.8llx\n", jnl->jdev_offset);
2389 printf(" magic: 0x%.8x\n", jnl->jhdr->magic);
2390 printf(" start: 0x%.8llx\n", jnl->jhdr->start);
2391 printf(" end: 0x%.8llx\n", jnl->jhdr->end);
2392 printf(" size: 0x%.8llx\n", jnl->jhdr->size);
2393 printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size);
2394 printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size);
2395 printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum);
2396
2397 printf(" completed transactions:\n");
2398 for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) {
2399 printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
2400 }
2401 }
2402
2403
2404
2405 static off_t
2406 free_space(journal *jnl)
2407 {
2408 off_t free_space_offset;
2409
2410 if (jnl->jhdr->start < jnl->jhdr->end) {
2411 free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
2412 } else if (jnl->jhdr->start > jnl->jhdr->end) {
2413 free_space_offset = jnl->jhdr->start - jnl->jhdr->end;
2414 } else {
2415 // journal is completely empty
2416 free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size;
2417 }
2418
2419 return free_space_offset;
2420 }
2421
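//
// Illustrative sketch (not from the original source): the three cases
// below just measure the circular log. For a 0x100000-byte journal with
// a 0x200-byte header block:
//
//     start 0x0200, end 0x0200 -> empty:   0x100000 - 0x200          = 0xffe00
//     start 0x0400, end 0x9400 -> normal:  0x100000 - 0x9000 - 0x200 = 0xf6e00
//     start 0x9400, end 0x0400 -> wrapped: 0x9400 - 0x0400           = 0x9000
//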
2422
2423 //
2424 // The journal must be locked on entry to this function.
2425 // The "desired_size" is in bytes.
2426 //
2427 static int
2428 check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write, uint32_t sequence_num)
2429 {
2430 size_t i;
2431 int counter=0;
2432
2433 //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
2434 // desired_size, free_space(jnl));
2435
2436 if (delayed_header_write)
2437 *delayed_header_write = FALSE;
2438
2439 while (1) {
2440 int old_start_empty;
2441
2442 // make sure there's space in the journal to hold this transaction
2443 if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) {
2444 break;
2445 }
2446 if (counter++ == 5000) {
2447 dump_journal(jnl);
2448 panic("jnl: check_free_space: buffer flushing isn't working "
2449 "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl,
2450 jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
2451 }
2452 if (counter > 7500) {
2453 printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name);
2454 return ENOSPC;
2455 }
2456
2457 //
2458 // here's where we lazily bump up jnl->jhdr->start. we'll consume
2459 // entries until there is enough space for the next transaction.
2460 //
2461 old_start_empty = 1;
2462 lock_oldstart(jnl);
2463
2464 for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
2465 int lcl_counter;
2466
2467 lcl_counter = 0;
2468 while (jnl->old_start[i] & 0x8000000000000000LL) {
2469 if (lcl_counter++ > 10000) {
2470 panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
2471 jnl->old_start[i], jnl);
2472 }
2473
2474 unlock_oldstart(jnl);
2475 if (jnl->flush) {
2476 jnl->flush(jnl->flush_arg);
2477 }
2478 tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
2479 lock_oldstart(jnl);
2480 }
2481
2482 if (jnl->old_start[i] == 0) {
2483 continue;
2484 }
2485
2486 old_start_empty = 0;
2487 jnl->jhdr->start = jnl->old_start[i];
2488 jnl->old_start[i] = 0;
2489
2490 if (free_space(jnl) > desired_size) {
2491
2492 if (delayed_header_write)
2493 *delayed_header_write = TRUE;
2494 else {
2495 unlock_oldstart(jnl);
2496 write_journal_header(jnl, 1, sequence_num);
2497 lock_oldstart(jnl);
2498 }
2499 break;
2500 }
2501 }
2502 unlock_oldstart(jnl);
2503
2504 // if we bumped the start, loop and try again
2505 if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
2506 continue;
2507 } else if (old_start_empty) {
2508 //
2509 // if there is nothing in old_start anymore then we can
2510 // bump the jhdr->start to be the same as active_start
2511 // since it is possible there was only one very large
2512 // transaction in the old_start array. if we didn't do
2513 // this then jhdr->start would never get updated and we
2514 // would wind up looping until we hit the panic at the
2515 // start of the loop.
2516 //
2517 jnl->jhdr->start = jnl->active_start;
2518
2519 if (delayed_header_write)
2520 *delayed_header_write = TRUE;
2521 else
2522 write_journal_header(jnl, 1, sequence_num);
2523 continue;
2524 }
2525
2526
2527 // if the file system gave us a flush function, call it so that
2528 // it can flush some blocks which hopefully will cause some transactions
2529 // to complete and thus free up space in the journal.
2530 if (jnl->flush) {
2531 jnl->flush(jnl->flush_arg);
2532 }
2533
2534 // wait for a while to avoid being cpu-bound (this will
2535 // put us to sleep for 10 milliseconds)
2536 tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
2537 }
2538
2539 return 0;
2540 }
2541
2542 /*
2543 * Allocate a new active transaction.
2544 */
2545 static errno_t
2546 journal_allocate_transaction(journal *jnl)
2547 {
2548 transaction *tr;
2549 boolean_t was_vm_privileged = FALSE;
2550 kern_return_t retval;
2551
2552 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
2553 /*
2554 * the disk driver can allocate memory on this path...
2555 * if we block waiting for memory, and there is enough pressure to
2556 * cause us to try and create a new swap file, we may end up deadlocking
2557 * due to waiting for the journal on the swap file creation path...
2558 * by making ourselves vm_privileged, we give ourselves the best chance
2559 * of not blocking
2560 */
2561 was_vm_privileged = set_vm_privilege(TRUE);
2562 }
2563 MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK);
2564 memset(tr, 0, sizeof(transaction));
2565
2566 tr->tbuffer_size = jnl->tbuffer_size;
2567
2568 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size, VM_KERN_MEMORY_FILE);
2569
2570 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
2571 set_vm_privilege(FALSE);
2572
2573 if (retval) {
2574 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
2575 jnl->active_tr = NULL;
2576 return ENOMEM;
2577 }
2578
2579 // journal replay code checksum check depends on this.
2580 memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
2581 // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
2582 memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2583
2584 tr->blhdr = (block_list_header *)tr->tbuffer;
2585 tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2586 tr->blhdr->num_blocks = 1; // accounts for this header block
2587 tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
2588 tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER;
2589
2590 tr->sequence_num = ++jnl->sequence_num;
2591 tr->num_blhdrs = 1;
2592 tr->total_bytes = jnl->jhdr->blhdr_size;
2593 tr->jnl = jnl;
2594
2595 jnl->active_tr = tr;
2596
2597 return 0;
2598 }
2599
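//
// Illustrative sketch (not from the original source): for a 4096-byte
// blhdr_size and sizeof(block_info) == 16 (assumed), the header built
// above starts out as
//
//     max_blocks = 4096/16 - 1 = 255   // binfo[0] is reserved to chain headers
//     num_blocks = 1                   // the header block itself
//     bytes_used = 4096                // one blhdr_size worth of the tbuffer
//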
2600 int
2601 journal_start_transaction(journal *jnl)
2602 {
2603 int ret;
2604
2605 CHECK_JOURNAL(jnl);
2606
2607 free_old_stuff(jnl);
2608
2609 if (jnl->flags & JOURNAL_INVALID) {
2610 return EINVAL;
2611 }
2612 if (jnl->owner == current_thread()) {
2613 if (jnl->active_tr == NULL) {
2614 panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
2615 jnl, jnl->owner, current_thread());
2616 }
2617 jnl->nested_count++;
2618 return 0;
2619 }
2620
2621 journal_lock(jnl);
2622
2623 if (jnl->nested_count != 0 || jnl->active_tr != NULL) {
2624 panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
2625 jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
2626 }
2627
2628 jnl->nested_count = 1;
2629
2630 #if JOE
2631 // make sure there's room in the journal
2632 if (free_space(jnl) < jnl->tbuffer_size) {
2633
2634 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0);
2635
2636 // this is the call that really waits for space to free up
2637 // as well as updating jnl->jhdr->start
2638 if (check_free_space(jnl, jnl->tbuffer_size, NULL, jnl->sequence_num) != 0) {
2639 printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name);
2640 ret = ENOSPC;
2641 goto bad_start;
2642 }
2643 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, 0, 0, 0, 0);
2644 }
2645 #endif
2646
2647 // if there's a buffered transaction, use it.
2648 if (jnl->cur_tr) {
2649 jnl->active_tr = jnl->cur_tr;
2650 jnl->cur_tr = NULL;
2651
2652 return 0;
2653 }
2654
2655 ret = journal_allocate_transaction(jnl);
2656 if (ret) {
2657 goto bad_start;
2658 }
2659
2660 // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);
2661
2662 return 0;
2663
2664 bad_start:
2665 jnl->nested_count = 0;
2666 journal_unlock(jnl);
2667
2668 return ret;
2669 }
2670
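//
// Illustrative usage sketch (hypothetical caller, not from the original
// source): the normal per-transaction pattern a file system follows is
//
//     if (journal_start_transaction(jnl) == 0) {
//             journal_modify_block_start(jnl, bp);   // lock bp, flush if dirty
//             // ... modify the meta-data in bp ...
//             journal_modify_block_end(jnl, bp, NULL, NULL);
//             journal_end_transaction(jnl);          // defined later in this file
//     }
//
// Re-entrant calls from the owning thread just bump nested_count.
//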
2671
2672 int
2673 journal_modify_block_start(journal *jnl, struct buf *bp)
2674 {
2675 transaction *tr;
2676 boolean_t was_vm_privileged = FALSE;
2677
2678 CHECK_JOURNAL(jnl);
2679
2680
2681 free_old_stuff(jnl);
2682
2683 if (jnl->flags & JOURNAL_INVALID) {
2684 return EINVAL;
2685 }
2686
2687 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
2688 /*
2689 * if we block waiting for memory, and there is enough pressure to
2690 * cause us to try and create a new swap file, we may end up deadlocking
2691 * due to waiting for the journal on the swap file creation path...
2692 * by making ourselves vm_privileged, we give ourselves the best chance
2693 * of not blocking
2694 */
2695 was_vm_privileged = set_vm_privilege(TRUE);
2696 }
2697
2698 // XXXdbg - for debugging I want this to be true. later it may
2699 // not be necessary.
2700 if ((buf_flags(bp) & B_META) == 0) {
2701 panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl);
2702 }
2703
2704 tr = jnl->active_tr;
2705 CHECK_TRANSACTION(tr);
2706
2707 if (jnl->owner != current_thread()) {
2708 panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2709 jnl, jnl->owner, current_thread());
2710 }
2711
2712 //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
2713 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2714
2715 // can't allow blocks that aren't an even multiple of the
2716 // underlying block size.
2717 if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) {
2718 uint32_t phys_blksz, bad=0;
2719
2720 if (VNOP_IOCTL(jnl->jdev, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) {
2721 bad = 1;
2722 } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) {
2723 if (phys_blksz < 512) {
2724 panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n",
2725 phys_blksz, buf_size(bp), jnl->jhdr->jhdr_size);
2726 }
2727
2728 if ((buf_size(bp) % phys_blksz) != 0) {
2729 bad = 1;
2730 } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) {
2731 jnl->jhdr->jhdr_size = phys_blksz;
2732 } else {
2733 // the phys_blksz is now larger... need to realloc the jhdr
2734 char *new_header_buf;
2735
2736 printf("jnl: %s: phys blksz got bigger (was: %d/%d now %d)\n",
2737 jnl->jdev_name, jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz);
2738 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&new_header_buf, phys_blksz, VM_KERN_MEMORY_FILE)) {
2739 printf("jnl: modify_block_start: %s: create: phys blksz change (was %d, now %d) but could not allocate space for new header\n",
2740 jnl->jdev_name, jnl->jhdr->jhdr_size, phys_blksz);
2741 bad = 1;
2742 } else {
2743 memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size);
2744 memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size));
2745 kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size);
2746 jnl->header_buf = new_header_buf;
2747 jnl->header_buf_size = phys_blksz;
2748
2749 jnl->jhdr = (journal_header *)jnl->header_buf;
2750 jnl->jhdr->jhdr_size = phys_blksz;
2751 }
2752 }
2753 } else {
2754 bad = 1;
2755 }
2756
2757 if (bad) {
2758 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
2759 buf_size(bp), jnl->jhdr->jhdr_size);
2760
2761 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
2762 set_vm_privilege(FALSE);
2763 return -1;
2764 }
2765 }
2766
2767 // make sure that this transaction isn't bigger than the whole journal
2768 if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
2769 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
2770 tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp);
2771
2772 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
2773 set_vm_privilege(FALSE);
2774 return -1;
2775 }
2776
2777 // if the block is dirty and not already locked we have to write
2778 // it out before we muck with it because it has data that belongs
2779 // (presumably) to another transaction.
2780 //
2781 if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) {
2782
2783 if (buf_flags(bp) & B_ASYNC) {
2784 panic("modify_block_start: bp @ %p has async flag set!\n", bp);
2785 }
2786 if (bp->b_shadow_ref)
2787 panic("modify_block_start: dirty bp @ %p has shadows!\n", bp);
2788
2789 // this will cause it to not be buf_brelse()'d
2790 buf_setflags(bp, B_NORELSE);
2791 VNOP_BWRITE(bp);
2792 }
2793 buf_setflags(bp, B_LOCKED);
2794
2795 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
2796 set_vm_privilege(FALSE);
2797
2798 return 0;
2799 }
2800
2801 int
2802 journal_modify_block_abort(journal *jnl, struct buf *bp)
2803 {
2804 transaction *tr;
2805 block_list_header *blhdr;
2806 int i;
2807
2808 CHECK_JOURNAL(jnl);
2809
2810 free_old_stuff(jnl);
2811
2812 tr = jnl->active_tr;
2813
2814 //
2815 // if there's no active transaction then we just want to
2816 // call buf_brelse() and return since this is just a block
2817 // that happened to be modified as part of another tr.
2818 //
2819 if (tr == NULL) {
2820 buf_brelse(bp);
2821 return 0;
2822 }
2823
2824 if (jnl->flags & JOURNAL_INVALID) {
2825 /* Still need to buf_brelse(). Callers assume we consume the bp. */
2826 buf_brelse(bp);
2827 return EINVAL;
2828 }
2829
2830 CHECK_TRANSACTION(tr);
2831
2832 if (jnl->owner != current_thread()) {
2833 panic("jnl: modify_block_abort: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2834 jnl, jnl->owner, current_thread());
2835 }
2836
2837 // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
2838
2839 // first check if it's already part of this transaction
2840 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
2841 for (i = 1; i < blhdr->num_blocks; i++) {
2842 if (bp == blhdr->binfo[i].u.bp) {
2843 break;
2844 }
2845 }
2846
2847 if (i < blhdr->num_blocks) {
2848 break;
2849 }
2850 }
2851
2852 //
2853 // if blhdr is null, then this block has only had modify_block_start
2854 // called on it as part of the current transaction. that means that
2855 // it is ok to clear the LOCKED bit since it hasn't actually been
2856 // modified. if blhdr is non-null then modify_block_end was called
2857 // on it and so we need to keep it locked in memory.
2858 //
2859 if (blhdr == NULL) {
2860 buf_clearflags(bp, B_LOCKED);
2861 }
2862
2863 buf_brelse(bp);
2864 return 0;
2865 }
2866
2867
2868 int
2869 journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, void *arg), void *arg)
2870 {
2871 int i = 1;
2872 int tbuffer_offset=0;
2873 block_list_header *blhdr, *prev=NULL;
2874 transaction *tr;
2875
2876 CHECK_JOURNAL(jnl);
2877
2878 free_old_stuff(jnl);
2879
2880 if (jnl->flags & JOURNAL_INVALID) {
2881 /* Still need to buf_brelse(). Callers assume we consume the bp. */
2882 buf_brelse(bp);
2883 return EINVAL;
2884 }
2885
2886 tr = jnl->active_tr;
2887 CHECK_TRANSACTION(tr);
2888
2889 if (jnl->owner != current_thread()) {
2890 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
2891 jnl, jnl->owner, current_thread());
2892 }
2893
2894 //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n",
2895 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
2896
2897 if ((buf_flags(bp) & B_LOCKED) == 0) {
2898 panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp, jnl);
2899 }
2900
2901 // first check if it's already part of this transaction
2902 for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
2903 tbuffer_offset = jnl->jhdr->blhdr_size;
2904
2905 for (i = 1; i < blhdr->num_blocks; i++) {
2906 if (bp == blhdr->binfo[i].u.bp) {
2907 break;
2908 }
2909 if (blhdr->binfo[i].bnum != (off_t)-1) {
2910 tbuffer_offset += buf_size(blhdr->binfo[i].u.bp);
2911 } else {
2912 tbuffer_offset += blhdr->binfo[i].u.bi.bsize;
2913 }
2914 }
2915
2916 if (i < blhdr->num_blocks) {
2917 break;
2918 }
2919 }
2920
2921 if (blhdr == NULL
2922 && prev
2923 && (prev->num_blocks+1) <= prev->max_blocks
2924 && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) {
2925 blhdr = prev;
2926
2927 } else if (blhdr == NULL) {
2928 block_list_header *nblhdr;
2929 if (prev == NULL) {
2930 panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp);
2931 }
2932
2933 // we got to the end of the list, didn't find the block and there's
2934 // no room in the block_list_header pointed to by prev
2935
2936 // we allocate another tbuffer and link it in at the end of the list
2937 // through prev->binfo[0].bnum. that's a skanky way to do things but
2938 // avoids having yet another linked list of small data structures to manage.
2939
2940 if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size, VM_KERN_MEMORY_FILE)) {
2941 panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n",
2942 tr, tr->total_bytes);
2943 }
2944
2945 // journal replay code checksum check depends on this.
2946 memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
2947 // Fill up the rest of the block with unimportant bytes
2948 memset((char *)nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);
2949
2950 // initialize the new guy
2951 nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
2952 nblhdr->num_blocks = 1; // accounts for this header block
2953 nblhdr->bytes_used = jnl->jhdr->blhdr_size;
2954 nblhdr->flags = BLHDR_CHECK_CHECKSUMS;
2955
2956 tr->num_blhdrs++;
2957 tr->total_bytes += jnl->jhdr->blhdr_size;
2958
2959 // then link him in at the end
2960 prev->binfo[0].bnum = (off_t)((long)nblhdr);
2961
2962 // and finally switch to using the new guy
2963 blhdr = nblhdr;
2964 tbuffer_offset = jnl->jhdr->blhdr_size;
2965 i = 1;
2966 }
2967
2968
2969 if ((i+1) > blhdr->max_blocks) {
2970 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
2971 }
2972
2973 // if this is true then this is a new block we haven't seen
2974 if (i >= blhdr->num_blocks) {
2975 int bsize;
2976 vnode_t vp;
2977
2978 vp = buf_vnode(bp);
2979 if (vnode_ref(vp)) {
2980 // Nobody checks the return values, so...
2981 jnl->flags |= JOURNAL_INVALID;
2982
2983 buf_brelse(bp);
2984
2985 // We're probably here due to a force unmount, so EIO is appropriate
2986 return EIO;
2987 }
2988
2989 bsize = buf_size(bp);
2990
2991 blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp));
2992 blhdr->binfo[i].u.bp = bp;
2993
2994 KERNEL_DEBUG_CONSTANT(0x3018004, VM_KERNEL_ADDRPERM(vp), blhdr->binfo[i].bnum, bsize, 0, 0);
2995
2996 if (func) {
2997 void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL;
2998
2999 buf_setfilter(bp, func, arg, &old_func, &old_arg);
3000 if (old_func != NULL && old_func != func) {
3001 panic("jnl: modify_block_end: old func %p / arg %p (func %p)", old_func, old_arg, func);
3002 }
3003 }
3004
3005 blhdr->bytes_used += bsize;
3006 tr->total_bytes += bsize;
3007
3008 blhdr->num_blocks++;
3009 }
3010 buf_bdwrite(bp);
3011
3012 return 0;
3013 }
3014
3015 int
3016 journal_kill_block(journal *jnl, struct buf *bp)
3017 {
3018 int i;
3019 int bflags;
3020 block_list_header *blhdr;
3021 transaction *tr;
3022
3023 CHECK_JOURNAL(jnl);
3024
3025 free_old_stuff(jnl);
3026
3027 if (jnl->flags & JOURNAL_INVALID) {
3028 buf_brelse(bp);
3029 return 0;
3030 }
3031
3032 tr = jnl->active_tr;
3033 CHECK_TRANSACTION(tr);
3034
3035 if (jnl->owner != current_thread()) {
3036 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
3037 jnl, jnl->owner, current_thread());
3038 }
3039
3040 bflags = buf_flags(bp);
3041
3042 if ( !(bflags & B_LOCKED))
3043 panic("jnl: modify_block_end: called with bp not B_LOCKED");
3044
3045 /*
3046 * bp must be BL_BUSY and B_LOCKED
3047 * first check if it's already part of this transaction
3048 */
3049 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
3050
3051 for (i = 1; i < blhdr->num_blocks; i++) {
3052 if (bp == blhdr->binfo[i].u.bp) {
3053 vnode_t vp;
3054
3055 buf_clearflags(bp, B_LOCKED);
3056
3057 // this undoes the vnode_ref() in journal_modify_block_end()
3058 vp = buf_vnode(bp);
3059 vnode_rele_ext(vp, 0, 1);
3060
3061 // if the block has the DELWRI and FILTER bits set, then
3062 // things are seriously weird. if it was part of another
3063 // transaction then journal_modify_block_start() should
3064 // have forced it to be written.
3065 //
3066 //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
3067 // panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
3068 //} else {
3069 tr->num_killed += buf_size(bp);
3070 //}
3071 blhdr->binfo[i].bnum = (off_t)-1;
3072 blhdr->binfo[i].u.bp = NULL;
3073 blhdr->binfo[i].u.bi.bsize = buf_size(bp);
3074
3075 buf_markinvalid(bp);
3076 buf_brelse(bp);
3077
3078 return 0;
3079 }
3080 }
3081 }
3082
3083 /*
3084 * We did not find the block in any transaction buffer but we still
3085 * need to release it or else it will be left locked forever.
3086 */
3087 buf_brelse(bp);
3088
3089 return 0;
3090 }
3091
3092 /*
3093 ;________________________________________________________________________________
3094 ;
3095 ; Routine: journal_trim_set_callback
3096 ;
3097 ; Function: Provide the journal with a routine to be called back when a
3098 ; TRIM has (or would have) been issued to the device. That
3099 ; is, the transaction has been flushed to the device, and the
3100 ; blocks freed by the transaction are now safe for reuse.
3101 ;
3102 ; CAUTION: If the journal becomes invalid (e.g., due to an I/O
3103 ; error when trying to write to the journal), this callback
3104 ; will stop getting called, even if extents got freed before
3105 ; the journal became invalid!
3106 ;
3107 ; Input Arguments:
3108 ; jnl - The journal structure for the filesystem.
3109 ; callback - The function to call when the TRIM is complete.
3110 ; arg - An argument to be passed to callback.
3111 ;________________________________________________________________________________
3112 */
3113 __private_extern__ void
3114 journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg)
3115 {
3116 jnl->trim_callback = callback;
3117 jnl->trim_callback_arg = arg;
3118 }
3119
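//
// Illustrative sketch (hypothetical names; the callback signature is an
// assumption based on how jnl_trim_callback_t is used, not shown here):
//
//     static void my_trim_done(void *arg, uint32_t extent_count,
//                              const dk_extent_t *extents)
//     {
//             // e.g. add extents[0..extent_count) to a free extent cache
//     }
//
//     journal_trim_set_callback(jnl, my_trim_done, my_fs);
//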
3120
3121 /*
3122 ;________________________________________________________________________________
3123 ;
3124 ; Routine: trim_realloc
3125 ;
3126 ; Function: Increase the amount of memory allocated for the list of extents
3127 ; to be unmapped (trimmed). This routine will be called when
3128 ; adding an extent to the list, and the list already occupies
3129 ; all of the space allocated to it. This routine returns ENOMEM
3130 ; if unable to allocate more space, or 0 if the extent list was
3131 ; grown successfully.
3132 ;
3133 ; Input Arguments:
3134 ; trim - The trim list to be resized.
3135 ;
3136 ; Output:
3137 ; (result) - ENOMEM or 0.
3138 ;
3139 ; Side effects:
3140 ; The allocated_count and extents fields of trim are updated
3141 ; if the function returned 0.
3142 ;________________________________________________________________________________
3143 */
3144 static int
3145 trim_realloc(journal *jnl, struct jnl_trim_list *trim)
3146 {
3147 void *new_extents;
3148 uint32_t new_allocated_count;
3149 boolean_t was_vm_privileged = FALSE;
3150
3151 if (jnl_kdebug)
3152 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, VM_KERNEL_ADDRPERM(trim), 0, trim->allocated_count, trim->extent_count, 0);
3153
3154 new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS;
3155
3156 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
3157 /*
3158 * if we block waiting for memory, and there is enough pressure to
3159 * cause us to try and create a new swap file, we may end up deadlocking
3160 * due to waiting for the journal on the swap file creation path...
3161 * by making ourselves vm_privileged, we give ourselves the best chance
3162 * of not blocking
3163 */
3164 was_vm_privileged = set_vm_privilege(TRUE);
3165 }
3166 new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t));
3167 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
3168 set_vm_privilege(FALSE);
3169
3170 if (new_extents == NULL) {
3171 printf("jnl: trim_realloc: unable to grow extent list!\n");
3172 /*
3173 * Since we could be called when allocating space previously marked
3174 * to be trimmed, we need to empty out the list to be safe.
3175 */
3176 trim->extent_count = 0;
3177 if (jnl_kdebug)
3178 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, ENOMEM, 0, trim->allocated_count, 0, 0);
3179 return ENOMEM;
3180 }
3181
3182 /* Copy the old extent list to the newly allocated list. */
3183 if (trim->extents != NULL) {
3184 memmove(new_extents,
3185 trim->extents,
3186 trim->allocated_count * sizeof(dk_extent_t));
3187 kfree(trim->extents,
3188 trim->allocated_count * sizeof(dk_extent_t));
3189 }
3190
3191 trim->allocated_count = new_allocated_count;
3192 trim->extents = new_extents;
3193
3194 if (jnl_kdebug)
3195 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, 0, 0, new_allocated_count, trim->extent_count, 0);
3196
3197 return 0;
3198 }
3199
3200 /*
3201 ;________________________________________________________________________________
3202 ;
3203 ; Routine: trim_search_extent
3204 ;
3205 ; Function: Search the given extent list to see if any of its extents
3206 ; overlap the given extent.
3207 ;
3208 ; Input Arguments:
3209 ; trim - The trim list to be searched.
3210 ; offset - The first byte of the range to be searched for.
3211 ; length - The number of bytes of the extent being searched for.
3212 ; overlap_start - (output) start of the overlapping extent, if found
3213 ; overlap_len - (output) length of the overlapping extent, if found
3214 ;
3215 ; Output:
3216 ; (result) - TRUE if one or more extents overlap, FALSE otherwise.
3217 ;________________________________________________________________________________
3218 */
3219 static int
3220 trim_search_extent(struct jnl_trim_list *trim, uint64_t offset,
3221 uint64_t length, uint64_t *overlap_start, uint64_t *overlap_len)
3222 {
3223 uint64_t end = offset + length;
3224 uint32_t lower = 0; /* Lowest index to search */
3225 uint32_t upper = trim->extent_count; /* Highest index to search + 1 */
3226 uint32_t middle;
3227
3228 /* A binary search over the extent list. */
3229 while (lower < upper) {
3230 middle = (lower + upper) / 2;
3231
3232 if (trim->extents[middle].offset >= end)
3233 upper = middle;
3234 else if (trim->extents[middle].offset + trim->extents[middle].length <= offset)
3235 lower = middle + 1;
3236 else {
3237 if (overlap_start) {
3238 *overlap_start = trim->extents[middle].offset;
3239 }
3240 if (overlap_len) {
3241 *overlap_len = trim->extents[middle].length;
3242 }
3243 return TRUE;
3244 }
3245 }
3246
3247 return FALSE;
3248 }
3249
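//
// Illustrative sketch (not from the original source): with a sorted,
// non-overlapping list such as
//
//     extents[] = { {0x1000, 0x100}, {0x4000, 0x200}, {0x9000, 0x100} }
//
// a query of offset 0x4100, length 0x40 probes the middle entry, sees
// that it neither starts at/after 0x4140 nor ends at/before 0x4100, and
// returns TRUE with *overlap_start == 0x4000, *overlap_len == 0x200.
// A query of offset 0x2000, length 0x100 narrows lower/upper to an empty
// range and returns FALSE.
//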
3250
3251 /*
3252 ;________________________________________________________________________________
3253 ;
3254 ; Routine: journal_trim_add_extent
3255 ;
3256 ; Function: Keep track of extents that have been freed as part of this
3257 ; transaction. If the underlying device supports TRIM (UNMAP),
3258 ; then those extents will be trimmed/unmapped once the
3259 ; transaction has been written to the journal. (For example,
3260 ; SSDs can support trim/unmap and avoid having to recopy those
3261 ; blocks when doing wear leveling, and may reuse the same
3262 ; physical blocks for different logical blocks.)
3263 ;
3264 ; HFS also uses this, in combination with journal_trim_set_callback,
3265 ; to add recently freed extents to its free extent cache, but
3266 ; only after the transaction that freed them is committed to
3267 ; disk. (This reduces the chance of overwriting live data in
3268 ; a way that causes data loss if a transaction never gets
3269 ; written to the journal.)
3270 ;
3271 ; Input Arguments:
3272 ; jnl - The journal for the volume containing the byte range.
3273 ; offset - The first byte of the range to be trimmed.
3274 ; length - The number of bytes of the extent being trimmed.
3275 ;________________________________________________________________________________
3276 */
3277 __private_extern__ int
3278 journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
3279 {
3280 uint64_t end;
3281 transaction *tr;
3282 dk_extent_t *extent;
3283 uint32_t insert_index;
3284 uint32_t replace_count;
3285
3286 CHECK_JOURNAL(jnl);
3287
3288 /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */
3289 if (jnl->flags & JOURNAL_INVALID) {
3290 return EINVAL;
3291 }
3292
3293 tr = jnl->active_tr;
3294 CHECK_TRANSACTION(tr);
3295
3296 if (jnl_kdebug)
3297 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
3298
3299 if (jnl->owner != current_thread()) {
3300 panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
3301 jnl, jnl->owner, current_thread());
3302 }
3303
3304 free_old_stuff(jnl);
3305
3306 end = offset + length;
3307
3308 /*
3309 * Find the range of existing extents that can be combined with the
3310 * input extent. We start by counting the number of extents that end
3311 * strictly before the input extent, then count the number of extents
3312 * that overlap or are contiguous with the input extent.
3313 */
3314 extent = tr->trim.extents;
3315 insert_index = 0;
3316 while (insert_index < tr->trim.extent_count && extent->offset + extent->length < offset) {
3317 ++insert_index;
3318 ++extent;
3319 }
3320 replace_count = 0;
3321 while (insert_index + replace_count < tr->trim.extent_count && extent->offset <= end) {
3322 ++replace_count;
3323 ++extent;
3324 }
3325
3326 /*
3327 * If none of the existing extents can be combined with the input extent,
3328 * then just insert it in the list (before item number insert_index).
3329 */
3330 if (replace_count == 0) {
3331 /* If the list was already full, we need to grow it. */
3332 if (tr->trim.extent_count == tr->trim.allocated_count) {
3333 if (trim_realloc(jnl, &tr->trim) != 0) {
3334 printf("jnl: trim_add_extent: out of memory!");
3335 if (jnl_kdebug)
3336 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0);
3337 return ENOMEM;
3338 }
3339 }
3340
3341 /* Shift any existing extents with larger offsets. */
3342 if (insert_index < tr->trim.extent_count) {
3343 memmove(&tr->trim.extents[insert_index+1],
3344 &tr->trim.extents[insert_index],
3345 (tr->trim.extent_count - insert_index) * sizeof(dk_extent_t));
3346 }
3347 tr->trim.extent_count++;
3348
3349 /* Store the new extent in the list. */
3350 tr->trim.extents[insert_index].offset = offset;
3351 tr->trim.extents[insert_index].length = length;
3352
3353 /* We're done. */
3354 if (jnl_kdebug)
3355 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0);
3356 return 0;
3357 }
3358
3359 /*
3360 * Update extent number insert_index to be the union of the input extent
3361 * and all of the replaced extents.
3362 */
3363 if (tr->trim.extents[insert_index].offset < offset)
3364 offset = tr->trim.extents[insert_index].offset;
3365 extent = &tr->trim.extents[insert_index + replace_count - 1];
3366 if (extent->offset + extent->length > end)
3367 end = extent->offset + extent->length;
3368 tr->trim.extents[insert_index].offset = offset;
3369 tr->trim.extents[insert_index].length = end - offset;
3370
3371 /*
3372 * If we were replacing more than one existing extent, then shift any
3373 * extents with larger offsets, and update the count of extents.
3374 *
3375 * We're going to leave extent #insert_index alone since it was just updated, above.
3376 * We need to move extents from index (insert_index + replace_count) through the end of
3377 * the list by (replace_count - 1) positions so that they overwrite extent #(insert_index + 1).
3378 */
3379 if (replace_count > 1 && (insert_index + replace_count) < tr->trim.extent_count) {
3380 memmove(&tr->trim.extents[insert_index + 1],
3381 &tr->trim.extents[insert_index + replace_count],
3382 (tr->trim.extent_count - insert_index - replace_count) * sizeof(dk_extent_t));
3383 }
3384 tr->trim.extent_count -= replace_count - 1;
3385
3386 if (jnl_kdebug)
3387 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0);
3388 return 0;
3389 }
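
/*
 * Usage sketch (illustrative only; the block numbers and size are made
 * up): an FS freeing eight contiguous 4 KiB blocks inside a transaction
 * might do:
 *
 *	journal_start_transaction(jnl);
 *	// ... clear blocks 100..107 in the allocation bitmap ...
 *	journal_trim_add_extent(jnl, 100 * 4096ULL, 8 * 4096ULL);
 *	journal_end_transaction(jnl);
 *
 * The extent is only unmapped (and reported through the trim callback)
 * after the transaction has been committed to the on-disk journal.
 */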
3390
3391 /*
3392 * journal_trim_extent_overlap
3393 *
3394 * Return 1 if there are any pending TRIMs that overlap with the given offset and length.
3395 * Return 0 otherwise.
3396 */
3397
3398 int journal_trim_extent_overlap (journal *jnl, uint64_t offset, uint64_t length, uint64_t *end) {
3399 transaction *tr = NULL;
3400 int overlap = 0;
3401
3402 uint64_t overlap_start;
3403 uint64_t overlap_len;
3404 tr = jnl->active_tr;
3405 CHECK_TRANSACTION(tr);
3406
3407 /*
3408 * There are two lists that need to be examined for potential overlaps:
3409 *
3410 * The first is the current transaction. Since this function requires that
3411 * a transaction be active when this is called, this is the "active_tr"
3412 * pointer in the journal struct. This has a trimlist pointer which needs
3413 * to be searched.
3414 */
3415 overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len);
3416 if (overlap == 0) {
3417 /*
3418 * The second is the async trim list, which is only done if the current
3419 * transaction group (active transaction) did not overlap with our target
3420 * extent. This async trim list is the set of all previously
3421 * committed transaction groups whose I/Os are now in-flight. We need to hold the
3422 * trim lock in order to search this list. If we grab the list before the
3423 * TRIM has completed, then we will search it. If it is grabbed AFTER the
3424 * TRIM has completed, then the pointer will be zeroed out and we won't have
3425 * to check anything.
3426 */
3427 lck_rw_lock_shared (&jnl->trim_lock);
3428 if (jnl->async_trim != NULL) {
3429 overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len);
3430 }
3431 lck_rw_unlock_shared (&jnl->trim_lock);
3432 }
3433
3434 if (overlap) {
3435 /* compute the end (min) of the overlapping range */
3436 if ( (overlap_start + overlap_len) < (offset + length)) {
3437 *end = (overlap_start + overlap_len);
3438 }
3439 else {
3440 *end = (offset + length);
3441 }
3442 }
3443
3444
3445 return overlap;
3446 }
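
/*
 * Illustrative use (hypothetical caller): before overwriting
 * [offset, offset+length), clamp the range against any pending TRIM.
 * Note that *end receives the smaller of the overlap's end and the
 * input range's end:
 *
 *	uint64_t end;
 *	if (journal_trim_extent_overlap(jnl, offset, length, &end)) {
 *		// some of [offset, end) still has an unmap in flight
 *	}
 */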
3447
3448 /*
3449 * journal_request_immediate_flush
3450 *
3451 * FS requests that the journal flush immediately upon the
3452 * active transaction's completion.
3453 *
3454 * Returns 0 if the operation succeeds.
3455 * Returns EPERM if we failed to leave the hint.
3456 */
3457 int
3458 journal_request_immediate_flush (journal *jnl) {
3459
3460 transaction *tr = NULL;
3461 /*
3462 * Is a transaction still in progress? This must be
3463 * called while a transaction is open.
3464 */
3465 tr = jnl->active_tr;
3466 if (tr != NULL) {
3467 CHECK_TRANSACTION(tr);
3468 tr->flush_on_completion = TRUE;
3469 }
3470 else {
3471 return EPERM;
3472 }
3473 return 0;
3474 }
3475
3476
3477
3478 /*
3479 ;________________________________________________________________________________
3480 ;
3481 ; Routine: trim_remove_extent
3482 ;
3483 ; Function: Indicate that a range of bytes, some of which may have previously
3484 ; been passed to journal_trim_add_extent, is now allocated.
3485 ; Any overlapping ranges currently in the journal's trim list will
3486 ; be removed. If the underlying device supports TRIM (UNMAP), then
3487 ; these extents will not be trimmed/unmapped when the transaction
3488 ; is written to the journal.
3489 ;
3490 ; HFS also uses this to prevent newly allocated space from being
3491 ; added to its free extent cache (if some portion of the newly
3492 ; allocated space was recently freed).
3493 ;
3494 ; Input Arguments:
3495 ; trim - The trim list to update.
3496 ; offset - The first byte of the range to be trimmed.
3497 ; length - The number of bytes of the extent being trimmed.
3498 ;________________________________________________________________________________
3499 */
3500 static int
3501 trim_remove_extent(journal *jnl, struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
3502 {
3503 u_int64_t end;
3504 dk_extent_t *extent;
3505 u_int32_t keep_before;
3506 u_int32_t keep_after;
3507
3508 end = offset + length;
3509
3510 /*
3511 * Find any existing extents that start before or end after the input
3512 * extent. These extents will be modified if they overlap the input
3513 * extent. Other extents between them will be deleted.
3514 */
3515 extent = trim->extents;
3516 keep_before = 0;
3517 while (keep_before < trim->extent_count && extent->offset < offset) {
3518 ++keep_before;
3519 ++extent;
3520 }
3521 keep_after = keep_before;
3522 if (keep_after > 0) {
3523 /* See if previous extent extends beyond both ends of input extent. */
3524 --keep_after;
3525 --extent;
3526 }
3527 while (keep_after < trim->extent_count && (extent->offset + extent->length) <= end) {
3528 ++keep_after;
3529 ++extent;
3530 }
3531
3532 /*
3533 * When we get here, the first keep_before extents (0 .. keep_before-1)
3534 * start before the input extent, and extents (keep_after .. extent_count-1)
3535 * end after the input extent. We'll need to keep all of those extents,
3536 * but possibly modify #(keep_before-1) and #keep_after to remove the portion
3537 * that overlaps with the input extent.
3538 */
3539
3540 /*
3541 * Does the input extent start after and end before the same existing
3542 * extent? If so, we have to "punch a hole" in that extent and convert
3543 * it to two separate extents.
3544 */
3545 if (keep_before > keep_after) {
3546 /* If the list was already full, we need to grow it. */
3547 if (trim->extent_count == trim->allocated_count) {
3548 if (trim_realloc(jnl, trim) != 0) {
3549 printf("jnl: trim_remove_extent: out of memory!");
3550 return ENOMEM;
3551 }
3552 }
3553
3554 /*
3555 * Make room for a new extent by shifting extents #keep_after and later
3556 * up by one position. When we're done, extents #keep_before and
3557 * #keep_after will be identical, and we can fall through to removing
3558 * the portion that overlaps the input extent.
3559 */
3560 memmove(&trim->extents[keep_before],
3561 &trim->extents[keep_after],
3562 (trim->extent_count - keep_after) * sizeof(dk_extent_t));
3563 ++trim->extent_count;
3564 ++keep_after;
3565
3566 /*
3567 * Fall through. We now have the case where the length of extent
3568 * #(keep_before - 1) needs to be updated, and the start of extent
3569 * #(keep_after) needs to be updated.
3570 */
3571 }
3572
3573 /*
3574 * May need to truncate the end of extent #(keep_before - 1) if it overlaps
3575 * the input extent.
3576 */
3577 if (keep_before > 0) {
3578 extent = &trim->extents[keep_before - 1];
3579 if (extent->offset + extent->length > offset) {
3580 extent->length = offset - extent->offset;
3581 }
3582 }
3583
3584 /*
3585 * May need to update the start of extent #(keep_after) if it overlaps the
3586 * input extent.
3587 */
3588 if (keep_after < trim->extent_count) {
3589 extent = &trim->extents[keep_after];
3590 if (extent->offset < end) {
3591 extent->length = extent->offset + extent->length - end;
3592 extent->offset = end;
3593 }
3594 }
3595
3596 /*
3597 * If there were whole extents that overlapped the input extent, get rid
3598 * of them by shifting any following extents, and updating the count.
3599 */
3600 if (keep_after > keep_before && keep_after < trim->extent_count) {
3601 memmove(&trim->extents[keep_before],
3602 &trim->extents[keep_after],
3603 (trim->extent_count - keep_after) * sizeof(dk_extent_t));
3604 }
3605 trim->extent_count -= keep_after - keep_before;
3606
3607 return 0;
3608 }
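
/*
 * Worked example (illustrative): removing offset=40, length=20 from a
 * trim list holding the single extent {0,100} takes the hole-punch path
 * above: keep_before = 1 and keep_after = 0, so the extent is
 * duplicated, the first copy is truncated to {0,40}, and the second has
 * its start moved up to yield {60,40}, leaving { {0,40}, {60,40} }.
 */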
3609
3610 /*
3611 ;________________________________________________________________________________
3612 ;
3613 ; Routine: journal_trim_remove_extent
3614 ;
3615 ; Function: Make note that a range of bytes, some of which may have previously
3616 ; been passed to journal_trim_add_extent, is now in use on the
3617 ; volume. The given bytes will not be trimmed as part of
3618 ; this transaction, nor as part of a pending trim of a
3619 ; transaction being asynchronously flushed.
3620 ;
3621 ; Input Arguments:
3622 ; jnl - The journal for the volume containing the byte range.
3623 ; offset - The first byte of the range to be trimmed.
3624 ; length - The number of bytes of the extent being trimmed.
3625 ;________________________________________________________________________________
3626 */
3627 __private_extern__ int
3628 journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
3629 {
3630 int error = 0;
3631 transaction *tr;
3632
3633 CHECK_JOURNAL(jnl);
3634
3635 /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */
3636 if (jnl->flags & JOURNAL_INVALID) {
3637 return EINVAL;
3638 }
3639
3640 tr = jnl->active_tr;
3641 CHECK_TRANSACTION(tr);
3642
3643 if (jnl_kdebug)
3644 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, tr->trim.extent_count, 0);
3645
3646 if (jnl->owner != current_thread()) {
3647 panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
3648 jnl, jnl->owner, current_thread());
3649 }
3650
3651 free_old_stuff(jnl);
3652
3653 error = trim_remove_extent(jnl, &tr->trim, offset, length);
3654 if (error == 0) {
3655 int found = FALSE;
3656
3657 /*
3658 * See if a pending trim has any extents that overlap with the
3659 * one we were given.
3660 */
3661 lck_rw_lock_shared(&jnl->trim_lock);
3662 if (jnl->async_trim != NULL)
3663 found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL);
3664 lck_rw_unlock_shared(&jnl->trim_lock);
3665
3666 if (found) {
3667 /*
3668 * There was an overlap, so avoid trimming the extent we
3669 * just allocated. (Otherwise, it might get trimmed after
3670 * we've written to it, which will cause that data to be
3671 * corrupted.)
3672 */
3673 uint32_t async_extent_count = 0;
3674
3675 if (jnl_kdebug)
3676 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), offset, length, 0, 0);
3677 lck_rw_lock_exclusive(&jnl->trim_lock);
3678 if (jnl->async_trim != NULL) {
3679 error = trim_remove_extent(jnl, jnl->async_trim, offset, length);
3680 async_extent_count = jnl->async_trim->extent_count;
3681 }
3682 lck_rw_unlock_exclusive(&jnl->trim_lock);
3683 if (jnl_kdebug)
3684 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_END, error, 0, 0, async_extent_count, 0);
3685 }
3686 }
3687
3688 if (jnl_kdebug)
3689 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_END, error, 0, 0, tr->trim.extent_count, 0);
3690 return error;
3691 }
3692
3693
3694 static int
3695 journal_trim_flush(journal *jnl, transaction *tr)
3696 {
3697 int errno = 0;
3698 boolean_t was_vm_privileged = FALSE;
3699
3700 if (jnl_kdebug)
3701 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
3702
3703 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
3704 /*
3705 * the disk driver can allocate memory on this path...
3706 * if we block waiting for memory, and there is enough pressure to
3707 * cause us to try and create a new swap file, we may end up deadlocking
3708 * due to waiting for the journal on the swap file creation path...
3709 * by making ourselves vm_privileged, we give ourselves the best chance
3710 * of not blocking
3711 */
3712 was_vm_privileged = set_vm_privilege(TRUE);
3713 }
3714 lck_rw_lock_shared(&jnl->trim_lock);
3715 if (tr->trim.extent_count > 0) {
3716 dk_unmap_t unmap;
3717
3718 bzero(&unmap, sizeof(unmap));
3719 if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) {
3720 unmap.extents = tr->trim.extents;
3721 unmap.extentsCount = tr->trim.extent_count;
3722 if (jnl_kdebug)
3723 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, VM_KERNEL_ADDRPERM(jnl), tr, 0, tr->trim.extent_count, 0);
3724 errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel());
3725 if (jnl_kdebug)
3726 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0);
3727 }
3728
3729 /*
3730 * Call back into the file system to tell them that we have
3731 * trimmed some extents and that they can now be reused.
3732 *
3733 * CAUTION: If the journal becomes invalid (e.g., due to an I/O
3734 * error when trying to write to the journal), this callback
3735 * will stop getting called, even if extents got freed before
3736 * the journal became invalid!
3737 */
3738 if (jnl->trim_callback)
3739 jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents);
3740 }
3741 lck_rw_unlock_shared(&jnl->trim_lock);
3742
3743 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
3744 set_vm_privilege(FALSE);
3745 /*
3746 * If the transaction we're flushing was the async transaction, then
3747 * tell the current transaction that there is no pending trim
3748 * any more.
3749 *
3750 * NOTE: Since we released the lock, another thread could have
3751 * removed one or more extents from our list. That's not a
3752 * problem since any writes to the re-allocated blocks
3753 * would get sent to the device after the DKIOCUNMAP.
3754 */
3755 lck_rw_lock_exclusive(&jnl->trim_lock);
3756 if (jnl->async_trim == &tr->trim)
3757 jnl->async_trim = NULL;
3758 lck_rw_unlock_exclusive(&jnl->trim_lock);
3759
3760 /*
3761 * By the time we get here, no other thread can discover the address
3762 * of "tr", so it is safe for us to manipulate tr->trim without
3763 * holding any locks.
3764 */
3765 if (tr->trim.extents) {
3766 kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
3767 tr->trim.allocated_count = 0;
3768 tr->trim.extent_count = 0;
3769 tr->trim.extents = NULL;
3770 }
3771
3772 if (jnl_kdebug)
3773 KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_END, errno, 0, 0, 0, 0);
3774
3775 return errno;
3776 }
3777
3778 static int
3779 journal_binfo_cmp(const void *a, const void *b)
3780 {
3781 const block_info *bi_a = (const struct block_info *)a;
3782 const block_info *bi_b = (const struct block_info *)b;
3783 daddr64_t res;
3784
3785 if (bi_a->bnum == (off_t)-1) {
3786 return 1;
3787 }
3788 if (bi_b->bnum == (off_t)-1) {
3789 return -1;
3790 }
3791
3792 // don't have to worry about negative block
3793 // numbers so this is ok to do.
3794 //
3795 res = (buf_blkno(bi_a->u.bp) - buf_blkno(bi_b->u.bp));
3796
3797 return (int)res;
3798 }
3799
3800
3801 /*
3802 * End a transaction. If the transaction is small enough, and we're not forcing
3803 * a write to disk, the "active" transaction becomes the "current" transaction,
3804 * and will be reused for the next transaction that is started (group commit).
3805 *
3806 * If the transaction gets written to disk (because force_it is true, or no
3807 * group commit, or the transaction is sufficiently full), the blocks get
3808 * written into the journal first, then they are written asynchronously. When
3809 * those async writes complete, the transaction can be freed and removed from
3810 * the journal.
3811 *
3812 * An optional callback can be supplied. If given, it is called after the
3813 * blocks have been written to the journal, but before the async writes
3814 * of those blocks to their normal on-disk locations. This is used by
3815 * journal_relocate so that the location of the journal can be changed and
3816 * flushed to disk before the blocks get written to their normal locations.
3817 * Note that the callback is only called if the transaction gets written to
3818 * the journal during this end_transaction call; you probably want to set the
3819 * force_it flag.
3820 *
3821 * Inputs:
3822 * tr Transaction to add to the journal
3823 * force_it If true, force this transaction to the on-disk journal immediately.
3824 * callback See description above. Pass NULL for no callback.
3825 * callback_arg Argument passed to callback routine.
3826 *
3827 * Result
3828 * 0 No errors
3829 * -1 An error occurred. The journal is marked invalid.
3830 */
3831 static int
3832 end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait)
3833 {
3834 block_list_header *blhdr=NULL, *next=NULL;
3835 int i, ret_val = 0;
3836 errno_t errno;
3837 journal *jnl = tr->jnl;
3838 struct buf *bp;
3839 size_t tbuffer_offset;
3840 boolean_t drop_lock_early;
3841
3842 if (jnl->cur_tr) {
3843 panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
3844 jnl, jnl->cur_tr, tr);
3845 }
3846
3847 // if there weren't any modified blocks in the transaction
3848 // just save off the transaction pointer and return.
3849 if (tr->total_bytes == jnl->jhdr->blhdr_size) {
3850 jnl->cur_tr = tr;
3851 goto done;
3852 }
3853
3854 // if our transaction buffer isn't very full, just hang
3855 // on to it and don't actually flush anything. this is
3856 // what is known as "group commit". we will flush the
3857 // transaction buffer if it's full or if we have more than
3858 // one of them so we don't start hogging too much memory.
3859 //
3860 // We also check the device supports UNMAP/TRIM, and if so,
3861 // the number of extents waiting to be trimmed. If it is
3862 // small enough, then keep accumulating more (so we can
3863 // reduce the overhead of trimming). If there was a prior
3864 // trim error, then we stop issuing trims for this
3865 // volume, so we can also coalesce transactions.
3866 //
3867 if ( force_it == 0
3868 && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
3869 && tr->num_blhdrs < 3
3870 && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))
3871 && (!(jnl->flags & JOURNAL_USE_UNMAP) || (tr->trim.extent_count < jnl_trim_flush_limit))) {
3872
3873 jnl->cur_tr = tr;
3874 goto done;
3875 }
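
	/*
	 * (Illustrative numbers only: with a 128 KiB tbuffer and a single
	 * block list header, the test above keeps buffering until
	 * total_bytes exceeds 128 KiB - 16 KiB = 112 KiB, a third blhdr
	 * is needed, or -- when UNMAP is in use -- the trim list reaches
	 * jnl_trim_flush_limit extents.)
	 */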
3876
3877 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_START, jnl, tr, drop_lock, must_wait, 0);
3878
3879 lock_condition(jnl, &jnl->flushing, "end_transaction");
3880
3881 /*
3882 * if the previous 'finish_end_transaction' was being run
3883 * asynchronously, it could have encountered a condition
3884 * that caused it to mark the journal invalid... if that
3885 * occurred while we were waiting for it to finish, we
3886 * need to notice and abort the current transaction
3887 */
3888 if ((jnl->flags & JOURNAL_INVALID) || jnl->flush_aborted == TRUE) {
3889 unlock_condition(jnl, &jnl->flushing);
3890
3891 abort_transaction(jnl, tr);
3892 ret_val = -1;
3893 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
3894 goto done;
3895 }
3896
3897 /*
3898 * Store a pointer to this transaction's trim list so that
3899 * future transactions can find it.
3900 *
3901 * Note: if there are no extents in the trim list, then don't
3902 * bother saving the pointer since nothing can add new extents
3903 * to the list (and other threads/transactions only care if
3904 * there is a trim pending).
3905 */
3906 lck_rw_lock_exclusive(&jnl->trim_lock);
3907 if (jnl->async_trim != NULL)
3908 panic("jnl: end_transaction: async_trim already non-NULL!");
3909 if (tr->trim.extent_count > 0)
3910 jnl->async_trim = &tr->trim;
3911 lck_rw_unlock_exclusive(&jnl->trim_lock);
3912
3913 /*
3914 * snapshot the transaction sequence number while we are still behind
3915 * the journal lock since it will be bumped upon the start of the
3916 * next transaction group which may overlap the current journal flush...
3917 * we pass the snapshot into write_journal_header during the journal
3918 * flush so that it can write the correct version in the header...
3919 * because we hold the 'flushing' condition variable for the duration
3920 * of the journal flush, 'saved_sequence_num' remains stable
3921 */
3922 jnl->saved_sequence_num = jnl->sequence_num;
3923
3924 /*
3925 * if we're here we're going to flush the transaction buffer to disk.
3926 * 'check_free_space' will not return until there is enough free
3927 * space for this transaction in the journal and jnl->old_start[0]
3928 * is available for use
3929 */
3930 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0);
3931
3932 check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num);
3933
3934 KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, tr->delayed_header_write, 0, 0, 0);
3935
3936 // range check the end index
3937 if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
3938 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
3939 jnl->jhdr->end, jnl->jhdr->size);
3940 }
3941 if (tr->delayed_header_write == TRUE) {
3942 thread_t thread = THREAD_NULL;
3943
3944 lock_condition(jnl, &jnl->writing_header, "end_transaction");
3945 /*
3946 * fire up a thread to write the journal header
3947 * asynchronously... when it finishes, it will call
3948 * unlock_condition... we can overlap the preparation of
3949 * the log and buffers during this time
3950 */
3951 kernel_thread_start((thread_continue_t)write_header_thread, jnl, &thread);
3952 } else
3953 jnl->write_header_failed = FALSE;
3954
3955
3956 // this transaction starts where the current journal ends
3957 tr->journal_start = jnl->jhdr->end;
3958
3959 lock_oldstart(jnl);
3960 /*
3961 * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy.
3962 * Slide everyone else down and put our latest guy in the last
3963 * entry in the old_start array.
3964 */
3965 memcpy(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void *), sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
3966 jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
3967
3968 unlock_oldstart(jnl);
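
	/*
	 * (The top bit set on the old_start entry above marks the
	 * transaction as still in flight; it is cleared once all of the
	 * transaction's buffers have been flushed -- see
	 * buffer_flushed_callback -- or stripped if the journal is
	 * invalidated.)
	 */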
3969
3970
3971 for (blhdr = tr->blhdr; blhdr; blhdr = next) {
3972 char *blkptr;
3973 buf_t sbp;
3974 int32_t bsize;
3975
3976 tbuffer_offset = jnl->jhdr->blhdr_size;
3977
3978 for (i = 1; i < blhdr->num_blocks; i++) {
3979
3980 if (blhdr->binfo[i].bnum != (off_t)-1) {
3981 void (*func)(buf_t, void *);
3982 void *arg;
3983
3984 bp = blhdr->binfo[i].u.bp;
3985
3986 if (bp == NULL) {
3987 panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
3988 blhdr->binfo[i].bnum, jnl, tr);
3989 }
3990 /*
3991 * acquire the bp here so that we can safely
3992 * mess around with its data. buf_acquire()
3993 * will return EAGAIN if the buffer was busy,
3994 * so loop trying again.
3995 */
3996 do {
3997 errno = buf_acquire(bp, BAC_REMOVE, 0, 0);
3998 } while (errno == EAGAIN);
3999
4000 if (errno)
4001 panic("could not acquire bp %p (err %d)\n", bp, errno);
4002
4003 if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
4004 if (jnl->flags & JOURNAL_CLOSE_PENDING) {
4005 buf_clearflags(bp, B_LOCKED);
4006 buf_brelse(bp);
4007
4008 /*
4009 * this is an odd case that appears to happen occasionally.
4010 * make sure we mark this block as no longer valid
4011 * so that we don't process it in "finish_end_transaction", since
4012 * the bp that is recorded in our array no longer belongs
4013 * to us (normally we substitute a shadow bp to be processed);
4014 * issuing a 'buf_bawrite' on a stale buf_t pointer leads
4015 * to all kinds of problems.
4016 */
4017 blhdr->binfo[i].bnum = (off_t)-1;
4018 continue;
4019 } else {
4020 panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp));
4021 }
4022 }
4023 bsize = buf_size(bp);
4024
4025 buf_setfilter(bp, NULL, NULL, &func, &arg);
4026
4027 blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
4028
4029 sbp = buf_create_shadow_priv(bp, FALSE, (uintptr_t)blkptr, 0, 0);
4030
4031 if (sbp == NULL)
4032 panic("jnl: buf_create_shadow returned NULL");
4033
4034 /*
4035 * copy the data into the transaction buffer...
4036 */
4037 memcpy(blkptr, (char *)buf_dataptr(bp), bsize);
4038
4039 buf_clearflags(bp, B_LOCKED);
4040 buf_markclean(bp);
4041 buf_drop(bp);
4042
4043 /*
4044 * adopt the shadow buffer for this block
4045 */
4046 if (func) {
4047 /*
4048 * transfer FS hook function to the
4049 * shadow buffer... it will get called
4050 * in finish_end_transaction
4051 */
4052 buf_setfilter(sbp, func, arg, NULL, NULL);
4053 }
4054 blhdr->binfo[i].u.bp = sbp;
4055
4056 } else {
4057 // bnum == -1, only true if a block was "killed"
4058 bsize = blhdr->binfo[i].u.bi.bsize;
4059 }
4060 tbuffer_offset += bsize;
4061 }
4062 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
4063 }
4064 /*
4065 * if callback != NULL, we don't want to drop the journal
4066 * lock, or complete end_transaction asynchronously, since
4067 * the caller is expecting the callback to run in the calling
4068 * context
4069 *
4070 * if drop_lock == FALSE, we can't complete end_transaction
4071 * asynchronously
4072 */
4073 if (callback)
4074 drop_lock_early = FALSE;
4075 else
4076 drop_lock_early = drop_lock;
4077
4078 if (drop_lock_early == FALSE)
4079 must_wait = TRUE;
4080
4081 if (drop_lock_early == TRUE) {
4082 journal_unlock(jnl);
4083 drop_lock = FALSE;
4084 }
4085 if (must_wait == TRUE)
4086 ret_val = finish_end_transaction(tr, callback, callback_arg);
4087 else {
4088 thread_t thread = THREAD_NULL;
4089
4090 /*
4091 * fire up a thread to complete processing this transaction
4092 * asynchronously... when it finishes, it will call
4093 * unlock_condition
4094 */
4095 kernel_thread_start((thread_continue_t)finish_end_thread, tr, &thread);
4096 }
4097 KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
4098 done:
4099 if (drop_lock == TRUE) {
4100 journal_unlock(jnl);
4101 }
4102 return (ret_val);
4103 }
4104
4105
4106 static void
4107 finish_end_thread(transaction *tr)
4108 {
4109 proc_set_task_policy(current_task(), current_thread(),
4110 TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
4111
4112 finish_end_transaction(tr, NULL, NULL);
4113
4114 thread_deallocate(current_thread());
4115 thread_terminate(current_thread());
4116 }
4117
4118 static void
4119 write_header_thread(journal *jnl)
4120 {
4121 proc_set_task_policy(current_task(), current_thread(),
4122 TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, IOPOL_PASSIVE);
4123
4124 if (write_journal_header(jnl, 1, jnl->saved_sequence_num))
4125 jnl->write_header_failed = TRUE;
4126 else
4127 jnl->write_header_failed = FALSE;
4128 unlock_condition(jnl, &jnl->writing_header);
4129
4130 thread_deallocate(current_thread());
4131 thread_terminate(current_thread());
4132 }
4133
4134 static int
4135 finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg)
4136 {
4137 int i, amt;
4138 int ret = 0;
4139 off_t end;
4140 journal *jnl = tr->jnl;
4141 buf_t bp, *bparray;
4142 vnode_t vp;
4143 block_list_header *blhdr=NULL, *next=NULL;
4144 size_t tbuffer_offset;
4145 int bufs_written = 0;
4146 int ret_val = 0;
4147 boolean_t was_vm_privileged = FALSE;
4148
4149 KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_START, jnl, tr, 0, 0, 0);
4150
4151 if (jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) {
4152 /*
4153 * if we block waiting for memory, and there is enough pressure to
4154 * cause us to try and create a new swap file, we may end up deadlocking
4155 * due to waiting for the journal on the swap file creation path...
4156 * by making ourselves vm_privileged, we give ourselves the best chance
4157 * of not blocking
4158 */
4159 was_vm_privileged = set_vm_privilege(TRUE);
4160 }
4161 end = jnl->jhdr->end;
4162
4163 for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
4164
4165 amt = blhdr->bytes_used;
4166
4167 blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num;
4168
4169 blhdr->checksum = 0;
4170 blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
4171
4172 if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *), VM_KERN_MEMORY_FILE)) {
4173 panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *));
4174 }
4175 tbuffer_offset = jnl->jhdr->blhdr_size;
4176
4177 for (i = 1; i < blhdr->num_blocks; i++) {
4178 void (*func)(buf_t, void *);
4179 void *arg;
4180 int32_t bsize;
4181
4182 /*
4183 * finish preparing the shadow buf_t before
4184 * calculating the individual block checksums
4185 */
4186 if (blhdr->binfo[i].bnum != (off_t)-1) {
4187 daddr64_t blkno;
4188 daddr64_t lblkno;
4189
4190 bp = blhdr->binfo[i].u.bp;
4191
4192 vp = buf_vnode(bp);
4193 blkno = buf_blkno(bp);
4194 lblkno = buf_lblkno(bp);
4195
4196 if (vp == NULL && lblkno == blkno) {
4197 printf("jnl: %s: end_tr: bad news! buffer w/null vp and l/blkno = %qd/%qd. aborting the transaction.\n",
4198 jnl->jdev_name, lblkno, blkno);
4199 ret_val = -1;
4200 goto bad_journal;
4201 }
4202
4203 // if the lblkno is the same as blkno and this bp isn't
4204 // associated with the underlying file system device then
4205 // we need to call bmap() to get the actual physical block.
4206 //
4207 if ((lblkno == blkno) && (vp != jnl->fsdev)) {
4208 off_t f_offset;
4209 size_t contig_bytes;
4210
4211 if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) {
4212 printf("jnl: %s: end_tr: vnop_blktooff failed\n", jnl->jdev_name);
4213 ret_val = -1;
4214 goto bad_journal;
4215 }
4216 if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) {
4217 printf("jnl: %s: end_tr: can't blockmap the buffer", jnl->jdev_name);
4218 ret_val = -1;
4219 goto bad_journal;
4220 }
4221 if ((uint32_t)contig_bytes < buf_count(bp)) {
4222 printf("jnl: %s: end_tr: blk not physically contiguous on disk\n", jnl->jdev_name);
4223 ret_val = -1;
4224 goto bad_journal;
4225 }
4226 buf_setblkno(bp, blkno);
4227 }
4228 // update this so we write out the correct physical block number!
4229 blhdr->binfo[i].bnum = (off_t)(blkno);
4230
4231 /*
4232 * pick up the FS hook function (if any) and prepare
4233 * to fire this buffer off in the next pass
4234 */
4235 buf_setfilter(bp, buffer_flushed_callback, tr, &func, &arg);
4236
4237 if (func) {
4238 /*
4239 * call the hook function supplied by the filesystem...
4240 * this needs to happen BEFORE calc_checksum in case
4241 * the FS morphs the data in the buffer
4242 */
4243 func(bp, arg);
4244 }
4245 bparray[i] = bp;
4246 bsize = buf_size(bp);
4247 blhdr->binfo[i].u.bi.bsize = bsize;
4248 blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], bsize);
4249 } else {
4250 bparray[i] = NULL;
4251 bsize = blhdr->binfo[i].u.bi.bsize;
4252 blhdr->binfo[i].u.bi.b.cksum = 0;
4253 }
4254 tbuffer_offset += bsize;
4255 }
4256 /*
4257 * if we fired off the journal_write_header asynchronously in
4258 * 'end_transaction', we need to wait for its completion
4259 * before writing the actual journal data
4260 */
4261 wait_condition(jnl, &jnl->writing_header, "finish_end_transaction");
4262
4263 if (jnl->write_header_failed == FALSE)
4264 ret = write_journal_data(jnl, &end, blhdr, amt);
4265 else
4266 ret_val = -1;
4267 /*
4268 * put the bp pointers back so that we can
4269 * make the final pass on them
4270 */
4271 for (i = 1; i < blhdr->num_blocks; i++)
4272 blhdr->binfo[i].u.bp = bparray[i];
4273
4274 kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *));
4275
4276 if (ret_val == -1)
4277 goto bad_journal;
4278
4279 if (ret != amt) {
4280 printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n",
4281 jnl->jdev_name, ret, amt);
4282
4283 ret_val = -1;
4284 goto bad_journal;
4285 }
4286 }
4287 jnl->jhdr->end = end; // update where the journal now ends
4288 tr->journal_end = end; // the transaction ends here too
4289
4290 if (tr->journal_start == 0 || tr->journal_end == 0) {
4291 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
4292 tr->journal_start, tr->journal_end);
4293 }
4294
4295 if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) {
4296 ret_val = -1;
4297 goto bad_journal;
4298 }
4299 /*
4300 * If the caller supplied a callback, call it now that the blocks have been
4301 * written to the journal. This is used by journal_relocate so, for example,
4302 * the file system can change its pointer to the new journal.
4303 */
4304 if (callback != NULL && callback(callback_arg) != 0) {
4305 ret_val = -1;
4306 goto bad_journal;
4307 }
4308
4309 //
4310 // Send a DKIOCUNMAP for the extents trimmed by this transaction, and
4311 // free up the extent list.
4312 //
4313 journal_trim_flush(jnl, tr);
4314
4315 // the buffer_flushed_callback will only be called for the
4316 // real blocks that get flushed so we have to account for
4317 // the block_list_headers here.
4318 //
4319 tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
4320
4321 lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction");
4322
4323 //
4324 // setup for looping through all the blhdr's.
4325 //
4326 for (blhdr = tr->blhdr; blhdr; blhdr = next) {
4327 uint16_t num_blocks;
4328
4329 /*
4330 * grab this info ahead of issuing the buf_bawrites...
4331 * once the last one goes out, it's possible for blhdr
4332 * to be freed (especially if we get preempted) before
4333 * we do the last check of num_blocks or
4334 * grab the next blhdr pointer...
4335 */
4336 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
4337 num_blocks = blhdr->num_blocks;
4338
4339 /*
4340 * we can re-order the buf ptrs because everything is written out already
4341 */
4342 qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp);
4343
4344 /*
4345 * need to make sure that the loop issuing the buf_bawrite's
4346 * does not touch blhdr once the last buf_bawrite has been
4347 * issued... at that point, we no longer have a legitimate
4348 * reference on the associated storage since it will be
4349 * released upon the completion of that last buf_bawrite
4350 */
4351 for (i = num_blocks-1; i >= 1; i--) {
4352 if (blhdr->binfo[i].bnum != (off_t)-1)
4353 break;
4354 num_blocks--;
4355 }
4356 for (i = 1; i < num_blocks; i++) {
4357
4358 if ((bp = blhdr->binfo[i].u.bp)) {
4359 vp = buf_vnode(bp);
4360
4361 buf_bawrite(bp);
4362
4363 // this undoes the vnode_ref() in journal_modify_block_end()
4364 vnode_rele_ext(vp, 0, 1);
4365
4366 bufs_written++;
4367 }
4368 }
4369 }
4370 if (bufs_written == 0) {
4371 /*
4372 * since we didn't issue any buf_bawrite's, there is no
4373 * async trigger to cause the memory associated with this
4374 * transaction to be freed... so, move it to the garbage
4375 * list now
4376 */
4377 lock_oldstart(jnl);
4378
4379 tr->next = jnl->tr_freeme;
4380 jnl->tr_freeme = tr;
4381
4382 unlock_oldstart(jnl);
4383
4384 unlock_condition(jnl, &jnl->asyncIO);
4385 }
4386
4387 //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
4388 // tr, tr->journal_start, tr->journal_end);
4389
4390 bad_journal:
4391 if (ret_val == -1) {
4392 abort_transaction(jnl, tr); // cleans up list of extents to be trimmed
4393
4394 /*
4395 * 'flush_aborted' is protected by the flushing condition... we need to
4396 * set it before dropping the condition so that it will be
4397 * noticed in 'end_transaction'... we add this additional
4398 * aborted condition so that we can drop the 'flushing' condition
4399 * before grabbing the journal lock... this avoids a deadlock
4400 * in 'end_transaction' which is holding the journal lock while
4401 * waiting for the 'flushing' condition to clear...
4402 * everyone else will notice the JOURNAL_INVALID flag
4403 */
4404 jnl->flush_aborted = TRUE;
4405
4406 unlock_condition(jnl, &jnl->flushing);
4407 journal_lock(jnl);
4408
4409 jnl->flags |= JOURNAL_INVALID;
4410 jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;
4411
4412 journal_unlock(jnl);
4413 } else
4414 unlock_condition(jnl, &jnl->flushing);
4415
4416 if ((jnl->fsmount->mnt_kern_flag & MNTK_SWAP_MOUNT) && (was_vm_privileged == FALSE))
4417 set_vm_privilege(FALSE);
4418
4419 KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_END, jnl, tr, bufs_written, ret_val, 0);
4420
4421 return (ret_val);
4422 }
4423
4424
4425 static void
4426 lock_condition(journal *jnl, boolean_t *condition, const char *condition_name)
4427 {
4428
4429 KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_START, jnl, condition, 0, 0, 0);
4430
4431 lock_flush(jnl);
4432
4433 while (*condition == TRUE)
4434 msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL);
4435
4436 *condition = TRUE;
4437 unlock_flush(jnl);
4438
4439 KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_END, jnl, condition, 0, 0, 0);
4440 }
4441
4442 static void
4443 wait_condition(journal *jnl, boolean_t *condition, const char *condition_name)
4444 {
4445
4446 if (*condition == FALSE)
4447 return;
4448
4449 KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_START, jnl, condition, 0, 0, 0);
4450
4451 lock_flush(jnl);
4452
4453 while (*condition == TRUE)
4454 msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL);
4455
4456 unlock_flush(jnl);
4457
4458 KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_END, jnl, condition, 0, 0, 0);
4459 }
4460
4461 static void
4462 unlock_condition(journal *jnl, boolean_t *condition)
4463 {
4464 lock_flush(jnl);
4465
4466 *condition = FALSE;
4467 wakeup(condition);
4468
4469 unlock_flush(jnl);
4470 }
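
/*
 * (The three helpers above implement a simple ownership flag protected
 * by jnl->flock: lock_condition waits for the flag to clear and then
 * claims it, wait_condition only waits for it to clear, and
 * unlock_condition clears it and wakes any waiters.)
 */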
4471
4472 static void
4473 abort_transaction(journal *jnl, transaction *tr)
4474 {
4475 block_list_header *blhdr, *next;
4476
4477 // for each block list header, iterate over the blocks then
4478 // free up the memory associated with the block list.
4479 //
4480 // find each of the primary blocks (i.e. the list could
4481 // contain a mix of shadowed and real buf_t's depending
4482 // on when the abort condition was detected) and mark them
4483 // clean and locked in the cache... this at least allows
4484 * the FS a consistent view between its in-core data structures
4485 // and the meta-data held in the cache
4486 //
4487 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_START, jnl, tr, 0, 0, 0);
4488
4489 for (blhdr = tr->blhdr; blhdr; blhdr = next) {
4490 int i;
4491
4492 for (i = 1; i < blhdr->num_blocks; i++) {
4493 buf_t bp, tbp, sbp;
4494 vnode_t bp_vp;
4495 errno_t errno;
4496
4497 if (blhdr->binfo[i].bnum == (off_t)-1)
4498 continue;
4499
4500 tbp = blhdr->binfo[i].u.bp;
4501
4502 bp_vp = buf_vnode(tbp);
4503
4504 if (buf_shadow(tbp)) {
4505 sbp = tbp;
4506 buf_setfilter(tbp, NULL, NULL, NULL, NULL);
4507 } else {
4508 assert(ISSET(buf_flags(tbp), B_LOCKED));
4509
4510 sbp = NULL;
4511
4512 do {
4513 errno = buf_acquire(tbp, BAC_REMOVE, 0, 0);
4514 } while (errno == EAGAIN);
4515
4516 if (!errno) {
4517 buf_setfilter(tbp, NULL, NULL, NULL, NULL);
4518 buf_brelse(tbp);
4519 }
4520 }
4521
4522 if (bp_vp) {
4523 errno = buf_meta_bread(bp_vp,
4524 buf_lblkno(tbp),
4525 buf_size(tbp),
4526 NOCRED,
4527 &bp);
4528 if (errno == 0) {
4529 if (sbp == NULL && bp != tbp && (buf_flags(tbp) & B_LOCKED)) {
4530 panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n",
4531 bp, tbp, jnl);
4532 }
4533 /*
4534 * once the journal has been marked INVALID and aborted,
4535 * NO meta data can be written back to the disk, so
4536 * mark the buf_t clean and make sure it's locked in the cache
4537 * note: if we found a shadow, the real buf_t needs to be relocked
4538 */
4539 buf_setflags(bp, B_LOCKED);
4540 buf_markclean(bp);
4541 buf_brelse(bp);
4542
4543 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_NONE, jnl, tr, bp, 0, 0);
4544
4545 /*
4546 * this undoes the vnode_ref() in journal_modify_block_end()
4547 */
4548 vnode_rele_ext(bp_vp, 0, 1);
4549 } else {
4550 printf("jnl: %s: abort_tr: could not find block %lld for vnode!\n",
4551 jnl->jdev_name, blhdr->binfo[i].bnum);
4552 if (bp) {
4553 buf_brelse(bp);
4554 }
4555 }
4556 }
4557 if (sbp)
4558 buf_brelse(sbp);
4559 }
4560 next = (block_list_header *)((long)blhdr->binfo[0].bnum);
4561
4562 // we can free blhdr here since we won't need it any more
4563 blhdr->binfo[0].bnum = 0xdeadc0de;
4564 kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
4565 }
4566
4567 /*
4568 * If the transaction we're aborting was the async transaction, then
4569 * tell the current transaction that there is no pending trim
4570 * any more.
4571 */
4572 lck_rw_lock_exclusive(&jnl->trim_lock);
4573 if (jnl->async_trim == &tr->trim)
4574 jnl->async_trim = NULL;
4575 lck_rw_unlock_exclusive(&jnl->trim_lock);
4576
4577
4578 if (tr->trim.extents) {
4579 kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t));
4580 }
4581 tr->trim.allocated_count = 0;
4582 tr->trim.extent_count = 0;
4583 tr->trim.extents = NULL;
4584 tr->tbuffer = NULL;
4585 tr->blhdr = NULL;
4586 tr->total_bytes = 0xdbadc0de;
4587 FREE_ZONE(tr, sizeof(transaction), M_JNL_TR);
4588
4589 KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_END, jnl, tr, 0, 0, 0);
4590 }
4591
4592
4593 int
4594 journal_end_transaction(journal *jnl)
4595 {
4596 int ret;
4597 transaction *tr;
4598
4599 CHECK_JOURNAL(jnl);
4600
4601 free_old_stuff(jnl);
4602
4603 if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
4604 return 0;
4605 }
4606
4607 if (jnl->owner != current_thread()) {
4608 panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
4609 jnl, jnl->owner, current_thread());
4610 }
4611 jnl->nested_count--;
4612
4613 if (jnl->nested_count > 0) {
4614 return 0;
4615 } else if (jnl->nested_count < 0) {
4616 panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
4617 }
4618
4619 if (jnl->flags & JOURNAL_INVALID) {
4620 if (jnl->active_tr) {
4621 if (jnl->cur_tr != NULL) {
4622 panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
4623 jnl, jnl->active_tr, jnl->cur_tr);
4624 }
4625 tr = jnl->active_tr;
4626 jnl->active_tr = NULL;
4627
4628 abort_transaction(jnl, tr);
4629 }
4630 journal_unlock(jnl);
4631
4632 return EINVAL;
4633 }
4634
4635 tr = jnl->active_tr;
4636 CHECK_TRANSACTION(tr);
4637
4638 // clear this out here so that when check_free_space() calls
4639 // the FS flush function, we don't panic in journal_flush()
4640 // if the FS were to call that. note: check_free_space() is
4641 // called from end_transaction().
4642 //
4643 jnl->active_tr = NULL;
4644
4645 /* Examine the force-journal-flush state in the active txn */
4646 if (tr->flush_on_completion == TRUE) {
4647 /*
4648 * If the FS requested it, disallow group commit and force the
4649 * transaction out to disk immediately.
4650 */
4651 ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE);
4652 }
4653 else {
4654 /* in the common path we can simply use the double-buffered journal */
4655 ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE);
4656 }
4657
4658 return ret;
4659 }
4660
4661
4662 /*
4663 * Flush the contents of the journal to the disk.
4664 *
4665 * Input:
4666 * wait_for_IO -
4667 * If TRUE, wait to write in-memory journal to the disk
4668 * consistently, and also wait to write all asynchronous
4669 * metadata blocks to their corresponding locations
4670 * consistently on the disk. This means that the journal
4671 * is empty at this point and does not contain any
4672 * transactions. This is overkill in normal scenarios
4673 * but is useful whenever the metadata blocks are required
4674 * to be consistent on-disk instead of just the journal
4675 * being consistent, such as before live verification
4676 * and live volume resizing.
4677 *
4678 * If FALSE, only wait to write in-memory journal to the
4679 * disk consistently. This means that the journal still
4680 * contains uncommitted transactions and the file system
4681 * metadata blocks in the journal transactions might be
4682 * written asynchronously to the disk. But there is no
4683 * guarantee that they are written to the disk before
4684 * returning to the caller. Note that this option is
4685 * sufficient for file system data integrity as it
4686 * guarantees consistent journal content on the disk.
4687 */
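
/*
 * Illustrative calls (hypothetical call sites):
 *
 *	journal_flush(jnl, 0);                   // journal consistent on disk
 *	journal_flush(jnl, JOURNAL_WAIT_FOR_IO); // journal drained; metadata on disk
 *
 * Adding JOURNAL_FLUSH_FULL additionally issues DKIOCSYNCHRONIZE so
 * that the device's write cache is flushed unless another full flush
 * already happened in the meantime.
 */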
4688 int
4689 journal_flush(journal *jnl, journal_flush_options_t options)
4690 {
4691 boolean_t drop_lock = FALSE;
4692 errno_t error = 0;
4693 uint32_t flush_count;
4694
4695 CHECK_JOURNAL(jnl);
4696
4697 free_old_stuff(jnl);
4698
4699 if (jnl->flags & JOURNAL_INVALID) {
4700 return -1;
4701 }
4702
4703 KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl, 0, 0, 0, 0);
4704
4705 if (jnl->owner != current_thread()) {
4706 journal_lock(jnl);
4707 drop_lock = TRUE;
4708 }
4709
4710 if (ISSET(options, JOURNAL_FLUSH_FULL))
4711 flush_count = jnl->flush_counter;
4712
4713 // if we're not active, flush any buffered transactions
4714 if (jnl->active_tr == NULL && jnl->cur_tr) {
4715 transaction *tr = jnl->cur_tr;
4716
4717 jnl->cur_tr = NULL;
4718
4719 if (ISSET(options, JOURNAL_WAIT_FOR_IO)) {
4720 wait_condition(jnl, &jnl->flushing, "journal_flush");
4721 wait_condition(jnl, &jnl->asyncIO, "journal_flush");
4722 }
4723 /*
4724 * "end_transction" will wait for any current async flush
4725 * to complete, before flushing "cur_tr"... because we've
4726 * specified the 'must_wait' arg as TRUE, it will then
4727 * synchronously flush the "cur_tr"
4728 */
4729 end_transaction(tr, 1, NULL, NULL, drop_lock, TRUE); // force it to get flushed
4730
4731 } else {
4732 if (drop_lock == TRUE) {
4733 journal_unlock(jnl);
4734 }
4735
4736 /* Because of the pipelined journal, journal transactions
4737 * might be in the process of being flushed on another thread.
4738 * If there is nothing to flush currently, we should
4739 * synchronize ourselves with the pipelined journal thread
4740 * to ensure that all inflight transactions, if any, are
4741 * flushed before we return success to the caller.
4742 */
4743 wait_condition(jnl, &jnl->flushing, "journal_flush");
4744 }
4745 if (ISSET(options, JOURNAL_WAIT_FOR_IO)) {
4746 wait_condition(jnl, &jnl->asyncIO, "journal_flush");
4747 }
4748
4749 if (ISSET(options, JOURNAL_FLUSH_FULL)) {
4750
4751 dk_synchronize_t sync_request = {
4752 .options = 0,
4753 };
4754
4755 // We need a full cache flush. If it has not been done, do it here.
4756 if (flush_count == jnl->flush_counter)
4757 error = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, vfs_context_kernel());
4758
4759 // If external journal partition is enabled, flush filesystem data partition.
4760 if (jnl->jdev != jnl->fsdev)
4761 error = VNOP_IOCTL(jnl->fsdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, vfs_context_kernel());
4762
4763 }
4764
4765 KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_END, jnl, 0, 0, 0, 0);
4766
4767 return 0;
4768 }
4769
4770 int
4771 journal_active(journal *jnl)
4772 {
4773 if (jnl->flags & JOURNAL_INVALID) {
4774 return -1;
4775 }
4776
4777 return (jnl->active_tr == NULL) ? 0 : 1;
4778 }
4779
4780 void *
4781 journal_owner(journal *jnl)
4782 {
4783 return jnl->owner;
4784 }
4785
4786 int journal_uses_fua(journal *jnl)
4787 {
4788 if (jnl->flags & JOURNAL_DO_FUA_WRITES)
4789 return 1;
4790 return 0;
4791 }
4792
4793 /*
4794 * Relocate the journal.
4795 *
4796 * You provide the new starting offset and size for the journal. You may
4797 * optionally provide a new tbuffer_size; passing zero defaults to not
4798 * changing the tbuffer size except as needed to fit within the new journal
4799 * size.
4800 *
4801 * You must have already started a transaction. The transaction may contain
4802 * modified blocks (such as those needed to deallocate the old journal,
4803 * allocate the new journal, and update the location and size of the journal
4804 * in filesystem-private structures). Any transactions prior to the active
4805 * transaction will be flushed to the old journal. The new journal will be
4806 * initialized, and the blocks from the active transaction will be written to
4807 * the new journal.
4808 *
4809 * The caller will need to update the structures that identify the location
4810 * and size of the journal. These updates should be made in the supplied
4811 * callback routine. These updates must NOT go into a transaction. You should
4812 * force these updates to the media before returning from the callback. In the
4813 * event of a crash, either the old journal will be found and it will be empty,
4814 * or the new journal will be found with the contents of the active transaction.
4815 *
4816 * Upon return from the callback, the blocks from the active transaction are
4817 * written to their normal locations on disk.
4818 *
4819 * (Remember that we have to ensure that blocks get committed to the journal
4820 * before being committed to their normal locations. But the blocks don't count
4821 * as committed until the new journal is pointed at.)
4822 *
4823 * Upon return, there is still an active transaction: newly allocated, and
4824 * with no modified blocks. Call journal_end_transaction as normal. You may
4825 * modify additional blocks before calling journal_end_transaction, and those
4826 * blocks will (eventually) go to the relocated journal.
4827 *
4828 * Inputs:
4829 * jnl The (opened) journal to relocate.
4830 * offset The new journal byte offset (from start of the journal device).
4831 * journal_size The size, in bytes, of the new journal.
4832 * tbuffer_size The new desired transaction buffer size. Pass zero to keep
4833 * the same size as the current journal. The size will be
4834 * modified as needed to fit the new journal.
4835 * callback Routine called after the new journal has been initialized,
4836 * and the active transaction written to the new journal, but
4837 * before the blocks are written to their normal locations.
4838 * Pass NULL for no callback.
4839 * callback_arg An argument passed to the callback routine.
4840 *
4841 * Result:
4842 * 0 No errors
4843 * EINVAL The offset is not block aligned
4844 * EINVAL The journal_size is not a multiple of the block size
4845 * EINVAL The journal is invalid
4846 * (any) An error returned by journal_flush.
4847 *
4848 */
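
/*
 * Illustrative (hypothetical) callback sketch -- my_fs and
 * my_fs_write_volume_header are stand-ins for filesystem-private code:
 *
 *	static errno_t
 *	update_journal_location(void *arg)
 *	{
 *		struct my_fs *fs = arg;
 *		// record the new journal offset/size in the volume header
 *		// and synchronously force it to the media (no transaction!)
 *		return my_fs_write_volume_header(fs);
 *	}
 *
 *	ret = journal_relocate(jnl, new_offset, new_size, 0,
 *	                       update_journal_location, fs);
 */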
4849 int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size,
4850 errno_t (*callback)(void *), void *callback_arg)
4851 {
4852 int ret;
4853 transaction *tr;
4854 size_t i = 0;
4855
4856 /*
4857 * Sanity check inputs, and adjust the size of the transaction buffer.
4858 */
4859 if ((offset % jnl->jhdr->jhdr_size) != 0) {
4860 printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n",
4861 jnl->jdev_name, offset, jnl->jhdr->jhdr_size);
4862 return EINVAL;
4863 }
4864 if ((journal_size % jnl->jhdr->jhdr_size) != 0) {
4865 printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n",
4866 jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size);
4867 return EINVAL;
4868 }
4869
4870 CHECK_JOURNAL(jnl);
4871
4872 /* Guarantee we own the active transaction. */
4873 if (jnl->flags & JOURNAL_INVALID) {
4874 return EINVAL;
4875 }
4876 if (jnl->owner != current_thread()) {
4877 panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n",
4878 jnl, jnl->owner, current_thread());
4879 }
4880
4881 if (tbuffer_size == 0)
4882 tbuffer_size = jnl->tbuffer_size;
4883 size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size);
4884
4885 /*
4886 * Flush any non-active transactions. We have to temporarily hide the
4887 * active transaction to make journal_flush flush out non-active but
4888 * current (unwritten) transactions.
4889 */
4890 tr = jnl->active_tr;
4891 CHECK_TRANSACTION(tr);
4892 jnl->active_tr = NULL;
4893 ret = journal_flush(jnl, JOURNAL_WAIT_FOR_IO);
4894 jnl->active_tr = tr;
4895
4896 if (ret) {
4897 return ret;
4898 }
4899 wait_condition(jnl, &jnl->flushing, "end_transaction");
4900
4901 /*
4902 * At this point, we have completely flushed the contents of the current
4903 * journal to disk (and have asynchronously written all of the txns to
4904 * their actual desired locations). As a result, we can (and must) clear
4905 * out the old_start array. If we do not, then if the last written transaction
4906 * started at the beginning of the journal (starting 1 block into the
4907 * journal file), it could confuse the buffer_flushed callback. This is
4908 * because we're about to reset the start/end pointers of the journal header
4909 * below.
4910 */
4911 lock_oldstart(jnl);
4912 for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) {
4913 jnl->old_start[i] = 0;
4914 }
4915 unlock_oldstart(jnl);
4916
4917 /* Update the journal's offset and size in memory. */
4918 jnl->jdev_offset = offset;
4919 jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size;
4920 jnl->jhdr->size = journal_size;
4921 jnl->active_start = jnl->jhdr->start;
4922
4923 /*
4924 * Force the active transaction to be written to the new journal. Call the
4925 * supplied callback after the blocks have been written to the journal, but
4926 * before they get written to their normal on-disk locations.
4927 */
4928 jnl->active_tr = NULL;
4929 ret = end_transaction(tr, 1, callback, callback_arg, FALSE, TRUE);
4930 if (ret) {
4931 printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret);
4932 goto bad_journal;
4933 }
4934
4935 /*
4936 * Create a new, empty transaction to be the active transaction. This way
4937 * our caller can use journal_end_transaction as usual.
4938 */
4939 ret = journal_allocate_transaction(jnl);
4940 if (ret) {
4941 printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl->jdev_name, ret);
4942 goto bad_journal;
4943 }
4944
4945 return 0;
4946
4947 bad_journal:
4948 jnl->flags |= JOURNAL_INVALID;
4949 abort_transaction(jnl, tr);
4950 return ret;
4951 }
4952
4953 uint32_t journal_current_txn(journal *jnl)
4954 {
4955 return jnl->sequence_num + (jnl->active_tr || jnl->cur_tr ? 0 : 1);
4956 }
4957
4958 #else // !JOURNALING - so provide stub functions
4959
4960 int journal_uses_fua(__unused journal *jnl)
4961 {
4962 return 0;
4963 }
4964
4965 journal *
4966 journal_create(__unused struct vnode *jvp,
4967 __unused off_t offset,
4968 __unused off_t journal_size,
4969 __unused struct vnode *fsvp,
4970 __unused size_t min_fs_blksz,
4971 __unused int32_t flags,
4972 __unused int32_t tbuffer_size,
4973 __unused void (*flush)(void *arg),
4974 __unused void *arg,
4975 __unused struct mount *fsmount)
4976 {
4977 return NULL;
4978 }
4979
4980 journal *
4981 journal_open(__unused struct vnode *jvp,
4982 __unused off_t offset,
4983 __unused off_t journal_size,
4984 __unused struct vnode *fsvp,
4985 __unused size_t min_fs_blksz,
4986 __unused int32_t flags,
4987 __unused int32_t tbuffer_size,
4988 __unused void (*flush)(void *arg),
4989 __unused void *arg,
4990 __unused struct mount *fsmount)
4991 {
4992 return NULL;
4993 }
4994
4995
4996 int
4997 journal_modify_block_start(__unused journal *jnl, __unused struct buf *bp)
4998 {
4999 return EINVAL;
5000 }
5001
5002 int
5003 journal_modify_block_end(__unused journal *jnl,
5004 __unused struct buf *bp,
5005 __unused void (*func)(struct buf *bp, void *arg),
5006 __unused void *arg)
5007 {
5008 return EINVAL;
5009 }
5010
5011 int
5012 journal_kill_block(__unused journal *jnl, __unused struct buf *bp)
5013 {
5014 return EINVAL;
5015 }
5016
5017 int journal_relocate(__unused journal *jnl,
5018 __unused off_t offset,
5019 __unused off_t journal_size,
5020 __unused int32_t tbuffer_size,
5021 __unused errno_t (*callback)(void *),
5022 __unused void *callback_arg)
5023 {
5024 return EINVAL;
5025 }
5026
5027 void
5028 journal_close(__unused journal *jnl)
5029 {
5030 }
5031
5032 int
5033 journal_start_transaction(__unused journal *jnl)
5034 {
5035 return EINVAL;
5036 }
5037
5038 int
5039 journal_end_transaction(__unused journal *jnl)
5040 {
5041 return EINVAL;
5042 }
5043
5044 int
5045 journal_flush(__unused journal *jnl, __unused journal_flush_options_t options)
5046 {
5047 return EINVAL;
5048 }
5049
5050 int
5051 journal_is_clean(__unused struct vnode *jvp,
5052 __unused off_t offset,
5053 __unused off_t journal_size,
5054 __unused struct vnode *fsvp,
5055 __unused size_t min_fs_block_size)
5056 {
5057 return 0;
5058 }
5059
5060
5061 void *
5062 journal_owner(__unused journal *jnl)
5063 {
5064 return NULL;
5065 }
5066
5067 void
5068 journal_lock(__unused journal *jnl)
5069 {
5070 return;
5071 }
5072
5073 void
5074 journal_unlock(__unused journal *jnl)
5075 {
5076 return;
5077 }
5078
5079 __private_extern__ int
5080 journal_trim_add_extent(__unused journal *jnl,
5081 __unused uint64_t offset,
5082 __unused uint64_t length)
5083 {
5084 return 0;
5085 }
5086
5087 int
5088 journal_request_immediate_flush(__unused journal *jnl)
5089 {
5090 return 0;
5091 }
5092
5093 __private_extern__ int
5094 journal_trim_remove_extent(__unused journal *jnl,
5095 __unused uint64_t offset,
5096 __unused uint64_t length)
5097 {
5098 return 0;
5099 }
5100
5101 int journal_trim_extent_overlap(__unused journal *jnl,
5102 __unused uint64_t offset,
5103 __unused uint64_t length,
5104 __unused uint64_t *end)
5105 {
5106 return 0;
5107 }
5108
5109 #endif // !JOURNALING