2 * Copyright (c) 2002-2015 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 // This file implements a simple write-ahead journaling layer.
30 // In theory any file system can make use of it by calling these
31 // functions when the fs wants to modify meta-data blocks. See
32 // hfs_journal.h for a more detailed description of the api and
35 // Dominic Giampaolo (dbg@apple.com)
36 // Port to Live-Files: Oded Shoshani (oshoshani@apple.com)
48 #include <sys/sysctl.h>
49 #include <sys/types.h>
50 #include <mach/mach.h>
52 #include <sys/kdebug.h>
53 #include "lf_hfs_locks.h"
54 #include "lf_hfs_journal.h"
55 #include "lf_hfs_vfsutils.h"
56 #include "lf_hfs_raw_read_write.h"
57 #include "lf_hfs_generic_buf.h"
58 #include "lf_hfs_logger.h"
59 #include "lf_hfs_vfsops.h"
61 // ************************** Function Definitions ***********************
62 // number of bytes to checksum in a block_list_header
63 // NOTE: this should be enough to clear out the header
64 // fields as well as the first entry of binfo[]
// Debug sanity check for a journal handle: validates the pointer itself, its
// devices, and the on-disk header's magic/start/end fields. Logs via printf()
// and then panics on any corruption. Wrapped in do{}while(0) so it behaves as
// a single statement (safe inside un-braced if/else).
#define CHECK_JOURNAL(jnl) \
    do { \
    if (jnl == NULL) { \
        printf("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \
        panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \
    } \
    if (jnl->jdev == NULL) { \
        printf("%s:%d: jdev is null!\n", __FILE__, __LINE__); \
        panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \
    } \
    if (jnl->fsdev == NULL) { \
        printf("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \
        panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \
    } \
    if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) { \
        printf("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \
               __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \
        panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \
              __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \
    } \
    if (jnl->jhdr->start <= 0 || jnl->jhdr->start > jnl->jhdr->size) { \
        printf("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
               __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \
        panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
              __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \
    } \
    if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) { \
        printf("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
               __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \
        panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
              __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \
    } \
    } while (0)
// Debug sanity check for a transaction: validates the pointer, its journal
// back-pointer, the blhdr/tbuffer aliasing invariant, and that byte counts
// and journal offsets are non-negative. Logs via printf() then panics on any
// inconsistency. Wrapped in do{}while(0) so it behaves as a single statement.
#define CHECK_TRANSACTION(tr) \
    do { \
    if (tr == NULL) { \
        printf("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \
        panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \
    } \
    if (tr->jnl == NULL) { \
        printf("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \
        panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \
    } \
    if (tr->blhdr != (block_list_header *)tr->tbuffer) { \
        printf("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \
        panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \
    } \
    if (tr->total_bytes < 0) { \
        printf("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \
        panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \
    } \
    if (tr->journal_start < 0) { \
        printf("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \
        panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \
    } \
    if (tr->journal_end < 0) { \
        printf("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \
        panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \
    } \
    if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) { \
        printf("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \
        panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \
    } \
    } while (0)
// Byte-swap helpers (used when the on-disk journal header magic matches the
// swapped value — see the SWAP32(JOURNAL_HEADER_MAGIC) check in journal_open()).
#define SWAP16(x) OSSwapInt16(x)
#define SWAP32(x) OSSwapInt32(x)
#define SWAP64(x) OSSwapInt64(x)

// Direction/type flags for do_journal_io().
#define JNL_WRITE  0x0001
#define JNL_READ   0x0002
#define JNL_HEADER 0x8000

// Number of bytes of a block_list_header to checksum.
// NOTE: this should be enough to clear out the header
// fields as well as the first entry of binfo[].
#define BLHDR_CHECKSUM_SIZE 32

// Hard cap on the journal size: 2 GB.
#define MAX_JOURNAL_SIZE 0x80000000U

// Initial bucket count for the journal-replay block table.
#define STARTING_BUCKETS 256
144 typedef struct bucket
{
// ---- Forward declarations for the static helpers in this file ----

// Replay any committed-but-unapplied transactions found in the journal.
static int replay_journal(journal *jnl);
// Reclaim transactions whose blocks have all reached the media.
static void free_old_stuff(journal *jnl);
// Allocate and initialize a fresh active transaction for the journal.
static errno_t journal_allocate_transaction(journal *jnl);
// Query the device's I/O characteristics for the journal.
static void get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl);
// Read the on-disk journal header into 'data'; returns bytes read.
static size_t read_journal_header(journal *jnl, void *data, size_t len);
// Core journal I/O: read or write 'len' bytes at *offset (direction is
// a JNL_READ/JNL_WRITE/JNL_HEADER combination); returns bytes transferred.
static size_t do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction);
// Simple additive checksum over 'len' bytes at 'ptr'.
static unsigned int calc_checksum(const char *ptr, int len);
// Byte-swap all fields of the in-memory journal header.
static void swap_journal_header(journal *jnl);
// Commit the transaction to the journal (and eventually the media).
// NOTE(review): residue fragment — the original declaration appears to carry
// additional parameters between 'tr' and 'callback' (e.g. force_it) and after
// 'callback' (e.g. callback_arg, must_wait); confirm against the full source.
static int end_transaction(transaction *tr, errno_t (*callback)(void*), boolean_t drop_lock);
// Discard a transaction without writing it, releasing its buffers.
static void abort_transaction(journal *jnl, transaction *tr);
// Choose the transaction-buffer size for this journal.
static void size_up_tbuffer(journal *jnl, uint32_t tbuffer_size, uint32_t phys_blksz);
// Condition-flag helpers used for flushing/asyncIO/writing_header gating.
static void lock_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name);
static void wait_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name);
static void unlock_condition(journal *jnl, ConditionalFlag_S *psCondFlag);
// Write the journal header block to the media.
static int write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num);
// Convenience wrappers around do_journal_io().
static size_t read_journal_data(journal *jnl, off_t *offset, void *data, size_t len);
static size_t write_journal_data(journal *jnl, off_t *offset, void *data, size_t len);
174 static __inline__
void lock_oldstart(journal
*jnl
) {
175 lf_lck_mtx_lock(&jnl
->old_start_lock
);
178 static __inline__
void unlock_oldstart(journal
*jnl
) {
179 lf_lck_mtx_unlock(&jnl
->old_start_lock
);
182 __inline__
void journal_lock(journal
*jnl
) {
183 lf_lck_mtx_lock(&jnl
->jlock
);
185 panic ("jnl: owner is %p, expected NULL\n", jnl
->owner
);
187 jnl
->owner
= pthread_self();
190 __inline__
void journal_unlock(journal
*jnl
) {
192 lf_lck_mtx_unlock(&jnl
->jlock
);
195 static __inline__
void lock_flush(journal
*jnl
) {
196 lf_lck_mtx_lock(&jnl
->flock
);
199 static __inline__
void unlock_flush(journal
*jnl
) {
200 lf_lck_mtx_unlock(&jnl
->flock
);
203 // ************************** Global Variables ***********************
205 lck_grp_attr_t
*jnl_group_attr
= NULL
;
206 lck_attr_t
*jnl_lock_attr
= NULL
;
207 lck_grp_t
*jnl_mutex_group
= NULL
;
209 // By default, we grow the list of extents to trim by 4K at a time.
210 // We'll opt to flush a transaction if it contains at least
211 // JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number
212 // of modified blocks is small).
214 JOURNAL_DEFAULT_TRIM_BYTES
= 4096,
215 JOURNAL_DEFAULT_TRIM_EXTENTS
= JOURNAL_DEFAULT_TRIM_BYTES
/ sizeof(dk_extent_t
),
216 JOURNAL_FLUSH_TRIM_EXTENTS
= JOURNAL_DEFAULT_TRIM_EXTENTS
* 15 / 16
219 unsigned int jnl_trim_flush_limit
= JOURNAL_FLUSH_TRIM_EXTENTS
;
222 #define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024)
223 #define MAX_TRANSACTION_BUFFER_SIZE (3072*1024)
224 uint32_t def_tbuffer_size
= 0; // XXXdbg - so I can change it in the debugger
226 // ************************** Global Functions ***********************
227 void journal_init(void) {
229 jnl_lock_attr
= lf_lck_attr_alloc_init();
230 jnl_group_attr
= lf_lck_grp_attr_alloc_init();
231 jnl_mutex_group
= lf_lck_grp_alloc_init();
234 journal
*journal_open(struct vnode
*jvp
,
240 int32_t tbuffer_size
,
241 void (*flush
)(void *arg
),
243 struct mount
*fsmount
) {
245 uint32_t orig_blksz
=0;
247 u_int32_t min_size
= 0;
248 int orig_checksum
, checksum
;
250 /* Get the real physical block size. */
251 if (ioctl(jvp
->psFSRecord
->iFD
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
)) {
252 goto cleanup_jdev_name
;
255 if (phys_blksz
> min_fs_blksz
) {
256 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: error: phys blksize %u bigger than min fs blksize %zd\n",
257 phys_blksz
, min_fs_blksz
);
258 goto cleanup_jdev_name
;
261 if (journal_size
< (256*1024) || journal_size
> (1024*1024*1024)) {
262 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: journal size %lld looks bogus.\n", journal_size
);
263 goto cleanup_jdev_name
;
266 min_size
= phys_blksz
* (phys_blksz
/ sizeof(block_info
));
267 /* Reject journals that are too small given the sector size of the device */
268 if (journal_size
< min_size
) {
269 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: journal size (%lld) too small given sector size of (%u)\n",
270 journal_size
, phys_blksz
);
271 goto cleanup_jdev_name
;
274 if ((journal_size
% phys_blksz
) != 0) {
275 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
276 journal_size
, phys_blksz
);
277 goto cleanup_jdev_name
;
280 jnl
= hfs_mallocz(sizeof(struct journal
));
283 jnl
->jdev_offset
= offset
;
284 jnl
->jdev_blknum
= (uint32_t)(offset
/ min_fs_blksz
);
287 jnl
->flush_arg
= arg
;
288 jnl
->flags
= (flags
& JOURNAL_OPTION_FLAGS_MASK
);
289 lf_lck_mtx_init(&jnl
->old_start_lock
);
290 lf_cond_init(&jnl
->flushing
.sCond
);
291 lf_cond_init(&jnl
->asyncIO
.sCond
);
292 lf_cond_init(&jnl
->writing_header
.sCond
);
294 /* We hold the mount to later pass to the throttling code for IO
297 jnl
->fsmount
= fsmount
;
299 get_io_info(jvp
, phys_blksz
, jnl
);
301 jnl
->header_buf
= hfs_malloc(phys_blksz
);
302 jnl
->header_buf_size
= phys_blksz
;
304 jnl
->jhdr
= (journal_header
*)jnl
->header_buf
;
305 memset(jnl
->jhdr
, 0, sizeof(journal_header
));
307 // we have to set this up here so that do_journal_io() will work
308 jnl
->jhdr
->jhdr_size
= phys_blksz
;
310 if (read_journal_header(jnl
, jnl
->jhdr
, phys_blksz
) != phys_blksz
) {
311 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: could not read %u bytes for the journal header.\n",
317 * Check for a bad jhdr size after reading in the journal header.
318 * The journal header length cannot be zero
320 if (jnl
->jhdr
->jhdr_size
== 0) {
321 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: bad jhdr size (%d) \n", jnl
->jhdr
->jhdr_size
);
325 orig_checksum
= jnl
->jhdr
->checksum
;
326 jnl
->jhdr
->checksum
= 0;
328 if (jnl
->jhdr
->magic
== SWAP32(JOURNAL_HEADER_MAGIC
)) {
330 // do this before the swap since it's done byte-at-a-time
331 orig_checksum
= SWAP32(orig_checksum
);
332 checksum
= calc_checksum((char *)jnl
->jhdr
, JOURNAL_HEADER_CKSUM_SIZE
);
333 swap_journal_header(jnl
);
334 jnl
->flags
|= JOURNAL_NEED_SWAP
;
338 checksum
= calc_checksum((char *)jnl
->jhdr
, JOURNAL_HEADER_CKSUM_SIZE
);
341 if (jnl
->jhdr
->magic
!= JOURNAL_HEADER_MAGIC
&& jnl
->jhdr
->magic
!= OLD_JOURNAL_HEADER_MAGIC
) {
342 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: journal magic is bad (0x%x != 0x%x)\n",
343 jnl
->jhdr
->magic
, JOURNAL_HEADER_MAGIC
);
347 // only check if we're the current journal header magic value
348 if (jnl
->jhdr
->magic
== JOURNAL_HEADER_MAGIC
) {
350 if (orig_checksum
!= checksum
) {
351 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: journal checksum is bad (0x%x != 0x%x)\n",
352 orig_checksum
, checksum
);
358 // XXXdbg - convert old style magic numbers to the new one
359 if (jnl
->jhdr
->magic
== OLD_JOURNAL_HEADER_MAGIC
) {
360 jnl
->jhdr
->magic
= JOURNAL_HEADER_MAGIC
;
363 if (phys_blksz
!= (size_t)jnl
->jhdr
->jhdr_size
&& jnl
->jhdr
->jhdr_size
!= 0) {
365 * The volume has probably been resized (such that we had to adjust the
366 * logical sector size), or copied to media with a different logical
369 * For us, though, no big deal because we are giving byte offsets to
370 * pread() and pwrite() to do our I/O, and as long as we use self-
371 * consistent units, we are all good.
373 LFHFS_LOG(LEVEL_ERROR
,
374 "jnl: block size mismatch: phys_blksz=%llu, jhdr->jhdr_size=%llu -- COMPENSATING\n",
375 (unsigned long long)phys_blksz
, (unsigned long long)jnl
->jhdr
->jhdr_size
);
376 orig_blksz
= phys_blksz
;
379 if ( jnl
->jhdr
->start
<= 0
380 || jnl
->jhdr
->start
> jnl
->jhdr
->size
381 || jnl
->jhdr
->start
> 1024*1024*1024) {
382 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
383 jnl
->jhdr
->start
, jnl
->jhdr
->size
);
387 if ( jnl
->jhdr
->end
<= 0
388 || jnl
->jhdr
->end
> jnl
->jhdr
->size
389 || jnl
->jhdr
->end
> 1024*1024*1024) {
390 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
391 jnl
->jhdr
->end
, jnl
->jhdr
->size
);
395 if (jnl
->jhdr
->size
< (256*1024) || jnl
->jhdr
->size
> 1024*1024*1024) {
396 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: jhdr size looks bad (0x%llx)\n", jnl
->jhdr
->size
);
400 // XXXdbg - can't do these checks because hfs writes all kinds of
401 // non-uniform sized blocks even on devices that have a block size
402 // that is larger than 512 bytes (i.e. optical media w/2k blocks).
403 // therefore these checks will fail and so we just have to punt and
404 // do more relaxed checking...
405 // XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
406 if ((jnl
->jhdr
->start
% 512) != 0) {
407 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: journal start (0x%llx) not a multiple of 512?\n",
412 //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
413 if ((jnl
->jhdr
->end
% 512) != 0) {
414 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
415 jnl
->jhdr
->end
, jnl
->jhdr
->jhdr_size
);
419 if (jnl
->jhdr
->blhdr_size
< 0) {
420 //throw out invalid sizes
421 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: blhdr size looks bogus! (%d) \n",
422 jnl
->jhdr
->blhdr_size
);
426 // take care of replaying the journal if necessary
427 if (flags
& JOURNAL_RESET
) {
428 LFHFS_LOG(LEVEL_ERROR
, "jnl: journal start/end pointers reset! (s 0x%llx e 0x%llx)\n",
429 jnl
->jhdr
->start
, jnl
->jhdr
->end
);
430 jnl
->jhdr
->start
= jnl
->jhdr
->end
;
431 } else if (replay_journal(jnl
) != 0) {
432 LFHFS_LOG(LEVEL_ERROR
, "jnl: journal_open: Error replaying the journal!\n");
437 * When we get here, we know that the journal is empty (jnl->jhdr->start ==
438 * jnl->jhdr->end). If the device's logical block size was different from
439 * the journal's header size, then we can now restore the device's logical
440 * block size and update the journal's header size to match.
442 * Note that we also adjust the journal's start and end so that they will
443 * be aligned on the new block size. We pick a new sequence number to
444 * avoid any problems if a replay found previous transactions using the old
445 * journal header size. (See the comments in journal_create(), above.)
448 if (orig_blksz
!= 0) {
449 LFHFS_LOG(LEVEL_ERROR
, "jnl: updating journal header with block size %llu\n",
450 (unsigned long long)phys_blksz
);
452 jnl
->jhdr
->jhdr_size
= phys_blksz
;
453 jnl
->jhdr
->start
= phys_blksz
;
454 jnl
->jhdr
->end
= phys_blksz
;
455 jnl
->jhdr
->sequence_num
= (jnl
->jhdr
->sequence_num
+
456 (journal_size
/ phys_blksz
) +
457 (random() % 16384)) & 0x00ffffff;
459 if (write_journal_header(jnl
, 1, jnl
->jhdr
->sequence_num
)) {
460 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: failed to update journal header size\n");
465 // make sure this is in sync!
466 jnl
->active_start
= jnl
->jhdr
->start
;
467 jnl
->sequence_num
= jnl
->jhdr
->sequence_num
;
469 // set this now, after we've replayed the journal
470 size_up_tbuffer(jnl
, tbuffer_size
, phys_blksz
);
472 // TODO: Does this need to change if the device's logical block size changed?
473 if ((off_t
)(jnl
->jhdr
->blhdr_size
/sizeof(block_info
)-1) > (jnl
->jhdr
->size
/jnl
->jhdr
->jhdr_size
)) {
474 LFHFS_LOG(LEVEL_ERROR
, "jnl: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jnl
->jhdr
->size
,
475 jnl
->jhdr
->blhdr_size
, jnl
->jhdr
->jhdr_size
);
479 lf_lck_mtx_init(&jnl
->jlock
);
480 lf_lck_mtx_init(&jnl
->flock
);
481 lf_lck_rw_init(&jnl
->trim_lock
);
483 goto journal_open_complete
;
486 hfs_free(jnl
->header_buf
);
490 journal_open_complete
:
494 journal
*journal_create(struct vnode
*jvp
,
500 int32_t tbuffer_size
,
501 void (*flush
)(void *arg
),
503 struct mount
*fsmount
) {
506 uint32_t phys_blksz
, new_txn_base
;
510 * Cap the journal max size to 2GB. On HFS, it will attempt to occupy
511 * a full allocation block if the current size is smaller than the allocation
512 * block on which it resides. Once we hit the exabyte filesystem range, then
513 * it will use 2GB allocation blocks. As a result, make the cap 2GB.
516 /* Get the real physical block size. */
517 if (ioctl(jvp
->psFSRecord
->iFD
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
)) {
518 goto cleanup_jdev_name
;
521 if (journal_size
< (256*1024) || journal_size
> (MAX_JOURNAL_SIZE
)) {
522 LFHFS_LOG(LEVEL_ERROR
, "jnl: create: journal size %lld looks bogus.\n", journal_size
);
523 goto cleanup_jdev_name
;
526 min_size
= phys_blksz
* (phys_blksz
/ sizeof(block_info
));
527 /* Reject journals that are too small given the sector size of the device */
528 if (journal_size
< min_size
) {
529 LFHFS_LOG(LEVEL_ERROR
, "jnl: create: journal size (%lld) too small given sector size of (%u)\n",
530 journal_size
, phys_blksz
);
531 goto cleanup_jdev_name
;
534 if (phys_blksz
> min_fs_blksz
) {
535 LFHFS_LOG(LEVEL_ERROR
, "jnl: create: error: phys blksize %u bigger than min fs blksize %zd\n",
536 phys_blksz
, min_fs_blksz
);
537 goto cleanup_jdev_name
;
540 if ((journal_size
% phys_blksz
) != 0) {
541 LFHFS_LOG(LEVEL_ERROR
, "jnl: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
542 journal_size
, phys_blksz
);
543 goto cleanup_jdev_name
;
547 jnl
= hfs_mallocz(sizeof(struct journal
));
550 jnl
->jdev_offset
= offset
;
551 jnl
->jdev_blknum
= (uint32_t)(offset
/ min_fs_blksz
);
554 jnl
->flush_arg
= arg
;
555 jnl
->flags
= (flags
& JOURNAL_OPTION_FLAGS_MASK
);
556 lf_lck_mtx_init(&jnl
->old_start_lock
);
558 // Keep a point to the mount around for use in IO throttling.
559 jnl
->fsmount
= fsmount
;
561 get_io_info(jvp
, phys_blksz
, jnl
);
563 jnl
->header_buf
= hfs_malloc(phys_blksz
);
564 jnl
->header_buf_size
= phys_blksz
;
566 jnl
->jhdr
= (journal_header
*)jnl
->header_buf
;
567 memset(jnl
->jhdr
, 0, sizeof(journal_header
));
569 // we have to set this up here so that do_journal_io() will work
570 jnl
->jhdr
->jhdr_size
= phys_blksz
;
573 // We try and read the journal header to see if there is already one
574 // out there. If there is, it's possible that it has transactions
575 // in it that we might replay if we happen to pick a sequence number
576 // that is a little less than the old one, there is a crash and the
577 // last txn written ends right at the start of a txn from the previous
578 // incarnation of this file system. If all that happens we would
579 // replay the transactions from the old file system and that would
580 // destroy your disk. Although it is extremely unlikely for all those
581 // conditions to happen, the probability is non-zero and the result is
582 // severe - you lose your file system. Therefore if we find a valid
583 // journal header and the sequence number is non-zero we write junk
584 // over the entire journal so that there is no way we will encounter
585 // any old transactions. This is slow but should be a rare event
586 // since most tools erase the journal.
588 if ( read_journal_header(jnl
, jnl
->jhdr
, phys_blksz
) == phys_blksz
589 && jnl
->jhdr
->magic
== JOURNAL_HEADER_MAGIC
590 && jnl
->jhdr
->sequence_num
!= 0) {
592 new_txn_base
= (jnl
->jhdr
->sequence_num
+ (journal_size
/ phys_blksz
) + (random() % 16384)) & 0x00ffffff;
593 LFHFS_LOG(LEVEL_ERROR
, "jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl
->jhdr
->sequence_num
, new_txn_base
);
596 new_txn_base
= random() & 0x00ffffff;
599 memset(jnl
->header_buf
, 0, phys_blksz
);
601 jnl
->jhdr
->magic
= JOURNAL_HEADER_MAGIC
;
602 jnl
->jhdr
->endian
= ENDIAN_MAGIC
;
603 jnl
->jhdr
->start
= phys_blksz
; // start at block #1, block #0 is for the jhdr itself
604 jnl
->jhdr
->end
= phys_blksz
;
605 jnl
->jhdr
->size
= journal_size
;
606 jnl
->jhdr
->jhdr_size
= phys_blksz
;
607 size_up_tbuffer(jnl
, tbuffer_size
, phys_blksz
);
609 jnl
->active_start
= jnl
->jhdr
->start
;
611 jnl
->jhdr
->sequence_num
= new_txn_base
;
613 lf_lck_mtx_init(&jnl
->jlock
);
614 lf_lck_mtx_init(&jnl
->flock
);
615 lf_lck_rw_init(&jnl
->trim_lock
);
617 lf_cond_init(&jnl
->flushing
.sCond
);
618 lf_cond_init(&jnl
->asyncIO
.sCond
);
619 lf_cond_init(&jnl
->writing_header
.sCond
);
620 jnl
->flush_aborted
= FALSE
;
621 jnl
->async_trim
= NULL
;
622 jnl
->sequence_num
= jnl
->jhdr
->sequence_num
;
624 if (write_journal_header(jnl
, 1, jnl
->jhdr
->sequence_num
) != 0) {
625 LFHFS_LOG(LEVEL_ERROR
, "jnl: journal_create: failed to write journal header.\n");
629 goto journal_create_complete
;
633 hfs_free(jnl
->header_buf
);
638 journal_create_complete
:
644 void *journal_owner(journal
*jnl
) {
648 /* Is the given cnode either the .journal or .journal_info_block file on
649 * a volume with an active journal? Many VNOPs use this to deny access
652 * Note: the .journal file on a volume with an external journal still
653 * returns true here, even though it does not actually hold the contents
654 * of the volume's journal.
656 _Bool
hfs_is_journal_file(struct hfsmount
*hfsmp
, struct cnode
*cp
) {
657 if (hfsmp
->jnl
!= NULL
&&
658 (cp
->c_fileid
== hfsmp
->hfs_jnlinfoblkid
||
659 cp
->c_fileid
== hfsmp
->hfs_jnlfileid
)) {
666 bool is_journaled(UVFSFileNode
*psRootNode
) {
668 struct vnode
*psRootVnode
= *psRootNode
;
671 LFHFS_LOG(LEVEL_DEBUG
, "is_journaled: psRootNode is NULL");
675 if (!psRootVnode
->sFSParams
.vnfs_mp
) {
676 LFHFS_LOG(LEVEL_DEBUG
, "is_journaled: psRootVnode->sFSParams.vnfs_mp is NULL");
680 if (psRootVnode
->sFSParams
.vnfs_mp
->psHfsmount
->jnl
)
687 // Media no longer available, clear all memory occupied by the journal
688 void journal_release(journal
*jnl
) {
689 if (jnl
->owner
!= pthread_self()) {
693 if (jnl
->active_tr
) {
694 abort_transaction(jnl
, jnl
->active_tr
);
698 abort_transaction(jnl
, jnl
->cur_tr
);
703 hfs_free(jnl
->header_buf
);
704 jnl
->jhdr
= (void *)0xbeefbabe;
707 lf_lck_mtx_destroy(&jnl
->old_start_lock
);
708 lf_lck_mtx_destroy(&jnl
->jlock
);
709 lf_lck_mtx_destroy(&jnl
->flock
);
714 void journal_close(journal
*jnl
) {
715 volatile off_t
*start
, *end
;
720 // set this before doing anything that would block so that
721 // we start tearing things down properly.
723 jnl
->flags
|= JOURNAL_CLOSE_PENDING
;
725 if (jnl
->owner
!= pthread_self()) {
729 wait_condition(jnl
, &jnl
->flushing
, "journal_close");
732 // only write stuff to disk if the journal is still valid
734 if ((jnl
->flags
& JOURNAL_INVALID
) == 0) {
736 if (jnl
->active_tr
) {
738 * "journal_end_transaction" will fire the flush asynchronously
740 journal_end_transaction(jnl
);
743 // flush any buffered transactions
745 transaction
*tr
= jnl
->cur_tr
;
749 * "end_transaction" will wait for any in-progress flush to complete
750 * before flushing "cur_tr" synchronously("must_wait" == TRUE)
752 end_transaction(tr
, 1, NULL
, NULL
, FALSE
);
755 * if there was an "active_tr", make sure we wait for
756 * it to flush if there was no "cur_tr" to process
758 wait_condition(jnl
, &jnl
->flushing
, "journal_close");
760 //start = &jnl->jhdr->start;
761 start
= &jnl
->active_start
;
762 end
= &jnl
->jhdr
->end
;
764 while (*start
!= *end
&& counter
++ < 5000) {
765 //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
767 jnl
->flush(jnl
->flush_arg
);
772 if (*start
!= *end
) {
773 LFHFS_LOG(LEVEL_ERROR
, "jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
777 // make sure this is in sync when we close the journal
778 jnl
->jhdr
->start
= jnl
->active_start
;
780 // if this fails there's not much we can do at this point...
781 write_journal_header(jnl
, 1, jnl
->sequence_num
);
783 // if we're here the journal isn't valid any more.
784 // so make sure we don't leave any locked blocks lying around
785 LFHFS_LOG(LEVEL_ERROR
, "jnl: close: journal is invalid. aborting outstanding transactions\n");
786 if (jnl
->active_tr
|| jnl
->cur_tr
) {
789 if (jnl
->active_tr
) {
791 jnl
->active_tr
= NULL
;
796 abort_transaction(jnl
, tr
);
798 if (jnl
->active_tr
|| jnl
->cur_tr
) {
799 panic("jnl: close: jnl @ %p had both an active and cur tr\n", jnl
);
803 wait_condition(jnl
, &jnl
->asyncIO
, "journal_close");
807 hfs_free(jnl
->header_buf
);
808 jnl
->jhdr
= (void *)0xbeefbabe;
811 lf_lck_mtx_destroy(&jnl
->old_start_lock
);
812 lf_lck_mtx_destroy(&jnl
->jlock
);
813 lf_lck_mtx_destroy(&jnl
->flock
);
817 // This function performs the following:
818 // 1) Checks that we have a valid journal
819 // 2) locks the journal
820 // 3) Allocates roon in the journal
821 int journal_start_transaction(journal
*jnl
) {
826 printf("journal_start_transaction (%u).\n", jnl
->nested_count
);
833 if (jnl
->flags
& JOURNAL_INVALID
) {
837 if (jnl
->owner
== pthread_self()) {
838 if (jnl
->active_tr
== NULL
) {
839 panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
840 jnl
, jnl
->owner
, pthread_self());
848 if (jnl
->nested_count
!= 0 || jnl
->active_tr
!= NULL
) {
849 panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
850 jnl
->owner
, jnl
->nested_count
, jnl
->active_tr
, jnl
);
853 jnl
->nested_count
= 1;
855 // if there's a buffered transaction, use it.
857 jnl
->active_tr
= jnl
->cur_tr
;
863 ret
= journal_allocate_transaction(jnl
);
868 // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);
873 jnl
->nested_count
= 0;
878 // journal_end_transaction
879 // This function does the following:
880 // 1) Validates journal status/state
882 int journal_end_transaction(journal
*jnl
) {
887 printf("journal_end_transaction (%u).\n", jnl
->nested_count
-1);
894 if ((jnl
->flags
& JOURNAL_INVALID
) && jnl
->owner
== NULL
) {
898 if (jnl
->owner
!= pthread_self()) {
899 panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
900 jnl
, jnl
->owner
, pthread_self());
904 if (jnl
->nested_count
> 0) {
906 } else if (jnl
->nested_count
< 0) {
907 panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl
, jnl
->nested_count
);
910 if (jnl
->flags
& JOURNAL_INVALID
) {
911 if (jnl
->active_tr
) {
912 if (jnl
->cur_tr
!= NULL
) {
913 panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
914 jnl
, jnl
->active_tr
, jnl
->cur_tr
);
917 jnl
->active_tr
= NULL
;
919 abort_transaction(jnl
, tr
);
927 CHECK_TRANSACTION(tr
);
929 // clear this out here so that when check_free_space() calls
930 // the FS flush function, we don't panic in journal_flush()
931 // if the FS were to call that. note: check_free_space() is
932 // called from end_transaction().
933 jnl
->active_tr
= NULL
;
935 /* Examine the force-journal-flush state in the active txn */
936 if (tr
->flush_on_completion
== TRUE
) {
938 * If the FS requested it, disallow group commit and force the
939 * transaction out to disk immediately.
941 ret
= end_transaction(tr
, 1, NULL
, NULL
, TRUE
);
944 /* in the common path we can simply use the double-buffered journal */
945 ret
= end_transaction(tr
, 0, NULL
, NULL
, TRUE
);
951 // journal_modify_block_start
952 // This function does the following:
953 // 1) Makes sure the journal file is on and valid
954 // 2) Clean up (free previous transactions)
955 // 3) Validate that the phy-block-size has not changed.
956 // 4) Locks the buffer.
957 // Buffer life cycle with journal:
958 // 1) Client code (ie btrees_io.c) allocates a buffer (ie gains ownership). Other threads will pend on using this buffer until it is released.
959 // 2) Client code calls journal_modify_block_start which sets the GEN_BUF_WRITE_LOCK uCacheFlag.
960 // 3) Client code modifies the buffer.
961 // 4) Client code calls journal_modify_block_end which released the buffer. The GEN_BUF_WRITE_LOCK flag remains set.
962 // It this point other threads are welcomed to modify the buffer (after executing steps 1 and 2 above). The buffer content will not be written to media before transaction_end, thus only the accumulative change of both threads after transaction_end will be committed.
963 // 5) transaction-end (called from within client-code or async Sync) obtains ownership on in transaction buffers. By doing that it makes sure no buffer is currently being modified by any Client code. It then prepares the buffer for commiting (ie realigns endianizm), and commits (writes to the t-buffer, write the t-buffer to media, updates journal-info, clears the GEN_BUF_WRITE_LOCK flags and writes the buffers to media).
964 int journal_modify_block_start(journal
*jnl
, GenericLFBuf
*psGenBuf
) {
969 printf("journal_modify_block_start: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uCacheFlags 0x%llx, uPhyCluster %llu, uLockCnt %u\n",
970 psGenBuf
, psGenBuf
->psVnode
, psGenBuf
->uBlockN
, psGenBuf
->uDataSize
, psGenBuf
->uCacheFlags
,psGenBuf
->uPhyCluster
, psGenBuf
->uLockCnt
);
977 if (jnl
->flags
& JOURNAL_INVALID
) {
982 CHECK_TRANSACTION(tr
);
984 if (jnl
->owner
!= pthread_self()) {
985 panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
986 jnl
, jnl
->owner
, pthread_self());
989 //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
990 // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);
992 // can't allow blocks that aren't an even multiple of the
993 // underlying block size.
994 if ((psGenBuf
->uDataSize
% jnl
->jhdr
->jhdr_size
) != 0) {
998 if (ioctl(jnl
->jdev
->psFSRecord
->iFD
, DKIOCGETBLOCKSIZE
, (caddr_t
)&phys_blksz
)) {
1000 } else if (phys_blksz
!= (uint32_t)jnl
->jhdr
->jhdr_size
) {
1001 if (phys_blksz
< 512) {
1002 panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n",
1003 phys_blksz
, psGenBuf
->uDataSize
, jnl
->jhdr
->jhdr_size
);
1006 if ((psGenBuf
->uDataSize
% phys_blksz
) != 0) {
1008 } else if (phys_blksz
< (uint32_t)jnl
->jhdr
->jhdr_size
) {
1009 jnl
->jhdr
->jhdr_size
= phys_blksz
;
1011 // the phys_blksz is now larger... need to realloc the jhdr
1012 char *new_header_buf
;
1014 LFHFS_LOG(LEVEL_ERROR
, "jnl: phys blksz got bigger (was: %d/%d now %d)\n",
1015 jnl
->header_buf_size
, jnl
->jhdr
->jhdr_size
, phys_blksz
);
1016 new_header_buf
= hfs_malloc(phys_blksz
);
1017 memcpy(new_header_buf
, jnl
->header_buf
, jnl
->header_buf_size
);
1018 memset(&new_header_buf
[jnl
->header_buf_size
], 0x18, (phys_blksz
- jnl
->header_buf_size
));
1019 hfs_free(jnl
->header_buf
);
1020 jnl
->header_buf
= new_header_buf
;
1021 jnl
->header_buf_size
= phys_blksz
;
1023 jnl
->jhdr
= (journal_header
*)jnl
->header_buf
;
1024 jnl
->jhdr
->jhdr_size
= phys_blksz
;
1031 panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
1032 psGenBuf
->uDataSize
, jnl
->jhdr
->jhdr_size
);
1038 // make sure that this transaction isn't bigger than the whole journal
1039 if ((tr
->total_bytes
+psGenBuf
->uDataSize
) >= (size_t)(jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
)) {
1040 panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
1041 tr
->total_bytes
, (tr
->jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
), psGenBuf
->uDataSize
, tr
, psGenBuf
->pvData
);
1046 lf_hfs_generic_buf_set_cache_flag(psGenBuf
, GEN_BUF_WRITE_LOCK
);
1050 // journal_modify_block_end
1051 // This function does the following:
1052 // 1) Makes sure the journal file is on and valid
1053 // 2) Clean up (free previous transactions)
1054 // 3) Check if this block already exists in transaction
1055 // 4) Add block number to transaction. We don't add the block data, nor do we release the buffer at this point.
1056 // This will be done later on, at the transaction-end.
1057 int journal_modify_block_end(journal
*jnl
, GenericLFBuf
*psGenBuf
,
1058 void (*func
)(GenericLFBuf
*bp
, void *arg
), void *arg
) {
1060 size_t tbuffer_offset
=0;
1061 block_list_header
*blhdr
, *prev
=NULL
;
1062 transaction
*tr
= NULL
;
1065 printf("journal_modify_block_end: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n",
1066 psGenBuf
, psGenBuf
->psVnode
, psGenBuf
->uBlockN
, psGenBuf
->uDataSize
, psGenBuf
->uPhyCluster
, psGenBuf
->uLockCnt
);
1071 free_old_stuff(jnl
);
1074 psGenBuf
->pfFunc
= func
;
1075 psGenBuf
->pvCallbackArgs
= arg
;
1078 if (jnl
->flags
& JOURNAL_INVALID
) {
1079 /* Still need to buf_brelse(). Callers assume we consume the bp. */
1080 lf_hfs_generic_buf_clear_cache_flag(psGenBuf
, GEN_BUF_WRITE_LOCK
);
1081 lf_hfs_generic_buf_release(psGenBuf
);
1085 tr
= jnl
->active_tr
;
1086 CHECK_TRANSACTION(tr
);
1088 if (jnl
->owner
!= pthread_self()) {
1089 panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
1090 jnl
, jnl
->owner
, pthread_self());
1093 if ((psGenBuf
->uCacheFlags
& GEN_BUF_WRITE_LOCK
) == 0) {
1094 panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", psGenBuf
, jnl
);
1097 // first check if this block is already part of this transaction
1098 for (blhdr
= tr
->blhdr
; blhdr
; prev
= blhdr
, blhdr
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
1099 tbuffer_offset
= jnl
->jhdr
->blhdr_size
;
1101 for (i
= 1; i
< blhdr
->num_blocks
; i
++) {
1102 GenericLFBuf
*bp
= (void*)blhdr
->binfo
[i
].u
.bp
;
1103 if (psGenBuf
== bp
) {
1104 // Block found in transaction
1106 printf("block_end, already in journal: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n",
1107 psGenBuf
, psGenBuf
->psVnode
, psGenBuf
->uBlockN
, psGenBuf
->uDataSize
, psGenBuf
->uPhyCluster
, psGenBuf
->uLockCnt
);
1111 if (blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
1112 off_t uSizeOfBuf
= ((GenericLFBuf
*)(blhdr
->binfo
[i
].u
.bp
))->uDataSize
;
1113 tbuffer_offset
+= uSizeOfBuf
;
1115 tbuffer_offset
+= blhdr
->binfo
[i
].u
.bi
.bsize
;
1119 if (i
< blhdr
->num_blocks
) {
1126 && (prev
->num_blocks
+1) <= prev
->max_blocks
1127 && (prev
->bytes_used
+psGenBuf
->uDataSize
) <= (uint32_t)tr
->tbuffer_size
) {
1128 // Block not found, add to last list
1131 } else if (blhdr
== NULL
) {
1132 block_list_header
*nblhdr
;
1134 panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, psGenBuf %p\n", jnl
, psGenBuf
);
1136 // Add another tbuffer:
1138 // we got to the end of the list, didn't find the block and there's
1139 // no room in the block_list_header pointed to by prev
1141 // we allocate another tbuffer and link it in at the end of the list
1142 // through prev->binfo[0].bnum. that's a skanky way to do things but
1143 // avoids having yet another linked list of small data structures to manage.
1145 nblhdr
= hfs_malloc(tr
->tbuffer_size
);
1147 // journal replay code checksum check depends on this.
1148 memset(nblhdr
, 0, BLHDR_CHECKSUM_SIZE
);
1149 // Fill up the rest of the block with unimportant bytes
1150 memset(nblhdr
+ BLHDR_CHECKSUM_SIZE
, 0x5a, jnl
->jhdr
->blhdr_size
- BLHDR_CHECKSUM_SIZE
);
1152 // initialize the new guy
1153 nblhdr
->max_blocks
= (jnl
->jhdr
->blhdr_size
/ sizeof(block_info
)) - 1;
1154 nblhdr
->num_blocks
= 1; // accounts for this header block
1155 nblhdr
->bytes_used
= (uint32_t)jnl
->jhdr
->blhdr_size
;
1156 nblhdr
->flags
= BLHDR_CHECK_CHECKSUMS
;
1159 tr
->total_bytes
+= jnl
->jhdr
->blhdr_size
;
1161 // then link him in at the end
1162 prev
->binfo
[0].bnum
= (off_t
)((long)nblhdr
);
1164 // and finally switch to using the new guy
1169 if ((i
+1) > blhdr
->max_blocks
) {
1170 panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i
, blhdr
->max_blocks
);
1173 // if this is true then this is a new block we haven't seen before
1174 if (i
>= blhdr
->num_blocks
) {
1176 bsize
= psGenBuf
->uDataSize
;
1178 // Add block to list
1179 blhdr
->binfo
[i
].bnum
= (off_t
)(psGenBuf
->uBlockN
);
1180 blhdr
->binfo
[i
].u
.bp
= (void*)psGenBuf
;
1182 blhdr
->bytes_used
+= bsize
;
1183 tr
->total_bytes
+= bsize
;
1185 blhdr
->num_blocks
++;
1188 // We can release the block here to allow other threads to perform operations on it until the next transaction-end.
1189 // The buffer will not be removed from cache since it is write-locked.
1190 lf_hfs_generic_buf_release(psGenBuf
);
1195 // This function validates if a block is already registered to a transaction
1197 * Flush the contents of the journal to the disk.
1201 * If TRUE, wait to write in-memory journal to the disk
1202 * consistently, and also wait to write all asynchronous
1203 * metadata blocks to its corresponding locations
1204 * consistently on the disk. This means that the journal
1205 * is empty at this point and does not contain any
1206 * transactions. This is overkill in normal scenarios
1207 * but is useful whenever the metadata blocks are required
1208 * to be consistent on-disk instead of just the journal
1209 * being consistent; like before live verification
1210 * and live volume resizing.
1212 * If FALSE, only wait to write in-memory journal to the
1213 * disk consistently. This means that the journal still
1214 * contains uncommitted transactions and the file system
1215 * metadata blocks in the journal transactions might be
1216 * written asynchronously to the disk. But there is no
1217 * guarantee that they are written to the disk before
1218 * returning to the caller. Note that this option is
1219 * sufficient for file system data integrity as it
1220 * guarantees consistent journal content on the disk.
1222 int journal_flush(journal
*jnl
, journal_flush_options_t options
) {
1223 boolean_t drop_lock
= FALSE
;
1225 uint32_t flush_count
= 0;
1229 free_old_stuff(jnl
);
1231 if (jnl
->flags
& JOURNAL_INVALID
) {
1235 if (jnl
->owner
!= pthread_self()) {
1240 if (ISSET(options
, JOURNAL_FLUSH_FULL
))
1241 flush_count
= jnl
->flush_counter
;
1243 // if we're not active, flush any buffered transactions
1244 if (jnl
->active_tr
== NULL
&& jnl
->cur_tr
) {
1245 transaction
*tr
= jnl
->cur_tr
;
1249 if (ISSET(options
, JOURNAL_WAIT_FOR_IO
)) {
1250 wait_condition(jnl
, &jnl
->flushing
, "journal_flush");
1251 wait_condition(jnl
, &jnl
->asyncIO
, "journal_flush");
1254 // As the journal flush changes the MetaData content (update Endianizm), we need to lock the system times.
1255 int lockflags
= hfs_systemfile_lock(jnl
->fsmount
->psHfsmount
, SFL_CATALOG
| SFL_ATTRIBUTE
| SFL_EXTENTS
| SFL_BITMAP
, HFS_EXCLUSIVE_LOCK
);
1258 * "end_transction" will wait for any current async flush
1259 * to complete, before flushing "cur_tr"... because we've
1260 * specified the 'must_wait' arg as TRUE, it will then
1261 * synchronously flush the "cur_tr"
1263 end_transaction(tr
, 1, NULL
, NULL
, drop_lock
); // force it to get flushed
1265 hfs_systemfile_unlock(jnl
->fsmount
->psHfsmount
, lockflags
);
1268 if (drop_lock
== TRUE
) {
1269 journal_unlock(jnl
);
1272 /* Because of pipelined journal, the journal transactions
1273 * might be in process of being flushed on another thread.
1274 * If there is nothing to flush currently, we should
1275 * synchronize ourselves with the pipelined journal thread
1276 * to ensure that all inflight transactions, if any, are
1277 * flushed before we return success to caller.
1279 wait_condition(jnl
, &jnl
->flushing
, "journal_flush");
1281 if (ISSET(options
, JOURNAL_WAIT_FOR_IO
)) {
1282 wait_condition(jnl
, &jnl
->asyncIO
, "journal_flush");
1285 if (ISSET(options
, JOURNAL_FLUSH_FULL
)) {
1287 dk_synchronize_t sync_request
= {
1291 // We need a full cache flush. If it has not been done, do it here.
1292 if (flush_count
== jnl
->flush_counter
)
1293 error
= ioctl(jnl
->jdev
->psFSRecord
->iFD
, DKIOCSYNCHRONIZE
, (caddr_t
)&sync_request
);
1295 // If external journal partition is enabled, flush filesystem data partition.
1296 if (jnl
->jdev
!= jnl
->fsdev
)
1297 error
= ioctl(jnl
->jdev
->psFSRecord
->iFD
, DKIOCSYNCHRONIZE
, (caddr_t
)&sync_request
);
1305 // ************************** Local Functions ***********************
1306 static int update_fs_block(journal
*jnl
, void *block_ptr
, off_t fs_block
, size_t bsize
) {
1309 GenericLFBuf
*psGenBuf
= NULL
;
1311 // first read the block we want.
1312 psGenBuf
= lf_hfs_generic_buf_allocate(jnl
->fsmount
->psHfsmount
->hfs_devvp
,
1315 GEN_BUF_PHY_BLOCK
| GEN_BUF_NON_CACHED
);
1317 LFHFS_LOG(LEVEL_ERROR
, "jnl: update_fs_block: error allocating fs block # %lld!\n", fs_block
);
1322 iRet
= lf_hfs_generic_buf_read(psGenBuf
);
1324 LFHFS_LOG(LEVEL_ERROR
, "jnl: update_fs_block: error reading fs block # %lld!\n", fs_block
);
1328 // copy the journal data over top of it
1329 memcpy(psGenBuf
->pvData
, block_ptr
, bsize
);
1331 iRet
= lf_hfs_generic_buf_write(psGenBuf
);
1333 LFHFS_LOG(LEVEL_ERROR
, "jnl: update_fs_block: failed to write block %lld (ret %d)\n", fs_block
, iRet
);
1339 lf_hfs_generic_buf_release(psGenBuf
);
1346 static int grow_table(struct bucket
**buf_ptr
, int num_buckets
, int new_size
) {
1347 struct bucket
*newBuf
;
1348 int current_size
= num_buckets
, i
;
1350 // return if newsize is less than the current size
1351 if (new_size
< num_buckets
) {
1352 return current_size
;
1355 newBuf
= hfs_malloc(new_size
*sizeof(struct bucket
));
1357 // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
1359 // copy existing elements
1360 bcopy(*buf_ptr
, newBuf
, num_buckets
*sizeof(struct bucket
));
1362 // initialize the new ones
1363 for(i
= num_buckets
; i
< new_size
; i
++) {
1364 newBuf
[i
].block_num
= (off_t
)-1;
1367 // free the old container
1370 // reset the buf_ptr
1377 static int insert_block(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t num
, size_t size
, size_t offset
, int32_t cksum
, int *num_buckets_ptr
, int *num_full_ptr
, int overwriting
) {
1380 // grow the table if we're out of space - we may index the table
1381 // with *num_full_ptr (lookup_bucket() can return a maximum value ==
1382 // *num_full_ptr), so we need to grow when we hit (*num_buckets_ptr - 1)
1383 // to prevent out-of-bounds indexing
1384 if (*num_full_ptr
>= (*num_buckets_ptr
- 1)) {
1385 int new_size
= *num_buckets_ptr
* 2;
1386 int grow_size
= grow_table(buf_ptr
, *num_buckets_ptr
, new_size
);
1388 if (grow_size
< new_size
) {
1389 LFHFS_LOG(LEVEL_ERROR
, "jnl: add_block: grow_table returned an error!\n");
1393 *num_buckets_ptr
= grow_size
; //update num_buckets to reflect the new size
1396 // if we're not inserting at the end, we need to bcopy
1397 if (blk_index
!= *num_full_ptr
) {
1398 bcopy( (*buf_ptr
)+(blk_index
), (*buf_ptr
)+(blk_index
+1), (*num_full_ptr
-blk_index
)*sizeof(struct bucket
) );
1401 (*num_full_ptr
)++; // increment only if we're not overwriting
1404 // sanity check the values we're about to add
1405 if ((off_t
)offset
>= jnl
->jhdr
->size
) {
1406 offset
= jnl
->jhdr
->jhdr_size
+ (offset
- jnl
->jhdr
->size
);
1409 panic("jnl: insert_block: bad size in insert_block (%zd)\n", size
);
1412 (*buf_ptr
)[blk_index
].block_num
= num
;
1413 (*buf_ptr
)[blk_index
].block_size
= (uint32_t)size
;
1414 (*buf_ptr
)[blk_index
].jnl_offset
= (uint32_t)offset
;
1415 (*buf_ptr
)[blk_index
].cksum
= cksum
;
1420 static int do_overlap(journal
*jnl
, struct bucket
**buf_ptr
, int blk_index
, off_t block_num
, size_t size
, __unused
size_t offset
, int32_t cksum
, int *num_buckets_ptr
, int *num_full_ptr
) {
1422 int num_to_remove
, index
, i
, overwrite
, err
;
1423 size_t jhdr_size
= jnl
->jhdr
->jhdr_size
, new_offset
;
1424 off_t overlap
, block_start
, block_end
;
1426 block_start
= block_num
*jhdr_size
;
1427 block_end
= block_start
+ size
;
1428 overwrite
= (block_num
== (*buf_ptr
)[blk_index
].block_num
&& size
>= (*buf_ptr
)[blk_index
].block_size
);
1430 // first, eliminate any overlap with the previous entry
1431 if (blk_index
!= 0 && !overwrite
) {
1432 off_t prev_block_start
= (*buf_ptr
)[blk_index
-1].block_num
*jhdr_size
;
1433 off_t prev_block_end
= prev_block_start
+ (*buf_ptr
)[blk_index
-1].block_size
;
1434 overlap
= prev_block_end
- block_start
;
1436 if (overlap
% jhdr_size
!= 0) {
1437 panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size
);
1440 // if the previous entry completely overlaps this one, we need to break it into two pieces.
1441 if (prev_block_end
> block_end
) {
1442 off_t new_num
= block_end
/ jhdr_size
;
1443 size_t new_size
= prev_block_end
- block_end
;
1445 new_offset
= (*buf_ptr
)[blk_index
-1].jnl_offset
+ (block_end
- prev_block_start
);
1447 err
= insert_block(jnl
, buf_ptr
, blk_index
, new_num
, new_size
, new_offset
, cksum
, num_buckets_ptr
, num_full_ptr
, 0);
1449 panic("jnl: do_overlap: error inserting during pre-overlap\n");
1453 // Regardless, we need to truncate the previous entry to the beginning of the overlap
1454 (*buf_ptr
)[blk_index
-1].block_size
= (uint32_t)(block_start
- prev_block_start
);
1455 (*buf_ptr
)[blk_index
-1].cksum
= 0; // have to blow it away because there's no way to check it
1459 // then, bail out fast if there's no overlap with the entries that follow
1460 if (!overwrite
&& block_end
<= (off_t
)((*buf_ptr
)[blk_index
].block_num
*jhdr_size
)) {
1461 return 0; // no overlap, no overwrite
1462 } else if (overwrite
&& (blk_index
+ 1 >= *num_full_ptr
|| block_end
<= (off_t
)((*buf_ptr
)[blk_index
+1].block_num
*jhdr_size
))) {
1464 (*buf_ptr
)[blk_index
].cksum
= cksum
; // update this
1465 return 1; // simple overwrite
1468 // Otherwise, find all cases of total and partial overlap. We use the special
1469 // block_num of -2 to designate entries that are completely overlapped and must
1470 // be eliminated. The block_num, size, and jnl_offset of partially overlapped
1471 // entries must be adjusted to keep the array consistent.
1474 while (index
< *num_full_ptr
&& block_end
> (off_t
)((*buf_ptr
)[index
].block_num
*jhdr_size
)) {
1475 if (block_end
>= (off_t
)(((*buf_ptr
)[index
].block_num
*jhdr_size
+ (*buf_ptr
)[index
].block_size
))) {
1476 (*buf_ptr
)[index
].block_num
= -2; // mark this for deletion
1479 overlap
= block_end
- (*buf_ptr
)[index
].block_num
*jhdr_size
;
1481 if (overlap
% jhdr_size
!= 0) {
1482 panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap
, jhdr_size
);
1485 // if we partially overlap this entry, adjust its block number, jnl offset, and size
1486 (*buf_ptr
)[index
].block_num
+= (overlap
/ jhdr_size
); // make sure overlap is multiple of jhdr_size, or round up
1487 (*buf_ptr
)[index
].cksum
= 0;
1489 new_offset
= (*buf_ptr
)[index
].jnl_offset
+ overlap
; // check for wrap-around
1490 if ((off_t
)new_offset
>= jnl
->jhdr
->size
) {
1491 new_offset
= jhdr_size
+ (new_offset
- jnl
->jhdr
->size
);
1493 (*buf_ptr
)[index
].jnl_offset
= (uint32_t)new_offset
;
1495 (*buf_ptr
)[index
].block_size
-= overlap
; // sanity check for negative value
1496 if ((*buf_ptr
)[index
].block_size
<= 0) {
1497 panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr
)[index
].block_size
);
1498 // return -1; // if above panic is removed, return -1 for error
1507 // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
1508 index
--; // start with the last index used within the above loop
1509 while (index
>= blk_index
) {
1510 if ((*buf_ptr
)[index
].block_num
== -2) {
1511 if (index
== *num_full_ptr
-1) {
1512 (*buf_ptr
)[index
].block_num
= -1; // it's the last item in the table... just mark as free
1514 bcopy( (*buf_ptr
)+(index
+1), (*buf_ptr
)+(index
), (*num_full_ptr
- (index
+ 1)) * sizeof(struct bucket
) );
1521 // eliminate any stale entries at the end of the table
1522 for(i
= *num_full_ptr
; i
< (*num_full_ptr
+ num_to_remove
); i
++) {
1523 (*buf_ptr
)[i
].block_num
= -1;
1526 return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
1530 static int lookup_bucket(struct bucket
**buf_ptr
, off_t block_num
, int num_full
) {
1531 int lo
, hi
, index
, matches
, i
;
1533 if (num_full
== 0) {
1534 return 0; // table is empty, so insert at index=0
1541 // perform binary search for block_num
1543 int mid
= (hi
- lo
)/2 + lo
;
1544 off_t this_num
= (*buf_ptr
)[mid
].block_num
;
1546 if (block_num
== this_num
) {
1551 if (block_num
< this_num
) {
1556 if (block_num
> this_num
) {
1562 // check if lo and hi converged on the match
1563 if (block_num
== (*buf_ptr
)[hi
].block_num
) {
1567 // if no existing entry found, find index for new one
1569 index
= (block_num
< (*buf_ptr
)[hi
].block_num
) ? hi
: hi
+ 1;
1571 // make sure that we return the right-most index in the case of multiple matches
1574 while (i
< num_full
&& block_num
== (*buf_ptr
)[i
].block_num
) {
1585 // PR-3105942: Coalesce writes to the same block in journal replay
1586 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
1587 // to be replayed and the corresponding location in the journal which contains
1588 // the most recent data for those blocks. The array is "played" once the all the
1589 // blocks in the journal have been coalesced. The code for the case of conflicting/
1590 // overlapping writes to a single block is the most dense. Because coalescing can
1591 // disrupt the existing time-ordering of blocks in the journal playback, care
1592 // is taken to catch any overlaps and keep the array consistent.
1593 static int add_block(journal
*jnl
, struct bucket
**buf_ptr
, off_t block_num
, size_t size
, size_t offset
, int32_t cksum
, int *num_buckets_ptr
, int *num_full_ptr
) {
1594 int blk_index
, overwriting
;
1596 // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
1597 // inserted (or the index of the elem to overwrite).
1598 blk_index
= lookup_bucket( buf_ptr
, block_num
, *num_full_ptr
);
1600 // check if the index is within bounds (if we're adding this block to the end of
1601 // the table, blk_index will be equal to num_full)
1602 if (blk_index
< 0 || blk_index
> *num_full_ptr
) {
1603 //printf("jnl: add_block: trouble adding block to co_buf\n");
1605 } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
1607 // Determine whether we're overwriting an existing entry by checking for overlap
1608 overwriting
= do_overlap(jnl
, buf_ptr
, blk_index
, block_num
, size
, offset
, cksum
, num_buckets_ptr
, num_full_ptr
);
1609 if (overwriting
< 0) {
1610 return -1; // if we got an error, pass it along
1613 // returns the index, or -1 on error
1614 blk_index
= insert_block(jnl
, buf_ptr
, blk_index
, block_num
, size
, offset
, cksum
, num_buckets_ptr
, num_full_ptr
, overwriting
);
1619 static void swap_block_list_header(journal
*jnl
, block_list_header
*blhdr
) {
1622 blhdr
->max_blocks
= SWAP16(blhdr
->max_blocks
);
1623 blhdr
->num_blocks
= SWAP16(blhdr
->num_blocks
);
1624 blhdr
->bytes_used
= SWAP32(blhdr
->bytes_used
);
1625 blhdr
->checksum
= SWAP32(blhdr
->checksum
);
1626 blhdr
->flags
= SWAP32(blhdr
->flags
);
1628 if (blhdr
->num_blocks
>= ((jnl
->jhdr
->blhdr_size
/ sizeof(block_info
)) - 1)) {
1629 LFHFS_LOG(LEVEL_ERROR
, "jnl: blhdr num blocks looks suspicious (%d / blhdr size %d). not swapping.\n", blhdr
->num_blocks
, jnl
->jhdr
->blhdr_size
);
1633 for(i
= 0; i
< blhdr
->num_blocks
; i
++) {
1634 blhdr
->binfo
[i
].bnum
= SWAP64(blhdr
->binfo
[i
].bnum
);
1635 blhdr
->binfo
[i
].u
.bi
.bsize
= SWAP32(blhdr
->binfo
[i
].u
.bi
.bsize
);
1636 blhdr
->binfo
[i
].u
.bi
.b
.cksum
= SWAP32(blhdr
->binfo
[i
].u
.bi
.b
.cksum
);
1640 static int replay_journal(journal
*jnl
) {
1641 int i
, bad_blocks
=0;
1642 unsigned int orig_checksum
, checksum
, check_block_checksums
= 0;
1644 size_t max_bsize
= 0; /* protected by block_ptr */
1645 block_list_header
*blhdr
;
1646 off_t offset
, txn_start_offset
=0, blhdr_offset
, orig_jnl_start
;
1647 char *buff
, *block_ptr
=NULL
;
1648 struct bucket
*co_buf
;
1649 int num_buckets
= STARTING_BUCKETS
, num_full
, check_past_jnl_end
= 1, in_uncharted_territory
= 0;
1650 uint32_t last_sequence_num
= 0;
1651 int replay_retry_count
= 0;
1653 LFHFS_LOG(LEVEL_DEFAULT
, "replay_journal: start.\n");
1656 // wrap the start ptr if it points to the very end of the journal
1657 if (jnl
->jhdr
->start
== jnl
->jhdr
->size
) {
1658 jnl
->jhdr
->start
= jnl
->jhdr
->jhdr_size
;
1660 if (jnl
->jhdr
->end
== jnl
->jhdr
->size
) {
1661 jnl
->jhdr
->end
= jnl
->jhdr
->jhdr_size
;
1664 if (jnl
->jhdr
->start
== jnl
->jhdr
->end
) {
1665 LFHFS_LOG(LEVEL_DEFAULT
, "replay_journal: journal empty.\n");
1669 orig_jnl_start
= jnl
->jhdr
->start
;
1671 // allocate memory for the header_block. we'll read each blhdr into this
1672 buff
= hfs_malloc(jnl
->jhdr
->blhdr_size
);
1674 // allocate memory for the coalesce buffer
1675 co_buf
= hfs_malloc(num_buckets
*sizeof(struct bucket
));
1679 // initialize entries
1680 for(i
= 0; i
< num_buckets
; i
++) {
1681 co_buf
[i
].block_num
= -1;
1683 num_full
= 0; // empty at first
1686 while (check_past_jnl_end
|| jnl
->jhdr
->start
!= jnl
->jhdr
->end
) {
1687 offset
= blhdr_offset
= jnl
->jhdr
->start
;
1688 ret
= read_journal_data(jnl
, &offset
, buff
, jnl
->jhdr
->blhdr_size
);
1689 if (ret
!= (size_t)jnl
->jhdr
->blhdr_size
) {
1690 LFHFS_LOG(LEVEL_ERROR
, "jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset
);
1691 goto bad_txn_handling
;
1694 blhdr
= (block_list_header
*)buff
;
1696 orig_checksum
= blhdr
->checksum
;
1697 blhdr
->checksum
= 0;
1698 if (jnl
->flags
& JOURNAL_NEED_SWAP
) {
1699 // calculate the checksum based on the unswapped data
1700 // because it is done byte-at-a-time.
1701 orig_checksum
= (unsigned int)SWAP32(orig_checksum
);
1702 checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
1703 swap_block_list_header(jnl
, blhdr
);
1705 checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
1710 // XXXdbg - if these checks fail, we should replay as much
1711 // we can in the hopes that it will still leave the
1712 // drive in a better state than if we didn't replay
1715 if (checksum
!= orig_checksum
) {
1716 if (check_past_jnl_end
&& in_uncharted_territory
) {
1718 if (blhdr_offset
!= jnl
->jhdr
->end
) {
1719 LFHFS_LOG(LEVEL_ERROR
, "jnl: Extra txn replay stopped @ %lld / 0x%llx\n", blhdr_offset
, blhdr_offset
);
1722 check_past_jnl_end
= 0;
1723 jnl
->jhdr
->end
= blhdr_offset
;
1727 LFHFS_LOG(LEVEL_ERROR
, "jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
1728 blhdr_offset
, orig_checksum
, checksum
);
1730 if (blhdr_offset
== orig_jnl_start
) {
1731 // if there's nothing in the journal at all, just bail out altogether.
1735 goto bad_txn_handling
;
1738 if ( (last_sequence_num
!= 0)
1739 && (blhdr
->binfo
[0].u
.bi
.b
.sequence_num
!= 0)
1740 && (blhdr
->binfo
[0].u
.bi
.b
.sequence_num
!= last_sequence_num
)
1741 && (blhdr
->binfo
[0].u
.bi
.b
.sequence_num
!= last_sequence_num
+1)) {
1743 txn_start_offset
= jnl
->jhdr
->end
= blhdr_offset
;
1745 if (check_past_jnl_end
) {
1746 check_past_jnl_end
= 0;
1747 LFHFS_LOG(LEVEL_ERROR
, "jnl: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
1748 blhdr_offset
, blhdr_offset
, blhdr
->binfo
[0].u
.bi
.b
.sequence_num
, last_sequence_num
);
1752 LFHFS_LOG(LEVEL_ERROR
, "jnl: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
1753 blhdr_offset
, blhdr_offset
, blhdr
->binfo
[0].u
.bi
.b
.sequence_num
, last_sequence_num
);
1754 goto bad_txn_handling
;
1756 last_sequence_num
= blhdr
->binfo
[0].u
.bi
.b
.sequence_num
;
1758 if (blhdr_offset
>= jnl
->jhdr
->end
&& jnl
->jhdr
->start
<= jnl
->jhdr
->end
) {
1759 if (last_sequence_num
== 0) {
1760 check_past_jnl_end
= 0;
1761 LFHFS_LOG(LEVEL_ERROR
, "jnl: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n",
1762 jnl
->jhdr
->start
, jnl
->jhdr
->end
);
1763 if (jnl
->jhdr
->start
!= jnl
->jhdr
->end
) {
1764 jnl
->jhdr
->start
= jnl
->jhdr
->end
;
1768 LFHFS_LOG(LEVEL_ERROR
, "jnl: examining extra transactions starting @ %lld / 0x%llx\n", blhdr_offset
, blhdr_offset
);
1771 if ( blhdr
->max_blocks
<= 0 || blhdr
->max_blocks
> (jnl
->jhdr
->size
/jnl
->jhdr
->jhdr_size
)
1772 || blhdr
->num_blocks
<= 0 || blhdr
->num_blocks
> blhdr
->max_blocks
) {
1773 LFHFS_LOG(LEVEL_ERROR
, "jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
1774 blhdr
->max_blocks
, blhdr
->num_blocks
);
1775 goto bad_txn_handling
;
1779 for (i
= 1; i
< blhdr
->num_blocks
; i
++) {
1780 if (blhdr
->binfo
[i
].bnum
< 0 && blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
1781 LFHFS_LOG(LEVEL_ERROR
, "jnl: replay_journal: bogus block number 0x%llx\n", blhdr
->binfo
[i
].bnum
);
1782 goto bad_txn_handling
;
1785 if ((size_t)blhdr
->binfo
[i
].u
.bi
.bsize
> max_bsize
) {
1786 max_bsize
= blhdr
->binfo
[i
].u
.bi
.bsize
;
1790 if (blhdr
->flags
& BLHDR_CHECK_CHECKSUMS
) {
1791 check_block_checksums
= 1;
1792 block_ptr
= hfs_malloc(max_bsize
);
1797 if (blhdr
->flags
& BLHDR_FIRST_HEADER
) {
1798 txn_start_offset
= blhdr_offset
;
1801 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1802 // blhdr->num_blocks-1, jnl->jhdr->start);
1804 for (i
= 1; i
< blhdr
->num_blocks
; i
++) {
1808 size
= blhdr
->binfo
[i
].u
.bi
.bsize
;
1809 number
= blhdr
->binfo
[i
].bnum
;
1811 // don't add "killed" blocks
1812 if (number
== (off_t
)-1) {
1813 //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1816 if (check_block_checksums
) {
1820 block_offset
= offset
;
1822 // read the block so we can check the checksum
1823 ret
= read_journal_data(jnl
, &block_offset
, block_ptr
, size
);
1824 if (ret
!= (size_t)size
) {
1825 LFHFS_LOG(LEVEL_ERROR
, "jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset
);
1826 goto bad_txn_handling
;
1829 disk_cksum
= calc_checksum(block_ptr
, size
);
1831 // there is no need to swap the checksum from disk because
1832 // it got swapped when the blhdr was read in.
1833 if (blhdr
->binfo
[i
].u
.bi
.b
.cksum
!= 0 && disk_cksum
!= blhdr
->binfo
[i
].u
.bi
.b
.cksum
) {
1834 LFHFS_LOG(LEVEL_ERROR
, "jnl: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
1835 txn_start_offset
, blhdr_offset
, i
, number
, size
, disk_cksum
, blhdr
->binfo
[i
].u
.bi
.b
.cksum
);
1836 LFHFS_LOG(LEVEL_ERROR
, "jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
1837 *(int *)&block_ptr
[0*sizeof(int)], *(int *)&block_ptr
[1*sizeof(int)], *(int *)&block_ptr
[2*sizeof(int)], *(int *)&block_ptr
[3*sizeof(int)],
1838 *(int *)&block_ptr
[4*sizeof(int)], *(int *)&block_ptr
[5*sizeof(int)], *(int *)&block_ptr
[6*sizeof(int)], *(int *)&block_ptr
[7*sizeof(int)]);
1840 goto bad_txn_handling
;
1845 // add this bucket to co_buf, coalescing where possible
1846 // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1847 ret_val
= add_block(jnl
, &co_buf
, number
, size
, (size_t) offset
, blhdr
->binfo
[i
].u
.bi
.b
.cksum
, &num_buckets
, &num_full
);
1849 if (ret_val
== -1) {
1850 LFHFS_LOG(LEVEL_ERROR
, "jnl: replay_journal: trouble adding block to co_buf\n");
1852 } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1858 // check if the last block added puts us off the end of the jnl.
1859 // if so, we need to wrap to the beginning and take any remainder
1862 if (offset
>= jnl
->jhdr
->size
) {
1863 offset
= jnl
->jhdr
->jhdr_size
+ (offset
- jnl
->jhdr
->size
);
1868 hfs_free(block_ptr
);
1874 /* Journal replay got error before it found any valid
1875 * transations, abort replay */
1876 if (txn_start_offset
== 0) {
1877 LFHFS_LOG(LEVEL_ERROR
, "jnl: no known good txn start offset! aborting journal replay.\n");
1881 /* Repeated error during journal replay, abort replay */
1882 if (replay_retry_count
== 3) {
1883 LFHFS_LOG(LEVEL_ERROR
, "jnl: repeated errors replaying journal! aborting journal replay.\n");
1886 replay_retry_count
++;
1888 /* There was an error replaying the journal (possibly
1889 * EIO/ENXIO from the device). So retry replaying all
1890 * the good transactions that we found before getting
1893 jnl
->jhdr
->start
= orig_jnl_start
;
1894 jnl
->jhdr
->end
= txn_start_offset
;
1895 check_past_jnl_end
= 0;
1896 last_sequence_num
= 0;
1897 LFHFS_LOG(LEVEL_ERROR
, "jnl: restarting journal replay (%lld - %lld)!\n", jnl
->jhdr
->start
, jnl
->jhdr
->end
);
1898 goto restart_replay
;
1901 jnl
->jhdr
->start
+= blhdr
->bytes_used
;
1902 if (jnl
->jhdr
->start
>= jnl
->jhdr
->size
) {
1903 // wrap around and skip the journal header block
1904 jnl
->jhdr
->start
= (jnl
->jhdr
->start
% jnl
->jhdr
->size
) + jnl
->jhdr
->jhdr_size
;
1907 if (jnl
->jhdr
->start
== jnl
->jhdr
->end
) {
1908 in_uncharted_territory
= 1;
1912 if (jnl
->jhdr
->start
!= jnl
->jhdr
->end
) {
1913 LFHFS_LOG(LEVEL_ERROR
, "jnl: start %lld != end %lld. resetting end.\n", jnl
->jhdr
->start
, jnl
->jhdr
->end
);
1914 jnl
->jhdr
->end
= jnl
->jhdr
->start
;
1917 //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1920 * make sure it's at least one page in size, so
1921 * start max_bsize at PAGE_SIZE
1923 for (i
= 0, max_bsize
= PAGE_SIZE
; i
< num_full
; i
++) {
1925 if (co_buf
[i
].block_num
== (off_t
)-1)
1928 if (co_buf
[i
].block_size
> max_bsize
)
1929 max_bsize
= co_buf
[i
].block_size
;
1932 * round max_bsize up to the nearest PAGE_SIZE multiple
1934 if (max_bsize
& (PAGE_SIZE
- 1)) {
1935 max_bsize
= (max_bsize
+ PAGE_SIZE
) & ~(PAGE_SIZE
- 1);
1938 block_ptr
= hfs_malloc(max_bsize
);
1940 // Replay the coalesced entries in the co-buf
1941 for(i
= 0; i
< num_full
; i
++) {
1942 size_t size
= co_buf
[i
].block_size
;
1943 off_t jnl_offset
= (off_t
) co_buf
[i
].jnl_offset
;
1944 off_t number
= co_buf
[i
].block_num
;
1947 // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1948 // co_buf[i].block_size, co_buf[i].jnl_offset);
1950 if (number
== (off_t
)-1) {
1951 // printf("jnl: replay_journal: skipping killed fs block\n");
1954 // do journal read, and set the phys. block
1955 ret
= read_journal_data(jnl
, &jnl_offset
, block_ptr
, size
);
1957 LFHFS_LOG(LEVEL_ERROR
, "jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl_offset
);
1961 if (update_fs_block(jnl
, block_ptr
, number
, size
) != 0) {
1968 // done replaying; update jnl header
1969 if (write_journal_header(jnl
, 1, jnl
->jhdr
->sequence_num
) != 0) {
1975 hfs_free(block_ptr
);
1979 // free the coalesce buffer
1986 LFHFS_LOG(LEVEL_DEFAULT
, "replay_journal: success.\n");
1990 hfs_free(block_ptr
);
1994 LFHFS_LOG(LEVEL_ERROR
, "replay_journal: error.\n");
1999 // This function get executed after a buffer has been written to its
2000 // final destination.
2001 // This function lets us know when a buffer has been
2002 // flushed to disk. Originally (kext), it was called from deep
2003 // within the driver stack and thus is quite limited in what it could do.
2004 // Notably, it could not initiate any new i/o's or allocate/free memory.
2005 static void buffer_written(transaction
*tr
, GenericLFBuf
*bp
) {
2008 transaction
*ctr
, *prev
=NULL
, *next
;
2010 size_t bufsize
, amt_flushed
, total_bytes
;
2013 // snarf out the bits we want
2014 bufsize
= bp
->uDataSize
;
2016 // then we've already seen it
2021 CHECK_TRANSACTION(tr
);
2027 amt_flushed
= tr
->num_killed
;
2028 total_bytes
= tr
->total_bytes
;
2030 // update the number of blocks that have been flushed.
2031 // this buf may represent more than one block so take
2032 // that into account.
2033 amt_flushed
+= tr
->num_flushed
;
2034 tr
->num_flushed
+= bufsize
;
2036 // if this transaction isn't done yet, just return as
2037 // there is nothing to do.
2039 // NOTE: we are careful to not reference anything through
2040 // the tr pointer after doing the OSAddAtomic(). if
2041 // this if statement fails then we are the last one
2042 // and then it's ok to dereference "tr".
2044 if ((amt_flushed
+ bufsize
) < total_bytes
) {
2048 // this will single thread checking the transaction
2051 if (tr
->total_bytes
== (int)0xfbadc0de) {
2052 // then someone beat us to it...
2053 unlock_oldstart(jnl
);
2057 // mark this so that we're the owner of dealing with the
2058 // cleanup for this transaction
2059 tr
->total_bytes
= 0xfbadc0de;
2061 if (jnl
->flags
& JOURNAL_INVALID
)
2062 goto transaction_done
;
2064 //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
2065 // tr, tr->journal_start, tr->journal_end, jnl);
2067 // find this entry in the old_start[] index and mark it completed
2068 for(i
= 0; i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]); i
++) {
2070 if ((off_t
)(jnl
->old_start
[i
] & ~(0x8000000000000000ULL
)) == tr
->journal_start
) {
2071 jnl
->old_start
[i
] &= ~(0x8000000000000000ULL
);
2076 if (i
>= sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0])) {
2077 panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
2078 tr
->journal_start
, tr
, jnl
);
2082 // if we are here then we need to update the journal header
2083 // to reflect that this transaction is complete
2084 if (tr
->journal_start
== jnl
->active_start
) {
2085 jnl
->active_start
= tr
->journal_end
;
2086 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
2089 // go through the completed_trs list and try to coalesce
2090 // entries, restarting back at the beginning if we have to.
2091 for (ctr
= jnl
->completed_trs
; ctr
; prev
=ctr
, ctr
=next
) {
2092 if (ctr
->journal_start
== jnl
->active_start
) {
2093 jnl
->active_start
= ctr
->journal_end
;
2095 prev
->next
= ctr
->next
;
2097 if (ctr
== jnl
->completed_trs
) {
2098 jnl
->completed_trs
= ctr
->next
;
2101 next
= jnl
->completed_trs
; // this starts us over again
2102 ctr
->next
= jnl
->tr_freeme
;
2103 jnl
->tr_freeme
= ctr
;
2106 } else if (tr
->journal_end
== ctr
->journal_start
) {
2107 ctr
->journal_start
= tr
->journal_start
;
2108 next
= jnl
->completed_trs
; // this starts us over again
2110 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
2112 } else if (tr
->journal_start
== ctr
->journal_end
) {
2113 ctr
->journal_end
= tr
->journal_end
;
2115 tr
->journal_start
= tr
->journal_end
= (off_t
)0;
2116 } else if (ctr
->next
&& ctr
->journal_end
== ctr
->next
->journal_start
) {
2117 // coalesce the next entry with this one and link the next
2118 // entry in at the head of the tr_freeme list
2119 next
= ctr
->next
; // temporarily use the "next" variable
2120 ctr
->journal_end
= next
->journal_end
;
2121 ctr
->next
= next
->next
;
2122 next
->next
= jnl
->tr_freeme
; // link in the next guy at the head of the tr_freeme list
2123 jnl
->tr_freeme
= next
;
2125 next
= jnl
->completed_trs
; // this starts us over again
2133 // if this is true then we didn't merge with anyone
2134 // so link ourselves in at the head of the completed
2135 // transaction list.
2136 if (tr
->journal_start
!= 0) {
2137 // put this entry into the correct sorted place
2138 // in the list instead of just at the head.
2141 for (ctr
= jnl
->completed_trs
; ctr
&& tr
->journal_start
> ctr
->journal_start
; prev
=ctr
, ctr
=ctr
->next
) {
2142 // just keep looping
2145 if (ctr
== NULL
&& prev
== NULL
) {
2146 jnl
->completed_trs
= tr
;
2149 } else if (ctr
== jnl
->completed_trs
) {
2150 tr
->next
= jnl
->completed_trs
;
2151 jnl
->completed_trs
= tr
;
2154 tr
->next
= prev
->next
;
2159 // if we're here this tr got merged with someone else so
2160 // put it on the list to be free'd
2161 tr
->next
= jnl
->tr_freeme
;
2162 jnl
->tr_freeme
= tr
;
2165 unlock_oldstart(jnl
);
2167 unlock_condition(jnl
, &jnl
->asyncIO
);
2170 static size_t write_journal_data(journal
*jnl
, off_t
*offset
, void *data
, size_t len
) {
2171 return do_journal_io(jnl
, offset
, data
, len
, JNL_WRITE
);
2174 static size_t read_journal_data(journal
*jnl
, off_t
*offset
, void *data
, size_t len
) {
2175 return do_journal_io(jnl
, offset
, data
, len
, JNL_READ
);
// This function sets the size of the tbuffer and the
// size of the blhdr.  It assumes that jnl->jhdr->size
// and jnl->jhdr->jhdr_size are already valid.
static void size_up_tbuffer(journal *jnl, uint32_t tbuffer_size, uint32_t phys_blksz)
{
    // one-time initialization based on how much memory
    // there is in the machine.
    if (def_tbuffer_size == 0) {
        uint64_t memsize = 0;
        size_t l = sizeof(memsize);
        // NOTE(review): sysctlbyname's return value is not checked; on
        // failure memsize stays 0 and the smallest default is chosen.
        sysctlbyname("hw.memsize", &memsize, &l, NULL, 0);

        // scale the default transaction buffer with physical memory
        if (memsize < (256*1024*1024)) {
            def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
        } else if (memsize < (512*1024*1024)) {
            def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
        } else if (memsize < (1024*1024*1024)) {
            def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
        } else {
            def_tbuffer_size = (uint32_t)(DEFAULT_TRANSACTION_BUFFER_SIZE * (memsize / (256*1024*1024)));
        }
    }

    // jhdr_size is used as a divisor below; a zero value would be fatal
    if (!(jnl->jhdr->jhdr_size > 0)) {
        panic("jnl->jhdr->jhdr_size is %d", jnl->jhdr->jhdr_size);
    }

    // size up the transaction buffer... can't be larger than the number
    // of blocks that can fit in a block_list_header block.
    if (tbuffer_size == 0) {
        jnl->tbuffer_size = def_tbuffer_size;
    } else {
        // make sure that the specified tbuffer_size isn't too small
        if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
            tbuffer_size = jnl->jhdr->blhdr_size * 2;
        }
        // and make sure it's an even multiple of the block size
        if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
            tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
        }

        jnl->tbuffer_size = tbuffer_size;
    }

    // clamp: never more than half the journal...
    if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
        jnl->tbuffer_size = (uint32_t)(jnl->jhdr->size / 2);
    }

    // ...and never more than the global hard limit
    if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
        jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
    }

    // one block_info entry per journal block that fits in the tbuffer
    jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
    if (jnl->jhdr->blhdr_size < phys_blksz) {
        jnl->jhdr->blhdr_size = phys_blksz;
    } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
        // have to round up so we're an even multiple of the physical block size
        jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
    }
}
2243 static int write_journal_header(journal
*jnl
, int updating_start
, uint32_t sequence_num
) {
2244 static int num_err_prints
= 0;
2246 off_t jhdr_offset
= 0;
2248 // Flush the track cache if we're not doing force-unit-access
2250 if (!updating_start
&& (jnl
->flags
& JOURNAL_DO_FUA_WRITES
) == 0) {
2252 dk_synchronize_t sync_request
= {
2253 .options
= DK_SYNCHRONIZE_OPTION_BARRIER
,
2257 * If device doesn't support barrier-only flush, or
2258 * the journal is on a different device, use full flush.
2260 if (!(jnl
->flags
& JOURNAL_FEATURE_BARRIER
) || (jnl
->jdev
!= jnl
->fsdev
)) {
2261 sync_request
.options
= 0;
2262 jnl
->flush_counter
++;
2265 ret
= ioctl(jnl
->jdev
->psFSRecord
->iFD
, DKIOCSYNCHRONIZE
, (caddr_t
)&sync_request
);
2269 // Only print this error if it's a different error than the
2270 // previous one, or if it's the first time for this device
2271 // or if the total number of printfs is less than 25. We
2272 // allow for up to 25 printfs to insure that some make it
2273 // into the on-disk syslog. Otherwise if we only printed
2274 // one, it's possible it would never make it to the syslog
2275 // for the root volume and that makes debugging hard.
2277 if ( ret
!= jnl
->last_flush_err
2278 || (jnl
->flags
& JOURNAL_FLUSHCACHE_ERR
) == 0
2279 || num_err_prints
++ < 25) {
2281 LFHFS_LOG(LEVEL_ERROR
, "jnl: flushing fs disk buffer returned 0x%x\n", ret
);
2283 jnl
->flags
|= JOURNAL_FLUSHCACHE_ERR
;
2284 jnl
->last_flush_err
= ret
;
2288 jnl
->jhdr
->sequence_num
= sequence_num
;
2289 jnl
->jhdr
->checksum
= 0;
2290 jnl
->jhdr
->checksum
= calc_checksum((char *)jnl
->jhdr
, JOURNAL_HEADER_CKSUM_SIZE
);
2292 if (do_journal_io(jnl
, &jhdr_offset
, jnl
->header_buf
, jnl
->jhdr
->jhdr_size
, JNL_WRITE
|JNL_HEADER
) != (size_t)jnl
->jhdr
->jhdr_size
) {
2293 LFHFS_LOG(LEVEL_ERROR
, "jnl: write_journal_header: error writing the journal header!\n");
2294 jnl
->flags
|= JOURNAL_INVALID
;
2298 // If we're not doing force-unit-access writes, then we
2299 // have to flush after writing the journal header so that
2300 // a future transaction doesn't sneak out to disk before
2301 // the header does and thus overwrite data that the old
2302 // journal header refers to. Saw this exact case happen
2303 // on an IDE bus analyzer with Larry Barras so while it
2304 // may seem obscure, it's not.
2306 if (updating_start
&& (jnl
->flags
& JOURNAL_DO_FUA_WRITES
) == 0) {
2308 dk_synchronize_t sync_request
= {
2309 .options
= DK_SYNCHRONIZE_OPTION_BARRIER
,
2313 * If device doesn't support barrier-only flush, or
2314 * the journal is on a different device, use full flush.
2316 if (!(jnl
->flags
& JOURNAL_FEATURE_BARRIER
) || (jnl
->jdev
!= jnl
->fsdev
)) {
2317 sync_request
.options
= 0;
2318 jnl
->flush_counter
++;
2321 ioctl(jnl
->jdev
->psFSRecord
->iFD
, DKIOCSYNCHRONIZE
, (caddr_t
)&sync_request
);
2326 static int journal_binfo_cmp(const void *a
, const void *b
) {
2328 const block_info
*bi_a
= (const struct block_info
*)a
;
2329 const block_info
*bi_b
= (const struct block_info
*)b
;
2332 if (bi_a
->bnum
== (off_t
)-1) {
2335 if (bi_b
->bnum
== (off_t
)-1) {
2339 // don't have to worry about negative block
2340 // numbers so this is ok to do.
2341 GenericLFBuf
*psGenBufA
, *psGenBufB
;
2342 psGenBufA
= (void*)bi_a
->u
.bp
;
2343 psGenBufB
= (void*)bi_b
->u
.bp
;
2344 res
= psGenBufA
->uBlockN
- psGenBufB
->uBlockN
;
2349 // finish_end_transaction:
2351 static int finish_end_transaction(transaction
*tr
, errno_t (*callback
)(void*), void *callback_arg
) {
2356 journal
*jnl
= tr
->jnl
;
2357 GenericLFBuf
*bp
= NULL
, **bparray
= NULL
;
2358 block_list_header
*blhdr
=NULL
, *next
=NULL
;
2359 size_t tbuffer_offset
;
2360 int bufs_written
= 0;
2363 end
= jnl
->jhdr
->end
;
2365 for (blhdr
= tr
->blhdr
; blhdr
; blhdr
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
)) {
2367 amt
= blhdr
->bytes_used
;
2369 blhdr
->binfo
[0].u
.bi
.b
.sequence_num
= tr
->sequence_num
;
2371 blhdr
->checksum
= 0;
2372 blhdr
->checksum
= calc_checksum((char *)blhdr
, BLHDR_CHECKSUM_SIZE
);
2374 bparray
= hfs_malloc(blhdr
->num_blocks
* sizeof(buf_t
));
2375 tbuffer_offset
= jnl
->jhdr
->blhdr_size
;
2377 // for each block in the block-header,
2378 for (i
= 1; i
< blhdr
->num_blocks
; i
++) {
2382 * finish preparing the shadow buf_t before
2383 * calculating the individual block checksums
2385 if (blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
2388 bp
= (void*)blhdr
->binfo
[i
].u
.bp
;
2389 blkno
= bp
->uPhyCluster
;
2390 // update this so we write out the correct physical block number!
2391 blhdr
->binfo
[i
].bnum
= (off_t
)(blkno
);
2394 bsize
= bp
->uDataSize
;
2395 blhdr
->binfo
[i
].u
.bi
.bsize
= (uint32_t)bsize
;
2396 blhdr
->binfo
[i
].u
.bi
.b
.cksum
= calc_checksum(&((char *)blhdr
)[tbuffer_offset
], (uint32_t)bsize
);
2399 bsize
= blhdr
->binfo
[i
].u
.bi
.bsize
;
2400 blhdr
->binfo
[i
].u
.bi
.b
.cksum
= 0;
2402 tbuffer_offset
+= bsize
;
2406 * if we fired off the journal_write_header asynchronously in
2407 * 'end_transaction', we need to wait for its completion
2408 * before writing the actual journal data
2410 wait_condition(jnl
, &jnl
->writing_header
, "finish_end_transaction");
2412 if (jnl
->write_header_failed
== FALSE
)
2413 ret
= write_journal_data(jnl
, &end
, blhdr
, amt
);
2418 CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_JOURNAL_DATA
, jnl
->fsmount
->psHfsmount
, NULL
);
2422 * put the bp pointers back so that we can
2423 * make the final pass on them
2425 for (i
= 1; i
< blhdr
->num_blocks
; i
++)
2426 blhdr
->binfo
[i
].u
.bp
= (void*)bparray
[i
];
2434 LFHFS_LOG(LEVEL_ERROR
, "jnl: end_transaction: only wrote %zu of %zu bytes to the journal!\n",
2441 jnl
->jhdr
->end
= end
; // update where the journal now ends
2442 tr
->journal_end
= end
; // the transaction ends here too
2444 if (tr
->journal_start
== 0 || tr
->journal_end
== 0) {
2445 panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
2446 tr
->journal_start
, tr
->journal_end
);
2449 if (write_journal_header(jnl
, 0, jnl
->saved_sequence_num
) != 0) {
2455 CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_JOURNAL_HEADER
, jnl
->fsmount
->psHfsmount
, NULL
);
2459 * If the caller supplied a callback, call it now that the blocks have been
2460 * written to the journal. This is used by journal_relocate so, for example,
2461 * the file system can change its pointer to the new journal.
2463 if (callback
!= NULL
&& callback(callback_arg
) != 0) {
2468 // the buffer_flushed_callback will only be called for the
2469 // real blocks that get flushed so we have to account for
2470 // the block_list_headers here.
2472 tr
->num_flushed
= tr
->num_blhdrs
* jnl
->jhdr
->blhdr_size
;
2474 lock_condition(jnl
, &jnl
->asyncIO
, "finish_end_transaction");
2477 // setup for looping through all the blhdr's.
2479 for (blhdr
= tr
->blhdr
; blhdr
; blhdr
= next
) {
2480 uint16_t num_blocks
;
2483 * grab this info ahead of issuing the buf_bawrites...
2484 * once the last one goes out, its possible for blhdr
2485 * to be freed (especially if we get preempted) before
2486 * we do the last check of num_blocks or
2487 * grab the next blhdr pointer...
2489 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
2490 num_blocks
= blhdr
->num_blocks
;
2493 * we can re-order the buf ptrs because everything is written out already
2495 qsort(&blhdr
->binfo
[1], num_blocks
-1, sizeof(block_info
), journal_binfo_cmp
);
2498 * need to make sure that the loop issuing the buf_bawrite's
2499 * does not touch blhdr once the last buf_bawrite has been
2500 * issued... at that point, we no longer have a legitmate
2501 * reference on the associated storage since it will be
2502 * released upon the completion of that last buf_bawrite
2504 for (i
= num_blocks
-1; i
>= 1; i
--) {
2505 if (blhdr
->binfo
[i
].bnum
!= (off_t
)-1)
2509 for (i
= 1; i
< num_blocks
; i
++) {
2511 if ((bp
= (void*)blhdr
->binfo
[i
].u
.bp
)) {
2513 errno_t ret_val
= 0;
2516 printf("journal write physical: bp %p, psVnode %p, uBlockN %llu, uPhyCluster %llu uLockCnt %u\n",
2517 bp
, bp
->psVnode
, bp
->uBlockN
, bp
->uPhyCluster
, bp
->uLockCnt
);
2520 lf_hfs_generic_buf_clear_cache_flag(bp
, GEN_BUF_WRITE_LOCK
);
2521 ret_val
= lf_hfs_generic_buf_write(bp
);
2524 CRASH_ABORT(CRASH_ABORT_JOURNAL_IN_BLOCK_DATA
, jnl
->fsmount
->psHfsmount
, NULL
);
2528 LFHFS_LOG(LEVEL_ERROR
, "jnl: raw_readwrite_write_mount inside finish_end_transaction returned %d.\n", ret_val
);
2531 buffer_written(tr
, bp
);
2533 lf_hfs_generic_buf_unlock(bp
);
2534 lf_hfs_generic_buf_release(bp
);
2541 CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_BLOCK_DATA
, jnl
->fsmount
->psHfsmount
, NULL
);
2543 if (bufs_written
== 0) {
2545 * since we didn't issue any buf_bawrite's, there is no
2546 * async trigger to cause the memory associated with this
2547 * transaction to be freed... so, move it to the garbage
2552 tr
->next
= jnl
->tr_freeme
;
2553 jnl
->tr_freeme
= tr
;
2555 unlock_oldstart(jnl
);
2557 unlock_condition(jnl
, &jnl
->asyncIO
);
2560 //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
2561 // tr, tr->journal_start, tr->journal_end);
2564 if (ret_val
== -1) {
2565 abort_transaction(jnl
, tr
); // cleans up list of extents to be trimmed
2568 * 'flush_aborted' is protected by the flushing condition... we need to
2569 * set it before dropping the condition so that it will be
2570 * noticed in 'end_transaction'... we add this additional
2571 * aborted condition so that we can drop the 'flushing' condition
2572 * before grabbing the journal lock... this avoids a deadlock
2573 * in 'end_transaction' which is holding the journal lock while
2574 * waiting for the 'flushing' condition to clear...
2575 * everyone else will notice the JOURNAL_INVALID flag
2577 jnl
->flush_aborted
= TRUE
;
2579 unlock_condition(jnl
, &jnl
->flushing
);
2582 jnl
->flags
|= JOURNAL_INVALID
;
2583 jnl
->old_start
[sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]) - 1] &= ~0x8000000000000000LL
;
2585 journal_unlock(jnl
);
2587 unlock_condition(jnl
, &jnl
->flushing
);
2591 static off_t
free_space(journal
*jnl
) {
2592 off_t free_space_offset
;
2594 if (jnl
->jhdr
->start
< jnl
->jhdr
->end
) {
2595 free_space_offset
= jnl
->jhdr
->size
- (jnl
->jhdr
->end
- jnl
->jhdr
->start
) - jnl
->jhdr
->jhdr_size
;
2596 } else if (jnl
->jhdr
->start
> jnl
->jhdr
->end
) {
2597 free_space_offset
= jnl
->jhdr
->start
- jnl
->jhdr
->end
;
2599 // journal is completely empty
2600 free_space_offset
= jnl
->jhdr
->size
- jnl
->jhdr
->jhdr_size
;
2603 return free_space_offset
;
2606 static void dump_journal(journal
*jnl
) {
2609 printf(" jdev_offset %.8llx\n", jnl
->jdev_offset
);
2610 printf(" magic: 0x%.8x\n", jnl
->jhdr
->magic
);
2611 printf(" start: 0x%.8llx\n", jnl
->jhdr
->start
);
2612 printf(" end: 0x%.8llx\n", jnl
->jhdr
->end
);
2613 printf(" size: 0x%.8llx\n", jnl
->jhdr
->size
);
2614 printf(" blhdr size: %d\n", jnl
->jhdr
->blhdr_size
);
2615 printf(" jhdr size: %d\n", jnl
->jhdr
->jhdr_size
);
2616 printf(" chksum: 0x%.8x\n", jnl
->jhdr
->checksum
);
2618 printf(" completed transactions:\n");
2619 for (ctr
= jnl
->completed_trs
; ctr
; ctr
= ctr
->next
) {
2620 printf(" 0x%.8llx - 0x%.8llx\n", ctr
->journal_start
, ctr
->journal_end
);
2624 // The journal must be locked on entry to this function.
2625 // The "desired_size" is in bytes.
2626 static int check_free_space( journal
*jnl
,
2628 boolean_t
*delayed_header_write
,
2629 uint32_t sequence_num
) {
2634 //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
2635 // desired_size, free_space(jnl));
2637 if (delayed_header_write
)
2638 *delayed_header_write
= FALSE
;
2641 int old_start_empty
;
2643 // make sure there's space in the journal to hold this transaction
2644 if (free_space(jnl
) > desired_size
&& jnl
->old_start
[0] == 0) {
2647 if (counter
++ == 5000) {
2649 panic("jnl: check_free_space: buffer flushing isn't working "
2650 "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl
,
2651 jnl
->jhdr
->start
, jnl
->jhdr
->end
, free_space(jnl
), jnl
->active_start
);
2653 if (counter
> 7500) {
2657 // here's where we lazily bump up jnl->jhdr->start. we'll consume
2658 // entries until there is enough space for the next transaction.
2659 old_start_empty
= 1;
2662 for (i
= 0; i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]); i
++) {
2666 while (jnl
->old_start
[i
] & 0x8000000000000000LL
) {
2667 if (lcl_counter
++ > 10000) {
2668 panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
2669 jnl
->old_start
[i
], jnl
);
2672 unlock_oldstart(jnl
);
2674 jnl
->flush(jnl
->flush_arg
);
2680 if (jnl
->old_start
[i
] == 0) {
2684 old_start_empty
= 0;
2685 jnl
->jhdr
->start
= jnl
->old_start
[i
];
2686 jnl
->old_start
[i
] = 0;
2688 if (free_space(jnl
) > desired_size
) {
2690 if (delayed_header_write
)
2691 *delayed_header_write
= TRUE
;
2693 unlock_oldstart(jnl
);
2694 write_journal_header(jnl
, 1, sequence_num
);
2700 unlock_oldstart(jnl
);
2702 // if we bumped the start, loop and try again
2703 if (i
< sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0])) {
2705 } else if (old_start_empty
) {
2707 // if there is nothing in old_start anymore then we can
2708 // bump the jhdr->start to be the same as active_start
2709 // since it is possible there was only one very large
2710 // transaction in the old_start array. if we didn't do
2711 // this then jhdr->start would never get updated and we
2712 // would wind up looping until we hit the panic at the
2713 // start of the loop.
2715 jnl
->jhdr
->start
= jnl
->active_start
;
2717 if (delayed_header_write
)
2718 *delayed_header_write
= TRUE
;
2720 write_journal_header(jnl
, 1, sequence_num
);
2725 // if the file system gave us a flush function, call it to so that
2726 // it can flush some blocks which hopefully will cause some transactions
2727 // to complete and thus free up space in the journal.
2729 jnl
->flush(jnl
->flush_arg
);
2732 // wait for a while to avoid being cpu-bound (this will
2733 // put us to sleep for 10 milliseconds)
2740 static void lock_condition(journal
*jnl
, ConditionalFlag_S
*psCondFlag
, __unused
const char *condition_name
) {
2744 while (psCondFlag
->uFlag
) {
2745 pthread_cond_wait(&psCondFlag
->sCond
, &jnl
->flock
);
2748 psCondFlag
->uFlag
= TRUE
;
2752 static void wait_condition(journal
*jnl
, ConditionalFlag_S
*psCondFlag
, __unused
const char *condition_name
) {
2754 if (!psCondFlag
->uFlag
)
2759 while (psCondFlag
->uFlag
) {
2760 pthread_cond_wait(&psCondFlag
->sCond
, &jnl
->flock
);
2766 static void unlock_condition(journal
*jnl
, ConditionalFlag_S
*psCondFlag
) {
2769 psCondFlag
->uFlag
= FALSE
;
2770 pthread_cond_broadcast(&psCondFlag
->sCond
);
2776 * End a transaction:
2777 * 1) Determine if it is time to commit the transaction or not:
2778 * If the transaction is small enough, and we're not forcing
2779 * a write to disk, the "active" transaction becomes the "current" transaction,
2780 * and will be reused for the next transaction that is started (group commit).
2783 * If the transaction gets written to disk (because force_it is true, or no
2784 * group commit, or the transaction is sufficiently full), the blocks get
2785 * written into the journal first, then they are written to their final location
2786 * asynchronously. When those async writes complete, the transaction can be freed
2787 * and removed from the journal.
2790 * An optional callback can be supplied. If given, it is called after the
2791 * the blocks have been written to the journal, but before the async writes
2792 * of those blocks to their normal on-disk locations. This is used by
2793 * journal_relocate so that the location of the journal can be changed and
2794 * flushed to disk before the blocks get written to their normal locations.
2795 * Note that the callback is only called if the transaction gets written to
2796 * the journal during this end_transaction call; you probably want to set the
2799 * 4) Free blocks' Generic Buff.
2802 * tr Transaction to add to the journal
2803 * force_it If true, force this transaction to the on-disk journal immediately.
2804 * callback See description above. Pass NULL for no callback.
2805 * callback_arg Argument passed to callback routine.
2809 * -1 An error occurred. The journal is marked invalid.
2811 static int end_transaction(transaction
*tr
, int force_it
, errno_t (*callback
)(void*), void *callback_arg
, boolean_t drop_lock
) {
2813 block_list_header
*blhdr
=NULL
, *next
=NULL
;
2815 journal
*jnl
= tr
->jnl
;
2817 size_t tbuffer_offset
;
2820 panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
2821 jnl
, jnl
->cur_tr
, tr
);
2824 // if there weren't any modified blocks in the transaction
2825 // just save off the transaction pointer and return.
2826 if (tr
->total_bytes
== (int)jnl
->jhdr
->blhdr_size
) {
2831 // if our transaction buffer isn't very full, just hang
2832 // on to it and don't actually flush anything. this is
2833 // what is known as "group commit". we will flush the
2834 // transaction buffer if it's full or if we have more than
2835 // one of them so we don't start hogging too much memory.
2837 // We also check the device supports UNMAP/TRIM, and if so,
2838 // the number of extents waiting to be trimmed. If it is
2839 // small enough, then keep accumulating more (so we can
2840 // reduce the overhead of trimming). If there was a prior
2841 // trim error, then we stop issuing trims for this
2842 // volume, so we can also coalesce transactions.
2845 && (jnl
->flags
& JOURNAL_NO_GROUP_COMMIT
) == 0
2846 && tr
->num_blhdrs
< 3
2847 && (tr
->total_bytes
<= ((tr
->tbuffer_size
*tr
->num_blhdrs
) - tr
->tbuffer_size
/8))
2848 && (!(jnl
->flags
& JOURNAL_USE_UNMAP
) || (tr
->trim
.extent_count
< jnl_trim_flush_limit
))) {
2854 lock_condition(jnl
, &jnl
->flushing
, "end_transaction");
2857 * if the previous 'finish_end_transaction' was being run
2858 * asynchronously, it could have encountered a condition
2859 * that caused it to mark the journal invalid... if that
2860 * occurred while we were waiting for it to finish, we
2861 * need to notice and abort the current transaction
2863 if ((jnl
->flags
& JOURNAL_INVALID
) || jnl
->flush_aborted
== TRUE
) {
2864 unlock_condition(jnl
, &jnl
->flushing
);
2866 abort_transaction(jnl
, tr
);
2872 * Store a pointer to this transaction's trim list so that
2873 * future transactions can find it.
2875 * Note: if there are no extents in the trim list, then don't
2876 * bother saving the pointer since nothing can add new extents
2877 * to the list (and other threads/transactions only care if
2878 * there is a trim pending).
2880 lf_lck_rw_lock_exclusive(&jnl
->trim_lock
);
2881 if (jnl
->async_trim
!= NULL
)
2882 panic("jnl: end_transaction: async_trim already non-NULL!");
2883 if (tr
->trim
.extent_count
> 0)
2884 jnl
->async_trim
= &tr
->trim
;
2885 lf_lck_rw_unlock_exclusive(&jnl
->trim_lock
);
2888 * snapshot the transaction sequence number while we are still behind
2889 * the journal lock since it will be bumped upon the start of the
2890 * next transaction group which may overlap the current journal flush...
2891 * we pass the snapshot into write_journal_header during the journal
2892 * flush so that it can write the correct version in the header...
2893 * because we hold the 'flushing' condition variable for the duration
2894 * of the journal flush, 'saved_sequence_num' remains stable
2896 jnl
->saved_sequence_num
= jnl
->sequence_num
;
2899 * if we're here we're going to flush the transaction buffer to disk.
2900 * 'check_free_space' will not return untl there is enough free
2901 * space for this transaction in the journal and jnl->old_start[0]
2902 * is avaiable for use
2904 check_free_space(jnl
, tr
->total_bytes
, &tr
->delayed_header_write
, jnl
->saved_sequence_num
);
2906 // range check the end index
2907 if (jnl
->jhdr
->end
<= 0 || jnl
->jhdr
->end
> jnl
->jhdr
->size
) {
2908 panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
2909 jnl
->jhdr
->end
, jnl
->jhdr
->size
);
2912 // this transaction starts where the current journal ends
2913 tr
->journal_start
= jnl
->jhdr
->end
;
2917 * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy.
2918 * slide everyone else down and put our latest guy in the last
2919 * entry in the old_start array
2921 memcpy(__CAST_AWAY_QUALIFIER(&jnl
->old_start
[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl
->old_start
[1], volatile, void *), sizeof(jnl
->old_start
)-sizeof(jnl
->old_start
[0]));
2922 jnl
->old_start
[sizeof(jnl
->old_start
)/sizeof(jnl
->old_start
[0]) - 1] = tr
->journal_start
| 0x8000000000000000LL
;
2924 unlock_oldstart(jnl
);
2926 // go over the blocks in the transaction.
2927 // for each block, call the fpCallback and copy the content into the journal buffer
2928 for (blhdr
= tr
->blhdr
; blhdr
; blhdr
= next
) {
2932 tbuffer_offset
= jnl
->jhdr
->blhdr_size
;
2934 for (i
= 1; i
< blhdr
->num_blocks
; i
++) {
2936 if (blhdr
->binfo
[i
].bnum
!= (off_t
)-1) {
2938 bp
= (GenericLFBuf
*)blhdr
->binfo
[i
].u
.bp
;
2941 panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
2942 blhdr
->binfo
[i
].bnum
, jnl
, tr
);
2945 bsize
= bp
->uDataSize
;
2947 blkptr
= (char *)&((char *)blhdr
)[tbuffer_offset
];
2951 iRet
= lf_hfs_generic_buf_take_ownership(bp
, NULL
);
2952 if (iRet
== EAGAIN
) {
2955 LFHFS_LOG(LEVEL_ERROR
, "jnl: end_transaction: lf_hfs_generic_buf_take_ownership returned %d.\n", iRet
);
2960 if (!(bp
->uCacheFlags
& GEN_BUF_WRITE_LOCK
)) {
2961 panic("GEN_BUF_WRITE_LOCK should be set!");
2964 // Call the buffer callback
2966 bp
->pfFunc(bp
, bp
->pvCallbackArgs
);
2970 if (bp
->uCacheFlags
& GEN_BUF_LITTLE_ENDIAN
) {
2971 panic("We do not want to write a GEN_BUF_LITTLE_ENDIAN buffer to media!");
2974 // copy the data into the transaction buffer...
2975 memcpy(blkptr
, bp
->pvData
, bsize
);
2977 blhdr
->binfo
[i
].u
.bp
= (void*)bp
;
2980 // bnum == -1, only true if a block was "killed"
2981 bsize
= blhdr
->binfo
[i
].u
.bi
.bsize
;
2983 tbuffer_offset
+= bsize
;
2985 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
2989 CRASH_ABORT(CRASH_ABORT_JOURNAL_BEFORE_FINISH
, jnl
->fsmount
->psHfsmount
, NULL
);
2992 ret_val
= finish_end_transaction(tr
, callback
, callback_arg
);
2995 if (drop_lock
== TRUE
) {
2996 journal_unlock(jnl
);
3001 static void abort_transaction(journal
*jnl
, transaction
*tr
) {
3003 block_list_header
*blhdr
, *next
;
3004 // for each block list header, iterate over the blocks then
3005 // free up the memory associated with the block list.
3006 for (blhdr
= tr
->blhdr
; blhdr
; blhdr
= next
) {
3009 for (i
= 1; i
< blhdr
->num_blocks
; i
++) {
3012 if (blhdr
->binfo
[i
].bnum
== (off_t
)-1)
3015 bp
= (void*)blhdr
->binfo
[i
].u
.bp
;
3017 // Release the buffers
3018 lf_hfs_generic_buf_clear_cache_flag(bp
, GEN_BUF_WRITE_LOCK
);
3019 if (lf_hfs_generic_buf_validate_owner(bp
)) { // abort_transaction can be called before or after we take ownership
3020 lf_hfs_generic_buf_release(bp
);
3024 next
= (block_list_header
*)((long)blhdr
->binfo
[0].bnum
);
3026 // we can free blhdr here since we won't need it any more
3027 blhdr
->binfo
[0].bnum
= 0xdeadc0de;
3032 * If the transaction we're aborting was the async transaction, then
3033 * tell the current transaction that there is no pending trim
3036 lf_lck_rw_lock_exclusive(&jnl
->trim_lock
);
3037 if (jnl
->async_trim
== &tr
->trim
)
3038 jnl
->async_trim
= NULL
;
3039 lf_lck_rw_unlock_exclusive(&jnl
->trim_lock
);
3042 if (tr
->trim
.extents
) {
3043 hfs_free(tr
->trim
.extents
);
3045 tr
->trim
.allocated_count
= 0;
3046 tr
->trim
.extent_count
= 0;
3047 tr
->trim
.extents
= NULL
;
3050 tr
->total_bytes
= 0xdbadc0de;
3054 static void swap_journal_header(journal
*jnl
) {
3055 jnl
->jhdr
->magic
= SWAP32(jnl
->jhdr
->magic
);
3056 jnl
->jhdr
->endian
= SWAP32(jnl
->jhdr
->endian
);
3057 jnl
->jhdr
->start
= SWAP64(jnl
->jhdr
->start
);
3058 jnl
->jhdr
->end
= SWAP64(jnl
->jhdr
->end
);
3059 jnl
->jhdr
->size
= SWAP64(jnl
->jhdr
->size
);
3060 jnl
->jhdr
->blhdr_size
= SWAP32(jnl
->jhdr
->blhdr_size
);
3061 jnl
->jhdr
->checksum
= SWAP32(jnl
->jhdr
->checksum
);
3062 jnl
->jhdr
->jhdr_size
= SWAP32(jnl
->jhdr
->jhdr_size
);
3063 jnl
->jhdr
->sequence_num
= SWAP32(jnl
->jhdr
->sequence_num
);
// this isn't a great checksum routine but it will do for now.
// we use it to checksum the journal header and the block list
// headers that are at the start of each transaction.
//
// Returns the one's complement of the rolling sum; the complement is
// part of the on-disk HFS journal format (see xnu vfs_journal.c).
// NOTE(review): the final `return (~cksum);` was lost in the garbled
// text and is reconstructed from the reference implementation.
static unsigned int calc_checksum(const char *ptr, int len)
{
    int i;
    unsigned int cksum = 0;

    // this is a lame checksum but for now it'll do
    for (i = 0; i < len; i++, ptr++) {
        cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
    }

    return (~cksum);
}
// Perform journal-device I/O of `len` bytes starting at journal-relative
// `*offset`, splitting the request at the circular-buffer wrap point and at
// the device's maximum transfer size.  `direction` is a mask of
// JNL_READ / JNL_WRITE, plus JNL_HEADER when the journal header (offset 0)
// is the intended target.  On return *offset has been advanced (wrapping
// past the header as needed); returns the number of bytes transferred.
static size_t do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
{
    off_t  curlen = len;          // size of the current contiguous sub-transfer
    size_t io_sz  = 0;            // total bytes transferred so far
    off_t  max_iosize;
    off_t accumulated_offset = 0; // NOTE(review): not obviously consumed in this view — confirm against throttling code
    ExtendedVCB *vcb = HFSTOVCB(jnl->fsmount->psHfsmount);

    // A request outside the journal is a caller bug, not an I/O error.
    if (*offset < 0 || *offset > jnl->jhdr->size) {
        panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
    }

    // Clamp each sub-transfer to the device limit for this direction.
    if (direction & JNL_WRITE)
        max_iosize = jnl->max_write_size;
    else if (direction & JNL_READ)
        max_iosize = jnl->max_read_size;
    else
        max_iosize = 128 * 1024;

again:
    // Determine the Current R/W Length, taking cyclic wrap around into account
    if (*offset + curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
        if (*offset == jnl->jhdr->size) {
            // sitting exactly on the end: wrap to just past the header
            *offset = jnl->jhdr->jhdr_size;
        } else {
            // only transfer up to the end of the circular buffer this pass
            curlen = jnl->jhdr->size - *offset;
        }
    }

    if (curlen > max_iosize) {
        curlen = max_iosize;
    }

    if (curlen <= 0) {
        panic("jnl: do_jnl_io: curlen == %lld, offset 0x%llx len %zd\n", curlen, *offset, len);
    }

    // Offset 0 is the journal header; callers must ask for it explicitly.
    if (*offset == 0 && (direction & JNL_HEADER) == 0) {
        panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %lld, data %p)\n", curlen, data);
    }

    // Map the journal-relative offset to a physical device block and issue the raw I/O.
    uint64_t phyblksize = jnl->fsmount->psHfsmount->hfs_physical_block_size;
    uint64_t uBlkNum = jnl->jdev_blknum+(*offset)/phyblksize;

    if (direction & JNL_READ) {
        raw_readwrite_read_mount(jnl->jdev, uBlkNum, phyblksize, data, curlen, NULL, NULL);
    } else if (direction & JNL_WRITE) {
        raw_readwrite_write_mount(jnl->jdev, uBlkNum, phyblksize, data, curlen, NULL, NULL);
    }

    // Move to the next section
    *offset += curlen;
    io_sz   += curlen;

    if (io_sz == len) {
        // everything requested has been transferred
        return io_sz;
    }

    // handle wrap-around
    data   = (char *)data + curlen;
    curlen = len - io_sz;
    if (*offset >= jnl->jhdr->size) {
        *offset = jnl->jhdr->jhdr_size;
    }
    goto again;
}
3156 static size_t read_journal_header(journal
*jnl
, void *data
, size_t len
) {
3157 off_t hdr_offset
= 0;
3159 return do_journal_io(jnl
, &hdr_offset
, data
, len
, JNL_READ
|JNL_HEADER
);
// Probe the journal device (via disk ioctls) for its maximum read and write
// transfer sizes and record them in jnl->max_read_size / jnl->max_write_size.
// Each direction consults the byte-count, block-count, and segment-count
// limits and keeps the strictest one; if nothing is reported a conservative
// 128 KiB default is used, and results are clamped to UINT32_MAX.
static void get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl)
{
    off_t readblockcnt;
    off_t writeblockcnt;
    off_t readmaxcnt=0, tmp_readmaxcnt;
    off_t writemaxcnt=0, tmp_writemaxcnt;
    off_t readsegcnt, writesegcnt;

    // First check the max read size via several different mechanisms...
    // (result ignored deliberately: readmaxcnt stays 0 on failure)
    ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt);

    if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt) == 0) {
        tmp_readmaxcnt = readblockcnt * phys_blksz;
        // take the stricter of the byte-count and block-count limits
        if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) {
            readmaxcnt = tmp_readmaxcnt;
        }
    }

    if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt)) {
        readsegcnt = 0;   // ioctl failed: no segment-count limit known
    }

    if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) {
        readmaxcnt = readsegcnt * PAGE_SIZE;
    }

    if (readmaxcnt == 0) {
        readmaxcnt = 128 * 1024;   // nothing reported: conservative default
    } else if (readmaxcnt > UINT32_MAX) {
        readmaxcnt = UINT32_MAX;
    }

    // Now check the max writes size via several different mechanisms...
    ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt);

    if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt) == 0) {
        tmp_writemaxcnt = writeblockcnt * phys_blksz;
        if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) {
            writemaxcnt = tmp_writemaxcnt;
        }
    }

    if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt)) {
        writesegcnt = 0;   // ioctl failed: no segment-count limit known
    }

    if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) {
        writemaxcnt = writesegcnt * PAGE_SIZE;
    }

    if (writemaxcnt == 0) {
        writemaxcnt = 128 * 1024;
    } else if (writemaxcnt > UINT32_MAX) {
        writemaxcnt = UINT32_MAX;
    }

    jnl->max_read_size  = readmaxcnt;
    jnl->max_write_size = writemaxcnt;
}
// this is a work function used to free up transactions that
// completed. they can't be free'd from buffer_flushed_callback
// because it is called from deep with the disk driver stack
// and thus can't do something that would potentially cause
// paging. it gets called by each of the journal api entry
// points so stuff shouldn't hang around for too long.
static void free_old_stuff(journal *jnl)
{
    transaction       *tr, *next;
    block_list_header *blhdr=NULL, *next_blhdr=NULL;

    // Fast path: nothing queued.  Unlocked peek — the list is re-read
    // under the oldstart lock below before being consumed.
    if (jnl->tr_freeme == NULL)
        return;

    // Detach the whole free list atomically so it can be walked unlocked.
    lock_oldstart(jnl);
    tr = jnl->tr_freeme;
    jnl->tr_freeme = NULL;
    unlock_oldstart(jnl);

    for(; tr; tr=next) {
        for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) {
            // binfo[0].bnum doubles as the link to the next block_list_header
            next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum);
            // poison the link so stale pointers are recognizable in a debugger
            blhdr->binfo[0].bnum = 0xdeadc0de;

            hfs_free(blhdr);

            KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0);
        }
        next = tr->next;
        hfs_free(tr);
    }
}
// Allocate a new active transaction.
// The function does the following:
// 1) mallocs memory for a transaction structure and a buffer
// 2) initializes the transaction structure and the buffer (invalid CRC + 0x5a)
static errno_t journal_allocate_transaction(journal *jnl)
{
    transaction *tr;

    // zero-filled allocation: all links/counters start out NULL/0
    tr = hfs_mallocz(sizeof(transaction));
    tr->tbuffer_size = jnl->tbuffer_size;

    tr->tbuffer = hfs_malloc(tr->tbuffer_size);

    // journal replay code checksum check depends on this.
    memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
    // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
    memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);

    // the first block-list header lives at the start of the transaction buffer
    tr->blhdr = (block_list_header *)tr->tbuffer;
    tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
    tr->blhdr->num_blocks = 1;      // accounts for this header block
    tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
    tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER;

    // each transaction gets the next journal sequence number
    tr->sequence_num = ++jnl->sequence_num;
    tr->num_blhdrs = 1;
    tr->total_bytes = jnl->jhdr->blhdr_size;
    tr->jnl = jnl;

    // publish the new transaction as the journal's active one
    jnl->active_tr = tr;

    return 0;
}
// Remove a modified block from the current transaction without journaling
// it (e.g. the block became irrelevant before commit).  The buffer's write
// lock is cleared and the buffer released on every path so it is never left
// locked forever.  Returns 0.
int journal_kill_block(journal *jnl, GenericLFBuf *psGenBuf) {
    int                i;
    uint64_t           uflags;
    block_list_header *blhdr;
    transaction       *tr;

    printf("journal_kill_block: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n",
           psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize,psGenBuf->uPhyCluster, psGenBuf->uLockCnt);

    // reclaim transactions that finished flushing since the last journal call
    free_old_stuff(jnl);

    if (jnl->flags & JOURNAL_INVALID) {
        // journal already dead: just drop the buffer and report success
        lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
        lf_hfs_generic_buf_release(psGenBuf);
        return 0;
    }

    tr = jnl->active_tr;
    CHECK_TRANSACTION(tr);

    // must be called from the thread that owns the open transaction
    if (jnl->owner != pthread_self()) {
        panic("jnl: journal_kill_block: called w/out a transaction! jnl %p, owner %p, curact %p\n",
              jnl, jnl->owner, pthread_self());
    }

    uflags = psGenBuf->uCacheFlags;

    if ( !(uflags & GEN_BUF_WRITE_LOCK))
        panic("jnl: journal_kill_block: called with bp not B_LOCKED");

    /*
     * bp must be BL_BUSY and B_LOCKED
     * first check if it's already part of this transaction
     */
    // walk the chain of block-list headers (linked through binfo[0].bnum)
    for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
        // slot 0 is the header itself, so real entries start at 1
        for (i = 1; i < blhdr->num_blocks; i++) {
            if (psGenBuf == (void*)blhdr->binfo[i].u.bp) {
                // if the block has the DELWRI and FILTER bits sets, then
                // things are seriously weird. if it was part of another
                // transaction then journal_modify_block_start() should
                // have force it to be written.
                //
                //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
                //    panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
                //}
                tr->num_killed += psGenBuf->uDataSize;

                // mark the slot dead: bnum == -1 is the sentinel checked
                // by the abort/commit paths when skipping entries
                blhdr->binfo[i].bnum = (off_t)-1;
                blhdr->binfo[i].u.bp = NULL;
                blhdr->binfo[i].u.bi.bsize = psGenBuf->uDataSize;

                lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
                lf_hfs_generic_buf_release(psGenBuf);

                return 0;
            }
        }
    }

    /*
     * We did not find the block in any transaction buffer but we still
     * need to release it or else it will be left locked forever.
     */
    lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
    lf_hfs_generic_buf_release(psGenBuf);

    return 0;
}
// Check whether the on-disk journal is clean, i.e. fully replayed/empty
// (header start == end).  Validates device geometry and the journal header
// (magic + checksum, handling opposite-endian journals) along the way.
// Returns 0 when clean, EBUSY when the journal still contains transactions,
// and EINVAL for bad geometry or an unreadable/corrupt header.
int journal_is_clean(struct vnode *jvp,
                     off_t         offset,
                     off_t         journal_size,
                     struct vnode *fsvp,   // NOTE(review): appears unused in this path — confirm
                     size_t        min_fs_block_size,
                     struct mount *fsmount)
{
    journal  jnl;          // throw-away journal object, only used to drive header I/O
    uint32_t phys_blksz;
    int      ret;
    int      orig_checksum, checksum;

    /* Get the real physical block size. */
    if (ioctl(jvp->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: failed to get device block size.\n");
        ret = EINVAL;
        goto cleanup_jdev_name;
    }

    if (phys_blksz > (uint32_t)min_fs_block_size) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
                  phys_blksz, min_fs_block_size);
        ret = EINVAL;
        goto cleanup_jdev_name;
    }

    if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal size %lld looks bogus.\n", journal_size);
        ret = EINVAL;
        goto cleanup_jdev_name;
    }

    if ((journal_size % phys_blksz) != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
                  journal_size, phys_blksz);
        ret = EINVAL;
        goto cleanup_jdev_name;
    }

    memset(&jnl, 0, sizeof(jnl));

    jnl.header_buf = hfs_malloc(phys_blksz);
    jnl.header_buf_size = phys_blksz;

    // Keep a point to the mount around for use in IO throttling.
    jnl.fsmount = fsmount;

    get_io_info(jvp, phys_blksz, &jnl);

    jnl.jhdr = (journal_header *)jnl.header_buf;
    memset(jnl.jhdr, 0, sizeof(journal_header));

    jnl.jdev = jvp;
    jnl.jdev_offset = offset;
    jnl.jdev_blknum = (uint32_t)(offset / phys_blksz);

    // we have to set this up here so that do_journal_io() will work
    jnl.jhdr->jhdr_size = phys_blksz;

    if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: could not read %d bytes for the journal header.\n",
                  phys_blksz);
        ret = EINVAL;
        goto get_out;
    }

    // checksum was computed with the checksum field zeroed, so zero it
    // before recomputing; keep the original value for comparison
    orig_checksum = jnl.jhdr->checksum;
    jnl.jhdr->checksum = 0;

    if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
        // do this before the swap since it's done byte-at-a-time
        orig_checksum = SWAP32(orig_checksum);
        checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
        swap_journal_header(&jnl);
        jnl.flags |= JOURNAL_NEED_SWAP;
    } else {
        checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
    }

    if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal magic is bad (0x%x != 0x%x)\n",
                  jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
        ret = EINVAL;
        goto get_out;
    }

    if (orig_checksum != checksum) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal checksum is bad (0x%x != 0x%x)\n", orig_checksum, checksum);
        ret = EINVAL;
        goto get_out;
    }

    //
    // if the start and end are equal then the journal is clean.
    // otherwise it's not clean and therefore an error.
    //
    if (jnl.jhdr->start == jnl.jhdr->end) {
        ret = 0;
    } else {
        ret = EBUSY;    // so the caller can differentiate an invalid journal from a "busy" one
    }

get_out:
    hfs_free(jnl.header_buf);
cleanup_jdev_name:
    return ret;
}
3471 uint32_t journal_current_txn(journal
*jnl
) {
3472 return jnl
->sequence_num
+ (jnl
->active_tr
|| jnl
->cur_tr
? 0 : 1);