]> git.saurik.com Git - apple/hfs.git/blob - livefiles_hfs_plugin/lf_hfs_journal.c
hfs-522.0.9.tar.gz
[apple/hfs.git] / livefiles_hfs_plugin / lf_hfs_journal.c
1 /*
2 * Copyright (c) 2002-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 //
29 // This file implements a simple write-ahead journaling layer.
30 // In theory any file system can make use of it by calling these
31 // functions when the fs wants to modify meta-data blocks. See
32 // hfs_journal.h for a more detailed description of the api and
33 // data structures.
34 //
35 // Dominic Giampaolo (dbg@apple.com)
36 // Port to Live-Files: Oded Shoshani (oshoshani@apple.com)
37 //
38
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <limits.h>
43 #include <errno.h>
44 #include <fcntl.h>
45 #include <unistd.h>
46 #include <stdarg.h>
47 #include <assert.h>
48 #include <sys/sysctl.h>
49 #include <sys/types.h>
50 #include <mach/mach.h>
51 #include <sys/disk.h>
52 #include <sys/kdebug.h>
53 #include "lf_hfs_locks.h"
54 #include "lf_hfs_journal.h"
55 #include "lf_hfs_vfsutils.h"
56 #include "lf_hfs_raw_read_write.h"
57 #include "lf_hfs_generic_buf.h"
58 #include "lf_hfs_logger.h"
59 #include "lf_hfs_vfsops.h"
60
61 // ************************** Function Definitions ***********************
62 // number of bytes to checksum in a block_list_header
63 // NOTE: this should be enough to clear out the header
64 // fields as well as the first entry of binfo[]
65
66 #define CHECK_JOURNAL(jnl) \
67 do { \
68 if (jnl == NULL) { \
69 printf("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \
70 panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \
71 } \
72 if (jnl->jdev == NULL) { \
73 printf("%s:%d: jdev is null!\n", __FILE__, __LINE__); \
74 panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \
75 } \
76 if (jnl->fsdev == NULL) { \
77 printf("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \
78 panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \
79 } \
80 if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) { \
81 printf("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \
82 __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \
83 panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \
84 __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \
85 } \
86 if (jnl->jhdr->start <= 0 || jnl->jhdr->start > jnl->jhdr->size) { \
87 printf("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
88 __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \
89 panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
90 __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \
91 } \
92 if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) { \
93 printf("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
94 __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \
95 panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
96 __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \
97 } \
98 } while(0)
99
100 #define CHECK_TRANSACTION(tr) \
101 do { \
102 if (tr == NULL) { \
103 printf("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \
104 panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \
105 } \
106 if (tr->jnl == NULL) { \
107 printf("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \
108 panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \
109 } \
110 if (tr->blhdr != (block_list_header *)tr->tbuffer) { \
111 printf("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \
112 panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \
113 } \
114 if (tr->total_bytes < 0) { \
115 printf("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \
116 panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \
117 } \
118 if (tr->journal_start < 0) { \
119 printf("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \
120 panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \
121 } \
122 if (tr->journal_end < 0) { \
123 printf("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \
124 panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \
125 } \
126 if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) { \
127 printf("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \
128 panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \
129 } \
130 } while(0)
131
132 #define SWAP16(x) OSSwapInt16(x)
133 #define SWAP32(x) OSSwapInt32(x)
134 #define SWAP64(x) OSSwapInt64(x)
135
136 #define JNL_WRITE 0x0001
137 #define JNL_READ 0x0002
138 #define JNL_HEADER 0x8000
139
140 #define BLHDR_CHECKSUM_SIZE 32
141 #define MAX_JOURNAL_SIZE 0x80000000U
142
143 #define STARTING_BUCKETS 256
// One journaled block discovered while scanning the journal: which device
// block it belongs to, where its payload sits inside the journal, its size,
// and a checksum of its contents.  (Presumably consumed by the replay /
// block-coalescing code, which is not visible in this chunk — confirm.)
typedef struct bucket {
    off_t block_num;        // destination block number on the fs device
    uint32_t jnl_offset;    // offset of this block's copy within the journal (presumably bytes — TODO confirm)
    uint32_t block_size;    // size of the block's payload, in bytes
    int32_t cksum;          // checksum of the block's payload
} bucket;
150
151 static int replay_journal(journal *jnl);
152 static void free_old_stuff(journal *jnl);
153 static errno_t journal_allocate_transaction(journal *jnl);
154 static void get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl);
155 static size_t read_journal_header(journal *jnl, void *data, size_t len);
156 static size_t do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction);
157 static unsigned int calc_checksum(const char *ptr, int len);
158 static void swap_journal_header(journal *jnl);
159 static int end_transaction(transaction *tr,
160 int force_it,
161 errno_t (*callback)(void*),
162 void *callback_arg,
163 boolean_t drop_lock);
164 static void abort_transaction(journal *jnl, transaction *tr);
165 static void size_up_tbuffer(journal *jnl, uint32_t tbuffer_size, uint32_t phys_blksz);
166 static void lock_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name);
167 static void wait_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name);
168 static void unlock_condition(journal *jnl, ConditionalFlag_S *psCondFlag);
169 static int write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num);
170 static size_t read_journal_data(journal *jnl, off_t *offset, void *data, size_t len);
171 static size_t write_journal_data(journal *jnl, off_t *offset, void *data, size_t len);
172
173
// Acquire the mutex guarding the journal's "old start" bookkeeping
// (the state reclaimed by free_old_stuff()).
static __inline__ void lock_oldstart(journal *jnl) {
    lf_lck_mtx_lock(&jnl->old_start_lock);
}
177
// Release the "old start" bookkeeping mutex taken by lock_oldstart().
static __inline__ void unlock_oldstart(journal *jnl) {
    lf_lck_mtx_unlock(&jnl->old_start_lock);
}
181
// Acquire the journal's transaction mutex and record the calling thread as
// owner.  The mutex serializes entry, so finding a non-NULL owner here means
// the owner field was corrupted or never cleared -- panic rather than
// continue with broken state.
__inline__ void journal_lock(journal *jnl) {
    lf_lck_mtx_lock(&jnl->jlock);
    if (jnl->owner) {
        panic ("jnl: owner is %p, expected NULL\n", jnl->owner);
    }
    jnl->owner = pthread_self();
}
189
// Clear the owner thread and drop the journal's transaction mutex.
// Owner must be cleared first, while the lock is still held.
__inline__ void journal_unlock(journal *jnl) {
    jnl->owner = NULL;
    lf_lck_mtx_unlock(&jnl->jlock);
}
194
// Acquire the flush mutex (flock), which serializes journal flushing.
static __inline__ void lock_flush(journal *jnl) {
    lf_lck_mtx_lock(&jnl->flock);
}
198
// Release the flush mutex taken by lock_flush().
static __inline__ void unlock_flush(journal *jnl) {
    lf_lck_mtx_unlock(&jnl->flock);
}
202
203 // ************************** Global Variables ***********************
204 // Journal Locking
205 lck_grp_attr_t *jnl_group_attr = NULL;
206 lck_attr_t *jnl_lock_attr = NULL;
207 lck_grp_t *jnl_mutex_group = NULL;
208
209 // By default, we grow the list of extents to trim by 4K at a time.
210 // We'll opt to flush a transaction if it contains at least
211 // JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number
212 // of modified blocks is small).
213 enum {
214 JOURNAL_DEFAULT_TRIM_BYTES = 4096,
215 JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t),
216 JOURNAL_FLUSH_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16
217 };
218
219 unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS;
220
221 // tbuffer
222 #define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024)
223 #define MAX_TRANSACTION_BUFFER_SIZE (3072*1024)
224 uint32_t def_tbuffer_size = 0; // XXXdbg - so I can change it in the debugger
225
226 // ************************** Global Functions ***********************
// One-time global setup: allocate the lock attribute/group objects shared by
// all journal instances.  Intended to run once, before any journal is
// created or opened.
void journal_init(void) {

    jnl_lock_attr = lf_lck_attr_alloc_init();
    jnl_group_attr = lf_lck_grp_attr_alloc_init();
    jnl_mutex_group = lf_lck_grp_alloc_init();
}
233
/*
 * journal_open
 *
 * Open an existing journal located at byte 'offset' on the journal device
 * 'jvp', validate its on-disk header, and replay any committed transactions
 * (unless JOURNAL_RESET is passed, which discards them).  Returns a fully
 * initialized journal handle, or NULL on any failure (everything allocated
 * here is released on the error paths).
 *
 * jvp          - vnode of the device holding the journal
 * offset       - byte offset of the journal on that device
 * journal_size - size of the journal in bytes (validated below)
 * fsvp         - vnode of the file-system device the journal protects
 * min_fs_blksz - smallest block size the fs will write through us
 * flags        - JOURNAL_OPTION_* flags; JOURNAL_RESET skips replay
 * tbuffer_size - requested transaction-buffer size (see size_up_tbuffer)
 * flush/arg    - callback used to flush the fs's buffer cache
 * fsmount      - mount point, retained for IO-throttling accounting
 */
journal *journal_open(struct vnode *jvp,
                      off_t offset,
                      off_t journal_size,
                      struct vnode *fsvp,
                      size_t min_fs_blksz,
                      int32_t flags,
                      int32_t tbuffer_size,
                      void (*flush)(void *arg),
                      void *arg,
                      struct mount *fsmount) {
    journal *jnl;
    uint32_t orig_blksz = 0;   // non-zero => header/device block-size mismatch was detected
    uint32_t phys_blksz;
    u_int32_t min_size = 0;
    int orig_checksum, checksum;

    /* Get the real physical block size. */
    if (ioctl(jvp->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
        goto cleanup_jdev_name;
    }

    // The journal's block size must not exceed the smallest block the fs
    // will hand us.
    if (phys_blksz > min_fs_blksz) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: error: phys blksize %u bigger than min fs blksize %zd\n",
                  phys_blksz, min_fs_blksz);
        goto cleanup_jdev_name;
    }

    // Sanity-bound the journal size: 256KB minimum, 1GB maximum.
    if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal size %lld looks bogus.\n", journal_size);
        goto cleanup_jdev_name;
    }

    min_size = phys_blksz * (phys_blksz / sizeof(block_info));
    /* Reject journals that are too small given the sector size of the device */
    if (journal_size < min_size) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal size (%lld) too small given sector size of (%u)\n",
                  journal_size, phys_blksz);
        goto cleanup_jdev_name;
    }

    // The journal must be a whole number of device blocks.
    if ((journal_size % phys_blksz) != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
                  journal_size, phys_blksz);
        goto cleanup_jdev_name;
    }

    jnl = hfs_mallocz(sizeof(struct journal));

    jnl->jdev = jvp;
    jnl->jdev_offset = offset;
    jnl->jdev_blknum = (uint32_t)(offset / min_fs_blksz);
    jnl->fsdev = fsvp;
    jnl->flush = flush;
    jnl->flush_arg = arg;
    jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
    lf_lck_mtx_init(&jnl->old_start_lock);
    lf_cond_init(&jnl->flushing.sCond);
    lf_cond_init(&jnl->asyncIO.sCond);
    lf_cond_init(&jnl->writing_header.sCond);

    /* We hold the mount to later pass to the throttling code for IO
     * accounting.
     */
    jnl->fsmount = fsmount;

    get_io_info(jvp, phys_blksz, jnl);

    jnl->header_buf = hfs_malloc(phys_blksz);
    jnl->header_buf_size = phys_blksz;

    jnl->jhdr = (journal_header *)jnl->header_buf;
    memset(jnl->jhdr, 0, sizeof(journal_header));

    // we have to set this up here so that do_journal_io() will work
    jnl->jhdr->jhdr_size = phys_blksz;

    if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: could not read %u bytes for the journal header.\n",
                  phys_blksz);
        goto bad_journal;
    }

    /*
     * Check for a bad jhdr size after reading in the journal header.
     * The journal header length cannot be zero
     */
    if (jnl->jhdr->jhdr_size == 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: bad jhdr size (%d) \n", jnl->jhdr->jhdr_size);
        goto bad_journal;
    }

    // The stored checksum covers the header with its checksum field zeroed,
    // so zero it before recomputing.
    orig_checksum = jnl->jhdr->checksum;
    jnl->jhdr->checksum = 0;

    if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {

        // do this before the swap since it's done byte-at-a-time
        orig_checksum = SWAP32(orig_checksum);
        checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
        swap_journal_header(jnl);
        jnl->flags |= JOURNAL_NEED_SWAP;

    } else {

        checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
    }

    if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal magic is bad (0x%x != 0x%x)\n",
                  jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
        goto bad_journal;
    }

    // only check if we're the current journal header magic value
    if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {

        if (orig_checksum != checksum) {
            LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal checksum is bad (0x%x != 0x%x)\n",
                      orig_checksum, checksum);

            // NOTE(review): a header-checksum mismatch is deliberately logged
            // but tolerated -- the bail-out below is commented out.
            //goto bad_journal;
        }
    }

    // XXXdbg - convert old style magic numbers to the new one
    if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
        jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
    }

    if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
        /*
         * The volume has probably been resized (such that we had to adjust the
         * logical sector size), or copied to media with a different logical
         * sector size.
         *
         * For us, though, no big deal because we are giving byte offsets to
         * pread() and pwrite() to do our I/O, and as long as we use self-
         * consistent units, we are all good.
         */
        LFHFS_LOG(LEVEL_ERROR,
                  "jnl: block size mismatch: phys_blksz=%llu, jhdr->jhdr_size=%llu -- COMPENSATING\n",
                  (unsigned long long)phys_blksz, (unsigned long long)jnl->jhdr->jhdr_size);
        orig_blksz = phys_blksz;   // remember so we rewrite the header after replay
    }

    // Validate the on-disk start/end/size fields before trusting them.
    if ( jnl->jhdr->start <= 0
        || jnl->jhdr->start > jnl->jhdr->size
        || jnl->jhdr->start > 1024*1024*1024) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
                  jnl->jhdr->start, jnl->jhdr->size);
        goto bad_journal;
    }

    if ( jnl->jhdr->end <= 0
        || jnl->jhdr->end > jnl->jhdr->size
        || jnl->jhdr->end > 1024*1024*1024) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
                  jnl->jhdr->end, jnl->jhdr->size);
        goto bad_journal;
    }

    if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr size looks bad (0x%llx)\n", jnl->jhdr->size);
        goto bad_journal;
    }

    // XXXdbg - can't do these checks because hfs writes all kinds of
    // non-uniform sized blocks even on devices that have a block size
    // that is larger than 512 bytes (i.e. optical media w/2k blocks).
    // therefore these checks will fail and so we just have to punt and
    // do more relaxed checking...
    // XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
    if ((jnl->jhdr->start % 512) != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal start (0x%llx) not a multiple of 512?\n",
                  jnl->jhdr->start);
        goto bad_journal;
    }

    //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
    if ((jnl->jhdr->end % 512) != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
                  jnl->jhdr->end, jnl->jhdr->jhdr_size);
        goto bad_journal;
    }

    if (jnl->jhdr->blhdr_size < 0) {
        //throw out invalid sizes
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: blhdr size looks bogus! (%d) \n",
                  jnl->jhdr->blhdr_size);
        goto bad_journal;
    }

    // take care of replaying the journal if necessary
    if (flags & JOURNAL_RESET) {
        // Caller asked us to discard whatever the journal contains.
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal start/end pointers reset! (s 0x%llx e 0x%llx)\n",
                  jnl->jhdr->start, jnl->jhdr->end);
        jnl->jhdr->start = jnl->jhdr->end;
    } else if (replay_journal(jnl) != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_open: Error replaying the journal!\n");
        goto bad_journal;
    }

    /*
     * When we get here, we know that the journal is empty (jnl->jhdr->start ==
     * jnl->jhdr->end). If the device's logical block size was different from
     * the journal's header size, then we can now restore the device's logical
     * block size and update the journal's header size to match.
     *
     * Note that we also adjust the journal's start and end so that they will
     * be aligned on the new block size. We pick a new sequence number to
     * avoid any problems if a replay found previous transactions using the old
     * journal header size. (See the comments in journal_create(), above.)
     */

    if (orig_blksz != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: updating journal header with block size %llu\n",
                  (unsigned long long)phys_blksz);

        jnl->jhdr->jhdr_size = phys_blksz;
        jnl->jhdr->start = phys_blksz;
        jnl->jhdr->end = phys_blksz;
        jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
                                   (journal_size / phys_blksz) +
                                   (random() % 16384)) & 0x00ffffff;

        if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
            LFHFS_LOG(LEVEL_ERROR, "jnl: open: failed to update journal header size\n");
            goto bad_journal;
        }
    }

    // make sure this is in sync!
    jnl->active_start = jnl->jhdr->start;
    jnl->sequence_num = jnl->jhdr->sequence_num;

    // set this now, after we've replayed the journal
    size_up_tbuffer(jnl, tbuffer_size, phys_blksz);

    // TODO: Does this need to change if the device's logical block size changed?
    if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jnl->jhdr->size,
                  jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
        goto bad_journal;
    }

    lf_lck_mtx_init(&jnl->jlock);
    lf_lck_mtx_init(&jnl->flock);
    lf_lck_rw_init(&jnl->trim_lock);

    goto journal_open_complete;

bad_journal:
    // NOTE(review): the condition variables and old_start_lock initialized
    // above are not torn down on this path -- confirm whether the lf_* lock
    // primitives require explicit destruction.
    hfs_free(jnl->header_buf);
    hfs_free(jnl);
cleanup_jdev_name:
    jnl = NULL;
journal_open_complete:
    return jnl;
}
493
494 journal *journal_create(struct vnode *jvp,
495 off_t offset,
496 off_t journal_size,
497 struct vnode *fsvp,
498 size_t min_fs_blksz,
499 int32_t flags,
500 int32_t tbuffer_size,
501 void (*flush)(void *arg),
502 void *arg,
503 struct mount *fsmount) {
504
505 journal *jnl;
506 uint32_t phys_blksz, new_txn_base;
507 u_int32_t min_size;
508
509 /*
510 * Cap the journal max size to 2GB. On HFS, it will attempt to occupy
511 * a full allocation block if the current size is smaller than the allocation
512 * block on which it resides. Once we hit the exabyte filesystem range, then
513 * it will use 2GB allocation blocks. As a result, make the cap 2GB.
514 */
515
516 /* Get the real physical block size. */
517 if (ioctl(jvp->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
518 goto cleanup_jdev_name;
519 }
520
521 if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
522 LFHFS_LOG(LEVEL_ERROR, "jnl: create: journal size %lld looks bogus.\n", journal_size);
523 goto cleanup_jdev_name;
524 }
525
526 min_size = phys_blksz * (phys_blksz / sizeof(block_info));
527 /* Reject journals that are too small given the sector size of the device */
528 if (journal_size < min_size) {
529 LFHFS_LOG(LEVEL_ERROR, "jnl: create: journal size (%lld) too small given sector size of (%u)\n",
530 journal_size, phys_blksz);
531 goto cleanup_jdev_name;
532 }
533
534 if (phys_blksz > min_fs_blksz) {
535 LFHFS_LOG(LEVEL_ERROR, "jnl: create: error: phys blksize %u bigger than min fs blksize %zd\n",
536 phys_blksz, min_fs_blksz);
537 goto cleanup_jdev_name;
538 }
539
540 if ((journal_size % phys_blksz) != 0) {
541 LFHFS_LOG(LEVEL_ERROR, "jnl: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n",
542 journal_size, phys_blksz);
543 goto cleanup_jdev_name;
544 }
545
546
547 jnl = hfs_mallocz(sizeof(struct journal));
548
549 jnl->jdev = jvp;
550 jnl->jdev_offset = offset;
551 jnl->jdev_blknum = (uint32_t)(offset / min_fs_blksz);
552 jnl->fsdev = fsvp;
553 jnl->flush = flush;
554 jnl->flush_arg = arg;
555 jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
556 lf_lck_mtx_init(&jnl->old_start_lock);
557
558 // Keep a point to the mount around for use in IO throttling.
559 jnl->fsmount = fsmount;
560
561 get_io_info(jvp, phys_blksz, jnl);
562
563 jnl->header_buf = hfs_malloc(phys_blksz);
564 jnl->header_buf_size = phys_blksz;
565
566 jnl->jhdr = (journal_header *)jnl->header_buf;
567 memset(jnl->jhdr, 0, sizeof(journal_header));
568
569 // we have to set this up here so that do_journal_io() will work
570 jnl->jhdr->jhdr_size = phys_blksz;
571
572 //
573 // We try and read the journal header to see if there is already one
574 // out there. If there is, it's possible that it has transactions
575 // in it that we might replay if we happen to pick a sequence number
576 // that is a little less than the old one, there is a crash and the
577 // last txn written ends right at the start of a txn from the previous
578 // incarnation of this file system. If all that happens we would
579 // replay the transactions from the old file system and that would
580 // destroy your disk. Although it is extremely unlikely for all those
581 // conditions to happen, the probability is non-zero and the result is
582 // severe - you lose your file system. Therefore if we find a valid
583 // journal header and the sequence number is non-zero we write junk
584 // over the entire journal so that there is no way we will encounter
585 // any old transactions. This is slow but should be a rare event
586 // since most tools erase the journal.
587 //
588 if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz
589 && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC
590 && jnl->jhdr->sequence_num != 0) {
591
592 new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff;
593 LFHFS_LOG(LEVEL_ERROR, "jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base);
594
595 } else {
596 new_txn_base = random() & 0x00ffffff;
597 }
598
599 memset(jnl->header_buf, 0, phys_blksz);
600
601 jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
602 jnl->jhdr->endian = ENDIAN_MAGIC;
603 jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself
604 jnl->jhdr->end = phys_blksz;
605 jnl->jhdr->size = journal_size;
606 jnl->jhdr->jhdr_size = phys_blksz;
607 size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
608
609 jnl->active_start = jnl->jhdr->start;
610
611 jnl->jhdr->sequence_num = new_txn_base;
612
613 lf_lck_mtx_init(&jnl->jlock);
614 lf_lck_mtx_init(&jnl->flock);
615 lf_lck_rw_init(&jnl->trim_lock);
616
617 lf_cond_init(&jnl->flushing.sCond);
618 lf_cond_init(&jnl->asyncIO.sCond);
619 lf_cond_init(&jnl->writing_header.sCond);
620 jnl->flush_aborted = FALSE;
621 jnl->async_trim = NULL;
622 jnl->sequence_num = jnl->jhdr->sequence_num;
623
624 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
625 LFHFS_LOG(LEVEL_ERROR, "jnl: journal_create: failed to write journal header.\n");
626 goto bad_write;
627 }
628
629 goto journal_create_complete;
630
631
632 bad_write:
633 hfs_free(jnl->header_buf);
634 jnl->jhdr = NULL;
635 hfs_free(jnl);
636 cleanup_jdev_name:
637 jnl = NULL;
638 journal_create_complete:
639 return jnl;
640 }
641
642
643
// Return the thread currently recorded as owning the journal lock
// (NULL when unowned).  Read without the lock; callers in this file use
// it only for "am I the owner" comparisons against pthread_self().
void *journal_owner(journal *jnl) {
    return jnl->owner;
}
647
648 /* Is the given cnode either the .journal or .journal_info_block file on
649 * a volume with an active journal? Many VNOPs use this to deny access
650 * to those files.
651 *
652 * Note: the .journal file on a volume with an external journal still
653 * returns true here, even though it does not actually hold the contents
654 * of the volume's journal.
655 */
656 _Bool hfs_is_journal_file(struct hfsmount *hfsmp, struct cnode *cp) {
657 if (hfsmp->jnl != NULL &&
658 (cp->c_fileid == hfsmp->hfs_jnlinfoblkid ||
659 cp->c_fileid == hfsmp->hfs_jnlfileid)) {
660 return true;
661 } else {
662 return false;
663 }
664 }
665
666 bool is_journaled(UVFSFileNode *psRootNode) {
667
668 struct vnode *psRootVnode = *psRootNode;
669
670 if (!psRootNode) {
671 LFHFS_LOG(LEVEL_DEBUG, "is_journaled: psRootNode is NULL");
672 return false;
673 }
674
675 if (!psRootVnode->sFSParams.vnfs_mp) {
676 LFHFS_LOG(LEVEL_DEBUG, "is_journaled: psRootVnode->sFSParams.vnfs_mp is NULL");
677 return false;
678 }
679
680 if (psRootVnode->sFSParams.vnfs_mp->psHfsmount->jnl)
681 return true;
682
683 return false;
684 }
685
686
687 // Media no longer available, clear all memory occupied by the journal
// Media no longer available, clear all memory occupied by the journal.
// Unlike journal_close(), nothing is flushed or written: all transaction
// state is simply aborted and freed.
void journal_release(journal *jnl) {
    // Take ownership unless the calling thread already holds the journal lock.
    if (jnl->owner != pthread_self()) {
        journal_lock(jnl);
    }

    // Throw away any open or buffered transaction without touching media.
    if (jnl->active_tr) {
        abort_transaction(jnl, jnl->active_tr);
    }

    if (jnl->cur_tr) {
        abort_transaction(jnl, jnl->cur_tr);
    }

    free_old_stuff(jnl);

    hfs_free(jnl->header_buf);
    jnl->jhdr = (void *)0xbeefbabe;   // poison so stale users fault recognizably

    journal_unlock(jnl);
    lf_lck_mtx_destroy(&jnl->old_start_lock);
    lf_lck_mtx_destroy(&jnl->jlock);
    lf_lck_mtx_destroy(&jnl->flock);
    // NOTE(review): trim_lock and the condition variables are not destroyed
    // here (journal_close behaves the same) -- confirm the lf_* primitives
    // do not require explicit teardown.
    hfs_free(jnl);
}
712
713
// Shut down the journal cleanly: commit/flush any pending transactions,
// drain the journal (polling the fs flush callback), write a final header,
// and free all journal memory.  If the journal has been marked invalid,
// outstanding transactions are aborted instead of flushed.
void journal_close(journal *jnl) {
    volatile off_t *start, *end;
    int counter=0;

    CHECK_JOURNAL(jnl);

    // set this before doing anything that would block so that
    // we start tearing things down properly.
    //
    jnl->flags |= JOURNAL_CLOSE_PENDING;

    if (jnl->owner != pthread_self()) {
        journal_lock(jnl);
    }

    wait_condition(jnl, &jnl->flushing, "journal_close");

    //
    // only write stuff to disk if the journal is still valid
    //
    if ((jnl->flags & JOURNAL_INVALID) == 0) {

        if (jnl->active_tr) {
            /*
             * "journal_end_transaction" will fire the flush asynchronously
             */
            journal_end_transaction(jnl);
        }

        // flush any buffered transactions
        if (jnl->cur_tr) {
            transaction *tr = jnl->cur_tr;

            jnl->cur_tr = NULL;
            /*
             * "end_transaction" will wait for any in-progress flush to complete
             * before flushing "cur_tr" synchronously("must_wait" == TRUE)
             */
            end_transaction(tr, 1, NULL, NULL, FALSE);
        }
        /*
         * if there was an "active_tr", make sure we wait for
         * it to flush if there was no "cur_tr" to process
         */
        wait_condition(jnl, &jnl->flushing, "journal_close");

        //start = &jnl->jhdr->start;
        start = &jnl->active_start;
        end = &jnl->jhdr->end;

        // Poll (up to 5000 * 10ms = ~50s) until the fs flush callback has
        // drained everything and the journal start catches up with end.
        while (*start != *end && counter++ < 5000) {
            //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
            if (jnl->flush) {
                jnl->flush(jnl->flush_arg);
            }
            usleep(10000);
        }

        if (*start != *end) {
            LFHFS_LOG(LEVEL_ERROR, "jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
                      *start, *end);
        }

        // make sure this is in sync when we close the journal
        jnl->jhdr->start = jnl->active_start;

        // if this fails there's not much we can do at this point...
        write_journal_header(jnl, 1, jnl->sequence_num);
    } else {
        // if we're here the journal isn't valid any more.
        // so make sure we don't leave any locked blocks lying around
        LFHFS_LOG(LEVEL_ERROR, "jnl: close: journal is invalid. aborting outstanding transactions\n");
        if (jnl->active_tr || jnl->cur_tr) {
            transaction *tr;

            if (jnl->active_tr) {
                tr = jnl->active_tr;
                jnl->active_tr = NULL;
            } else {
                tr = jnl->cur_tr;
                jnl->cur_tr = NULL;
            }
            abort_transaction(jnl, tr);

            // After aborting one, neither slot should still be occupied.
            if (jnl->active_tr || jnl->cur_tr) {
                panic("jnl: close: jnl @ %p had both an active and cur tr\n", jnl);
            }
        }
    }
    wait_condition(jnl, &jnl->asyncIO, "journal_close");

    free_old_stuff(jnl);

    hfs_free(jnl->header_buf);
    jnl->jhdr = (void *)0xbeefbabe;   // poison so stale users fault recognizably

    journal_unlock(jnl);
    lf_lck_mtx_destroy(&jnl->old_start_lock);
    lf_lck_mtx_destroy(&jnl->jlock);
    lf_lck_mtx_destroy(&jnl->flock);
    hfs_free(jnl);
}
816
817 // This function performs the following:
818 // 1) Checks that we have a valid journal
819 // 2) locks the journal
// 3) Allocates room in the journal
int journal_start_transaction(journal *jnl) {

    int ret;

#if JOURNAL_DEBUG
    printf("journal_start_transaction (%u).\n", jnl->nested_count);
#endif

    CHECK_JOURNAL(jnl);

    // Reap transactions whose IO has completed since the last call.
    free_old_stuff(jnl);

    if (jnl->flags & JOURNAL_INVALID) {
        return EINVAL;
    }

    // Nested start: this thread already owns the journal, so just bump the
    // nesting count -- the outermost journal_end_transaction() commits.
    if (jnl->owner == pthread_self()) {
        if (jnl->active_tr == NULL) {
            panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n",
                  jnl, jnl->owner, pthread_self());
        }
        jnl->nested_count++;
        return 0;
    }

    journal_lock(jnl);

    // With the lock freshly acquired there must be no leftover nesting or
    // active transaction from another thread.
    if (jnl->nested_count != 0 || jnl->active_tr != NULL) {
        panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n",
              jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
    }

    jnl->nested_count = 1;

    // if there's a buffered transaction, use it.
    if (jnl->cur_tr) {
        jnl->active_tr = jnl->cur_tr;
        jnl->cur_tr = NULL;

        return 0;
    }

    ret = journal_allocate_transaction(jnl);
    if (ret) {
        goto bad_start;
    }

    // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr);

    return 0;

bad_start:
    // Allocation failed: undo the nesting count and drop the lock.
    jnl->nested_count = 0;
    journal_unlock(jnl);

    return ret;
}
// journal_end_transaction
// This function does the following:
// 1) Validates journal status/state
// 2) Decrements the nesting count; when the outermost transaction ends,
//    commits the active transaction via end_transaction() (or aborts it
//    if the journal has been marked invalid)
int journal_end_transaction(journal *jnl) {
    int ret;
    transaction *tr;

#if JOURNAL_DEBUG
    printf("journal_end_transaction (%u).\n", jnl->nested_count-1);
#endif

    CHECK_JOURNAL(jnl);

    // Reap transactions whose IO has completed since the last call.
    free_old_stuff(jnl);

    // An invalid journal with no owner means there is nothing to unwind.
    if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
        return 0;
    }

    // Only the thread that started the transaction may end it.
    if (jnl->owner != pthread_self()) {
        panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n",
              jnl, jnl->owner, pthread_self());
    }
    jnl->nested_count--;

    // Still nested: the outermost journal_end_transaction() will commit.
    if (jnl->nested_count > 0) {
        return 0;
    } else if (jnl->nested_count < 0) {
        panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
    }

    if (jnl->flags & JOURNAL_INVALID) {
        // The journal went bad while this transaction was open: abort it
        // rather than committing, then release the lock.
        if (jnl->active_tr) {
            if (jnl->cur_tr != NULL) {
                panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n",
                      jnl, jnl->active_tr, jnl->cur_tr);
            }
            tr = jnl->active_tr;
            jnl->active_tr = NULL;

            abort_transaction(jnl, tr);
        }
        journal_unlock(jnl);

        return EINVAL;
    }

    tr = jnl->active_tr;
    CHECK_TRANSACTION(tr);

    // clear this out here so that when check_free_space() calls
    // the FS flush function, we don't panic in journal_flush()
    // if the FS were to call that. note: check_free_space() is
    // called from end_transaction().
    jnl->active_tr = NULL;

    /* Examine the force-journal-flush state in the active txn */
    if (tr->flush_on_completion == TRUE) {
        /*
         * If the FS requested it, disallow group commit and force the
         * transaction out to disk immediately.
         */
        ret = end_transaction(tr, 1, NULL, NULL, TRUE);
    }
    else {
        /* in the common path we can simply use the double-buffered journal */
        ret = end_transaction(tr, 0, NULL, NULL, TRUE);
    }

    return ret;
}
950
951 // journal_modify_block_start
952 // This function does the following:
953 // 1) Makes sure the journal file is on and valid
954 // 2) Clean up (free previous transactions)
955 // 3) Validate that the phy-block-size has not changed.
956 // 4) Locks the buffer.
957 // Buffer life cycle with journal:
958 // 1) Client code (ie btrees_io.c) allocates a buffer (ie gains ownership). Other threads will pend on using this buffer until it is released.
959 // 2) Client code calls journal_modify_block_start which sets the GEN_BUF_WRITE_LOCK uCacheFlag.
960 // 3) Client code modifies the buffer.
961 // 4) Client code calls journal_modify_block_end which released the buffer. The GEN_BUF_WRITE_LOCK flag remains set.
962 // It this point other threads are welcomed to modify the buffer (after executing steps 1 and 2 above). The buffer content will not be written to media before transaction_end, thus only the accumulative change of both threads after transaction_end will be committed.
963 // 5) transaction-end (called from within client-code or async Sync) obtains ownership on in transaction buffers. By doing that it makes sure no buffer is currently being modified by any Client code. It then prepares the buffer for commiting (ie realigns endianizm), and commits (writes to the t-buffer, write the t-buffer to media, updates journal-info, clears the GEN_BUF_WRITE_LOCK flags and writes the buffers to media).
int journal_modify_block_start(journal *jnl, GenericLFBuf *psGenBuf) {

    transaction *tr;

#if JOURNAL_DEBUG
    printf("journal_modify_block_start: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uCacheFlags 0x%llx, uPhyCluster %llu, uLockCnt %u\n",
           psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize, psGenBuf->uCacheFlags ,psGenBuf->uPhyCluster, psGenBuf->uLockCnt);
#endif

    CHECK_JOURNAL(jnl);

    // Reap completed transactions before touching journal state.
    free_old_stuff(jnl);

    if (jnl->flags & JOURNAL_INVALID) {
        return EINVAL;
    }

    // Must be called between journal_start_transaction() and
    // journal_end_transaction(), by the owning thread.
    tr = jnl->active_tr;
    CHECK_TRANSACTION(tr);

    if (jnl->owner != pthread_self()) {
        panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
              jnl, jnl->owner, pthread_self());
    }

    //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
    // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);

    // can't allow blocks that aren't an even multiple of the
    // underlying block size.
    if ((psGenBuf->uDataSize % jnl->jhdr->jhdr_size) != 0) {
        uint32_t bad=0;
        uint32_t phys_blksz;

        // The device's physical block size may have changed since the journal
        // header was created — re-query it and try to reconcile before
        // declaring the buffer size bad.
        if (ioctl(jnl->jdev->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
            bad = 1;
        } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) {
            if (phys_blksz < 512) {
                panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n",
                      phys_blksz, psGenBuf->uDataSize, jnl->jhdr->jhdr_size);
            }

            if ((psGenBuf->uDataSize % phys_blksz) != 0) {
                // Buffer isn't a multiple of the new physical size either.
                bad = 1;
            } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) {
                // Device block size shrank: the existing header buffer is big
                // enough, just record the smaller size.
                jnl->jhdr->jhdr_size = phys_blksz;
            } else {
                // the phys_blksz is now larger... need to realloc the jhdr
                char *new_header_buf;

                LFHFS_LOG(LEVEL_ERROR, "jnl: phys blksz got bigger (was: %d/%d now %d)\n",
                          jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz);
                new_header_buf = hfs_malloc(phys_blksz);
                memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size);
                // 0x18 fill pattern for the grown tail; presumably chosen as a
                // recognizable filler — TODO confirm significance.
                memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size));
                hfs_free(jnl->header_buf);
                jnl->header_buf = new_header_buf;
                jnl->header_buf_size = phys_blksz;

                jnl->jhdr = (journal_header *)jnl->header_buf;
                jnl->jhdr->jhdr_size = phys_blksz;
            }
        } else {
            bad = 1;
        }

        if (bad) {
            panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
                  psGenBuf->uDataSize, jnl->jhdr->jhdr_size);

            return -1;
        }
    }

    // make sure that this transaction isn't bigger than the whole journal
    if ((tr->total_bytes+psGenBuf->uDataSize) >= (size_t)(jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
        panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
              tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), psGenBuf->uDataSize, tr, psGenBuf->pvData);

        return -1;
    }

    // Mark the buffer write-locked: it stays in cache and will not hit the
    // media until the enclosing transaction is committed.
    lf_hfs_generic_buf_set_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);

    return 0;
}
1050 // journal_modify_block_end
1051 // This function does the following:
1052 // 1) Makes sure the journal file is on and valid
1053 // 2) Clean up (free previous transactions)
1054 // 3) Check if this block already exists in transaction
1055 // 4) Add block number to transcation. We dont add the block data, nor we release the buffer at this point.
1056 // This will be done later on, at the transaction-end.
int journal_modify_block_end(journal *jnl, GenericLFBuf *psGenBuf,
                             void (*func)(GenericLFBuf *bp, void *arg), void *arg) {
    int i = 1;
    size_t tbuffer_offset=0;
    block_list_header *blhdr, *prev=NULL;
    transaction *tr = NULL;

#if JOURNAL_DEBUG
    printf("journal_modify_block_end: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n",
           psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize, psGenBuf->uPhyCluster, psGenBuf->uLockCnt);
#endif

    CHECK_JOURNAL(jnl);

    // Reap completed transactions before touching journal state.
    free_old_stuff(jnl);

    // Optional per-buffer callback, stashed on the buffer; invoked later by
    // the transaction machinery (not called from here).
    if (func) {
        psGenBuf->pfFunc = func;
        psGenBuf->pvCallbackArgs = arg;
    }

    if (jnl->flags & JOURNAL_INVALID) {
        /* Still need to buf_brelse(). Callers assume we consume the bp. */
        lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
        lf_hfs_generic_buf_release(psGenBuf);
        return EINVAL;
    }

    tr = jnl->active_tr;
    CHECK_TRANSACTION(tr);

    // Must run inside a transaction, on the owning thread, and only after
    // journal_modify_block_start() write-locked the buffer.
    if (jnl->owner != pthread_self()) {
        panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n",
              jnl, jnl->owner, pthread_self());
    }

    if ((psGenBuf->uCacheFlags & GEN_BUF_WRITE_LOCK) == 0) {
        panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", psGenBuf, jnl);
    }

    // first check if this block is already part of this transaction
    // Note: the block-list headers are chained through binfo[0].bnum, which
    // holds the next header's pointer value rather than a block number.
    for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
        tbuffer_offset = jnl->jhdr->blhdr_size;

        for (i = 1; i < blhdr->num_blocks; i++) {
            GenericLFBuf *bp = (void*)blhdr->binfo[i].u.bp;
            if (psGenBuf == bp) {
                // Block found in transaction
#if JOURNAL_DEBUG
                printf("block_end, already in journal: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n",
                       psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize, psGenBuf->uPhyCluster, psGenBuf->uLockCnt);
#endif
                break;
            }
            // Track the running t-buffer offset: live entries contribute the
            // buffer's data size, killed entries (bnum == -1) their recorded bsize.
            if (blhdr->binfo[i].bnum != (off_t)-1) {
                off_t uSizeOfBuf = ((GenericLFBuf*)(blhdr->binfo[i].u.bp))->uDataSize;
                tbuffer_offset += uSizeOfBuf;
            } else {
                tbuffer_offset += blhdr->binfo[i].u.bi.bsize;
            }
        }

        // Inner loop broke early => block was found in this header.
        if (i < blhdr->num_blocks) {
            break;
        }
    }

    // Block not found (blhdr == NULL): append to the last header if it still
    // has a free slot and the t-buffer byte budget allows it.
    if (blhdr == NULL
        && prev
        && (prev->num_blocks+1) <= prev->max_blocks
        && (prev->bytes_used+psGenBuf->uDataSize) <= (uint32_t)tr->tbuffer_size) {
        // Block not found, add to last list
        blhdr = prev;

    } else if (blhdr == NULL) {
        block_list_header *nblhdr;
        if (prev == NULL) {
            panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, psGenBuf %p\n", jnl, psGenBuf);
        }
        // Add another tbuffer:

        // we got to the end of the list, didn't find the block and there's
        // no room in the block_list_header pointed to by prev

        // we allocate another tbuffer and link it in at the end of the list
        // through prev->binfo[0].bnum. that's a skanky way to do things but
        // avoids having yet another linked list of small data structures to manage.

        nblhdr = hfs_malloc(tr->tbuffer_size);

        // journal replay code checksum check depends on this.
        memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
        // Fill up the rest of the block with unimportant bytes
        memset(nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);

        // initialize the new guy
        nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
        nblhdr->num_blocks = 1; // accounts for this header block
        nblhdr->bytes_used = (uint32_t)jnl->jhdr->blhdr_size;
        nblhdr->flags = BLHDR_CHECK_CHECKSUMS;

        tr->num_blhdrs++;
        tr->total_bytes += jnl->jhdr->blhdr_size;

        // then link him in at the end
        prev->binfo[0].bnum = (off_t)((long)nblhdr);

        // and finally switch to using the new guy
        blhdr = nblhdr;
        i = 1;
    }

    if ((i+1) > blhdr->max_blocks) {
        panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
    }

    // if this is true then this is a new block we haven't seen before
    if (i >= blhdr->num_blocks) {
        off_t bsize;
        bsize = psGenBuf->uDataSize;

        // Add block to list
        blhdr->binfo[i].bnum = (off_t)(psGenBuf->uBlockN);
        blhdr->binfo[i].u.bp = (void*)psGenBuf;

        blhdr->bytes_used += bsize;
        tr->total_bytes += bsize;

        blhdr->num_blocks++;
    }

    // We can release the block here to allow other threads to perform operations on it until the next transaction-end.
    // The buffer will not be removed from cache since it is write-locked.
    lf_hfs_generic_buf_release(psGenBuf);

    return 0;
}
1194
// journal_flush — force journal (and optionally metadata) content to disk.
1196 /*
1197 * Flush the contents of the journal to the disk.
1198 *
1199 * Input:
1200 * wait_for_IO -
1201 * If TRUE, wait to write in-memory journal to the disk
1202 * consistently, and also wait to write all asynchronous
1203 * metadata blocks to its corresponding locations
1204 * consistently on the disk. This means that the journal
1205 * is empty at this point and does not contain any
1206 * transactions. This is overkill in normal scenarios
1207 * but is useful whenever the metadata blocks are required
1208 * to be consistent on-disk instead of just the journal
1209 * being consistent; like before live verification
1210 * and live volume resizing.
1211 *
1212 * If FALSE, only wait to write in-memory journal to the
1213 * disk consistently. This means that the journal still
1214 * contains uncommitted transactions and the file system
1215 * metadata blocks in the journal transactions might be
1216 * written asynchronously to the disk. But there is no
1217 * guarantee that they are written to the disk before
1218 * returning to the caller. Note that this option is
1219 * sufficient for file system data integrity as it
1220 * guarantees consistent journal content on the disk.
1221 */
1222 int journal_flush(journal *jnl, journal_flush_options_t options) {
1223 boolean_t drop_lock = FALSE;
1224 errno_t error = 0;
1225 uint32_t flush_count = 0;
1226
1227 CHECK_JOURNAL(jnl);
1228
1229 free_old_stuff(jnl);
1230
1231 if (jnl->flags & JOURNAL_INVALID) {
1232 return EINVAL;
1233 }
1234
1235 if (jnl->owner != pthread_self()) {
1236 journal_lock(jnl);
1237 drop_lock = TRUE;
1238 }
1239
1240 if (ISSET(options, JOURNAL_FLUSH_FULL))
1241 flush_count = jnl->flush_counter;
1242
1243 // if we're not active, flush any buffered transactions
1244 if (jnl->active_tr == NULL && jnl->cur_tr) {
1245 transaction *tr = jnl->cur_tr;
1246
1247 jnl->cur_tr = NULL;
1248
1249 if (ISSET(options, JOURNAL_WAIT_FOR_IO)) {
1250 wait_condition(jnl, &jnl->flushing, "journal_flush");
1251 wait_condition(jnl, &jnl->asyncIO, "journal_flush");
1252 }
1253
1254 // As the journal flush changes the MetaData content (update Endianizm), we need to lock the system times.
1255 int lockflags = hfs_systemfile_lock(jnl->fsmount->psHfsmount, SFL_CATALOG | SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
1256
1257 /*
1258 * "end_transction" will wait for any current async flush
1259 * to complete, before flushing "cur_tr"... because we've
1260 * specified the 'must_wait' arg as TRUE, it will then
1261 * synchronously flush the "cur_tr"
1262 */
1263 end_transaction(tr, 1, NULL, NULL, drop_lock); // force it to get flushed
1264
1265 hfs_systemfile_unlock(jnl->fsmount->psHfsmount, lockflags);
1266
1267 } else {
1268 if (drop_lock == TRUE) {
1269 journal_unlock(jnl);
1270 }
1271
1272 /* Because of pipelined journal, the journal transactions
1273 * might be in process of being flushed on another thread.
1274 * If there is nothing to flush currently, we should
1275 * synchronize ourselves with the pipelined journal thread
1276 * to ensure that all inflight transactions, if any, are
1277 * flushed before we return success to caller.
1278 */
1279 wait_condition(jnl, &jnl->flushing, "journal_flush");
1280 }
1281 if (ISSET(options, JOURNAL_WAIT_FOR_IO)) {
1282 wait_condition(jnl, &jnl->asyncIO, "journal_flush");
1283 }
1284
1285 if (ISSET(options, JOURNAL_FLUSH_FULL)) {
1286
1287 dk_synchronize_t sync_request = {
1288 .options = 0,
1289 };
1290
1291 // We need a full cache flush. If it has not been done, do it here.
1292 if (flush_count == jnl->flush_counter)
1293 error = ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request);
1294
1295 // If external journal partition is enabled, flush filesystem data partition.
1296 if (jnl->jdev != jnl->fsdev)
1297 error = ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request);
1298
1299 }
1300
1301 return error;
1302 }
1303
1304
1305 // ************************** Local Functions ***********************
1306 static int update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) {
1307
1308 int iRet = 0;
1309 GenericLFBuf *psGenBuf = NULL;
1310
1311 // first read the block we want.
1312 psGenBuf = lf_hfs_generic_buf_allocate(jnl->fsmount->psHfsmount->hfs_devvp,
1313 fs_block,
1314 (uint32_t)bsize,
1315 GEN_BUF_PHY_BLOCK | GEN_BUF_NON_CACHED);
1316 if (!psGenBuf) {
1317 LFHFS_LOG(LEVEL_ERROR, "jnl: update_fs_block: error allocating fs block # %lld!\n", fs_block);
1318 iRet = -1;
1319 goto exit;
1320 }
1321
1322 iRet = lf_hfs_generic_buf_read(psGenBuf);
1323 if (iRet) {
1324 LFHFS_LOG(LEVEL_ERROR, "jnl: update_fs_block: error reading fs block # %lld!\n", fs_block);
1325 goto exit;
1326 }
1327
1328 // copy the journal data over top of it
1329 memcpy(psGenBuf->pvData, block_ptr, bsize);
1330
1331 iRet = lf_hfs_generic_buf_write(psGenBuf);
1332 if (iRet) {
1333 LFHFS_LOG(LEVEL_ERROR, "jnl: update_fs_block: failed to write block %lld (ret %d)\n", fs_block, iRet);
1334 goto exit;
1335 }
1336
1337 exit:
1338 if (psGenBuf) {
1339 lf_hfs_generic_buf_release(psGenBuf);
1340 }
1341
1342 return iRet;
1343 }
1344
1345
1346 static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size) {
1347 struct bucket *newBuf;
1348 int current_size = num_buckets, i;
1349
1350 // return if newsize is less than the current size
1351 if (new_size < num_buckets) {
1352 return current_size;
1353 }
1354
1355 newBuf = hfs_malloc(new_size*sizeof(struct bucket));
1356
1357 // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
1358
1359 // copy existing elements
1360 bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));
1361
1362 // initialize the new ones
1363 for(i = num_buckets; i < new_size; i++) {
1364 newBuf[i].block_num = (off_t)-1;
1365 }
1366
1367 // free the old container
1368 hfs_free(*buf_ptr);
1369
1370 // reset the buf_ptr
1371 *buf_ptr = newBuf;
1372
1373 return new_size;
1374 }
1375
1376
1377 static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting) {
1378
1379 if (!overwriting) {
1380 // grow the table if we're out of space - we may index the table
1381 // with *num_full_ptr (lookup_bucket() can return a maximum value ==
1382 // *num_full_ptr), so we need to grow when we hit (*num_buckets_ptr - 1)
1383 // to prevent out-of-bounds indexing
1384 if (*num_full_ptr >= (*num_buckets_ptr - 1)) {
1385 int new_size = *num_buckets_ptr * 2;
1386 int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);
1387
1388 if (grow_size < new_size) {
1389 LFHFS_LOG(LEVEL_ERROR, "jnl: add_block: grow_table returned an error!\n");
1390 return -1;
1391 }
1392
1393 *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
1394 }
1395
1396 // if we're not inserting at the end, we need to bcopy
1397 if (blk_index != *num_full_ptr) {
1398 bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
1399 }
1400
1401 (*num_full_ptr)++; // increment only if we're not overwriting
1402 }
1403
1404 // sanity check the values we're about to add
1405 if ((off_t)offset >= jnl->jhdr->size) {
1406 offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
1407 }
1408 if (size <= 0) {
1409 panic("jnl: insert_block: bad size in insert_block (%zd)\n", size);
1410 }
1411
1412 (*buf_ptr)[blk_index].block_num = num;
1413 (*buf_ptr)[blk_index].block_size = (uint32_t)size;
1414 (*buf_ptr)[blk_index].jnl_offset = (uint32_t)offset;
1415 (*buf_ptr)[blk_index].cksum = cksum;
1416
1417 return blk_index;
1418 }
1419
// Resolve overlaps between a new replay entry [block_num, block_num+size)
// and the existing sorted coalescing table around blk_index.
// Returns 1 for a simple in-place overwrite of the entry at blk_index,
// 0 when the caller must still insert the new entry (insert_block).
// All offsets below are byte offsets computed as block_num * jhdr_size.
static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) {

    int num_to_remove, index, i, overwrite, err;
    size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset;
    off_t overlap, block_start, block_end;

    block_start = block_num*jhdr_size;
    block_end = block_start + size;
    // Exact same start block and at least as large => candidate for overwrite.
    overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);

    // first, eliminate any overlap with the previous entry
    if (blk_index != 0 && !overwrite) {
        off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
        off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
        overlap = prev_block_end - block_start;
        if (overlap > 0) {
            if (overlap % jhdr_size != 0) {
                panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size);
            }

            // if the previous entry completely overlaps this one, we need to break it into two pieces.
            if (prev_block_end > block_end) {
                off_t new_num = block_end / jhdr_size;
                size_t new_size = prev_block_end - block_end;

                // The tail piece keeps pointing into the previous entry's
                // journal data, offset past the newly-covered range.
                new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);

                err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0);
                if (err < 0) {
                    panic("jnl: do_overlap: error inserting during pre-overlap\n");
                }
            }

            // Regardless, we need to truncate the previous entry to the beginning of the overlap
            (*buf_ptr)[blk_index-1].block_size = (uint32_t)(block_start - prev_block_start);
            (*buf_ptr)[blk_index-1].cksum = 0; // have to blow it away because there's no way to check it
        }
    }

    // then, bail out fast if there's no overlap with the entries that follow
    if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) {
        return 0; // no overlap, no overwrite
    } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) {

        (*buf_ptr)[blk_index].cksum = cksum; // update this
        return 1; // simple overwrite
    }

    // Otherwise, find all cases of total and partial overlap. We use the special
    // block_num of -2 to designate entries that are completely overlapped and must
    // be eliminated. The block_num, size, and jnl_offset of partially overlapped
    // entries must be adjusted to keep the array consistent.
    index = blk_index;
    num_to_remove = 0;
    while (index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) {
        if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) {
            (*buf_ptr)[index].block_num = -2; // mark this for deletion
            num_to_remove++;
        } else {
            overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
            if (overlap > 0) {
                if (overlap % jhdr_size != 0) {
                    panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size);
                }

                // if we partially overlap this entry, adjust its block number, jnl offset, and size
                (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
                (*buf_ptr)[index].cksum = 0;

                new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
                if ((off_t)new_offset >= jnl->jhdr->size) {
                    new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
                }
                (*buf_ptr)[index].jnl_offset = (uint32_t)new_offset;

                (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
                if ((*buf_ptr)[index].block_size <= 0) {
                    panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size);
                    // return -1; // if above panic is removed, return -1 for error
                }
            }

        }

        index++;
    }

    // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
    index--; // start with the last index used within the above loop
    while (index >= blk_index) {
        if ((*buf_ptr)[index].block_num == -2) {
            if (index == *num_full_ptr-1) {
                (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
            } else {
                // shift everything after the deleted slot one position left
                bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
            }
            (*num_full_ptr)--;
        }
        index--;
    }

    // eliminate any stale entries at the end of the table
    for(i = *num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
        (*buf_ptr)[i].block_num = -1;
    }

    return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
}
1528
1529
// Binary-search the sorted coalescing table for block_num.
// Returns the index of the right-most matching entry, or — if the block is
// not present — the index at which it should be inserted (may equal num_full
// when it belongs at the end).
static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full) {
    int lo, hi, index, matches, i;

    if (num_full == 0) {
        return 0; // table is empty, so insert at index=0
    }

    lo = 0;
    hi = num_full - 1;
    index = -1;

    // perform binary search for block_num
    do {
        int mid = (hi - lo)/2 + lo;
        off_t this_num = (*buf_ptr)[mid].block_num;

        if (block_num == this_num) {
            index = mid;
            break;
        }

        if (block_num < this_num) {
            // note: hi is set to mid (not mid-1); the post-loop check below
            // handles the case where lo/hi converge onto the match
            hi = mid;
            continue;
        }

        if (block_num > this_num) {
            lo = mid + 1;
            continue;
        }
    } while (lo < hi);

    // check if lo and hi converged on the match
    if (block_num == (*buf_ptr)[hi].block_num) {
        index = hi;
    }

    // if no existing entry found, find index for new one
    if (index == -1) {
        index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
    } else {
        // make sure that we return the right-most index in the case of multiple matches
        matches = 0;
        i = index + 1;
        while (i < num_full && block_num == (*buf_ptr)[i].block_num) {
            matches++;
            i++;
        }

        index += matches;
    }

    return index;
}
1584
1585 // PR-3105942: Coalesce writes to the same block in journal replay
1586 // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
1587 // to be replayed and the corresponding location in the journal which contains
1588 // the most recent data for those blocks. The array is "played" once the all the
1589 // blocks in the journal have been coalesced. The code for the case of conflicting/
1590 // overlapping writes to a single block is the most dense. Because coalescing can
1591 // disrupt the existing time-ordering of blocks in the journal playback, care
1592 // is taken to catch any overlaps and keep the array consistent.
1593 static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) {
1594 int blk_index, overwriting;
1595
1596 // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
1597 // inserted (or the index of the elem to overwrite).
1598 blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);
1599
1600 // check if the index is within bounds (if we're adding this block to the end of
1601 // the table, blk_index will be equal to num_full)
1602 if (blk_index < 0 || blk_index > *num_full_ptr) {
1603 //printf("jnl: add_block: trouble adding block to co_buf\n");
1604 return -1;
1605 } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
1606
1607 // Determine whether we're overwriting an existing entry by checking for overlap
1608 overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr);
1609 if (overwriting < 0) {
1610 return -1; // if we got an error, pass it along
1611 }
1612
1613 // returns the index, or -1 on error
1614 blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting);
1615
1616 return blk_index;
1617 }
1618
1619 static void swap_block_list_header(journal *jnl, block_list_header *blhdr) {
1620 int i;
1621
1622 blhdr->max_blocks = SWAP16(blhdr->max_blocks);
1623 blhdr->num_blocks = SWAP16(blhdr->num_blocks);
1624 blhdr->bytes_used = SWAP32(blhdr->bytes_used);
1625 blhdr->checksum = SWAP32(blhdr->checksum);
1626 blhdr->flags = SWAP32(blhdr->flags);
1627
1628 if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) {
1629 LFHFS_LOG(LEVEL_ERROR, "jnl: blhdr num blocks looks suspicious (%d / blhdr size %d). not swapping.\n", blhdr->num_blocks, jnl->jhdr->blhdr_size);
1630 return;
1631 }
1632
1633 for(i = 0; i < blhdr->num_blocks; i++) {
1634 blhdr->binfo[i].bnum = SWAP64(blhdr->binfo[i].bnum);
1635 blhdr->binfo[i].u.bi.bsize = SWAP32(blhdr->binfo[i].u.bi.bsize);
1636 blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum);
1637 }
1638 }
1639
1640 static int replay_journal(journal *jnl) {
1641 int i, bad_blocks=0;
1642 unsigned int orig_checksum, checksum, check_block_checksums = 0;
1643 size_t ret;
1644 size_t max_bsize = 0; /* protected by block_ptr */
1645 block_list_header *blhdr;
1646 off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start;
1647 char *buff, *block_ptr=NULL;
1648 struct bucket *co_buf;
1649 int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory = 0;
1650 uint32_t last_sequence_num = 0;
1651 int replay_retry_count = 0;
1652
1653 LFHFS_LOG(LEVEL_DEFAULT, "replay_journal: start.\n");
1654
1655
1656 // wrap the start ptr if it points to the very end of the journal
1657 if (jnl->jhdr->start == jnl->jhdr->size) {
1658 jnl->jhdr->start = jnl->jhdr->jhdr_size;
1659 }
1660 if (jnl->jhdr->end == jnl->jhdr->size) {
1661 jnl->jhdr->end = jnl->jhdr->jhdr_size;
1662 }
1663
1664 if (jnl->jhdr->start == jnl->jhdr->end) {
1665 LFHFS_LOG(LEVEL_DEFAULT, "replay_journal: journal empty.\n");
1666 goto success;
1667 }
1668
1669 orig_jnl_start = jnl->jhdr->start;
1670
1671 // allocate memory for the header_block. we'll read each blhdr into this
1672 buff = hfs_malloc(jnl->jhdr->blhdr_size);
1673
1674 // allocate memory for the coalesce buffer
1675 co_buf = hfs_malloc(num_buckets*sizeof(struct bucket));
1676
1677 restart_replay:
1678
1679 // initialize entries
1680 for(i = 0; i < num_buckets; i++) {
1681 co_buf[i].block_num = -1;
1682 }
1683 num_full = 0; // empty at first
1684
1685
1686 while (check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) {
1687 offset = blhdr_offset = jnl->jhdr->start;
1688 ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
1689 if (ret != (size_t)jnl->jhdr->blhdr_size) {
1690 LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset);
1691 goto bad_txn_handling;
1692 }
1693
1694 blhdr = (block_list_header *)buff;
1695
1696 orig_checksum = blhdr->checksum;
1697 blhdr->checksum = 0;
1698 if (jnl->flags & JOURNAL_NEED_SWAP) {
1699 // calculate the checksum based on the unswapped data
1700 // because it is done byte-at-a-time.
1701 orig_checksum = (unsigned int)SWAP32(orig_checksum);
1702 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1703 swap_block_list_header(jnl, blhdr);
1704 } else {
1705 checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
1706 }
1707
1708
1709 //
1710 // XXXdbg - if these checks fail, we should replay as much
1711 // we can in the hopes that it will still leave the
1712 // drive in a better state than if we didn't replay
1713 // anything
1714 //
1715 if (checksum != orig_checksum) {
1716 if (check_past_jnl_end && in_uncharted_territory) {
1717
1718 if (blhdr_offset != jnl->jhdr->end) {
1719 LFHFS_LOG(LEVEL_ERROR, "jnl: Extra txn replay stopped @ %lld / 0x%llx\n", blhdr_offset, blhdr_offset);
1720 }
1721
1722 check_past_jnl_end = 0;
1723 jnl->jhdr->end = blhdr_offset;
1724 continue;
1725 }
1726
1727 LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
1728 blhdr_offset, orig_checksum, checksum);
1729
1730 if (blhdr_offset == orig_jnl_start) {
1731 // if there's nothing in the journal at all, just bail out altogether.
1732 goto bad_replay;
1733 }
1734
1735 goto bad_txn_handling;
1736 }
1737
1738 if ( (last_sequence_num != 0)
1739 && (blhdr->binfo[0].u.bi.b.sequence_num != 0)
1740 && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num)
1741 && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) {
1742
1743 txn_start_offset = jnl->jhdr->end = blhdr_offset;
1744
1745 if (check_past_jnl_end) {
1746 check_past_jnl_end = 0;
1747 LFHFS_LOG(LEVEL_ERROR, "jnl: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
1748 blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
1749 continue;
1750 }
1751
1752 LFHFS_LOG(LEVEL_ERROR, "jnl: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
1753 blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
1754 goto bad_txn_handling;
1755 }
1756 last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num;
1757
1758 if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) {
1759 if (last_sequence_num == 0) {
1760 check_past_jnl_end = 0;
1761 LFHFS_LOG(LEVEL_ERROR, "jnl: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n",
1762 jnl->jhdr->start, jnl->jhdr->end);
1763 if (jnl->jhdr->start != jnl->jhdr->end) {
1764 jnl->jhdr->start = jnl->jhdr->end;
1765 }
1766 continue;
1767 }
1768 LFHFS_LOG(LEVEL_ERROR, "jnl: examining extra transactions starting @ %lld / 0x%llx\n", blhdr_offset, blhdr_offset);
1769 }
1770
1771 if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size)
1772 || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
1773 LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
1774 blhdr->max_blocks, blhdr->num_blocks);
1775 goto bad_txn_handling;
1776 }
1777
1778 max_bsize = 0;
1779 for (i = 1; i < blhdr->num_blocks; i++) {
1780 if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
1781 LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: bogus block number 0x%llx\n", blhdr->binfo[i].bnum);
1782 goto bad_txn_handling;
1783 }
1784
1785 if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) {
1786 max_bsize = blhdr->binfo[i].u.bi.bsize;
1787 }
1788 }
1789
1790 if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) {
1791 check_block_checksums = 1;
1792 block_ptr = hfs_malloc(max_bsize);
1793 } else {
1794 block_ptr = NULL;
1795 }
1796
1797 if (blhdr->flags & BLHDR_FIRST_HEADER) {
1798 txn_start_offset = blhdr_offset;
1799 }
1800
1801 //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
1802 // blhdr->num_blocks-1, jnl->jhdr->start);
1803 bad_blocks = 0;
1804 for (i = 1; i < blhdr->num_blocks; i++) {
1805 int size, ret_val;
1806 off_t number;
1807
1808 size = blhdr->binfo[i].u.bi.bsize;
1809 number = blhdr->binfo[i].bnum;
1810
1811 // don't add "killed" blocks
1812 if (number == (off_t)-1) {
1813 //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
1814 } else {
1815
1816 if (check_block_checksums) {
1817 int32_t disk_cksum;
1818 off_t block_offset;
1819
1820 block_offset = offset;
1821
1822 // read the block so we can check the checksum
1823 ret = read_journal_data(jnl, &block_offset, block_ptr, size);
1824 if (ret != (size_t)size) {
1825 LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset);
1826 goto bad_txn_handling;
1827 }
1828
1829 disk_cksum = calc_checksum(block_ptr, size);
1830
1831 // there is no need to swap the checksum from disk because
1832 // it got swapped when the blhdr was read in.
1833 if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) {
1834 LFHFS_LOG(LEVEL_ERROR, "jnl: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
1835 txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum);
1836 LFHFS_LOG(LEVEL_ERROR, "jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
1837 *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)],
1838 *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]);
1839
1840 goto bad_txn_handling;
1841 }
1842 }
1843
1844
1845 // add this bucket to co_buf, coalescing where possible
1846 // printf("jnl: replay_journal: adding block 0x%llx\n", number);
1847 ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full);
1848
1849 if (ret_val == -1) {
1850 LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: trouble adding block to co_buf\n");
1851 goto bad_replay;
1852 } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
1853 }
1854
1855 // increment offset
1856 offset += size;
1857
1858 // check if the last block added puts us off the end of the jnl.
1859 // if so, we need to wrap to the beginning and take any remainder
1860 // into account
1861 //
1862 if (offset >= jnl->jhdr->size) {
1863 offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
1864 }
1865 }
1866
1867 if (block_ptr) {
1868 hfs_free(block_ptr);
1869 block_ptr = NULL;
1870 }
1871
1872 if (bad_blocks) {
1873 bad_txn_handling:
1874 /* Journal replay got error before it found any valid
1875 * transations, abort replay */
1876 if (txn_start_offset == 0) {
1877 LFHFS_LOG(LEVEL_ERROR, "jnl: no known good txn start offset! aborting journal replay.\n");
1878 goto bad_replay;
1879 }
1880
1881 /* Repeated error during journal replay, abort replay */
1882 if (replay_retry_count == 3) {
1883 LFHFS_LOG(LEVEL_ERROR, "jnl: repeated errors replaying journal! aborting journal replay.\n");
1884 goto bad_replay;
1885 }
1886 replay_retry_count++;
1887
1888 /* There was an error replaying the journal (possibly
1889 * EIO/ENXIO from the device). So retry replaying all
1890 * the good transactions that we found before getting
1891 * the error.
1892 */
1893 jnl->jhdr->start = orig_jnl_start;
1894 jnl->jhdr->end = txn_start_offset;
1895 check_past_jnl_end = 0;
1896 last_sequence_num = 0;
1897 LFHFS_LOG(LEVEL_ERROR, "jnl: restarting journal replay (%lld - %lld)!\n", jnl->jhdr->start, jnl->jhdr->end);
1898 goto restart_replay;
1899 }
1900
1901 jnl->jhdr->start += blhdr->bytes_used;
1902 if (jnl->jhdr->start >= jnl->jhdr->size) {
1903 // wrap around and skip the journal header block
1904 jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
1905 }
1906
1907 if (jnl->jhdr->start == jnl->jhdr->end) {
1908 in_uncharted_territory = 1;
1909 }
1910 }
1911
1912 if (jnl->jhdr->start != jnl->jhdr->end) {
1913 LFHFS_LOG(LEVEL_ERROR, "jnl: start %lld != end %lld. resetting end.\n", jnl->jhdr->start, jnl->jhdr->end);
1914 jnl->jhdr->end = jnl->jhdr->start;
1915 }
1916
1917 //printf("jnl: replay_journal: replaying %d blocks\n", num_full);
1918
1919 /*
1920 * make sure it's at least one page in size, so
1921 * start max_bsize at PAGE_SIZE
1922 */
1923 for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {
1924
1925 if (co_buf[i].block_num == (off_t)-1)
1926 continue;
1927
1928 if (co_buf[i].block_size > max_bsize)
1929 max_bsize = co_buf[i].block_size;
1930 }
1931 /*
1932 * round max_bsize up to the nearest PAGE_SIZE multiple
1933 */
1934 if (max_bsize & (PAGE_SIZE - 1)) {
1935 max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1936 }
1937
1938 block_ptr = hfs_malloc(max_bsize);
1939
1940 // Replay the coalesced entries in the co-buf
1941 for(i = 0; i < num_full; i++) {
1942 size_t size = co_buf[i].block_size;
1943 off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
1944 off_t number = co_buf[i].block_num;
1945
1946
1947 // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
1948 // co_buf[i].block_size, co_buf[i].jnl_offset);
1949
1950 if (number == (off_t)-1) {
1951 // printf("jnl: replay_journal: skipping killed fs block\n");
1952 } else {
1953
1954 // do journal read, and set the phys. block
1955 ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
1956 if (ret != size) {
1957 LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl_offset);
1958 goto bad_replay;
1959 }
1960
1961 if (update_fs_block(jnl, block_ptr, number, size) != 0) {
1962 goto bad_replay;
1963 }
1964 }
1965 }
1966
1967
1968 // done replaying; update jnl header
1969 if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
1970 goto bad_replay;
1971 }
1972
1973 // free block_ptr
1974 if (block_ptr) {
1975 hfs_free(block_ptr);
1976 block_ptr = NULL;
1977 }
1978
1979 // free the coalesce buffer
1980 hfs_free(co_buf);
1981 co_buf = NULL;
1982
1983 hfs_free(buff);
1984
1985 success:
1986 LFHFS_LOG(LEVEL_DEFAULT, "replay_journal: success.\n");
1987 return 0;
1988
1989 bad_replay:
1990 hfs_free(block_ptr);
1991 hfs_free(co_buf);
1992 hfs_free(buff);
1993
1994 LFHFS_LOG(LEVEL_ERROR, "replay_journal: error.\n");
1995 return -1;
1996 }
1997
1998 // buffer_written:
// This function gets executed after a buffer has been written to its
2000 // final destination.
2001 // This function lets us know when a buffer has been
2002 // flushed to disk. Originally (kext), it was called from deep
2003 // within the driver stack and thus is quite limited in what it could do.
2004 // Notably, it could not initiate any new i/o's or allocate/free memory.
// buffer_written:
// Accounting hook invoked after one journaled buffer (bp) has been written
// to its final on-disk location (called from finish_end_transaction).
//
// It credits bp's bytes against the owning transaction tr; when the last
// buffer of the transaction completes, the thread that "wins" ownership
// (see the 0xfbadc0de marker below) additionally:
//   - clears the in-flight bit for this transaction in jnl->old_start[],
//   - advances jnl->active_start past the transaction,
//   - merges the transaction into jnl->completed_trs (coalescing adjacent
//     journal ranges) or inserts it sorted by journal_start,
//   - wakes anyone blocked on jnl->asyncIO.
//
// tr == NULL means the buffer is not part of a live transaction: no-op.
static void buffer_written(transaction *tr, GenericLFBuf *bp) {

    journal *jnl;
    transaction *ctr, *prev=NULL, *next;
    size_t i;
    size_t bufsize, amt_flushed, total_bytes;


    // snarf out the bits we want
    bufsize = bp->uDataSize;

    // then we've already seen it
    if (tr == NULL) {
        return;
    }

    CHECK_TRANSACTION(tr);

    jnl = tr->jnl;

    CHECK_JOURNAL(jnl);

    // bytes already accounted for: "killed" (never-written) blocks plus
    // buffers flushed before this one.
    amt_flushed = tr->num_killed;
    total_bytes = tr->total_bytes;

    // update the number of blocks that have been flushed.
    // this buf may represent more than one block so take
    // that into account.
    amt_flushed += tr->num_flushed;
    tr->num_flushed += bufsize;

    // if this transaction isn't done yet, just return as
    // there is nothing to do.
    //
    // NOTE: we are careful to not reference anything through
    // the tr pointer after doing the OSAddAtomic(). if
    // this if statement fails then we are the last one
    // and then it's ok to dereference "tr".
    // NOTE(review): unlike the kext original this port does no atomic op
    // here; presumably callers serialize these updates -- TODO confirm.
    //
    if ((amt_flushed + bufsize) < total_bytes) {
        return;
    }

    // this will single thread checking the transaction
    lock_oldstart(jnl);

    if (tr->total_bytes == (int)0xfbadc0de) {
        // then someone beat us to it...
        unlock_oldstart(jnl);
        return;
    }

    // mark this so that we're the owner of dealing with the
    // cleanup for this transaction
    tr->total_bytes = 0xfbadc0de;

    if (jnl->flags & JOURNAL_INVALID)
        goto transaction_done;

    //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
    //   tr, tr->journal_start, tr->journal_end, jnl);

    // find this entry in the old_start[] index and mark it completed
    // (the top bit of an old_start[] entry means "still in flight").
    for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {

        if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) {
            jnl->old_start[i] &= ~(0x8000000000000000ULL);
            break;
        }
    }

    if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
        panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
              tr->journal_start, tr, jnl);
    }


    // if we are here then we need to update the journal header
    // to reflect that this transaction is complete
    if (tr->journal_start == jnl->active_start) {
        jnl->active_start = tr->journal_end;
        tr->journal_start = tr->journal_end = (off_t)0;
    }

    // go through the completed_trs list and try to coalesce
    // entries, restarting back at the beginning if we have to.
    for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) {
        if (ctr->journal_start == jnl->active_start) {
            // ctr is now the oldest outstanding range: consume it and
            // advance active_start, then rescan from the head.
            jnl->active_start = ctr->journal_end;
            if (prev) {
                prev->next = ctr->next;
            }
            if (ctr == jnl->completed_trs) {
                jnl->completed_trs = ctr->next;
            }

            next = jnl->completed_trs; // this starts us over again
            ctr->next = jnl->tr_freeme;
            jnl->tr_freeme = ctr;
            ctr = NULL;

        } else if (tr->journal_end == ctr->journal_start) {
            // tr abuts ctr from below: extend ctr downward and retire tr.
            ctr->journal_start = tr->journal_start;
            next = jnl->completed_trs; // this starts us over again
            ctr = NULL;
            tr->journal_start = tr->journal_end = (off_t)0;

        } else if (tr->journal_start == ctr->journal_end) {
            // tr abuts ctr from above: extend ctr upward and retire tr.
            ctr->journal_end = tr->journal_end;
            next = ctr->next;
            tr->journal_start = tr->journal_end = (off_t)0;
        } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) {
            // coalesce the next entry with this one and link the next
            // entry in at the head of the tr_freeme list
            next = ctr->next; // temporarily use the "next" variable
            ctr->journal_end = next->journal_end;
            ctr->next = next->next;
            next->next = jnl->tr_freeme; // link in the next guy at the head of the tr_freeme list
            jnl->tr_freeme = next;

            next = jnl->completed_trs; // this starts us over again
            ctr = NULL;

        } else {
            next = ctr->next;
        }
    }

    // if this is true then we didn't merge with anyone
    // so link ourselves in at the head of the completed
    // transaction list.
    if (tr->journal_start != 0) {
        // put this entry into the correct sorted place
        // in the list instead of just at the head.

        prev = NULL;
        for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
            // just keep looping
        }

        if (ctr == NULL && prev == NULL) {
            jnl->completed_trs = tr;
            tr->next = NULL;

        } else if (ctr == jnl->completed_trs) {
            tr->next = jnl->completed_trs;
            jnl->completed_trs = tr;

        } else {
            tr->next = prev->next;
            prev->next = tr;
        }

    } else {
        // if we're here this tr got merged with someone else so
        // put it on the list to be free'd
        tr->next = jnl->tr_freeme;
        jnl->tr_freeme = tr;
    }
transaction_done:
    unlock_oldstart(jnl);

    unlock_condition(jnl, &jnl->asyncIO);
}
2169
2170 static size_t write_journal_data(journal *jnl, off_t *offset, void *data, size_t len) {
2171 return do_journal_io(jnl, offset, data, len, JNL_WRITE);
2172 }
2173
2174 static size_t read_journal_data(journal *jnl, off_t *offset, void *data, size_t len) {
2175 return do_journal_io(jnl, offset, data, len, JNL_READ);
2176 }
2177
2178
2179 // This function sets the size of the tbuffer and the
2180 // size of the blhdr. It assumes that jnl->jhdr->size
2181 // and jnl->jhdr->jhdr_size are already valid.
2182 static void size_up_tbuffer(journal *jnl, uint32_t tbuffer_size, uint32_t phys_blksz) {
2183 //
2184 // one-time initialization based on how much memory
2185 // there is in the machine.
2186 //
2187 if (def_tbuffer_size == 0) {
2188 uint64_t memsize = 0;
2189 size_t l = sizeof(memsize);
2190 sysctlbyname("hw.memsize", &memsize, &l, NULL, 0);
2191
2192 if (memsize < (256*1024*1024)) {
2193 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
2194 } else if (memsize < (512*1024*1024)) {
2195 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
2196 } else if (memsize < (1024*1024*1024)) {
2197 def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
2198 } else {
2199 def_tbuffer_size = (uint32_t)(DEFAULT_TRANSACTION_BUFFER_SIZE * (memsize / (256*1024*1024)));
2200 }
2201 }
2202
2203 // For analyzer
2204 if (!(jnl->jhdr->jhdr_size > 0)) {
2205 panic("jnl->jhdr->jhdr_size is %d", jnl->jhdr->jhdr_size);
2206 }
2207
2208 // size up the transaction buffer... can't be larger than the number
2209 // of blocks that can fit in a block_list_header block.
2210 if (tbuffer_size == 0) {
2211 jnl->tbuffer_size = def_tbuffer_size;
2212 } else {
2213 // make sure that the specified tbuffer_size isn't too small
2214 if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
2215 tbuffer_size = jnl->jhdr->blhdr_size * 2;
2216 }
2217 // and make sure it's an even multiple of the block size
2218 if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
2219 tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
2220 }
2221
2222 jnl->tbuffer_size = tbuffer_size;
2223 }
2224
2225 if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
2226 jnl->tbuffer_size = (uint32_t)(jnl->jhdr->size / 2);
2227 }
2228
2229 if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
2230 jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
2231 }
2232
2233 jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
2234 if (jnl->jhdr->blhdr_size < phys_blksz) {
2235 jnl->jhdr->blhdr_size = phys_blksz;
2236 } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
2237 // have to round up so we're an even multiple of the physical block size
2238 jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
2239 }
2240 }
2241
2242
2243 static int write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num) {
2244 static int num_err_prints = 0;
2245 int ret=0;
2246 off_t jhdr_offset = 0;
2247
2248 // Flush the track cache if we're not doing force-unit-access
2249 // writes.
2250 if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
2251
2252 dk_synchronize_t sync_request = {
2253 .options = DK_SYNCHRONIZE_OPTION_BARRIER,
2254 };
2255
2256 /*
2257 * If device doesn't support barrier-only flush, or
2258 * the journal is on a different device, use full flush.
2259 */
2260 if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) {
2261 sync_request.options = 0;
2262 jnl->flush_counter++;
2263 }
2264
2265 ret = ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request);
2266 }
2267 if (ret != 0) {
2268 //
2269 // Only print this error if it's a different error than the
2270 // previous one, or if it's the first time for this device
2271 // or if the total number of printfs is less than 25. We
2272 // allow for up to 25 printfs to insure that some make it
2273 // into the on-disk syslog. Otherwise if we only printed
2274 // one, it's possible it would never make it to the syslog
2275 // for the root volume and that makes debugging hard.
2276 //
2277 if ( ret != jnl->last_flush_err
2278 || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
2279 || num_err_prints++ < 25) {
2280
2281 LFHFS_LOG(LEVEL_ERROR, "jnl: flushing fs disk buffer returned 0x%x\n", ret);
2282
2283 jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
2284 jnl->last_flush_err = ret;
2285 }
2286 }
2287
2288 jnl->jhdr->sequence_num = sequence_num;
2289 jnl->jhdr->checksum = 0;
2290 jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
2291
2292 if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) {
2293 LFHFS_LOG(LEVEL_ERROR, "jnl: write_journal_header: error writing the journal header!\n");
2294 jnl->flags |= JOURNAL_INVALID;
2295 return -1;
2296 }
2297
2298 // If we're not doing force-unit-access writes, then we
2299 // have to flush after writing the journal header so that
2300 // a future transaction doesn't sneak out to disk before
2301 // the header does and thus overwrite data that the old
2302 // journal header refers to. Saw this exact case happen
2303 // on an IDE bus analyzer with Larry Barras so while it
2304 // may seem obscure, it's not.
2305 //
2306 if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
2307
2308 dk_synchronize_t sync_request = {
2309 .options = DK_SYNCHRONIZE_OPTION_BARRIER,
2310 };
2311
2312 /*
2313 * If device doesn't support barrier-only flush, or
2314 * the journal is on a different device, use full flush.
2315 */
2316 if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) {
2317 sync_request.options = 0;
2318 jnl->flush_counter++;
2319 }
2320
2321 ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request);
2322 }
2323 return 0;
2324 }
2325
2326 static int journal_binfo_cmp(const void *a, const void *b) {
2327
2328 const block_info *bi_a = (const struct block_info *)a;
2329 const block_info *bi_b = (const struct block_info *)b;
2330 daddr64_t res;
2331
2332 if (bi_a->bnum == (off_t)-1) {
2333 return 1;
2334 }
2335 if (bi_b->bnum == (off_t)-1) {
2336 return -1;
2337 }
2338
2339 // don't have to worry about negative block
2340 // numbers so this is ok to do.
2341 GenericLFBuf *psGenBufA, *psGenBufB;
2342 psGenBufA = (void*)bi_a->u.bp;
2343 psGenBufB = (void*)bi_b->u.bp;
2344 res = psGenBufA->uBlockN - psGenBufB->uBlockN;
2345
2346 return (int)res;
2347 }
2348
2349 // finish_end_transaction:
2350
// finish_end_transaction:
// Commit a fully built transaction to the on-disk journal, then write the
// journaled blocks to their final locations.
//
// Per block_list_header: stamp the sequence number, checksum the header and
// each block's payload, rewrite binfo[].bnum to the physical block number,
// and write the whole tbuffer into the journal.  Then write the journal
// header (making the transaction durable), run the optional caller
// callback (used by journal_relocate), and finally push each block to its
// real location, notifying buffer_written() for per-transaction accounting.
//
// Returns 0 on success; -1 on failure, in which case the transaction is
// aborted and the journal is marked JOURNAL_INVALID.  In both cases the
// 'flushing' condition is released before returning.
static int finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg) {
    int i;
    size_t amt;
    size_t ret = 0;
    off_t end;
    journal *jnl = tr->jnl;
    GenericLFBuf *bp = NULL, **bparray = NULL;
    block_list_header *blhdr=NULL, *next=NULL;
    size_t tbuffer_offset;
    int bufs_written = 0;
    int ret_val = 0;

    end = jnl->jhdr->end;

    // Pass 1: finalize and journal every block-list header in the chain
    // (binfo[0].bnum doubles as the "next blhdr" pointer).
    for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {

        amt = blhdr->bytes_used;

        blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num;

        // header checksum is computed with the checksum field zeroed
        blhdr->checksum = 0;
        blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);

        // NOTE(review): bparray stores GenericLFBuf* but is sized with
        // sizeof(buf_t) -- both are pointer-sized here, presumably; confirm.
        bparray = hfs_malloc(blhdr->num_blocks * sizeof(buf_t));
        tbuffer_offset = jnl->jhdr->blhdr_size;

        // for each block in the block-header,
        for (i = 1; i < blhdr->num_blocks; i++) {
            size_t bsize;

            /*
             * finish preparing the shadow buf_t before
             * calculating the individual block checksums
             */
            if (blhdr->binfo[i].bnum != (off_t)-1) {
                daddr64_t blkno;

                bp = (void*)blhdr->binfo[i].u.bp;
                blkno = bp->uPhyCluster;
                // update this so we write out the correct physical block number!
                blhdr->binfo[i].bnum = (off_t)(blkno);

                // stash the buf pointer; bnum/u.bi now carry on-disk info
                bparray[i] = bp;
                bsize = bp->uDataSize;
                blhdr->binfo[i].u.bi.bsize = (uint32_t)bsize;
                blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], (uint32_t)bsize);
            } else {
                // killed block: keep its recorded size, no checksum
                bparray[i] = NULL;
                bsize = blhdr->binfo[i].u.bi.bsize;
                blhdr->binfo[i].u.bi.b.cksum = 0;
            }
            tbuffer_offset += bsize;
        }

        /*
         * if we fired off the journal_write_header asynchronously in
         * 'end_transaction', we need to wait for its completion
         * before writing the actual journal data
         */
        wait_condition(jnl, &jnl->writing_header, "finish_end_transaction");

        if (jnl->write_header_failed == FALSE)
            ret = write_journal_data(jnl, &end, blhdr, amt);
        else
            ret_val = -1;

#if HFS_CRASH_TEST
        CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_JOURNAL_DATA, jnl->fsmount->psHfsmount, NULL);
#endif

        /*
         * put the bp pointers back so that we can
         * make the final pass on them
         */
        for (i = 1; i < blhdr->num_blocks; i++)
            blhdr->binfo[i].u.bp = (void*)bparray[i];

        hfs_free(bparray);

        if (ret_val == -1)
            goto bad_journal;

        if (ret != amt) {
            LFHFS_LOG(LEVEL_ERROR, "jnl: end_transaction: only wrote %zu of %zu bytes to the journal!\n",
                      ret, amt);

            ret_val = -1;
            goto bad_journal;
        }
    }
    jnl->jhdr->end = end; // update where the journal now ends
    tr->journal_end = end; // the transaction ends here too

    if (tr->journal_start == 0 || tr->journal_end == 0) {
        panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
              tr->journal_start, tr->journal_end);
    }

    // publish the new journal end (non-updating_start header write)
    if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) {
        ret_val = -1;
        goto bad_journal;
    }

#if HFS_CRASH_TEST
    CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_JOURNAL_HEADER, jnl->fsmount->psHfsmount, NULL);
#endif

    /*
     * If the caller supplied a callback, call it now that the blocks have been
     * written to the journal. This is used by journal_relocate so, for example,
     * the file system can change its pointer to the new journal.
     */
    if (callback != NULL && callback(callback_arg) != 0) {
        ret_val = -1;
        goto bad_journal;
    }

    // the buffer_flushed_callback will only be called for the
    // real blocks that get flushed so we have to account for
    // the block_list_headers here.
    //
    tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;

    lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction");

    //
    // Pass 2: write each journaled block to its final on-disk location.
    //
    for (blhdr = tr->blhdr; blhdr; blhdr = next) {
        uint16_t num_blocks;

        /*
         * grab this info ahead of issuing the buf_bawrites...
         * once the last one goes out, its possible for blhdr
         * to be freed (especially if we get preempted) before
         * we do the last check of num_blocks or
         * grab the next blhdr pointer...
         */
        next = (block_list_header *)((long)blhdr->binfo[0].bnum);
        num_blocks = blhdr->num_blocks;

        /*
         * we can re-order the buf ptrs because everything is written out already
         */
        qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp);

        /*
         * need to make sure that the loop issuing the buf_bawrite's
         * does not touch blhdr once the last buf_bawrite has been
         * issued... at that point, we no longer have a legitimate
         * reference on the associated storage since it will be
         * released upon the completion of that last buf_bawrite
         */
        // killed entries were sorted to the tail; trim them off
        for (i = num_blocks-1; i >= 1; i--) {
            if (blhdr->binfo[i].bnum != (off_t)-1)
                break;
            num_blocks--;
        }
        for (i = 1; i < num_blocks; i++) {

            if ((bp = (void*)blhdr->binfo[i].u.bp)) {

                // NOTE(review): this inner ret_val intentionally shadows the
                // outer one -- a block-write error is logged but does not
                // abort the transaction (the data is safe in the journal).
                errno_t ret_val = 0;

#if JOURNAL_DEBUG
                printf("journal write physical: bp %p, psVnode %p, uBlockN %llu, uPhyCluster %llu uLockCnt %u\n",
                       bp, bp->psVnode, bp->uBlockN, bp->uPhyCluster, bp->uLockCnt);
#endif

                lf_hfs_generic_buf_clear_cache_flag(bp, GEN_BUF_WRITE_LOCK);
                ret_val = lf_hfs_generic_buf_write(bp);

#if HFS_CRASH_TEST
                CRASH_ABORT(CRASH_ABORT_JOURNAL_IN_BLOCK_DATA, jnl->fsmount->psHfsmount, NULL);
#endif

                if (ret_val) {
                    LFHFS_LOG(LEVEL_ERROR, "jnl: raw_readwrite_write_mount inside finish_end_transaction returned %d.\n", ret_val);
                }

                // per-transaction completion accounting
                buffer_written(tr, bp);

                lf_hfs_generic_buf_unlock(bp);
                lf_hfs_generic_buf_release(bp);

                bufs_written++;
            }
        }
    }
#if HFS_CRASH_TEST
    CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_BLOCK_DATA, jnl->fsmount->psHfsmount, NULL);
#endif
    if (bufs_written == 0) {
        /*
         * since we didn't issue any buf_bawrite's, there is no
         * async trigger to cause the memory associated with this
         * transaction to be freed... so, move it to the garbage
         * list now
         */
        lock_oldstart(jnl);

        tr->next = jnl->tr_freeme;
        jnl->tr_freeme = tr;

        unlock_oldstart(jnl);

        unlock_condition(jnl, &jnl->asyncIO);
    }

    //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
    //   tr, tr->journal_start, tr->journal_end);

bad_journal:
    if (ret_val == -1) {
        abort_transaction(jnl, tr); // cleans up list of extents to be trimmed

        /*
         * 'flush_aborted' is protected by the flushing condition... we need to
         * set it before dropping the condition so that it will be
         * noticed in 'end_transaction'... we add this additional
         * aborted condition so that we can drop the 'flushing' condition
         * before grabbing the journal lock... this avoids a deadlock
         * in 'end_transaction' which is holding the journal lock while
         * waiting for the 'flushing' condition to clear...
         * everyone else will notice the JOURNAL_INVALID flag
         */
        jnl->flush_aborted = TRUE;

        unlock_condition(jnl, &jnl->flushing);
        journal_lock(jnl);

        jnl->flags |= JOURNAL_INVALID;
        jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;

        journal_unlock(jnl);
    } else
        unlock_condition(jnl, &jnl->flushing);

    return (ret_val);
}
2591 static off_t free_space(journal *jnl) {
2592 off_t free_space_offset;
2593
2594 if (jnl->jhdr->start < jnl->jhdr->end) {
2595 free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
2596 } else if (jnl->jhdr->start > jnl->jhdr->end) {
2597 free_space_offset = jnl->jhdr->start - jnl->jhdr->end;
2598 } else {
2599 // journal is completely empty
2600 free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size;
2601 }
2602
2603 return free_space_offset;
2604 }
2605
2606 static void dump_journal(journal *jnl) {
2607 transaction *ctr;
2608
2609 printf(" jdev_offset %.8llx\n", jnl->jdev_offset);
2610 printf(" magic: 0x%.8x\n", jnl->jhdr->magic);
2611 printf(" start: 0x%.8llx\n", jnl->jhdr->start);
2612 printf(" end: 0x%.8llx\n", jnl->jhdr->end);
2613 printf(" size: 0x%.8llx\n", jnl->jhdr->size);
2614 printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size);
2615 printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size);
2616 printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum);
2617
2618 printf(" completed transactions:\n");
2619 for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) {
2620 printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
2621 }
2622 }
2623
2624 // The journal must be locked on entry to this function.
2625 // The "desired_size" is in bytes.
// check_free_space:
// Block until the journal has room for a transaction of `desired_size`
// bytes AND old_start[0] is free, lazily retiring flushed transactions by
// bumping jnl->jhdr->start along the old_start[] array.
//
// The journal must be locked on entry.  The "desired_size" is in bytes.
// If delayed_header_write is non-NULL, header writes triggered here are
// deferred to the caller (*delayed_header_write set to TRUE) with
// `sequence_num` expected to be used for that later write; otherwise
// write_journal_header() is called inline.
//
// Returns 0 on success, ENOSPC if space could not be reclaimed after many
// retries.  (The ENOSPC path is only reachable if panic() returns in this
// environment -- NOTE(review): presumably it can in the livefiles port.)
static int check_free_space( journal *jnl,
                             int desired_size,
                             boolean_t *delayed_header_write,
                             uint32_t sequence_num) {

    size_t i;
    int counter=0;

    //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
    //   desired_size, free_space(jnl));

    if (delayed_header_write)
        *delayed_header_write = FALSE;

    while (1) {
        int old_start_empty;

        // make sure there's space in the journal to hold this transaction
        if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) {
            break;
        }
        // ~50 seconds of 10ms sleeps without progress: dump state and panic
        if (counter++ == 5000) {
            dump_journal(jnl);
            panic("jnl: check_free_space: buffer flushing isn't working "
                  "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl,
                  jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
        }
        if (counter > 7500) {
            return ENOSPC;
        }

        // here's where we lazily bump up jnl->jhdr->start. we'll consume
        // entries until there is enough space for the next transaction.
        old_start_empty = 1;
        lock_oldstart(jnl);

        for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
            int lcl_counter;

            lcl_counter = 0;
            // the top bit marks a transaction whose buffers are still in
            // flight; wait (dropping the lock) for buffer_written to clear it
            while (jnl->old_start[i] & 0x8000000000000000LL) {
                if (lcl_counter++ > 10000) {
                    panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
                          jnl->old_start[i], jnl);
                }

                unlock_oldstart(jnl);
                if (jnl->flush) {
                    jnl->flush(jnl->flush_arg);
                }
                usleep(10000);
                lock_oldstart(jnl);
            }

            if (jnl->old_start[i] == 0) {
                continue;
            }

            // retire this transaction: its start becomes the journal start
            old_start_empty = 0;
            jnl->jhdr->start = jnl->old_start[i];
            jnl->old_start[i] = 0;

            if (free_space(jnl) > desired_size) {

                if (delayed_header_write)
                    *delayed_header_write = TRUE;
                else {
                    // write_journal_header may block; drop the lock first
                    unlock_oldstart(jnl);
                    write_journal_header(jnl, 1, sequence_num);
                    lock_oldstart(jnl);
                }
                break;
            }
        }
        unlock_oldstart(jnl);

        // if we bumped the start, loop and try again
        if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
            continue;
        } else if (old_start_empty) {
            //
            // if there is nothing in old_start anymore then we can
            // bump the jhdr->start to be the same as active_start
            // since it is possible there was only one very large
            // transaction in the old_start array. if we didn't do
            // this then jhdr->start would never get updated and we
            // would wind up looping until we hit the panic at the
            // start of the loop.
            //
            jnl->jhdr->start = jnl->active_start;

            if (delayed_header_write)
                *delayed_header_write = TRUE;
            else
                write_journal_header(jnl, 1, sequence_num);
            continue;
        }


        // if the file system gave us a flush function, call it to so that
        // it can flush some blocks which hopefully will cause some transactions
        // to complete and thus free up space in the journal.
        if (jnl->flush) {
            jnl->flush(jnl->flush_arg);
        }

        // wait for a while to avoid being cpu-bound (this will
        // put us to sleep for 10 milliseconds)
        usleep(10000);
    }

    return 0;
}
2739
2740 static void lock_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name) {
2741
2742 lock_flush(jnl);
2743
2744 while (psCondFlag->uFlag) {
2745 pthread_cond_wait(&psCondFlag->sCond, &jnl->flock);
2746 }
2747
2748 psCondFlag->uFlag = TRUE;
2749 unlock_flush(jnl);
2750 }
2751
/*
 * Wait until a flag-based "condition" is released (uFlag becomes FALSE)
 * without claiming it.  Unlike lock_condition(), the flag is left clear
 * on return, so multiple waiters may all proceed.
 */
static void wait_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name) {

    // Unlocked fast-path check: a racy read is fine here because the
    // flag is re-tested below while holding the flush mutex.
    if (!psCondFlag->uFlag)
        return;

    lock_flush(jnl);

    while (psCondFlag->uFlag) {
        pthread_cond_wait(&psCondFlag->sCond, &jnl->flock);
    }

    unlock_flush(jnl);
}
2765
/*
 * Release a flag-based "condition" previously claimed by lock_condition()
 * and wake every thread blocked in lock_condition()/wait_condition().
 * The flag is cleared and the broadcast issued under the flush mutex so
 * no wakeup can be lost.
 */
static void unlock_condition(journal *jnl, ConditionalFlag_S *psCondFlag) {
    lock_flush(jnl);

    psCondFlag->uFlag = FALSE;
    pthread_cond_broadcast(&psCondFlag->sCond);

    unlock_flush(jnl);
}
2774
2775 /*
2776 * End a transaction:
2777 * 1) Determine if it is time to commit the transaction or not:
2778 * If the transaction is small enough, and we're not forcing
2779 * a write to disk, the "active" transaction becomes the "current" transaction,
2780 * and will be reused for the next transaction that is started (group commit).
2781 *
2782 * 2) Commit:
2783 * If the transaction gets written to disk (because force_it is true, or no
2784 * group commit, or the transaction is sufficiently full), the blocks get
2785 * written into the journal first, then they are written to their final location
2786 * asynchronously. When those async writes complete, the transaction can be freed
2787 * and removed from the journal.
2788 *
2789 * 3) Callback:
 * An optional callback can be supplied. If given, it is called after
 * the blocks have been written to the journal, but before the async writes
2792 * of those blocks to their normal on-disk locations. This is used by
2793 * journal_relocate so that the location of the journal can be changed and
2794 * flushed to disk before the blocks get written to their normal locations.
2795 * Note that the callback is only called if the transaction gets written to
2796 * the journal during this end_transaction call; you probably want to set the
2797 * force_it flag.
2798 *
2799 * 4) Free blocks' Generic Buff.
2800 *
2801 * Inputs:
2802 * tr Transaction to add to the journal
2803 * force_it If true, force this transaction to the on-disk journal immediately.
2804 * callback See description above. Pass NULL for no callback.
2805 * callback_arg Argument passed to callback routine.
2806 *
2807 * Result
2808 * 0 No errors
2809 * -1 An error occurred. The journal is marked invalid.
2810 */
static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock) {

    block_list_header *blhdr=NULL, *next=NULL;
    int i, ret_val = 0;
    journal *jnl = tr->jnl;
    GenericLFBuf *bp;
    size_t tbuffer_offset;

    // A held-over group-commit transaction must have been picked up (or
    // flushed) before another transaction can end.
    if (jnl->cur_tr) {
        panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
              jnl, jnl->cur_tr, tr);
    }

    // if there weren't any modified blocks in the transaction
    // just save off the transaction pointer and return.
    if (tr->total_bytes == (int)jnl->jhdr->blhdr_size) {
        jnl->cur_tr = tr;
        goto done;
    }

    // if our transaction buffer isn't very full, just hang
    // on to it and don't actually flush anything. this is
    // what is known as "group commit". we will flush the
    // transaction buffer if it's full or if we have more than
    // one of them so we don't start hogging too much memory.
    //
    // We also check the device supports UNMAP/TRIM, and if so,
    // the number of extents waiting to be trimmed. If it is
    // small enough, then keep accumulating more (so we can
    // reduce the overhead of trimming). If there was a prior
    // trim error, then we stop issuing trims for this
    // volume, so we can also coalesce transactions.
    //
    if ( force_it == 0
         && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
         && tr->num_blhdrs < 3
         && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))
         && (!(jnl->flags & JOURNAL_USE_UNMAP) || (tr->trim.extent_count < jnl_trim_flush_limit))) {

        jnl->cur_tr = tr;
        goto done;
    }

    // Claim the "flushing" condition: only one journal flush runs at a time.
    lock_condition(jnl, &jnl->flushing, "end_transaction");

    /*
     * if the previous 'finish_end_transaction' was being run
     * asynchronously, it could have encountered a condition
     * that caused it to mark the journal invalid... if that
     * occurred while we were waiting for it to finish, we
     * need to notice and abort the current transaction
     */
    if ((jnl->flags & JOURNAL_INVALID) || jnl->flush_aborted == TRUE) {
        unlock_condition(jnl, &jnl->flushing);

        abort_transaction(jnl, tr);
        ret_val = -1;
        goto done;
    }

    /*
     * Store a pointer to this transaction's trim list so that
     * future transactions can find it.
     *
     * Note: if there are no extents in the trim list, then don't
     * bother saving the pointer since nothing can add new extents
     * to the list (and other threads/transactions only care if
     * there is a trim pending).
     */
    lf_lck_rw_lock_exclusive(&jnl->trim_lock);
    if (jnl->async_trim != NULL)
        panic("jnl: end_transaction: async_trim already non-NULL!");
    if (tr->trim.extent_count > 0)
        jnl->async_trim = &tr->trim;
    lf_lck_rw_unlock_exclusive(&jnl->trim_lock);

    /*
     * snapshot the transaction sequence number while we are still behind
     * the journal lock since it will be bumped upon the start of the
     * next transaction group which may overlap the current journal flush...
     * we pass the snapshot into write_journal_header during the journal
     * flush so that it can write the correct version in the header...
     * because we hold the 'flushing' condition variable for the duration
     * of the journal flush, 'saved_sequence_num' remains stable
     */
    jnl->saved_sequence_num = jnl->sequence_num;

    /*
     * if we're here we're going to flush the transaction buffer to disk.
     * 'check_free_space' will not return until there is enough free
     * space for this transaction in the journal and jnl->old_start[0]
     * is available for use
     */
    check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num);

    // range check the end index
    if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
        panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
              jnl->jhdr->end, jnl->jhdr->size);
    }

    // this transaction starts where the current journal ends
    tr->journal_start = jnl->jhdr->end;

    lock_oldstart(jnl);
    /*
     * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy.
     * slide everyone else down and put our latest guy in the last
     * entry in the old_start array
     */
    memcpy(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void *), sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
    // High bit marks the entry as an in-flight (not yet flushed) transaction.
    jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;

    unlock_oldstart(jnl);

    // go over the blocks in the transaction.
    // for each block, call the fpCallback and copy the content into the journal buffer
    for (blhdr = tr->blhdr; blhdr; blhdr = next) {
        char *blkptr;
        size_t bsize;

        // Block payloads start immediately after the block-list header.
        tbuffer_offset = jnl->jhdr->blhdr_size;

        // binfo[0] is the header/link slot, so real blocks start at index 1.
        for (i = 1; i < blhdr->num_blocks; i++) {

            if (blhdr->binfo[i].bnum != (off_t)-1) {

                bp = (GenericLFBuf*)blhdr->binfo[i].u.bp;

                if (bp == NULL) {
                    panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
                          blhdr->binfo[i].bnum, jnl, tr);
                }

                bsize = bp->uDataSize;

                blkptr = (char *)&((char *)blhdr)[tbuffer_offset];

                int iRet;
                // NOTE(review): EAGAIN spins with no backoff here -- verify
                // lf_hfs_generic_buf_take_ownership makes progress in that case.
retry:
                iRet = lf_hfs_generic_buf_take_ownership(bp, NULL);
                if (iRet == EAGAIN) {
                    goto retry;
                } else if (iRet) {
                    LFHFS_LOG(LEVEL_ERROR, "jnl: end_transaction: lf_hfs_generic_buf_take_ownership returned %d.\n", iRet);
                    // NOTE(review): this error path returns with the
                    // jnl->flushing condition still claimed and with
                    // jnl->async_trim possibly still pointing at tr->trim;
                    // confirm against the finish/abort paths that this is
                    // intentional and not a leak of the flush condition.
                    ret_val = -1;
                    goto done;
                }

                if (!(bp->uCacheFlags & GEN_BUF_WRITE_LOCK)) {
                    panic("GEN_BUF_WRITE_LOCK should be set!");
                }

                // Call the buffer callback (one-shot: cleared after use)
                if (bp->pfFunc) {
                    bp->pfFunc(bp, bp->pvCallbackArgs);
                    bp->pfFunc = NULL;
                }

                // Buffers must be in on-disk (big-endian) form before hitting media.
                if (bp->uCacheFlags & GEN_BUF_LITTLE_ENDIAN) {
                    panic("We do not want to write a GEN_BUF_LITTLE_ENDIAN buffer to media!");
                }

                // copy the data into the transaction buffer...
                memcpy(blkptr, bp->pvData, bsize);

                blhdr->binfo[i].u.bp = (void*)bp;

            } else {
                // bnum == -1, only true if a block was "killed"
                bsize = blhdr->binfo[i].u.bi.bsize;
            }
            tbuffer_offset += bsize;
        }
        // binfo[0].bnum doubles as the link to the next block-list header.
        next = (block_list_header *)((long)blhdr->binfo[0].bnum);
    }

#if HFS_CRASH_TEST
    CRASH_ABORT(CRASH_ABORT_JOURNAL_BEFORE_FINISH, jnl->fsmount->psHfsmount, NULL);
#endif

    // Write the journal entry and kick off the async writes to the blocks'
    // final locations; may run the caller-supplied relocation callback.
    ret_val = finish_end_transaction(tr, callback, callback_arg);

done:
    if (drop_lock == TRUE) {
        journal_unlock(jnl);
    }
    return (ret_val);
}
3000
/*
 * Tear down a transaction that will never reach the journal: release every
 * buffer it holds, free its block-list headers and trim list, and finally
 * free the transaction itself.  Called when the journal has been marked
 * invalid or a flush was aborted.
 */
static void abort_transaction(journal *jnl, transaction *tr) {

    block_list_header *blhdr, *next;
    // for each block list header, iterate over the blocks then
    // free up the memory associated with the block list.
    for (blhdr = tr->blhdr; blhdr; blhdr = next) {
        int i;

        // binfo[0] is the header/link slot; real blocks start at index 1.
        for (i = 1; i < blhdr->num_blocks; i++) {
            GenericLFBufPtr bp;

            // Killed blocks (bnum == -1) carry no buffer to release.
            if (blhdr->binfo[i].bnum == (off_t)-1)
                continue;

            bp = (void*)blhdr->binfo[i].u.bp;

            // Release the buffers
            lf_hfs_generic_buf_clear_cache_flag(bp, GEN_BUF_WRITE_LOCK);
            if (lf_hfs_generic_buf_validate_owner(bp)) { // abort_transaction can be called before or after we take ownership
                lf_hfs_generic_buf_release(bp);
            }

        }
        // Follow the link to the next header before freeing this one.
        next = (block_list_header *)((long)blhdr->binfo[0].bnum);

        // we can free blhdr here since we won't need it any more
        blhdr->binfo[0].bnum = 0xdeadc0de;   // poison to catch use-after-free
        hfs_free(blhdr);
    }

    /*
     * If the transaction we're aborting was the async transaction, then
     * tell the current transaction that there is no pending trim
     * any more.
     */
    lf_lck_rw_lock_exclusive(&jnl->trim_lock);
    if (jnl->async_trim == &tr->trim)
        jnl->async_trim = NULL;
    lf_lck_rw_unlock_exclusive(&jnl->trim_lock);


    if (tr->trim.extents) {
        hfs_free(tr->trim.extents);
    }
    tr->trim.allocated_count = 0;
    tr->trim.extent_count = 0;
    tr->trim.extents = NULL;
    tr->tbuffer = NULL;
    tr->blhdr = NULL;
    tr->total_bytes = 0xdbadc0de;   // poison value to flag a freed transaction
    hfs_free(tr);
}
3053
/*
 * Byte-swap every field of the in-memory journal header in place.
 * Used when the on-disk journal was written by a machine of the opposite
 * endianness (detected via the swapped magic in journal_is_clean / open).
 */
static void swap_journal_header(journal *jnl) {
    jnl->jhdr->magic = SWAP32(jnl->jhdr->magic);
    jnl->jhdr->endian = SWAP32(jnl->jhdr->endian);
    jnl->jhdr->start = SWAP64(jnl->jhdr->start);
    jnl->jhdr->end = SWAP64(jnl->jhdr->end);
    jnl->jhdr->size = SWAP64(jnl->jhdr->size);
    jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
    jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum);
    jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size);
    jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num);
}
3065
3066 // this isn't a great checksum routine but it will do for now.
3067 // we use it to checksum the journal header and the block list
3068 // headers that are at the start of each transaction.
// Simple rolling checksum over `len` bytes at `ptr`.  Each byte both
// shifts and perturbs the accumulator; the final value is bitwise
// inverted so an all-zero buffer does not checksum to zero.  Used for
// the journal header and the per-transaction block-list headers --
// it only needs to catch torn/garbage writes, not be cryptographic.
static unsigned int calc_checksum(const char *ptr, int len) {
    unsigned int cksum = 0;
    const unsigned char *p = (const unsigned char *)ptr;
    const unsigned char *end = p + len;

    while (p < end) {
        cksum = (cksum << 8) ^ (cksum + *p++);
    }

    return (~cksum);
}
3080
3081
/*
 * Perform a read or write of `len` bytes against the circular journal
 * region, splitting the request at the wrap point and at the device's
 * max transfer size.  *offset is a byte offset within the journal and is
 * advanced past the bytes transferred (wrapping back to just after the
 * journal header, i.e. jhdr_size, when the end is reached).
 *
 * direction is a bitmask of JNL_READ / JNL_WRITE, plus JNL_HEADER when
 * the caller is intentionally touching the header block at offset 0.
 * Returns the number of bytes transferred (== len on the normal path).
 *
 * NOTE(review): raw_readwrite_{read,write}_mount return values are not
 * checked here, so I/O errors are silently ignored -- verify whether the
 * callers rely on higher-level detection.
 */
static size_t do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction) {
    off_t curlen = len;
    size_t io_sz = 0;
    off_t max_iosize;
#if 0 // TBD
    int err;
    buf_t bp;
    off_t accumulated_offset = 0;
    ExtendedVCB *vcb = HFSTOVCB(jnl->fsmount->psHfsmount);
#endif

    if (*offset < 0 || *offset > jnl->jhdr->size) {
        panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
    }

    // Cap each chunk at the device limit for the requested direction.
    if (direction & JNL_WRITE)
        max_iosize = jnl->max_write_size;
    else if (direction & JNL_READ)
        max_iosize = jnl->max_read_size;
    else
        max_iosize = 128 * 1024;

again:

    // Determine the Current R/W Length, taking cyclic wrap around into account
    if (*offset + curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
        if (*offset == jnl->jhdr->size) {
            // exactly at the end: wrap to just past the journal header
            *offset = jnl->jhdr->jhdr_size;
        } else {
            // clip this chunk at the end; the remainder goes in the next pass
            curlen = jnl->jhdr->size - *offset;
        }
    }

    if (curlen > max_iosize) {
        curlen = max_iosize;
    }

    if (curlen <= 0) {
        panic("jnl: do_jnl_io: curlen == %lld, offset 0x%llx len %zd\n", curlen, *offset, len);
    }

    // Offset 0 is the journal header; only header I/O may touch it.
    if (*offset == 0 && (direction & JNL_HEADER) == 0) {
        panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %lld, data %p)\n", curlen, data);
    }


    // Perform the I/O
    uint64_t phyblksize = jnl->fsmount->psHfsmount->hfs_physical_block_size;
    uint64_t uBlkNum = jnl->jdev_blknum+(*offset)/phyblksize;

    if (direction & JNL_READ) {
        raw_readwrite_read_mount(jnl->jdev, uBlkNum, phyblksize, data, curlen, NULL, NULL);

    } else if (direction & JNL_WRITE) {
        raw_readwrite_write_mount(jnl->jdev, uBlkNum, phyblksize, data, curlen, NULL, NULL);
    }

    // Move to the next section
    *offset += curlen;
    io_sz += curlen;

    if (io_sz != len) {
        // handle wrap-around
        data = (char *)data + curlen;
        curlen = len - io_sz;
        if (*offset >= jnl->jhdr->size) {
            *offset = jnl->jhdr->jhdr_size;
        }
        goto again;
    }

    return io_sz;
}
3155
3156 static size_t read_journal_header(journal *jnl, void *data, size_t len) {
3157 off_t hdr_offset = 0;
3158
3159 return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER);
3160 }
3161
3162 static void get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl) {
3163 off_t readblockcnt;
3164 off_t writeblockcnt;
3165 off_t readmaxcnt=0, tmp_readmaxcnt;
3166 off_t writemaxcnt=0, tmp_writemaxcnt;
3167 off_t readsegcnt, writesegcnt;
3168
3169 // First check the max read size via several different mechanisms...
3170 ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt);
3171
3172 if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt) == 0) {
3173 tmp_readmaxcnt = readblockcnt * phys_blksz;
3174 if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) {
3175 readmaxcnt = tmp_readmaxcnt;
3176 }
3177 }
3178
3179 if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt)) {
3180 readsegcnt = 0;
3181 }
3182
3183 if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) {
3184 readmaxcnt = readsegcnt * PAGE_SIZE;
3185 }
3186
3187 if (readmaxcnt == 0) {
3188 readmaxcnt = 128 * 1024;
3189 } else if (readmaxcnt > UINT32_MAX) {
3190 readmaxcnt = UINT32_MAX;
3191 }
3192
3193
3194 // Now check the max writes size via several different mechanisms...
3195 ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt);
3196
3197 if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt) == 0) {
3198 tmp_writemaxcnt = writeblockcnt * phys_blksz;
3199 if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) {
3200 writemaxcnt = tmp_writemaxcnt;
3201 }
3202 }
3203
3204 if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt)) {
3205 writesegcnt = 0;
3206 }
3207
3208 if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) {
3209 writemaxcnt = writesegcnt * PAGE_SIZE;
3210 }
3211
3212 if (writemaxcnt == 0) {
3213 writemaxcnt = 128 * 1024;
3214 } else if (writemaxcnt > UINT32_MAX) {
3215 writemaxcnt = UINT32_MAX;
3216 }
3217
3218 jnl->max_read_size = readmaxcnt;
3219 jnl->max_write_size = writemaxcnt;
3220 }
3221
3222 // this is a work function used to free up transactions that
3223 // completed. they can't be free'd from buffer_flushed_callback
3224 // because it is called from deep with the disk driver stack
3225 // and thus can't do something that would potentially cause
3226 // paging. it gets called by each of the journal api entry
3227 // points so stuff shouldn't hang around for too long.
3228 static void free_old_stuff(journal *jnl) {
3229 transaction *tr, *next;
3230 block_list_header *blhdr=NULL, *next_blhdr=NULL;
3231
3232 if (jnl->tr_freeme == NULL)
3233 return;
3234
3235 lock_oldstart(jnl);
3236 tr = jnl->tr_freeme;
3237 jnl->tr_freeme = NULL;
3238 unlock_oldstart(jnl);
3239
3240 for(; tr; tr=next) {
3241 for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) {
3242 next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum);
3243 blhdr->binfo[0].bnum = 0xdeadc0de;
3244
3245 hfs_free(blhdr);
3246
3247 KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0);
3248 }
3249 next = tr->next;
3250 hfs_free(tr);
3251 }
3252 }
3253
3254 // Allocate a new active transaction.
3255 // The function does the following:
3256 // 1) mallocs memory for a transaction structure and a buffer
3257 // 2) initializes the transaction structure and the buffer (invalid CRC + 0x5a)
// Allocate a new active transaction.
// The function does the following:
// 1) mallocs memory for a transaction structure and a buffer
// 2) initializes the transaction structure and the buffer (invalid CRC + 0x5a)
// On success the new transaction is installed as jnl->active_tr and its
// sequence number is one past the journal's previous sequence number.
static errno_t journal_allocate_transaction(journal *jnl) {
    transaction *tr;

    tr = hfs_mallocz(sizeof(transaction));

    tr->tbuffer_size = jnl->tbuffer_size;

    tr->tbuffer = hfs_malloc(tr->tbuffer_size);

    // journal replay code checksum check depends on this.
    memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
    // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility)
    memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE);

    // The first block-list header lives at the start of the buffer.
    tr->blhdr = (block_list_header *)tr->tbuffer;
    tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
    tr->blhdr->num_blocks = 1; // accounts for this header block
    tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
    tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER;

    tr->sequence_num = ++jnl->sequence_num;
    tr->num_blhdrs = 1;
    tr->total_bytes = jnl->jhdr->blhdr_size;  // header only; no data blocks yet
    tr->jnl = jnl;

    jnl->active_tr = tr;

    return 0;
}
3287
/*
 * Remove a modified block from the current transaction ("kill" it) so it
 * will never be written to the journal or to disk, and release the buffer.
 * Must be called by the thread that owns the active transaction.  The
 * buffer must carry GEN_BUF_WRITE_LOCK (i.e. have gone through
 * journal_modify_block_start).  Always returns 0.
 */
int journal_kill_block(journal *jnl, GenericLFBuf *psGenBuf) {
    int i;
    uint64_t uflags;
    block_list_header *blhdr;
    transaction *tr;

#if JOURNAL_DEBUG
    printf("journal_kill_block: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n",
           psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize ,psGenBuf->uPhyCluster, psGenBuf->uLockCnt);
#endif

    CHECK_JOURNAL(jnl);
    free_old_stuff(jnl);

    // Journal already dead: just unlock and drop the buffer.
    if (jnl->flags & JOURNAL_INVALID) {
        lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
        lf_hfs_generic_buf_release(psGenBuf);
        return 0;
    }

    tr = jnl->active_tr;
    CHECK_TRANSACTION(tr);

    if (jnl->owner != pthread_self()) {
        panic("jnl: journal_kill_block: called w/out a transaction! jnl %p, owner %p, curact %p\n",
              jnl, jnl->owner, pthread_self());
    }

    uflags = psGenBuf->uCacheFlags;

    if ( !(uflags & GEN_BUF_WRITE_LOCK))
        panic("jnl: journal_kill_block: called with bp not B_LOCKED");

    /*
     * bp must be BL_BUSY and B_LOCKED
     * first check if it's already part of this transaction
     */
    for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {

        for (i = 1; i < blhdr->num_blocks; i++) {
            if (psGenBuf == (void*)blhdr->binfo[i].u.bp) {

                // if the block has the DELWRI and FILTER bits sets, then
                // things are seriously weird. if it was part of another
                // transaction then journal_modify_block_start() should
                // have force it to be written.
                //
                //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
                //    panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
                //} else {
                tr->num_killed += psGenBuf->uDataSize;
                //}
                // Mark the slot dead: bnum = -1 with only the size retained,
                // so end_transaction can still walk the tbuffer layout.
                blhdr->binfo[i].bnum = (off_t)-1;
                blhdr->binfo[i].u.bp = NULL;
                blhdr->binfo[i].u.bi.bsize = psGenBuf->uDataSize;

                lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
                lf_hfs_generic_buf_release(psGenBuf);

                return 0;
            }
        }
    }

    /*
     * We did not find the block in any transaction buffer but we still
     * need to release it or else it will be left locked forever.
     */
    lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
    lf_hfs_generic_buf_release(psGenBuf);

    return 0;
}
3361
/*
 * Decide whether the on-disk journal at (jvp, offset, journal_size) is
 * clean without replaying it.  Validates the geometry, reads the journal
 * header (handling opposite-endian journals), checks magic and checksum,
 * and finally compares start/end.
 *
 * Returns:
 *   0      journal is clean (start == end)
 *   EBUSY  journal is valid but has pending transactions
 *   EINVAL geometry, magic, or checksum is bad
 */
int journal_is_clean(struct vnode *jvp,
                     off_t offset,
                     off_t journal_size,
                     struct vnode *fsvp,
                     size_t min_fs_block_size,
                     struct mount *fsmount) {

    journal jnl;           // throwaway stack journal, just enough for header I/O
    uint32_t phys_blksz;
    int ret;
    int orig_checksum, checksum;

    /* Get the real physical block size. */
    if (ioctl(jvp->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: failed to get device block size.\n");
        ret = EINVAL;
        goto cleanup_jdev_name;
    }

    if (phys_blksz > (uint32_t)min_fs_block_size) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
                  phys_blksz, min_fs_block_size);
        ret = EINVAL;
        goto cleanup_jdev_name;
    }

    if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal size %lld looks bogus.\n", journal_size);
        ret = EINVAL;
        goto cleanup_jdev_name;
    }

    if ((journal_size % phys_blksz) != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
                  journal_size, phys_blksz);
        ret = EINVAL;
        goto cleanup_jdev_name;
    }

    memset(&jnl, 0, sizeof(jnl));

    jnl.header_buf = hfs_malloc(phys_blksz);
    jnl.header_buf_size = phys_blksz;

    // Keep a pointer to the mount around for use in IO throttling.
    jnl.fsmount = fsmount;

    get_io_info(jvp, phys_blksz, &jnl);

    jnl.jhdr = (journal_header *)jnl.header_buf;
    memset(jnl.jhdr, 0, sizeof(journal_header));

    jnl.jdev = jvp;
    jnl.jdev_offset = offset;
    jnl.jdev_blknum = (uint32_t)(offset / phys_blksz);
    jnl.fsdev = fsvp;

    // we have to set this up here so that do_journal_io() will work
    jnl.jhdr->jhdr_size = phys_blksz;

    if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: could not read %d bytes for the journal header.\n",
                  phys_blksz);
        ret = EINVAL;
        goto get_out;
    }

    // Checksum is computed with the checksum field zeroed.
    orig_checksum = jnl.jhdr->checksum;
    jnl.jhdr->checksum = 0;

    if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
        // do this before the swap since it's done byte-at-a-time
        orig_checksum = SWAP32(orig_checksum);
        checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
        swap_journal_header(&jnl);
        jnl.flags |= JOURNAL_NEED_SWAP;
    } else {
        checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
    }

    if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal magic is bad (0x%x != 0x%x)\n",
                  jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
        ret = EINVAL;
        goto get_out;
    }

    if (orig_checksum != checksum) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal checksum is bad (0x%x != 0x%x)\n", orig_checksum, checksum);
        ret = EINVAL;
        goto get_out;
    }

    //
    // if the start and end are equal then the journal is clean.
    // otherwise it's not clean and therefore an error.
    //
    if (jnl.jhdr->start == jnl.jhdr->end) {
        ret = 0;
    } else {
        ret = EBUSY;    // so the caller can differentiate an invalid journal from a "busy" one
    }

get_out:
    hfs_free(jnl.header_buf);
cleanup_jdev_name:
    return ret;
}
3470
3471 uint32_t journal_current_txn(journal *jnl) {
3472 return jnl->sequence_num + (jnl->active_tr || jnl->cur_tr ? 0 : 1);
3473 }
3474