]>
Commit | Line | Data |
---|---|---|
de8ee011 A |
1 | /* |
2 | * Copyright (c) 2002-2015 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | // | |
29 | // This file implements a simple write-ahead journaling layer. | |
30 | // In theory any file system can make use of it by calling these | |
31 | // functions when the fs wants to modify meta-data blocks. See | |
32 | // hfs_journal.h for a more detailed description of the api and | |
33 | // data structures. | |
34 | // | |
35 | // Dominic Giampaolo (dbg@apple.com) | |
36 | // Port to Live-Files: Oded Shoshani (oshoshani@apple.com) | |
37 | // | |
38 | ||
39 | #include <stdio.h> | |
40 | #include <stdlib.h> | |
41 | #include <string.h> | |
42 | #include <limits.h> | |
43 | #include <errno.h> | |
44 | #include <fcntl.h> | |
45 | #include <unistd.h> | |
46 | #include <stdarg.h> | |
47 | #include <assert.h> | |
48 | #include <sys/sysctl.h> | |
49 | #include <sys/types.h> | |
50 | #include <mach/mach.h> | |
51 | #include <sys/disk.h> | |
52 | #include <sys/kdebug.h> | |
53 | #include "lf_hfs_locks.h" | |
54 | #include "lf_hfs_journal.h" | |
55 | #include "lf_hfs_vfsutils.h" | |
56 | #include "lf_hfs_raw_read_write.h" | |
57 | #include "lf_hfs_generic_buf.h" | |
58 | #include "lf_hfs_logger.h" | |
59 | #include "lf_hfs_vfsops.h" | |
60 | ||
61 | // ************************** Function Definitions *********************** | |
62 | // number of bytes to checksum in a block_list_header | |
63 | // NOTE: this should be enough to clear out the header | |
64 | // fields as well as the first entry of binfo[] | |
65 | ||
// Validate the basic invariants of a journal: non-NULL journal, journal
// device, and fs device pointers; a good header magic; and start/end
// offsets that lie within (0, jhdr->size].  Each failure is printf()'d
// (so the diagnostic reaches the console even if panic output is lost)
// and then panic()'d.  Wrapped in do/while(0) so the macro behaves as a
// single statement under if/else.
#define CHECK_JOURNAL(jnl) \
    do { \
    if (jnl == NULL) { \
        printf("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \
        panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \
    } \
    if (jnl->jdev == NULL) { \
        printf("%s:%d: jdev is null!\n", __FILE__, __LINE__); \
        panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \
    } \
    if (jnl->fsdev == NULL) { \
        printf("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \
        panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \
    } \
    if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) { \
        printf("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \
               __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \
        panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \
              __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \
    } \
    /* start/end are byte offsets into the journal; offset 0 holds the */ \
    /* header itself, so a value of 0 (or beyond size) is corruption.  */ \
    if (jnl->jhdr->start <= 0 || jnl->jhdr->start > jnl->jhdr->size) { \
        printf("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
               __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \
        panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
              __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \
    } \
    if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) { \
        printf("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
               __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \
        panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
              __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \
    } \
    } while(0)
99 | ||
// Validate the basic invariants of a transaction: non-NULL transaction
// and back-pointer to its journal; blhdr must alias the start of the
// transaction buffer; byte counts and journal offsets must be
// non-negative; and the block-list header's max_blocks must fit within
// the journal.  As with CHECK_JOURNAL, each failure is printf()'d and
// then panic()'d, and the whole macro is a single do/while(0) statement.
#define CHECK_TRANSACTION(tr) \
    do { \
    if (tr == NULL) { \
        printf("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \
        panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \
    } \
    if (tr->jnl == NULL) { \
        printf("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \
        panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \
    } \
    /* the block_list_header always lives at the head of tbuffer */ \
    if (tr->blhdr != (block_list_header *)tr->tbuffer) { \
        printf("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \
        panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \
    } \
    if (tr->total_bytes < 0) { \
        printf("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \
        panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \
    } \
    if (tr->journal_start < 0) { \
        printf("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \
        panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \
    } \
    if (tr->journal_end < 0) { \
        printf("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \
        panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \
    } \
    /* max_blocks is bounded by how many jhdr_size blocks fit in the journal */ \
    if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) { \
        printf("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \
        panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \
    } \
    } while(0)
131 | ||
132 | #define SWAP16(x) OSSwapInt16(x) | |
133 | #define SWAP32(x) OSSwapInt32(x) | |
134 | #define SWAP64(x) OSSwapInt64(x) | |
135 | ||
136 | #define JNL_WRITE 0x0001 | |
137 | #define JNL_READ 0x0002 | |
138 | #define JNL_HEADER 0x8000 | |
139 | ||
140 | #define BLHDR_CHECKSUM_SIZE 32 | |
141 | #define MAX_JOURNAL_SIZE 0x80000000U | |
142 | ||
143 | #define STARTING_BUCKETS 256 | |
// One entry of the in-memory block table built while scanning the journal
// (presumably by replay_journal() — the table is grown from
// STARTING_BUCKETS entries; confirm against the replay code).
typedef struct bucket {
    off_t block_num;      // destination block number on the fs device
    uint32_t jnl_offset;  // byte offset of this block's payload inside the journal
    uint32_t block_size;  // payload size in bytes
    int32_t cksum;        // checksum of the payload
} bucket;
150 | ||
151 | static int replay_journal(journal *jnl); | |
152 | static void free_old_stuff(journal *jnl); | |
153 | static errno_t journal_allocate_transaction(journal *jnl); | |
154 | static void get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl); | |
155 | static size_t read_journal_header(journal *jnl, void *data, size_t len); | |
156 | static size_t do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction); | |
157 | static unsigned int calc_checksum(const char *ptr, int len); | |
158 | static void swap_journal_header(journal *jnl); | |
159 | static int end_transaction(transaction *tr, | |
160 | int force_it, | |
161 | errno_t (*callback)(void*), | |
162 | void *callback_arg, | |
163 | boolean_t drop_lock); | |
164 | static void abort_transaction(journal *jnl, transaction *tr); | |
165 | static void size_up_tbuffer(journal *jnl, uint32_t tbuffer_size, uint32_t phys_blksz); | |
166 | static void lock_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name); | |
167 | static void wait_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name); | |
168 | static void unlock_condition(journal *jnl, ConditionalFlag_S *psCondFlag); | |
169 | static int write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num); | |
170 | static size_t read_journal_data(journal *jnl, off_t *offset, void *data, size_t len); | |
171 | static size_t write_journal_data(journal *jnl, off_t *offset, void *data, size_t len); | |
172 | ||
173 | ||
// Serialize access to the journal's "old start" bookkeeping (see
// free_old_stuff()) via jnl->old_start_lock.
static __inline__ void lock_oldstart(journal *jnl) {
    lf_lck_mtx_lock(&jnl->old_start_lock);
}

static __inline__ void unlock_oldstart(journal *jnl) {
    lf_lck_mtx_unlock(&jnl->old_start_lock);
}
181 | ||
// Acquire the journal's main mutex and record the calling thread as the
// owner.  Panics if an owner is already recorded while we hold the lock,
// since that would mean the owner field got out of sync with jlock.
__inline__ void journal_lock(journal *jnl) {
    lf_lck_mtx_lock(&jnl->jlock);
    if (jnl->owner) {
        panic ("jnl: owner is %p, expected NULL\n", jnl->owner);
    }
    jnl->owner = pthread_self();
}

// Clear the owner *before* dropping jlock — the reverse order would let
// another thread acquire the lock and see a stale owner.
__inline__ void journal_unlock(journal *jnl) {
    jnl->owner = NULL;
    lf_lck_mtx_unlock(&jnl->jlock);
}
194 | ||
// Serialize journal flushing via jnl->flock.
static __inline__ void lock_flush(journal *jnl) {
    lf_lck_mtx_lock(&jnl->flock);
}

static __inline__ void unlock_flush(journal *jnl) {
    lf_lck_mtx_unlock(&jnl->flock);
}
202 | ||
203 | // ************************** Global Variables *********************** | |
204 | // Journal Locking | |
205 | lck_grp_attr_t *jnl_group_attr = NULL; | |
206 | lck_attr_t *jnl_lock_attr = NULL; | |
207 | lck_grp_t *jnl_mutex_group = NULL; | |
208 | ||
209 | // By default, we grow the list of extents to trim by 4K at a time. | |
210 | // We'll opt to flush a transaction if it contains at least | |
211 | // JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number | |
212 | // of modified blocks is small). | |
213 | enum { | |
214 | JOURNAL_DEFAULT_TRIM_BYTES = 4096, | |
215 | JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t), | |
216 | JOURNAL_FLUSH_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16 | |
217 | }; | |
218 | ||
219 | unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS; | |
220 | ||
221 | // tbuffer | |
222 | #define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024) | |
223 | #define MAX_TRANSACTION_BUFFER_SIZE (3072*1024) | |
224 | uint32_t def_tbuffer_size = 0; // XXXdbg - so I can change it in the debugger | |
225 | ||
226 | // ************************** Global Functions *********************** | |
227 | void journal_init(void) { | |
228 | ||
229 | jnl_lock_attr = lf_lck_attr_alloc_init(); | |
230 | jnl_group_attr = lf_lck_grp_attr_alloc_init(); | |
231 | jnl_mutex_group = lf_lck_grp_alloc_init(); | |
232 | } | |
233 | ||
/*
 * Open (and, if necessary, replay) an existing journal located at `offset`
 * on the journal device `jvp`, backing the file system on `fsvp`.
 *
 * The header is read, byte-swapped if it was written on the other
 * endianness, and validated (magic, checksum, start/end/size bounds).
 * Unless JOURNAL_RESET was requested, any pending transactions are
 * replayed before the journal is handed back.
 *
 * Returns the initialized journal, or NULL on any failure (all partially
 * allocated state is freed on the error paths).
 */
journal *journal_open(struct vnode *jvp,
                      off_t offset,
                      off_t journal_size,
                      struct vnode *fsvp,
                      size_t min_fs_blksz,
                      int32_t flags,
                      int32_t tbuffer_size,
                      void (*flush)(void *arg),
                      void *arg,
                      struct mount *fsmount) {
    journal *jnl;
    uint32_t orig_blksz=0;  // nonzero iff the on-disk jhdr_size disagrees with phys_blksz
    uint32_t phys_blksz;
    u_int32_t min_size = 0;
    int orig_checksum, checksum;

    /* Get the real physical block size. */
    if (ioctl(jvp->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
        goto cleanup_jdev_name;
    }

    if (phys_blksz > min_fs_blksz) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: error: phys blksize %u bigger than min fs blksize %zd\n",
                  phys_blksz, min_fs_blksz);
        goto cleanup_jdev_name;
    }

    // sanity bounds: 256KB minimum, 1GB maximum
    if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal size %lld looks bogus.\n", journal_size);
        goto cleanup_jdev_name;
    }

    min_size = phys_blksz * (phys_blksz / sizeof(block_info));
    /* Reject journals that are too small given the sector size of the device */
    if (journal_size < min_size) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal size (%lld) too small given sector size of (%u)\n",
                  journal_size, phys_blksz);
        goto cleanup_jdev_name;
    }

    if ((journal_size % phys_blksz) != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
                  journal_size, phys_blksz);
        goto cleanup_jdev_name;
    }

    jnl = hfs_mallocz(sizeof(struct journal));

    jnl->jdev = jvp;
    jnl->jdev_offset = offset;
    jnl->jdev_blknum = (uint32_t)(offset / min_fs_blksz);
    jnl->fsdev = fsvp;
    jnl->flush = flush;
    jnl->flush_arg = arg;
    jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
    lf_lck_mtx_init(&jnl->old_start_lock);
    lf_cond_init(&jnl->flushing.sCond);
    lf_cond_init(&jnl->asyncIO.sCond);
    lf_cond_init(&jnl->writing_header.sCond);

    /* We hold the mount to later pass to the throttling code for IO
     * accounting.
     */
    jnl->fsmount = fsmount;

    get_io_info(jvp, phys_blksz, jnl);

    jnl->header_buf = hfs_malloc(phys_blksz);
    jnl->header_buf_size = phys_blksz;

    jnl->jhdr = (journal_header *)jnl->header_buf;
    memset(jnl->jhdr, 0, sizeof(journal_header));

    // we have to set this up here so that do_journal_io() will work
    jnl->jhdr->jhdr_size = phys_blksz;

    if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: could not read %u bytes for the journal header.\n",
                  phys_blksz);
        goto bad_journal;
    }

    /*
     * Check for a bad jhdr size after reading in the journal header.
     * The journal header length cannot be zero
     */
    if (jnl->jhdr->jhdr_size == 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: bad jhdr size (%d) \n", jnl->jhdr->jhdr_size);
        goto bad_journal;
    }

    // the checksum field must be zeroed before recomputing the checksum
    // over the header, so stash the on-disk value first
    orig_checksum = jnl->jhdr->checksum;
    jnl->jhdr->checksum = 0;

    if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {

        // header was written with the opposite endianness:
        // do this before the swap since it's done byte-at-a-time
        orig_checksum = SWAP32(orig_checksum);
        checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
        swap_journal_header(jnl);
        jnl->flags |= JOURNAL_NEED_SWAP;

    } else {

        checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);
    }

    if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal magic is bad (0x%x != 0x%x)\n",
                  jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
        goto bad_journal;
    }

    // only check if we're the current journal header magic value
    if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {

        if (orig_checksum != checksum) {
            LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal checksum is bad (0x%x != 0x%x)\n",
                      orig_checksum, checksum);

            // NOTE(review): a checksum mismatch is deliberately only logged,
            // not treated as fatal — the goto below is intentionally
            // commented out.
            //goto bad_journal;
        }
    }

    // XXXdbg - convert old style magic numbers to the new one
    if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
        jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
    }

    if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
        /*
         * The volume has probably been resized (such that we had to adjust the
         * logical sector size), or copied to media with a different logical
         * sector size.
         *
         * For us, though, no big deal because we are giving byte offsets to
         * pread() and pwrite() to do our I/O, and as long as we use self-
         * consistent units, we are all good.
         */
        LFHFS_LOG(LEVEL_ERROR,
                  "jnl: block size mismatch: phys_blksz=%llu, jhdr->jhdr_size=%llu -- COMPENSATING\n",
                  (unsigned long long)phys_blksz, (unsigned long long)jnl->jhdr->jhdr_size);
        // orig_blksz doubles as a flag: nonzero means "rewrite the header
        // with the current block size once the journal is known empty"
        orig_blksz = phys_blksz;
    }

    if (   jnl->jhdr->start <= 0
        || jnl->jhdr->start > jnl->jhdr->size
        || jnl->jhdr->start > 1024*1024*1024) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
                  jnl->jhdr->start, jnl->jhdr->size);
        goto bad_journal;
    }

    if (   jnl->jhdr->end <= 0
        || jnl->jhdr->end > jnl->jhdr->size
        || jnl->jhdr->end > 1024*1024*1024) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
                  jnl->jhdr->end, jnl->jhdr->size);
        goto bad_journal;
    }

    if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr size looks bad (0x%llx)\n", jnl->jhdr->size);
        goto bad_journal;
    }

    // XXXdbg - can't do these checks because hfs writes all kinds of
    //          non-uniform sized blocks even on devices that have a block size
    //          that is larger than 512 bytes (i.e. optical media w/2k blocks).
    //          therefore these checks will fail and so we just have to punt and
    //          do more relaxed checking...
    // XXXdbg    if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
    if ((jnl->jhdr->start % 512) != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal start (0x%llx) not a multiple of 512?\n",
                  jnl->jhdr->start);
        goto bad_journal;
    }

    //XXXdbg    if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
    if ((jnl->jhdr->end % 512) != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
                  jnl->jhdr->end, jnl->jhdr->jhdr_size);
        goto bad_journal;
    }

    if (jnl->jhdr->blhdr_size < 0) {
        //throw out invalid sizes
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: blhdr size looks bogus! (%d) \n",
                  jnl->jhdr->blhdr_size);
        goto bad_journal;
    }

    // take care of replaying the journal if necessary
    if (flags & JOURNAL_RESET) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal start/end pointers reset! (s 0x%llx e 0x%llx)\n",
                  jnl->jhdr->start, jnl->jhdr->end);
        jnl->jhdr->start = jnl->jhdr->end;
    } else if (replay_journal(jnl) != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: journal_open: Error replaying the journal!\n");
        goto bad_journal;
    }

    /*
     * When we get here, we know that the journal is empty (jnl->jhdr->start ==
     * jnl->jhdr->end).  If the device's logical block size was different from
     * the journal's header size, then we can now restore the device's logical
     * block size and update the journal's header size to match.
     *
     * Note that we also adjust the journal's start and end so that they will
     * be aligned on the new block size.  We pick a new sequence number to
     * avoid any problems if a replay found previous transactions using the old
     * journal header size.  (See the comments in journal_create(), above.)
     */

    if (orig_blksz != 0) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: updating journal header with block size %llu\n",
                  (unsigned long long)phys_blksz);

        jnl->jhdr->jhdr_size = phys_blksz;
        jnl->jhdr->start = phys_blksz;
        jnl->jhdr->end = phys_blksz;
        jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num +
                                   (journal_size / phys_blksz) +
                                   (random() % 16384)) & 0x00ffffff;

        if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) {
            LFHFS_LOG(LEVEL_ERROR, "jnl: open: failed to update journal header size\n");
            goto bad_journal;
        }
    }

    // make sure this is in sync!
    jnl->active_start = jnl->jhdr->start;
    jnl->sequence_num = jnl->jhdr->sequence_num;

    // set this now, after we've replayed the journal
    size_up_tbuffer(jnl, tbuffer_size, phys_blksz);

    // TODO: Does this need to change if the device's logical block size changed?
    if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jnl->jhdr->size,
                  jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size);
        goto bad_journal;
    }

    lf_lck_mtx_init(&jnl->jlock);
    lf_lck_mtx_init(&jnl->flock);
    lf_lck_rw_init(&jnl->trim_lock);

    goto journal_open_complete;

bad_journal:
    hfs_free(jnl->header_buf);
    hfs_free(jnl);
cleanup_jdev_name:
    jnl = NULL;
journal_open_complete:
    return jnl;
}
493 | ||
494 | journal *journal_create(struct vnode *jvp, | |
495 | off_t offset, | |
496 | off_t journal_size, | |
497 | struct vnode *fsvp, | |
498 | size_t min_fs_blksz, | |
499 | int32_t flags, | |
500 | int32_t tbuffer_size, | |
501 | void (*flush)(void *arg), | |
502 | void *arg, | |
503 | struct mount *fsmount) { | |
504 | ||
505 | journal *jnl; | |
506 | uint32_t phys_blksz, new_txn_base; | |
507 | u_int32_t min_size; | |
508 | ||
509 | /* | |
510 | * Cap the journal max size to 2GB. On HFS, it will attempt to occupy | |
511 | * a full allocation block if the current size is smaller than the allocation | |
512 | * block on which it resides. Once we hit the exabyte filesystem range, then | |
513 | * it will use 2GB allocation blocks. As a result, make the cap 2GB. | |
514 | */ | |
515 | ||
516 | /* Get the real physical block size. */ | |
517 | if (ioctl(jvp->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) { | |
518 | goto cleanup_jdev_name; | |
519 | } | |
520 | ||
521 | if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) { | |
522 | LFHFS_LOG(LEVEL_ERROR, "jnl: create: journal size %lld looks bogus.\n", journal_size); | |
523 | goto cleanup_jdev_name; | |
524 | } | |
525 | ||
526 | min_size = phys_blksz * (phys_blksz / sizeof(block_info)); | |
527 | /* Reject journals that are too small given the sector size of the device */ | |
528 | if (journal_size < min_size) { | |
529 | LFHFS_LOG(LEVEL_ERROR, "jnl: create: journal size (%lld) too small given sector size of (%u)\n", | |
530 | journal_size, phys_blksz); | |
531 | goto cleanup_jdev_name; | |
532 | } | |
533 | ||
534 | if (phys_blksz > min_fs_blksz) { | |
535 | LFHFS_LOG(LEVEL_ERROR, "jnl: create: error: phys blksize %u bigger than min fs blksize %zd\n", | |
536 | phys_blksz, min_fs_blksz); | |
537 | goto cleanup_jdev_name; | |
538 | } | |
539 | ||
540 | if ((journal_size % phys_blksz) != 0) { | |
541 | LFHFS_LOG(LEVEL_ERROR, "jnl: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n", | |
542 | journal_size, phys_blksz); | |
543 | goto cleanup_jdev_name; | |
544 | } | |
545 | ||
546 | ||
547 | jnl = hfs_mallocz(sizeof(struct journal)); | |
548 | ||
549 | jnl->jdev = jvp; | |
550 | jnl->jdev_offset = offset; | |
551 | jnl->jdev_blknum = (uint32_t)(offset / min_fs_blksz); | |
552 | jnl->fsdev = fsvp; | |
553 | jnl->flush = flush; | |
554 | jnl->flush_arg = arg; | |
555 | jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); | |
556 | lf_lck_mtx_init(&jnl->old_start_lock); | |
557 | ||
558 | // Keep a point to the mount around for use in IO throttling. | |
559 | jnl->fsmount = fsmount; | |
560 | ||
561 | get_io_info(jvp, phys_blksz, jnl); | |
562 | ||
563 | jnl->header_buf = hfs_malloc(phys_blksz); | |
564 | jnl->header_buf_size = phys_blksz; | |
565 | ||
566 | jnl->jhdr = (journal_header *)jnl->header_buf; | |
567 | memset(jnl->jhdr, 0, sizeof(journal_header)); | |
568 | ||
569 | // we have to set this up here so that do_journal_io() will work | |
570 | jnl->jhdr->jhdr_size = phys_blksz; | |
571 | ||
572 | // | |
573 | // We try and read the journal header to see if there is already one | |
574 | // out there. If there is, it's possible that it has transactions | |
575 | // in it that we might replay if we happen to pick a sequence number | |
576 | // that is a little less than the old one, there is a crash and the | |
577 | // last txn written ends right at the start of a txn from the previous | |
578 | // incarnation of this file system. If all that happens we would | |
579 | // replay the transactions from the old file system and that would | |
580 | // destroy your disk. Although it is extremely unlikely for all those | |
581 | // conditions to happen, the probability is non-zero and the result is | |
582 | // severe - you lose your file system. Therefore if we find a valid | |
583 | // journal header and the sequence number is non-zero we write junk | |
584 | // over the entire journal so that there is no way we will encounter | |
585 | // any old transactions. This is slow but should be a rare event | |
586 | // since most tools erase the journal. | |
587 | // | |
588 | if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz | |
589 | && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC | |
590 | && jnl->jhdr->sequence_num != 0) { | |
591 | ||
592 | new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff; | |
593 | LFHFS_LOG(LEVEL_ERROR, "jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base); | |
594 | ||
595 | } else { | |
596 | new_txn_base = random() & 0x00ffffff; | |
597 | } | |
598 | ||
599 | memset(jnl->header_buf, 0, phys_blksz); | |
600 | ||
601 | jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; | |
602 | jnl->jhdr->endian = ENDIAN_MAGIC; | |
603 | jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself | |
604 | jnl->jhdr->end = phys_blksz; | |
605 | jnl->jhdr->size = journal_size; | |
606 | jnl->jhdr->jhdr_size = phys_blksz; | |
607 | size_up_tbuffer(jnl, tbuffer_size, phys_blksz); | |
608 | ||
609 | jnl->active_start = jnl->jhdr->start; | |
610 | ||
611 | jnl->jhdr->sequence_num = new_txn_base; | |
612 | ||
613 | lf_lck_mtx_init(&jnl->jlock); | |
614 | lf_lck_mtx_init(&jnl->flock); | |
615 | lf_lck_rw_init(&jnl->trim_lock); | |
616 | ||
617 | lf_cond_init(&jnl->flushing.sCond); | |
618 | lf_cond_init(&jnl->asyncIO.sCond); | |
619 | lf_cond_init(&jnl->writing_header.sCond); | |
620 | jnl->flush_aborted = FALSE; | |
621 | jnl->async_trim = NULL; | |
622 | jnl->sequence_num = jnl->jhdr->sequence_num; | |
623 | ||
624 | if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) { | |
625 | LFHFS_LOG(LEVEL_ERROR, "jnl: journal_create: failed to write journal header.\n"); | |
626 | goto bad_write; | |
627 | } | |
628 | ||
629 | goto journal_create_complete; | |
630 | ||
631 | ||
632 | bad_write: | |
633 | hfs_free(jnl->header_buf); | |
634 | jnl->jhdr = NULL; | |
635 | hfs_free(jnl); | |
636 | cleanup_jdev_name: | |
637 | jnl = NULL; | |
638 | journal_create_complete: | |
639 | return jnl; | |
640 | } | |
641 | ||
642 | ||
643 | ||
// Return the thread currently recorded as holding the journal lock
// (set by journal_lock()), or NULL if nobody does.
void *journal_owner(journal *jnl) {
    return jnl->owner;
}
647 | ||
648 | /* Is the given cnode either the .journal or .journal_info_block file on | |
649 | * a volume with an active journal? Many VNOPs use this to deny access | |
650 | * to those files. | |
651 | * | |
652 | * Note: the .journal file on a volume with an external journal still | |
653 | * returns true here, even though it does not actually hold the contents | |
654 | * of the volume's journal. | |
655 | */ | |
656 | _Bool hfs_is_journal_file(struct hfsmount *hfsmp, struct cnode *cp) { | |
657 | if (hfsmp->jnl != NULL && | |
658 | (cp->c_fileid == hfsmp->hfs_jnlinfoblkid || | |
659 | cp->c_fileid == hfsmp->hfs_jnlfileid)) { | |
660 | return true; | |
661 | } else { | |
662 | return false; | |
663 | } | |
664 | } | |
665 | ||
666 | bool is_journaled(UVFSFileNode *psRootNode) { | |
667 | ||
668 | struct vnode *psRootVnode = *psRootNode; | |
669 | ||
670 | if (!psRootNode) { | |
671 | LFHFS_LOG(LEVEL_DEBUG, "is_journaled: psRootNode is NULL"); | |
672 | return false; | |
673 | } | |
674 | ||
675 | if (!psRootVnode->sFSParams.vnfs_mp) { | |
676 | LFHFS_LOG(LEVEL_DEBUG, "is_journaled: psRootVnode->sFSParams.vnfs_mp is NULL"); | |
677 | return false; | |
678 | } | |
679 | ||
680 | if (psRootVnode->sFSParams.vnfs_mp->psHfsmount->jnl) | |
681 | return true; | |
682 | ||
683 | return false; | |
684 | } | |
685 | ||
686 | ||
687 | // Media no longer available, clear all memory occupied by the journal | |
// Media no longer available, clear all memory occupied by the journal.
// Unlike journal_close(), nothing is flushed to disk: any active or
// buffered transaction is simply aborted, then all journal state is torn
// down and freed.
void journal_release(journal *jnl) {
    // take the journal lock unless this thread already holds it
    if (jnl->owner != pthread_self()) {
        journal_lock(jnl);
    }

    if (jnl->active_tr) {
        abort_transaction(jnl, jnl->active_tr);
    }

    if (jnl->cur_tr) {
        abort_transaction(jnl, jnl->cur_tr);
    }

    free_old_stuff(jnl);

    hfs_free(jnl->header_buf);
    // poison the header pointer so a use-after-release is obvious
    jnl->jhdr = (void *)0xbeefbabe;

    // drop jlock before destroying it below
    journal_unlock(jnl);
    lf_lck_mtx_destroy(&jnl->old_start_lock);
    lf_lck_mtx_destroy(&jnl->jlock);
    lf_lck_mtx_destroy(&jnl->flock);
    hfs_free(jnl);
}
712 | ||
713 | ||
// Orderly shutdown of a journal: flush outstanding transactions to disk
// (when the journal is still valid), wait for in-flight flushes and async
// IO to drain, persist a final header, then free all journal state.
void journal_close(journal *jnl) {
    volatile off_t *start, *end;
    int counter=0;

    CHECK_JOURNAL(jnl);

    // set this before doing anything that would block so that
    // we start tearing things down properly.
    //
    jnl->flags |= JOURNAL_CLOSE_PENDING;

    // take the journal lock unless this thread already holds it
    if (jnl->owner != pthread_self()) {
        journal_lock(jnl);
    }

    // wait for any flush already in progress before touching transactions
    wait_condition(jnl, &jnl->flushing, "journal_close");

    //
    // only write stuff to disk if the journal is still valid
    //
    if ((jnl->flags & JOURNAL_INVALID) == 0) {

        if (jnl->active_tr) {
            /*
             * "journal_end_transaction" will fire the flush asynchronously
             */
            journal_end_transaction(jnl);
        }

        // flush any buffered transactions
        if (jnl->cur_tr) {
            transaction *tr = jnl->cur_tr;

            jnl->cur_tr = NULL;
            /*
             * "end_transaction" will wait for any in-progress flush to complete
             * before flushing "cur_tr" synchronously("must_wait" == TRUE)
             */
            end_transaction(tr, 1, NULL, NULL, FALSE);
        }
        /*
         * if there was an "active_tr", make sure we wait for
         * it to flush if there was no "cur_tr" to process
         */
        wait_condition(jnl, &jnl->flushing, "journal_close");

        //start = &jnl->jhdr->start;
        start = &jnl->active_start;
        end = &jnl->jhdr->end;

        // poll (bounded to 5000 iterations of 10ms) until the journal
        // drains, nudging the fs flush callback each time around
        while (*start != *end && counter++ < 5000) {
            //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
            if (jnl->flush) {
                jnl->flush(jnl->flush_arg);
            }
            usleep(10000);
        }

        if (*start != *end) {
            LFHFS_LOG(LEVEL_ERROR, "jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
                      *start, *end);
        }

        // make sure this is in sync when we close the journal
        jnl->jhdr->start = jnl->active_start;

        // if this fails there's not much we can do at this point...
        write_journal_header(jnl, 1, jnl->sequence_num);
    } else {
        // if we're here the journal isn't valid any more.
        // so make sure we don't leave any locked blocks lying around
        LFHFS_LOG(LEVEL_ERROR, "jnl: close: journal is invalid.  aborting outstanding transactions\n");
        if (jnl->active_tr || jnl->cur_tr) {
            transaction *tr;

            if (jnl->active_tr) {
                tr = jnl->active_tr;
                jnl->active_tr = NULL;
            } else {
                tr = jnl->cur_tr;
                jnl->cur_tr = NULL;
            }
            abort_transaction(jnl, tr);

            if (jnl->active_tr || jnl->cur_tr) {
                panic("jnl: close: jnl @ %p had both an active and cur tr\n", jnl);
            }
        }
    }
    // let any asynchronous IO complete before freeing everything
    wait_condition(jnl, &jnl->asyncIO, "journal_close");

    free_old_stuff(jnl);

    hfs_free(jnl->header_buf);
    // poison the header pointer so a use-after-close is obvious
    jnl->jhdr = (void *)0xbeefbabe;

    journal_unlock(jnl);
    lf_lck_mtx_destroy(&jnl->old_start_lock);
    lf_lck_mtx_destroy(&jnl->jlock);
    lf_lck_mtx_destroy(&jnl->flock);
    hfs_free(jnl);
}
816 | ||
817 | // This function performs the following: | |
818 | // 1) Checks that we have a valid journal | |
819 | // 2) locks the journal | |
820 | // 3) Allocates room in the journal | |
821 | int journal_start_transaction(journal *jnl) { | |
822 | ||
823 | int ret; | |
824 | ||
825 | #if JOURNAL_DEBUG | |
826 | printf("journal_start_transaction (%u).\n", jnl->nested_count); | |
827 | #endif | |
828 | ||
829 | CHECK_JOURNAL(jnl); | |
830 | ||
831 | free_old_stuff(jnl); | |
832 | ||
833 | if (jnl->flags & JOURNAL_INVALID) { | |
834 | return EINVAL; | |
835 | } | |
836 | ||
837 | if (jnl->owner == pthread_self()) { | |
838 | if (jnl->active_tr == NULL) { | |
839 | panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n", | |
840 | jnl, jnl->owner, pthread_self()); | |
841 | } | |
842 | jnl->nested_count++; | |
843 | return 0; | |
844 | } | |
845 | ||
846 | journal_lock(jnl); | |
847 | ||
848 | if (jnl->nested_count != 0 || jnl->active_tr != NULL) { | |
849 | panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n", | |
850 | jnl->owner, jnl->nested_count, jnl->active_tr, jnl); | |
851 | } | |
852 | ||
853 | jnl->nested_count = 1; | |
854 | ||
855 | // if there's a buffered transaction, use it. | |
856 | if (jnl->cur_tr) { | |
857 | jnl->active_tr = jnl->cur_tr; | |
858 | jnl->cur_tr = NULL; | |
859 | ||
860 | return 0; | |
861 | } | |
862 | ||
863 | ret = journal_allocate_transaction(jnl); | |
864 | if (ret) { | |
865 | goto bad_start; | |
866 | } | |
867 | ||
868 | // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr); | |
869 | ||
870 | return 0; | |
871 | ||
872 | bad_start: | |
873 | jnl->nested_count = 0; | |
874 | journal_unlock(jnl); | |
875 | ||
876 | return ret; | |
877 | } | |
878 | // journal_end_transaction | |
879 | // This function does the following: | |
880 | // 1) Validates journal status/state | |
881 | // 2) Decrements the nesting count; the outermost call hands the active transaction to end_transaction() | |
882 | int journal_end_transaction(journal *jnl) { | |
883 | int ret; | |
884 | transaction *tr; | |
885 | ||
886 | #if JOURNAL_DEBUG | |
887 | printf("journal_end_transaction (%u).\n", jnl->nested_count-1); | |
888 | #endif | |
889 | ||
890 | CHECK_JOURNAL(jnl); | |
891 | ||
892 | free_old_stuff(jnl); | |
893 | ||
894 | if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) { | |
895 | return 0; | |
896 | } | |
897 | ||
898 | if (jnl->owner != pthread_self()) { | |
899 | panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n", | |
900 | jnl, jnl->owner, pthread_self()); | |
901 | } | |
902 | jnl->nested_count--; | |
903 | ||
904 | if (jnl->nested_count > 0) { | |
905 | return 0; | |
906 | } else if (jnl->nested_count < 0) { | |
907 | panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count); | |
908 | } | |
909 | ||
910 | if (jnl->flags & JOURNAL_INVALID) { | |
911 | if (jnl->active_tr) { | |
912 | if (jnl->cur_tr != NULL) { | |
913 | panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n", | |
914 | jnl, jnl->active_tr, jnl->cur_tr); | |
915 | } | |
916 | tr = jnl->active_tr; | |
917 | jnl->active_tr = NULL; | |
918 | ||
919 | abort_transaction(jnl, tr); | |
920 | } | |
921 | journal_unlock(jnl); | |
922 | ||
923 | return EINVAL; | |
924 | } | |
925 | ||
926 | tr = jnl->active_tr; | |
927 | CHECK_TRANSACTION(tr); | |
928 | ||
929 | // clear this out here so that when check_free_space() calls | |
930 | // the FS flush function, we don't panic in journal_flush() | |
931 | // if the FS were to call that. note: check_free_space() is | |
932 | // called from end_transaction(). | |
933 | jnl->active_tr = NULL; | |
934 | ||
935 | /* Examine the force-journal-flush state in the active txn */ | |
936 | if (tr->flush_on_completion == TRUE) { | |
937 | /* | |
938 | * If the FS requested it, disallow group commit and force the | |
939 | * transaction out to disk immediately. | |
940 | */ | |
941 | ret = end_transaction(tr, 1, NULL, NULL, TRUE); | |
942 | } | |
943 | else { | |
944 | /* in the common path we can simply use the double-buffered journal */ | |
945 | ret = end_transaction(tr, 0, NULL, NULL, TRUE); | |
946 | } | |
947 | ||
948 | return ret; | |
949 | } | |
950 | ||
951 | // journal_modify_block_start | |
952 | // This function does the following: | |
953 | // 1) Makes sure the journal file is on and valid | |
954 | // 2) Clean up (free previous transactions) | |
955 | // 3) Validate that the phy-block-size has not changed. | |
956 | // 4) Locks the buffer. | |
957 | // Buffer life cycle with journal: | |
958 | // 1) Client code (ie btrees_io.c) allocates a buffer (ie gains ownership). Other threads will pend on using this buffer until it is released. | |
959 | // 2) Client code calls journal_modify_block_start which sets the GEN_BUF_WRITE_LOCK uCacheFlag. | |
960 | // 3) Client code modifies the buffer. | |
961 | // 4) Client code calls journal_modify_block_end which released the buffer. The GEN_BUF_WRITE_LOCK flag remains set. | |
962 | // At this point other threads are welcome to modify the buffer (after executing steps 1 and 2 above). The buffer content will not be written to media before transaction_end, thus only the cumulative change of both threads after transaction_end will be committed. | |
963 | // 5) transaction-end (called from within client-code or async Sync) obtains ownership on in-transaction buffers. By doing that it makes sure no buffer is currently being modified by any client code. It then prepares the buffer for committing (ie realigns endianness), and commits (writes to the t-buffer, writes the t-buffer to media, updates journal-info, clears the GEN_BUF_WRITE_LOCK flags and writes the buffers to media). | |
// journal_modify_block_start
// Must be called by the transaction owner, inside an open transaction,
// before a metadata buffer is modified.  Validates that the buffer size is
// a multiple of the journal block size -- re-querying the device block size
// and re-allocating the journal header if the physical block size changed
// underneath us -- checks the transaction still fits in the journal, and
// write-locks the buffer so it cannot reach the media until transaction-end.
// Returns 0 on success, EINVAL if the journal is invalid, -1 on size errors.
int journal_modify_block_start(journal *jnl, GenericLFBuf *psGenBuf) {

    transaction *tr;

#if JOURNAL_DEBUG
    printf("journal_modify_block_start: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uCacheFlags 0x%llx, uPhyCluster %llu, uLockCnt %u\n",
           psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize, psGenBuf->uCacheFlags ,psGenBuf->uPhyCluster, psGenBuf->uLockCnt);
#endif

    CHECK_JOURNAL(jnl);

    free_old_stuff(jnl);

    if (jnl->flags & JOURNAL_INVALID) {
        return EINVAL;
    }

    tr = jnl->active_tr;
    CHECK_TRANSACTION(tr);

    if (jnl->owner != pthread_self()) {
        panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n",
              jnl, jnl->owner, pthread_self());
    }

    //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n",
    //   bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes);

    // can't allow blocks that aren't an even multiple of the
    // underlying block size.
    if ((psGenBuf->uDataSize % jnl->jhdr->jhdr_size) != 0) {
        uint32_t bad=0;
        uint32_t phys_blksz;

        // the mismatch may be because the device's physical block size
        // changed since the journal header was created -- ask again
        if (ioctl(jnl->jdev->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
            bad = 1;
        } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) {
            if (phys_blksz < 512) {
                panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n",
                      phys_blksz, psGenBuf->uDataSize, jnl->jhdr->jhdr_size);
            }

            if ((psGenBuf->uDataSize % phys_blksz) != 0) {
                bad = 1;
            } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) {
                // device block size shrank; just track the smaller size
                jnl->jhdr->jhdr_size = phys_blksz;
            } else {
                // the phys_blksz is now larger... need to realloc the jhdr
                char *new_header_buf;

                LFHFS_LOG(LEVEL_ERROR, "jnl: phys blksz got bigger (was: %d/%d now %d)\n",
                          jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz);
                new_header_buf = hfs_malloc(phys_blksz);
                memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size);
                // pad the grown tail with a recognizable filler byte
                memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size));
                hfs_free(jnl->header_buf);
                jnl->header_buf = new_header_buf;
                jnl->header_buf_size = phys_blksz;

                jnl->jhdr = (journal_header *)jnl->header_buf;
                jnl->jhdr->jhdr_size = phys_blksz;
            }
        } else {
            bad = 1;
        }

        if (bad) {
            panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
                  psGenBuf->uDataSize, jnl->jhdr->jhdr_size);

            // only reached if panic() is compiled to return
            return -1;
        }
    }

    // make sure that this transaction isn't bigger than the whole journal
    if ((tr->total_bytes+psGenBuf->uDataSize) >= (size_t)(jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
        panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n",
              tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), psGenBuf->uDataSize, tr, psGenBuf->pvData);

        // only reached if panic() is compiled to return
        return -1;
    }

    // write-lock the buffer: it stays pinned in the cache and cannot hit
    // the media until transaction-end clears GEN_BUF_WRITE_LOCK
    lf_hfs_generic_buf_set_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);

    return 0;
}
1050 | // journal_modify_block_end | |
1051 | // This function does the following: | |
1052 | // 1) Makes sure the journal file is on and valid | |
1053 | // 2) Clean up (free previous transactions) | |
1054 | // 3) Check if this block already exists in transaction | |
1055 | // 4) Add block number to transaction. We don't add the block data, nor do we release the buffer at this point. | |
1056 | // This will be done later on, at the transaction-end. | |
1057 | int journal_modify_block_end(journal *jnl, GenericLFBuf *psGenBuf, | |
1058 | void (*func)(GenericLFBuf *bp, void *arg), void *arg) { | |
1059 | int i = 1; | |
1060 | size_t tbuffer_offset=0; | |
1061 | block_list_header *blhdr, *prev=NULL; | |
1062 | transaction *tr = NULL; | |
1063 | ||
1064 | #if JOURNAL_DEBUG | |
1065 | printf("journal_modify_block_end: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n", | |
1066 | psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize, psGenBuf->uPhyCluster, psGenBuf->uLockCnt); | |
1067 | #endif | |
1068 | ||
1069 | CHECK_JOURNAL(jnl); | |
1070 | ||
1071 | free_old_stuff(jnl); | |
1072 | ||
1073 | if (func) { | |
1074 | psGenBuf->pfFunc = func; | |
1075 | psGenBuf->pvCallbackArgs = arg; | |
1076 | } | |
1077 | ||
1078 | if (jnl->flags & JOURNAL_INVALID) { | |
1079 | /* Still need to buf_brelse(). Callers assume we consume the bp. */ | |
1080 | lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK); | |
1081 | lf_hfs_generic_buf_release(psGenBuf); | |
1082 | return EINVAL; | |
1083 | } | |
1084 | ||
1085 | tr = jnl->active_tr; | |
1086 | CHECK_TRANSACTION(tr); | |
1087 | ||
1088 | if (jnl->owner != pthread_self()) { | |
1089 | panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n", | |
1090 | jnl, jnl->owner, pthread_self()); | |
1091 | } | |
1092 | ||
1093 | if ((psGenBuf->uCacheFlags & GEN_BUF_WRITE_LOCK) == 0) { | |
1094 | panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", psGenBuf, jnl); | |
1095 | } | |
1096 | ||
1097 | // first check if this block is already part of this transaction | |
1098 | for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { | |
1099 | tbuffer_offset = jnl->jhdr->blhdr_size; | |
1100 | ||
1101 | for (i = 1; i < blhdr->num_blocks; i++) { | |
1102 | GenericLFBuf *bp = (void*)blhdr->binfo[i].u.bp; | |
1103 | if (psGenBuf == bp) { | |
1104 | // Block found in transaction | |
1105 | #if JOURNAL_DEBUG | |
1106 | printf("block_end, already in journal: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n", | |
1107 | psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize, psGenBuf->uPhyCluster, psGenBuf->uLockCnt); | |
1108 | #endif | |
1109 | break; | |
1110 | } | |
1111 | if (blhdr->binfo[i].bnum != (off_t)-1) { | |
1112 | off_t uSizeOfBuf = ((GenericLFBuf*)(blhdr->binfo[i].u.bp))->uDataSize; | |
1113 | tbuffer_offset += uSizeOfBuf; | |
1114 | } else { | |
1115 | tbuffer_offset += blhdr->binfo[i].u.bi.bsize; | |
1116 | } | |
1117 | } | |
1118 | ||
1119 | if (i < blhdr->num_blocks) { | |
1120 | break; | |
1121 | } | |
1122 | } | |
1123 | ||
1124 | if (blhdr == NULL | |
1125 | && prev | |
1126 | && (prev->num_blocks+1) <= prev->max_blocks | |
1127 | && (prev->bytes_used+psGenBuf->uDataSize) <= (uint32_t)tr->tbuffer_size) { | |
1128 | // Block not found, add to last list | |
1129 | blhdr = prev; | |
1130 | ||
1131 | } else if (blhdr == NULL) { | |
1132 | block_list_header *nblhdr; | |
1133 | if (prev == NULL) { | |
1134 | panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, psGenBuf %p\n", jnl, psGenBuf); | |
1135 | } | |
1136 | // Add another tbuffer: | |
1137 | ||
1138 | // we got to the end of the list, didn't find the block and there's | |
1139 | // no room in the block_list_header pointed to by prev | |
1140 | ||
1141 | // we allocate another tbuffer and link it in at the end of the list | |
1142 | // through prev->binfo[0].bnum. that's a skanky way to do things but | |
1143 | // avoids having yet another linked list of small data structures to manage. | |
1144 | ||
1145 | nblhdr = hfs_malloc(tr->tbuffer_size); | |
1146 | ||
1147 | // journal replay code checksum check depends on this. | |
1148 | memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE); | |
1149 | // Fill up the rest of the block with unimportant bytes | |
1150 | memset(nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); | |
1151 | ||
1152 | // initialize the new guy | |
1153 | nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; | |
1154 | nblhdr->num_blocks = 1; // accounts for this header block | |
1155 | nblhdr->bytes_used = (uint32_t)jnl->jhdr->blhdr_size; | |
1156 | nblhdr->flags = BLHDR_CHECK_CHECKSUMS; | |
1157 | ||
1158 | tr->num_blhdrs++; | |
1159 | tr->total_bytes += jnl->jhdr->blhdr_size; | |
1160 | ||
1161 | // then link him in at the end | |
1162 | prev->binfo[0].bnum = (off_t)((long)nblhdr); | |
1163 | ||
1164 | // and finally switch to using the new guy | |
1165 | blhdr = nblhdr; | |
1166 | i = 1; | |
1167 | } | |
1168 | ||
1169 | if ((i+1) > blhdr->max_blocks) { | |
1170 | panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks); | |
1171 | } | |
1172 | ||
1173 | // if this is true then this is a new block we haven't seen before | |
1174 | if (i >= blhdr->num_blocks) { | |
1175 | off_t bsize; | |
1176 | bsize = psGenBuf->uDataSize; | |
1177 | ||
1178 | // Add block to list | |
1179 | blhdr->binfo[i].bnum = (off_t)(psGenBuf->uBlockN); | |
1180 | blhdr->binfo[i].u.bp = (void*)psGenBuf; | |
1181 | ||
1182 | blhdr->bytes_used += bsize; | |
1183 | tr->total_bytes += bsize; | |
1184 | ||
1185 | blhdr->num_blocks++; | |
1186 | } | |
1187 | ||
1188 | // We can release the block here to allow other threads to perform operations on it until the next transaction-end. | |
1189 | // The buffer will not be removed from cache since it is write-locked. | |
1190 | lf_hfs_generic_buf_release(psGenBuf); | |
1191 | ||
1192 | return 0; | |
1193 | } | |
1194 | ||
1195 | // This function validates if a block is already registered to a transaction | |
1196 | /* | |
1197 | * Flush the contents of the journal to the disk. | |
1198 | * | |
1199 | * Input: | |
1200 | * wait_for_IO - | |
1201 | * If TRUE, wait to write in-memory journal to the disk | |
1202 | * consistently, and also wait to write all asynchronous | |
1203 | * metadata blocks to its corresponding locations | |
1204 | * consistently on the disk. This means that the journal | |
1205 | * is empty at this point and does not contain any | |
1206 | * transactions. This is overkill in normal scenarios | |
1207 | * but is useful whenever the metadata blocks are required | |
1208 | * to be consistent on-disk instead of just the journal | |
1209 | * being consistent; like before live verification | |
1210 | * and live volume resizing. | |
1211 | * | |
1212 | * If FALSE, only wait to write in-memory journal to the | |
1213 | * disk consistently. This means that the journal still | |
1214 | * contains uncommitted transactions and the file system | |
1215 | * metadata blocks in the journal transactions might be | |
1216 | * written asynchronously to the disk. But there is no | |
1217 | * guarantee that they are written to the disk before | |
1218 | * returning to the caller. Note that this option is | |
1219 | * sufficient for file system data integrity as it | |
1220 | * guarantees consistent journal content on the disk. | |
1221 | */ | |
1222 | int journal_flush(journal *jnl, journal_flush_options_t options) { | |
1223 | boolean_t drop_lock = FALSE; | |
1224 | errno_t error = 0; | |
1225 | uint32_t flush_count = 0; | |
1226 | ||
1227 | CHECK_JOURNAL(jnl); | |
1228 | ||
1229 | free_old_stuff(jnl); | |
1230 | ||
1231 | if (jnl->flags & JOURNAL_INVALID) { | |
1232 | return EINVAL; | |
1233 | } | |
1234 | ||
1235 | if (jnl->owner != pthread_self()) { | |
1236 | journal_lock(jnl); | |
1237 | drop_lock = TRUE; | |
1238 | } | |
1239 | ||
1240 | if (ISSET(options, JOURNAL_FLUSH_FULL)) | |
1241 | flush_count = jnl->flush_counter; | |
1242 | ||
1243 | // if we're not active, flush any buffered transactions | |
1244 | if (jnl->active_tr == NULL && jnl->cur_tr) { | |
1245 | transaction *tr = jnl->cur_tr; | |
1246 | ||
1247 | jnl->cur_tr = NULL; | |
1248 | ||
1249 | if (ISSET(options, JOURNAL_WAIT_FOR_IO)) { | |
1250 | wait_condition(jnl, &jnl->flushing, "journal_flush"); | |
1251 | wait_condition(jnl, &jnl->asyncIO, "journal_flush"); | |
1252 | } | |
1253 | ||
1254 | // As the journal flush changes the MetaData content (update Endianizm), we need to lock the system times. | |
1255 | int lockflags = hfs_systemfile_lock(jnl->fsmount->psHfsmount, SFL_CATALOG | SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); | |
1256 | ||
1257 | /* | |
1258 | * "end_transction" will wait for any current async flush | |
1259 | * to complete, before flushing "cur_tr"... because we've | |
1260 | * specified the 'must_wait' arg as TRUE, it will then | |
1261 | * synchronously flush the "cur_tr" | |
1262 | */ | |
1263 | end_transaction(tr, 1, NULL, NULL, drop_lock); // force it to get flushed | |
1264 | ||
1265 | hfs_systemfile_unlock(jnl->fsmount->psHfsmount, lockflags); | |
1266 | ||
1267 | } else { | |
1268 | if (drop_lock == TRUE) { | |
1269 | journal_unlock(jnl); | |
1270 | } | |
1271 | ||
1272 | /* Because of pipelined journal, the journal transactions | |
1273 | * might be in process of being flushed on another thread. | |
1274 | * If there is nothing to flush currently, we should | |
1275 | * synchronize ourselves with the pipelined journal thread | |
1276 | * to ensure that all inflight transactions, if any, are | |
1277 | * flushed before we return success to caller. | |
1278 | */ | |
1279 | wait_condition(jnl, &jnl->flushing, "journal_flush"); | |
1280 | } | |
1281 | if (ISSET(options, JOURNAL_WAIT_FOR_IO)) { | |
1282 | wait_condition(jnl, &jnl->asyncIO, "journal_flush"); | |
1283 | } | |
1284 | ||
1285 | if (ISSET(options, JOURNAL_FLUSH_FULL)) { | |
1286 | ||
1287 | dk_synchronize_t sync_request = { | |
1288 | .options = 0, | |
1289 | }; | |
1290 | ||
1291 | // We need a full cache flush. If it has not been done, do it here. | |
1292 | if (flush_count == jnl->flush_counter) | |
1293 | error = ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request); | |
1294 | ||
1295 | // If external journal partition is enabled, flush filesystem data partition. | |
1296 | if (jnl->jdev != jnl->fsdev) | |
1297 | error = ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request); | |
1298 | ||
1299 | } | |
1300 | ||
1301 | return error; | |
1302 | } | |
1303 | ||
1304 | ||
1305 | // ************************** Local Functions *********************** | |
1306 | static int update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) { | |
1307 | ||
1308 | int iRet = 0; | |
1309 | GenericLFBuf *psGenBuf = NULL; | |
1310 | ||
1311 | // first read the block we want. | |
1312 | psGenBuf = lf_hfs_generic_buf_allocate(jnl->fsmount->psHfsmount->hfs_devvp, | |
1313 | fs_block, | |
1314 | (uint32_t)bsize, | |
1315 | GEN_BUF_PHY_BLOCK | GEN_BUF_NON_CACHED); | |
1316 | if (!psGenBuf) { | |
1317 | LFHFS_LOG(LEVEL_ERROR, "jnl: update_fs_block: error allocating fs block # %lld!\n", fs_block); | |
1318 | iRet = -1; | |
1319 | goto exit; | |
1320 | } | |
1321 | ||
1322 | iRet = lf_hfs_generic_buf_read(psGenBuf); | |
1323 | if (iRet) { | |
1324 | LFHFS_LOG(LEVEL_ERROR, "jnl: update_fs_block: error reading fs block # %lld!\n", fs_block); | |
1325 | goto exit; | |
1326 | } | |
1327 | ||
1328 | // copy the journal data over top of it | |
1329 | memcpy(psGenBuf->pvData, block_ptr, bsize); | |
1330 | ||
1331 | iRet = lf_hfs_generic_buf_write(psGenBuf); | |
1332 | if (iRet) { | |
1333 | LFHFS_LOG(LEVEL_ERROR, "jnl: update_fs_block: failed to write block %lld (ret %d)\n", fs_block, iRet); | |
1334 | goto exit; | |
1335 | } | |
1336 | ||
1337 | exit: | |
1338 | if (psGenBuf) { | |
1339 | lf_hfs_generic_buf_release(psGenBuf); | |
1340 | } | |
1341 | ||
1342 | return iRet; | |
1343 | } | |
1344 | ||
1345 | ||
1346 | static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size) { | |
1347 | struct bucket *newBuf; | |
1348 | int current_size = num_buckets, i; | |
1349 | ||
1350 | // return if newsize is less than the current size | |
1351 | if (new_size < num_buckets) { | |
1352 | return current_size; | |
1353 | } | |
1354 | ||
1355 | newBuf = hfs_malloc(new_size*sizeof(struct bucket)); | |
1356 | ||
1357 | // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size); | |
1358 | ||
1359 | // copy existing elements | |
1360 | bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket)); | |
1361 | ||
1362 | // initialize the new ones | |
1363 | for(i = num_buckets; i < new_size; i++) { | |
1364 | newBuf[i].block_num = (off_t)-1; | |
1365 | } | |
1366 | ||
1367 | // free the old container | |
1368 | hfs_free(*buf_ptr); | |
1369 | ||
1370 | // reset the buf_ptr | |
1371 | *buf_ptr = newBuf; | |
1372 | ||
1373 | return new_size; | |
1374 | } | |
1375 | ||
1376 | ||
1377 | static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting) { | |
1378 | ||
1379 | if (!overwriting) { | |
1380 | // grow the table if we're out of space - we may index the table | |
1381 | // with *num_full_ptr (lookup_bucket() can return a maximum value == | |
1382 | // *num_full_ptr), so we need to grow when we hit (*num_buckets_ptr - 1) | |
1383 | // to prevent out-of-bounds indexing | |
1384 | if (*num_full_ptr >= (*num_buckets_ptr - 1)) { | |
1385 | int new_size = *num_buckets_ptr * 2; | |
1386 | int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size); | |
1387 | ||
1388 | if (grow_size < new_size) { | |
1389 | LFHFS_LOG(LEVEL_ERROR, "jnl: add_block: grow_table returned an error!\n"); | |
1390 | return -1; | |
1391 | } | |
1392 | ||
1393 | *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size | |
1394 | } | |
1395 | ||
1396 | // if we're not inserting at the end, we need to bcopy | |
1397 | if (blk_index != *num_full_ptr) { | |
1398 | bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) ); | |
1399 | } | |
1400 | ||
1401 | (*num_full_ptr)++; // increment only if we're not overwriting | |
1402 | } | |
1403 | ||
1404 | // sanity check the values we're about to add | |
1405 | if ((off_t)offset >= jnl->jhdr->size) { | |
1406 | offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); | |
1407 | } | |
1408 | if (size <= 0) { | |
1409 | panic("jnl: insert_block: bad size in insert_block (%zd)\n", size); | |
1410 | } | |
1411 | ||
1412 | (*buf_ptr)[blk_index].block_num = num; | |
1413 | (*buf_ptr)[blk_index].block_size = (uint32_t)size; | |
1414 | (*buf_ptr)[blk_index].jnl_offset = (uint32_t)offset; | |
1415 | (*buf_ptr)[blk_index].cksum = cksum; | |
1416 | ||
1417 | return blk_index; | |
1418 | } | |
1419 | ||
// do_overlap
// Called during replay coalescing when a new write at blk_index overlaps
// existing bucket entries.  Trims or splits the preceding entry, detects
// a plain overwrite of the entry at blk_index, and marks entries the new
// write fully covers for deletion (block_num == -2), compacting them out.
// Returns 1 when the caller can simply overwrite the entry at blk_index,
// 0 when the caller still needs to insert the new entry.
static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) {

    int num_to_remove, index, i, overwrite, err;
    size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset;
    off_t overlap, block_start, block_end;

    // byte range covered by the incoming write; block numbers are in
    // units of jhdr_size bytes
    block_start = block_num*jhdr_size;
    block_end = block_start + size;
    // "overwrite": same start block and at least as large as the entry
    overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);

    // first, eliminate any overlap with the previous entry
    if (blk_index != 0 && !overwrite) {
        off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
        off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
        overlap = prev_block_end - block_start;
        if (overlap > 0) {
            if (overlap % jhdr_size != 0) {
                panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size);
            }

            // if the previous entry completely overlaps this one, we need to break it into two pieces.
            if (prev_block_end > block_end) {
                off_t new_num = block_end / jhdr_size;
                size_t new_size = prev_block_end - block_end;

                // the split-off tail keeps the previous entry's journal
                // data, offset forward past the region we cover
                new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);

                err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0);
                if (err < 0) {
                    panic("jnl: do_overlap: error inserting during pre-overlap\n");
                }
            }

            // Regardless, we need to truncate the previous entry to the beginning of the overlap
            (*buf_ptr)[blk_index-1].block_size = (uint32_t)(block_start - prev_block_start);
            (*buf_ptr)[blk_index-1].cksum = 0;   // have to blow it away because there's no way to check it
        }
    }

    // then, bail out fast if there's no overlap with the entries that follow
    if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) {
        return 0;   // no overlap, no overwrite
    } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) {

        (*buf_ptr)[blk_index].cksum = cksum;   // update this
        return 1;   // simple overwrite
    }

    // Otherwise, find all cases of total and partial overlap. We use the special
    // block_num of -2 to designate entries that are completely overlapped and must
    // be eliminated. The block_num, size, and jnl_offset of partially overlapped
    // entries must be adjusted to keep the array consistent.
    index = blk_index;
    num_to_remove = 0;
    while (index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) {
        if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) {
            (*buf_ptr)[index].block_num = -2;   // mark this for deletion
            num_to_remove++;
        } else {
            overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
            if (overlap > 0) {
                if (overlap % jhdr_size != 0) {
                    panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size);
                }

                // if we partially overlap this entry, adjust its block number, jnl offset, and size
                (*buf_ptr)[index].block_num += (overlap / jhdr_size);   // make sure overlap is multiple of jhdr_size, or round up
                (*buf_ptr)[index].cksum = 0;

                new_offset = (*buf_ptr)[index].jnl_offset + overlap;   // check for wrap-around
                if ((off_t)new_offset >= jnl->jhdr->size) {
                    new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
                }
                (*buf_ptr)[index].jnl_offset = (uint32_t)new_offset;

                (*buf_ptr)[index].block_size -= overlap;   // sanity check for negative value
                if ((*buf_ptr)[index].block_size <= 0) {
                    panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size);
                    // return -1; // if above panic is removed, return -1 for error
                }
            }

        }

        index++;
    }

    // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
    index--;   // start with the last index used within the above loop
    while (index >= blk_index) {
        if ((*buf_ptr)[index].block_num == -2) {
            if (index == *num_full_ptr-1) {
                (*buf_ptr)[index].block_num = -1;   // it's the last item in the table... just mark as free
            } else {
                bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
            }
            (*num_full_ptr)--;
        }
        index--;
    }

    // eliminate any stale entries at the end of the table
    for(i = *num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
        (*buf_ptr)[i].block_num = -1;
    }

    return 0;   // if we got this far, we need to insert the entry into the table (rather than overwrite)
}
1528 | ||
1529 | ||
// lookup_bucket:
// Binary-search the sorted coalesce table for block_num.
// Returns the index of the RIGHT-most entry whose block_num matches, or,
// when there is no match, the index at which a new entry for block_num
// should be inserted to keep the table sorted (this may equal num_full,
// meaning "append at the end").
static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full) {
    int lo, hi, index, matches, i;

    if (num_full == 0) {
        return 0; // table is empty, so insert at index=0
    }

    lo = 0;
    hi = num_full - 1;
    index = -1;

    // perform binary search for block_num
    // NOTE: on a "less than" probe we set hi = mid (not mid-1); the loop can
    // therefore exit with lo == hi without having probed that slot, which is
    // why the post-loop convergence check below is required.
    do {
        int mid = (hi - lo)/2 + lo;
        off_t this_num = (*buf_ptr)[mid].block_num;

        if (block_num == this_num) {
            index = mid;
            break;
        }

        if (block_num < this_num) {
            hi = mid;
            continue;
        }

        if (block_num > this_num) {
            lo = mid + 1;
            continue;
        }
    } while (lo < hi);

    // check if lo and hi converged on the match
    if (block_num == (*buf_ptr)[hi].block_num) {
        index = hi;
    }

    // if no existing entry found, find index for new one
    if (index == -1) {
        index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
    } else {
        // make sure that we return the right-most index in the case of multiple matches
        matches = 0;
        i = index + 1;
        while (i < num_full && block_num == (*buf_ptr)[i].block_num) {
            matches++;
            i++;
        }

        index += matches;
    }

    return index;
}
1584 | ||
1585 | // PR-3105942: Coalesce writes to the same block in journal replay | |
1586 | // We coalesce writes by maintaining a dynamic sorted array of physical disk blocks | |
1587 | // to be replayed and the corresponding location in the journal which contains | |
1588 | // the most recent data for those blocks. The array is "played" once the all the | |
1589 | // blocks in the journal have been coalesced. The code for the case of conflicting/ | |
1590 | // overlapping writes to a single block is the most dense. Because coalescing can | |
1591 | // disrupt the existing time-ordering of blocks in the journal playback, care | |
1592 | // is taken to catch any overlaps and keep the array consistent. | |
1593 | static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) { | |
1594 | int blk_index, overwriting; | |
1595 | ||
1596 | // on return from lookup_bucket(), blk_index is the index into the table where block_num should be | |
1597 | // inserted (or the index of the elem to overwrite). | |
1598 | blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr); | |
1599 | ||
1600 | // check if the index is within bounds (if we're adding this block to the end of | |
1601 | // the table, blk_index will be equal to num_full) | |
1602 | if (blk_index < 0 || blk_index > *num_full_ptr) { | |
1603 | //printf("jnl: add_block: trouble adding block to co_buf\n"); | |
1604 | return -1; | |
1605 | } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index); | |
1606 | ||
1607 | // Determine whether we're overwriting an existing entry by checking for overlap | |
1608 | overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr); | |
1609 | if (overwriting < 0) { | |
1610 | return -1; // if we got an error, pass it along | |
1611 | } | |
1612 | ||
1613 | // returns the index, or -1 on error | |
1614 | blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting); | |
1615 | ||
1616 | return blk_index; | |
1617 | } | |
1618 | ||
1619 | static void swap_block_list_header(journal *jnl, block_list_header *blhdr) { | |
1620 | int i; | |
1621 | ||
1622 | blhdr->max_blocks = SWAP16(blhdr->max_blocks); | |
1623 | blhdr->num_blocks = SWAP16(blhdr->num_blocks); | |
1624 | blhdr->bytes_used = SWAP32(blhdr->bytes_used); | |
1625 | blhdr->checksum = SWAP32(blhdr->checksum); | |
1626 | blhdr->flags = SWAP32(blhdr->flags); | |
1627 | ||
1628 | if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) { | |
1629 | LFHFS_LOG(LEVEL_ERROR, "jnl: blhdr num blocks looks suspicious (%d / blhdr size %d). not swapping.\n", blhdr->num_blocks, jnl->jhdr->blhdr_size); | |
1630 | return; | |
1631 | } | |
1632 | ||
1633 | for(i = 0; i < blhdr->num_blocks; i++) { | |
1634 | blhdr->binfo[i].bnum = SWAP64(blhdr->binfo[i].bnum); | |
1635 | blhdr->binfo[i].u.bi.bsize = SWAP32(blhdr->binfo[i].u.bi.bsize); | |
1636 | blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum); | |
1637 | } | |
1638 | } | |
1639 | ||
// replay_journal:
// Walk the on-disk journal from jhdr->start toward jhdr->end, validating
// each block-list header, coalescing the referenced blocks into the co_buf
// bucket table (newest data for a block wins), and finally writing the
// coalesced blocks out to their home locations via update_fs_block().
// On an I/O/corruption error mid-replay, retries (up to 3 times) replaying
// only the transactions known-good so far.  Returns 0 on success (including
// an empty journal), -1 if replay had to be aborted.
static int replay_journal(journal *jnl) {
    int i, bad_blocks=0;
    unsigned int orig_checksum, checksum, check_block_checksums = 0;
    size_t ret;
    size_t max_bsize = 0; /* protected by block_ptr */
    block_list_header *blhdr;
    off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start;
    char *buff, *block_ptr=NULL;
    struct bucket *co_buf;
    int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory = 0;
    uint32_t last_sequence_num = 0;
    int replay_retry_count = 0;

    LFHFS_LOG(LEVEL_DEFAULT, "replay_journal: start.\n");


    // wrap the start ptr if it points to the very end of the journal
    if (jnl->jhdr->start == jnl->jhdr->size) {
        jnl->jhdr->start = jnl->jhdr->jhdr_size;
    }
    if (jnl->jhdr->end == jnl->jhdr->size) {
        jnl->jhdr->end = jnl->jhdr->jhdr_size;
    }

    // start == end means there are no transactions to replay
    if (jnl->jhdr->start == jnl->jhdr->end) {
        LFHFS_LOG(LEVEL_DEFAULT, "replay_journal: journal empty.\n");
        goto success;
    }

    // remember where we started so we can detect the "very first txn is
    // already bad" case (nothing replayable at all)
    orig_jnl_start = jnl->jhdr->start;

    // allocate memory for the header_block. we'll read each blhdr into this
    buff = hfs_malloc(jnl->jhdr->blhdr_size);

    // allocate memory for the coalesce buffer
    co_buf = hfs_malloc(num_buckets*sizeof(struct bucket));

restart_replay:

    // initialize entries
    for(i = 0; i < num_buckets; i++) {
        co_buf[i].block_num = -1;
    }
    num_full = 0; // empty at first


    // keep scanning while there are unread transactions; with
    // check_past_jnl_end set we also probe beyond jhdr->end looking for
    // valid transactions that were written but whose header update was lost
    while (check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) {
        offset = blhdr_offset = jnl->jhdr->start;
        ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size);
        if (ret != (size_t)jnl->jhdr->blhdr_size) {
            LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset);
            goto bad_txn_handling;
        }

        blhdr = (block_list_header *)buff;

        // the stored checksum was computed with the checksum field zeroed
        orig_checksum = blhdr->checksum;
        blhdr->checksum = 0;
        if (jnl->flags & JOURNAL_NEED_SWAP) {
            // calculate the checksum based on the unswapped data
            // because it is done byte-at-a-time.
            orig_checksum = (unsigned int)SWAP32(orig_checksum);
            checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
            swap_block_list_header(jnl, blhdr);
        } else {
            checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
        }


        //
        // XXXdbg - if these checks fail, we should replay as much
        //          we can in the hopes that it will still leave the
        //          drive in a better state than if we didn't replay
        //          anything
        //
        if (checksum != orig_checksum) {
            // a bad checksum while probing past jhdr->end just means we ran
            // out of extra transactions: truncate the probe here and go on
            if (check_past_jnl_end && in_uncharted_territory) {

                if (blhdr_offset != jnl->jhdr->end) {
                    LFHFS_LOG(LEVEL_ERROR, "jnl: Extra txn replay stopped @ %lld / 0x%llx\n", blhdr_offset, blhdr_offset);
                }

                check_past_jnl_end = 0;
                jnl->jhdr->end = blhdr_offset;
                continue;
            }

            LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
                      blhdr_offset, orig_checksum, checksum);

            if (blhdr_offset == orig_jnl_start) {
                // if there's nothing in the journal at all, just bail out altogether.
                goto bad_replay;
            }

            goto bad_txn_handling;
        }

        // sequence numbers of consecutive transactions must be equal or
        // increase by exactly one; anything else means we've walked into
        // stale data from an earlier pass around the (circular) journal
        if ( (last_sequence_num != 0)
            && (blhdr->binfo[0].u.bi.b.sequence_num != 0)
            && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num)
            && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) {

            txn_start_offset = jnl->jhdr->end = blhdr_offset;

            if (check_past_jnl_end) {
                check_past_jnl_end = 0;
                LFHFS_LOG(LEVEL_ERROR, "jnl: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n",
                          blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
                continue;
            }

            LFHFS_LOG(LEVEL_ERROR, "jnl: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n",
                      blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num);
            goto bad_txn_handling;
        }
        last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num;

        if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) {
            if (last_sequence_num == 0) {
                // pre-sequence-number journals give us no way to validate
                // transactions beyond jhdr->end, so stop at the end
                check_past_jnl_end = 0;
                LFHFS_LOG(LEVEL_ERROR, "jnl: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n",
                          jnl->jhdr->start, jnl->jhdr->end);
                if (jnl->jhdr->start != jnl->jhdr->end) {
                    jnl->jhdr->start = jnl->jhdr->end;
                }
                continue;
            }
            LFHFS_LOG(LEVEL_ERROR, "jnl: examining extra transactions starting @ %lld / 0x%llx\n", blhdr_offset, blhdr_offset);
        }

        // basic sanity checks on the (already checksummed) header fields
        if (   blhdr->max_blocks <= 0 || blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size)
            || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
            LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
                      blhdr->max_blocks, blhdr->num_blocks);
            goto bad_txn_handling;
        }

        // find the largest block in this transaction (binfo[0] is the
        // header/sequence slot, so real entries start at index 1)
        max_bsize = 0;
        for (i = 1; i < blhdr->num_blocks; i++) {
            if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
                LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: bogus block number 0x%llx\n", blhdr->binfo[i].bnum);
                goto bad_txn_handling;
            }

            if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) {
                max_bsize = blhdr->binfo[i].u.bi.bsize;
            }
        }

        // only allocate a scratch block for per-block checksum verification
        // when the transaction asks for it
        if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) {
            check_block_checksums = 1;
            block_ptr = hfs_malloc(max_bsize);
        } else {
            block_ptr = NULL;
        }

        if (blhdr->flags & BLHDR_FIRST_HEADER) {
            txn_start_offset = blhdr_offset;
        }

        //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n",
        //       blhdr->num_blocks-1, jnl->jhdr->start);
        bad_blocks = 0;
        for (i = 1; i < blhdr->num_blocks; i++) {
            int size, ret_val;
            off_t number;

            size = blhdr->binfo[i].u.bi.bsize;
            number = blhdr->binfo[i].bnum;

            // don't add "killed" blocks
            if (number == (off_t)-1) {
                //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i);
            } else {

                if (check_block_checksums) {
                    int32_t disk_cksum;
                    off_t block_offset;

                    block_offset = offset;

                    // read the block so we can check the checksum
                    ret = read_journal_data(jnl, &block_offset, block_ptr, size);
                    if (ret != (size_t)size) {
                        LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset);
                        goto bad_txn_handling;
                    }

                    disk_cksum = calc_checksum(block_ptr, size);

                    // there is no need to swap the checksum from disk because
                    // it got swapped when the blhdr was read in.
                    if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) {
                        LFHFS_LOG(LEVEL_ERROR, "jnl: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n",
                                  txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum);
                        LFHFS_LOG(LEVEL_ERROR, "jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x  0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
                                  *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)],
                                  *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]);

                        goto bad_txn_handling;
                    }
                }


                // add this bucket to co_buf, coalescing where possible
                // printf("jnl: replay_journal: adding block 0x%llx\n", number);
                ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full);

                if (ret_val == -1) {
                    LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: trouble adding block to co_buf\n");
                    goto bad_replay;
                } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number);
            }

            // increment offset
            offset += size;

            // check if the last block added puts us off the end of the jnl.
            // if so, we need to wrap to the beginning and take any remainder
            // into account
            //
            if (offset >= jnl->jhdr->size) {
                offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
            }
        }

        if (block_ptr) {
            hfs_free(block_ptr);
            block_ptr = NULL;
        }

        // NOTE: bad_blocks is only ever set here via the goto label below;
        // in the normal flow it stays 0 and this branch is skipped
        if (bad_blocks) {
bad_txn_handling:
            /* Journal replay got error before it found any valid
             *  transations, abort replay */
            if (txn_start_offset == 0) {
                LFHFS_LOG(LEVEL_ERROR, "jnl: no known good txn start offset! aborting journal replay.\n");
                goto bad_replay;
            }

            /* Repeated error during journal replay, abort replay */
            if (replay_retry_count == 3) {
                LFHFS_LOG(LEVEL_ERROR, "jnl: repeated errors replaying journal! aborting journal replay.\n");
                goto bad_replay;
            }
            replay_retry_count++;

            /* There was an error replaying the journal (possibly
             * EIO/ENXIO from the device).  So retry replaying all
             * the good transactions that we found before getting
             * the error.
             */
            jnl->jhdr->start = orig_jnl_start;
            jnl->jhdr->end = txn_start_offset;
            check_past_jnl_end = 0;
            last_sequence_num = 0;
            LFHFS_LOG(LEVEL_ERROR, "jnl: restarting journal replay (%lld - %lld)!\n", jnl->jhdr->start, jnl->jhdr->end);
            goto restart_replay;
        }

        // advance start past this transaction, wrapping around the circular
        // journal and skipping over the journal header block as needed
        jnl->jhdr->start += blhdr->bytes_used;
        if (jnl->jhdr->start >= jnl->jhdr->size) {
            // wrap around and skip the journal header block
            jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
        }

        if (jnl->jhdr->start == jnl->jhdr->end) {
            in_uncharted_territory = 1;
        }
    }

    if (jnl->jhdr->start != jnl->jhdr->end) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: start %lld != end %lld.  resetting end.\n", jnl->jhdr->start, jnl->jhdr->end);
        jnl->jhdr->end = jnl->jhdr->start;
    }

    //printf("jnl: replay_journal: replaying %d blocks\n", num_full);

    /*
     * make sure it's at least one page in size, so
     * start max_bsize at PAGE_SIZE
     */
    for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) {

        if (co_buf[i].block_num == (off_t)-1)
            continue;

        if (co_buf[i].block_size > max_bsize)
            max_bsize = co_buf[i].block_size;
    }
    /*
     * round max_bsize up to the nearest PAGE_SIZE multiple
     */
    if (max_bsize & (PAGE_SIZE - 1)) {
        max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
    }

    block_ptr = hfs_malloc(max_bsize);

    // Replay the coalesced entries in the co-buf
    for(i = 0; i < num_full; i++) {
        size_t size = co_buf[i].block_size;
        off_t jnl_offset = (off_t) co_buf[i].jnl_offset;
        off_t number = co_buf[i].block_num;


        // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num,
        //       co_buf[i].block_size, co_buf[i].jnl_offset);

        if (number == (off_t)-1) {
            // printf("jnl: replay_journal: skipping killed fs block\n");
        } else {

            // do journal read, and set the phys. block
            ret = read_journal_data(jnl, &jnl_offset, block_ptr, size);
            if (ret != size) {
                LFHFS_LOG(LEVEL_ERROR, "jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl_offset);
                goto bad_replay;
            }

            if (update_fs_block(jnl, block_ptr, number, size) != 0) {
                goto bad_replay;
            }
        }
    }


    // done replaying; update jnl header
    if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) {
        goto bad_replay;
    }

    // free block_ptr
    if (block_ptr) {
        hfs_free(block_ptr);
        block_ptr = NULL;
    }

    // free the coalesce buffer
    hfs_free(co_buf);
    co_buf = NULL;

    hfs_free(buff);

success:
    LFHFS_LOG(LEVEL_DEFAULT, "replay_journal: success.\n");
    return 0;

bad_replay:
    hfs_free(block_ptr);
    hfs_free(co_buf);
    hfs_free(buff);

    LFHFS_LOG(LEVEL_ERROR, "replay_journal: error.\n");
    return -1;
}
1997 | ||
// buffer_written:
// This function gets executed after a buffer has been written to its
// final destination.
// This function lets us know when a buffer has been
// flushed to disk.  Originally (kext), it was called from deep
// within the driver stack and thus is quite limited in what it could do.
// Notably, it could not initiate any new i/o's or allocate/free memory.
//
// When the last buffer of a transaction completes, the last caller to get
// here claims ownership (by poisoning tr->total_bytes with 0xfbadc0de),
// marks the transaction complete in jnl->old_start[], advances
// jnl->active_start if possible, and coalesces/links the transaction into
// jnl->completed_trs (or onto jnl->tr_freeme if it merged away).
static void buffer_written(transaction *tr, GenericLFBuf *bp) {

    journal *jnl;
    transaction *ctr, *prev=NULL, *next;
    size_t i;
    size_t bufsize, amt_flushed, total_bytes;


    // snarf out the bits we want
    bufsize = bp->uDataSize;

    // then we've already seen it
    if (tr == NULL) {
        return;
    }

    CHECK_TRANSACTION(tr);

    jnl = tr->jnl;

    CHECK_JOURNAL(jnl);

    // bytes already accounted for: killed (never-written) bytes plus bytes
    // flushed by earlier completions
    amt_flushed = tr->num_killed;
    total_bytes = tr->total_bytes;

    // update the number of blocks that have been flushed.
    // this buf may represent more than one block so take
    // that into account.
    amt_flushed += tr->num_flushed;
    tr->num_flushed += bufsize;

    // if this transaction isn't done yet, just return as
    // there is nothing to do.
    //
    // NOTE: we are careful to not reference anything through
    //       the tr pointer after doing the OSAddAtomic().  if
    //       this if statement fails then we are the last one
    //       and then it's ok to dereference "tr".
    //
    if ((amt_flushed + bufsize) < total_bytes) {
        return;
    }

    // this will single thread checking the transaction
    lock_oldstart(jnl);

    if (tr->total_bytes == (int)0xfbadc0de) {
        // then someone beat us to it...
        unlock_oldstart(jnl);
        return;
    }

    // mark this so that we're the owner of dealing with the
    // cleanup for this transaction
    tr->total_bytes = 0xfbadc0de;

    if (jnl->flags & JOURNAL_INVALID)
        goto transaction_done;

    //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
    //   tr, tr->journal_start, tr->journal_end, jnl);

    // find this entry in the old_start[] index and mark it completed
    // (the top bit of an old_start[] entry flags an in-flight transaction)
    for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {

        if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) {
            jnl->old_start[i] &= ~(0x8000000000000000ULL);
            break;
        }
    }

    if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
        panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n",
              tr->journal_start, tr, jnl);
    }


    // if we are here then we need to update the journal header
    // to reflect that this transaction is complete
    if (tr->journal_start == jnl->active_start) {
        jnl->active_start = tr->journal_end;
        tr->journal_start = tr->journal_end = (off_t)0;
    }

    // go through the completed_trs list and try to coalesce
    // entries, restarting back at the beginning if we have to.
    for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) {
        if (ctr->journal_start == jnl->active_start) {
            // ctr is now the oldest outstanding txn: retire it and move
            // active_start forward, then rescan from the head
            jnl->active_start = ctr->journal_end;
            if (prev) {
                prev->next = ctr->next;
            }
            if (ctr == jnl->completed_trs) {
                jnl->completed_trs = ctr->next;
            }

            next           = jnl->completed_trs;   // this starts us over again
            ctr->next      = jnl->tr_freeme;
            jnl->tr_freeme = ctr;
            ctr            = NULL;

        } else if (tr->journal_end == ctr->journal_start) {
            // tr abuts ctr on the left: grow ctr backwards to absorb tr
            ctr->journal_start = tr->journal_start;
            next               = jnl->completed_trs;  // this starts us over again
            ctr                = NULL;
            tr->journal_start  = tr->journal_end = (off_t)0;

        } else if (tr->journal_start == ctr->journal_end) {
            // tr abuts ctr on the right: grow ctr forwards to absorb tr
            ctr->journal_end  = tr->journal_end;
            next              = ctr->next;
            tr->journal_start = tr->journal_end = (off_t)0;
        } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) {
            // coalesce the next entry with this one and link the next
            // entry in at the head of the tr_freeme list
            next              = ctr->next;           // temporarily use the "next" variable
            ctr->journal_end  = next->journal_end;
            ctr->next         = next->next;
            next->next        = jnl->tr_freeme;      // link in the next guy at the head of the tr_freeme list
            jnl->tr_freeme    = next;

            next              = jnl->completed_trs;  // this starts us over again
            ctr               = NULL;

        } else {
            next = ctr->next;
        }
    }

    // if this is true then we didn't merge with anyone
    // so link ourselves in at the head of the completed
    // transaction list.
    if (tr->journal_start != 0) {
        // put this entry into the correct sorted place
        // in the list instead of just at the head.

        prev = NULL;
        for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
            // just keep looping
        }

        if (ctr == NULL && prev == NULL) {
            jnl->completed_trs = tr;
            tr->next           = NULL;

        } else if (ctr == jnl->completed_trs) {
            tr->next           = jnl->completed_trs;
            jnl->completed_trs = tr;

        } else {
            tr->next   = prev->next;
            prev->next = tr;
        }

    } else {
        // if we're here this tr got merged with someone else so
        // put it on the list to be free'd
        tr->next       = jnl->tr_freeme;
        jnl->tr_freeme = tr;
    }
transaction_done:
    unlock_oldstart(jnl);

    unlock_condition(jnl, &jnl->asyncIO);
}
2169 | ||
2170 | static size_t write_journal_data(journal *jnl, off_t *offset, void *data, size_t len) { | |
2171 | return do_journal_io(jnl, offset, data, len, JNL_WRITE); | |
2172 | } | |
2173 | ||
2174 | static size_t read_journal_data(journal *jnl, off_t *offset, void *data, size_t len) { | |
2175 | return do_journal_io(jnl, offset, data, len, JNL_READ); | |
2176 | } | |
2177 | ||
2178 | ||
// size_up_tbuffer:
// This function sets the size of the tbuffer and the
// size of the blhdr.  It assumes that jnl->jhdr->size
// and jnl->jhdr->jhdr_size are already valid.
static void size_up_tbuffer(journal *jnl, uint32_t tbuffer_size, uint32_t phys_blksz) {
    //
    // one-time initialization based on how much memory
    // there is in the machine.
    //
    if (def_tbuffer_size == 0) {
        uint64_t memsize = 0;
        size_t len = sizeof(memsize);
        sysctlbyname("hw.memsize", &memsize, &len, NULL, 0);

        if (memsize < (256*1024*1024)) {
            def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
        } else if (memsize < (512*1024*1024)) {
            def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
        } else if (memsize < (1024*1024*1024)) {
            def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
        } else {
            // scale linearly: one DEFAULT_TRANSACTION_BUFFER_SIZE per 256MB of RAM
            def_tbuffer_size = (uint32_t)(DEFAULT_TRANSACTION_BUFFER_SIZE * (memsize / (256*1024*1024)));
        }
    }

    // For analyzer
    if (!(jnl->jhdr->jhdr_size > 0)) {
        panic("jnl->jhdr->jhdr_size is %d", jnl->jhdr->jhdr_size);
    }

    // size up the transaction buffer... can't be larger than the number
    // of blocks that can fit in a block_list_header block.
    uint32_t tbsz;
    if (tbuffer_size == 0) {
        tbsz = def_tbuffer_size;
    } else {
        // make sure that the specified tbuffer_size isn't too small
        if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
            tbuffer_size = jnl->jhdr->blhdr_size * 2;
        }
        // and make sure it's an even multiple of the block size
        if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
            tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
        }
        tbsz = tbuffer_size;
    }

    // clamp: never more than half the journal, never above the global max
    if (tbsz > (jnl->jhdr->size / 2)) {
        tbsz = (uint32_t)(jnl->jhdr->size / 2);
    }
    if (tbsz > MAX_TRANSACTION_BUFFER_SIZE) {
        tbsz = MAX_TRANSACTION_BUFFER_SIZE;
    }
    jnl->tbuffer_size = tbsz;

    // the block-list header must hold one block_info per journal block in
    // the tbuffer, rounded up to a whole physical block
    jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
    if (jnl->jhdr->blhdr_size < phys_blksz) {
        jnl->jhdr->blhdr_size = phys_blksz;
    } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) {
        // have to round up so we're an even multiple of the physical block size
        jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
    }
}
2241 | ||
2242 | ||
// write_journal_header:
// Persist the in-memory journal header (sequence_num + fresh checksum) to
// disk.  When not doing force-unit-access writes, a cache flush/barrier is
// issued BEFORE the header write when finishing a transaction (so all txn
// data is durable before the header points at it) and AFTER the header
// write when updating the start pointer (so no later transaction can land
// before the header update).  Returns 0 on success, -1 on write failure
// (which also marks the journal JOURNAL_INVALID).
static int write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num) {
    static int num_err_prints = 0;
    int ret=0;
    off_t jhdr_offset = 0;

    // Flush the track cache if we're not doing force-unit-access
    // writes.
    if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {

        dk_synchronize_t sync_request = {
            .options = DK_SYNCHRONIZE_OPTION_BARRIER,
        };

        /*
         * If device doesn't support barrier-only flush, or
         * the journal is on a different device, use full flush.
         */
        if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) {
            sync_request.options = 0;
            jnl->flush_counter++;
        }

        ret = ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request);
    }
    if (ret != 0) {
        //
        // Only print this error if it's a different error than the
        // previous one, or if it's the first time for this device
        // or if the total number of printfs is less than 25.  We
        // allow for up to 25 printfs to insure that some make it
        // into the on-disk syslog.  Otherwise if we only printed
        // one, it's possible it would never make it to the syslog
        // for the root volume and that makes debugging hard.
        //
        if (   ret != jnl->last_flush_err
            || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
            || num_err_prints++ < 25) {

            LFHFS_LOG(LEVEL_ERROR, "jnl: flushing fs disk buffer returned 0x%x\n", ret);

            jnl->flags |= JOURNAL_FLUSHCACHE_ERR;
            jnl->last_flush_err = ret;
        }
    }

    // checksum is computed over the header with the checksum field zeroed
    jnl->jhdr->sequence_num = sequence_num;
    jnl->jhdr->checksum = 0;
    jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE);

    if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) {
        LFHFS_LOG(LEVEL_ERROR, "jnl: write_journal_header: error writing the journal header!\n");
        jnl->flags |= JOURNAL_INVALID;
        return -1;
    }

    // If we're not doing force-unit-access writes, then we
    // have to flush after writing the journal header so that
    // a future transaction doesn't sneak out to disk before
    // the header does and thus overwrite data that the old
    // journal header refers to.  Saw this exact case happen
    // on an IDE bus analyzer with Larry Barras so while it
    // may seem obscure, it's not.
    //
    if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {

        dk_synchronize_t sync_request = {
            .options = DK_SYNCHRONIZE_OPTION_BARRIER,
        };

        /*
         * If device doesn't support barrier-only flush, or
         * the journal is on a different device, use full flush.
         */
        if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) {
            sync_request.options = 0;
            jnl->flush_counter++;
        }

        // NOTE(review): this ioctl's return value is intentionally ignored
        // (best-effort post-write flush), unlike the pre-write flush above
        ioctl(jnl->jdev->psFSRecord->iFD, DKIOCSYNCHRONIZE, (caddr_t)&sync_request);
    }
    return 0;
}
2325 | ||
2326 | static int journal_binfo_cmp(const void *a, const void *b) { | |
2327 | ||
2328 | const block_info *bi_a = (const struct block_info *)a; | |
2329 | const block_info *bi_b = (const struct block_info *)b; | |
2330 | daddr64_t res; | |
2331 | ||
2332 | if (bi_a->bnum == (off_t)-1) { | |
2333 | return 1; | |
2334 | } | |
2335 | if (bi_b->bnum == (off_t)-1) { | |
2336 | return -1; | |
2337 | } | |
2338 | ||
2339 | // don't have to worry about negative block | |
2340 | // numbers so this is ok to do. | |
2341 | GenericLFBuf *psGenBufA, *psGenBufB; | |
2342 | psGenBufA = (void*)bi_a->u.bp; | |
2343 | psGenBufB = (void*)bi_b->u.bp; | |
2344 | res = psGenBufA->uBlockN - psGenBufB->uBlockN; | |
2345 | ||
2346 | return (int)res; | |
2347 | } | |
2348 | ||
// finish_end_transaction:
//
// Second half of committing a transaction:
//   1. For each block_list_header in the chain, fill in the per-block
//      checksums and physical block numbers, then write the whole chunk
//      into the on-disk journal (write_journal_data).
//   2. Write the journal header so the transaction is durably recorded.
//   3. Invoke the optional caller callback (used by journal_relocate).
//   4. Issue the writes of the real blocks to their final on-disk
//      locations and release the buffers.
// On failure the transaction is aborted and the journal is marked
// JOURNAL_INVALID.  Returns 0 on success, -1 on error.

static int finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg) {
    int i;
    size_t amt;
    size_t ret = 0;
    off_t end;
    journal *jnl = tr->jnl;
    GenericLFBuf *bp = NULL, **bparray = NULL;
    block_list_header *blhdr=NULL, *next=NULL;
    size_t tbuffer_offset;
    int bufs_written = 0;
    int ret_val = 0;

    end = jnl->jhdr->end;

    // binfo[0].bnum of each block_list_header doubles as the link to the
    // next header in the transaction's chain (see the loop update below).
    for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {

        amt = blhdr->bytes_used;

        blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num;

        // checksum the header with its checksum field zeroed
        blhdr->checksum = 0;
        blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);

        // temporary array to stash the buffer pointers while binfo[i].bnum
        // is overwritten with the physical block number for the journal copy
        bparray = hfs_malloc(blhdr->num_blocks * sizeof(buf_t));
        tbuffer_offset = jnl->jhdr->blhdr_size;

        // for each block in the block-header,
        for (i = 1; i < blhdr->num_blocks; i++) {
            size_t bsize;

            /*
             * finish preparing the shadow buf_t before
             * calculating the individual block checksums
             */
            if (blhdr->binfo[i].bnum != (off_t)-1) {
                daddr64_t blkno;

                bp = (void*)blhdr->binfo[i].u.bp;
                blkno = bp->uPhyCluster;
                // update this so we write out the correct physical block number!
                blhdr->binfo[i].bnum = (off_t)(blkno);

                bparray[i] = bp;
                bsize = bp->uDataSize;
                blhdr->binfo[i].u.bi.bsize = (uint32_t)bsize;
                blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], (uint32_t)bsize);
            } else {
                // "killed" block: no buffer, zero checksum
                bparray[i] = NULL;
                bsize = blhdr->binfo[i].u.bi.bsize;
                blhdr->binfo[i].u.bi.b.cksum = 0;
            }
            tbuffer_offset += bsize;
        }

        /*
         * if we fired off the journal_write_header asynchronously in
         * 'end_transaction', we need to wait for its completion
         * before writing the actual journal data
         */
        wait_condition(jnl, &jnl->writing_header, "finish_end_transaction");

        if (jnl->write_header_failed == FALSE)
            ret = write_journal_data(jnl, &end, blhdr, amt);
        else
            ret_val = -1;

#if HFS_CRASH_TEST
        CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_JOURNAL_DATA, jnl->fsmount->psHfsmount, NULL);
#endif

        /*
         * put the bp pointers back so that we can
         * make the final pass on them
         */
        for (i = 1; i < blhdr->num_blocks; i++)
            blhdr->binfo[i].u.bp = (void*)bparray[i];

        hfs_free(bparray);

        if (ret_val == -1)
            goto bad_journal;

        if (ret != amt) {
            LFHFS_LOG(LEVEL_ERROR, "jnl: end_transaction: only wrote %zu of %zu bytes to the journal!\n",
                      ret, amt);

            ret_val = -1;
            goto bad_journal;
        }
    }
    jnl->jhdr->end = end; // update where the journal now ends
    tr->journal_end = end; // the transaction ends here too

    if (tr->journal_start == 0 || tr->journal_end == 0) {
        panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
              tr->journal_start, tr->journal_end);
    }

    if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) {
        ret_val = -1;
        goto bad_journal;
    }

#if HFS_CRASH_TEST
    CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_JOURNAL_HEADER, jnl->fsmount->psHfsmount, NULL);
#endif

    /*
     * If the caller supplied a callback, call it now that the blocks have been
     * written to the journal.  This is used by journal_relocate so, for example,
     * the file system can change its pointer to the new journal.
     */
    if (callback != NULL && callback(callback_arg) != 0) {
        ret_val = -1;
        goto bad_journal;
    }

    // the buffer_flushed_callback will only be called for the
    // real blocks that get flushed so we have to account for
    // the block_list_headers here.
    //
    tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;

    lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction");

    //
    // setup for looping through all the blhdr's.
    //
    for (blhdr = tr->blhdr; blhdr; blhdr = next) {
        uint16_t num_blocks;

        /*
         * grab this info ahead of issuing the buf_bawrites...
         * once the last one goes out, its possible for blhdr
         * to be freed (especially if we get preempted) before
         * we do the last check of num_blocks or
         * grab the next blhdr pointer...
         */
        next = (block_list_header *)((long)blhdr->binfo[0].bnum);
        num_blocks = blhdr->num_blocks;

        /*
         * we can re-order the buf ptrs because everything is written out already
         */
        qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp);

        /*
         * need to make sure that the loop issuing the buf_bawrite's
         * does not touch blhdr once the last buf_bawrite has been
         * issued... at that point, we no longer have a legitimate
         * reference on the associated storage since it will be
         * released upon the completion of that last buf_bawrite
         */
        // trim trailing "killed" entries (qsort moved them to the end)
        for (i = num_blocks-1; i >= 1; i--) {
            if (blhdr->binfo[i].bnum != (off_t)-1)
                break;
            num_blocks--;
        }
        for (i = 1; i < num_blocks; i++) {

            if ((bp = (void*)blhdr->binfo[i].u.bp)) {

                // NOTE(review): this 'ret_val' shadows the function-level
                // ret_val, so a failed block write below is only logged and
                // does not mark the journal bad -- confirm this is intended.
                errno_t ret_val = 0;

#if JOURNAL_DEBUG
                printf("journal write physical: bp %p, psVnode %p, uBlockN %llu, uPhyCluster %llu uLockCnt %u\n",
                       bp, bp->psVnode, bp->uBlockN, bp->uPhyCluster, bp->uLockCnt);
#endif

                lf_hfs_generic_buf_clear_cache_flag(bp, GEN_BUF_WRITE_LOCK);
                ret_val = lf_hfs_generic_buf_write(bp);

#if HFS_CRASH_TEST
                CRASH_ABORT(CRASH_ABORT_JOURNAL_IN_BLOCK_DATA, jnl->fsmount->psHfsmount, NULL);
#endif

                if (ret_val) {
                    LFHFS_LOG(LEVEL_ERROR, "jnl: raw_readwrite_write_mount inside finish_end_transaction returned %d.\n", ret_val);
                }

                buffer_written(tr, bp);

                lf_hfs_generic_buf_unlock(bp);
                lf_hfs_generic_buf_release(bp);

                bufs_written++;
            }
        }
    }
#if HFS_CRASH_TEST
    CRASH_ABORT(CRASH_ABORT_JOURNAL_AFTER_BLOCK_DATA, jnl->fsmount->psHfsmount, NULL);
#endif
    if (bufs_written == 0) {
        /*
         * since we didn't issue any buf_bawrite's, there is no
         * async trigger to cause the memory associated with this
         * transaction to be freed... so, move it to the garbage
         * list now
         */
        lock_oldstart(jnl);

        tr->next = jnl->tr_freeme;
        jnl->tr_freeme = tr;

        unlock_oldstart(jnl);

        unlock_condition(jnl, &jnl->asyncIO);
    }

    //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
    //       tr, tr->journal_start, tr->journal_end);

bad_journal:
    if (ret_val == -1) {
        abort_transaction(jnl, tr); // cleans up list of extents to be trimmed

        /*
         * 'flush_aborted' is protected by the flushing condition... we need to
         * set it before dropping the condition so that it will be
         * noticed in 'end_transaction'... we add this additional
         * aborted condition so that we can drop the 'flushing' condition
         * before grabbing the journal lock... this avoids a deadlock
         * in 'end_transaction' which is holding the journal lock while
         * waiting for the 'flushing' condition to clear...
         * everyone else will notice the JOURNAL_INVALID flag
         */
        jnl->flush_aborted = TRUE;

        unlock_condition(jnl, &jnl->flushing);
        journal_lock(jnl);

        jnl->flags |= JOURNAL_INVALID;
        jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL;

        journal_unlock(jnl);
    } else
        unlock_condition(jnl, &jnl->flushing);

    return (ret_val);
}
2591 | static off_t free_space(journal *jnl) { | |
2592 | off_t free_space_offset; | |
2593 | ||
2594 | if (jnl->jhdr->start < jnl->jhdr->end) { | |
2595 | free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size; | |
2596 | } else if (jnl->jhdr->start > jnl->jhdr->end) { | |
2597 | free_space_offset = jnl->jhdr->start - jnl->jhdr->end; | |
2598 | } else { | |
2599 | // journal is completely empty | |
2600 | free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size; | |
2601 | } | |
2602 | ||
2603 | return free_space_offset; | |
2604 | } | |
2605 | ||
// Debug helper: print the journal header fields and the list of
// completed (written but not yet reclaimed) transactions to stdout.
// Called from check_free_space() just before panicking when buffer
// flushing appears stuck.
static void dump_journal(journal *jnl) {
    transaction *ctr;

    printf("  jdev_offset %.8llx\n", jnl->jdev_offset);
    printf("  magic: 0x%.8x\n", jnl->jhdr->magic);
    printf("  start: 0x%.8llx\n", jnl->jhdr->start);
    printf("  end:   0x%.8llx\n", jnl->jhdr->end);
    printf("  size:  0x%.8llx\n", jnl->jhdr->size);
    printf("  blhdr size: %d\n", jnl->jhdr->blhdr_size);
    printf("  jhdr size: %d\n", jnl->jhdr->jhdr_size);
    printf("  chksum: 0x%.8x\n", jnl->jhdr->checksum);

    printf("  completed transactions:\n");
    for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) {
        printf("    0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
    }
}
2623 | ||
// The journal must be locked on entry to this function.
// The "desired_size" is in bytes.
//
// Block until the on-disk journal has more than desired_size bytes
// free AND old_start[0] is available for reuse.  Space is reclaimed
// lazily: jhdr->start is bumped past old transactions recorded in
// jnl->old_start[] once they have finished flushing.  Returns 0 on
// success, ENOSPC if space never frees up.  When delayed_header_write
// is non-NULL, any journal-header update needed here is deferred to
// the caller by setting *delayed_header_write to TRUE.
static int check_free_space( journal *jnl,
                             int desired_size,
                             boolean_t *delayed_header_write,
                             uint32_t sequence_num) {

    size_t i;
    int counter=0;

    //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
    //       desired_size, free_space(jnl));

    if (delayed_header_write)
        *delayed_header_write = FALSE;

    while (1) {
        int old_start_empty;

        // make sure there's space in the journal to hold this transaction
        if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) {
            break;
        }
        // ~50 seconds of retries (10ms sleep per iteration below)
        if (counter++ == 5000) {
            dump_journal(jnl);
            panic("jnl: check_free_space: buffer flushing isn't working "
                  "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl,
                  jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
        }
        if (counter > 7500) {
            return ENOSPC;
        }

        // here's where we lazily bump up jnl->jhdr->start.  we'll consume
        // entries until there is enough space for the next transaction.
        old_start_empty = 1;
        lock_oldstart(jnl);

        for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
            int lcl_counter;

            lcl_counter = 0;
            // the high bit marks a transaction that is still flushing;
            // spin (dropping the lock around the sleep) until it clears
            while (jnl->old_start[i] & 0x8000000000000000LL) {
                if (lcl_counter++ > 10000) {
                    panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n",
                          jnl->old_start[i], jnl);
                }

                unlock_oldstart(jnl);
                if (jnl->flush) {
                    jnl->flush(jnl->flush_arg);
                }
                usleep(10000);
                lock_oldstart(jnl);
            }

            if (jnl->old_start[i] == 0) {
                continue;
            }

            // reclaim this entry: the journal now starts where that
            // old transaction started
            old_start_empty = 0;
            jnl->jhdr->start = jnl->old_start[i];
            jnl->old_start[i] = 0;

            if (free_space(jnl) > desired_size) {

                if (delayed_header_write)
                    *delayed_header_write = TRUE;
                else {
                    // write_journal_header may block; drop the oldstart
                    // lock around it
                    unlock_oldstart(jnl);
                    write_journal_header(jnl, 1, sequence_num);
                    lock_oldstart(jnl);
                }
                break;
            }
        }
        unlock_oldstart(jnl);

        // if we bumped the start, loop and try again
        if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
            continue;
        } else if (old_start_empty) {
            //
            // if there is nothing in old_start anymore then we can
            // bump the jhdr->start to be the same as active_start
            // since it is possible there was only one very large
            // transaction in the old_start array.  if we didn't do
            // this then jhdr->start would never get updated and we
            // would wind up looping until we hit the panic at the
            // start of the loop.
            //
            jnl->jhdr->start = jnl->active_start;

            if (delayed_header_write)
                *delayed_header_write = TRUE;
            else
                write_journal_header(jnl, 1, sequence_num);
            continue;
        }


        // if the file system gave us a flush function, call it to so that
        // it can flush some blocks which hopefully will cause some transactions
        // to complete and thus free up space in the journal.
        if (jnl->flush) {
            jnl->flush(jnl->flush_arg);
        }

        // wait for a while to avoid being cpu-bound (this will
        // put us to sleep for 10 milliseconds)
        usleep(10000);
    }

    return 0;
}
2739 | ||
2740 | static void lock_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name) { | |
2741 | ||
2742 | lock_flush(jnl); | |
2743 | ||
2744 | while (psCondFlag->uFlag) { | |
2745 | pthread_cond_wait(&psCondFlag->sCond, &jnl->flock); | |
2746 | } | |
2747 | ||
2748 | psCondFlag->uFlag = TRUE; | |
2749 | unlock_flush(jnl); | |
2750 | } | |
2751 | ||
2752 | static void wait_condition(journal *jnl, ConditionalFlag_S *psCondFlag, __unused const char *condition_name) { | |
2753 | ||
2754 | if (!psCondFlag->uFlag) | |
2755 | return; | |
2756 | ||
2757 | lock_flush(jnl); | |
2758 | ||
2759 | while (psCondFlag->uFlag) { | |
2760 | pthread_cond_wait(&psCondFlag->sCond, &jnl->flock); | |
2761 | } | |
2762 | ||
2763 | unlock_flush(jnl); | |
2764 | } | |
2765 | ||
2766 | static void unlock_condition(journal *jnl, ConditionalFlag_S *psCondFlag) { | |
2767 | lock_flush(jnl); | |
2768 | ||
2769 | psCondFlag->uFlag = FALSE; | |
2770 | pthread_cond_broadcast(&psCondFlag->sCond); | |
2771 | ||
2772 | unlock_flush(jnl); | |
2773 | } | |
2774 | ||
2775 | /* | |
2776 | * End a transaction: | |
2777 | * 1) Determine if it is time to commit the transaction or not: | |
2778 | * If the transaction is small enough, and we're not forcing | |
2779 | * a write to disk, the "active" transaction becomes the "current" transaction, | |
2780 | * and will be reused for the next transaction that is started (group commit). | |
2781 | * | |
2782 | * 2) Commit: | |
2783 | * If the transaction gets written to disk (because force_it is true, or no | |
2784 | * group commit, or the transaction is sufficiently full), the blocks get | |
2785 | * written into the journal first, then they are written to their final location | |
2786 | * asynchronously. When those async writes complete, the transaction can be freed | |
2787 | * and removed from the journal. | |
2788 | * | |
2789 | * 3) Callback: | |
 * An optional callback can be supplied. If given, it is called after
 * the blocks have been written to the journal, but before the async writes
2792 | * of those blocks to their normal on-disk locations. This is used by | |
2793 | * journal_relocate so that the location of the journal can be changed and | |
2794 | * flushed to disk before the blocks get written to their normal locations. | |
2795 | * Note that the callback is only called if the transaction gets written to | |
2796 | * the journal during this end_transaction call; you probably want to set the | |
2797 | * force_it flag. | |
2798 | * | |
2799 | * 4) Free blocks' Generic Buff. | |
2800 | * | |
2801 | * Inputs: | |
2802 | * tr Transaction to add to the journal | |
2803 | * force_it If true, force this transaction to the on-disk journal immediately. | |
2804 | * callback See description above. Pass NULL for no callback. | |
2805 | * callback_arg Argument passed to callback routine. | |
2806 | * | |
2807 | * Result | |
2808 | * 0 No errors | |
2809 | * -1 An error occurred. The journal is marked invalid. | |
2810 | */ | |
static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock) {

    block_list_header *blhdr=NULL, *next=NULL;
    int i, ret_val = 0;
    journal *jnl = tr->jnl;
    GenericLFBuf *bp;
    size_t tbuffer_offset;

    if (jnl->cur_tr) {
        panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n",
              jnl, jnl->cur_tr, tr);
    }

    // if there weren't any modified blocks in the transaction
    // just save off the transaction pointer and return.
    // (total_bytes == blhdr_size means only the header itself is used)
    if (tr->total_bytes == (int)jnl->jhdr->blhdr_size) {
        jnl->cur_tr = tr;
        goto done;
    }

    // if our transaction buffer isn't very full, just hang
    // on to it and don't actually flush anything.  this is
    // what is known as "group commit".  we will flush the
    // transaction buffer if it's full or if we have more than
    // one of them so we don't start hogging too much memory.
    //
    // We also check the device supports UNMAP/TRIM, and if so,
    // the number of extents waiting to be trimmed.  If it is
    // small enough, then keep accumulating more (so we can
    // reduce the overhead of trimming).  If there was a prior
    // trim error, then we stop issuing trims for this
    // volume, so we can also coalesce transactions.
    //
    if ( force_it == 0
        && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
        && tr->num_blhdrs < 3
        && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))
        && (!(jnl->flags & JOURNAL_USE_UNMAP) || (tr->trim.extent_count < jnl_trim_flush_limit))) {

        jnl->cur_tr = tr;
        goto done;
    }

    lock_condition(jnl, &jnl->flushing, "end_transaction");

    /*
     * if the previous 'finish_end_transaction' was being run
     * asynchronously, it could have encountered a condition
     * that caused it to mark the journal invalid... if that
     * occurred while we were waiting for it to finish, we
     * need to notice and abort the current transaction
     */
    if ((jnl->flags & JOURNAL_INVALID) || jnl->flush_aborted == TRUE) {
        unlock_condition(jnl, &jnl->flushing);

        abort_transaction(jnl, tr);
        ret_val = -1;
        goto done;
    }

    /*
     * Store a pointer to this transaction's trim list so that
     * future transactions can find it.
     *
     * Note: if there are no extents in the trim list, then don't
     * bother saving the pointer since nothing can add new extents
     * to the list (and other threads/transactions only care if
     * there is a trim pending).
     */
    lf_lck_rw_lock_exclusive(&jnl->trim_lock);
    if (jnl->async_trim != NULL)
        panic("jnl: end_transaction: async_trim already non-NULL!");
    if (tr->trim.extent_count > 0)
        jnl->async_trim = &tr->trim;
    lf_lck_rw_unlock_exclusive(&jnl->trim_lock);

    /*
     * snapshot the transaction sequence number while we are still behind
     * the journal lock since it will be bumped upon the start of the
     * next transaction group which may overlap the current journal flush...
     * we pass the snapshot into write_journal_header during the journal
     * flush so that it can write the correct version in the header...
     * because we hold the 'flushing' condition variable for the duration
     * of the journal flush, 'saved_sequence_num' remains stable
     */
    jnl->saved_sequence_num = jnl->sequence_num;

    /*
     * if we're here we're going to flush the transaction buffer to disk.
     * 'check_free_space' will not return until there is enough free
     * space for this transaction in the journal and jnl->old_start[0]
     * is available for use
     */
    check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num);

    // range check the end index
    if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
        panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
              jnl->jhdr->end, jnl->jhdr->size);
    }

    // this transaction starts where the current journal ends
    tr->journal_start = jnl->jhdr->end;

    lock_oldstart(jnl);
    /*
     * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy.
     * slide everyone else down and put our latest guy in the last
     * entry in the old_start array
     */
    memcpy(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void *), sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
    // high bit marks this entry as "still flushing"; finish_end_transaction
    // / check_free_space clear or wait on it
    jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;

    unlock_oldstart(jnl);

    // go over the blocks in the transaction.
    // for each block, call the fpCallback and copy the content into the journal buffer
    for (blhdr = tr->blhdr; blhdr; blhdr = next) {
        char *blkptr;
        size_t bsize;

        tbuffer_offset = jnl->jhdr->blhdr_size;

        for (i = 1; i < blhdr->num_blocks; i++) {

            if (blhdr->binfo[i].bnum != (off_t)-1) {

                bp = (GenericLFBuf*)blhdr->binfo[i].u.bp;

                if (bp == NULL) {
                    panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n",
                          blhdr->binfo[i].bnum, jnl, tr);
                }

                bsize = bp->uDataSize;

                // destination of this block's copy inside the transaction buffer
                blkptr = (char *)&((char *)blhdr)[tbuffer_offset];

                int iRet;
            retry:
                // spin on EAGAIN until we own the buffer
                iRet = lf_hfs_generic_buf_take_ownership(bp, NULL);
                if (iRet == EAGAIN) {
                    goto retry;
                } else if (iRet) {
                    LFHFS_LOG(LEVEL_ERROR, "jnl: end_transaction: lf_hfs_generic_buf_take_ownership returned %d.\n", iRet);
                    // NOTE(review): this error path returns without releasing
                    // the 'flushing' condition taken above -- confirm intended.
                    ret_val = -1;
                    goto done;
                }

                if (!(bp->uCacheFlags & GEN_BUF_WRITE_LOCK)) {
                    panic("GEN_BUF_WRITE_LOCK should be set!");
                }

                // Call the buffer callback
                if (bp->pfFunc) {
                    bp->pfFunc(bp, bp->pvCallbackArgs);
                    bp->pfFunc = NULL;
                }

                if (bp->uCacheFlags & GEN_BUF_LITTLE_ENDIAN) {
                    panic("We do not want to write a GEN_BUF_LITTLE_ENDIAN buffer to media!");
                }

                // copy the data into the transaction buffer...
                memcpy(blkptr, bp->pvData, bsize);

                blhdr->binfo[i].u.bp = (void*)bp;

            } else {
                // bnum == -1, only true if a block was "killed"
                bsize = blhdr->binfo[i].u.bi.bsize;
            }
            tbuffer_offset += bsize;
        }
        // binfo[0].bnum links to the next block_list_header in the chain
        next = (block_list_header *)((long)blhdr->binfo[0].bnum);
    }

#if HFS_CRASH_TEST
    CRASH_ABORT(CRASH_ABORT_JOURNAL_BEFORE_FINISH, jnl->fsmount->psHfsmount, NULL);
#endif

    ret_val = finish_end_transaction(tr, callback, callback_arg);

done:
    if (drop_lock == TRUE) {
        journal_unlock(jnl);
    }
    return (ret_val);
}
3000 | ||
// Tear down a transaction that will not be (or failed to be) committed:
// release every buffer it referenced, free its block list headers and
// trim-extent list, and finally free the transaction itself.  Called
// from the failure paths of end_transaction/finish_end_transaction.
static void abort_transaction(journal *jnl, transaction *tr) {

    block_list_header *blhdr, *next;
    // for each block list header, iterate over the blocks then
    // free up the memory associated with the block list.
    for (blhdr = tr->blhdr; blhdr; blhdr = next) {
        int i;

        for (i = 1; i < blhdr->num_blocks; i++) {
            GenericLFBufPtr bp;

            // bnum == -1 marks a "killed" entry with no buffer attached
            if (blhdr->binfo[i].bnum == (off_t)-1)
                continue;

            bp = (void*)blhdr->binfo[i].u.bp;

            // Release the buffers
            lf_hfs_generic_buf_clear_cache_flag(bp, GEN_BUF_WRITE_LOCK);
            if (lf_hfs_generic_buf_validate_owner(bp)) { // abort_transaction can be called before or after we take ownership
                lf_hfs_generic_buf_release(bp);
            }

        }
        // binfo[0].bnum doubles as the link to the next header; grab it
        // before poisoning and freeing this one
        next = (block_list_header *)((long)blhdr->binfo[0].bnum);

        // we can free blhdr here since we won't need it any more
        blhdr->binfo[0].bnum = 0xdeadc0de;  // poison to catch stale use
        hfs_free(blhdr);
    }

    /*
     * If the transaction we're aborting was the async transaction, then
     * tell the current transaction that there is no pending trim
     * any more.
     */
    lf_lck_rw_lock_exclusive(&jnl->trim_lock);
    if (jnl->async_trim == &tr->trim)
        jnl->async_trim = NULL;
    lf_lck_rw_unlock_exclusive(&jnl->trim_lock);


    if (tr->trim.extents) {
        hfs_free(tr->trim.extents);
    }
    tr->trim.allocated_count = 0;
    tr->trim.extent_count = 0;
    tr->trim.extents = NULL;
    tr->tbuffer = NULL;
    tr->blhdr = NULL;
    tr->total_bytes = 0xdbadc0de;  // poison to catch use-after-free of tr
    hfs_free(tr);
}
3053 | ||
// Byte-swap every field of the in-memory journal header in place.
// Used when the on-disk journal was written with the opposite
// endianness.  NOTE(review): SWAP32/SWAP64 appear to be unconditional
// swaps, so applying this twice restores the original values -- confirm
// against their definitions.
static void swap_journal_header(journal *jnl) {
    jnl->jhdr->magic = SWAP32(jnl->jhdr->magic);
    jnl->jhdr->endian = SWAP32(jnl->jhdr->endian);
    jnl->jhdr->start = SWAP64(jnl->jhdr->start);
    jnl->jhdr->end = SWAP64(jnl->jhdr->end);
    jnl->jhdr->size = SWAP64(jnl->jhdr->size);
    jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size);
    jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum);
    jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size);
    jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num);
}
3065 | ||
// this isn't a great checksum routine but it will do for now.
// we use it to checksum the journal header and the block list
// headers that are at the start of each transaction.
//
// NOTE: the exact mixing function is part of the on-disk journal
// format, so it must never change.
static unsigned int calc_checksum(const char *ptr, int len) {
    unsigned int cksum = 0;
    int i;

    // fold each byte in: shift the accumulator left, then xor with
    // (accumulator + byte) -- deliberately kept bit-for-bit identical
    // to the historical implementation
    for (i = 0; i < len; i++) {
        cksum = (cksum << 8) ^ (cksum + ((const unsigned char *)ptr)[i]);
    }

    return (~cksum);
}
3080 | ||
3081 | ||
// Transfer 'len' bytes between 'data' and the journal at *offset,
// splitting the I/O at the device's max transfer size and handling
// wrap-around of the circular journal.  'direction' is a mask of
// JNL_READ / JNL_WRITE, plus JNL_HEADER when the I/O is allowed to
// touch the journal header at offset 0.  On return *offset has been
// advanced (and possibly wrapped past the header).  Returns the number
// of bytes transferred.
static size_t do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction) {
    off_t curlen = len;
    size_t io_sz = 0;
    off_t max_iosize;
#if 0 // TBD
    int err;
    buf_t bp;
    off_t accumulated_offset = 0;
    ExtendedVCB *vcb = HFSTOVCB(jnl->fsmount->psHfsmount);
#endif

    if (*offset < 0 || *offset > jnl->jhdr->size) {
        panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
    }

    if (direction & JNL_WRITE)
        max_iosize = jnl->max_write_size;
    else if (direction & JNL_READ)
        max_iosize = jnl->max_read_size;
    else
        max_iosize = 128 * 1024;

again:

    // Determine the Current R/W Length, taking cyclic wrap around into account
    if (*offset + curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
        if (*offset == jnl->jhdr->size) {
            // exactly at the end: wrap to just past the journal header
            *offset = jnl->jhdr->jhdr_size;
        } else {
            // clamp this chunk at the end of the journal; the remainder
            // is transferred after wrapping (see "handle wrap-around")
            curlen = jnl->jhdr->size - *offset;
        }
    }

    if (curlen > max_iosize) {
        curlen = max_iosize;
    }

    if (curlen <= 0) {
        panic("jnl: do_jnl_io: curlen == %lld, offset 0x%llx len %zd\n", curlen, *offset, len);
    }

    if (*offset == 0 && (direction & JNL_HEADER) == 0) {
        panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %lld, data %p)\n", curlen, data);
    }


    // Perform the I/O
    uint64_t phyblksize = jnl->fsmount->psHfsmount->hfs_physical_block_size;
    uint64_t uBlkNum = jnl->jdev_blknum+(*offset)/phyblksize;

    if (direction & JNL_READ) {
        raw_readwrite_read_mount(jnl->jdev, uBlkNum, phyblksize, data, curlen, NULL, NULL);

    } else if (direction & JNL_WRITE) {
        raw_readwrite_write_mount(jnl->jdev, uBlkNum, phyblksize, data, curlen, NULL, NULL);
    }

    // Move to the next section
    *offset += curlen;
    io_sz += curlen;

    if (io_sz != len) {
        // handle wrap-around
        data = (char *)data + curlen;
        curlen = len - io_sz;
        if (*offset >= jnl->jhdr->size) {
            *offset = jnl->jhdr->jhdr_size;
        }
        goto again;
    }

    return io_sz;
}
3155 | ||
3156 | static size_t read_journal_header(journal *jnl, void *data, size_t len) { | |
3157 | off_t hdr_offset = 0; | |
3158 | ||
3159 | return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER); | |
3160 | } | |
3161 | ||
3162 | static void get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl) { | |
3163 | off_t readblockcnt; | |
3164 | off_t writeblockcnt; | |
3165 | off_t readmaxcnt=0, tmp_readmaxcnt; | |
3166 | off_t writemaxcnt=0, tmp_writemaxcnt; | |
3167 | off_t readsegcnt, writesegcnt; | |
3168 | ||
3169 | // First check the max read size via several different mechanisms... | |
3170 | ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt); | |
3171 | ||
3172 | if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt) == 0) { | |
3173 | tmp_readmaxcnt = readblockcnt * phys_blksz; | |
3174 | if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) { | |
3175 | readmaxcnt = tmp_readmaxcnt; | |
3176 | } | |
3177 | } | |
3178 | ||
3179 | if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt)) { | |
3180 | readsegcnt = 0; | |
3181 | } | |
3182 | ||
3183 | if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) { | |
3184 | readmaxcnt = readsegcnt * PAGE_SIZE; | |
3185 | } | |
3186 | ||
3187 | if (readmaxcnt == 0) { | |
3188 | readmaxcnt = 128 * 1024; | |
3189 | } else if (readmaxcnt > UINT32_MAX) { | |
3190 | readmaxcnt = UINT32_MAX; | |
3191 | } | |
3192 | ||
3193 | ||
3194 | // Now check the max writes size via several different mechanisms... | |
3195 | ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt); | |
3196 | ||
3197 | if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt) == 0) { | |
3198 | tmp_writemaxcnt = writeblockcnt * phys_blksz; | |
3199 | if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) { | |
3200 | writemaxcnt = tmp_writemaxcnt; | |
3201 | } | |
3202 | } | |
3203 | ||
3204 | if (ioctl(devvp->psFSRecord->iFD, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt)) { | |
3205 | writesegcnt = 0; | |
3206 | } | |
3207 | ||
3208 | if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) { | |
3209 | writemaxcnt = writesegcnt * PAGE_SIZE; | |
3210 | } | |
3211 | ||
3212 | if (writemaxcnt == 0) { | |
3213 | writemaxcnt = 128 * 1024; | |
3214 | } else if (writemaxcnt > UINT32_MAX) { | |
3215 | writemaxcnt = UINT32_MAX; | |
3216 | } | |
3217 | ||
3218 | jnl->max_read_size = readmaxcnt; | |
3219 | jnl->max_write_size = writemaxcnt; | |
3220 | } | |
3221 | ||
3222 | // this is a work function used to free up transactions that | |
3223 | // completed. they can't be free'd from buffer_flushed_callback | |
3224 | // because it is called from deep with the disk driver stack | |
3225 | // and thus can't do something that would potentially cause | |
3226 | // paging. it gets called by each of the journal api entry | |
3227 | // points so stuff shouldn't hang around for too long. | |
3228 | static void free_old_stuff(journal *jnl) { | |
3229 | transaction *tr, *next; | |
3230 | block_list_header *blhdr=NULL, *next_blhdr=NULL; | |
3231 | ||
3232 | if (jnl->tr_freeme == NULL) | |
3233 | return; | |
3234 | ||
3235 | lock_oldstart(jnl); | |
3236 | tr = jnl->tr_freeme; | |
3237 | jnl->tr_freeme = NULL; | |
3238 | unlock_oldstart(jnl); | |
3239 | ||
3240 | for(; tr; tr=next) { | |
3241 | for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) { | |
3242 | next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum); | |
3243 | blhdr->binfo[0].bnum = 0xdeadc0de; | |
3244 | ||
3245 | hfs_free(blhdr); | |
3246 | ||
3247 | KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0); | |
3248 | } | |
3249 | next = tr->next; | |
3250 | hfs_free(tr); | |
3251 | } | |
3252 | } | |
3253 | ||
3254 | // Allocate a new active transaction. | |
3255 | // The function does the following: | |
3256 | // 1) mallocs memory for a transaction structure and a buffer | |
3257 | // 2) initializes the transaction structure and the buffer (invalid CRC + 0x5a) | |
3258 | static errno_t journal_allocate_transaction(journal *jnl) { | |
3259 | transaction *tr; | |
3260 | ||
3261 | tr = hfs_mallocz(sizeof(transaction)); | |
3262 | ||
3263 | tr->tbuffer_size = jnl->tbuffer_size; | |
3264 | ||
3265 | tr->tbuffer = hfs_malloc(tr->tbuffer_size); | |
3266 | ||
3267 | // journal replay code checksum check depends on this. | |
3268 | memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE); | |
3269 | // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility) | |
3270 | memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); | |
3271 | ||
3272 | tr->blhdr = (block_list_header *)tr->tbuffer; | |
3273 | tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; | |
3274 | tr->blhdr->num_blocks = 1; // accounts for this header block | |
3275 | tr->blhdr->bytes_used = jnl->jhdr->blhdr_size; | |
3276 | tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER; | |
3277 | ||
3278 | tr->sequence_num = ++jnl->sequence_num; | |
3279 | tr->num_blhdrs = 1; | |
3280 | tr->total_bytes = jnl->jhdr->blhdr_size; | |
3281 | tr->jnl = jnl; | |
3282 | ||
3283 | jnl->active_tr = tr; | |
3284 | ||
3285 | return 0; | |
3286 | } | |
3287 | ||
/*
 * Drop a block from the currently active transaction so it will never be
 * written to the journal, then unlock and release the buffer.  Must be
 * called by the thread that owns the open transaction.  Returns 0 in all
 * cases (the buffer is always released, found or not).
 */
int journal_kill_block(journal *jnl, GenericLFBuf *psGenBuf) {
	int i;
	uint64_t uflags;
	block_list_header *blhdr;
	transaction *tr;

#if JOURNAL_DEBUG
	printf("journal_kill_block: psGenBuf %p, psVnode %p, uBlockN %llu, uDataSize %u, uPhyCluster %llu uLockCnt %u\n",
		   psGenBuf, psGenBuf->psVnode, psGenBuf->uBlockN, psGenBuf->uDataSize ,psGenBuf->uPhyCluster, psGenBuf->uLockCnt);
#endif

	CHECK_JOURNAL(jnl);
	free_old_stuff(jnl);

	// Journal already marked invalid: nothing to un-journal, just make
	// sure the buffer doesn't stay write-locked forever.
	if (jnl->flags & JOURNAL_INVALID) {
		lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
		lf_hfs_generic_buf_release(psGenBuf);
		return 0;
	}

	tr = jnl->active_tr;
	CHECK_TRANSACTION(tr);

	// Calling this without an open transaction owned by this thread is a
	// programming error, hence panic rather than an error return.
	if (jnl->owner != pthread_self()) {
		panic("jnl: journal_kill_block: called w/out a transaction! jnl %p, owner %p, curact %p\n",
			  jnl, jnl->owner, pthread_self());
	}

	uflags = psGenBuf->uCacheFlags;

	if ( !(uflags & GEN_BUF_WRITE_LOCK))
		panic("jnl: journal_kill_block: called with bp not B_LOCKED");

	/*
	 * bp must be BL_BUSY and B_LOCKED
	 * first check if it's already part of this transaction
	 */
	// Walk the chain of block-list headers; binfo[0].bnum of each header
	// holds the pointer to the next header (in-band link).
	for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {

		// Slot 0 is the header itself, so real entries start at index 1.
		for (i = 1; i < blhdr->num_blocks; i++) {
			if (psGenBuf == (void*)blhdr->binfo[i].u.bp) {

				// if the block has the DELWRI and FILTER bits sets, then
				// things are seriously weird. if it was part of another
				// transaction then journal_modify_block_start() should
				// have force it to be written.
				//
				//if ((bflags & B_DELWRI) && (bflags & B_FILTER)) {
				//	panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
				//} else {
					tr->num_killed += psGenBuf->uDataSize;
				//}
				// Neutralize the entry: bnum of -1 marks it as killed, and
				// the stored size keeps the transaction accounting intact.
				blhdr->binfo[i].bnum = (off_t)-1;
				blhdr->binfo[i].u.bp = NULL;
				blhdr->binfo[i].u.bi.bsize = psGenBuf->uDataSize;

				lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
				lf_hfs_generic_buf_release(psGenBuf);

				return 0;
			}
		}
	}

	/*
	 * We did not find the block in any transaction buffer but we still
	 * need to release it or else it will be left locked forever.
	 */
	lf_hfs_generic_buf_clear_cache_flag(psGenBuf, GEN_BUF_WRITE_LOCK);
	lf_hfs_generic_buf_release(psGenBuf);

	return 0;
}
3361 | ||
/*
 * Check whether the on-disk journal at (jvp, offset) is clean, i.e. has no
 * transactions waiting to be replayed.  Builds a throwaway stack-local
 * journal just far enough to read and validate the header.
 *
 * Returns:
 *   0      - journal is valid and clean (start == end)
 *   EBUSY  - journal is valid but has pending transactions
 *   EINVAL - parameters or on-disk header are invalid
 */
int journal_is_clean(struct vnode *jvp,
					 off_t         offset,
					 off_t         journal_size,
					 struct vnode *fsvp,
					 size_t        min_fs_block_size,
					 struct mount *fsmount) {

	journal jnl;
	uint32_t phys_blksz;
	int ret;
	int orig_checksum, checksum;

	/* Get the real physical block size. */
	if (ioctl(jvp->psFSRecord->iFD, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz)) {
		LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: failed to get device block size.\n");
		ret = EINVAL;
		goto cleanup_jdev_name;
	}

	// The journal I/O granularity must not exceed the fs block size.
	if (phys_blksz > (uint32_t)min_fs_block_size) {
		LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: error: phys blksize %d bigger than min fs blksize %zd\n",
				  phys_blksz, min_fs_block_size);
		ret = EINVAL;
		goto cleanup_jdev_name;
	}

	// Sanity-bound the journal size (256 KiB .. MAX_JOURNAL_SIZE).
	if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) {
		LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal size %lld looks bogus.\n", journal_size);
		ret = EINVAL;
		goto cleanup_jdev_name;
	}

	if ((journal_size % phys_blksz) != 0) {
		LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n",
				  journal_size, phys_blksz);
		ret = EINVAL;
		goto cleanup_jdev_name;
	}

	// Minimal journal setup: just enough state for read_journal_header().
	memset(&jnl, 0, sizeof(jnl));

	jnl.header_buf = hfs_malloc(phys_blksz);
	jnl.header_buf_size = phys_blksz;

	// Keep a pointer to the mount around for use in IO throttling.
	jnl.fsmount = fsmount;

	get_io_info(jvp, phys_blksz, &jnl);

	jnl.jhdr = (journal_header *)jnl.header_buf;
	memset(jnl.jhdr, 0, sizeof(journal_header));

	jnl.jdev = jvp;
	jnl.jdev_offset = offset;
	jnl.jdev_blknum = (uint32_t)(offset / phys_blksz);
	jnl.fsdev = fsvp;

	// we have to set this up here so that do_journal_io() will work
	jnl.jhdr->jhdr_size = phys_blksz;

	if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) {
		LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: could not read %d bytes for the journal header.\n",
				  phys_blksz);
		ret = EINVAL;
		goto get_out;
	}

	// The checksum covers the header with its checksum field zeroed.
	orig_checksum = jnl.jhdr->checksum;
	jnl.jhdr->checksum = 0;

	if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) {
		// do this before the swap since it's done byte-at-a-time
		orig_checksum = SWAP32(orig_checksum);
		checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
		swap_journal_header(&jnl);
		jnl.flags |= JOURNAL_NEED_SWAP;
	} else {
		checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE);
	}

	// By this point the header is in native byte order, so both the
	// current and the legacy magic can be compared directly.
	if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
		LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal magic is bad (0x%x != 0x%x)\n",
				  jnl.jhdr->magic, JOURNAL_HEADER_MAGIC);
		ret = EINVAL;
		goto get_out;
	}

	if (orig_checksum != checksum) {
		LFHFS_LOG(LEVEL_ERROR, "jnl: journal_is_clean: journal checksum is bad (0x%x != 0x%x)\n", orig_checksum, checksum);
		ret = EINVAL;
		goto get_out;
	}

	//
	// if the start and end are equal then the journal is clean.
	// otherwise it's not clean and therefore an error.
	//
	if (jnl.jhdr->start == jnl.jhdr->end) {
		ret = 0;
	} else {
		ret = EBUSY;	// so the caller can differentiate an invalid journal from a "busy" one
	}

get_out:
	hfs_free(jnl.header_buf);
cleanup_jdev_name:
	return ret;
}
3470 | ||
3471 | uint32_t journal_current_txn(journal *jnl) { | |
3472 | return jnl->sequence_num + (jnl->active_tr || jnl->cur_tr ? 0 : 1); | |
3473 | } | |
3474 |