]> git.saurik.com Git - apple/xnu.git/blob - bsd/dev/vn/shadow.c
cb5fbfd6da7b51c13c9afcb17e03ae27550c72b9
[apple/xnu.git] / bsd / dev / vn / shadow.c
1
2 /*
3 * Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
4 *
5 * @APPLE_LICENSE_HEADER_START@
6 *
7 * The contents of this file constitute Original Code as defined in and
8 * are subject to the Apple Public Source License Version 1.1 (the
9 * "License"). You may not use this file except in compliance with the
10 * License. Please obtain a copy of the License at
11 * http://www.apple.com/publicsource and read it before using this file.
12 *
13 * This Original Code and all software distributed under the License are
14 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
18 * License for the specific language governing rights and limitations
19 * under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /*
25 * shadow.c
26 *
27 * Implement copy-on-write shadow map to allow a disk image to be
28 * mounted read-only, yet be writable by transferring writes to a
29 * "shadow" file. Subsequent reads from blocks that have been
30 * written will then go the "shadow" file.
31 *
32 * The map has two parts:
33 * 1) a bit map to track which blocks have been written
34 * 2) a band map to map a "band" within the original file to a corresponding
35 * "band" in the shadow file. Each band has the same size.
36 *
37 * The band map is used to ensure that blocks that are contiguous in the
38 * original file will remain contiguous in the shadow file.
39 *
40 * For debugging purposes, this file can be compiled standalone using:
41 * cc -o shadow shadow.c -DTEST_SHADOW
42 */
43
44 /*
45 * Modification History
46 *
47 * December 21, 2001 Dieter Siegmund (dieter@apple.com)
48 * - initial revision
49 */
50 #include <sys/param.h>
51 #include <sys/types.h>
52 #include <mach/boolean.h>
53
54 #include <string.h>
55
56 #ifdef TEST_SHADOW
57 #include <unistd.h>
58 #include <stdlib.h>
59 #define my_malloc(a) malloc(a)
60 #define my_free(a) free(a)
61 #else /* !TEST_SHADOW */
62 #include <sys/malloc.h>
63 #define my_malloc(a) _MALLOC(a, M_TEMP, M_WAITOK)
64 #define my_free(a) FREE(a, M_TEMP)
65 #include <libkern/libkern.h>
66 #endif /* TEST_SHADOW */
67
68 #include "shadow.h"
69
70 #define ULONG_ALL_ONES ((u_long)(-1))
71 #define USHORT_ALL_ONES ((u_short)(-1))
72 #define UCHAR_ALL_ONES ((u_char)(-1))
73
74 #define my_trunc(value, divisor) ((value) / (divisor) * (divisor))
75
76 /* a band size of 128K can represent a file up to 8GB */
77 #define BAND_SIZE_DEFAULT_POWER_2 17
78 #define BAND_SIZE_DEFAULT (1 << BAND_SIZE_DEFAULT_POWER_2)
79
80 typedef u_short band_number_t;
81 #define BAND_ZERO ((band_number_t)0)
82 #define BAND_MAX ((band_number_t)65535)
83
84 struct shadow_map {
85 u_long blocks_per_band;/* size in blocks */
86 u_long block_size;
87 u_char * block_bitmap; /* 1 bit per block; 1=written */
88 band_number_t * bands; /* band map array */
89 u_long file_size_blocks; /* size of file in bands */
90 u_long shadow_size_bands; /* size of shadow in bands */
91 u_long next_band; /* next free band */
92 u_long zeroth_band; /* special-case 0th band */
93 };
94
95
96 typedef struct {
97 u_long byte;
98 u_long bit;
99 } bitmap_offset_t;
100
101 static __inline__ u_char
102 bit(int b)
103 {
104 return ((u_char)(1 << b));
105 }
106
107 /*
108 * Function: bits_lower
109 * Purpose:
110 * Return a byte value in which bits numbered lower than 'b' are set.
111 */
112 static __inline__ u_char
113 bits_lower(int b)
114 {
115 return ((u_char)(bit(b) - 1));
116 }
117
118 /*
119 * Function: byte_set_bits
120 * Purpose:
121 * Set the given range of bits within a byte.
122 */
123 static __inline__ u_char
124 byte_set_bits(int start, int end)
125 {
126 return ((u_char)((~bits_lower(start)) & (bits_lower(end) | bit(end))));
127 }
128
129 static __inline__ bitmap_offset_t
130 bitmap_offset(off_t where)
131 {
132 bitmap_offset_t b;
133
134 b.byte = where / NBBY;
135 b.bit = where % NBBY;
136 return (b);
137 }
138
139 /*
140 * Function: bitmap_set
141 *
142 * Purpose:
143 * Set the given range of bits.
144 *
145 * This algorithm tries to set the extents using the biggest
146 * units, using longs, then a short, then a byte, then bits.
147 */
148 static void
149 bitmap_set(u_char * map, u_long start_bit, u_long bit_count)
150 {
151 bitmap_offset_t start;
152 bitmap_offset_t end;
153
154 start = bitmap_offset(start_bit);
155 end = bitmap_offset(start_bit + bit_count);
156 if (start.byte < end.byte) {
157 u_long n_bytes;
158
159 if (start.bit) {
160 map[start.byte] |= byte_set_bits(start.bit, NBBY - 1);
161 start.bit = 0;
162 start.byte++;
163 if (start.byte == end.byte)
164 goto end;
165 }
166
167 n_bytes = end.byte - start.byte;
168
169 while (n_bytes >= (sizeof(u_long))) {
170 *((u_long *)(map + start.byte)) = ULONG_ALL_ONES;
171 start.byte += sizeof(u_long);
172 n_bytes -= sizeof(u_long);
173 }
174 if (n_bytes >= sizeof(u_short)) {
175 *((u_short *)(map + start.byte)) = USHORT_ALL_ONES;
176 start.byte += sizeof(u_short);
177 n_bytes -= sizeof(u_short);
178 }
179 if (n_bytes == 1) {
180 map[start.byte] = UCHAR_ALL_ONES;
181 start.byte++;
182 n_bytes = 0;
183 }
184 }
185
186 end:
187 if (end.bit > start.bit) {
188 map[start.byte] |= byte_set_bits(start.bit, end.bit - 1);
189 }
190
191 return;
192 }
193
194 /*
195 * Function: bitmap_get
196 *
197 * Purpose:
198 * Return the number of bits in the range that are the same e.g.
199 * 11101 returns 3 because the first 3 bits are the same (1's), whereas
200 * 001100 returns 2 because the first 2 bits are the same.
201 * This algorithm tries to count things in as big a chunk as possible,
202 * first aligning to a byte offset, then trying to count longs, a short,
203 * a byte, then any remaining bits to find the bit that is different.
204 */
205
206 static u_long
207 bitmap_get(u_char * map, u_long start_bit, u_long bit_count,
208 boolean_t * ret_is_set)
209 {
210 u_long count;
211 int i;
212 boolean_t is_set;
213 bitmap_offset_t start;
214 bitmap_offset_t end;
215
216 start = bitmap_offset(start_bit);
217 end = bitmap_offset(start_bit + bit_count);
218
219 is_set = (map[start.byte] & bit(start.bit)) ? TRUE : FALSE;
220 count = 0;
221
222 if (start.byte < end.byte) {
223 u_long n_bytes;
224
225 if (start.bit) { /* try to align to a byte */
226 for (i = start.bit; i < NBBY; i++) {
227 boolean_t this_is_set;
228
229 this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE;
230 if (this_is_set != is_set) {
231 goto done; /* found bit that was different, we're done */
232 }
233 count++;
234 }
235 start.bit = 0; /* made it to the next byte */
236 start.byte++;
237 if (start.byte == end.byte)
238 goto end; /* no more bytes, check for any leftover bits */
239 }
240 /* calculate how many bytes are left in the range */
241 n_bytes = end.byte - start.byte;
242
243 /* check for 4 bytes of the same bits */
244 while (n_bytes >= sizeof(u_long)) {
245 u_long * valPtr = (u_long *)(map + start.byte);
246 if ((is_set && *valPtr == ULONG_ALL_ONES)
247 || (!is_set && *valPtr == 0)) {
248 count += sizeof(*valPtr) * NBBY;
249 start.byte += sizeof(*valPtr);
250 n_bytes -= sizeof(*valPtr);
251 }
252 else
253 break; /* bits differ */
254
255 }
256 /* check for 2 bytes of the same bits */
257 if (n_bytes >= sizeof(u_short)) {
258 u_short * valPtr = (u_short *)(map + start.byte);
259
260 if ((is_set && *valPtr == USHORT_ALL_ONES)
261 || (!is_set && (*valPtr == 0))) {
262 count += sizeof(*valPtr) * NBBY;
263 start.byte += sizeof(*valPtr);
264 n_bytes -= sizeof(*valPtr);
265 }
266 }
267
268 /* check for 1 byte of the same bits */
269 if (n_bytes) {
270 if ((is_set && map[start.byte] == UCHAR_ALL_ONES)
271 || (!is_set && map[start.byte] == 0)) {
272 count += NBBY;
273 start.byte++;
274 n_bytes--;
275 }
276 /* we found bits that were different, find the first one */
277 if (n_bytes) {
278 for (i = 0; i < NBBY; i++) {
279 boolean_t this_is_set;
280
281 this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE;
282 if (this_is_set != is_set) {
283 break;
284 }
285 count++;
286 }
287 goto done;
288 }
289 }
290 }
291
292 end:
293 for (i = start.bit; i < (int)end.bit; i++) {
294 boolean_t this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE;
295
296 if (this_is_set != is_set) {
297 break;
298 }
299 count++;
300 }
301
302 done:
303 *ret_is_set = is_set;
304 return (count);
305 }
306
307 static __inline__ band_number_t
308 shadow_map_block_to_band(shadow_map_t * map, unsigned long block)
309 {
310 return (block / map->blocks_per_band);
311 }
312
313 /*
314 * Function: shadow_map_mapped_band
315 * Purpose:
316 * Return the mapped band for the given band.
317 * If map_it is FALSE, and the band is not mapped, return FALSE.
318 * If map_it is TRUE, then this function will always return TRUE.
319 */
320 static boolean_t
321 shadow_map_mapped_band(shadow_map_t * map, band_number_t band,
322 boolean_t map_it, band_number_t * mapped_band)
323 {
324 boolean_t is_mapped = FALSE;
325
326 if (band == map->zeroth_band) {
327 *mapped_band = BAND_ZERO;
328 is_mapped = TRUE;
329 }
330 else {
331 *mapped_band = map->bands[band];
332 if (*mapped_band == BAND_ZERO) {
333 if (map_it) {
334 /* grow the file */
335 if (map->next_band == 0) {
336 /* remember the zero'th band */
337 map->zeroth_band = band;
338 }
339 *mapped_band = map->bands[band] = map->next_band++;
340 is_mapped = TRUE;
341 }
342 }
343 else {
344 is_mapped = TRUE;
345 }
346 }
347 return (is_mapped);
348 }
349
350 /*
351 * Function: shadow_map_contiguous
352 *
353 * Purpose:
354 * Return the first offset within the range position..(position + count)
355 * that is not a contiguous mapped band.
356 *
357 * If called with is_write = TRUE, this function will map bands as it goes.
358 */
359 static u_long
360 shadow_map_contiguous(shadow_map_t * map, u_long start_block,
361 u_long num_blocks, boolean_t is_write)
362 {
363 band_number_t band = shadow_map_block_to_band(map, start_block);
364 u_long end_block = start_block + num_blocks;
365 boolean_t is_mapped;
366 band_number_t mapped_band;
367 u_long ret_end_block = end_block;
368 u_long p;
369
370 is_mapped = shadow_map_mapped_band(map, band, is_write, &mapped_band);
371 if (is_write == FALSE && is_mapped == FALSE) {
372 static int happened = 0;
373 /* this can't happen */
374 if (happened == 0) {
375 printf("shadow_map_contiguous: this can't happen!\n");
376 happened = 1;
377 }
378 return (start_block);
379 }
380 for (p = my_trunc(start_block + map->blocks_per_band,
381 map->blocks_per_band);
382 p < end_block; p += map->blocks_per_band) {
383 band_number_t next_mapped_band;
384
385 band++;
386 is_mapped = shadow_map_mapped_band(map, band, is_write,
387 &next_mapped_band);
388 if (is_write == FALSE && is_mapped == FALSE) {
389 return (p);
390 }
391 if ((mapped_band + 1) != next_mapped_band) {
392 /* not contiguous */
393 ret_end_block = p;
394 break;
395 }
396 mapped_band = next_mapped_band;
397 }
398 return (ret_end_block);
399 }
400
401
402 /*
403 * Function: block_bitmap_size
404 * Purpose:
405 * The number of bytes required in a block bitmap to represent a file of size
406 * file_size.
407 *
408 * The bytes required is the number of blocks in the file,
409 * divided by the number of bits per byte.
410 * Note:
411 * An 8GB file requires (assuming 512 byte block):
412 * 2^33 / 2^9 / 2^3 = 2^21 = 2MB
413 * of bitmap space. This is a non-trival amount of memory,
414 * particularly since most of the bits will be zero.
415 * A sparse bitmap would really help in this case.
416 */
417 static __inline__ u_long
418 block_bitmap_size(off_t file_size, u_long block_size)
419 {
420 off_t blocks = howmany(file_size, block_size);
421 return (howmany(blocks, NBBY));
422 }
423
424 /*
425 * Function: shadow_map_read
426 *
427 * Purpose:
428 * Calculate the block offset within the shadow to read, and the number
429 * blocks to read. The input values (block_offset, block_count) refer
430 * to the original file.
431 *
432 * The output values (*incr_block_offset, *incr_block_count) refer to the
433 * shadow file if the return value is TRUE. They refer to the original
434 * file if the return value is FALSE.
435
436 * Blocks within a band may or may not have been written, in addition,
437 * Bands are not necessarily contiguous, therefore:
438 * *incr_block_count <= block_count
439 * The caller must be prepared to call this function interatively
440 * to complete the whole i/o.
441 * Returns:
442 * TRUE if the shadow file should be read, FALSE if the original file
443 * should be read.
444 */
445 boolean_t
446 shadow_map_read(shadow_map_t * map, u_long block_offset, u_long block_count,
447 u_long * incr_block_offset, u_long * incr_block_count)
448 {
449 boolean_t written = FALSE;
450 u_long n_blocks;
451
452 if (block_offset >= map->file_size_blocks
453 || (block_offset + block_count) > map->file_size_blocks) {
454 printf("shadow_map_read: request (%ld, %ld) exceeds file size %ld\n",
455 block_offset, block_count, map->file_size_blocks);
456 *incr_block_count = 0;
457 }
458 n_blocks = bitmap_get(map->block_bitmap, block_offset, block_count,
459 &written);
460 if (written == FALSE) {
461 *incr_block_count = n_blocks;
462 *incr_block_offset = block_offset;
463 }
464 else { /* start has been written, and therefore mapped */
465 band_number_t mapped_band;
466 u_long band_limit;
467
468 mapped_band = map->bands[shadow_map_block_to_band(map, block_offset)];
469 *incr_block_offset = mapped_band * map->blocks_per_band
470 + (block_offset % map->blocks_per_band);
471 band_limit
472 = shadow_map_contiguous(map, block_offset, block_count, FALSE);
473 *incr_block_count = band_limit - block_offset;
474 if (*incr_block_count > n_blocks) {
475 *incr_block_count = n_blocks;
476 }
477 }
478 return (written);
479 }
480
481 /*
482 * Function: shadow_map_write
483 *
484 * Purpose:
485 * Calculate the block offset within the shadow to write, and the number
486 * blocks to write. The input values (block_offset, block_count) refer
487 * to the original file. The output values
488 * (*incr_block_offset, *incr_block_count) refer to the shadow file.
489 *
490 * Bands are not necessarily contiguous, therefore:
491 * *incr_block_count <= block_count
492 * The caller must be prepared to call this function interatively
493 * to complete the whole i/o.
494 * Returns:
495 * TRUE if the shadow file was grown, FALSE otherwise.
496 */
497 boolean_t
498 shadow_map_write(shadow_map_t * map, u_long block_offset,
499 u_long block_count, u_long * incr_block_offset,
500 u_long * incr_block_count)
501 {
502 u_long band_limit;
503 band_number_t mapped_band;
504 boolean_t shadow_grew = FALSE;
505
506 if (block_offset >= map->file_size_blocks
507 || (block_offset + block_count) > map->file_size_blocks) {
508 printf("shadow_map_write: request (%ld, %ld) exceeds file size %ld\n",
509 block_offset, block_count, map->file_size_blocks);
510 *incr_block_count = 0;
511 }
512
513 band_limit = shadow_map_contiguous(map, block_offset, block_count, TRUE);
514 mapped_band = map->bands[shadow_map_block_to_band(map, block_offset)];
515 *incr_block_offset = mapped_band * map->blocks_per_band
516 + (block_offset % map->blocks_per_band);
517 *incr_block_count = band_limit - block_offset;
518
519 /* mark these blocks as written */
520 bitmap_set(map->block_bitmap, block_offset, *incr_block_count);
521
522 if (map->next_band > map->shadow_size_bands) {
523 map->shadow_size_bands = map->next_band;
524 shadow_grew = TRUE;
525 }
526 return (shadow_grew);
527 }
528
529 boolean_t
530 shadow_map_is_written(shadow_map_t * map, u_long block_offset)
531 {
532 bitmap_offset_t b;
533
534 b = bitmap_offset(block_offset);
535 return ((map->block_bitmap[b.byte] & bit(b.bit)) ? TRUE : FALSE);
536 }
537
538 /*
539 * Function: shadow_map_shadow_size
540 *
541 * Purpose:
542 * To return the size of the shadow file in blocks.
543 */
544 u_long
545 shadow_map_shadow_size(shadow_map_t * map)
546 {
547 return (map->shadow_size_bands * map->blocks_per_band);
548 }
549
550 /*
551 * Function: shadow_map_create
552 *
553 * Purpose:
554 * Allocate the dynamic data for keeping track of the shadow dirty blocks
555 * and the band mapping table.
556 * Returns:
557 * NULL if an error occurred.
558 */
559 shadow_map_t *
560 shadow_map_create(off_t file_size, off_t shadow_size,
561 u_long band_size, u_long block_size)
562 {
563 void * block_bitmap = 0;
564 u_long bitmap_size;
565 band_number_t * bands = 0;
566 shadow_map_t * map;
567 u_long n_bands = 0;
568
569 if (band_size == 0) {
570 band_size = BAND_SIZE_DEFAULT;
571 }
572
573 n_bands = howmany(file_size, band_size);
574 if (n_bands > (BAND_MAX + 1)) {
575 printf("file is too big: %ld > %d\n",
576 n_bands, BAND_MAX);
577 goto failure;
578 }
579
580 /* create a block bitmap, one bit per block */
581 bitmap_size = block_bitmap_size(file_size, block_size);
582 block_bitmap = my_malloc(bitmap_size);
583 if (block_bitmap == NULL) {
584 printf("failed to allocate bitmap\n");
585 goto failure;
586 }
587 bzero(block_bitmap, bitmap_size);
588
589 /* get the band map */
590 bands = (band_number_t *)my_malloc(n_bands * sizeof(band_number_t));
591 if (bands == NULL) {
592 printf("failed to allocate bands\n");
593 goto failure;
594 }
595 bzero(bands, n_bands * sizeof(band_number_t));
596
597 map = my_malloc(sizeof(*map));
598 if (map == NULL) {
599 printf("failed to allocate map\n");
600 goto failure;
601 }
602 map->blocks_per_band = band_size / block_size;
603 map->block_bitmap = block_bitmap;
604 map->bands = bands;
605 map->file_size_blocks = n_bands * map->blocks_per_band;
606 map->next_band = 0;
607 map->zeroth_band = -1;
608 map->shadow_size_bands = howmany(shadow_size, band_size);
609 map->block_size = block_size;
610 return (map);
611
612 failure:
613 if (block_bitmap)
614 my_free(block_bitmap);
615 if (bands)
616 my_free(bands);
617 return (NULL);
618 }
619
620 /*
621 * Function: shadow_map_free
622 * Purpose:
623 * Frees the data structure to deal with the shadow map.
624 */
625 void
626 shadow_map_free(shadow_map_t * map)
627 {
628 if (map->block_bitmap)
629 my_free(map->block_bitmap);
630 if (map->bands)
631 my_free(map->bands);
632 map->block_bitmap = 0;
633 map->bands = 0;
634 my_free(map);
635 return;
636 }
637
638 #ifdef TEST_SHADOW
639 #define BAND_SIZE_BLOCKS (BAND_SIZE_DEFAULT / 512)
640
641 enum {
642 ReadRequest,
643 WriteRequest,
644 };
645
646 typedef struct {
647 int type;
648 u_long offset;
649 u_long count;
650 } block_request_t;
651
652 int
653 main()
654 {
655 shadow_map_t * map;
656 int i;
657 block_request_t requests[] = {
658 { WriteRequest, BAND_SIZE_BLOCKS * 2, 1 },
659 { ReadRequest, BAND_SIZE_BLOCKS / 2, BAND_SIZE_BLOCKS * 2 - 2 },
660 { WriteRequest, BAND_SIZE_BLOCKS * 1, 5 * BAND_SIZE_BLOCKS + 3},
661 { ReadRequest, 0, BAND_SIZE_BLOCKS * 10 },
662 { WriteRequest, BAND_SIZE_BLOCKS * (BAND_MAX - 1),
663 BAND_SIZE_BLOCKS * 2},
664 { 0, 0 },
665 };
666
667 map = shadow_map_create(1024 * 1024 * 1024 * 8ULL, 0, 0, 512);
668 if (map == NULL) {
669 printf("shadow_map_create failed\n");
670 exit(1);
671 }
672 for (i = 0; TRUE; i++) {
673 u_long offset;
674 u_long resid;
675 boolean_t shadow_grew;
676 boolean_t read_shadow;
677
678 if (requests[i].count == 0) {
679 break;
680 }
681 offset = requests[i].offset;
682 resid = requests[i].count;
683 printf("\n%s REQUEST (%ld, %ld)\n",
684 requests[i].type == WriteRequest ? "WRITE" : "READ",
685 offset, resid);
686 switch (requests[i].type) {
687 case WriteRequest:
688 while (resid > 0) {
689 u_long this_offset;
690 u_long this_count;
691
692 shadow_grew = shadow_map_write(map, offset,
693 resid,
694 &this_offset,
695 &this_count);
696 printf("\t(%ld, %ld) => (%ld, %ld)",
697 offset, resid, this_offset, this_count);
698 resid -= this_count;
699 offset += this_count;
700 if (shadow_grew) {
701 printf(" shadow grew to %ld", shadow_map_shadow_size(map));
702 }
703 printf("\n");
704 }
705 break;
706 case ReadRequest:
707 while (resid > 0) {
708 u_long this_offset;
709 u_long this_count;
710
711 read_shadow = shadow_map_read(map, offset,
712 resid,
713 &this_offset,
714 &this_count);
715 printf("\t(%ld, %ld) => (%ld, %ld)%s\n",
716 offset, resid, this_offset, this_count,
717 read_shadow ? " from shadow" : "");
718 if (this_count == 0) {
719 printf("this_count is 0, aborting\n");
720 break;
721 }
722 resid -= this_count;
723 offset += this_count;
724 }
725 break;
726 default:
727 break;
728 }
729 }
730 if (map) {
731 shadow_map_free(map);
732 }
733 exit(0);
734 return (0);
735 }
736 #endif