2 * Copyright (c) 2001-2006 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
32 * Implement copy-on-write shadow map to allow a disk image to be
33 * mounted read-only, yet be writable by transferring writes to a
34 * "shadow" file. Subsequent reads from blocks that have been
35 * written will then go the "shadow" file.
37 * The map has two parts:
38 * 1) a bit map to track which blocks have been written
39 * 2) a band map to map a "band" within the original file to a corresponding
40 * "band" in the shadow file. Each band has the same size.
42 * The band map is used to ensure that blocks that are contiguous in the
43 * original file will remain contiguous in the shadow file.
45 * For debugging purposes, this file can be compiled standalone using:
46 * cc -o shadow shadow.c -DTEST_SHADOW
50 * Modification History
52 * December 21, 2001 Dieter Siegmund (dieter@apple.com)
55 #include <sys/param.h>
56 #include <sys/types.h>
57 #include <mach/boolean.h>
64 #define my_malloc(a) malloc(a)
65 #define my_free(a) free(a)
66 #else /* !TEST_SHADOW */
67 #include <sys/malloc.h>
68 #define my_malloc(a) _MALLOC(a, M_TEMP, M_WAITOK)
69 #define my_free(a) FREE(a, M_TEMP)
70 #include <libkern/libkern.h>
71 #endif /* TEST_SHADOW */
75 #define UINT32_ALL_ONES ((uint32_t)(-1))
76 #define USHORT_ALL_ONES ((u_short)(-1))
77 #define UCHAR_ALL_ONES ((u_char)(-1))
79 #define my_trunc(value, divisor) ((value) / (divisor) * (divisor))
81 /* a band size of 128K can represent a file up to 8GB */
82 #define BAND_SIZE_DEFAULT_POWER_2 17
83 #define BAND_SIZE_DEFAULT (1 << BAND_SIZE_DEFAULT_POWER_2)
85 typedef u_short band_number_t
;
86 #define BAND_ZERO ((band_number_t)0)
87 #define BAND_MAX ((band_number_t)65535)
90 uint32_t blocks_per_band
;/* size in blocks */
92 u_char
* block_bitmap
; /* 1 bit per block; 1=written */
93 band_number_t
* bands
; /* band map array */
94 uint32_t file_size_blocks
; /* size of file in bands */
95 uint32_t shadow_size_bands
; /* size of shadow in bands */
96 uint32_t next_band
; /* next free band */
97 uint32_t zeroth_band
; /* special-case 0th band */
106 static __inline__ u_char
109 return (u_char
)(1 << b
);
113 * Function: bits_lower
115 * Return a byte value in which bits numbered lower than 'b' are set.
117 static __inline__ u_char
120 return (u_char
)(bit(b
) - 1);
124 * Function: byte_set_bits
126 * Set the given range of bits within a byte.
128 static __inline__ u_char
129 byte_set_bits(int start
, int end
)
131 return (u_char
)((~bits_lower(start
)) & (bits_lower(end
) | bit(end
)));
134 static __inline__ bitmap_offset_t
135 bitmap_offset(off_t where
)
139 b
.byte
= where
/ NBBY
;
140 b
.bit
= where
% NBBY
;
145 * Function: bitmap_set
148 * Set the given range of bits.
150 * This algorithm tries to set the extents using the biggest
151 * units, using longs, then a short, then a byte, then bits.
154 bitmap_set(u_char
* map
, uint32_t start_bit
, uint32_t bit_count
)
156 bitmap_offset_t start
;
159 start
= bitmap_offset(start_bit
);
160 end
= bitmap_offset(start_bit
+ bit_count
);
161 if (start
.byte
< end
.byte
) {
165 map
[start
.byte
] |= byte_set_bits(start
.bit
, NBBY
- 1);
168 if (start
.byte
== end
.byte
) {
173 n_bytes
= end
.byte
- start
.byte
;
175 while (n_bytes
>= (sizeof(uint32_t))) {
176 *((uint32_t *)(map
+ start
.byte
)) = UINT32_ALL_ONES
;
177 start
.byte
+= sizeof(uint32_t);
178 n_bytes
-= sizeof(uint32_t);
180 if (n_bytes
>= sizeof(u_short
)) {
181 *((u_short
*)(map
+ start
.byte
)) = USHORT_ALL_ONES
;
182 start
.byte
+= sizeof(u_short
);
183 n_bytes
-= sizeof(u_short
);
186 map
[start
.byte
] = UCHAR_ALL_ONES
;
193 if (end
.bit
> start
.bit
) {
194 map
[start
.byte
] |= byte_set_bits(start
.bit
, end
.bit
- 1);
201 * Function: bitmap_get
204 * Return the number of bits in the range that are the same e.g.
205 * 11101 returns 3 because the first 3 bits are the same (1's), whereas
206 * 001100 returns 2 because the first 2 bits are the same.
207 * This algorithm tries to count things in as big a chunk as possible,
208 * first aligning to a byte offset, then trying to count longs, a short,
209 * a byte, then any remaining bits to find the bit that is different.
213 bitmap_get(u_char
* map
, uint32_t start_bit
, uint32_t bit_count
,
214 boolean_t
* ret_is_set
)
219 bitmap_offset_t start
;
222 start
= bitmap_offset(start_bit
);
223 end
= bitmap_offset(start_bit
+ bit_count
);
225 is_set
= (map
[start
.byte
] & bit(start
.bit
)) ? TRUE
: FALSE
;
228 if (start
.byte
< end
.byte
) {
231 if (start
.bit
) { /* try to align to a byte */
232 for (i
= start
.bit
; i
< NBBY
; i
++) {
233 boolean_t this_is_set
;
235 this_is_set
= (map
[start
.byte
] & bit(i
)) ? TRUE
: FALSE
;
236 if (this_is_set
!= is_set
) {
237 goto done
; /* found bit that was different, we're done */
241 start
.bit
= 0; /* made it to the next byte */
243 if (start
.byte
== end
.byte
) {
244 goto end
; /* no more bytes, check for any leftover bits */
247 /* calculate how many bytes are left in the range */
248 n_bytes
= end
.byte
- start
.byte
;
250 /* check for 4 bytes of the same bits */
251 while (n_bytes
>= sizeof(uint32_t)) {
252 uint32_t * valPtr
= (uint32_t *)(map
+ start
.byte
);
253 if ((is_set
&& *valPtr
== UINT32_ALL_ONES
)
254 || (!is_set
&& *valPtr
== 0)) {
255 count
+= sizeof(*valPtr
) * NBBY
;
256 start
.byte
+= sizeof(*valPtr
);
257 n_bytes
-= sizeof(*valPtr
);
259 break; /* bits differ */
262 /* check for 2 bytes of the same bits */
263 if (n_bytes
>= sizeof(u_short
)) {
264 u_short
* valPtr
= (u_short
*)(map
+ start
.byte
);
266 if ((is_set
&& *valPtr
== USHORT_ALL_ONES
)
267 || (!is_set
&& (*valPtr
== 0))) {
268 count
+= sizeof(*valPtr
) * NBBY
;
269 start
.byte
+= sizeof(*valPtr
);
270 n_bytes
-= sizeof(*valPtr
);
274 /* check for 1 byte of the same bits */
276 if ((is_set
&& map
[start
.byte
] == UCHAR_ALL_ONES
)
277 || (!is_set
&& map
[start
.byte
] == 0)) {
282 /* we found bits that were different, find the first one */
284 for (i
= 0; i
< NBBY
; i
++) {
285 boolean_t this_is_set
;
287 this_is_set
= (map
[start
.byte
] & bit(i
)) ? TRUE
: FALSE
;
288 if (this_is_set
!= is_set
) {
299 for (i
= start
.bit
; i
< (int)end
.bit
; i
++) {
300 boolean_t this_is_set
= (map
[start
.byte
] & bit(i
)) ? TRUE
: FALSE
;
302 if (this_is_set
!= is_set
) {
309 *ret_is_set
= is_set
;
313 static __inline__ band_number_t
314 shadow_map_block_to_band(shadow_map_t
* map
, uint32_t block
)
316 return block
/ map
->blocks_per_band
;
320 * Function: shadow_map_mapped_band
322 * Return the mapped band for the given band.
323 * If map_it is FALSE, and the band is not mapped, return FALSE.
324 * If map_it is TRUE, then this function will always return TRUE.
327 shadow_map_mapped_band(shadow_map_t
* map
, band_number_t band
,
328 boolean_t map_it
, band_number_t
* mapped_band
)
330 boolean_t is_mapped
= FALSE
;
332 if (band
== map
->zeroth_band
) {
333 *mapped_band
= BAND_ZERO
;
336 *mapped_band
= map
->bands
[band
];
337 if (*mapped_band
== BAND_ZERO
) {
340 if (map
->next_band
== 0) {
341 /* remember the zero'th band */
342 map
->zeroth_band
= band
;
344 *mapped_band
= map
->bands
[band
] = map
->next_band
++;
355 * Function: shadow_map_contiguous
358 * Return the first offset within the range position..(position + count)
359 * that is not a contiguous mapped band.
361 * If called with is_write = TRUE, this function will map bands as it goes.
364 shadow_map_contiguous(shadow_map_t
* map
, uint32_t start_block
,
365 uint32_t num_blocks
, boolean_t is_write
)
367 band_number_t band
= shadow_map_block_to_band(map
, start_block
);
368 uint32_t end_block
= start_block
+ num_blocks
;
370 band_number_t mapped_band
;
371 uint32_t ret_end_block
= end_block
;
374 is_mapped
= shadow_map_mapped_band(map
, band
, is_write
, &mapped_band
);
375 if (is_write
== FALSE
&& is_mapped
== FALSE
) {
376 static int happened
= 0;
377 /* this can't happen */
379 printf("shadow_map_contiguous: this can't happen!\n");
384 for (p
= my_trunc(start_block
+ map
->blocks_per_band
,
385 map
->blocks_per_band
);
386 p
< end_block
; p
+= map
->blocks_per_band
) {
387 band_number_t next_mapped_band
;
390 is_mapped
= shadow_map_mapped_band(map
, band
, is_write
,
392 if (is_write
== FALSE
&& is_mapped
== FALSE
) {
395 if ((mapped_band
+ 1) != next_mapped_band
) {
400 mapped_band
= next_mapped_band
;
402 return ret_end_block
;
407 * Function: block_bitmap_size
409 * The number of bytes required in a block bitmap to represent a file of size
412 * The bytes required is the number of blocks in the file,
413 * divided by the number of bits per byte.
415 * An 8GB file requires (assuming 512 byte block):
416 * 2^33 / 2^9 / 2^3 = 2^21 = 2MB
417 * of bitmap space. This is a non-trival amount of memory,
418 * particularly since most of the bits will be zero.
419 * A sparse bitmap would really help in this case.
421 static __inline__
uint32_t
422 block_bitmap_size(off_t file_size
, uint32_t block_size
)
424 off_t blocks
= howmany(file_size
, block_size
);
425 return howmany(blocks
, NBBY
);
429 * Function: shadow_map_read
432 * Calculate the block offset within the shadow to read, and the number
433 * blocks to read. The input values (block_offset, block_count) refer
434 * to the original file.
436 * The output values (*incr_block_offset, *incr_block_count) refer to the
437 * shadow file if the return value is TRUE. They refer to the original
438 * file if the return value is FALSE.
440 * Blocks within a band may or may not have been written, in addition,
441 * Bands are not necessarily contiguous, therefore:
442 * *incr_block_count <= block_count
443 * The caller must be prepared to call this function interatively
444 * to complete the whole i/o.
446 * TRUE if the shadow file should be read, FALSE if the original file
450 shadow_map_read(shadow_map_t
* map
, uint32_t block_offset
, uint32_t block_count
,
451 uint32_t * incr_block_offset
, uint32_t * incr_block_count
)
453 boolean_t written
= FALSE
;
456 if (block_offset
>= map
->file_size_blocks
457 || (block_offset
+ block_count
) > map
->file_size_blocks
) {
458 printf("shadow_map_read: request (%d, %d) exceeds file size %d\n",
459 block_offset
, block_count
, map
->file_size_blocks
);
460 *incr_block_count
= 0;
462 n_blocks
= bitmap_get(map
->block_bitmap
, block_offset
, block_count
,
464 if (written
== FALSE
) {
465 *incr_block_count
= n_blocks
;
466 *incr_block_offset
= block_offset
;
467 } else { /* start has been written, and therefore mapped */
468 band_number_t mapped_band
;
471 mapped_band
= map
->bands
[shadow_map_block_to_band(map
, block_offset
)];
472 *incr_block_offset
= mapped_band
* map
->blocks_per_band
473 + (block_offset
% map
->blocks_per_band
);
475 = shadow_map_contiguous(map
, block_offset
, block_count
, FALSE
);
476 *incr_block_count
= band_limit
- block_offset
;
477 if (*incr_block_count
> n_blocks
) {
478 *incr_block_count
= n_blocks
;
485 * Function: shadow_map_write
488 * Calculate the block offset within the shadow to write, and the number
489 * blocks to write. The input values (block_offset, block_count) refer
490 * to the original file. The output values
491 * (*incr_block_offset, *incr_block_count) refer to the shadow file.
493 * Bands are not necessarily contiguous, therefore:
494 * *incr_block_count <= block_count
495 * The caller must be prepared to call this function interatively
496 * to complete the whole i/o.
498 * TRUE if the shadow file was grown, FALSE otherwise.
501 shadow_map_write(shadow_map_t
* map
, uint32_t block_offset
,
502 uint32_t block_count
, uint32_t * incr_block_offset
,
503 uint32_t * incr_block_count
)
506 band_number_t mapped_band
;
507 boolean_t shadow_grew
= FALSE
;
509 if (block_offset
>= map
->file_size_blocks
510 || (block_offset
+ block_count
) > map
->file_size_blocks
) {
511 printf("shadow_map_write: request (%d, %d) exceeds file size %d\n",
512 block_offset
, block_count
, map
->file_size_blocks
);
513 *incr_block_count
= 0;
516 band_limit
= shadow_map_contiguous(map
, block_offset
, block_count
, TRUE
);
517 mapped_band
= map
->bands
[shadow_map_block_to_band(map
, block_offset
)];
518 *incr_block_offset
= mapped_band
* map
->blocks_per_band
519 + (block_offset
% map
->blocks_per_band
);
520 *incr_block_count
= band_limit
- block_offset
;
522 /* mark these blocks as written */
523 bitmap_set(map
->block_bitmap
, block_offset
, *incr_block_count
);
525 if (map
->next_band
> map
->shadow_size_bands
) {
526 map
->shadow_size_bands
= map
->next_band
;
533 shadow_map_is_written(shadow_map_t
* map
, uint32_t block_offset
)
537 b
= bitmap_offset(block_offset
);
538 return (map
->block_bitmap
[b
.byte
] & bit(b
.bit
)) ? TRUE
: FALSE
;
542 * Function: shadow_map_shadow_size
545 * To return the size of the shadow file in blocks.
548 shadow_map_shadow_size(shadow_map_t
* map
)
550 return map
->shadow_size_bands
* map
->blocks_per_band
;
554 * Function: shadow_map_create
557 * Allocate the dynamic data for keeping track of the shadow dirty blocks
558 * and the band mapping table.
560 * NULL if an error occurred.
563 shadow_map_create(off_t file_size
, off_t shadow_size
,
564 uint32_t band_size
, uint32_t block_size
)
566 void * block_bitmap
= NULL
;
567 uint32_t bitmap_size
;
568 band_number_t
* bands
= NULL
;
570 uint32_t n_bands
= 0;
572 if (band_size
== 0) {
573 band_size
= BAND_SIZE_DEFAULT
;
576 n_bands
= howmany(file_size
, band_size
);
577 if (n_bands
> (BAND_MAX
+ 1)) {
578 printf("file is too big: %d > %d\n",
583 /* create a block bitmap, one bit per block */
584 bitmap_size
= block_bitmap_size(file_size
, block_size
);
585 block_bitmap
= my_malloc(bitmap_size
);
586 if (block_bitmap
== NULL
) {
587 printf("failed to allocate bitmap\n");
590 bzero(block_bitmap
, bitmap_size
);
592 /* get the band map */
593 bands
= (band_number_t
*)my_malloc(n_bands
* sizeof(band_number_t
));
595 printf("failed to allocate bands\n");
598 bzero(bands
, n_bands
* sizeof(band_number_t
));
600 map
= my_malloc(sizeof(*map
));
602 printf("failed to allocate map\n");
605 map
->blocks_per_band
= band_size
/ block_size
;
606 map
->block_bitmap
= block_bitmap
;
608 map
->file_size_blocks
= n_bands
* map
->blocks_per_band
;
610 map
->zeroth_band
= -1;
611 map
->shadow_size_bands
= howmany(shadow_size
, band_size
);
612 map
->block_size
= block_size
;
617 my_free(block_bitmap
);
626 * Function: shadow_map_free
628 * Frees the data structure to deal with the shadow map.
631 shadow_map_free(shadow_map_t
* map
)
633 if (map
->block_bitmap
) {
634 my_free(map
->block_bitmap
);
639 map
->block_bitmap
= NULL
;
646 #define BAND_SIZE_BLOCKS (BAND_SIZE_DEFAULT / 512)
664 block_request_t requests
[] = {
665 { WriteRequest
, BAND_SIZE_BLOCKS
* 2, 1 },
666 { ReadRequest
, BAND_SIZE_BLOCKS
/ 2, BAND_SIZE_BLOCKS
* 2 - 2 },
667 { WriteRequest
, BAND_SIZE_BLOCKS
* 1, 5 * BAND_SIZE_BLOCKS
+ 3},
668 { ReadRequest
, 0, BAND_SIZE_BLOCKS
* 10 },
669 { WriteRequest
, BAND_SIZE_BLOCKS
* (BAND_MAX
- 1),
670 BAND_SIZE_BLOCKS
* 2},
674 map
= shadow_map_create(1024 * 1024 * 1024 * 8ULL, 0, 0, 512);
676 printf("shadow_map_create failed\n");
679 for (i
= 0; TRUE
; i
++) {
682 boolean_t shadow_grew
;
683 boolean_t read_shadow
;
685 if (requests
[i
].count
== 0) {
688 offset
= requests
[i
].offset
;
689 resid
= requests
[i
].count
;
690 printf("\n%s REQUEST (%ld, %ld)\n",
691 requests
[i
].type
== WriteRequest
? "WRITE" : "READ",
693 switch (requests
[i
].type
) {
696 uint32_t this_offset
;
699 shadow_grew
= shadow_map_write(map
, offset
,
703 printf("\t(%ld, %ld) => (%ld, %ld)",
704 offset
, resid
, this_offset
, this_count
);
706 offset
+= this_count
;
708 printf(" shadow grew to %ld", shadow_map_shadow_size(map
));
715 uint32_t this_offset
;
718 read_shadow
= shadow_map_read(map
, offset
,
722 printf("\t(%ld, %ld) => (%ld, %ld)%s\n",
723 offset
, resid
, this_offset
, this_count
,
724 read_shadow
? " from shadow" : "");
725 if (this_count
== 0) {
726 printf("this_count is 0, aborting\n");
730 offset
+= this_count
;
738 shadow_map_free(map
);