3 * Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
5 * @APPLE_LICENSE_HEADER_START@
7 * The contents of this file constitute Original Code as defined in and
8 * are subject to the Apple Public Source License Version 1.1 (the
9 * "License"). You may not use this file except in compliance with the
10 * License. Please obtain a copy of the License at
11 * http://www.apple.com/publicsource and read it before using this file.
13 * This Original Code and all software distributed under the License are
14 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
18 * License for the specific language governing rights and limitations
21 * @APPLE_LICENSE_HEADER_END@
27 * Implement copy-on-write shadow map to allow a disk image to be
28 * mounted read-only, yet be writable by transferring writes to a
29 * "shadow" file. Subsequent reads from blocks that have been
30 * written will then go the "shadow" file.
32 * The map has two parts:
33 * 1) a bit map to track which blocks have been written
34 * 2) a band map to map a "band" within the original file to a corresponding
35 * "band" in the shadow file. Each band has the same size.
37 * The band map is used to ensure that blocks that are contiguous in the
38 * original file will remain contiguous in the shadow file.
40 * For debugging purposes, this file can be compiled standalone using:
41 * cc -o shadow shadow.c -DTEST_SHADOW
45 * Modification History
47 * December 21, 2001 Dieter Siegmund (dieter@apple.com)
50 #include <sys/param.h>
51 #include <sys/types.h>
52 #include <mach/boolean.h>
59 #define my_malloc(a) malloc(a)
60 #define my_free(a) free(a)
61 #else /* !TEST_SHADOW */
62 #include <sys/malloc.h>
63 #define my_malloc(a) _MALLOC(a, M_TEMP, M_WAITOK)
64 #define my_free(a) FREE(a, M_TEMP)
65 #include <libkern/libkern.h>
66 #endif /* TEST_SHADOW */
70 #define ULONG_ALL_ONES ((u_long)(-1))
71 #define USHORT_ALL_ONES ((u_short)(-1))
72 #define UCHAR_ALL_ONES ((u_char)(-1))
74 #define my_trunc(value, divisor) ((value) / (divisor) * (divisor))
76 /* a band size of 128K can represent a file up to 8GB */
77 #define BAND_SIZE_DEFAULT_POWER_2 17
78 #define BAND_SIZE_DEFAULT (1 << BAND_SIZE_DEFAULT_POWER_2)
80 typedef u_short band_number_t
;
81 #define BAND_ZERO ((band_number_t)0)
82 #define BAND_MAX ((band_number_t)65535)
85 u_long blocks_per_band
;/* size in blocks */
87 u_char
* block_bitmap
; /* 1 bit per block; 1=written */
88 band_number_t
* bands
; /* band map array */
89 u_long file_size_blocks
; /* size of file in bands */
90 u_long shadow_size_bands
; /* size of shadow in bands */
91 u_long next_band
; /* next free band */
92 u_long zeroth_band
; /* special-case 0th band */
101 static __inline__ u_char
104 return ((u_char
)(1 << b
));
108 * Function: bits_lower
110 * Return a byte value in which bits numbered lower than 'b' are set.
112 static __inline__ u_char
115 return ((u_char
)(bit(b
) - 1));
119 * Function: byte_set_bits
121 * Set the given range of bits within a byte.
123 static __inline__ u_char
124 byte_set_bits(int start
, int end
)
126 return ((u_char
)((~bits_lower(start
)) & (bits_lower(end
) | bit(end
))));
129 static __inline__ bitmap_offset_t
130 bitmap_offset(off_t where
)
134 b
.byte
= where
/ NBBY
;
135 b
.bit
= where
% NBBY
;
140 * Function: bitmap_set
143 * Set the given range of bits.
145 * This algorithm tries to set the extents using the biggest
146 * units, using longs, then a short, then a byte, then bits.
149 bitmap_set(u_char
* map
, u_long start_bit
, u_long bit_count
)
151 bitmap_offset_t start
;
154 start
= bitmap_offset(start_bit
);
155 end
= bitmap_offset(start_bit
+ bit_count
);
156 if (start
.byte
< end
.byte
) {
160 map
[start
.byte
] |= byte_set_bits(start
.bit
, NBBY
- 1);
163 if (start
.byte
== end
.byte
)
167 n_bytes
= end
.byte
- start
.byte
;
169 while (n_bytes
>= (sizeof(u_long
))) {
170 *((u_long
*)(map
+ start
.byte
)) = ULONG_ALL_ONES
;
171 start
.byte
+= sizeof(u_long
);
172 n_bytes
-= sizeof(u_long
);
174 if (n_bytes
>= sizeof(u_short
)) {
175 *((u_short
*)(map
+ start
.byte
)) = USHORT_ALL_ONES
;
176 start
.byte
+= sizeof(u_short
);
177 n_bytes
-= sizeof(u_short
);
180 map
[start
.byte
] = UCHAR_ALL_ONES
;
187 if (end
.bit
> start
.bit
) {
188 map
[start
.byte
] |= byte_set_bits(start
.bit
, end
.bit
- 1);
195 * Function: bitmap_get
198 * Return the number of bits in the range that are the same e.g.
199 * 11101 returns 3 because the first 3 bits are the same (1's), whereas
200 * 001100 returns 2 because the first 2 bits are the same.
201 * This algorithm tries to count things in as big a chunk as possible,
202 * first aligning to a byte offset, then trying to count longs, a short,
203 * a byte, then any remaining bits to find the bit that is different.
207 bitmap_get(u_char
* map
, u_long start_bit
, u_long bit_count
,
208 boolean_t
* ret_is_set
)
213 bitmap_offset_t start
;
216 start
= bitmap_offset(start_bit
);
217 end
= bitmap_offset(start_bit
+ bit_count
);
219 is_set
= (map
[start
.byte
] & bit(start
.bit
)) ? TRUE
: FALSE
;
222 if (start
.byte
< end
.byte
) {
225 if (start
.bit
) { /* try to align to a byte */
226 for (i
= start
.bit
; i
< NBBY
; i
++) {
227 boolean_t this_is_set
;
229 this_is_set
= (map
[start
.byte
] & bit(i
)) ? TRUE
: FALSE
;
230 if (this_is_set
!= is_set
) {
231 goto done
; /* found bit that was different, we're done */
235 start
.bit
= 0; /* made it to the next byte */
237 if (start
.byte
== end
.byte
)
238 goto end
; /* no more bytes, check for any leftover bits */
240 /* calculate how many bytes are left in the range */
241 n_bytes
= end
.byte
- start
.byte
;
243 /* check for 4 bytes of the same bits */
244 while (n_bytes
>= sizeof(u_long
)) {
245 u_long
* valPtr
= (u_long
*)(map
+ start
.byte
);
246 if ((is_set
&& *valPtr
== ULONG_ALL_ONES
)
247 || (!is_set
&& *valPtr
== 0)) {
248 count
+= sizeof(*valPtr
) * NBBY
;
249 start
.byte
+= sizeof(*valPtr
);
250 n_bytes
-= sizeof(*valPtr
);
253 break; /* bits differ */
256 /* check for 2 bytes of the same bits */
257 if (n_bytes
>= sizeof(u_short
)) {
258 u_short
* valPtr
= (u_short
*)(map
+ start
.byte
);
260 if ((is_set
&& *valPtr
== USHORT_ALL_ONES
)
261 || (!is_set
&& (*valPtr
== 0))) {
262 count
+= sizeof(*valPtr
) * NBBY
;
263 start
.byte
+= sizeof(*valPtr
);
264 n_bytes
-= sizeof(*valPtr
);
268 /* check for 1 byte of the same bits */
270 if ((is_set
&& map
[start
.byte
] == UCHAR_ALL_ONES
)
271 || (!is_set
&& map
[start
.byte
] == 0)) {
276 /* we found bits that were different, find the first one */
278 for (i
= 0; i
< NBBY
; i
++) {
279 boolean_t this_is_set
;
281 this_is_set
= (map
[start
.byte
] & bit(i
)) ? TRUE
: FALSE
;
282 if (this_is_set
!= is_set
) {
293 for (i
= start
.bit
; i
< (int)end
.bit
; i
++) {
294 boolean_t this_is_set
= (map
[start
.byte
] & bit(i
)) ? TRUE
: FALSE
;
296 if (this_is_set
!= is_set
) {
303 *ret_is_set
= is_set
;
307 static __inline__ band_number_t
308 shadow_map_block_to_band(shadow_map_t
* map
, unsigned long block
)
310 return (block
/ map
->blocks_per_band
);
314 * Function: shadow_map_mapped_band
316 * Return the mapped band for the given band.
317 * If map_it is FALSE, and the band is not mapped, return FALSE.
318 * If map_it is TRUE, then this function will always return TRUE.
321 shadow_map_mapped_band(shadow_map_t
* map
, band_number_t band
,
322 boolean_t map_it
, band_number_t
* mapped_band
)
324 boolean_t is_mapped
= FALSE
;
326 if (band
== map
->zeroth_band
) {
327 *mapped_band
= BAND_ZERO
;
331 *mapped_band
= map
->bands
[band
];
332 if (*mapped_band
== BAND_ZERO
) {
335 if (map
->next_band
== 0) {
336 /* remember the zero'th band */
337 map
->zeroth_band
= band
;
339 *mapped_band
= map
->bands
[band
] = map
->next_band
++;
351 * Function: shadow_map_contiguous
354 * Return the first offset within the range position..(position + count)
355 * that is not a contiguous mapped band.
357 * If called with is_write = TRUE, this function will map bands as it goes.
360 shadow_map_contiguous(shadow_map_t
* map
, u_long start_block
,
361 u_long num_blocks
, boolean_t is_write
)
363 band_number_t band
= shadow_map_block_to_band(map
, start_block
);
364 u_long end_block
= start_block
+ num_blocks
;
366 band_number_t mapped_band
;
367 u_long ret_end_block
= end_block
;
370 is_mapped
= shadow_map_mapped_band(map
, band
, is_write
, &mapped_band
);
371 if (is_write
== FALSE
&& is_mapped
== FALSE
) {
372 static int happened
= 0;
373 /* this can't happen */
375 printf("shadow_map_contiguous: this can't happen!\n");
378 return (start_block
);
380 for (p
= my_trunc(start_block
+ map
->blocks_per_band
,
381 map
->blocks_per_band
);
382 p
< end_block
; p
+= map
->blocks_per_band
) {
383 band_number_t next_mapped_band
;
386 is_mapped
= shadow_map_mapped_band(map
, band
, is_write
,
388 if (is_write
== FALSE
&& is_mapped
== FALSE
) {
391 if ((mapped_band
+ 1) != next_mapped_band
) {
396 mapped_band
= next_mapped_band
;
398 return (ret_end_block
);
403 * Function: block_bitmap_size
405 * The number of bytes required in a block bitmap to represent a file of size
408 * The bytes required is the number of blocks in the file,
409 * divided by the number of bits per byte.
411 * An 8GB file requires (assuming 512 byte block):
412 * 2^33 / 2^9 / 2^3 = 2^21 = 2MB
413 * of bitmap space. This is a non-trival amount of memory,
414 * particularly since most of the bits will be zero.
415 * A sparse bitmap would really help in this case.
417 static __inline__ u_long
418 block_bitmap_size(off_t file_size
, u_long block_size
)
420 off_t blocks
= howmany(file_size
, block_size
);
421 return (howmany(blocks
, NBBY
));
425 * Function: shadow_map_read
428 * Calculate the block offset within the shadow to read, and the number
429 * blocks to read. The input values (block_offset, block_count) refer
430 * to the original file.
432 * The output values (*incr_block_offset, *incr_block_count) refer to the
433 * shadow file if the return value is TRUE. They refer to the original
434 * file if the return value is FALSE.
436 * Blocks within a band may or may not have been written, in addition,
437 * Bands are not necessarily contiguous, therefore:
438 * *incr_block_count <= block_count
439 * The caller must be prepared to call this function interatively
440 * to complete the whole i/o.
442 * TRUE if the shadow file should be read, FALSE if the original file
446 shadow_map_read(shadow_map_t
* map
, u_long block_offset
, u_long block_count
,
447 u_long
* incr_block_offset
, u_long
* incr_block_count
)
449 boolean_t written
= FALSE
;
452 if (block_offset
>= map
->file_size_blocks
453 || (block_offset
+ block_count
) > map
->file_size_blocks
) {
454 printf("shadow_map_read: request (%ld, %ld) exceeds file size %ld\n",
455 block_offset
, block_count
, map
->file_size_blocks
);
456 *incr_block_count
= 0;
458 n_blocks
= bitmap_get(map
->block_bitmap
, block_offset
, block_count
,
460 if (written
== FALSE
) {
461 *incr_block_count
= n_blocks
;
462 *incr_block_offset
= block_offset
;
464 else { /* start has been written, and therefore mapped */
465 band_number_t mapped_band
;
468 mapped_band
= map
->bands
[shadow_map_block_to_band(map
, block_offset
)];
469 *incr_block_offset
= mapped_band
* map
->blocks_per_band
470 + (block_offset
% map
->blocks_per_band
);
472 = shadow_map_contiguous(map
, block_offset
, block_count
, FALSE
);
473 *incr_block_count
= band_limit
- block_offset
;
474 if (*incr_block_count
> n_blocks
) {
475 *incr_block_count
= n_blocks
;
482 * Function: shadow_map_write
485 * Calculate the block offset within the shadow to write, and the number
486 * blocks to write. The input values (block_offset, block_count) refer
487 * to the original file. The output values
488 * (*incr_block_offset, *incr_block_count) refer to the shadow file.
490 * Bands are not necessarily contiguous, therefore:
491 * *incr_block_count <= block_count
492 * The caller must be prepared to call this function interatively
493 * to complete the whole i/o.
495 * TRUE if the shadow file was grown, FALSE otherwise.
498 shadow_map_write(shadow_map_t
* map
, u_long block_offset
,
499 u_long block_count
, u_long
* incr_block_offset
,
500 u_long
* incr_block_count
)
503 band_number_t mapped_band
;
504 boolean_t shadow_grew
= FALSE
;
506 if (block_offset
>= map
->file_size_blocks
507 || (block_offset
+ block_count
) > map
->file_size_blocks
) {
508 printf("shadow_map_write: request (%ld, %ld) exceeds file size %ld\n",
509 block_offset
, block_count
, map
->file_size_blocks
);
510 *incr_block_count
= 0;
513 band_limit
= shadow_map_contiguous(map
, block_offset
, block_count
, TRUE
);
514 mapped_band
= map
->bands
[shadow_map_block_to_band(map
, block_offset
)];
515 *incr_block_offset
= mapped_band
* map
->blocks_per_band
516 + (block_offset
% map
->blocks_per_band
);
517 *incr_block_count
= band_limit
- block_offset
;
519 /* mark these blocks as written */
520 bitmap_set(map
->block_bitmap
, block_offset
, *incr_block_count
);
522 if (map
->next_band
> map
->shadow_size_bands
) {
523 map
->shadow_size_bands
= map
->next_band
;
526 return (shadow_grew
);
530 shadow_map_is_written(shadow_map_t
* map
, u_long block_offset
)
534 b
= bitmap_offset(block_offset
);
535 return ((map
->block_bitmap
[b
.byte
] & bit(b
.bit
)) ? TRUE
: FALSE
);
539 * Function: shadow_map_shadow_size
542 * To return the size of the shadow file in blocks.
545 shadow_map_shadow_size(shadow_map_t
* map
)
547 return (map
->shadow_size_bands
* map
->blocks_per_band
);
551 * Function: shadow_map_create
554 * Allocate the dynamic data for keeping track of the shadow dirty blocks
555 * and the band mapping table.
557 * NULL if an error occurred.
560 shadow_map_create(off_t file_size
, off_t shadow_size
,
561 u_long band_size
, u_long block_size
)
563 void * block_bitmap
= 0;
565 band_number_t
* bands
= 0;
569 if (band_size
== 0) {
570 band_size
= BAND_SIZE_DEFAULT
;
573 n_bands
= howmany(file_size
, band_size
);
574 if (n_bands
> (BAND_MAX
+ 1)) {
575 printf("file is too big: %ld > %d\n",
580 /* create a block bitmap, one bit per block */
581 bitmap_size
= block_bitmap_size(file_size
, block_size
);
582 block_bitmap
= my_malloc(bitmap_size
);
583 if (block_bitmap
== NULL
) {
584 printf("failed to allocate bitmap\n");
587 bzero(block_bitmap
, bitmap_size
);
589 /* get the band map */
590 bands
= (band_number_t
*)my_malloc(n_bands
* sizeof(band_number_t
));
592 printf("failed to allocate bands\n");
595 bzero(bands
, n_bands
* sizeof(band_number_t
));
597 map
= my_malloc(sizeof(*map
));
599 printf("failed to allocate map\n");
602 map
->blocks_per_band
= band_size
/ block_size
;
603 map
->block_bitmap
= block_bitmap
;
605 map
->file_size_blocks
= n_bands
* map
->blocks_per_band
;
607 map
->zeroth_band
= -1;
608 map
->shadow_size_bands
= howmany(shadow_size
, band_size
);
609 map
->block_size
= block_size
;
614 my_free(block_bitmap
);
621 * Function: shadow_map_free
623 * Frees the data structure to deal with the shadow map.
626 shadow_map_free(shadow_map_t
* map
)
628 if (map
->block_bitmap
)
629 my_free(map
->block_bitmap
);
632 map
->block_bitmap
= 0;
639 #define BAND_SIZE_BLOCKS (BAND_SIZE_DEFAULT / 512)
657 block_request_t requests
[] = {
658 { WriteRequest
, BAND_SIZE_BLOCKS
* 2, 1 },
659 { ReadRequest
, BAND_SIZE_BLOCKS
/ 2, BAND_SIZE_BLOCKS
* 2 - 2 },
660 { WriteRequest
, BAND_SIZE_BLOCKS
* 1, 5 * BAND_SIZE_BLOCKS
+ 3},
661 { ReadRequest
, 0, BAND_SIZE_BLOCKS
* 10 },
662 { WriteRequest
, BAND_SIZE_BLOCKS
* (BAND_MAX
- 1),
663 BAND_SIZE_BLOCKS
* 2},
667 map
= shadow_map_create(1024 * 1024 * 1024 * 8ULL, 0, 0, 512);
669 printf("shadow_map_create failed\n");
672 for (i
= 0; TRUE
; i
++) {
675 boolean_t shadow_grew
;
676 boolean_t read_shadow
;
678 if (requests
[i
].count
== 0) {
681 offset
= requests
[i
].offset
;
682 resid
= requests
[i
].count
;
683 printf("\n%s REQUEST (%ld, %ld)\n",
684 requests
[i
].type
== WriteRequest
? "WRITE" : "READ",
686 switch (requests
[i
].type
) {
692 shadow_grew
= shadow_map_write(map
, offset
,
696 printf("\t(%ld, %ld) => (%ld, %ld)",
697 offset
, resid
, this_offset
, this_count
);
699 offset
+= this_count
;
701 printf(" shadow grew to %ld", shadow_map_shadow_size(map
));
711 read_shadow
= shadow_map_read(map
, offset
,
715 printf("\t(%ld, %ld) => (%ld, %ld)%s\n",
716 offset
, resid
, this_offset
, this_count
,
717 read_shadow
? " from shadow" : "");
718 if (this_count
== 0) {
719 printf("this_count is 0, aborting\n");
723 offset
+= this_count
;
731 shadow_map_free(map
);