2  * Copyright (c) 2001-2006 Apple Computer, Inc. All rights reserved. 
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 
   6  * This file contains Original Code and/or Modifications of Original Code 
   7  * as defined in and that are subject to the Apple Public Source License 
   8  * Version 2.0 (the 'License'). You may not use this file except in 
   9  * compliance with the License. The rights granted to you under the License 
  10  * may not be used to create, or enable the creation or redistribution of, 
  11  * unlawful or unlicensed copies of an Apple operating system, or to 
  12  * circumvent, violate, or enable the circumvention or violation of, any 
  13  * terms of an Apple operating system software license agreement. 
  15  * Please obtain a copy of the License at 
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file. 
  18  * The Original Code and all software distributed under the License are 
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
  23  * Please see the License for the specific language governing rights and 
  24  * limitations under the License. 
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 
  32  * Implement copy-on-write shadow map to allow a disk image to be 
  33  * mounted read-only, yet be writable by transferring writes to a 
  34  * "shadow" file.  Subsequent reads from blocks that have been 
  35  * written will then go the "shadow" file. 
  37  * The map has two parts: 
  38  * 1) a bit map to track which blocks have been written 
  39  * 2) a band map to map a "band" within the original file to a corresponding 
  40  *    "band" in the shadow file.  Each band has the same size. 
  42  * The band map is used to ensure that blocks that are contiguous in the  
  43  * original file will remain contiguous in the shadow file. 
  45  * For debugging purposes, this file can be compiled standalone using: 
  46  * cc -o shadow shadow.c -DTEST_SHADOW 
  50  * Modification History 
  52  * December 21, 2001    Dieter Siegmund (dieter@apple.com) 
  55 #include <sys/param.h> 
  56 #include <sys/types.h> 
  57 #include <mach/boolean.h> 
  64 #define my_malloc(a)    malloc(a) 
  65 #define my_free(a)      free(a) 
  66 #else /* !TEST_SHADOW */ 
  67 #include <sys/malloc.h> 
  68 #define my_malloc(a)    _MALLOC(a, M_TEMP, M_WAITOK) 
  69 #define my_free(a)      FREE(a, M_TEMP) 
  70 #include <libkern/libkern.h> 
  71 #endif /* TEST_SHADOW */ 
  75 #define UINT32_ALL_ONES                 ((uint32_t)(-1)) 
  76 #define USHORT_ALL_ONES                 ((u_short)(-1)) 
  77 #define UCHAR_ALL_ONES                  ((u_char)(-1)) 
  79 #define my_trunc(value, divisor)        ((value) / (divisor) * (divisor)) 
  81 /* a band size of 128K can represent a file up to 8GB */ 
  82 #define BAND_SIZE_DEFAULT_POWER_2       17 
  83 #define BAND_SIZE_DEFAULT               (1 << BAND_SIZE_DEFAULT_POWER_2) 
  85 typedef u_short band_number_t
; 
  86 #define BAND_ZERO                       ((band_number_t)0) 
  87 #define BAND_MAX                        ((band_number_t)65535) 
  90     uint32_t            blocks_per_band
;/* size in blocks */ 
  92     u_char 
*            block_bitmap
;           /* 1 bit per block; 1=written */ 
  93     band_number_t 
*     bands
;                  /* band map array */ 
  94     uint32_t            file_size_blocks
;       /* size of file in bands */ 
  95     uint32_t            shadow_size_bands
;      /* size of shadow in bands */ 
  96     uint32_t            next_band
;              /* next free band */ 
  97     uint32_t            zeroth_band
;            /* special-case 0th band */ 
 106 static __inline__ u_char
 
 109     return ((u_char
)(1 << b
)); 
 113  * Function: bits_lower 
 115  *   Return a byte value in which bits numbered lower than 'b' are set. 
 117 static __inline__ u_char
 
 120     return ((u_char
)(bit(b
) - 1)); 
 124  * Function: byte_set_bits 
 126  *   Set the given range of bits within a byte. 
 128 static __inline__ u_char
 
 129 byte_set_bits(int start
, int end
) 
 131     return ((u_char
)((~bits_lower(start
)) & (bits_lower(end
) | bit(end
)))); 
 134 static __inline__ bitmap_offset_t
 
 135 bitmap_offset(off_t where
) 
 139     b
.byte 
= where 
/ NBBY
; 
 140     b
.bit 
= where 
% NBBY
; 
 145  * Function: bitmap_set 
 148  *   Set the given range of bits. 
 150  *   This algorithm tries to set the extents using the biggest 
 151  *   units, using longs, then a short, then a byte, then bits. 
 154 bitmap_set(u_char 
* map
, uint32_t start_bit
, uint32_t bit_count
) 
 156     bitmap_offset_t     start
; 
 159     start 
= bitmap_offset(start_bit
); 
 160     end 
= bitmap_offset(start_bit 
+ bit_count
); 
 161     if (start
.byte 
< end
.byte
) { 
 165             map
[start
.byte
] |= byte_set_bits(start
.bit
, NBBY 
- 1); 
 168             if (start
.byte 
== end
.byte
) 
 172         n_bytes 
= end
.byte 
- start
.byte
; 
 174         while (n_bytes 
>= (sizeof(uint32_t))) { 
 175             *((uint32_t *)(map 
+ start
.byte
)) = UINT32_ALL_ONES
; 
 176             start
.byte 
+= sizeof(uint32_t); 
 177             n_bytes 
-= sizeof(uint32_t); 
 179         if (n_bytes 
>= sizeof(u_short
)) { 
 180             *((u_short 
*)(map 
+ start
.byte
)) = USHORT_ALL_ONES
; 
 181             start
.byte 
+= sizeof(u_short
); 
 182             n_bytes 
-= sizeof(u_short
); 
 185             map
[start
.byte
] = UCHAR_ALL_ONES
; 
 192     if (end
.bit 
> start
.bit
) { 
 193         map
[start
.byte
] |= byte_set_bits(start
.bit
, end
.bit 
- 1); 
 200  * Function: bitmap_get 
 203  *   Return the number of bits in the range that are the same e.g. 
 204  *   11101 returns 3 because the first 3 bits are the same (1's), whereas 
 205  *   001100 returns 2 because the first 2 bits are the same. 
 206  *   This algorithm tries to count things in as big a chunk as possible, 
 207  *   first aligning to a byte offset, then trying to count longs, a short, 
 208  *   a byte, then any remaining bits to find the bit that is different. 
 212 bitmap_get(u_char 
* map
, uint32_t start_bit
, uint32_t bit_count
,  
 213            boolean_t 
* ret_is_set
) 
 218     bitmap_offset_t     start
; 
 221     start 
= bitmap_offset(start_bit
); 
 222     end 
= bitmap_offset(start_bit 
+ bit_count
); 
 224     is_set 
= (map
[start
.byte
] & bit(start
.bit
)) ? TRUE 
: FALSE
; 
 227     if (start
.byte 
< end
.byte
) { 
 230         if (start
.bit
) { /* try to align to a byte */ 
 231             for (i 
= start
.bit
; i 
< NBBY
; i
++) { 
 232                 boolean_t       this_is_set
; 
 234                 this_is_set 
= (map
[start
.byte
] & bit(i
)) ? TRUE 
: FALSE
; 
 235                 if (this_is_set 
!= is_set
) { 
 236                     goto done
; /* found bit that was different, we're done */ 
 240             start
.bit 
= 0; /* made it to the next byte */ 
 242             if (start
.byte 
== end
.byte
) 
 243                 goto end
; /* no more bytes, check for any leftover bits */ 
 245         /* calculate how many bytes are left in the range */ 
 246         n_bytes 
= end
.byte 
- start
.byte
; 
 248         /* check for 4 bytes of the same bits */ 
 249         while (n_bytes 
>= sizeof(uint32_t)) { 
 250             uint32_t * valPtr 
= (uint32_t *)(map 
+ start
.byte
); 
 251             if ((is_set 
&& *valPtr 
== UINT32_ALL_ONES
)  
 252                 || (!is_set 
&& *valPtr 
== 0)) { 
 253                 count 
+= sizeof(*valPtr
) * NBBY
; 
 254                 start
.byte 
+= sizeof(*valPtr
); 
 255                 n_bytes 
-= sizeof(*valPtr
); 
 258                 break; /* bits differ */ 
 261         /* check for 2 bytes of the same bits */ 
 262         if (n_bytes 
>= sizeof(u_short
)) { 
 263             u_short 
* valPtr 
= (u_short 
*)(map 
+ start
.byte
); 
 265             if ((is_set 
&& *valPtr 
== USHORT_ALL_ONES
)  
 266                 || (!is_set 
&& (*valPtr 
== 0))) { 
 267                 count 
+= sizeof(*valPtr
) * NBBY
; 
 268                 start
.byte 
+= sizeof(*valPtr
); 
 269                 n_bytes 
-= sizeof(*valPtr
); 
 273         /* check for 1 byte of the same bits */ 
 275             if ((is_set 
&& map
[start
.byte
] == UCHAR_ALL_ONES
)  
 276                 || (!is_set 
&& map
[start
.byte
] == 0)) { 
 281             /* we found bits that were different, find the first one */ 
 283                 for (i 
= 0; i 
< NBBY
; i
++) { 
 284                     boolean_t   this_is_set
; 
 286                     this_is_set 
= (map
[start
.byte
] & bit(i
)) ? TRUE 
: FALSE
; 
 287                     if (this_is_set 
!= is_set
) { 
 298     for (i 
= start
.bit
; i 
< (int)end
.bit
; i
++) { 
 299         boolean_t this_is_set 
= (map
[start
.byte
] & bit(i
)) ? TRUE 
: FALSE
; 
 301         if (this_is_set 
!= is_set
) { 
 308     *ret_is_set 
= is_set
; 
 312 static __inline__ band_number_t
 
 313 shadow_map_block_to_band(shadow_map_t 
* map
, uint32_t block
) 
 315     return (block 
/ map
->blocks_per_band
); 
 319  * Function: shadow_map_mapped_band 
 321  *   Return the mapped band for the given band. 
 322  *   If map_it is FALSE, and the band is not mapped, return FALSE. 
 323  *   If map_it is TRUE, then this function will always return TRUE. 
 326 shadow_map_mapped_band(shadow_map_t 
* map
, band_number_t band
, 
 327                        boolean_t map_it
, band_number_t 
* mapped_band
) 
 329     boolean_t           is_mapped 
= FALSE
; 
 331     if (band 
== map
->zeroth_band
) { 
 332         *mapped_band 
= BAND_ZERO
; 
 336         *mapped_band 
= map
->bands
[band
]; 
 337         if (*mapped_band 
== BAND_ZERO
) { 
 340                 if (map
->next_band 
== 0) { 
 341                     /* remember the zero'th band */ 
 342                     map
->zeroth_band 
= band
; 
 344                 *mapped_band 
= map
->bands
[band
] = map
->next_band
++; 
 356  * Function: shadow_map_contiguous 
 359  *   Return the first offset within the range position..(position + count)  
 360  *   that is not a contiguous mapped band. 
 362  *   If called with is_write = TRUE, this function will map bands as it goes. 
 365 shadow_map_contiguous(shadow_map_t 
* map
, uint32_t start_block
, 
 366                       uint32_t num_blocks
, boolean_t is_write
) 
 368     band_number_t       band 
= shadow_map_block_to_band(map
, start_block
); 
 369     uint32_t            end_block 
= start_block 
+ num_blocks
; 
 371     band_number_t       mapped_band
; 
 372     uint32_t            ret_end_block 
= end_block
; 
 375     is_mapped 
= shadow_map_mapped_band(map
, band
, is_write
, &mapped_band
); 
 376     if (is_write 
== FALSE 
&& is_mapped 
== FALSE
) { 
 377         static int happened 
= 0; 
 378         /* this can't happen */ 
 380             printf("shadow_map_contiguous: this can't happen!\n"); 
 383         return (start_block
); 
 385     for (p 
= my_trunc(start_block 
+ map
->blocks_per_band
,  
 386                       map
->blocks_per_band
); 
 387          p 
< end_block
; p 
+= map
->blocks_per_band
) { 
 388         band_number_t   next_mapped_band
; 
 391         is_mapped 
= shadow_map_mapped_band(map
, band
, is_write
, 
 393         if (is_write 
== FALSE 
&& is_mapped 
== FALSE
) { 
 396         if ((mapped_band 
+ 1) != next_mapped_band
) { 
 401         mapped_band 
= next_mapped_band
; 
 403     return (ret_end_block
); 
 408  * Function: block_bitmap_size 
 410  *   The number of bytes required in a block bitmap to represent a file of size  
 413  *   The bytes required is the number of blocks in the file, 
 414  *   divided by the number of bits per byte. 
 416  *   An 8GB file requires (assuming 512 byte block): 
 417  *   2^33 / 2^9 / 2^3 = 2^21 = 2MB 
 418  *   of bitmap space.  This is a non-trival amount of memory, 
 419  *   particularly since most of the bits will be zero. 
 420  *   A sparse bitmap would really help in this case. 
 422 static __inline__ 
uint32_t 
 423 block_bitmap_size(off_t file_size
, uint32_t block_size
) 
 425     off_t blocks 
= howmany(file_size
, block_size
); 
 426     return (howmany(blocks
, NBBY
)); 
 430  * Function: shadow_map_read 
 433  *   Calculate the block offset within the shadow to read, and the number 
 434  *   blocks to read.  The input values (block_offset, block_count) refer 
 435  *   to the original file. 
 437  *   The output values (*incr_block_offset, *incr_block_count) refer to the 
 438  *   shadow file if the return value is TRUE.  They refer to the original 
 439  *   file if the return value is FALSE. 
 441  *   Blocks within a band may or may not have been written, in addition, 
 442  *   Bands are not necessarily contiguous, therefore: 
 443  *      *incr_block_count <= block_count 
 444  *   The caller must be prepared to call this function interatively 
 445  *   to complete the whole i/o. 
 447  *   TRUE if the shadow file should be read, FALSE if the original file 
 451 shadow_map_read(shadow_map_t 
* map
, uint32_t block_offset
, uint32_t block_count
, 
 452                 uint32_t * incr_block_offset
, uint32_t * incr_block_count
) 
 454     boolean_t           written 
= FALSE
; 
 457     if (block_offset 
>= map
->file_size_blocks
 
 458         || (block_offset 
+ block_count
) > map
->file_size_blocks
) { 
 459         printf("shadow_map_read: request (%d, %d) exceeds file size %d\n", 
 460                block_offset
, block_count
, map
->file_size_blocks
); 
 461         *incr_block_count 
= 0; 
 463     n_blocks 
= bitmap_get(map
->block_bitmap
, block_offset
, block_count
, 
 465     if (written 
== FALSE
) { 
 466         *incr_block_count 
= n_blocks
; 
 467         *incr_block_offset 
= block_offset
; 
 469     else { /* start has been written, and therefore mapped */ 
 470         band_number_t   mapped_band
; 
 473         mapped_band 
= map
->bands
[shadow_map_block_to_band(map
, block_offset
)]; 
 474         *incr_block_offset 
= mapped_band 
* map
->blocks_per_band
 
 475             + (block_offset 
% map
->blocks_per_band
); 
 477             = shadow_map_contiguous(map
, block_offset
, block_count
, FALSE
); 
 478         *incr_block_count 
= band_limit 
- block_offset
; 
 479         if (*incr_block_count 
> n_blocks
) { 
 480             *incr_block_count 
= n_blocks
; 
 487  * Function: shadow_map_write 
 490  *   Calculate the block offset within the shadow to write, and the number 
 491  *   blocks to write.  The input values (block_offset, block_count) refer 
 492  *   to the original file.  The output values  
 493  *   (*incr_block_offset, *incr_block_count) refer to the shadow file. 
 495  *   Bands are not necessarily contiguous, therefore: 
 496  *      *incr_block_count <= block_count 
 497  *   The caller must be prepared to call this function interatively 
 498  *   to complete the whole i/o. 
 500  *   TRUE if the shadow file was grown, FALSE otherwise.  
 503 shadow_map_write(shadow_map_t 
* map
, uint32_t block_offset
,  
 504                  uint32_t block_count
, uint32_t * incr_block_offset
,  
 505                  uint32_t * incr_block_count
) 
 508     band_number_t       mapped_band
; 
 509     boolean_t           shadow_grew 
= FALSE
; 
 511     if (block_offset 
>= map
->file_size_blocks
 
 512         || (block_offset 
+ block_count
) > map
->file_size_blocks
) { 
 513         printf("shadow_map_write: request (%d, %d) exceeds file size %d\n", 
 514                block_offset
, block_count
, map
->file_size_blocks
); 
 515         *incr_block_count 
= 0; 
 518     band_limit 
= shadow_map_contiguous(map
, block_offset
, block_count
, TRUE
); 
 519     mapped_band 
= map
->bands
[shadow_map_block_to_band(map
, block_offset
)]; 
 520     *incr_block_offset 
= mapped_band 
* map
->blocks_per_band
 
 521         + (block_offset 
% map
->blocks_per_band
); 
 522     *incr_block_count 
= band_limit 
- block_offset
; 
 524     /* mark these blocks as written */ 
 525     bitmap_set(map
->block_bitmap
, block_offset
, *incr_block_count
); 
 527     if (map
->next_band 
> map
->shadow_size_bands
) { 
 528         map
->shadow_size_bands 
= map
->next_band
; 
 531     return (shadow_grew
); 
 535 shadow_map_is_written(shadow_map_t 
* map
, uint32_t block_offset
) 
 539     b 
= bitmap_offset(block_offset
); 
 540     return ((map
->block_bitmap
[b
.byte
] & bit(b
.bit
)) ? TRUE 
: FALSE
); 
 544  * Function: shadow_map_shadow_size 
 547  *   To return the size of the shadow file in blocks. 
 550 shadow_map_shadow_size(shadow_map_t 
* map
) 
 552     return (map
->shadow_size_bands 
* map
->blocks_per_band
); 
 556  * Function: shadow_map_create 
 559  *   Allocate the dynamic data for keeping track of the shadow dirty blocks 
 560  *   and the band mapping table. 
 562  *   NULL if an error occurred. 
 565 shadow_map_create(off_t file_size
, off_t shadow_size
,  
 566                   uint32_t band_size
, uint32_t block_size
) 
 568     void *              block_bitmap 
= NULL
; 
 569     uint32_t            bitmap_size
; 
 570     band_number_t 
*     bands 
= NULL
; 
 572     uint32_t            n_bands 
= 0; 
 574     if (band_size 
== 0) { 
 575         band_size 
= BAND_SIZE_DEFAULT
; 
 578     n_bands 
= howmany(file_size
, band_size
); 
 579     if (n_bands 
> (BAND_MAX 
+ 1)) { 
 580         printf("file is too big: %d > %d\n", 
 585     /* create a block bitmap, one bit per block */ 
 586     bitmap_size 
= block_bitmap_size(file_size
, block_size
); 
 587     block_bitmap 
= my_malloc(bitmap_size
); 
 588     if (block_bitmap 
== NULL
) { 
 589         printf("failed to allocate bitmap\n"); 
 592     bzero(block_bitmap
, bitmap_size
); 
 594     /* get the band map */ 
 595     bands 
= (band_number_t 
*)my_malloc(n_bands 
* sizeof(band_number_t
)); 
 597         printf("failed to allocate bands\n"); 
 600     bzero(bands
, n_bands 
* sizeof(band_number_t
)); 
 602     map 
= my_malloc(sizeof(*map
)); 
 604         printf("failed to allocate map\n"); 
 607     map
->blocks_per_band 
= band_size 
/ block_size
; 
 608     map
->block_bitmap 
= block_bitmap
; 
 610     map
->file_size_blocks 
= n_bands 
* map
->blocks_per_band
; 
 612     map
->zeroth_band 
= -1; 
 613     map
->shadow_size_bands 
= howmany(shadow_size
, band_size
); 
 614     map
->block_size 
= block_size
; 
 619         my_free(block_bitmap
); 
 626  * Function: shadow_map_free 
 628  *   Frees the data structure to deal with the shadow map. 
 631 shadow_map_free(shadow_map_t 
* map
) 
 633     if (map
->block_bitmap
) 
 634         my_free(map
->block_bitmap
); 
 637     map
->block_bitmap 
= NULL
; 
 644 #define BAND_SIZE_BLOCKS        (BAND_SIZE_DEFAULT / 512) 
 662     block_request_t     requests
[] = { 
 663         { WriteRequest
, BAND_SIZE_BLOCKS 
* 2, 1 }, 
 664         { ReadRequest
, BAND_SIZE_BLOCKS 
/ 2, BAND_SIZE_BLOCKS 
* 2 - 2 }, 
 665         { WriteRequest
, BAND_SIZE_BLOCKS 
* 1, 5 * BAND_SIZE_BLOCKS 
+ 3}, 
 666         { ReadRequest
, 0, BAND_SIZE_BLOCKS 
* 10 }, 
 667         { WriteRequest
, BAND_SIZE_BLOCKS 
* (BAND_MAX 
- 1), 
 668           BAND_SIZE_BLOCKS 
* 2}, 
 672     map 
= shadow_map_create(1024 * 1024 * 1024 * 8ULL, 0, 0, 512); 
 674         printf("shadow_map_create failed\n"); 
 677     for (i 
= 0; TRUE
; i
++) { 
 680         boolean_t       shadow_grew
; 
 681         boolean_t       read_shadow
; 
 683         if (requests
[i
].count 
== 0) { 
 686         offset 
= requests
[i
].offset
; 
 687         resid 
= requests
[i
].count
; 
 688         printf("\n%s REQUEST (%ld, %ld)\n",  
 689                requests
[i
].type 
== WriteRequest 
? "WRITE" : "READ", 
 691         switch (requests
[i
].type
) { 
 694                 uint32_t this_offset
; 
 697                 shadow_grew 
= shadow_map_write(map
, offset
, 
 701                 printf("\t(%ld, %ld) => (%ld, %ld)", 
 702                        offset
, resid
, this_offset
, this_count
); 
 704                 offset 
+= this_count
; 
 706                     printf(" shadow grew to %ld", shadow_map_shadow_size(map
)); 
 713                 uint32_t this_offset
; 
 716                 read_shadow 
= shadow_map_read(map
, offset
, 
 720                 printf("\t(%ld, %ld) => (%ld, %ld)%s\n", 
 721                        offset
, resid
, this_offset
, this_count
, 
 722                        read_shadow 
? " from shadow" : ""); 
 723                 if (this_count 
== 0) { 
 724                     printf("this_count is 0, aborting\n"); 
 728                 offset 
+= this_count
; 
 736         shadow_map_free(map
);