]> git.saurik.com Git - apple/xnu.git/blame_incremental - bsd/dev/vn/shadow.c
xnu-6153.11.26.tar.gz
[apple/xnu.git] / bsd / dev / vn / shadow.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2001-2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * shadow.c
31 *
32 * Implement copy-on-write shadow map to allow a disk image to be
33 * mounted read-only, yet be writable by transferring writes to a
34 * "shadow" file. Subsequent reads from blocks that have been
35 * written will then go to the "shadow" file.
36 *
37 * The map has two parts:
38 * 1) a bit map to track which blocks have been written
39 * 2) a band map to map a "band" within the original file to a corresponding
40 * "band" in the shadow file. Each band has the same size.
41 *
42 * The band map is used to ensure that blocks that are contiguous in the
43 * original file will remain contiguous in the shadow file.
44 *
45 * For debugging purposes, this file can be compiled standalone using:
46 * cc -o shadow shadow.c -DTEST_SHADOW
47 */
48
49/*
50 * Modification History
51 *
52 * December 21, 2001 Dieter Siegmund (dieter@apple.com)
53 * - initial revision
54 */
#include <sys/param.h>
#include <sys/types.h>
#include <mach/boolean.h>

#include <string.h>

#ifdef TEST_SHADOW
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#define my_malloc(a)    malloc(a)
#define my_free(a)      free(a)
#else /* !TEST_SHADOW */
#include <sys/malloc.h>
#define my_malloc(a)    _MALLOC(a, M_TEMP, M_WAITOK)
#define my_free(a)      FREE(a, M_TEMP)
#include <libkern/libkern.h>
#endif /* TEST_SHADOW */

#include "shadow.h"
74
/* all-bits-set patterns for the three unit sizes used by the bitmap code */
#define UINT32_ALL_ONES ((uint32_t)(-1))
#define USHORT_ALL_ONES ((u_short)(-1))
#define UCHAR_ALL_ONES ((u_char)(-1))

/* round 'value' down to a multiple of 'divisor' using integer division */
#define my_trunc(value, divisor) ((value) / (divisor) * (divisor))

/* a band size of 128K can represent a file up to 8GB */
#define BAND_SIZE_DEFAULT_POWER_2 17
#define BAND_SIZE_DEFAULT (1 << BAND_SIZE_DEFAULT_POWER_2)

/*
 * Band numbers are stored as unsigned shorts.  A band map entry of
 * BAND_ZERO is also how shadow_map_mapped_band() recognizes an unmapped
 * band, and BAND_MAX is the largest band number.
 */
typedef u_short band_number_t;
#define BAND_ZERO ((band_number_t)0)
#define BAND_MAX ((band_number_t)65535)
88
/*
 * State of one shadow map: the per-block "written" bitmap plus the table
 * that maps bands of the original file to bands of the shadow file.
 */
struct shadow_map {
	uint32_t blocks_per_band;/* size of a band in blocks */
	uint32_t block_size;
	u_char * block_bitmap; /* 1 bit per block; 1=written */
	band_number_t * bands; /* band map array, indexed by original band */
	uint32_t file_size_blocks; /* size of file in blocks (whole bands) */
	uint32_t shadow_size_bands; /* size of shadow in bands */
	uint32_t next_band; /* next free band */
	uint32_t zeroth_band; /* original band mapped to shadow band 0 */
};
99
100
/* Position of a single bit within the block bitmap. */
typedef struct {
	uint32_t byte; /* index of the byte holding the bit */
	uint32_t bit; /* bit number within that byte, 0 .. NBBY - 1 */
} bitmap_offset_t;
105
/*
 * Function: bit
 * Purpose:
 *   Return a byte value in which only bit number 'b' is set; bit 0 is
 *   the least significant bit.
 */
static __inline__ u_char
bit(int b)
{
	const int       mask = 1 << b;

	return (u_char)mask;
}
111
/*
 * Function: bits_lower
 * Purpose:
 *   Build a byte mask covering every bit position below 'b', i.e.
 *   bits 0 .. b-1 are set and bit 'b' and above are clear.
 */
static __inline__ u_char
bits_lower(int b)
{
	/* one less than the single-bit value has exactly the lower bits set */
	return (u_char)((1 << b) - 1);
}
122
/*
 * Function: byte_set_bits
 * Purpose:
 *   Return a byte mask with bits 'start' through 'end' (both inclusive)
 *   set and every other bit clear.
 */
static __inline__ u_char
byte_set_bits(int start, int end)
{
	const int       below_start = (1 << start) - 1;         /* bits 0 .. start-1 */
	const int       through_end = (1 << (end + 1)) - 1;     /* bits 0 .. end */

	return (u_char)(through_end & ~below_start);
}
133
134static __inline__ bitmap_offset_t
135bitmap_offset(off_t where)
136{
137 bitmap_offset_t b;
138
139 b.byte = where / NBBY;
140 b.bit = where % NBBY;
141 return b;
142}
143
144/*
145 * Function: bitmap_set
146 *
147 * Purpose:
148 * Set the given range of bits.
149 *
150 * This algorithm tries to set the extents using the biggest
151 * units, using longs, then a short, then a byte, then bits.
152 */
153static void
154bitmap_set(u_char * map, uint32_t start_bit, uint32_t bit_count)
155{
156 bitmap_offset_t start;
157 bitmap_offset_t end;
158
159 start = bitmap_offset(start_bit);
160 end = bitmap_offset(start_bit + bit_count);
161 if (start.byte < end.byte) {
162 uint32_t n_bytes;
163
164 if (start.bit) {
165 map[start.byte] |= byte_set_bits(start.bit, NBBY - 1);
166 start.bit = 0;
167 start.byte++;
168 if (start.byte == end.byte) {
169 goto end;
170 }
171 }
172
173 n_bytes = end.byte - start.byte;
174
175 while (n_bytes >= (sizeof(uint32_t))) {
176 *((uint32_t *)(map + start.byte)) = UINT32_ALL_ONES;
177 start.byte += sizeof(uint32_t);
178 n_bytes -= sizeof(uint32_t);
179 }
180 if (n_bytes >= sizeof(u_short)) {
181 *((u_short *)(map + start.byte)) = USHORT_ALL_ONES;
182 start.byte += sizeof(u_short);
183 n_bytes -= sizeof(u_short);
184 }
185 if (n_bytes == 1) {
186 map[start.byte] = UCHAR_ALL_ONES;
187 start.byte++;
188 n_bytes = 0;
189 }
190 }
191
192end:
193 if (end.bit > start.bit) {
194 map[start.byte] |= byte_set_bits(start.bit, end.bit - 1);
195 }
196
197 return;
198}
199
200/*
201 * Function: bitmap_get
202 *
203 * Purpose:
204 * Return the number of bits in the range that are the same e.g.
205 * 11101 returns 3 because the first 3 bits are the same (1's), whereas
206 * 001100 returns 2 because the first 2 bits are the same.
207 * This algorithm tries to count things in as big a chunk as possible,
208 * first aligning to a byte offset, then trying to count longs, a short,
209 * a byte, then any remaining bits to find the bit that is different.
210 */
211
212static uint32_t
213bitmap_get(u_char * map, uint32_t start_bit, uint32_t bit_count,
214 boolean_t * ret_is_set)
215{
216 uint32_t count;
217 int i;
218 boolean_t is_set;
219 bitmap_offset_t start;
220 bitmap_offset_t end;
221
222 start = bitmap_offset(start_bit);
223 end = bitmap_offset(start_bit + bit_count);
224
225 is_set = (map[start.byte] & bit(start.bit)) ? TRUE : FALSE;
226 count = 0;
227
228 if (start.byte < end.byte) {
229 uint32_t n_bytes;
230
231 if (start.bit) { /* try to align to a byte */
232 for (i = start.bit; i < NBBY; i++) {
233 boolean_t this_is_set;
234
235 this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE;
236 if (this_is_set != is_set) {
237 goto done; /* found bit that was different, we're done */
238 }
239 count++;
240 }
241 start.bit = 0; /* made it to the next byte */
242 start.byte++;
243 if (start.byte == end.byte) {
244 goto end; /* no more bytes, check for any leftover bits */
245 }
246 }
247 /* calculate how many bytes are left in the range */
248 n_bytes = end.byte - start.byte;
249
250 /* check for 4 bytes of the same bits */
251 while (n_bytes >= sizeof(uint32_t)) {
252 uint32_t * valPtr = (uint32_t *)(map + start.byte);
253 if ((is_set && *valPtr == UINT32_ALL_ONES)
254 || (!is_set && *valPtr == 0)) {
255 count += sizeof(*valPtr) * NBBY;
256 start.byte += sizeof(*valPtr);
257 n_bytes -= sizeof(*valPtr);
258 } else {
259 break; /* bits differ */
260 }
261 }
262 /* check for 2 bytes of the same bits */
263 if (n_bytes >= sizeof(u_short)) {
264 u_short * valPtr = (u_short *)(map + start.byte);
265
266 if ((is_set && *valPtr == USHORT_ALL_ONES)
267 || (!is_set && (*valPtr == 0))) {
268 count += sizeof(*valPtr) * NBBY;
269 start.byte += sizeof(*valPtr);
270 n_bytes -= sizeof(*valPtr);
271 }
272 }
273
274 /* check for 1 byte of the same bits */
275 if (n_bytes) {
276 if ((is_set && map[start.byte] == UCHAR_ALL_ONES)
277 || (!is_set && map[start.byte] == 0)) {
278 count += NBBY;
279 start.byte++;
280 n_bytes--;
281 }
282 /* we found bits that were different, find the first one */
283 if (n_bytes) {
284 for (i = 0; i < NBBY; i++) {
285 boolean_t this_is_set;
286
287 this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE;
288 if (this_is_set != is_set) {
289 break;
290 }
291 count++;
292 }
293 goto done;
294 }
295 }
296 }
297
298end:
299 for (i = start.bit; i < (int)end.bit; i++) {
300 boolean_t this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE;
301
302 if (this_is_set != is_set) {
303 break;
304 }
305 count++;
306 }
307
308done:
309 *ret_is_set = is_set;
310 return count;
311}
312
313static __inline__ band_number_t
314shadow_map_block_to_band(shadow_map_t * map, uint32_t block)
315{
316 return block / map->blocks_per_band;
317}
318
319/*
320 * Function: shadow_map_mapped_band
321 * Purpose:
322 * Return the mapped band for the given band.
323 * If map_it is FALSE, and the band is not mapped, return FALSE.
324 * If map_it is TRUE, then this function will always return TRUE.
325 */
326static boolean_t
327shadow_map_mapped_band(shadow_map_t * map, band_number_t band,
328 boolean_t map_it, band_number_t * mapped_band)
329{
330 boolean_t is_mapped = FALSE;
331
332 if (band == map->zeroth_band) {
333 *mapped_band = BAND_ZERO;
334 is_mapped = TRUE;
335 } else {
336 *mapped_band = map->bands[band];
337 if (*mapped_band == BAND_ZERO) {
338 if (map_it) {
339 /* grow the file */
340 if (map->next_band == 0) {
341 /* remember the zero'th band */
342 map->zeroth_band = band;
343 }
344 *mapped_band = map->bands[band] = map->next_band++;
345 is_mapped = TRUE;
346 }
347 } else {
348 is_mapped = TRUE;
349 }
350 }
351 return is_mapped;
352}
353
354/*
355 * Function: shadow_map_contiguous
356 *
357 * Purpose:
358 * Return the first offset within the range position..(position + count)
359 * that is not a contiguous mapped band.
360 *
361 * If called with is_write = TRUE, this function will map bands as it goes.
362 */
363static uint32_t
364shadow_map_contiguous(shadow_map_t * map, uint32_t start_block,
365 uint32_t num_blocks, boolean_t is_write)
366{
367 band_number_t band = shadow_map_block_to_band(map, start_block);
368 uint32_t end_block = start_block + num_blocks;
369 boolean_t is_mapped;
370 band_number_t mapped_band;
371 uint32_t ret_end_block = end_block;
372 uint32_t p;
373
374 is_mapped = shadow_map_mapped_band(map, band, is_write, &mapped_band);
375 if (is_write == FALSE && is_mapped == FALSE) {
376 static int happened = 0;
377 /* this can't happen */
378 if (happened == 0) {
379 printf("shadow_map_contiguous: this can't happen!\n");
380 happened = 1;
381 }
382 return start_block;
383 }
384 for (p = my_trunc(start_block + map->blocks_per_band,
385 map->blocks_per_band);
386 p < end_block; p += map->blocks_per_band) {
387 band_number_t next_mapped_band;
388
389 band++;
390 is_mapped = shadow_map_mapped_band(map, band, is_write,
391 &next_mapped_band);
392 if (is_write == FALSE && is_mapped == FALSE) {
393 return p;
394 }
395 if ((mapped_band + 1) != next_mapped_band) {
396 /* not contiguous */
397 ret_end_block = p;
398 break;
399 }
400 mapped_band = next_mapped_band;
401 }
402 return ret_end_block;
403}
404
405
/*
 * Function: block_bitmap_size
 * Purpose:
 *   Compute how many bytes a block bitmap needs in order to hold one
 *   bit for every block of a file of size file_size.
 *
 *   That is the number of blocks in the file (rounded up), divided by
 *   the number of bits per byte (again rounded up).
 * Note:
 *   An 8GB file requires (assuming 512 byte block):
 *     2^33 / 2^9 / 2^3 = 2^21 = 2MB
 *   of bitmap space.  This is a non-trivial amount of memory,
 *   particularly since most of the bits will be zero.
 *   A sparse bitmap would really help in this case.
 */
static __inline__ uint32_t
block_bitmap_size(off_t file_size, uint32_t block_size)
{
	off_t   n_blocks;
	off_t   n_bytes;

	n_blocks = howmany(file_size, block_size);
	n_bytes = howmany(n_blocks, NBBY);
	return (uint32_t)n_bytes;
}
427
428/*
429 * Function: shadow_map_read
430 *
431 * Purpose:
432 * Calculate the block offset within the shadow to read, and the number
433 * blocks to read. The input values (block_offset, block_count) refer
434 * to the original file.
435 *
436 * The output values (*incr_block_offset, *incr_block_count) refer to the
437 * shadow file if the return value is TRUE. They refer to the original
438 * file if the return value is FALSE.
439 *
440 * Blocks within a band may or may not have been written, in addition,
441 * Bands are not necessarily contiguous, therefore:
442 * *incr_block_count <= block_count
443 * The caller must be prepared to call this function interatively
444 * to complete the whole i/o.
445 * Returns:
446 * TRUE if the shadow file should be read, FALSE if the original file
447 * should be read.
448 */
449boolean_t
450shadow_map_read(shadow_map_t * map, uint32_t block_offset, uint32_t block_count,
451 uint32_t * incr_block_offset, uint32_t * incr_block_count)
452{
453 boolean_t written = FALSE;
454 uint32_t n_blocks;
455
456 if (block_offset >= map->file_size_blocks
457 || (block_offset + block_count) > map->file_size_blocks) {
458 printf("shadow_map_read: request (%d, %d) exceeds file size %d\n",
459 block_offset, block_count, map->file_size_blocks);
460 *incr_block_count = 0;
461 }
462 n_blocks = bitmap_get(map->block_bitmap, block_offset, block_count,
463 &written);
464 if (written == FALSE) {
465 *incr_block_count = n_blocks;
466 *incr_block_offset = block_offset;
467 } else { /* start has been written, and therefore mapped */
468 band_number_t mapped_band;
469 uint32_t band_limit;
470
471 mapped_band = map->bands[shadow_map_block_to_band(map, block_offset)];
472 *incr_block_offset = mapped_band * map->blocks_per_band
473 + (block_offset % map->blocks_per_band);
474 band_limit
475 = shadow_map_contiguous(map, block_offset, block_count, FALSE);
476 *incr_block_count = band_limit - block_offset;
477 if (*incr_block_count > n_blocks) {
478 *incr_block_count = n_blocks;
479 }
480 }
481 return written;
482}
483
484/*
485 * Function: shadow_map_write
486 *
487 * Purpose:
488 * Calculate the block offset within the shadow to write, and the number
489 * blocks to write. The input values (block_offset, block_count) refer
490 * to the original file. The output values
491 * (*incr_block_offset, *incr_block_count) refer to the shadow file.
492 *
493 * Bands are not necessarily contiguous, therefore:
494 * *incr_block_count <= block_count
495 * The caller must be prepared to call this function interatively
496 * to complete the whole i/o.
497 * Returns:
498 * TRUE if the shadow file was grown, FALSE otherwise.
499 */
500boolean_t
501shadow_map_write(shadow_map_t * map, uint32_t block_offset,
502 uint32_t block_count, uint32_t * incr_block_offset,
503 uint32_t * incr_block_count)
504{
505 uint32_t band_limit;
506 band_number_t mapped_band;
507 boolean_t shadow_grew = FALSE;
508
509 if (block_offset >= map->file_size_blocks
510 || (block_offset + block_count) > map->file_size_blocks) {
511 printf("shadow_map_write: request (%d, %d) exceeds file size %d\n",
512 block_offset, block_count, map->file_size_blocks);
513 *incr_block_count = 0;
514 }
515
516 band_limit = shadow_map_contiguous(map, block_offset, block_count, TRUE);
517 mapped_band = map->bands[shadow_map_block_to_band(map, block_offset)];
518 *incr_block_offset = mapped_band * map->blocks_per_band
519 + (block_offset % map->blocks_per_band);
520 *incr_block_count = band_limit - block_offset;
521
522 /* mark these blocks as written */
523 bitmap_set(map->block_bitmap, block_offset, *incr_block_count);
524
525 if (map->next_band > map->shadow_size_bands) {
526 map->shadow_size_bands = map->next_band;
527 shadow_grew = TRUE;
528 }
529 return shadow_grew;
530}
531
532boolean_t
533shadow_map_is_written(shadow_map_t * map, uint32_t block_offset)
534{
535 bitmap_offset_t b;
536
537 b = bitmap_offset(block_offset);
538 return (map->block_bitmap[b.byte] & bit(b.bit)) ? TRUE : FALSE;
539}
540
541/*
542 * Function: shadow_map_shadow_size
543 *
544 * Purpose:
545 * To return the size of the shadow file in blocks.
546 */
547uint32_t
548shadow_map_shadow_size(shadow_map_t * map)
549{
550 return map->shadow_size_bands * map->blocks_per_band;
551}
552
553/*
554 * Function: shadow_map_create
555 *
556 * Purpose:
557 * Allocate the dynamic data for keeping track of the shadow dirty blocks
558 * and the band mapping table.
559 * Returns:
560 * NULL if an error occurred.
561 */
562shadow_map_t *
563shadow_map_create(off_t file_size, off_t shadow_size,
564 uint32_t band_size, uint32_t block_size)
565{
566 void * block_bitmap = NULL;
567 uint32_t bitmap_size;
568 band_number_t * bands = NULL;
569 shadow_map_t * map;
570 uint32_t n_bands = 0;
571
572 if (band_size == 0) {
573 band_size = BAND_SIZE_DEFAULT;
574 }
575
576 n_bands = howmany(file_size, band_size);
577 if (n_bands > (BAND_MAX + 1)) {
578 printf("file is too big: %d > %d\n",
579 n_bands, BAND_MAX);
580 goto failure;
581 }
582
583 /* create a block bitmap, one bit per block */
584 bitmap_size = block_bitmap_size(file_size, block_size);
585 block_bitmap = my_malloc(bitmap_size);
586 if (block_bitmap == NULL) {
587 printf("failed to allocate bitmap\n");
588 goto failure;
589 }
590 bzero(block_bitmap, bitmap_size);
591
592 /* get the band map */
593 bands = (band_number_t *)my_malloc(n_bands * sizeof(band_number_t));
594 if (bands == NULL) {
595 printf("failed to allocate bands\n");
596 goto failure;
597 }
598 bzero(bands, n_bands * sizeof(band_number_t));
599
600 map = my_malloc(sizeof(*map));
601 if (map == NULL) {
602 printf("failed to allocate map\n");
603 goto failure;
604 }
605 map->blocks_per_band = band_size / block_size;
606 map->block_bitmap = block_bitmap;
607 map->bands = bands;
608 map->file_size_blocks = n_bands * map->blocks_per_band;
609 map->next_band = 0;
610 map->zeroth_band = -1;
611 map->shadow_size_bands = howmany(shadow_size, band_size);
612 map->block_size = block_size;
613 return map;
614
615failure:
616 if (block_bitmap) {
617 my_free(block_bitmap);
618 }
619 if (bands) {
620 my_free(bands);
621 }
622 return NULL;
623}
624
625/*
626 * Function: shadow_map_free
627 * Purpose:
628 * Frees the data structure to deal with the shadow map.
629 */
630void
631shadow_map_free(shadow_map_t * map)
632{
633 if (map->block_bitmap) {
634 my_free(map->block_bitmap);
635 }
636 if (map->bands) {
637 my_free(map->bands);
638 }
639 map->block_bitmap = NULL;
640 map->bands = NULL;
641 my_free(map);
642 return;
643}
644
#ifdef TEST_SHADOW
/* number of 512-byte blocks in a default-sized band */
#define BAND_SIZE_BLOCKS        (BAND_SIZE_DEFAULT / 512)

enum {
	ReadRequest,
	WriteRequest,
};

/* one scripted i/o request against the original file, in blocks */
typedef struct {
	int             type;
	uint32_t        offset;
	uint32_t        count;
} block_request_t;

/*
 * Standalone test driver: creates a map for an 8GB file with 512-byte
 * blocks, replays a fixed list of read and write requests, and prints
 * how each request is broken up and where each piece lands.
 */
int
main(void)
{
	shadow_map_t *  map;
	int             i;
	block_request_t requests[] = {
		{ WriteRequest, BAND_SIZE_BLOCKS * 2, 1 },
		{ ReadRequest, BAND_SIZE_BLOCKS / 2, BAND_SIZE_BLOCKS * 2 - 2 },
		{ WriteRequest, BAND_SIZE_BLOCKS * 1, 5 * BAND_SIZE_BLOCKS + 3},
		{ ReadRequest, 0, BAND_SIZE_BLOCKS * 10 },
		{ WriteRequest, BAND_SIZE_BLOCKS * (BAND_MAX - 1),
		  BAND_SIZE_BLOCKS * 2},
		{ 0, 0, 0 },    /* a count of zero ends the list */
	};

	map = shadow_map_create(1024 * 1024 * 1024 * 8ULL, 0, 0, 512);
	if (map == NULL) {
		printf("shadow_map_create failed\n");
		return 1;
	}
	for (i = 0; requests[i].count != 0; i++) {
		uint32_t        offset = requests[i].offset;
		uint32_t        resid = requests[i].count;

		printf("\n%s REQUEST (%u, %u)\n",
		    requests[i].type == WriteRequest ? "WRITE" : "READ",
		    offset, resid);
		switch (requests[i].type) {
		case WriteRequest:
			while (resid > 0) {
				boolean_t       shadow_grew;
				uint32_t        this_offset;
				uint32_t        this_count;

				shadow_grew = shadow_map_write(map, offset,
				    resid,
				    &this_offset,
				    &this_count);
				printf("\t(%u, %u) => (%u, %u)",
				    offset, resid, this_offset, this_count);
				if (shadow_grew) {
					printf(" shadow grew to %u",
					    shadow_map_shadow_size(map));
				}
				printf("\n");
				/* no progress means the loop would never end */
				if (this_count == 0) {
					printf("this_count is 0, aborting\n");
					break;
				}
				resid -= this_count;
				offset += this_count;
			}
			break;
		case ReadRequest:
			while (resid > 0) {
				boolean_t       read_shadow;
				uint32_t        this_offset;
				uint32_t        this_count;

				read_shadow = shadow_map_read(map, offset,
				    resid,
				    &this_offset,
				    &this_count);
				printf("\t(%u, %u) => (%u, %u)%s\n",
				    offset, resid, this_offset, this_count,
				    read_shadow ? " from shadow" : "");
				if (this_count == 0) {
					printf("this_count is 0, aborting\n");
					break;
				}
				resid -= this_count;
				offset += this_count;
			}
			break;
		default:
			break;
		}
	}
	shadow_map_free(map);
	return 0;
}
#endif