+static int
+grow_table(struct bucket **buf_ptr, int num_buckets, int new_size)
+{
+ struct bucket *newBuf;
+ int current_size = num_buckets, i;
+
+ // return if newsize is less than the current size
+ if (new_size < num_buckets) {
+ return current_size;
+ }
+
+ if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) {
+ printf("jnl: grow_table: no memory to expand coalesce buffer!\n");
+ return -1;
+ }
+
+ // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size);
+
+ // copy existing elements
+ bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket));
+
+ // initialize the new ones
+ for(i=num_buckets; i < new_size; i++) {
+ newBuf[i].block_num = (off_t)-1;
+ }
+
+ // free the old container
+ FREE(*buf_ptr, M_TEMP);
+
+ // reset the buf_ptr
+ *buf_ptr = newBuf;
+
+ return new_size;
+}
+
+static int
+lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full)
+{
+ int lo, hi, index, matches, i;
+
+ if (num_full == 0) {
+ return 0; // table is empty, so insert at index=0
+ }
+
+ lo = 0;
+ hi = num_full - 1;
+ index = -1;
+
+ // perform binary search for block_num
+ do {
+ int mid = (hi - lo)/2 + lo;
+ off_t this_num = (*buf_ptr)[mid].block_num;
+
+ if (block_num == this_num) {
+ index = mid;
+ break;
+ }
+
+ if (block_num < this_num) {
+ hi = mid;
+ continue;
+ }
+
+ if (block_num > this_num) {
+ lo = mid + 1;
+ continue;
+ }
+ } while(lo < hi);
+
+ // check if lo and hi converged on the match
+ if (block_num == (*buf_ptr)[hi].block_num) {
+ index = hi;
+ }
+
+ // if no existing entry found, find index for new one
+ if (index == -1) {
+ index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1;
+ } else {
+ // make sure that we return the right-most index in the case of multiple matches
+ matches = 0;
+ i = index + 1;
+ while(i < num_full && block_num == (*buf_ptr)[i].block_num) {
+ matches++;
+ i++;
+ }
+
+ index += matches;
+ }
+
+ return index;
+}
+
+static int
+insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr, int overwriting)
+{
+ if (!overwriting) {
+ // grow the table if we're out of space
+ if (*num_full_ptr >= *num_buckets_ptr) {
+ int new_size = *num_buckets_ptr * 2;
+ int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size);
+
+ if (grow_size < new_size) {
+ printf("jnl: add_block: grow_table returned an error!\n");
+ return -1;
+ }
+
+ *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size
+ }
+
+ // if we're not inserting at the end, we need to bcopy
+ if (blk_index != *num_full_ptr) {
+ bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) );
+ }
+
+ (*num_full_ptr)++; // increment only if we're not overwriting
+ }
+
+ // sanity check the values we're about to add
+ if (offset >= jnl->jhdr->size) {
+ offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size);
+ }
+ if (size <= 0) {
+ panic("jnl: insert_block: bad size in insert_block (%d)\n", size);
+ }
+
+ (*buf_ptr)[blk_index].block_num = num;
+ (*buf_ptr)[blk_index].block_size = size;
+ (*buf_ptr)[blk_index].jnl_offset = offset;
+
+ return blk_index;
+}
+
+static int
+do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr)
+{
+ int num_to_remove, index, i, overwrite, err;
+ size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset;
+ off_t overlap, block_start, block_end;
+
+ block_start = block_num*jhdr_size;
+ block_end = block_start + size;
+ overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size);
+
+ // first, eliminate any overlap with the previous entry
+ if (blk_index != 0 && !overwrite) {
+ off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size;
+ off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size;
+ overlap = prev_block_end - block_start;
+ if (overlap > 0) {
+ if (overlap % jhdr_size != 0) {
+ panic("jnl: do_overlap: overlap with previous entry not a multiple of %d\n", jhdr_size);
+ }
+
+ // if the previous entry completely overlaps this one, we need to break it into two pieces.
+ if (prev_block_end > block_end) {
+ off_t new_num = block_end / jhdr_size;
+ size_t new_size = prev_block_end - block_end;
+
+ new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start);
+
+ err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, num_buckets_ptr, num_full_ptr, 0);
+ if (err < 0) {
+ panic("jnl: do_overlap: error inserting during pre-overlap\n");
+ }
+ }
+
+ // Regardless, we need to truncate the previous entry to the beginning of the overlap
+ (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start;
+ }
+ }
+
+ // then, bail out fast if there's no overlap with the entries that follow
+ if (!overwrite && block_end <= (*buf_ptr)[blk_index].block_num*jhdr_size) {
+ return 0; // no overlap, no overwrite
+ } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (*buf_ptr)[blk_index+1].block_num*jhdr_size)) {
+ return 1; // simple overwrite
+ }
+
+ // Otherwise, find all cases of total and partial overlap. We use the special
+ // block_num of -2 to designate entries that are completely overlapped and must
+ // be eliminated. The block_num, size, and jnl_offset of partially overlapped
+ // entries must be adjusted to keep the array consistent.
+ index = blk_index;
+ num_to_remove = 0;
+ while(index < *num_full_ptr && block_end > (*buf_ptr)[index].block_num*jhdr_size) {
+ if (block_end >= ((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size)) {
+ (*buf_ptr)[index].block_num = -2; // mark this for deletion
+ num_to_remove++;
+ } else {
+ overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size;
+ if (overlap > 0) {
+ if (overlap % jhdr_size != 0) {
+ panic("jnl: do_overlap: overlap of %lld is not multiple of %d\n", overlap, jhdr_size);
+ }
+
+ // if we partially overlap this entry, adjust its block number, jnl offset, and size
+ (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up
+
+ new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around
+ if (new_offset >= jnl->jhdr->size) {
+ new_offset = jhdr_size + (new_offset - jnl->jhdr->size);
+ }
+ (*buf_ptr)[index].jnl_offset = new_offset;
+
+ (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value
+ if ((*buf_ptr)[index].block_size <= 0) {
+ panic("jnl: do_overlap: after overlap, new block size is invalid (%d)\n", (*buf_ptr)[index].block_size);
+ // return -1; // if above panic is removed, return -1 for error
+ }
+ }
+
+ }
+
+ index++;
+ }
+
+ // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out)
+ index--; // start with the last index used within the above loop
+ while(index >= blk_index) {
+ if ((*buf_ptr)[index].block_num == -2) {
+ if (index == *num_full_ptr-1) {
+ (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free
+ } else {
+ bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) );
+ }
+ (*num_full_ptr)--;
+ }
+ index--;
+ }
+
+ // eliminate any stale entries at the end of the table
+ for(i=*num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) {
+ (*buf_ptr)[i].block_num = -1;
+ }
+
+ return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite)
+}
+
+// PR-3105942: Coalesce writes to the same block in journal replay
+// We coalesce writes by maintaining a dynamic sorted array of physical disk blocks
+// to be replayed and the corresponding location in the journal which contains
+// the most recent data for those blocks. The array is "played" once the all the
+// blocks in the journal have been coalesced. The code for the case of conflicting/
+// overlapping writes to a single block is the most dense. Because coalescing can
+// disrupt the existing time-ordering of blocks in the journal playback, care
+// is taken to catch any overlaps and keep the array consistent.
+static int
+add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int *num_buckets_ptr, int *num_full_ptr)
+{
+ int blk_index, overwriting;
+
+ // on return from lookup_bucket(), blk_index is the index into the table where block_num should be
+ // inserted (or the index of the elem to overwrite).
+ blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr);
+
+ // check if the index is within bounds (if we're adding this block to the end of
+ // the table, blk_index will be equal to num_full)
+ if (blk_index < 0 || blk_index > *num_full_ptr) {
+ //printf("jnl: add_block: trouble adding block to co_buf\n");
+ return -1;
+ } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index);
+
+ // Determine whether we're overwriting an existing entry by checking for overlap
+ overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, num_buckets_ptr, num_full_ptr);
+ if (overwriting < 0) {
+ return -1; // if we got an error, pass it along
+ }
+
+ // returns the index, or -1 on error
+ blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, num_buckets_ptr, num_full_ptr, overwriting);
+
+ return blk_index;
+}