- switch (N) {
- case 4: return lz4_nmatch4(a,b);
- case 8: return lz4_nmatch8(a,b);
- case 16: return lz4_nmatch16(a,b);
- case 32: return lz4_nmatch32(a,b);
- case 64: return lz4_nmatch64(a,b);
- }
- __builtin_trap(); // FAIL
+ switch (N) {
+ case 4: return lz4_nmatch4(a, b);
+ case 8: return lz4_nmatch8(a, b);
+ case 16: return lz4_nmatch16(a, b);
+ case 32: return lz4_nmatch32(a, b);
+ case 64: return lz4_nmatch64(a, b);
+ }
+ __builtin_trap(); // FAIL
-static uint8_t *lz4_emit_match(uint32_t L, uint32_t M, uint32_t D,
- uint8_t * restrict dst,
- const uint8_t * const end,
- const uint8_t * restrict src) {
- // The LZ4 encoding scheme requires that M is at least 4, because
- // the actual value stored by the encoding is M - 4. Check this
- // requirement for debug builds.
- assert(M >= 4 && "LZ4 encoding requires that M is at least 4");
- // Having checked that M >= 4, translate M by four.
- M -= 4;
- // Similarly, we must have D < 2**16, because we use only two bytes
- // to represent the value of D in the encoding.
- assert(D <= USHRT_MAX && "LZ4 encoding requries that D can be stored in two bytes.");
- // Construct the command byte by clamping both L and M to 0 ... 15
- // and packing them into a single byte, and store it.
- *dst++ = clamp(L, 15) << 4 | clamp(M, 15);
- // If L is 15 or greater, we need to encode extra literal length bytes.
- if (L >= 15) {
- dst = lz4_store_length(dst, end, L - 15);
- if (dst == 0 || dst + L >= end) return NULL;
- }
- // Copy the literal itself from src to dst.
- dst = copy_literal(dst, src, L);
- // Store match distance.
- store2(dst, D); dst += 2;
- // If M is 15 or greater, we need to encode extra match length bytes.
- if (M >= 15) {
- dst = lz4_store_length(dst, end, M - 15);
- if (dst == 0) return NULL;
- }
- return dst;
+static uint8_t *
+lz4_emit_match(uint32_t L, uint32_t M, uint32_t D,
+ uint8_t * restrict dst,
+ const uint8_t * const end,
+ const uint8_t * restrict src)
+{
+ // The LZ4 encoding scheme requires that M is at least 4, because
+ // the actual value stored by the encoding is M - 4. Check this
+ // requirement for debug builds.
+ assert(M >= 4 && "LZ4 encoding requires that M is at least 4");
+ // Having checked that M >= 4, translate M by four.
+ M -= 4;
+ // Similarly, we must have D < 2**16, because we use only two bytes
+ // to represent the value of D in the encoding.
+	assert(D <= USHRT_MAX && "LZ4 encoding requires that D can be stored in two bytes.");
+ // Construct the command byte by clamping both L and M to 0 ... 15
+ // and packing them into a single byte, and store it.
+ *dst++ = clamp(L, 15) << 4 | clamp(M, 15);
+ // If L is 15 or greater, we need to encode extra literal length bytes.
+ if (L >= 15) {
+ dst = lz4_store_length(dst, end, L - 15);
+ if (dst == 0 || dst + L >= end) {
+ return NULL;
+ }
+ }
+ // Copy the literal itself from src to dst.
+ dst = copy_literal(dst, src, L);
+ // Store match distance.
+ store2(dst, D); dst += 2;
+ // If M is 15 or greater, we need to encode extra match length bytes.
+ if (M >= 15) {
+ dst = lz4_store_length(dst, end, M - 15);
+ if (dst == 0) {
+ return NULL;
+ }
+ }
+ return dst;
-void lz4_encode_2gb(uint8_t ** dst_ptr,
- size_t dst_size,
- const uint8_t ** src_ptr,
- const uint8_t * src_begin,
- size_t src_size,
- lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
- int skip_final_literals)
+void
+lz4_encode_2gb(uint8_t ** dst_ptr,
+ size_t dst_size,
+ const uint8_t ** src_ptr,
+ const uint8_t * src_begin,
+ size_t src_size,
+ lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
+ int skip_final_literals)
- uint8_t *dst = *dst_ptr; // current output stream position
- uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
- const uint8_t *src = *src_ptr; // current input stream literal to encode
- const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN;
- const uint8_t *match_begin = 0; // first byte of matched sequence
- const uint8_t *match_end = 0; // first byte after matched sequence
+ uint8_t *dst = *dst_ptr; // current output stream position
+ uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
+ const uint8_t *src = *src_ptr; // current input stream literal to encode
+ const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN;
+ const uint8_t *match_begin = 0; // first byte of matched sequence
+ const uint8_t *match_end = 0; // first byte after matched sequence
-
- while (dst < end)
- {
- ptrdiff_t match_distance = 0;
- for (match_begin = src; match_begin < src_end; match_begin += 1) {
- const uint32_t pos = (uint32_t)(match_begin - src_begin);
- const uint32_t w0 = load4(match_begin);
- const uint32_t w1 = load4(match_begin + 1);
- const uint32_t w2 = load4(match_begin + 2);
- const uint32_t w3 = load4(match_begin + 3);
- const int i0 = lz4_hash(w0);
- const int i1 = lz4_hash(w1);
- const int i2 = lz4_hash(w2);
- const int i3 = lz4_hash(w3);
- const uint8_t *c0 = src_begin + hash_table[i0].offset;
- const uint8_t *c1 = src_begin + hash_table[i1].offset;
- const uint8_t *c2 = src_begin + hash_table[i2].offset;
- const uint8_t *c3 = src_begin + hash_table[i3].offset;
- const uint32_t m0 = hash_table[i0].word;
- const uint32_t m1 = hash_table[i1].word;
- const uint32_t m2 = hash_table[i2].word;
- const uint32_t m3 = hash_table[i3].word;
- hash_table[i0].offset = pos;
- hash_table[i0].word = w0;
- hash_table[i1].offset = pos + 1;
- hash_table[i1].word = w1;
-
- hash_table[i2].offset = pos + 2;
- hash_table[i2].word = w2;
- hash_table[i3].offset = pos + 3;
- hash_table[i3].word = w3;
-
- match_distance = (match_begin - c0);
- if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) {
- match_end = match_begin + 4;
- goto EXPAND_FORWARD;
- }
-
- match_begin++;
- match_distance = (match_begin - c1);
- if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) {
- match_end = match_begin + 4;
- goto EXPAND_FORWARD;
- }
-
- match_begin++;
- match_distance = (match_begin - c2);
- if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) {
- match_end = match_begin + 4;
- goto EXPAND_FORWARD;
- }
-
- match_begin++;
- match_distance = (match_begin - c3);
- if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) {
- match_end = match_begin + 4;
- goto EXPAND_FORWARD;
- }
+
+ while (dst < end) {
+ ptrdiff_t match_distance = 0;
+ for (match_begin = src; match_begin < src_end; match_begin += 1) {
+ const uint32_t pos = (uint32_t)(match_begin - src_begin);
+ const uint32_t w0 = load4(match_begin);
+ const uint32_t w1 = load4(match_begin + 1);
+ const uint32_t w2 = load4(match_begin + 2);
+ const uint32_t w3 = load4(match_begin + 3);
+ const int i0 = lz4_hash(w0);
+ const int i1 = lz4_hash(w1);
+ const int i2 = lz4_hash(w2);
+ const int i3 = lz4_hash(w3);
+ const uint8_t *c0 = src_begin + hash_table[i0].offset;
+ const uint8_t *c1 = src_begin + hash_table[i1].offset;
+ const uint8_t *c2 = src_begin + hash_table[i2].offset;
+ const uint8_t *c3 = src_begin + hash_table[i3].offset;
+ const uint32_t m0 = hash_table[i0].word;
+ const uint32_t m1 = hash_table[i1].word;
+ const uint32_t m2 = hash_table[i2].word;
+ const uint32_t m3 = hash_table[i3].word;
+ hash_table[i0].offset = pos;
+ hash_table[i0].word = w0;
+ hash_table[i1].offset = pos + 1;
+ hash_table[i1].word = w1;
+
+ hash_table[i2].offset = pos + 2;
+ hash_table[i2].word = w2;
+ hash_table[i3].offset = pos + 3;
+ hash_table[i3].word = w3;
+
+ match_distance = (match_begin - c0);
+ if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) {
+ match_end = match_begin + 4;
+ goto EXPAND_FORWARD;
+ }
+
+ match_begin++;
+ match_distance = (match_begin - c1);
+ if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) {
+ match_end = match_begin + 4;
+ goto EXPAND_FORWARD;
+ }
+
+ match_begin++;
+ match_distance = (match_begin - c2);
+ if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) {
+ match_end = match_begin + 4;
+ goto EXPAND_FORWARD;
+ }
+
+ match_begin++;
+ match_distance = (match_begin - c3);
+ if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) {
+ match_end = match_begin + 4;
+ goto EXPAND_FORWARD;
+ }
- //DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
- if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) {
- ptrdiff_t dstd = dst - dst_begin;
-
- if (dstd == 0) {
- lz4_early_aborts++;
- return;
- }
-
-/* if (dstd >= pos) { */
-/* return; */
-/* } */
-/* ptrdiff_t cbytes = pos - dstd; */
-/* if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos) { */
-/* return; */
-/* } */
- lz4_do_abort_eval = 0;
- }
+ //DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
+ if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) {
+ ptrdiff_t dstd = dst - dst_begin;
+
+ if (dstd == 0) {
+ lz4_early_aborts++;
+ return;
+ }
+
+/* if (dstd >= pos) { */
+/* return; */
+/* } */
+/* ptrdiff_t cbytes = pos - dstd; */
+/* if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos) { */
+/* return; */
+/* } */
+ lz4_do_abort_eval = 0;
+ }
- }
-
- if (skip_final_literals) { *src_ptr = src; *dst_ptr = dst; return; } // do not emit the final literal sequence
-
- // Emit a trailing literal that covers the remainder of the source buffer,
- // if we can do so without exceeding the bounds of the destination buffer.
- size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src;
- if (src_remaining < 15) {
- *dst++ = (uint8_t)(src_remaining << 4);
- memcpy(dst, src, 16); dst += src_remaining;
- } else {
- *dst++ = 0xf0;
- dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15));
- if (dst == 0 || dst + src_remaining >= end) return;
- memcpy(dst, src, src_remaining); dst += src_remaining;
- }
- *dst_ptr = dst;
- *src_ptr = src + src_remaining;
- return;
-
- EXPAND_FORWARD:
-
- // Expand match forward
- {
- const uint8_t * ref_end = match_end - match_distance;
- while (match_end < src_end)
- {
- size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end);
- if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) { match_end += n; break; }
- match_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
- ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
- }
- }
-
- // Expand match backward
- {
- // match_begin_min = max(src_begin + match_distance,literal)
- const uint8_t * match_begin_min = src_begin + match_distance;
- match_begin_min = (match_begin_min < src)?src:match_begin_min;
- const uint8_t * ref_begin = match_begin - match_distance;
-
- while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1] ) { match_begin -= 1; ref_begin -= 1; }
- }
-
- // Emit match
- dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src);
- if (!dst) return;
-
- // Update state
- src = match_end;
-
- // Update return values to include the last fully encoded match
- *dst_ptr = dst;
- *src_ptr = src;
- }
+ }
+
+ if (skip_final_literals) {
+ *src_ptr = src; *dst_ptr = dst; return;
+ } // do not emit the final literal sequence
+
+ // Emit a trailing literal that covers the remainder of the source buffer,
+ // if we can do so without exceeding the bounds of the destination buffer.
+ size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src;
+ if (src_remaining < 15) {
+ *dst++ = (uint8_t)(src_remaining << 4);
+ memcpy(dst, src, 16); dst += src_remaining;
+ } else {
+ *dst++ = 0xf0;
+ dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15));
+ if (dst == 0 || dst + src_remaining >= end) {
+ return;
+ }
+ memcpy(dst, src, src_remaining); dst += src_remaining;
+ }
+ *dst_ptr = dst;
+ *src_ptr = src + src_remaining;
+ return;
+
+EXPAND_FORWARD:
+
+ // Expand match forward
+ {
+ const uint8_t * ref_end = match_end - match_distance;
+ while (match_end < src_end) {
+ size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end);
+ if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) {
+ match_end += n; break;
+ }
+ match_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
+ ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
+ }
+ }
+
+ // Expand match backward
+ {
+		// match_begin_min = max(src_begin + match_distance, src)
+ const uint8_t * match_begin_min = src_begin + match_distance;
+ match_begin_min = (match_begin_min < src)?src:match_begin_min;
+ const uint8_t * ref_begin = match_begin - match_distance;
+
+ while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1]) {
+ match_begin -= 1; ref_begin -= 1;
+ }
+ }
+
+ // Emit match
+ dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src);
+ if (!dst) {
+ return;
+ }
+
+ // Update state
+ src = match_end;
+
+ // Update return values to include the last fully encoded match
+ *dst_ptr = dst;
+ *src_ptr = src;
+ }
- // Initialize hash table
- const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 };
-
- const uint8_t * src = src_buffer;
- uint8_t * dst = dst_buffer;
-
- // We need several blocks because our base function is limited to 2GB input
- const size_t BLOCK_SIZE = 0x7ffff000;
- while (src_size > 0)
- {
- //DRKTODO either implement pattern4 or figure out optimal unroll
- //DRKTODO: bizarrely, with plain O3 the compiler generates a single
- //DRKTODO: scalar STP per loop iteration with the stock loop
- //DRKTODO If hand unrolled, it switches to NEON store pairs
- // Reset hash table for each block
+ // Initialize hash table
+ const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 };
+
+ const uint8_t * src = src_buffer;
+ uint8_t * dst = dst_buffer;
+
+ // We need several blocks because our base function is limited to 2GB input
+ const size_t BLOCK_SIZE = 0x7ffff000;
+ while (src_size > 0) {
+ //DRKTODO either implement pattern4 or figure out optimal unroll
+ //DRKTODO: bizarrely, with plain O3 the compiler generates a single
+ //DRKTODO: scalar STP per loop iteration with the stock loop
+ //DRKTODO If hand unrolled, it switches to NEON store pairs
+ // Reset hash table for each block
- for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;) {
- hash_table[i++] = HASH_FILL;
- hash_table[i++] = HASH_FILL;
- hash_table[i++] = HASH_FILL;
- hash_table[i++] = HASH_FILL;
- }
-
- // Bytes to encode in this block
- const size_t src_to_encode = src_size > BLOCK_SIZE ? BLOCK_SIZE : src_size;
-
- // Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads.
- // Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer
- uint8_t * dst_start = dst;
- const uint8_t * src_start = src;
- lz4_encode_2gb(&dst, dst_size, &src, src, src_to_encode, hash_table, src_to_encode < src_size);
-
- // Check progress
- size_t dst_used = dst - dst_start;
- size_t src_used = src - src_start; // src_used <= src_to_encode
- if (src_to_encode == src_size && src_used < src_to_encode) return 0; // FAIL to encode last block
-
- // Note that there is a potential problem here in case of non compressible data requiring more blocks.
- // We may end up here with src_used very small, or even 0, and will not be able to make progress during
- // compression. We FAIL unless the length of literals remaining at the end is small enough.
- if (src_to_encode < src_size && src_to_encode - src_used >= (1<<16)) return 0; // FAIL too many literals
-
- // Update counters (SRC and DST already have been updated)
- src_size -= src_used;
- dst_size -= dst_used;
- }
-
- return (size_t)(dst - dst_buffer); // bytes produced
+ for (int i = 0; i < LZ4_COMPRESS_HASH_ENTRIES;) {
+ hash_table[i++] = HASH_FILL;
+ hash_table[i++] = HASH_FILL;
+ hash_table[i++] = HASH_FILL;
+ hash_table[i++] = HASH_FILL;
+ }
+
+ // Bytes to encode in this block
+ const size_t src_to_encode = src_size > BLOCK_SIZE ? BLOCK_SIZE : src_size;
+
+ // Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads.
+ // Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer
+ uint8_t * dst_start = dst;
+ const uint8_t * src_start = src;
+ lz4_encode_2gb(&dst, dst_size, &src, src, src_to_encode, hash_table, src_to_encode < src_size);
+
+ // Check progress
+ size_t dst_used = dst - dst_start;
+ size_t src_used = src - src_start; // src_used <= src_to_encode
+ if (src_to_encode == src_size && src_used < src_to_encode) {
+ return 0; // FAIL to encode last block
+ }
+ // Note that there is a potential problem here in case of non compressible data requiring more blocks.
+ // We may end up here with src_used very small, or even 0, and will not be able to make progress during
+ // compression. We FAIL unless the length of literals remaining at the end is small enough.
+ if (src_to_encode < src_size && src_to_encode - src_used >= (1 << 16)) {
+ return 0; // FAIL too many literals
+ }
+ // Update counters (SRC and DST already have been updated)
+ src_size -= src_used;
+ dst_size -= dst_used;
+ }
+
+ return (size_t)(dst - dst_buffer); // bytes produced
- uint8_t * dst = *dst_ptr;
- const uint8_t * src = *src_ptr;
-
- // Require dst_end > dst.
- if (dst_end <= dst) goto OUT_FULL;
-
- while (src < src_end)
- {
- // Keep last good position
- *src_ptr = src;
- *dst_ptr = dst;
-
- uint8_t cmd = *src++; // 1 byte encoding literal+(match-4) length: LLLLMMMM
- uint32_t literalLength = (cmd >> 4) & 15; // 0..15
- uint32_t matchLength = 4 + (cmd & 15); // 4..19
-
- // extra bytes for literalLength
- if (unlikely(literalLength == 15))
- {
- uint8_t s;
- do {
+ uint8_t * dst = *dst_ptr;
+ const uint8_t * src = *src_ptr;
+
+ // Require dst_end > dst.
+ if (dst_end <= dst) {
+ goto OUT_FULL;
+ }
+
+ while (src < src_end) {
+ // Keep last good position
+ *src_ptr = src;
+ *dst_ptr = dst;
+
+ uint8_t cmd = *src++; // 1 byte encoding literal+(match-4) length: LLLLMMMM
+ uint32_t literalLength = (cmd >> 4) & 15; // 0..15
+ uint32_t matchLength = 4 + (cmd & 15); // 4..19
+
+ // extra bytes for literalLength
+ if (__improbable(literalLength == 15)) {
+ uint8_t s;
+ do {
- if (unlikely(literalLength > (size_t)(src_end - src))) goto IN_FAIL;
- if (unlikely(literalLength > (size_t)(dst_end - dst))) {
- // literal will take us past the end of the destination buffer,
- // so we can only copy part of it.
- literalLength = (uint32_t)(dst_end - dst);
- memcpy(dst, src, literalLength);
- dst += literalLength;
- goto OUT_FULL;
- }
- memcpy(dst,src,literalLength);
- src += literalLength;
- dst += literalLength;
-
- if (unlikely(src >= src_end)) goto OUT_FULL; // valid end of stream
+ if (__improbable(literalLength > (size_t)(src_end - src))) {
+ goto IN_FAIL;
+ }
+ if (__improbable(literalLength > (size_t)(dst_end - dst))) {
+ // literal will take us past the end of the destination buffer,
+ // so we can only copy part of it.
+ literalLength = (uint32_t)(dst_end - dst);
+ memcpy(dst, src, literalLength);
+ dst += literalLength;
+ goto OUT_FULL;
+ }
+ memcpy(dst, src, literalLength);
+ src += literalLength;
+ dst += literalLength;
+
+ if (__improbable(src >= src_end)) {
+ goto OUT_FULL; // valid end of stream
+ }
- if (unlikely(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed)
- s = *src++;
- matchLength += s;
- } while (unlikely(s == 255));
- }
-
- // copy match (may overlap)
- if (unlikely(matchLength > (size_t)(dst_end - dst))) {
- // match will take us past the end of the destination buffer,
- // so we can only copy part of it.
- matchLength = (uint32_t)(dst_end - dst);
- for (uint32_t i=0; i<matchLength; ++i) dst[i] = ref[i];
- dst += matchLength;
- goto OUT_FULL;
- }
- for (uint64_t i=0;i<matchLength;i++) dst[i] = ref[i];
- dst += matchLength;
- }
-
- // We reached the end of the input buffer after a full instruction
+ if (__improbable(src >= src_end)) {
+ goto IN_FAIL; // unexpected end of input (1 byte needed)
+ }
+ s = *src++;
+ matchLength += s;
+ } while (__improbable(s == 255));
+ }
+
+ // copy match (may overlap)
+ if (__improbable(matchLength > (size_t)(dst_end - dst))) {
+ // match will take us past the end of the destination buffer,
+ // so we can only copy part of it.
+ matchLength = (uint32_t)(dst_end - dst);
+ for (uint32_t i = 0; i < matchLength; ++i) {
+ dst[i] = ref[i];
+ }
+ dst += matchLength;
+ goto OUT_FULL;
+ }
+ for (uint64_t i = 0; i < matchLength; i++) {
+ dst[i] = ref[i];
+ }
+ dst += matchLength;
+ }
+
+ // We reached the end of the input buffer after a full instruction