osfmk/vm/lz4.c

   1 /*
   2  * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 // LZ4_RAW buffer API
  30 // EB May 2015
  31 // Mar 2016 Imported from the Compression project, with minor optimisations and
  32 // early abort detection (Derek Kumar)
  33
  34 #include "lz4.h"
  35
  36 size_t lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
  37                             const uint8_t * __restrict src_buffer, size_t src_size,
  38                             void * __restrict work __attribute__((unused)))
  39 {
  40   const uint8_t * src = src_buffer;
  41   uint8_t * dst = dst_buffer;
  42
  43   // Go fast if we can, keeping away from the end of buffers
  44 #if LZ4_ENABLE_ASSEMBLY_DECODE
  45   if (dst_size > LZ4_GOFAST_SAFETY_MARGIN && src_size > LZ4_GOFAST_SAFETY_MARGIN)
  46   {
  47     if (lz4_decode_asm(&dst, dst_buffer, dst_buffer + dst_size - LZ4_GOFAST_SAFETY_MARGIN, &src, src_buffer + src_size - LZ4_GOFAST_SAFETY_MARGIN))
  48       return 0; // FAIL
  49   }
  50 #endif
  51 //DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers?
  52
  53   // Finish safe
  54   if (lz4_decode(&dst, dst_buffer, dst_buffer + dst_size, &src, src_buffer + src_size))
  55     return 0; // FAIL
  56
  57   return (size_t)(dst - dst_buffer); // bytes produced
  58 }
  59 // Debug flags
  60 #if LZ4DEBUG
  61 #define DEBUG_LZ4_ENCODE_ERRORS (1)
  62 #define DEBUG_LZ4_DECODE_ERRORS (1)
  63 #endif
  64
  65 #if DEBUG_LZ4_ENCODE_ERRORS
  66 #endif
  67
  68 #if !LZ4_ENABLE_ASSEMBLY_ENCODE
  69
  70 #if defined(__x86_64__) || defined(__x86_64h__)
  71 # define LZ4_MATCH_SEARCH_INIT_SIZE 32
  72 # define LZ4_MATCH_SEARCH_LOOP_SIZE 32
  73 #else
  74 # define LZ4_MATCH_SEARCH_INIT_SIZE 8
  75 # define LZ4_MATCH_SEARCH_LOOP_SIZE 8
  76 #endif
  77
  78 // Return hash for 4-byte sequence X
  79 static inline uint32_t lz4_hash(uint32_t x) { return (x * 2654435761U) >> (32 - LZ4_COMPRESS_HASH_BITS); }
  80
  81 // Store 0xfff..fff at *PTR
  82 static inline void lz4_fill16(uint8_t * ptr)
  83 {
  84   store8(ptr,-1);
  85   store8(ptr+8,-1);
  86 }
  87
  88 // Return number of matching bytes 0..4 at positions A and B.
  89 static inline size_t lz4_nmatch4(const uint8_t * a,const uint8_t * b)
  90 {
  91   uint32_t x = load4(a) ^ load4(b);
  92   return (x == 0)?4:(__builtin_ctzl(x) >> 3);
  93 }
  94
  95 // Return number of matching bytes 0..8 at positions A and B.
  96 static inline size_t lz4_nmatch8(const uint8_t * a,const uint8_t * b)
  97 {
  98   uint64_t x = load8(a) ^ load8(b);
  99   return (x == 0)?8:(__builtin_ctzll(x) >> 3);
 100 }
 101
 102 // Return number of matching bytes 0..16 at positions A and B.
 103 static inline size_t lz4_nmatch16(const uint8_t * a,const uint8_t * b)
 104 {
 105   size_t n = lz4_nmatch8(a,b);
 106   return (n == 8)?(8 + lz4_nmatch8(a+8,b+8)):n;
 107 }
 108
 109 // Return number of matching bytes 0..32 at positions A and B.
 110 static inline size_t lz4_nmatch32(const uint8_t * a,const uint8_t * b)
 111 {
 112   size_t n = lz4_nmatch16(a,b);
 113   return (n == 16)?(16 + lz4_nmatch16(a+16,b+16)):n;
 114 }
 115
 116 // Return number of matching bytes 0..64 at positions A and B.
 117 static inline size_t lz4_nmatch64(const uint8_t * a,const uint8_t * b)
 118 {
 119   size_t n = lz4_nmatch32(a,b);
 120   return (n == 32)?(32 + lz4_nmatch32(a+32,b+32)):n;
 121 }
 122
 123 // Compile-time selection, return number of matching bytes 0..N at positions A and B.
 124 static inline size_t lz4_nmatch(int N, const uint8_t * a, const uint8_t * b)
 125 {
 126   switch (N) {
 127     case  4: return lz4_nmatch4(a,b);
 128     case  8: return lz4_nmatch8(a,b);
 129     case 16: return lz4_nmatch16(a,b);
 130     case 32: return lz4_nmatch32(a,b);
 131     case 64: return lz4_nmatch64(a,b);
 132   }
 133   __builtin_trap(); // FAIL
 134 }
 135
 136 // Store LENGTH in DST using the literal_length/match_length extension scheme: X is the sum of all bytes until we reach a byte < 0xff.
 137 // We are allowed to access a constant number of bytes above DST_END.
 138 // Return incremented DST pointer on success, and 0 on failure
 139 static inline uint8_t *lz4_store_length(uint8_t * dst, const uint8_t * const end, uint32_t L) {
 140         (void)end;
 141   while (L >= 17*255) {
 142     lz4_fill16(dst);
 143     dst += 16;
 144     L -= 16*255;
 145   }
 146   lz4_fill16(dst);
 147   //DRKTODO verify these modulos/divisions are optimally handled by clang
 148   dst += L/255;
 149   *dst++ = L%255;
 150   return dst;
 151 }
 152
 153 static inline uint32_t clamp(uint32_t x, uint32_t max) __attribute__((overloadable)) { return x > max ? max : x; }
 154
 155 static inline uint8_t *copy_literal(uint8_t *dst, const uint8_t * restrict src, uint32_t L) {
 156   uint8_t *end = dst + L;
 157   { copy16(dst, src); dst += 16; src += 16; }
 158   while (dst < end) { copy32(dst, src); dst += 32; src += 32; }
 159   return end;
 160 }
 161
 162 static uint8_t *lz4_emit_match(uint32_t L, uint32_t M, uint32_t D,
 163                                uint8_t * restrict dst,
 164                                const uint8_t * const end,
 165                                const uint8_t * restrict src) {
 166   //  The LZ4 encoding scheme requires that M is at least 4, because
 167   //  the actual value stored by the encoding is M - 4.  Check this
 168   //  requirement for debug builds.
 169   assert(M >= 4 && "LZ4 encoding requires that M is at least 4");
 170   //  Having checked that M >= 4, translate M by four.
 171   M -= 4;
 172   //  Similarly, we must have D < 2**16, because we use only two bytes
 173   //  to represent the value of D in the encoding.
 174   assert(D <= USHRT_MAX && "LZ4 encoding requries that D can be stored in two bytes.");
 175   //  Construct the command byte by clamping both L and M to 0 ... 15
 176   //  and packing them into a single byte, and store it.
 177   *dst++ = clamp(L, 15) << 4 | clamp(M, 15);
 178   //  If L is 15 or greater, we need to encode extra literal length bytes.
 179   if (L >= 15) {
 180     dst = lz4_store_length(dst, end, L - 15);
 181     if (dst == 0 || dst + L >= end) return NULL;
 182   }
 183   //  Copy the literal itself from src to dst.
 184   dst = copy_literal(dst, src, L);
 185   //  Store match distance.
 186   store2(dst, D); dst += 2;
 187   //  If M is 15 or greater, we need to encode extra match length bytes.
 188   if (M >= 15) {
 189     dst = lz4_store_length(dst, end, M - 15);
 190     if (dst == 0) return NULL;
 191   }
 192   return dst;
 193 }
 194
 195 /* #ifndef LZ4_EARLY_ABORT */
 196 /* #define LZ4_EARLY_ABORT (1) */
 197 /* #endif */
 198
 199 #if LZ4_EARLY_ABORT
 200 int lz4_do_early_abort = 1;
 201 int lz4_early_aborts = 0;
 202 #define LZ4_EARLY_ABORT_EVAL (448)
 203 #define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20)
 204 #endif /* LZ4_EARLY_ABORT */
 205
 206 void lz4_encode_2gb(uint8_t ** dst_ptr,
 207                     size_t dst_size,
 208                     const uint8_t ** src_ptr,
 209                     const uint8_t * src_begin,
 210                     size_t src_size,
 211                     lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
 212                     int skip_final_literals)
 213 {
 214   uint8_t *dst = *dst_ptr;        // current output stream position
 215   uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
 216   const uint8_t *src = *src_ptr;  // current input stream literal to encode
 217   const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN;
 218   const uint8_t *match_begin = 0; // first byte of matched sequence
 219   const uint8_t *match_end = 0;   // first byte after matched sequence
 220 #if LZ4_EARLY_ABORT
 221   uint8_t * const dst_begin = dst;
 222   uint32_t lz4_do_abort_eval = lz4_do_early_abort;
 223 #endif
 224
 225   while (dst < end)
 226   {
 227     ptrdiff_t match_distance = 0;
 228     for (match_begin = src; match_begin < src_end; match_begin += 1) {
 229       const uint32_t pos = (uint32_t)(match_begin - src_begin);
 230       const uint32_t w0 = load4(match_begin);
 231       const uint32_t w1 = load4(match_begin + 1);
 232       const uint32_t w2 = load4(match_begin + 2);
 233       const uint32_t w3 = load4(match_begin + 3);
 234       const int i0 = lz4_hash(w0);
 235       const int i1 = lz4_hash(w1);
 236       const int i2 = lz4_hash(w2);
 237       const int i3 = lz4_hash(w3);
 238       const uint8_t *c0 = src_begin + hash_table[i0].offset;
 239       const uint8_t *c1 = src_begin + hash_table[i1].offset;
 240       const uint8_t *c2 = src_begin + hash_table[i2].offset;
 241       const uint8_t *c3 = src_begin + hash_table[i3].offset;
 242       const uint32_t m0 = hash_table[i0].word;
 243       const uint32_t m1 = hash_table[i1].word;
 244       const uint32_t m2 = hash_table[i2].word;
 245       const uint32_t m3 = hash_table[i3].word;
 246       hash_table[i0].offset = pos;
 247       hash_table[i0].word = w0;
 248       hash_table[i1].offset = pos + 1;
 249       hash_table[i1].word = w1;
 250
 251       hash_table[i2].offset = pos + 2;
 252       hash_table[i2].word = w2;
 253       hash_table[i3].offset = pos + 3;
 254       hash_table[i3].word = w3;
 255
 256       match_distance = (match_begin - c0);
 257       if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) {
 258         match_end = match_begin + 4;
 259         goto EXPAND_FORWARD;
 260       }
 261
 262       match_begin++;
 263       match_distance = (match_begin - c1);
 264       if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) {
 265         match_end = match_begin + 4;
 266         goto EXPAND_FORWARD;
 267       }
 268
 269       match_begin++;
 270       match_distance = (match_begin - c2);
 271       if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) {
 272         match_end = match_begin + 4;
 273         goto EXPAND_FORWARD;
 274       }
 275
 276       match_begin++;
 277       match_distance = (match_begin - c3);
 278       if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) {
 279         match_end = match_begin + 4;
 280         goto EXPAND_FORWARD;
 281       }
 282
 283 #if LZ4_EARLY_ABORT
 284       //DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
 285       if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) {
 286               ptrdiff_t dstd = dst - dst_begin;
 287
 288               if (dstd == 0) {
 289                       lz4_early_aborts++;
 290                       return;
 291               }
 292
 293 /*            if (dstd >= pos) { */
 294 /*                    return; */
 295 /*            } */
 296 /*            ptrdiff_t cbytes = pos - dstd; */
 297 /*            if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos)  { */
 298 /*                    return; */
 299 /*            } */
 300               lz4_do_abort_eval = 0;
 301       }
 302 #endif
 303     }
 304
 305     if (skip_final_literals) { *src_ptr = src; *dst_ptr = dst; return; } // do not emit the final literal sequence
 306
 307     //  Emit a trailing literal that covers the remainder of the source buffer,
 308     //  if we can do so without exceeding the bounds of the destination buffer.
 309     size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src;
 310     if (src_remaining < 15) {
 311       *dst++ = (uint8_t)(src_remaining << 4);
 312       memcpy(dst, src, 16); dst += src_remaining;
 313     } else {
 314       *dst++ = 0xf0;
 315       dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15));
 316       if (dst == 0 || dst + src_remaining >= end) return;
 317       memcpy(dst, src, src_remaining); dst += src_remaining;
 318     }
 319     *dst_ptr = dst;
 320     *src_ptr = src + src_remaining;
 321     return;
 322
 323   EXPAND_FORWARD:
 324
 325     // Expand match forward
 326     {
 327       const uint8_t * ref_end = match_end - match_distance;
 328       while (match_end < src_end)
 329       {
 330         size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end);
 331         if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) { match_end += n; break; }
 332         match_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
 333         ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
 334       }
 335     }
 336
 337     // Expand match backward
 338     {
 339       // match_begin_min = max(src_begin + match_distance,literal)
 340       const uint8_t * match_begin_min = src_begin + match_distance;
 341       match_begin_min = (match_begin_min < src)?src:match_begin_min;
 342       const uint8_t * ref_begin = match_begin - match_distance;
 343
 344       while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1] ) { match_begin -= 1; ref_begin -= 1; }
 345     }
 346
 347     // Emit match
 348     dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src);
 349     if (!dst) return;
 350
 351     // Update state
 352     src = match_end;
 353
 354     // Update return values to include the last fully encoded match
 355     *dst_ptr = dst;
 356     *src_ptr = src;
 357   }
 358 }
 359
 360 #endif
 361
 362 size_t lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
 363                             const uint8_t * __restrict src_buffer, size_t src_size,
 364                             lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES])
 365 {
 366   //  Initialize hash table
 367   const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 };
 368
 369   const uint8_t * src = src_buffer;
 370   uint8_t * dst = dst_buffer;
 371
 372   // We need several blocks because our base function is limited to 2GB input
 373   const size_t BLOCK_SIZE = 0x7ffff000;
 374   while (src_size > 0)
 375   {
 376           //DRKTODO either implement pattern4 or figure out optimal unroll
 377           //DRKTODO: bizarrely, with plain O3 the compiler generates a single
 378           //DRKTODO: scalar STP per loop iteration with the stock loop
 379           //DRKTODO If hand unrolled, it switches to NEON store pairs
 380     // Reset hash table for each block
 381 /* #if __STDC_HOSTED__ */
 382 /*     memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); */
 383 /* #else */
 384 /*     for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;i++) hash_table[i] = HASH_FILL; */
 385 /* #endif */
 386
 387           for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;) {
 388                   hash_table[i++] = HASH_FILL;
 389                   hash_table[i++] = HASH_FILL;
 390                   hash_table[i++] = HASH_FILL;
 391                   hash_table[i++] = HASH_FILL;
 392           }
 393
 394     // Bytes to encode in this block
 395     const size_t src_to_encode = src_size > BLOCK_SIZE ? BLOCK_SIZE : src_size;
 396
 397     // Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads.
 398     // Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer
 399     uint8_t * dst_start = dst;
 400     const uint8_t * src_start = src;
 401     lz4_encode_2gb(&dst, dst_size, &src, src, src_to_encode, hash_table, src_to_encode < src_size);
 402
 403     // Check progress
 404     size_t dst_used = dst - dst_start;
 405     size_t src_used = src - src_start; // src_used <= src_to_encode
 406     if (src_to_encode == src_size && src_used < src_to_encode) return 0; // FAIL to encode last block
 407
 408     // Note that there is a potential problem here in case of non compressible data requiring more blocks.
 409     // We may end up here with src_used very small, or even 0, and will not be able to make progress during
 410     // compression. We FAIL unless the length of literals remaining at the end is small enough.
 411     if (src_to_encode < src_size && src_to_encode - src_used >= (1<<16)) return 0; // FAIL too many literals
 412
 413     // Update counters (SRC and DST already have been updated)
 414     src_size -= src_used;
 415     dst_size -= dst_used;
 416   }
 417
 418   return (size_t)(dst - dst_buffer); // bytes produced
 419 }
 420
 421 #define likely(expr)     __builtin_expect((expr) != 0, 1)
 422 #define unlikely(expr)   __builtin_expect((expr) != 0, 0)
 423 typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4))) __attribute__((__aligned__(1)));
 424
 425 int lz4_decode(uint8_t ** dst_ptr,
 426                     uint8_t * dst_begin,
 427                     uint8_t * dst_end,
 428                     const uint8_t ** src_ptr,
 429                     const uint8_t * src_end)
 430 {
 431     uint8_t * dst = *dst_ptr;
 432     const uint8_t * src = *src_ptr;
 433
 434     //  Require dst_end > dst.
 435     if (dst_end <= dst) goto OUT_FULL;
 436
 437     while (src < src_end)
 438     {
 439         // Keep last good position
 440         *src_ptr = src;
 441         *dst_ptr = dst;
 442
 443         uint8_t cmd = *src++;                                                // 1 byte encoding literal+(match-4) length: LLLLMMMM
 444         uint32_t literalLength = (cmd >> 4) & 15; // 0..15
 445         uint32_t matchLength = 4 + (cmd & 15); // 4..19
 446
 447         // extra bytes for literalLength
 448         if (unlikely(literalLength == 15))
 449         {
 450             uint8_t s;
 451             do {
 452 #if DEBUG_LZ4_DECODE_ERRORS
 453                 if (unlikely(src >= src_end)) printf("Truncated SRC literal length\n");
 454 #endif
 455                 if (unlikely(src >= src_end)) goto IN_FAIL;         // unexpected end of input (1 byte needed)
 456                 s = *src++;
 457                 literalLength += s;
 458             } while (unlikely(s == 255));
 459         }
 460
 461         // copy literal
 462 #if DEBUG_LZ4_DECODE_ERRORS
 463         if (unlikely(literalLength > (size_t)(src_end - src))) printf("Truncated SRC literal\n");
 464 #endif
 465         if (unlikely(literalLength > (size_t)(src_end - src))) goto IN_FAIL;
 466         if (unlikely(literalLength > (size_t)(dst_end - dst))) {
 467             //  literal will take us past the end of the destination buffer,
 468             //  so we can only copy part of it.
 469             literalLength = (uint32_t)(dst_end - dst);
 470             memcpy(dst, src, literalLength);
 471             dst += literalLength;
 472             goto OUT_FULL;
 473         }
 474         memcpy(dst,src,literalLength);
 475         src += literalLength;
 476         dst += literalLength;
 477
 478         if (unlikely(src >= src_end)) goto OUT_FULL;                // valid end of stream
 479 #if DEBUG_LZ4_DECODE_ERRORS
 480         if (unlikely(2 > (size_t)(src_end - src))) printf("Truncated SRC distance\n");
 481 #endif
 482         if (unlikely(2 > (size_t)(src_end - src))) goto IN_FAIL;    // unexpected end of input (2 bytes needed)
 483
 484         //DRKTODO: this causes an alignment increase warning (legitimate?)
 485         //DRKTODO: cast of char * to uint16_t*
 486         #pragma clang diagnostic push
 487         #pragma clang diagnostic ignored "-Wcast-align"
 488
 489         // match distance
 490         uint64_t matchDistance = *(const uint16_t *)src;                     // 0x0000 <= matchDistance <= 0xffff
 491         #pragma clang diagnostic pop
 492         src += 2;
 493 #if DEBUG_LZ4_DECODE_ERRORS
 494         if (matchDistance == 0) printf("Invalid match distance D = 0\n");
 495 #endif
 496         if (unlikely(matchDistance == 0)) goto IN_FAIL;                      // 0x0000 invalid
 497         uint8_t * ref = dst - matchDistance;
 498 #if DEBUG_LZ4_DECODE_ERRORS
 499         if (unlikely(ref < dst_begin)) printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n",matchDistance,dst_begin,dst,dst_end);
 500 #endif
 501         if (unlikely(ref < dst_begin)) goto OUT_FAIL;                        // out of range
 502
 503         // extra bytes for matchLength
 504         if (unlikely(matchLength == 19))
 505         {
 506             uint8_t s;
 507             do {
 508 #if DEBUG_LZ4_DECODE_ERRORS
 509                 if (unlikely(src >= src_end)) printf("Truncated SRC match length\n");
 510 #endif
 511                 if (unlikely(src >= src_end)) goto IN_FAIL;                      // unexpected end of input (1 byte needed)
 512                 s = *src++;
 513                 matchLength += s;
 514             } while (unlikely(s == 255));
 515         }
 516
 517         // copy match (may overlap)
 518         if (unlikely(matchLength > (size_t)(dst_end - dst))) {
 519             //  match will take us past the end of the destination buffer,
 520             //  so we can only copy part of it.
 521             matchLength = (uint32_t)(dst_end - dst);
 522             for (uint32_t i=0; i<matchLength; ++i) dst[i] = ref[i];
 523             dst += matchLength;
 524             goto OUT_FULL;
 525         }
 526         for (uint64_t i=0;i<matchLength;i++) dst[i] = ref[i];
 527         dst += matchLength;
 528     }
 529
 530     //  We reached the end of the input buffer after a full instruction
 531 OUT_FULL:
 532     //  Or we reached the end of the output buffer
 533     *dst_ptr = dst;
 534     *src_ptr = src;
 535     return 0;
 536
 537     //  Error conditions
 538 OUT_FAIL:
 539 IN_FAIL:
 540     return 1; // FAIL
 541 }