osfmk/vm/lz4.c

   1 /*
   2  * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 // LZ4_RAW buffer API
  30 // EB May 2015
  31 // Mar 2016 Imported from the Compression project, with minor optimisations and
  32 // early abort detection (Derek Kumar)
  33
  34 #include "lz4.h"
  35 #define memcpy __builtin_memcpy
  36
  37 size_t
  38 lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
  39     const uint8_t * __restrict src_buffer, size_t src_size,
  40     void * __restrict work __attribute__((unused)))
  41 {
  42         const uint8_t * src = src_buffer;
  43         uint8_t * dst = dst_buffer;
  44
  45         // Go fast if we can, keeping away from the end of buffers
  46 #if LZ4_ENABLE_ASSEMBLY_DECODE
  47         if (dst_size > LZ4_GOFAST_SAFETY_MARGIN && src_size > LZ4_GOFAST_SAFETY_MARGIN) {
  48                 if (lz4_decode_asm(&dst, dst_buffer, dst_buffer + dst_size - LZ4_GOFAST_SAFETY_MARGIN, &src, src_buffer + src_size - LZ4_GOFAST_SAFETY_MARGIN)) {
  49                         return 0; // FAIL
  50                 }
  51         }
  52 #endif
  53 //DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers?
  54
  55         // Finish safe
  56         if (lz4_decode(&dst, dst_buffer, dst_buffer + dst_size, &src, src_buffer + src_size)) {
  57                 return 0; // FAIL
  58         }
  59         return (size_t)(dst - dst_buffer); // bytes produced
  60 }
  61 // Debug flags
  62 #if LZ4DEBUG
  63 #define DEBUG_LZ4_ENCODE_ERRORS (1)
  64 #define DEBUG_LZ4_DECODE_ERRORS (1)
  65 #endif
  66
  67 #if DEBUG_LZ4_ENCODE_ERRORS
  68 #endif
  69
  70 #if !LZ4_ENABLE_ASSEMBLY_ENCODE
  71
  72 #if defined(__x86_64__) || defined(__x86_64h__)
  73 # define LZ4_MATCH_SEARCH_INIT_SIZE 32
  74 # define LZ4_MATCH_SEARCH_LOOP_SIZE 32
  75 #else
  76 # define LZ4_MATCH_SEARCH_INIT_SIZE 8
  77 # define LZ4_MATCH_SEARCH_LOOP_SIZE 8
  78 #endif
  79
  80 // Return hash for 4-byte sequence X
  81 static inline uint32_t
  82 lz4_hash(uint32_t x)
  83 {
  84         return (x * 2654435761U) >> (32 - LZ4_COMPRESS_HASH_BITS);
  85 }
  86
  87 // Store 0xfff..fff at *PTR
  88 static inline void
  89 lz4_fill16(uint8_t * ptr)
  90 {
  91         store8(ptr, -1);
  92         store8(ptr + 8, -1);
  93 }
  94
  95 // Return number of matching bytes 0..4 at positions A and B.
  96 static inline size_t
  97 lz4_nmatch4(const uint8_t * a, const uint8_t * b)
  98 {
  99         uint32_t x = load4(a) ^ load4(b);
 100         return (x == 0)?4:(__builtin_ctzl(x) >> 3);
 101 }
 102
 103 // Return number of matching bytes 0..8 at positions A and B.
 104 static inline size_t
 105 lz4_nmatch8(const uint8_t * a, const uint8_t * b)
 106 {
 107         uint64_t x = load8(a) ^ load8(b);
 108         return (x == 0)?8:(__builtin_ctzll(x) >> 3);
 109 }
 110
 111 // Return number of matching bytes 0..16 at positions A and B.
 112 static inline size_t
 113 lz4_nmatch16(const uint8_t * a, const uint8_t * b)
 114 {
 115         size_t n = lz4_nmatch8(a, b);
 116         return (n == 8)?(8 + lz4_nmatch8(a + 8, b + 8)):n;
 117 }
 118
 119 // Return number of matching bytes 0..32 at positions A and B.
 120 static inline size_t
 121 lz4_nmatch32(const uint8_t * a, const uint8_t * b)
 122 {
 123         size_t n = lz4_nmatch16(a, b);
 124         return (n == 16)?(16 + lz4_nmatch16(a + 16, b + 16)):n;
 125 }
 126
 127 // Return number of matching bytes 0..64 at positions A and B.
 128 static inline size_t
 129 lz4_nmatch64(const uint8_t * a, const uint8_t * b)
 130 {
 131         size_t n = lz4_nmatch32(a, b);
 132         return (n == 32)?(32 + lz4_nmatch32(a + 32, b + 32)):n;
 133 }
 134
 135 // Compile-time selection, return number of matching bytes 0..N at positions A and B.
 136 static inline size_t
 137 lz4_nmatch(int N, const uint8_t * a, const uint8_t * b)
 138 {
 139         switch (N) {
 140         case  4: return lz4_nmatch4(a, b);
 141         case  8: return lz4_nmatch8(a, b);
 142         case 16: return lz4_nmatch16(a, b);
 143         case 32: return lz4_nmatch32(a, b);
 144         case 64: return lz4_nmatch64(a, b);
 145         }
 146         __builtin_trap(); // FAIL
 147 }
 148
 149 // Store LENGTH in DST using the literal_length/match_length extension scheme: X is the sum of all bytes until we reach a byte < 0xff.
 150 // We are allowed to access a constant number of bytes above DST_END.
 151 // Return incremented DST pointer on success, and 0 on failure
 152 static inline uint8_t *
 153 lz4_store_length(uint8_t * dst, const uint8_t * const end, uint32_t L)
 154 {
 155         (void)end;
 156         while (L >= 17 * 255) {
 157                 lz4_fill16(dst);
 158                 dst += 16;
 159                 L -= 16 * 255;
 160         }
 161         lz4_fill16(dst);
 162         //DRKTODO verify these modulos/divisions are optimally handled by clang
 163         dst += L / 255;
 164         *dst++ = L % 255;
 165         return dst;
 166 }
 167
 168 static inline uint32_t
 169 clamp(uint32_t x, uint32_t max)
 170 __attribute__((overloadable))
 171 {
 172         return x > max ? max : x;
 173 }
 174
 175 static inline uint8_t *
 176 copy_literal(uint8_t *dst, const uint8_t * restrict src, uint32_t L)
 177 {
 178         uint8_t *end = dst + L;
 179         { copy16(dst, src); dst += 16; src += 16; }
 180         while (dst < end) {
 181                 copy32(dst, src); dst += 32; src += 32;
 182         }
 183         return end;
 184 }
 185
 186 static uint8_t *
 187 lz4_emit_match(uint32_t L, uint32_t M, uint32_t D,
 188     uint8_t * restrict dst,
 189     const uint8_t * const end,
 190     const uint8_t * restrict src)
 191 {
 192         //  The LZ4 encoding scheme requires that M is at least 4, because
 193         //  the actual value stored by the encoding is M - 4.  Check this
 194         //  requirement for debug builds.
 195         assert(M >= 4 && "LZ4 encoding requires that M is at least 4");
 196         //  Having checked that M >= 4, translate M by four.
 197         M -= 4;
 198         //  Similarly, we must have D < 2**16, because we use only two bytes
 199         //  to represent the value of D in the encoding.
 200         assert(D <= USHRT_MAX && "LZ4 encoding requries that D can be stored in two bytes.");
 201         //  Construct the command byte by clamping both L and M to 0 ... 15
 202         //  and packing them into a single byte, and store it.
 203         *dst++ = clamp(L, 15) << 4 | clamp(M, 15);
 204         //  If L is 15 or greater, we need to encode extra literal length bytes.
 205         if (L >= 15) {
 206                 dst = lz4_store_length(dst, end, L - 15);
 207                 if (dst == 0 || dst + L >= end) {
 208                         return NULL;
 209                 }
 210         }
 211         //  Copy the literal itself from src to dst.
 212         dst = copy_literal(dst, src, L);
 213         //  Store match distance.
 214         store2(dst, D); dst += 2;
 215         //  If M is 15 or greater, we need to encode extra match length bytes.
 216         if (M >= 15) {
 217                 dst = lz4_store_length(dst, end, M - 15);
 218                 if (dst == 0) {
 219                         return NULL;
 220                 }
 221         }
 222         return dst;
 223 }
 224
 225 /* #ifndef LZ4_EARLY_ABORT */
 226 /* #define LZ4_EARLY_ABORT (1) */
 227 /* #endif */
 228
 229 #if LZ4_EARLY_ABORT
 230 int lz4_do_early_abort = 1;
 231 int lz4_early_aborts = 0;
 232 #define LZ4_EARLY_ABORT_EVAL (448)
 233 #define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20)
 234 #endif /* LZ4_EARLY_ABORT */
 235
 236 void
 237 lz4_encode_2gb(uint8_t ** dst_ptr,
 238     size_t dst_size,
 239     const uint8_t ** src_ptr,
 240     const uint8_t * src_begin,
 241     size_t src_size,
 242     lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
 243     int skip_final_literals)
 244 {
 245         uint8_t *dst = *dst_ptr;  // current output stream position
 246         uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
 247         const uint8_t *src = *src_ptr; // current input stream literal to encode
 248         const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN;
 249         const uint8_t *match_begin = 0; // first byte of matched sequence
 250         const uint8_t *match_end = 0; // first byte after matched sequence
 251 #if LZ4_EARLY_ABORT
 252         uint8_t * const dst_begin = dst;
 253         uint32_t lz4_do_abort_eval = lz4_do_early_abort;
 254 #endif
 255
 256         while (dst < end) {
 257                 ptrdiff_t match_distance = 0;
 258                 for (match_begin = src; match_begin < src_end; match_begin += 1) {
 259                         const uint32_t pos = (uint32_t)(match_begin - src_begin);
 260                         const uint32_t w0 = load4(match_begin);
 261                         const uint32_t w1 = load4(match_begin + 1);
 262                         const uint32_t w2 = load4(match_begin + 2);
 263                         const uint32_t w3 = load4(match_begin + 3);
 264                         const int i0 = lz4_hash(w0);
 265                         const int i1 = lz4_hash(w1);
 266                         const int i2 = lz4_hash(w2);
 267                         const int i3 = lz4_hash(w3);
 268                         const uint8_t *c0 = src_begin + hash_table[i0].offset;
 269                         const uint8_t *c1 = src_begin + hash_table[i1].offset;
 270                         const uint8_t *c2 = src_begin + hash_table[i2].offset;
 271                         const uint8_t *c3 = src_begin + hash_table[i3].offset;
 272                         const uint32_t m0 = hash_table[i0].word;
 273                         const uint32_t m1 = hash_table[i1].word;
 274                         const uint32_t m2 = hash_table[i2].word;
 275                         const uint32_t m3 = hash_table[i3].word;
 276                         hash_table[i0].offset = pos;
 277                         hash_table[i0].word = w0;
 278                         hash_table[i1].offset = pos + 1;
 279                         hash_table[i1].word = w1;
 280
 281                         hash_table[i2].offset = pos + 2;
 282                         hash_table[i2].word = w2;
 283                         hash_table[i3].offset = pos + 3;
 284                         hash_table[i3].word = w3;
 285
 286                         match_distance = (match_begin - c0);
 287                         if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) {
 288                                 match_end = match_begin + 4;
 289                                 goto EXPAND_FORWARD;
 290                         }
 291
 292                         match_begin++;
 293                         match_distance = (match_begin - c1);
 294                         if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) {
 295                                 match_end = match_begin + 4;
 296                                 goto EXPAND_FORWARD;
 297                         }
 298
 299                         match_begin++;
 300                         match_distance = (match_begin - c2);
 301                         if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) {
 302                                 match_end = match_begin + 4;
 303                                 goto EXPAND_FORWARD;
 304                         }
 305
 306                         match_begin++;
 307                         match_distance = (match_begin - c3);
 308                         if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) {
 309                                 match_end = match_begin + 4;
 310                                 goto EXPAND_FORWARD;
 311                         }
 312
 313 #if LZ4_EARLY_ABORT
 314                         //DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
 315                         if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) {
 316                                 ptrdiff_t dstd = dst - dst_begin;
 317
 318                                 if (dstd == 0) {
 319                                         lz4_early_aborts++;
 320                                         return;
 321                                 }
 322
 323 /*            if (dstd >= pos) { */
 324 /*                    return; */
 325 /*            } */
 326 /*            ptrdiff_t cbytes = pos - dstd; */
 327 /*            if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos)  { */
 328 /*                    return; */
 329 /*            } */
 330                                 lz4_do_abort_eval = 0;
 331                         }
 332 #endif
 333                 }
 334
 335                 if (skip_final_literals) {
 336                         *src_ptr = src; *dst_ptr = dst; return;
 337                 }                                                        // do not emit the final literal sequence
 338
 339                 //  Emit a trailing literal that covers the remainder of the source buffer,
 340                 //  if we can do so without exceeding the bounds of the destination buffer.
 341                 size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src;
 342                 if (src_remaining < 15) {
 343                         *dst++ = (uint8_t)(src_remaining << 4);
 344                         memcpy(dst, src, 16); dst += src_remaining;
 345                 } else {
 346                         *dst++ = 0xf0;
 347                         dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15));
 348                         if (dst == 0 || dst + src_remaining >= end) {
 349                                 return;
 350                         }
 351                         memcpy(dst, src, src_remaining); dst += src_remaining;
 352                 }
 353                 *dst_ptr = dst;
 354                 *src_ptr = src + src_remaining;
 355                 return;
 356
 357 EXPAND_FORWARD:
 358
 359                 // Expand match forward
 360                 {
 361                         const uint8_t * ref_end = match_end - match_distance;
 362                         while (match_end < src_end) {
 363                                 size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end);
 364                                 if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) {
 365                                         match_end += n; break;
 366                                 }
 367                                 match_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
 368                                 ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
 369                         }
 370                 }
 371
 372                 // Expand match backward
 373                 {
 374                         // match_begin_min = max(src_begin + match_distance,literal)
 375                         const uint8_t * match_begin_min = src_begin + match_distance;
 376                         match_begin_min = (match_begin_min < src)?src:match_begin_min;
 377                         const uint8_t * ref_begin = match_begin - match_distance;
 378
 379                         while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1]) {
 380                                 match_begin -= 1; ref_begin -= 1;
 381                         }
 382                 }
 383
 384                 // Emit match
 385                 dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src);
 386                 if (!dst) {
 387                         return;
 388                 }
 389
 390                 // Update state
 391                 src = match_end;
 392
 393                 // Update return values to include the last fully encoded match
 394                 *dst_ptr = dst;
 395                 *src_ptr = src;
 396         }
 397 }
 398
 399 #endif
 400
 401 size_t
 402 lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
 403     const uint8_t * __restrict src_buffer, size_t src_size,
 404     lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES])
 405 {
 406         //  Initialize hash table
 407         const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 };
 408
 409         const uint8_t * src = src_buffer;
 410         uint8_t * dst = dst_buffer;
 411
 412         // We need several blocks because our base function is limited to 2GB input
 413         const size_t BLOCK_SIZE = 0x7ffff000;
 414         while (src_size > 0) {
 415                 //DRKTODO either implement pattern4 or figure out optimal unroll
 416                 //DRKTODO: bizarrely, with plain O3 the compiler generates a single
 417                 //DRKTODO: scalar STP per loop iteration with the stock loop
 418                 //DRKTODO If hand unrolled, it switches to NEON store pairs
 419                 // Reset hash table for each block
 420 /* #if __STDC_HOSTED__ */
 421 /*     memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); */
 422 /* #else */
 423 /*     for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;i++) hash_table[i] = HASH_FILL; */
 424 /* #endif */
 425
 426                 for (int i = 0; i < LZ4_COMPRESS_HASH_ENTRIES;) {
 427                         hash_table[i++] = HASH_FILL;
 428                         hash_table[i++] = HASH_FILL;
 429                         hash_table[i++] = HASH_FILL;
 430                         hash_table[i++] = HASH_FILL;
 431                 }
 432
 433                 // Bytes to encode in this block
 434                 const size_t src_to_encode = src_size > BLOCK_SIZE ? BLOCK_SIZE : src_size;
 435
 436                 // Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads.
 437                 // Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer
 438                 uint8_t * dst_start = dst;
 439                 const uint8_t * src_start = src;
 440                 lz4_encode_2gb(&dst, dst_size, &src, src, src_to_encode, hash_table, src_to_encode < src_size);
 441
 442                 // Check progress
 443                 size_t dst_used = dst - dst_start;
 444                 size_t src_used = src - src_start; // src_used <= src_to_encode
 445                 if (src_to_encode == src_size && src_used < src_to_encode) {
 446                         return 0;                                        // FAIL to encode last block
 447                 }
 448                 // Note that there is a potential problem here in case of non compressible data requiring more blocks.
 449                 // We may end up here with src_used very small, or even 0, and will not be able to make progress during
 450                 // compression. We FAIL unless the length of literals remaining at the end is small enough.
 451                 if (src_to_encode < src_size && src_to_encode - src_used >= (1 << 16)) {
 452                         return 0;                                                  // FAIL too many literals
 453                 }
 454                 // Update counters (SRC and DST already have been updated)
 455                 src_size -= src_used;
 456                 dst_size -= dst_used;
 457         }
 458
 459         return (size_t)(dst - dst_buffer); // bytes produced
 460 }
 461
 462 typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4))) __attribute__((__aligned__(1)));
 463
 464 int
 465 lz4_decode(uint8_t ** dst_ptr,
 466     uint8_t * dst_begin,
 467     uint8_t * dst_end,
 468     const uint8_t ** src_ptr,
 469     const uint8_t * src_end)
 470 {
 471         uint8_t * dst = *dst_ptr;
 472         const uint8_t * src = *src_ptr;
 473
 474         //  Require dst_end > dst.
 475         if (dst_end <= dst) {
 476                 goto OUT_FULL;
 477         }
 478
 479         while (src < src_end) {
 480                 // Keep last good position
 481                 *src_ptr = src;
 482                 *dst_ptr = dst;
 483
 484                 uint8_t cmd = *src++;                                        // 1 byte encoding literal+(match-4) length: LLLLMMMM
 485                 uint32_t literalLength = (cmd >> 4) & 15; // 0..15
 486                 uint32_t matchLength = 4 + (cmd & 15); // 4..19
 487
 488                 // extra bytes for literalLength
 489                 if (__improbable(literalLength == 15)) {
 490                         uint8_t s;
 491                         do {
 492 #if DEBUG_LZ4_DECODE_ERRORS
 493                                 if (__improbable(src >= src_end)) {
 494                                         printf("Truncated SRC literal length\n");
 495                                 }
 496 #endif
 497                                 if (__improbable(src >= src_end)) {
 498                                         goto IN_FAIL;                   // unexpected end of input (1 byte needed)
 499                                 }
 500                                 s = *src++;
 501                                 literalLength += s;
 502                         } while (__improbable(s == 255));
 503                 }
 504
 505                 // copy literal
 506 #if DEBUG_LZ4_DECODE_ERRORS
 507                 if (__improbable(literalLength > (size_t)(src_end - src))) {
 508                         printf("Truncated SRC literal\n");
 509                 }
 510 #endif
 511                 if (__improbable(literalLength > (size_t)(src_end - src))) {
 512                         goto IN_FAIL;
 513                 }
 514                 if (__improbable(literalLength > (size_t)(dst_end - dst))) {
 515                         //  literal will take us past the end of the destination buffer,
 516                         //  so we can only copy part of it.
 517                         literalLength = (uint32_t)(dst_end - dst);
 518                         memcpy(dst, src, literalLength);
 519                         dst += literalLength;
 520                         goto OUT_FULL;
 521                 }
 522                 memcpy(dst, src, literalLength);
 523                 src += literalLength;
 524                 dst += literalLength;
 525
 526                 if (__improbable(src >= src_end)) {
 527                         goto OUT_FULL;                                  // valid end of stream
 528                 }
 529 #if DEBUG_LZ4_DECODE_ERRORS
 530                 if (__improbable(2 > (size_t)(src_end - src))) {
 531                         printf("Truncated SRC distance\n");
 532                 }
 533 #endif
 534                 if (__improbable(2 > (size_t)(src_end - src))) {
 535                         goto IN_FAIL;                                   // unexpected end of input (2 bytes needed)
 536                 }
 537                 //DRKTODO: this causes an alignment increase warning (legitimate?)
 538                 //DRKTODO: cast of char * to uint16_t*
 539         #pragma clang diagnostic push
 540         #pragma clang diagnostic ignored "-Wcast-align"
 541
 542                 // match distance
 543                 uint64_t matchDistance = *(const uint16_t *)src;             // 0x0000 <= matchDistance <= 0xffff
 544         #pragma clang diagnostic pop
 545                 src += 2;
 546 #if DEBUG_LZ4_DECODE_ERRORS
 547                 if (matchDistance == 0) {
 548                         printf("Invalid match distance D = 0\n");
 549                 }
 550 #endif
 551                 if (__improbable(matchDistance == 0)) {
 552                         goto IN_FAIL;                                            // 0x0000 invalid
 553                 }
 554                 uint8_t * ref = dst - matchDistance;
 555 #if DEBUG_LZ4_DECODE_ERRORS
 556                 if (__improbable(ref < dst_begin)) {
 557                         printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n", matchDistance, dst_begin, dst, dst_end);
 558                 }
 559 #endif
 560                 if (__improbable(ref < dst_begin)) {
 561                         goto OUT_FAIL;                                           // out of range
 562                 }
 563                 // extra bytes for matchLength
 564                 if (__improbable(matchLength == 19)) {
 565                         uint8_t s;
 566                         do {
 567 #if DEBUG_LZ4_DECODE_ERRORS
 568                                 if (__improbable(src >= src_end)) {
 569                                         printf("Truncated SRC match length\n");
 570                                 }
 571 #endif
 572                                 if (__improbable(src >= src_end)) {
 573                                         goto IN_FAIL;                                // unexpected end of input (1 byte needed)
 574                                 }
 575                                 s = *src++;
 576                                 matchLength += s;
 577                         } while (__improbable(s == 255));
 578                 }
 579
 580                 // copy match (may overlap)
 581                 if (__improbable(matchLength > (size_t)(dst_end - dst))) {
 582                         //  match will take us past the end of the destination buffer,
 583                         //  so we can only copy part of it.
 584                         matchLength = (uint32_t)(dst_end - dst);
 585                         for (uint32_t i = 0; i < matchLength; ++i) {
 586                                 dst[i] = ref[i];
 587                         }
 588                         dst += matchLength;
 589                         goto OUT_FULL;
 590                 }
 591                 for (uint64_t i = 0; i < matchLength; i++) {
 592                         dst[i] = ref[i];
 593                 }
 594                 dst += matchLength;
 595         }
 596
 597         //  We reached the end of the input buffer after a full instruction
 598 OUT_FULL:
 599         //  Or we reached the end of the output buffer
 600         *dst_ptr = dst;
 601         *src_ptr = src;
 602         return 0;
 603
 604         //  Error conditions
 605 OUT_FAIL:
 606 IN_FAIL:
 607         return 1; // FAIL
 608 }