[apple/xnu.git] / osfmk / vm / lz4.c

/*
 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

// LZ4_RAW buffer API
// EB May 2015
// Mar 2016 Imported from the Compression project, with minor optimisations and
// early abort detection (Derek Kumar)

#include "lz4.h"

// Decode the raw LZ4 payload SRC_BUFFER[0..SRC_SIZE) into DST_BUFFER, which
// holds at most DST_SIZE bytes. WORK is unused.
// Returns the number of bytes produced, or 0 if either decoder reports failure.
size_t lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
                            const uint8_t * __restrict src_buffer, size_t src_size,
                            void * __restrict work __attribute__((unused)))
{
  uint8_t * out = dst_buffer;       // next output byte to produce
  const uint8_t * in = src_buffer;  // next input byte to consume

#if LZ4_ENABLE_ASSEMBLY_DECODE
  // Fast path: only attempted when both buffers are larger than the safety
  // margin, and it is told to stop that many bytes short of each buffer end.
  if (dst_size > LZ4_GOFAST_SAFETY_MARGIN && src_size > LZ4_GOFAST_SAFETY_MARGIN)
  {
    uint8_t * const fast_out_end = dst_buffer + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
    const uint8_t * const fast_in_end = src_buffer + src_size - LZ4_GOFAST_SAFETY_MARGIN;
    if (lz4_decode_asm(&out, dst_buffer, fast_out_end, &in, fast_in_end) != 0)
      return 0; // FAIL
  }
#endif
  //DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers?

  // Safe path: resume from wherever the fast path stopped (or from the start)
  // and decode up to the true ends of both buffers.
  const int failed = lz4_decode(&out, dst_buffer, dst_buffer + dst_size, &in, src_buffer + src_size);
  if (failed)
    return 0; // FAIL

  return (size_t)(out - dst_buffer); // bytes produced
}
// Debug flags
// Building with LZ4DEBUG set turns on both error-reporting switches below.
// DEBUG_LZ4_DECODE_ERRORS gates the printf diagnostics inside lz4_decode.
#if LZ4DEBUG
#define DEBUG_LZ4_ENCODE_ERRORS (1)
#define DEBUG_LZ4_DECODE_ERRORS (1)
#endif

// Intentionally empty: this file compiles no encoder-side diagnostics.
#if DEBUG_LZ4_ENCODE_ERRORS
#endif

#if !LZ4_ENABLE_ASSEMBLY_ENCODE

// Width, in bytes, of one comparison step of the match search: 32 on x86_64
// targets and 8 everywhere else. LZ4_MATCH_SEARCH_LOOP_SIZE is the step used
// by the forward match expansion in lz4_encode_2gb and must be one of the
// widths lz4_nmatch accepts (4, 8, 16, 32, 64).
// NOTE(review): LZ4_MATCH_SEARCH_INIT_SIZE has no user in this file - confirm
// whether anything else still needs it.
#if defined(__x86_64__) || defined(__x86_64h__)
# define LZ4_MATCH_SEARCH_INIT_SIZE 32
# define LZ4_MATCH_SEARCH_LOOP_SIZE 32
#else
# define LZ4_MATCH_SEARCH_INIT_SIZE 8
# define LZ4_MATCH_SEARCH_LOOP_SIZE 8
#endif

// Hash the 4-byte sequence X into a table index: multiply by a fixed odd
// constant (wrapping at 32 bits) and keep the top LZ4_COMPRESS_HASH_BITS bits.
static inline uint32_t lz4_hash(uint32_t x)
{
  const uint32_t scrambled = x * 2654435761U;
  return scrambled >> (32 - LZ4_COMPRESS_HASH_BITS);
}

// Fill the 16 bytes at PTR with 0xff, issued as two store8 calls of -1
// (store8 is declared outside this file; presumably an 8-byte store).
static inline void lz4_fill16(uint8_t * ptr)
{
  for (int half = 0; half < 2; half++) {
    store8(ptr + 8 * half, -1);
  }
}

// Count how many leading bytes (0..4) are identical at A and B.
// The two 4-byte words are XORed; the index of the lowest set bit, divided by
// eight, is the first differing byte (this presumes load4 places the first
// byte in the low-order bits, i.e. a little-endian load).
static inline size_t lz4_nmatch4(const uint8_t * a,const uint8_t * b)
{
  const uint32_t diff = load4(a) ^ load4(b);
  if (diff == 0) {
    return 4;
  }
  return __builtin_ctzl(diff) >> 3;
}

// Count how many leading bytes (0..8) are identical at A and B.
// Same XOR / count-trailing-zeros technique as the 4-byte variant, applied to
// 8-byte words.
static inline size_t lz4_nmatch8(const uint8_t * a,const uint8_t * b)
{
  const uint64_t diff = load8(a) ^ load8(b);
  if (diff == 0) {
    return 8;
  }
  return __builtin_ctzll(diff) >> 3;
}

// Count how many leading bytes (0..16) are identical at A and B.
// The second 8-byte half is only examined when the first half matches fully.
static inline size_t lz4_nmatch16(const uint8_t * a,const uint8_t * b)
{
  const size_t head = lz4_nmatch8(a, b);
  if (head != 8) {
    return head;
  }
  return 8 + lz4_nmatch8(a + 8, b + 8);
}

// Count how many leading bytes (0..32) are identical at A and B.
// The second 16-byte half is only examined when the first half matches fully.
static inline size_t lz4_nmatch32(const uint8_t * a,const uint8_t * b)
{
  const size_t head = lz4_nmatch16(a, b);
  if (head != 16) {
    return head;
  }
  return 16 + lz4_nmatch16(a + 16, b + 16);
}

// Count how many leading bytes (0..64) are identical at A and B.
// The second 32-byte half is only examined when the first half matches fully.
static inline size_t lz4_nmatch64(const uint8_t * a,const uint8_t * b)
{
  const size_t head = lz4_nmatch32(a, b);
  if (head != 32) {
    return head;
  }
  return 32 + lz4_nmatch32(a + 32, b + 32);
}

// Compile-time selection of the comparison width: return the number of
// matching bytes 0..N at positions A and B, for N in {4, 8, 16, 32, 64}.
// Any other N traps.
static inline size_t lz4_nmatch(int N, const uint8_t * a, const uint8_t * b)
{
  if (N ==  4) return lz4_nmatch4(a,b);
  if (N ==  8) return lz4_nmatch8(a,b);
  if (N == 16) return lz4_nmatch16(a,b);
  if (N == 32) return lz4_nmatch32(a,b);
  if (N == 64) return lz4_nmatch64(a,b);
  __builtin_trap(); // FAIL
}

// Store L at DST using the literal_length/match_length extension scheme: the
// encoded value is the sum of all bytes up to and including the first byte
// that is < 0xff, so L is written as L/255 bytes of 0xff followed by L%255.
//
// END is the soft limit of the output buffer. Only a constant number of bytes
// may be written above it: a full 16-byte run of 0xff is never started unless
// it fits entirely below END, and the final (short) part of the encoding
// touches at most 17 bytes starting at the then-current DST.
//
// Returns the incremented DST pointer on success, and 0 (NULL) if a long run
// would cross END. Callers already test the result against 0.
static inline uint8_t *lz4_store_length(uint8_t * dst, const uint8_t * const end, uint32_t L) {
  // Long lengths: every 16-byte block of 0xff accounts for 16*255 of L.
  // This is the only part whose size grows with L, so it is the part that
  // must be bounded; without the check a large L overruns a small buffer.
  while (L >= 17*255) {
    if (dst > end || (size_t)(end - dst) < 16) return NULL;
    lz4_fill16(dst);
    dst += 16;
    L -= 16*255;
  }
  // Here L < 17*255, so L/255 <= 16: pre-fill 16 bytes of 0xff, skip the ones
  // that belong to the encoding, then write the terminating byte (< 0xff).
  lz4_fill16(dst);
  //DRKTODO verify these modulos/divisions are optimally handled by clang
  dst += L/255;
  *dst++ = L%255;
  return dst;
}

// Return X limited to at most MAX.
// The overloadable attribute is a Clang extension; it is written in the
// decl-specifier position (the form Clang documents) because GCC rejects
// attributes placed after the declarator of a function definition.
static inline uint32_t __attribute__((overloadable)) clamp(uint32_t x, uint32_t max)
{
  return (x > max) ? max : x;
}

// Copy an L-byte literal from SRC to DST and return DST + L.
// The copy is done in fixed-size pieces: one copy16 first (issued even when
// L < 16), then copy32 steps until at least L bytes are covered. It can
// therefore read and write beyond L bytes; callers must provide that slack.
static inline uint8_t *copy_literal(uint8_t *dst, const uint8_t * restrict src, uint32_t L) {
  uint8_t * const literal_end = dst + L;

  copy16(dst, src);
  for (dst += 16, src += 16; dst < literal_end; dst += 32, src += 32) {
    copy32(dst, src);
  }
  return literal_end;
}

// Emit one LZ4 sequence at DST: command byte, optional extra literal-length
// bytes, the L literal bytes taken from SRC, the 2-byte match distance D and
// optional extra match-length bytes for a match of M bytes.
// END is the soft limit of the output buffer.
// Returns the output position after the sequence, or NULL if a length could
// not be stored or the literal would reach END.
static uint8_t *lz4_emit_match(uint32_t L, uint32_t M, uint32_t D,
                               uint8_t * restrict dst,
                               const uint8_t * const end,
                               const uint8_t * restrict src) {
  //  The format stores M - 4, so a match shorter than 4 cannot be expressed.
  assert(M >= 4 && "LZ4 encoding requires that M is at least 4");
  //  The distance occupies exactly two bytes in the encoding.
  assert(D <= USHRT_MAX && "LZ4 encoding requries that D can be stored in two bytes.");

  const uint32_t stored_match = M - 4;

  //  Command byte: literal length in the high nibble, stored match length in
  //  the low nibble, each saturated at 15.
  const uint32_t literal_nibble = clamp(L, 15);
  const uint32_t match_nibble = clamp(stored_match, 15);
  *dst++ = literal_nibble << 4 | match_nibble;

  //  A saturated literal nibble is followed by the remainder of the length.
  if (L >= 15) {
    dst = lz4_store_length(dst, end, L - 15);
    if (dst == 0) return NULL;
    if (dst + L >= end) return NULL;
  }

  //  Literal bytes, then the match distance.
  dst = copy_literal(dst, src, L);
  store2(dst, D);
  dst += 2;

  //  A saturated match nibble is followed by the remainder of the length;
  //  a failed store yields 0, which is passed through as the NULL result.
  if (stored_match < 15) {
    return dst;
  }
  return lz4_store_length(dst, end, stored_match - 15);
}

// Early-abort support for the encoder. The default below is commented out, so
// the feature is compiled in only when LZ4_EARLY_ABORT is defined elsewhere.
/* #ifndef LZ4_EARLY_ABORT */
/* #define LZ4_EARLY_ABORT (1) */
/* #endif */

#if LZ4_EARLY_ABORT
// lz4_do_early_abort: run-time switch, sampled once at the start of each
//   lz4_encode_2gb call.
// lz4_early_aborts: counter incremented every time an encode is abandoned.
// LZ4_EARLY_ABORT_EVAL: source position (bytes from src_begin) at or after
//   which the encoder gives up if it has produced no output yet.
// LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR: only referenced by a commented-out
//   ratio check in lz4_encode_2gb.
int lz4_do_early_abort = 1;
int lz4_early_aborts = 0;
#define LZ4_EARLY_ABORT_EVAL (448)
#define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20)
#endif /* LZ4_EARLY_ABORT */

// Encode SRC_SIZE bytes starting at *SRC_PTR into the DST_SIZE-byte buffer at
// *DST_PTR.
//
//   dst_ptr             in/out: output position; advanced past the last fully
//                       encoded sequence.
//   dst_size            bytes available at *dst_ptr. Encoding stops
//                       LZ4_GOFAST_SAFETY_MARGIN bytes before the end.
//   src_ptr             in/out: input position; advanced past the last input
//                       byte covered by an encoded sequence.
//   src_begin           origin for the offsets kept in HASH_TABLE.
//   src_size            bytes available at *src_ptr.
//   hash_table          match-search state; entries are read and overwritten.
//   skip_final_literals when non-zero, the trailing literal is not emitted and
//                       the function returns once no further match is found.
//
// There is no return value: callers detect failure by comparing *src_ptr with
// the end of the input. When a sequence cannot be emitted, *dst_ptr and
// *src_ptr keep the values of the last complete sequence.
void lz4_encode_2gb(uint8_t ** dst_ptr,
                    size_t dst_size,
                    const uint8_t ** src_ptr,
                    const uint8_t * src_begin,
                    size_t src_size,
                    lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
                    int skip_final_literals)
{
  uint8_t *dst = *dst_ptr;        // current output stream position
  uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
  const uint8_t *src = *src_ptr;  // current input stream literal to encode
  const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN;
  const uint8_t *match_begin = 0; // first byte of matched sequence
  const uint8_t *match_end = 0;   // first byte after matched sequence
#if LZ4_EARLY_ABORT
  uint8_t * const dst_begin = dst;
  uint32_t lz4_do_abort_eval = lz4_do_early_abort;
#endif

  while (dst < end)
  {
    ptrdiff_t match_distance = 0;

    // Match search, unrolled four positions per iteration: hash the 4-byte
    // words at match_begin+0..3, fetch the previous occurrence recorded for
    // each hash, then record the current positions in their place.
    for (match_begin = src; match_begin < src_end; match_begin += 1) {
      const uint32_t pos = (uint32_t)(match_begin - src_begin);
      const uint32_t w0 = load4(match_begin);
      const uint32_t w1 = load4(match_begin + 1);
      const uint32_t w2 = load4(match_begin + 2);
      const uint32_t w3 = load4(match_begin + 3);
      const int i0 = lz4_hash(w0);
      const int i1 = lz4_hash(w1);
      const int i2 = lz4_hash(w2);
      const int i3 = lz4_hash(w3);
      const uint8_t *c0 = src_begin + hash_table[i0].offset;
      const uint8_t *c1 = src_begin + hash_table[i1].offset;
      const uint8_t *c2 = src_begin + hash_table[i2].offset;
      const uint8_t *c3 = src_begin + hash_table[i3].offset;
      const uint32_t m0 = hash_table[i0].word;
      const uint32_t m1 = hash_table[i1].word;
      const uint32_t m2 = hash_table[i2].word;
      const uint32_t m3 = hash_table[i3].word;
      hash_table[i0].offset = pos;
      hash_table[i0].word = w0;
      hash_table[i1].offset = pos + 1;
      hash_table[i1].word = w1;

      hash_table[i2].offset = pos + 2;
      hash_table[i2].word = w2;
      hash_table[i3].offset = pos + 3;
      hash_table[i3].word = w3;

      // A candidate is usable when the stored word equals the current one and
      // the distance is in 1..0xffff (it must fit the 2-byte distance field).
      match_distance = (match_begin - c0);
      if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) {
        match_end = match_begin + 4;
        goto EXPAND_FORWARD;
      }

      match_begin++;
      match_distance = (match_begin - c1);
      if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) {
        match_end = match_begin + 4;
        goto EXPAND_FORWARD;
      }

      match_begin++;
      match_distance = (match_begin - c2);
      if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) {
        match_end = match_begin + 4;
        goto EXPAND_FORWARD;
      }

      match_begin++;
      match_distance = (match_begin - c3);
      if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) {
        match_end = match_begin + 4;
        goto EXPAND_FORWARD;
      }

#if LZ4_EARLY_ABORT
      //DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
      // Evaluated at most once per call: if the search has reached
      // LZ4_EARLY_ABORT_EVAL without producing any output, give up.
      // *dst_ptr and *src_ptr are left unchanged in that case.
      if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) {
        ptrdiff_t dstd = dst - dst_begin;

        if (dstd == 0) {
          lz4_early_aborts++;
          return;
        }

/*         if (dstd >= pos) { */
/*           return; */
/*         } */
/*         ptrdiff_t cbytes = pos - dstd; */
/*         if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos)  { */
/*           return; */
/*         } */
        lz4_do_abort_eval = 0;
      }
#endif
    }

    // No further match was found before src_end.
    if (skip_final_literals) { *src_ptr = src; *dst_ptr = dst; return; } // do not emit the final literal sequence

    //  Emit a trailing literal that covers the remainder of the source buffer,
    //  if we can do so without exceeding the bounds of the destination buffer.
    size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src;
    if (src_remaining < 15) {
      *dst++ = (uint8_t)(src_remaining << 4);
      //  Copy exactly the bytes that exist: a fixed 16-byte copy would read
      //  past the end of the source buffer here.
      memcpy(dst, src, src_remaining); dst += src_remaining;
    } else {
      *dst++ = 0xf0;
      dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15));
      //  On failure return without publishing: the caller sees unconsumed input.
      if (dst == 0 || dst + src_remaining >= end) return;
      memcpy(dst, src, src_remaining); dst += src_remaining;
    }
    *dst_ptr = dst;
    *src_ptr = src + src_remaining;
    return;

  EXPAND_FORWARD:

    // Expand match forward, LZ4_MATCH_SEARCH_LOOP_SIZE bytes per step
    {
      const uint8_t * ref_end = match_end - match_distance;
      while (match_end < src_end)
      {
        size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end);
        if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) { match_end += n; break; }
        match_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
        ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
      }
    }

    // Expand match backward
    {
      // match_begin_min = max(src_begin + match_distance,literal)
      // i.e. never before the pending literal start, and never so far that
      // the reference would fall before src_begin.
      const uint8_t * match_begin_min = src_begin + match_distance;
      match_begin_min = (match_begin_min < src)?src:match_begin_min;
      const uint8_t * ref_begin = match_begin - match_distance;

      while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1] ) { match_begin -= 1; ref_begin -= 1; }
    }

    // Emit match: literal [src, match_begin) followed by the match itself
    dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src);
    if (!dst) return;

    // Update state
    src = match_end;

    // Update return values to include the last fully encoded match
    *dst_ptr = dst;
    *src_ptr = src;
  }
}

#endif

// Encode SRC_BUFFER[0..SRC_SIZE) as a raw LZ4 payload into DST_BUFFER, which
// holds at most DST_SIZE bytes, using HASH_TABLE as scratch space.
// The input is processed in blocks of at most BLOCK_SIZE bytes because the
// underlying encoder is limited to 2GB of input per call.
// Returns the number of bytes produced, or 0 on failure.
size_t lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
                            const uint8_t * __restrict src_buffer, size_t src_size,
                            lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES])
{
  // Value every hash table entry is reset to before a block is encoded
  const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 };
  const size_t BLOCK_SIZE = 0x7ffff000;

  uint8_t * out = dst_buffer;       // next output byte
  const uint8_t * in = src_buffer;  // next input byte

  while (src_size > 0)
  {
    // Reset hash table for each block, four entries per iteration.
    //DRKTODO either implement pattern4 or figure out optimal unroll
    //DRKTODO: bizarrely, with plain O3 the compiler generates a single
    //DRKTODO: scalar STP per loop iteration with the stock loop
    //DRKTODO If hand unrolled, it switches to NEON store pairs
    // (Earlier variants: memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size)
    //  when hosted, or a plain one-entry-per-iteration loop.)
    for (int i = 0; i < LZ4_COMPRESS_HASH_ENTRIES; i += 4) {
      hash_table[i + 0] = HASH_FILL;
      hash_table[i + 1] = HASH_FILL;
      hash_table[i + 2] = HASH_FILL;
      hash_table[i + 3] = HASH_FILL;
    }

    // Bytes to encode in this block; only the last block emits final literals,
    // which allows concatenation of the encoded payloads.
    const int is_last_block = (src_size <= BLOCK_SIZE);
    const size_t src_to_encode = is_last_block ? src_size : BLOCK_SIZE;

    // Blocks are encoded independently, so the block origin (not src_buffer)
    // is passed as src_begin.
    uint8_t * const block_out = out;
    const uint8_t * const block_in = in;
    lz4_encode_2gb(&out, dst_size, &in, block_in, src_to_encode, hash_table, !is_last_block);

    // Check progress
    const size_t dst_used = (size_t)(out - block_out);
    const size_t src_used = (size_t)(in - block_in); // src_used <= src_to_encode

    if (is_last_block) {
      // The last block must be consumed entirely.
      if (src_used < src_to_encode) return 0; // FAIL to encode last block
    } else {
      // With incompressible data src_used may be very small, or even 0, and
      // no progress would be made. FAIL unless the run of literals left at the
      // end of the block is small enough.
      if (src_to_encode - src_used >= (1<<16)) return 0; // FAIL too many literals
    }

    // Update counters (the cursors were already advanced by the encoder)
    src_size -= src_used;
    dst_size -= dst_used;
  }

  return (size_t)(out - dst_buffer); // bytes produced
}

#define likely(expr)     __builtin_expect((expr) != 0, 1)
#define unlikely(expr)   __builtin_expect((expr) != 0, 0)
typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4))) __attribute__((__aligned__(1)));

// Bounds-checked LZ4 raw decoder.
//
//   dst_ptr    in/out: output position inside [dst_begin, dst_end].
//   dst_begin  lowest address a match is allowed to reference.
//   dst_end    end of the output buffer.
//   src_ptr    in/out: input position.
//   src_end    end of the input buffer.
//
// Returns 0 when decoding stopped because the input was consumed or the output
// is full; *dst_ptr and *src_ptr then hold the positions reached (a literal or
// match that does not fit is copied partially, up to dst_end).
// Returns 1 on malformed input: truncated stream, match distance of 0, or a
// match distance reaching before dst_begin. *dst_ptr and *src_ptr then hold the
// start of the sequence that failed.
int lz4_decode(uint8_t ** dst_ptr,
                    uint8_t * dst_begin,
                    uint8_t * dst_end,
                    const uint8_t ** src_ptr,
                    const uint8_t * src_end)
{
    uint8_t * dst = *dst_ptr;
    const uint8_t * src = *src_ptr;

    //  Require dst_end > dst.
    if (dst_end <= dst) goto OUT_FULL;

    while (src < src_end)
    {
        // Keep last good position
        *src_ptr = src;
        *dst_ptr = dst;

        uint8_t cmd = *src++;                                                // 1 byte encoding literal+(match-4) length: LLLLMMMM
        uint32_t literalLength = (cmd >> 4) & 15; // 0..15
        uint32_t matchLength = 4 + (cmd & 15); // 4..19

        // extra bytes for literalLength: add bytes until one is < 255
        if (unlikely(literalLength == 15))
        {
            uint8_t s;
            do {
#if DEBUG_LZ4_DECODE_ERRORS
                if (unlikely(src >= src_end)) printf("Truncated SRC literal length\n");
#endif
                if (unlikely(src >= src_end)) goto IN_FAIL;         // unexpected end of input (1 byte needed)
                s = *src++;
                literalLength += s;
            } while (unlikely(s == 255));
        }

        // copy literal
#if DEBUG_LZ4_DECODE_ERRORS
        if (unlikely(literalLength > (size_t)(src_end - src))) printf("Truncated SRC literal\n");
#endif
        if (unlikely(literalLength > (size_t)(src_end - src))) goto IN_FAIL;
        if (unlikely(literalLength > (size_t)(dst_end - dst))) {
            //  literal will take us past the end of the destination buffer,
            //  so we can only copy part of it.
            literalLength = (uint32_t)(dst_end - dst);
            memcpy(dst, src, literalLength);
            dst += literalLength;
            goto OUT_FULL;
        }
        memcpy(dst,src,literalLength);
        src += literalLength;
        dst += literalLength;

        if (unlikely(src >= src_end)) goto OUT_FULL;                // valid end of stream
#if DEBUG_LZ4_DECODE_ERRORS
        if (unlikely(2 > (size_t)(src_end - src))) printf("Truncated SRC distance\n");
#endif
        if (unlikely(2 > (size_t)(src_end - src))) goto IN_FAIL;    // unexpected end of input (2 bytes needed)

        // match distance: the two bytes sit at an arbitrary offset in the
        // stream, so read them with memcpy instead of dereferencing a
        // uint16_t pointer (which would be a misaligned, aliasing access).
        // The value is still interpreted in host byte order.
        uint16_t storedDistance;
        memcpy(&storedDistance, src, sizeof storedDistance);
        uint64_t matchDistance = storedDistance;                             // 0x0000 <= matchDistance <= 0xffff
        src += 2;
#if DEBUG_LZ4_DECODE_ERRORS
        if (matchDistance == 0) printf("Invalid match distance D = 0\n");
#endif
        if (unlikely(matchDistance == 0)) goto IN_FAIL;                      // 0x0000 invalid

        // Validate the distance against the bytes produced so far before
        // forming the reference pointer, so no pointer below dst_begin is
        // ever computed.
        const uint64_t maxDistance = (uint64_t)(dst - dst_begin);
#if DEBUG_LZ4_DECODE_ERRORS
        if (unlikely(matchDistance > maxDistance)) printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n",(unsigned long long)matchDistance,(void *)dst_begin,(void *)dst,(void *)dst_end);
#endif
        if (unlikely(matchDistance > maxDistance)) goto OUT_FAIL;            // out of range
        uint8_t * ref = dst - matchDistance;

        // extra bytes for matchLength: add bytes until one is < 255
        if (unlikely(matchLength == 19))
        {
            uint8_t s;
            do {
#if DEBUG_LZ4_DECODE_ERRORS
                if (unlikely(src >= src_end)) printf("Truncated SRC match length\n");
#endif
                if (unlikely(src >= src_end)) goto IN_FAIL;                      // unexpected end of input (1 byte needed)
                s = *src++;
                matchLength += s;
            } while (unlikely(s == 255));
        }

        // copy match (may overlap): the byte-by-byte forward copy is what
        // makes a distance shorter than the length repeat the pattern.
        if (unlikely(matchLength > (size_t)(dst_end - dst))) {
            //  match will take us past the end of the destination buffer,
            //  so we can only copy part of it.
            matchLength = (uint32_t)(dst_end - dst);
            for (uint32_t i=0; i<matchLength; ++i) dst[i] = ref[i];
            dst += matchLength;
            goto OUT_FULL;
        }
        for (uint64_t i=0;i<matchLength;i++) dst[i] = ref[i];
        dst += matchLength;
    }

    //  We reached the end of the input buffer after a full instruction
OUT_FULL:
    //  Or we reached the end of the output buffer
    *dst_ptr = dst;
    *src_ptr = src;
    return 0;

    //  Error conditions
OUT_FAIL:
IN_FAIL:
    return 1; // FAIL
}
Commit	Line	Data
39037602 A	1	/*
	2	* Copyright (c) 2016-2016 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	// LZ4_RAW buffer API
	30	// EB May 2015
	31	// Mar 2016 Imported from the Compression project, with minor optimisations and
	32	// early abort detection (Derek Kumar)
	33
	34	#include "lz4.h"
	35
	36	size_t lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
	37	const uint8_t * __restrict src_buffer, size_t src_size,
	38	void * __restrict work __attribute__((unused)))
	39	{
	40	const uint8_t * src = src_buffer;
	41	uint8_t * dst = dst_buffer;
	42
	43	// Go fast if we can, keeping away from the end of buffers
	44	#if LZ4_ENABLE_ASSEMBLY_DECODE
	45	if (dst_size > LZ4_GOFAST_SAFETY_MARGIN && src_size > LZ4_GOFAST_SAFETY_MARGIN)
	46	{
	47	if (lz4_decode_asm(&dst, dst_buffer, dst_buffer + dst_size - LZ4_GOFAST_SAFETY_MARGIN, &src, src_buffer + src_size - LZ4_GOFAST_SAFETY_MARGIN))
	48	return 0; // FAIL
	49	}
	50	#endif
	51	//DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers?
	52
	53	// Finish safe
	54	if (lz4_decode(&dst, dst_buffer, dst_buffer + dst_size, &src, src_buffer + src_size))
	55	return 0; // FAIL
	56
	57	return (size_t)(dst - dst_buffer); // bytes produced
	58	}
	59	// Debug flags
	60	#if LZ4DEBUG
	61	#define DEBUG_LZ4_ENCODE_ERRORS (1)
	62	#define DEBUG_LZ4_DECODE_ERRORS (1)
	63	#endif
	64
65	#if DEBUG_LZ4_ENCODE_ERRORS
66	#endif
67
68	#if !LZ4_ENABLE_ASSEMBLY_ENCODE
69
70	#if defined(__x86_64__) \|\| defined(__x86_64h__)
71	# define LZ4_MATCH_SEARCH_INIT_SIZE 32
72	# define LZ4_MATCH_SEARCH_LOOP_SIZE 32
73	#else
74	# define LZ4_MATCH_SEARCH_INIT_SIZE 8
75	# define LZ4_MATCH_SEARCH_LOOP_SIZE 8
76	#endif
77
78	// Return hash for 4-byte sequence X
79	static inline uint32_t lz4_hash(uint32_t x) { return (x * 2654435761U) >> (32 - LZ4_COMPRESS_HASH_BITS); }
80
81	// Store 0xfff..fff at *PTR
82	static inline void lz4_fill16(uint8_t * ptr)
83	{
84	store8(ptr,-1);
85	store8(ptr+8,-1);
86	}
87
88	// Return number of matching bytes 0..4 at positions A and B.
89	static inline size_t lz4_nmatch4(const uint8_t * a,const uint8_t * b)
90	{
91	uint32_t x = load4(a) ^ load4(b);
92	return (x == 0)?4:(__builtin_ctzl(x) >> 3);
93	}
94
95	// Return number of matching bytes 0..8 at positions A and B.
96	static inline size_t lz4_nmatch8(const uint8_t * a,const uint8_t * b)
97	{
98	uint64_t x = load8(a) ^ load8(b);
99	return (x == 0)?8:(__builtin_ctzll(x) >> 3);
100	}
101
102	// Return number of matching bytes 0..16 at positions A and B.
103	static inline size_t lz4_nmatch16(const uint8_t * a,const uint8_t * b)
104	{
105	size_t n = lz4_nmatch8(a,b);
106	return (n == 8)?(8 + lz4_nmatch8(a+8,b+8)):n;
107	}
108
109	// Return number of matching bytes 0..32 at positions A and B.
110	static inline size_t lz4_nmatch32(const uint8_t * a,const uint8_t * b)
111	{
112	size_t n = lz4_nmatch16(a,b);
113	return (n == 16)?(16 + lz4_nmatch16(a+16,b+16)):n;
114	}
115
116	// Return number of matching bytes 0..64 at positions A and B.
117	static inline size_t lz4_nmatch64(const uint8_t * a,const uint8_t * b)
118	{
119	size_t n = lz4_nmatch32(a,b);
120	return (n == 32)?(32 + lz4_nmatch32(a+32,b+32)):n;
121	}
122
123	// Compile-time selection, return number of matching bytes 0..N at positions A and B.
124	static inline size_t lz4_nmatch(int N, const uint8_t * a, const uint8_t * b)
125	{
126	switch (N) {
127	case 4: return lz4_nmatch4(a,b);
128	case 8: return lz4_nmatch8(a,b);
129	case 16: return lz4_nmatch16(a,b);
130	case 32: return lz4_nmatch32(a,b);
131	case 64: return lz4_nmatch64(a,b);
132	}
133	__builtin_trap(); // FAIL
134	}
135
136	// Store LENGTH in DST using the literal_length/match_length extension scheme: X is the sum of all bytes until we reach a byte < 0xff.
137	// We are allowed to access a constant number of bytes above DST_END.
138	// Return incremented DST pointer on success, and 0 on failure
139	static inline uint8_t lz4_store_length(uint8_t dst, const uint8_t * const end, uint32_t L) {
140	(void)end;
141	while (L >= 17*255) {
142	lz4_fill16(dst);
143	dst += 16;
144	L -= 16*255;
145	}
146	lz4_fill16(dst);
147	//DRKTODO verify these modulos/divisions are optimally handled by clang
148	dst += L/255;
149	*dst++ = L%255;
150	return dst;
151	}
152
153	static inline uint32_t clamp(uint32_t x, uint32_t max) __attribute__((overloadable)) { return x > max ? max : x; }
154
155	static inline uint8_t copy_literal(uint8_t dst, const uint8_t * restrict src, uint32_t L) {
156	uint8_t *end = dst + L;
157	{ copy16(dst, src); dst += 16; src += 16; }
158	while (dst < end) { copy32(dst, src); dst += 32; src += 32; }
159	return end;
160	}
161
162	static uint8_t *lz4_emit_match(uint32_t L, uint32_t M, uint32_t D,
163	uint8_t * restrict dst,
164	const uint8_t * const end,
165	const uint8_t * restrict src) {
166	// The LZ4 encoding scheme requires that M is at least 4, because
167	// the actual value stored by the encoding is M - 4. Check this
168	// requirement for debug builds.
169	assert(M >= 4 && "LZ4 encoding requires that M is at least 4");
170	// Having checked that M >= 4, translate M by four.
171	M -= 4;
172	// Similarly, we must have D < 2**16, because we use only two bytes
173	// to represent the value of D in the encoding.
174	assert(D <= USHRT_MAX && "LZ4 encoding requries that D can be stored in two bytes.");
175	// Construct the command byte by clamping both L and M to 0 ... 15
176	// and packing them into a single byte, and store it.
177	*dst++ = clamp(L, 15) << 4 \| clamp(M, 15);
178	// If L is 15 or greater, we need to encode extra literal length bytes.
179	if (L >= 15) {
180	dst = lz4_store_length(dst, end, L - 15);
181	if (dst == 0 \|\| dst + L >= end) return NULL;
182	}
183	// Copy the literal itself from src to dst.
184	dst = copy_literal(dst, src, L);
185	// Store match distance.
186	store2(dst, D); dst += 2;
187	// If M is 15 or greater, we need to encode extra match length bytes.
188	if (M >= 15) {
189	dst = lz4_store_length(dst, end, M - 15);
190	if (dst == 0) return NULL;
191	}
192	return dst;
193	}
194
195	/* #ifndef LZ4_EARLY_ABORT */
196	/* #define LZ4_EARLY_ABORT (1) */
197	/* #endif */
198
199	#if LZ4_EARLY_ABORT
200	int lz4_do_early_abort = 1;
201	int lz4_early_aborts = 0;
202	#define LZ4_EARLY_ABORT_EVAL (448)
203	#define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20)
204	#endif /* LZ4_EARLY_ABORT */
205
206	void lz4_encode_2gb(uint8_t ** dst_ptr,
207	size_t dst_size,
208	const uint8_t ** src_ptr,
209	const uint8_t * src_begin,
210	size_t src_size,
211	lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
212	int skip_final_literals)
213	{
214	uint8_t dst = dst_ptr; // current output stream position
215	uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
216	const uint8_t src = src_ptr; // current input stream literal to encode
217	const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN;
218	const uint8_t *match_begin = 0; // first byte of matched sequence
219	const uint8_t *match_end = 0; // first byte after matched sequence
220	#if LZ4_EARLY_ABORT
221	uint8_t * const dst_begin = dst;
222	uint32_t lz4_do_abort_eval = lz4_do_early_abort;
223	#endif
224
225	while (dst < end)
226	{
227	ptrdiff_t match_distance = 0;
228	for (match_begin = src; match_begin < src_end; match_begin += 1) {
229	const uint32_t pos = (uint32_t)(match_begin - src_begin);
230	const uint32_t w0 = load4(match_begin);
231	const uint32_t w1 = load4(match_begin + 1);
232	const uint32_t w2 = load4(match_begin + 2);
233	const uint32_t w3 = load4(match_begin + 3);
234	const int i0 = lz4_hash(w0);
235	const int i1 = lz4_hash(w1);
236	const int i2 = lz4_hash(w2);
237	const int i3 = lz4_hash(w3);
238	const uint8_t *c0 = src_begin + hash_table[i0].offset;
239	const uint8_t *c1 = src_begin + hash_table[i1].offset;
240	const uint8_t *c2 = src_begin + hash_table[i2].offset;
241	const uint8_t *c3 = src_begin + hash_table[i3].offset;
242	const uint32_t m0 = hash_table[i0].word;
243	const uint32_t m1 = hash_table[i1].word;
244	const uint32_t m2 = hash_table[i2].word;
245	const uint32_t m3 = hash_table[i3].word;
246	hash_table[i0].offset = pos;
247	hash_table[i0].word = w0;
248	hash_table[i1].offset = pos + 1;
249	hash_table[i1].word = w1;
250
251	hash_table[i2].offset = pos + 2;
252	hash_table[i2].word = w2;
253	hash_table[i3].offset = pos + 3;
254	hash_table[i3].word = w3;
255
256	match_distance = (match_begin - c0);
257	if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) {
258	match_end = match_begin + 4;
259	goto EXPAND_FORWARD;
260	}
261
262	match_begin++;
263	match_distance = (match_begin - c1);
264	if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) {
265	match_end = match_begin + 4;
266	goto EXPAND_FORWARD;
267	}
268
269	match_begin++;
270	match_distance = (match_begin - c2);
271	if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) {
272	match_end = match_begin + 4;
273	goto EXPAND_FORWARD;
274	}
275
276	match_begin++;
277	match_distance = (match_begin - c3);
278	if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) {
279	match_end = match_begin + 4;
280	goto EXPAND_FORWARD;
281	}
282
283	#if LZ4_EARLY_ABORT
284	//DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
285	if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) {
286	ptrdiff_t dstd = dst - dst_begin;
287
288	if (dstd == 0) {
289	lz4_early_aborts++;
290	return;
291	}
292
293	/* if (dstd >= pos) { */
294	/* return; */
295	/* } */
296	/* ptrdiff_t cbytes = pos - dstd; */
297	/* if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos) { */
298	/* return; */
299	/* } */
300	lz4_do_abort_eval = 0;
301	}
302	#endif
303	}
304
305	if (skip_final_literals) { src_ptr = src; dst_ptr = dst; return; } // do not emit the final literal sequence
306
307	// Emit a trailing literal that covers the remainder of the source buffer,
308	// if we can do so without exceeding the bounds of the destination buffer.
309	size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src;
310	if (src_remaining < 15) {
311	*dst++ = (uint8_t)(src_remaining << 4);
312	memcpy(dst, src, 16); dst += src_remaining;
313	} else {
314	*dst++ = 0xf0;
315	dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15));
316	if (dst == 0 \|\| dst + src_remaining >= end) return;
317	memcpy(dst, src, src_remaining); dst += src_remaining;
318	}
319	*dst_ptr = dst;
320	*src_ptr = src + src_remaining;
321	return;
322
323	EXPAND_FORWARD:
324
325	// Expand match forward
326	{
327	const uint8_t * ref_end = match_end - match_distance;
328	while (match_end < src_end)
329	{
330	size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end);
331	if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) { match_end += n; break; }
332	match_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
333	ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
334	}
335	}
336
337	// Expand match backward
338	{
339	// match_begin_min = max(src_begin + match_distance,literal)
340	const uint8_t * match_begin_min = src_begin + match_distance;
341	match_begin_min = (match_begin_min < src)?src:match_begin_min;
342	const uint8_t * ref_begin = match_begin - match_distance;
343
344	while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1] ) { match_begin -= 1; ref_begin -= 1; }
345	}
346
347	// Emit match
348	dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src);
349	if (!dst) return;
350
351	// Update state
352	src = match_end;
353
354	// Update return values to include the last fully encoded match
355	*dst_ptr = dst;
356	*src_ptr = src;
357	}
358	}
359
360	#endif
361
362	size_t lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
363	const uint8_t * __restrict src_buffer, size_t src_size,
364	lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES])
365	{
366	// Initialize hash table
367	const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 };
368
369	const uint8_t * src = src_buffer;
370	uint8_t * dst = dst_buffer;
371
372	// We need several blocks because our base function is limited to 2GB input
373	const size_t BLOCK_SIZE = 0x7ffff000;
374	while (src_size > 0)
375	{
376	//DRKTODO either implement pattern4 or figure out optimal unroll
377	//DRKTODO: bizarrely, with plain O3 the compiler generates a single
378	//DRKTODO: scalar STP per loop iteration with the stock loop
379	//DRKTODO If hand unrolled, it switches to NEON store pairs
380	// Reset hash table for each block
381	/* #if __STDC_HOSTED__ */
382	/* memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); */
383	/* #else */
384	/* for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;i++) hash_table[i] = HASH_FILL; */
385	/* #endif */
386
387	for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;) {
388	hash_table[i++] = HASH_FILL;
389	hash_table[i++] = HASH_FILL;
390	hash_table[i++] = HASH_FILL;
391	hash_table[i++] = HASH_FILL;
392	}
393
394	// Bytes to encode in this block
395	const size_t src_to_encode = src_size > BLOCK_SIZE ? BLOCK_SIZE : src_size;
396
397	// Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads.
398	// Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer
399	uint8_t * dst_start = dst;
400	const uint8_t * src_start = src;
401	lz4_encode_2gb(&dst, dst_size, &src, src, src_to_encode, hash_table, src_to_encode < src_size);
402
403	// Check progress
404	size_t dst_used = dst - dst_start;
405	size_t src_used = src - src_start; // src_used <= src_to_encode
406	if (src_to_encode == src_size && src_used < src_to_encode) return 0; // FAIL to encode last block
407
408	// Note that there is a potential problem here in case of non compressible data requiring more blocks.
409	// We may end up here with src_used very small, or even 0, and will not be able to make progress during
410	// compression. We FAIL unless the length of literals remaining at the end is small enough.
411	if (src_to_encode < src_size && src_to_encode - src_used >= (1<<16)) return 0; // FAIL too many literals
412
413	// Update counters (SRC and DST already have been updated)
414	src_size -= src_used;
415	dst_size -= dst_used;
416	}
417
418	return (size_t)(dst - dst_buffer); // bytes produced
419	}
420
// Branch prediction hints. The expression is normalised to 0/1 and the second
// argument of __builtin_expect is the value the compiler should assume.
#define likely(expr)     __builtin_expect((expr) != 0, 1)
#define unlikely(expr)   __builtin_expect((expr) != 0, 0)

// Vector of four uint32_t lanes, declared with 1-byte alignment
typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4))) __attribute__((__aligned__(1)));

// Decode LZ4 commands from [*src_ptr, src_end) into [*dst_ptr, dst_end).
//
// dst_begin is the lowest address a match is allowed to reference; a match
// reaching below it is rejected.
//
// Returns 0 when decoding stopped cleanly, either because the input was fully
// consumed or because the output buffer is full (a literal or match that does
// not fit is copied partially). In that case *dst_ptr and *src_ptr receive the
// positions reached. Note that when a literal is truncated by the end of the
// output buffer, the returned source position is the first byte of that
// literal.
//
// Returns 1 on malformed input (truncated command, zero match distance, match
// reference below dst_begin). In that case *dst_ptr and *src_ptr are left at
// the start of the command that failed, i.e. the last good position.
int lz4_decode(uint8_t ** dst_ptr,
               uint8_t * dst_begin,
               uint8_t * dst_end,
               const uint8_t ** src_ptr,
               const uint8_t * src_end)
{
  uint8_t * dst = *dst_ptr;
  const uint8_t * src = *src_ptr;

  // Require dst_end > dst.
  if (dst_end <= dst) goto OUT_FULL;

  while (src < src_end)
  {
    // Keep last good position
    *src_ptr = src;
    *dst_ptr = dst;

    uint8_t cmd = *src++; // 1 byte encoding literal+(match-4) length: LLLLMMMM
    uint32_t literalLength = (cmd >> 4) & 15; // 0..15
    uint32_t matchLength = 4 + (cmd & 15); // 4..19

    // extra bytes for literalLength
    if (unlikely(literalLength == 15))
    {
      uint8_t s;
      do {
#if DEBUG_LZ4_DECODE_ERRORS
        if (unlikely(src >= src_end)) printf("Truncated SRC literal length\n");
#endif
        if (unlikely(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed)
        s = *src++;
        literalLength += s;
      } while (unlikely(s == 255));
    }

    // copy literal
#if DEBUG_LZ4_DECODE_ERRORS
    if (unlikely(literalLength > (size_t)(src_end - src))) printf("Truncated SRC literal\n");
#endif
    if (unlikely(literalLength > (size_t)(src_end - src))) goto IN_FAIL;
    if (unlikely(literalLength > (size_t)(dst_end - dst))) {
      // literal will take us past the end of the destination buffer,
      // so we can only copy part of it.
      literalLength = (uint32_t)(dst_end - dst);
      memcpy(dst, src, literalLength);
      dst += literalLength;
      goto OUT_FULL;
    }
    memcpy(dst, src, literalLength);
    src += literalLength;
    dst += literalLength;

    if (unlikely(src >= src_end)) goto OUT_FULL; // valid end of stream
#if DEBUG_LZ4_DECODE_ERRORS
    if (unlikely(2 > (size_t)(src_end - src))) printf("Truncated SRC distance\n");
#endif
    if (unlikely(2 > (size_t)(src_end - src))) goto IN_FAIL; // unexpected end of input (2 bytes needed)

    // match distance: 2 bytes stored little-endian in the stream. The value is
    // assembled byte by byte so that the bytes are actually read from the
    // input, with no alignment or aliasing requirement on src.
    uint64_t matchDistance = (uint64_t)src[0] | ((uint64_t)src[1] << 8); // 0x0000 <= matchDistance <= 0xffff
    src += 2;
#if DEBUG_LZ4_DECODE_ERRORS
    if (matchDistance == 0) printf("Invalid match distance D = 0\n");
#endif
    if (unlikely(matchDistance == 0)) goto IN_FAIL; // 0x0000 invalid
    uint8_t * ref = dst - matchDistance;
#if DEBUG_LZ4_DECODE_ERRORS
    if (unlikely(ref < dst_begin)) printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n",(unsigned long long)matchDistance,(void *)dst_begin,(void *)dst,(void *)dst_end);
#endif
    if (unlikely(ref < dst_begin)) goto OUT_FAIL; // out of range

    // extra bytes for matchLength
    if (unlikely(matchLength == 19))
    {
      uint8_t s;
      do {
#if DEBUG_LZ4_DECODE_ERRORS
        if (unlikely(src >= src_end)) printf("Truncated SRC match length\n");
#endif
        if (unlikely(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed)
        s = *src++;
        matchLength += s;
      } while (unlikely(s == 255));
    }

    // copy match (may overlap, hence the byte by byte forward copy)
    if (unlikely(matchLength > (size_t)(dst_end - dst))) {
      // match will take us past the end of the destination buffer,
      // so we can only copy part of it.
      matchLength = (uint32_t)(dst_end - dst);
      for (uint32_t i=0; i<matchLength; ++i) dst[i] = ref[i];
      dst += matchLength;
      goto OUT_FULL;
    }
    for (uint64_t i=0;i<matchLength;i++) dst[i] = ref[i];
    dst += matchLength;
  }

  // We reached the end of the input buffer after a full instruction
OUT_FULL:
  // Or we reached the end of the output buffer
  *dst_ptr = dst;
  *src_ptr = src;
  return 0;

  // Error conditions
OUT_FAIL:
IN_FAIL:
  return 1; // FAIL
}