2 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
31 // Mar 2016 Imported from the Compression project, with minor optimisations and
32 // early abort detection (Derek Kumar)
36 size_t lz4raw_decode_buffer(uint8_t * __restrict dst_buffer
, size_t dst_size
,
37 const uint8_t * __restrict src_buffer
, size_t src_size
,
38 void * __restrict work
__attribute__((unused
)))
40 const uint8_t * src
= src_buffer
;
41 uint8_t * dst
= dst_buffer
;
43 // Go fast if we can, keeping away from the end of buffers
44 #if LZ4_ENABLE_ASSEMBLY_DECODE
45 if (dst_size
> LZ4_GOFAST_SAFETY_MARGIN
&& src_size
> LZ4_GOFAST_SAFETY_MARGIN
)
47 if (lz4_decode_asm(&dst
, dst_buffer
, dst_buffer
+ dst_size
- LZ4_GOFAST_SAFETY_MARGIN
, &src
, src_buffer
+ src_size
- LZ4_GOFAST_SAFETY_MARGIN
))
51 //DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers?
54 if (lz4_decode(&dst
, dst_buffer
, dst_buffer
+ dst_size
, &src
, src_buffer
+ src_size
))
57 return (size_t)(dst
- dst_buffer
); // bytes produced
61 #define DEBUG_LZ4_ENCODE_ERRORS (1)
62 #define DEBUG_LZ4_DECODE_ERRORS (1)
65 #if DEBUG_LZ4_ENCODE_ERRORS
68 #if !LZ4_ENABLE_ASSEMBLY_ENCODE
70 #if defined(__x86_64__) || defined(__x86_64h__)
71 # define LZ4_MATCH_SEARCH_INIT_SIZE 32
72 # define LZ4_MATCH_SEARCH_LOOP_SIZE 32
74 # define LZ4_MATCH_SEARCH_INIT_SIZE 8
75 # define LZ4_MATCH_SEARCH_LOOP_SIZE 8
78 // Return hash for 4-byte sequence X
79 static inline uint32_t lz4_hash(uint32_t x
) { return (x
* 2654435761U) >> (32 - LZ4_COMPRESS_HASH_BITS
); }
81 // Store 0xfff..fff at *PTR
82 static inline void lz4_fill16(uint8_t * ptr
)
88 // Return number of matching bytes 0..4 at positions A and B.
89 static inline size_t lz4_nmatch4(const uint8_t * a
,const uint8_t * b
)
91 uint32_t x
= load4(a
) ^ load4(b
);
92 return (x
== 0)?4:(__builtin_ctzl(x
) >> 3);
95 // Return number of matching bytes 0..8 at positions A and B.
96 static inline size_t lz4_nmatch8(const uint8_t * a
,const uint8_t * b
)
98 uint64_t x
= load8(a
) ^ load8(b
);
99 return (x
== 0)?8:(__builtin_ctzll(x
) >> 3);
102 // Return number of matching bytes 0..16 at positions A and B.
103 static inline size_t lz4_nmatch16(const uint8_t * a
,const uint8_t * b
)
105 size_t n
= lz4_nmatch8(a
,b
);
106 return (n
== 8)?(8 + lz4_nmatch8(a
+8,b
+8)):n
;
109 // Return number of matching bytes 0..32 at positions A and B.
110 static inline size_t lz4_nmatch32(const uint8_t * a
,const uint8_t * b
)
112 size_t n
= lz4_nmatch16(a
,b
);
113 return (n
== 16)?(16 + lz4_nmatch16(a
+16,b
+16)):n
;
116 // Return number of matching bytes 0..64 at positions A and B.
117 static inline size_t lz4_nmatch64(const uint8_t * a
,const uint8_t * b
)
119 size_t n
= lz4_nmatch32(a
,b
);
120 return (n
== 32)?(32 + lz4_nmatch32(a
+32,b
+32)):n
;
123 // Compile-time selection, return number of matching bytes 0..N at positions A and B.
124 static inline size_t lz4_nmatch(int N
, const uint8_t * a
, const uint8_t * b
)
127 case 4: return lz4_nmatch4(a
,b
);
128 case 8: return lz4_nmatch8(a
,b
);
129 case 16: return lz4_nmatch16(a
,b
);
130 case 32: return lz4_nmatch32(a
,b
);
131 case 64: return lz4_nmatch64(a
,b
);
133 __builtin_trap(); // FAIL
136 // Store LENGTH in DST using the literal_length/match_length extension scheme: X is the sum of all bytes until we reach a byte < 0xff.
137 // We are allowed to access a constant number of bytes above DST_END.
138 // Return incremented DST pointer on success, and 0 on failure
139 static inline uint8_t *lz4_store_length(uint8_t * dst
, const uint8_t * const end
, uint32_t L
) {
141 while (L
>= 17*255) {
147 //DRKTODO verify these modulos/divisions are optimally handled by clang
153 static inline uint32_t clamp(uint32_t x
, uint32_t max
) __attribute__((overloadable
)) { return x
> max
? max
: x
; }
155 static inline uint8_t *copy_literal(uint8_t *dst
, const uint8_t * restrict src
, uint32_t L
) {
156 uint8_t *end
= dst
+ L
;
157 { copy16(dst
, src
); dst
+= 16; src
+= 16; }
158 while (dst
< end
) { copy32(dst
, src
); dst
+= 32; src
+= 32; }
162 static uint8_t *lz4_emit_match(uint32_t L
, uint32_t M
, uint32_t D
,
163 uint8_t * restrict dst
,
164 const uint8_t * const end
,
165 const uint8_t * restrict src
) {
166 // The LZ4 encoding scheme requires that M is at least 4, because
167 // the actual value stored by the encoding is M - 4. Check this
168 // requirement for debug builds.
169 assert(M
>= 4 && "LZ4 encoding requires that M is at least 4");
170 // Having checked that M >= 4, translate M by four.
172 // Similarly, we must have D < 2**16, because we use only two bytes
173 // to represent the value of D in the encoding.
174 assert(D
<= USHRT_MAX
&& "LZ4 encoding requries that D can be stored in two bytes.");
175 // Construct the command byte by clamping both L and M to 0 ... 15
176 // and packing them into a single byte, and store it.
177 *dst
++ = clamp(L
, 15) << 4 | clamp(M
, 15);
178 // If L is 15 or greater, we need to encode extra literal length bytes.
180 dst
= lz4_store_length(dst
, end
, L
- 15);
181 if (dst
== 0 || dst
+ L
>= end
) return NULL
;
183 // Copy the literal itself from src to dst.
184 dst
= copy_literal(dst
, src
, L
);
185 // Store match distance.
186 store2(dst
, D
); dst
+= 2;
187 // If M is 15 or greater, we need to encode extra match length bytes.
189 dst
= lz4_store_length(dst
, end
, M
- 15);
190 if (dst
== 0) return NULL
;
195 /* #ifndef LZ4_EARLY_ABORT */
196 /* #define LZ4_EARLY_ABORT (1) */
200 int lz4_do_early_abort
= 1;
201 int lz4_early_aborts
= 0;
202 #define LZ4_EARLY_ABORT_EVAL (448)
203 #define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20)
204 #endif /* LZ4_EARLY_ABORT */
206 void lz4_encode_2gb(uint8_t ** dst_ptr
,
208 const uint8_t ** src_ptr
,
209 const uint8_t * src_begin
,
211 lz4_hash_entry_t hash_table
[LZ4_COMPRESS_HASH_ENTRIES
],
212 int skip_final_literals
)
214 uint8_t *dst
= *dst_ptr
; // current output stream position
215 uint8_t *end
= dst
+ dst_size
- LZ4_GOFAST_SAFETY_MARGIN
;
216 const uint8_t *src
= *src_ptr
; // current input stream literal to encode
217 const uint8_t *src_end
= src
+ src_size
- LZ4_GOFAST_SAFETY_MARGIN
;
218 const uint8_t *match_begin
= 0; // first byte of matched sequence
219 const uint8_t *match_end
= 0; // first byte after matched sequence
221 uint8_t * const dst_begin
= dst
;
222 uint32_t lz4_do_abort_eval
= lz4_do_early_abort
;
227 ptrdiff_t match_distance
= 0;
228 for (match_begin
= src
; match_begin
< src_end
; match_begin
+= 1) {
229 const uint32_t pos
= (uint32_t)(match_begin
- src_begin
);
230 const uint32_t w0
= load4(match_begin
);
231 const uint32_t w1
= load4(match_begin
+ 1);
232 const uint32_t w2
= load4(match_begin
+ 2);
233 const uint32_t w3
= load4(match_begin
+ 3);
234 const int i0
= lz4_hash(w0
);
235 const int i1
= lz4_hash(w1
);
236 const int i2
= lz4_hash(w2
);
237 const int i3
= lz4_hash(w3
);
238 const uint8_t *c0
= src_begin
+ hash_table
[i0
].offset
;
239 const uint8_t *c1
= src_begin
+ hash_table
[i1
].offset
;
240 const uint8_t *c2
= src_begin
+ hash_table
[i2
].offset
;
241 const uint8_t *c3
= src_begin
+ hash_table
[i3
].offset
;
242 const uint32_t m0
= hash_table
[i0
].word
;
243 const uint32_t m1
= hash_table
[i1
].word
;
244 const uint32_t m2
= hash_table
[i2
].word
;
245 const uint32_t m3
= hash_table
[i3
].word
;
246 hash_table
[i0
].offset
= pos
;
247 hash_table
[i0
].word
= w0
;
248 hash_table
[i1
].offset
= pos
+ 1;
249 hash_table
[i1
].word
= w1
;
251 hash_table
[i2
].offset
= pos
+ 2;
252 hash_table
[i2
].word
= w2
;
253 hash_table
[i3
].offset
= pos
+ 3;
254 hash_table
[i3
].word
= w3
;
256 match_distance
= (match_begin
- c0
);
257 if (w0
== m0
&& match_distance
< 0x10000 && match_distance
> 0) {
258 match_end
= match_begin
+ 4;
263 match_distance
= (match_begin
- c1
);
264 if (w1
== m1
&& match_distance
< 0x10000 && match_distance
> 0) {
265 match_end
= match_begin
+ 4;
270 match_distance
= (match_begin
- c2
);
271 if (w2
== m2
&& match_distance
< 0x10000 && match_distance
> 0) {
272 match_end
= match_begin
+ 4;
277 match_distance
= (match_begin
- c3
);
278 if (w3
== m3
&& match_distance
< 0x10000 && match_distance
> 0) {
279 match_end
= match_begin
+ 4;
284 //DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
285 if (lz4_do_abort_eval
&& ((pos
) >= LZ4_EARLY_ABORT_EVAL
)) {
286 ptrdiff_t dstd
= dst
- dst_begin
;
293 /* if (dstd >= pos) { */
296 /* ptrdiff_t cbytes = pos - dstd; */
297 /* if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos) { */
300 lz4_do_abort_eval
= 0;
305 if (skip_final_literals
) { *src_ptr
= src
; *dst_ptr
= dst
; return; } // do not emit the final literal sequence
307 // Emit a trailing literal that covers the remainder of the source buffer,
308 // if we can do so without exceeding the bounds of the destination buffer.
309 size_t src_remaining
= src_end
+ LZ4_GOFAST_SAFETY_MARGIN
- src
;
310 if (src_remaining
< 15) {
311 *dst
++ = (uint8_t)(src_remaining
<< 4);
312 memcpy(dst
, src
, 16); dst
+= src_remaining
;
315 dst
= lz4_store_length(dst
, end
, (uint32_t)(src_remaining
- 15));
316 if (dst
== 0 || dst
+ src_remaining
>= end
) return;
317 memcpy(dst
, src
, src_remaining
); dst
+= src_remaining
;
320 *src_ptr
= src
+ src_remaining
;
325 // Expand match forward
327 const uint8_t * ref_end
= match_end
- match_distance
;
328 while (match_end
< src_end
)
330 size_t n
= lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE
, ref_end
, match_end
);
331 if (n
< LZ4_MATCH_SEARCH_LOOP_SIZE
) { match_end
+= n
; break; }
332 match_end
+= LZ4_MATCH_SEARCH_LOOP_SIZE
;
333 ref_end
+= LZ4_MATCH_SEARCH_LOOP_SIZE
;
337 // Expand match backward
339 // match_begin_min = max(src_begin + match_distance,literal)
340 const uint8_t * match_begin_min
= src_begin
+ match_distance
;
341 match_begin_min
= (match_begin_min
< src
)?src
:match_begin_min
;
342 const uint8_t * ref_begin
= match_begin
- match_distance
;
344 while (match_begin
> match_begin_min
&& ref_begin
[-1] == match_begin
[-1] ) { match_begin
-= 1; ref_begin
-= 1; }
348 dst
= lz4_emit_match((uint32_t)(match_begin
- src
), (uint32_t)(match_end
- match_begin
), (uint32_t)match_distance
, dst
, end
, src
);
354 // Update return values to include the last fully encoded match
362 size_t lz4raw_encode_buffer(uint8_t * __restrict dst_buffer
, size_t dst_size
,
363 const uint8_t * __restrict src_buffer
, size_t src_size
,
364 lz4_hash_entry_t hash_table
[LZ4_COMPRESS_HASH_ENTRIES
])
366 // Initialize hash table
367 const lz4_hash_entry_t HASH_FILL
= { .offset
= 0x80000000, .word
= 0x0 };
369 const uint8_t * src
= src_buffer
;
370 uint8_t * dst
= dst_buffer
;
372 // We need several blocks because our base function is limited to 2GB input
373 const size_t BLOCK_SIZE
= 0x7ffff000;
376 //DRKTODO either implement pattern4 or figure out optimal unroll
377 //DRKTODO: bizarrely, with plain O3 the compiler generates a single
378 //DRKTODO: scalar STP per loop iteration with the stock loop
379 //DRKTODO If hand unrolled, it switches to NEON store pairs
380 // Reset hash table for each block
381 /* #if __STDC_HOSTED__ */
382 /* memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); */
384 /* for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;i++) hash_table[i] = HASH_FILL; */
387 for (int i
=0;i
<LZ4_COMPRESS_HASH_ENTRIES
;) {
388 hash_table
[i
++] = HASH_FILL
;
389 hash_table
[i
++] = HASH_FILL
;
390 hash_table
[i
++] = HASH_FILL
;
391 hash_table
[i
++] = HASH_FILL
;
394 // Bytes to encode in this block
395 const size_t src_to_encode
= src_size
> BLOCK_SIZE
? BLOCK_SIZE
: src_size
;
397 // Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads.
398 // Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer
399 uint8_t * dst_start
= dst
;
400 const uint8_t * src_start
= src
;
401 lz4_encode_2gb(&dst
, dst_size
, &src
, src
, src_to_encode
, hash_table
, src_to_encode
< src_size
);
404 size_t dst_used
= dst
- dst_start
;
405 size_t src_used
= src
- src_start
; // src_used <= src_to_encode
406 if (src_to_encode
== src_size
&& src_used
< src_to_encode
) return 0; // FAIL to encode last block
408 // Note that there is a potential problem here in case of non compressible data requiring more blocks.
409 // We may end up here with src_used very small, or even 0, and will not be able to make progress during
410 // compression. We FAIL unless the length of literals remaining at the end is small enough.
411 if (src_to_encode
< src_size
&& src_to_encode
- src_used
>= (1<<16)) return 0; // FAIL too many literals
413 // Update counters (SRC and DST already have been updated)
414 src_size
-= src_used
;
415 dst_size
-= dst_used
;
418 return (size_t)(dst
- dst_buffer
); // bytes produced
421 #define likely(expr) __builtin_expect((expr) != 0, 1)
422 #define unlikely(expr) __builtin_expect((expr) != 0, 0)
423 typedef uint32_t lz4_uint128
__attribute__((ext_vector_type(4))) __attribute__((__aligned__(1)));
425 int lz4_decode(uint8_t ** dst_ptr
,
428 const uint8_t ** src_ptr
,
429 const uint8_t * src_end
)
431 uint8_t * dst
= *dst_ptr
;
432 const uint8_t * src
= *src_ptr
;
434 // Require dst_end > dst.
435 if (dst_end
<= dst
) goto OUT_FULL
;
437 while (src
< src_end
)
439 // Keep last good position
443 uint8_t cmd
= *src
++; // 1 byte encoding literal+(match-4) length: LLLLMMMM
444 uint32_t literalLength
= (cmd
>> 4) & 15; // 0..15
445 uint32_t matchLength
= 4 + (cmd
& 15); // 4..19
447 // extra bytes for literalLength
448 if (unlikely(literalLength
== 15))
452 #if DEBUG_LZ4_DECODE_ERRORS
453 if (unlikely(src
>= src_end
)) printf("Truncated SRC literal length\n");
455 if (unlikely(src
>= src_end
)) goto IN_FAIL
; // unexpected end of input (1 byte needed)
458 } while (unlikely(s
== 255));
462 #if DEBUG_LZ4_DECODE_ERRORS
463 if (unlikely(literalLength
> (size_t)(src_end
- src
))) printf("Truncated SRC literal\n");
465 if (unlikely(literalLength
> (size_t)(src_end
- src
))) goto IN_FAIL
;
466 if (unlikely(literalLength
> (size_t)(dst_end
- dst
))) {
467 // literal will take us past the end of the destination buffer,
468 // so we can only copy part of it.
469 literalLength
= (uint32_t)(dst_end
- dst
);
470 memcpy(dst
, src
, literalLength
);
471 dst
+= literalLength
;
474 memcpy(dst
,src
,literalLength
);
475 src
+= literalLength
;
476 dst
+= literalLength
;
478 if (unlikely(src
>= src_end
)) goto OUT_FULL
; // valid end of stream
479 #if DEBUG_LZ4_DECODE_ERRORS
480 if (unlikely(2 > (size_t)(src_end
- src
))) printf("Truncated SRC distance\n");
482 if (unlikely(2 > (size_t)(src_end
- src
))) goto IN_FAIL
; // unexpected end of input (2 bytes needed)
484 //DRKTODO: this causes an alignment increase warning (legitimate?)
485 //DRKTODO: cast of char * to uint16_t*
486 #pragma clang diagnostic push
487 #pragma clang diagnostic ignored "-Wcast-align"
490 uint64_t matchDistance
= *(const uint16_t *)src
; // 0x0000 <= matchDistance <= 0xffff
491 #pragma clang diagnostic pop
493 #if DEBUG_LZ4_DECODE_ERRORS
494 if (matchDistance
== 0) printf("Invalid match distance D = 0\n");
496 if (unlikely(matchDistance
== 0)) goto IN_FAIL
; // 0x0000 invalid
497 uint8_t * ref
= dst
- matchDistance
;
498 #if DEBUG_LZ4_DECODE_ERRORS
499 if (unlikely(ref
< dst_begin
)) printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n",matchDistance
,dst_begin
,dst
,dst_end
);
501 if (unlikely(ref
< dst_begin
)) goto OUT_FAIL
; // out of range
503 // extra bytes for matchLength
504 if (unlikely(matchLength
== 19))
508 #if DEBUG_LZ4_DECODE_ERRORS
509 if (unlikely(src
>= src_end
)) printf("Truncated SRC match length\n");
511 if (unlikely(src
>= src_end
)) goto IN_FAIL
; // unexpected end of input (1 byte needed)
514 } while (unlikely(s
== 255));
517 // copy match (may overlap)
518 if (unlikely(matchLength
> (size_t)(dst_end
- dst
))) {
519 // match will take us past the end of the destination buffer,
520 // so we can only copy part of it.
521 matchLength
= (uint32_t)(dst_end
- dst
);
522 for (uint32_t i
=0; i
<matchLength
; ++i
) dst
[i
] = ref
[i
];
526 for (uint64_t i
=0;i
<matchLength
;i
++) dst
[i
] = ref
[i
];
530 // We reached the end of the input buffer after a full instruction
532 // Or we reached the end of the output buffer