2 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
31 // Mar 2016 Imported from the Compression project, with minor optimisations and
32 // early abort detection (Derek Kumar)
35 #define memcpy __builtin_memcpy
38 lz4raw_decode_buffer(uint8_t * __restrict dst_buffer
, size_t dst_size
,
39 const uint8_t * __restrict src_buffer
, size_t src_size
,
40 void * __restrict work
__attribute__((unused
)))
42 const uint8_t * src
= src_buffer
;
43 uint8_t * dst
= dst_buffer
;
45 // Go fast if we can, keeping away from the end of buffers
46 #if LZ4_ENABLE_ASSEMBLY_DECODE
47 if (dst_size
> LZ4_GOFAST_SAFETY_MARGIN
&& src_size
> LZ4_GOFAST_SAFETY_MARGIN
) {
48 if (lz4_decode_asm(&dst
, dst_buffer
, dst_buffer
+ dst_size
- LZ4_GOFAST_SAFETY_MARGIN
, &src
, src_buffer
+ src_size
- LZ4_GOFAST_SAFETY_MARGIN
)) {
53 //DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers?
56 if (lz4_decode(&dst
, dst_buffer
, dst_buffer
+ dst_size
, &src
, src_buffer
+ src_size
)) {
59 return (size_t)(dst
- dst_buffer
); // bytes produced
63 #define DEBUG_LZ4_ENCODE_ERRORS (1)
64 #define DEBUG_LZ4_DECODE_ERRORS (1)
67 #if DEBUG_LZ4_ENCODE_ERRORS
70 #if !LZ4_ENABLE_ASSEMBLY_ENCODE
72 #if defined(__x86_64__) || defined(__x86_64h__)
73 # define LZ4_MATCH_SEARCH_INIT_SIZE 32
74 # define LZ4_MATCH_SEARCH_LOOP_SIZE 32
76 # define LZ4_MATCH_SEARCH_INIT_SIZE 8
77 # define LZ4_MATCH_SEARCH_LOOP_SIZE 8
80 // Return hash for 4-byte sequence X
81 static inline uint32_t
84 return (x
* 2654435761U) >> (32 - LZ4_COMPRESS_HASH_BITS
);
87 // Store 0xfff..fff at *PTR
89 lz4_fill16(uint8_t * ptr
)
95 // Return number of matching bytes 0..4 at positions A and B.
97 lz4_nmatch4(const uint8_t * a
, const uint8_t * b
)
99 uint32_t x
= load4(a
) ^ load4(b
);
100 return (x
== 0)?4:(__builtin_ctzl(x
) >> 3);
103 // Return number of matching bytes 0..8 at positions A and B.
105 lz4_nmatch8(const uint8_t * a
, const uint8_t * b
)
107 uint64_t x
= load8(a
) ^ load8(b
);
108 return (x
== 0)?8:(__builtin_ctzll(x
) >> 3);
111 // Return number of matching bytes 0..16 at positions A and B.
113 lz4_nmatch16(const uint8_t * a
, const uint8_t * b
)
115 size_t n
= lz4_nmatch8(a
, b
);
116 return (n
== 8)?(8 + lz4_nmatch8(a
+ 8, b
+ 8)):n
;
119 // Return number of matching bytes 0..32 at positions A and B.
121 lz4_nmatch32(const uint8_t * a
, const uint8_t * b
)
123 size_t n
= lz4_nmatch16(a
, b
);
124 return (n
== 16)?(16 + lz4_nmatch16(a
+ 16, b
+ 16)):n
;
127 // Return number of matching bytes 0..64 at positions A and B.
129 lz4_nmatch64(const uint8_t * a
, const uint8_t * b
)
131 size_t n
= lz4_nmatch32(a
, b
);
132 return (n
== 32)?(32 + lz4_nmatch32(a
+ 32, b
+ 32)):n
;
135 // Compile-time selection, return number of matching bytes 0..N at positions A and B.
137 lz4_nmatch(int N
, const uint8_t * a
, const uint8_t * b
)
140 case 4: return lz4_nmatch4(a
, b
);
141 case 8: return lz4_nmatch8(a
, b
);
142 case 16: return lz4_nmatch16(a
, b
);
143 case 32: return lz4_nmatch32(a
, b
);
144 case 64: return lz4_nmatch64(a
, b
);
146 __builtin_trap(); // FAIL
149 // Store LENGTH in DST using the literal_length/match_length extension scheme: X is the sum of all bytes until we reach a byte < 0xff.
150 // We are allowed to access a constant number of bytes above DST_END.
151 // Return incremented DST pointer on success, and 0 on failure
152 static inline uint8_t *
153 lz4_store_length(uint8_t * dst
, const uint8_t * const end
, uint32_t L
)
156 while (L
>= 17 * 255) {
162 //DRKTODO verify these modulos/divisions are optimally handled by clang
168 static inline uint32_t
169 clamp(uint32_t x
, uint32_t max
)
170 __attribute__((overloadable
))
172 return x
> max
? max
: x
;
175 static inline uint8_t *
176 copy_literal(uint8_t *dst
, const uint8_t * restrict src
, uint32_t L
)
178 uint8_t *end
= dst
+ L
;
179 { copy16(dst
, src
); dst
+= 16; src
+= 16; }
181 copy32(dst
, src
); dst
+= 32; src
+= 32;
187 lz4_emit_match(uint32_t L
, uint32_t M
, uint32_t D
,
188 uint8_t * restrict dst
,
189 const uint8_t * const end
,
190 const uint8_t * restrict src
)
192 // The LZ4 encoding scheme requires that M is at least 4, because
193 // the actual value stored by the encoding is M - 4. Check this
194 // requirement for debug builds.
195 assert(M
>= 4 && "LZ4 encoding requires that M is at least 4");
196 // Having checked that M >= 4, translate M by four.
198 // Similarly, we must have D < 2**16, because we use only two bytes
199 // to represent the value of D in the encoding.
200 assert(D
<= USHRT_MAX
&& "LZ4 encoding requries that D can be stored in two bytes.");
201 // Construct the command byte by clamping both L and M to 0 ... 15
202 // and packing them into a single byte, and store it.
203 *dst
++ = clamp(L
, 15) << 4 | clamp(M
, 15);
204 // If L is 15 or greater, we need to encode extra literal length bytes.
206 dst
= lz4_store_length(dst
, end
, L
- 15);
207 if (dst
== 0 || dst
+ L
>= end
) {
211 // Copy the literal itself from src to dst.
212 dst
= copy_literal(dst
, src
, L
);
213 // Store match distance.
214 store2(dst
, D
); dst
+= 2;
215 // If M is 15 or greater, we need to encode extra match length bytes.
217 dst
= lz4_store_length(dst
, end
, M
- 15);
225 /* #ifndef LZ4_EARLY_ABORT */
226 /* #define LZ4_EARLY_ABORT (1) */
230 int lz4_do_early_abort
= 1;
231 int lz4_early_aborts
= 0;
232 #define LZ4_EARLY_ABORT_EVAL (448)
233 #define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20)
234 #endif /* LZ4_EARLY_ABORT */
237 lz4_encode_2gb(uint8_t ** dst_ptr
,
239 const uint8_t ** src_ptr
,
240 const uint8_t * src_begin
,
242 lz4_hash_entry_t hash_table
[LZ4_COMPRESS_HASH_ENTRIES
],
243 int skip_final_literals
)
245 uint8_t *dst
= *dst_ptr
; // current output stream position
246 uint8_t *end
= dst
+ dst_size
- LZ4_GOFAST_SAFETY_MARGIN
;
247 const uint8_t *src
= *src_ptr
; // current input stream literal to encode
248 const uint8_t *src_end
= src
+ src_size
- LZ4_GOFAST_SAFETY_MARGIN
;
249 const uint8_t *match_begin
= 0; // first byte of matched sequence
250 const uint8_t *match_end
= 0; // first byte after matched sequence
252 uint8_t * const dst_begin
= dst
;
253 uint32_t lz4_do_abort_eval
= lz4_do_early_abort
;
257 ptrdiff_t match_distance
= 0;
258 for (match_begin
= src
; match_begin
< src_end
; match_begin
+= 1) {
259 const uint32_t pos
= (uint32_t)(match_begin
- src_begin
);
260 const uint32_t w0
= load4(match_begin
);
261 const uint32_t w1
= load4(match_begin
+ 1);
262 const uint32_t w2
= load4(match_begin
+ 2);
263 const uint32_t w3
= load4(match_begin
+ 3);
264 const int i0
= lz4_hash(w0
);
265 const int i1
= lz4_hash(w1
);
266 const int i2
= lz4_hash(w2
);
267 const int i3
= lz4_hash(w3
);
268 const uint8_t *c0
= src_begin
+ hash_table
[i0
].offset
;
269 const uint8_t *c1
= src_begin
+ hash_table
[i1
].offset
;
270 const uint8_t *c2
= src_begin
+ hash_table
[i2
].offset
;
271 const uint8_t *c3
= src_begin
+ hash_table
[i3
].offset
;
272 const uint32_t m0
= hash_table
[i0
].word
;
273 const uint32_t m1
= hash_table
[i1
].word
;
274 const uint32_t m2
= hash_table
[i2
].word
;
275 const uint32_t m3
= hash_table
[i3
].word
;
276 hash_table
[i0
].offset
= pos
;
277 hash_table
[i0
].word
= w0
;
278 hash_table
[i1
].offset
= pos
+ 1;
279 hash_table
[i1
].word
= w1
;
281 hash_table
[i2
].offset
= pos
+ 2;
282 hash_table
[i2
].word
= w2
;
283 hash_table
[i3
].offset
= pos
+ 3;
284 hash_table
[i3
].word
= w3
;
286 match_distance
= (match_begin
- c0
);
287 if (w0
== m0
&& match_distance
< 0x10000 && match_distance
> 0) {
288 match_end
= match_begin
+ 4;
293 match_distance
= (match_begin
- c1
);
294 if (w1
== m1
&& match_distance
< 0x10000 && match_distance
> 0) {
295 match_end
= match_begin
+ 4;
300 match_distance
= (match_begin
- c2
);
301 if (w2
== m2
&& match_distance
< 0x10000 && match_distance
> 0) {
302 match_end
= match_begin
+ 4;
307 match_distance
= (match_begin
- c3
);
308 if (w3
== m3
&& match_distance
< 0x10000 && match_distance
> 0) {
309 match_end
= match_begin
+ 4;
314 //DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
315 if (lz4_do_abort_eval
&& ((pos
) >= LZ4_EARLY_ABORT_EVAL
)) {
316 ptrdiff_t dstd
= dst
- dst_begin
;
323 /* if (dstd >= pos) { */
326 /* ptrdiff_t cbytes = pos - dstd; */
327 /* if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos) { */
330 lz4_do_abort_eval
= 0;
335 if (skip_final_literals
) {
336 *src_ptr
= src
; *dst_ptr
= dst
; return;
337 } // do not emit the final literal sequence
339 // Emit a trailing literal that covers the remainder of the source buffer,
340 // if we can do so without exceeding the bounds of the destination buffer.
341 size_t src_remaining
= src_end
+ LZ4_GOFAST_SAFETY_MARGIN
- src
;
342 if (src_remaining
< 15) {
343 *dst
++ = (uint8_t)(src_remaining
<< 4);
344 memcpy(dst
, src
, 16); dst
+= src_remaining
;
347 dst
= lz4_store_length(dst
, end
, (uint32_t)(src_remaining
- 15));
348 if (dst
== 0 || dst
+ src_remaining
>= end
) {
351 memcpy(dst
, src
, src_remaining
); dst
+= src_remaining
;
354 *src_ptr
= src
+ src_remaining
;
359 // Expand match forward
361 const uint8_t * ref_end
= match_end
- match_distance
;
362 while (match_end
< src_end
) {
363 size_t n
= lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE
, ref_end
, match_end
);
364 if (n
< LZ4_MATCH_SEARCH_LOOP_SIZE
) {
365 match_end
+= n
; break;
367 match_end
+= LZ4_MATCH_SEARCH_LOOP_SIZE
;
368 ref_end
+= LZ4_MATCH_SEARCH_LOOP_SIZE
;
372 // Expand match backward
374 // match_begin_min = max(src_begin + match_distance,literal)
375 const uint8_t * match_begin_min
= src_begin
+ match_distance
;
376 match_begin_min
= (match_begin_min
< src
)?src
:match_begin_min
;
377 const uint8_t * ref_begin
= match_begin
- match_distance
;
379 while (match_begin
> match_begin_min
&& ref_begin
[-1] == match_begin
[-1]) {
380 match_begin
-= 1; ref_begin
-= 1;
385 dst
= lz4_emit_match((uint32_t)(match_begin
- src
), (uint32_t)(match_end
- match_begin
), (uint32_t)match_distance
, dst
, end
, src
);
393 // Update return values to include the last fully encoded match
402 lz4raw_encode_buffer(uint8_t * __restrict dst_buffer
, size_t dst_size
,
403 const uint8_t * __restrict src_buffer
, size_t src_size
,
404 lz4_hash_entry_t hash_table
[LZ4_COMPRESS_HASH_ENTRIES
])
406 // Initialize hash table
407 const lz4_hash_entry_t HASH_FILL
= { .offset
= 0x80000000, .word
= 0x0 };
409 const uint8_t * src
= src_buffer
;
410 uint8_t * dst
= dst_buffer
;
412 // We need several blocks because our base function is limited to 2GB input
413 const size_t BLOCK_SIZE
= 0x7ffff000;
414 while (src_size
> 0) {
415 //DRKTODO either implement pattern4 or figure out optimal unroll
416 //DRKTODO: bizarrely, with plain O3 the compiler generates a single
417 //DRKTODO: scalar STP per loop iteration with the stock loop
418 //DRKTODO If hand unrolled, it switches to NEON store pairs
419 // Reset hash table for each block
420 /* #if __STDC_HOSTED__ */
421 /* memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); */
423 /* for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;i++) hash_table[i] = HASH_FILL; */
426 for (int i
= 0; i
< LZ4_COMPRESS_HASH_ENTRIES
;) {
427 hash_table
[i
++] = HASH_FILL
;
428 hash_table
[i
++] = HASH_FILL
;
429 hash_table
[i
++] = HASH_FILL
;
430 hash_table
[i
++] = HASH_FILL
;
433 // Bytes to encode in this block
434 const size_t src_to_encode
= src_size
> BLOCK_SIZE
? BLOCK_SIZE
: src_size
;
436 // Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads.
437 // Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer
438 uint8_t * dst_start
= dst
;
439 const uint8_t * src_start
= src
;
440 lz4_encode_2gb(&dst
, dst_size
, &src
, src
, src_to_encode
, hash_table
, src_to_encode
< src_size
);
443 size_t dst_used
= dst
- dst_start
;
444 size_t src_used
= src
- src_start
; // src_used <= src_to_encode
445 if (src_to_encode
== src_size
&& src_used
< src_to_encode
) {
446 return 0; // FAIL to encode last block
448 // Note that there is a potential problem here in case of non compressible data requiring more blocks.
449 // We may end up here with src_used very small, or even 0, and will not be able to make progress during
450 // compression. We FAIL unless the length of literals remaining at the end is small enough.
451 if (src_to_encode
< src_size
&& src_to_encode
- src_used
>= (1 << 16)) {
452 return 0; // FAIL too many literals
454 // Update counters (SRC and DST already have been updated)
455 src_size
-= src_used
;
456 dst_size
-= dst_used
;
459 return (size_t)(dst
- dst_buffer
); // bytes produced
462 typedef uint32_t lz4_uint128
__attribute__((ext_vector_type(4))) __attribute__((__aligned__(1)));
465 lz4_decode(uint8_t ** dst_ptr
,
468 const uint8_t ** src_ptr
,
469 const uint8_t * src_end
)
471 uint8_t * dst
= *dst_ptr
;
472 const uint8_t * src
= *src_ptr
;
474 // Require dst_end > dst.
475 if (dst_end
<= dst
) {
479 while (src
< src_end
) {
480 // Keep last good position
484 uint8_t cmd
= *src
++; // 1 byte encoding literal+(match-4) length: LLLLMMMM
485 uint32_t literalLength
= (cmd
>> 4) & 15; // 0..15
486 uint32_t matchLength
= 4 + (cmd
& 15); // 4..19
488 // extra bytes for literalLength
489 if (__improbable(literalLength
== 15)) {
492 #if DEBUG_LZ4_DECODE_ERRORS
493 if (__improbable(src
>= src_end
)) {
494 printf("Truncated SRC literal length\n");
497 if (__improbable(src
>= src_end
)) {
498 goto IN_FAIL
; // unexpected end of input (1 byte needed)
502 } while (__improbable(s
== 255));
506 #if DEBUG_LZ4_DECODE_ERRORS
507 if (__improbable(literalLength
> (size_t)(src_end
- src
))) {
508 printf("Truncated SRC literal\n");
511 if (__improbable(literalLength
> (size_t)(src_end
- src
))) {
514 if (__improbable(literalLength
> (size_t)(dst_end
- dst
))) {
515 // literal will take us past the end of the destination buffer,
516 // so we can only copy part of it.
517 literalLength
= (uint32_t)(dst_end
- dst
);
518 memcpy(dst
, src
, literalLength
);
519 dst
+= literalLength
;
522 memcpy(dst
, src
, literalLength
);
523 src
+= literalLength
;
524 dst
+= literalLength
;
526 if (__improbable(src
>= src_end
)) {
527 goto OUT_FULL
; // valid end of stream
529 #if DEBUG_LZ4_DECODE_ERRORS
530 if (__improbable(2 > (size_t)(src_end
- src
))) {
531 printf("Truncated SRC distance\n");
534 if (__improbable(2 > (size_t)(src_end
- src
))) {
535 goto IN_FAIL
; // unexpected end of input (2 bytes needed)
537 //DRKTODO: this causes an alignment increase warning (legitimate?)
538 //DRKTODO: cast of char * to uint16_t*
539 #pragma clang diagnostic push
540 #pragma clang diagnostic ignored "-Wcast-align"
543 uint64_t matchDistance
= *(const uint16_t *)src
; // 0x0000 <= matchDistance <= 0xffff
544 #pragma clang diagnostic pop
546 #if DEBUG_LZ4_DECODE_ERRORS
547 if (matchDistance
== 0) {
548 printf("Invalid match distance D = 0\n");
551 if (__improbable(matchDistance
== 0)) {
552 goto IN_FAIL
; // 0x0000 invalid
554 uint8_t * ref
= dst
- matchDistance
;
555 #if DEBUG_LZ4_DECODE_ERRORS
556 if (__improbable(ref
< dst_begin
)) {
557 printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n", matchDistance
, dst_begin
, dst
, dst_end
);
560 if (__improbable(ref
< dst_begin
)) {
561 goto OUT_FAIL
; // out of range
563 // extra bytes for matchLength
564 if (__improbable(matchLength
== 19)) {
567 #if DEBUG_LZ4_DECODE_ERRORS
568 if (__improbable(src
>= src_end
)) {
569 printf("Truncated SRC match length\n");
572 if (__improbable(src
>= src_end
)) {
573 goto IN_FAIL
; // unexpected end of input (1 byte needed)
577 } while (__improbable(s
== 255));
580 // copy match (may overlap)
581 if (__improbable(matchLength
> (size_t)(dst_end
- dst
))) {
582 // match will take us past the end of the destination buffer,
583 // so we can only copy part of it.
584 matchLength
= (uint32_t)(dst_end
- dst
);
585 for (uint32_t i
= 0; i
< matchLength
; ++i
) {
591 for (uint64_t i
= 0; i
< matchLength
; i
++) {
597 // We reached the end of the input buffer after a full instruction
599 // Or we reached the end of the output buffer