2 * Copyright (c) 2017 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 #include <vm/lz4_assembly_select.h>
29 #if LZ4_ENABLE_ASSEMBLY_DECODE_X86_64
33 int64_t lz4_decode_asm(
34 uint8_t ** dst_ptr, *dst_ptr points to next output byte to write
35 uint8_t * dst_begin, points to first valid output byte we can access, dst_begin <= dst
36 uint8_t * dst_end, "relaxed" end of output buffer (see below)
37 const uint8_t ** src_ptr, *src_ptr points to next input byte to read
38 const uint8_t * src_end) "relaxed" end of input buffer (see below)
40 We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant.
41 We never read before dst_begin.
43 Return 0 on success, -1 on failure
44 On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of a LZ4 instruction.
48 #if MSVC_CALLING_CONVENTIONS
49 #error TODO implement MSVC calling conventions for LZ4 x86_64 assembly
52 // %rax and %rbx are free to use
54 #define dst %rdi // arg0
55 #define dst_begin %rsi // arg1
56 #define dst_end %rdx // arg2
57 #define src %rcx // arg3
58 #define src_end %r8 // arg4
60 #define n_literals %r9
61 #define n_matches %r10
63 #define copy_src %r11 // match/literal copy source
64 #define copy_dst %r12 // match/literal copy destination
65 #define match_distance %r13 // match distance
70 .globl _lz4_decode_asm
72 .macro establish_frame
82 .macro clear_frame_and_return
95 // copy_1x16 SOURCE_ADDR DESTINATION_ADDR
96 // Copy 16 bytes, clobber: xmm0
107 // copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR
108 // Copy 16 bytes, and increment both addresses by 16, clobber: xmm0
109 .macro copy_1x16_and_increment
121 // copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR
122 // Copy 2 times 16 bytes, and increment both addresses by 32, clobber: xmm0
123 .macro copy_2x16_and_increment
139 // copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR
140 // Copy 32 bytes, and increment both addresses by 32, clobber: xmm0,xmm1
141 .macro copy_1x32_and_increment
157 jbe L_done // done if src >= src_end
162 jbe L_done // done if dst >= dst_end
169 push dst // keep uint8_t ** dst on stack
170 mov (dst),dst // load current dst from *dst
171 push src // keep const uint8_t ** src on stack
172 mov (src),src // load current src from *src
175 // Keep last known good command
184 movzb (src),%rax // read command byte LLLLMMMM
187 shr $4,n_literals // n_literals in 0..15
190 add $4,n_matches // n_matches in 4..19
194 je L_decode_long_literal
196 // Copy literals, n_literals <= 14: copy 16 bytes
197 L_copy_short_literal:
199 add n_literals,src // src += n_literals
200 add n_literals,dst // dst += n_literals
201 jmp L_expand_match // continue to match
203 // the number of literals is encoded on more bytes, we need to decode them
204 L_decode_long_literal:
205 check_src_end // required here, since we may loop an arbitrarily high number of times
210 je L_decode_long_literal
212 // Copy literals, n_literals >= 15
214 mov src,copy_src // literal copy source
215 mov dst,copy_dst // literal copy destination
216 add n_literals,src // update src,dst for next step
218 check_src_end // required here, since n_literals can be arbitrarily high
222 copy_1x32_and_increment copy_src,copy_dst
223 copy_1x32_and_increment copy_src,copy_dst
224 L_copy_long_literal_loop:
225 copy_1x32_and_increment copy_src,copy_dst
227 ja L_copy_long_literal_loop
231 // Load match distance, and get match copy source
232 movzw (src),match_distance
234 test match_distance,match_distance
235 jz L_fail // match_distance == 0: FAIL
237 sub match_distance,copy_src // copy_src = match copy source
238 cmp copy_src,dst_begin
239 ja L_fail // dst_begin > copy_src: FAIL
241 // Long n_matches encoding?
243 je L_decode_long_match // unlikely
244 // Long n_matches with short encoding (17 or 18)?
246 ja L_long_match // unlikely
248 // Copy match, n_matches <= 16
250 cmp $16,match_distance
251 jb L_copy_short_match_overlap
253 // Copy match, n_matches <= 16 and match_distance >= 16: copy 16 bytes
254 copy_1x16 copy_src,dst
255 add n_matches,dst // update dst
256 jmp L_decode_command // to next command
258 // Copy match, n_matches <= 16 and match_distance < 16: replicate pattern
259 L_copy_short_match_overlap:
260 lea L_match_permtable(%rip),%rax
261 shl $5,match_distance
263 vmovdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
264 vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
265 vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0
266 vmovdqu %xmm0,(dst) // and store the result
268 movdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
269 movdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
270 pshufb %xmm2,%xmm0 // replicate the pattern in xmm0
271 movdqu %xmm0,(dst) // and store the result
273 add n_matches,dst // update dst
274 jmp L_decode_command // to next command
276 // n_matches == 19: the number of matches in encoded on more bytes, we need to decode them
279 L_decode_long_match_loop:
280 check_src_end // required here, since we may loop an arbitrarily high number of times
286 je L_decode_long_match_loop
290 mov dst,copy_dst // copy_dst = match copy destination
291 add n_matches,dst // update dst
292 check_dst_end // n_matches may be arbitrarily high
294 cmp $16,match_distance
295 jb L_copy_long_match_overlap // match_distance < 16: overlapping copy
297 // Copy match, n_matches >= 16, match_distance >= 16
299 copy_1x16_and_increment copy_src,copy_dst
300 L_copy_long_match_loop:
301 copy_2x16_and_increment copy_src,copy_dst
303 ja L_copy_long_match_loop
304 jmp L_decode_command // to next command
306 // Copy match, n_matches >= 16, match_distance < 16: replicate pattern
307 L_copy_long_match_overlap:
308 lea L_match_permtable(%rip),%rax
309 mov match_distance,%rbx
312 vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
313 vmovdqa %xmm0,%xmm1 // keep a copy for the high bytes
314 vmovdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes
315 vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0
316 vmovdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
317 vpshufb %xmm2,%xmm1,%xmm1 // replicate the pattern in xmm1
318 vinserti128 $1,%xmm1,%ymm0,%ymm0 // store all 32 bytes into a single register
320 movdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
321 movdqa %xmm0,%xmm1 // keep a copy for the high bytes
322 movdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes
323 pshufb %xmm2,%xmm0 // replicate the pattern in xmm0
324 movdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
325 pshufb %xmm2,%xmm1 // replicate the pattern in xmm1
327 // Here, %xmm0:%xmm1 (or %ymm0 for AVX2) is a 32-byte pattern replicating the first match_distance bytes up to 32 bytes
328 lea L_match_disttable(%rip),%rax
329 movzb (%rax,match_distance),%rax // and %rax is now the usable length of this pattern, the largest multiple of match_distance less than or equal to 32.
333 vmovdqu %ymm0,(copy_dst)
335 movdqu %xmm0,(copy_dst)
336 movdqu %xmm1,16(copy_dst)
339 L_copy_long_match_overlap_loop:
342 vmovdqu %ymm0,(copy_dst)
344 movdqu %xmm0,(copy_dst)
345 movdqu %xmm1,16(copy_dst)
349 ja L_copy_long_match_overlap
350 jmp L_decode_command // to next command
366 clear_frame_and_return
368 // permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15
369 // value(d)[i] = i%d for i = 0..31
372 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0
373 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 1
374 .byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 // 2
375 .byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 // 3
376 .byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 // 4
377 .byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 // 5
378 .byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 // 6
379 .byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 // 7
380 .byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 // 8
381 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 // 9
382 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10
383 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11
384 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12
385 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13
386 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14
387 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15
389 // valid repeating pattern size, for each match_distance = 0 to 15
390 // value(d) = 32 - (32%d), is the largest a multiple of d <= 32
393 .byte 32,32,32,30 // 0 .. 3
394 .byte 16,30,30,28 // 4 .. 7
395 .byte 16,27,30,22 // 8 .. 11
396 .byte 24,26,28,30 // 12 .. 15
398 #endif // LZ4_ENABLE_ASSEMBLY_DECODE_X86_64