1 #include <vm/lz4_assembly_select.h>
2 #if LZ4_ENABLE_ASSEMBLY_DECODE_X86_64
6 int64_t lz4_decode_asm(
7 uint8_t ** dst_ptr, *dst_ptr points to next output byte to write
8 uint8_t * dst_begin, points to first valid output byte we can access, dst_begin <= dst
9 uint8_t * dst_end, "relaxed" end of output buffer (see below)
10 const uint8_t ** src_ptr, *src_ptr points to next input byte to read
11 const uint8_t * src_end) "relaxed" end of input buffer (see below)
13 We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant.
14 We never read before dst_begin.
16 Return 0 on success, -1 on failure
17 On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of a LZ4 instruction.
21 #if MSVC_CALLING_CONVENTIONS
22 #error TODO implement MSVC calling conventions for LZ4 x86_64 assembly
25 // %rax and %rbx are free to use
27 #define dst %rdi // arg0
28 #define dst_begin %rsi // arg1
29 #define dst_end %rdx // arg2
30 #define src %rcx // arg3
31 #define src_end %r8 // arg4
33 #define n_literals %r9
34 #define n_matches %r10
36 #define copy_src %r11 // match/literal copy source
37 #define copy_dst %r12 // match/literal copy destination
38 #define match_distance %r13 // match distance
43 .globl _lz4_decode_asm
45 .macro establish_frame
55 .macro clear_frame_and_return
68 // copy_1x16 SOURCE_ADDR DESTINATION_ADDR
69 // Copy 16 bytes, clobber: xmm0
80 // copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR
81 // Copy 16 bytes, and increment both addresses by 16, clobber: xmm0
82 .macro copy_1x16_and_increment
94 // copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR
95 // Copy 2 times 16 bytes, and increment both addresses by 32, clobber: xmm0
96 .macro copy_2x16_and_increment
112 // copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR
113 // Copy 32 bytes, and increment both addresses by 32, clobber: xmm0,xmm1
114 .macro copy_1x32_and_increment
130 jbe L_done // done if src >= src_end
135 jbe L_done // done if dst >= dst_end
142 push dst // keep uint8_t ** dst on stack
143 mov (dst),dst // load current dst from *dst
144 push src // keep const uint8_t ** src on stack
145 mov (src),src // load current src from *src
148 // Keep last known good command
157 movzb (src),%rax // read command byte LLLLMMMM
160 shr $4,n_literals // n_literals in 0..15
163 add $4,n_matches // n_matches in 4..19
167 je L_decode_long_literal
169 // Copy literals, n_literals <= 14: copy 16 bytes
170 L_copy_short_literal:
172 add n_literals,src // src += n_literals
173 add n_literals,dst // dst += n_literals
174 jmp L_expand_match // continue to match
176 // the number of literals is encoded on more bytes, we need to decode them
177 L_decode_long_literal:
178 check_src_end // required here, since we may loop an arbitrarily high number of times
183 je L_decode_long_literal
185 // Copy literals, n_literals >= 15
187 mov src,copy_src // literal copy source
188 mov dst,copy_dst // literal copy destination
189 add n_literals,src // update src,dst for next step
191 check_src_end // required here, since n_literals can be arbitrarily high
195 copy_1x32_and_increment copy_src,copy_dst
196 copy_1x32_and_increment copy_src,copy_dst
197 L_copy_long_literal_loop:
198 copy_1x32_and_increment copy_src,copy_dst
200 ja L_copy_long_literal_loop
204 // Load match distance, and get match copy source
205 movzw (src),match_distance
207 test match_distance,match_distance
208 jz L_fail // match_distance == 0: FAIL
210 sub match_distance,copy_src // copy_src = match copy source
211 cmp copy_src,dst_begin
212 ja L_fail // dst_begin > copy_src: FAIL
214 // Long n_matches encoding?
216 je L_decode_long_match // unlikely
217 // Long n_matches with short encoding (17 or 18)?
219 ja L_long_match // unlikely
221 // Copy match, n_matches <= 16
223 cmp $16,match_distance
224 jb L_copy_short_match_overlap
226 // Copy match, n_matches <= 16 and match_distance >= 16: copy 16 bytes
227 copy_1x16 copy_src,dst
228 add n_matches,dst // update dst
229 jmp L_decode_command // to next command
231 // Copy match, n_matches <= 16 and match_distance < 16: replicate pattern
232 L_copy_short_match_overlap:
233 lea L_match_permtable(%rip),%rax
234 shl $5,match_distance
236 vmovdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
237 vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
238 vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0
239 vmovdqu %xmm0,(dst) // and store the result
241 movdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
242 movdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
243 pshufb %xmm2,%xmm0 // replicate the pattern in xmm0
244 movdqu %xmm0,(dst) // and store the result
246 add n_matches,dst // update dst
247 jmp L_decode_command // to next command
249 // n_matches == 19: the number of matches in encoded on more bytes, we need to decode them
252 L_decode_long_match_loop:
253 check_src_end // required here, since we may loop an arbitrarily high number of times
259 je L_decode_long_match_loop
263 mov dst,copy_dst // copy_dst = match copy destination
264 add n_matches,dst // update dst
265 check_dst_end // n_matches may be arbitrarily high
267 cmp $16,match_distance
268 jb L_copy_long_match_overlap // match_distance < 16: overlapping copy
270 // Copy match, n_matches >= 16, match_distance >= 16
272 copy_1x16_and_increment copy_src,copy_dst
273 L_copy_long_match_loop:
274 copy_2x16_and_increment copy_src,copy_dst
276 ja L_copy_long_match_loop
277 jmp L_decode_command // to next command
279 // Copy match, n_matches >= 16, match_distance < 16: replicate pattern
280 L_copy_long_match_overlap:
281 lea L_match_permtable(%rip),%rax
282 mov match_distance,%rbx
285 vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
286 vmovdqa %xmm0,%xmm1 // keep a copy for the high bytes
287 vmovdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes
288 vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0
289 vmovdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
290 vpshufb %xmm2,%xmm1,%xmm1 // replicate the pattern in xmm1
291 vinserti128 $1,%xmm1,%ymm0,%ymm0 // store all 32 bytes into a single register
293 movdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
294 movdqa %xmm0,%xmm1 // keep a copy for the high bytes
295 movdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes
296 pshufb %xmm2,%xmm0 // replicate the pattern in xmm0
297 movdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
298 pshufb %xmm2,%xmm1 // replicate the pattern in xmm1
300 // Here, %xmm0:%xmm1 (or %ymm0 for AVX2) is a 32-byte pattern replicating the first match_distance bytes up to 32 bytes
301 lea L_match_disttable(%rip),%rax
302 movzb (%rax,match_distance),%rax // and %rax is now the usable length of this pattern, the largest multiple of match_distance less than or equal to 32.
306 vmovdqu %ymm0,(copy_dst)
308 movdqu %xmm0,(copy_dst)
309 movdqu %xmm1,16(copy_dst)
312 L_copy_long_match_overlap_loop:
315 vmovdqu %ymm0,(copy_dst)
317 movdqu %xmm0,(copy_dst)
318 movdqu %xmm1,16(copy_dst)
322 ja L_copy_long_match_overlap
323 jmp L_decode_command // to next command
339 clear_frame_and_return
341 // permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15
342 // value(d)[i] = i%d for i = 0..31
345 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0
346 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 1
347 .byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 // 2
348 .byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 // 3
349 .byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 // 4
350 .byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 // 5
351 .byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 // 6
352 .byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 // 7
353 .byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 // 8
354 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 // 9
355 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10
356 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11
357 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12
358 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13
359 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14
360 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15
362 // valid repeating pattern size, for each match_distance = 0 to 15
363 // value(d) = 32 - (32%d), is the largest a multiple of d <= 32
366 .byte 32,32,32,30 // 0 .. 3
367 .byte 16,30,30,28 // 4 .. 7
368 .byte 16,27,30,22 // 8 .. 11
369 .byte 24,26,28,30 // 12 .. 15
371 #endif // LZ4_ENABLE_ASSEMBLY_DECODE_X86_64