]> git.saurik.com Git - apple/xnu.git/blame - osfmk/x86_64/lz4_decode_x86_64.s
xnu-3789.1.32.tar.gz
[apple/xnu.git] / osfmk / x86_64 / lz4_decode_x86_64.s
CommitLineData
39037602
A
1#include <vm/lz4_assembly_select.h>
2#if LZ4_ENABLE_ASSEMBLY_DECODE_X86_64
3
4/*
5
6 int64_t lz4_decode_asm(
7 uint8_t ** dst_ptr, *dst_ptr points to next output byte to write
8 uint8_t * dst_begin, points to first valid output byte we can access, dst_begin <= dst
9 uint8_t * dst_end, "relaxed" end of output buffer (see below)
10 const uint8_t ** src_ptr, *src_ptr points to next input byte to read
11 const uint8_t * src_end) "relaxed" end of input buffer (see below)
12
13 We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant.
14 We never read before dst_begin.
15
16 Return 0 on success, -1 on failure
17 On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of a LZ4 instruction.
18
19*/
20
21#if MSVC_CALLING_CONVENTIONS
22#error TODO implement MSVC calling conventions for LZ4 x86_64 assembly
23#endif
24
25// %rax and %rbx are free to use
26
27#define dst %rdi // arg0
28#define dst_begin %rsi // arg1
29#define dst_end %rdx // arg2
30#define src %rcx // arg3
31#define src_end %r8 // arg4
32
33#define n_literals %r9
34#define n_matches %r10
35
36#define copy_src %r11 // match/literal copy source
37#define copy_dst %r12 // match/literal copy destination
38#define match_distance %r13 // match distance
39
40#define src_good %r14
41#define dst_good %r15
42
43.globl _lz4_decode_asm
44
45.macro establish_frame
46 push %rbp
47 mov %rsp,%rbp
48 push %rbx
49 push %r12
50 push %r13
51 push %r14
52 push %r15
53.endm
54
55.macro clear_frame_and_return
56 pop %r15
57 pop %r14
58 pop %r13
59 pop %r12
60 pop %rbx
61 pop %rbp
62#ifdef __AVX2__
63 vzeroupper
64#endif
65 ret
66.endm
67
68// copy_1x16 SOURCE_ADDR DESTINATION_ADDR
69// Copy 16 bytes, clobber: xmm0
70.macro copy_1x16
71#ifdef __AVX2__
72 vmovdqu ($0),%xmm0
73 vmovdqu %xmm0,($1)
74#else
75 movdqu ($0),%xmm0
76 movdqu %xmm0,($1)
77#endif
78.endm
79
80// copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR
81// Copy 16 bytes, and increment both addresses by 16, clobber: xmm0
82.macro copy_1x16_and_increment
83#ifdef __AVX2__
84 vmovdqu ($0),%xmm0
85 vmovdqu %xmm0,($1)
86#else
87 movdqu ($0),%xmm0
88 movdqu %xmm0,($1)
89#endif
90 add $$16,$0
91 add $$16,$1
92.endm
93
94// copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR
95// Copy 2 times 16 bytes, and increment both addresses by 32, clobber: xmm0
96.macro copy_2x16_and_increment
97#ifdef __AVX2__
98 vmovdqu ($0),%xmm0
99 vmovdqu %xmm0,($1)
100 vmovdqu 16($0),%xmm0
101 vmovdqu %xmm0,16($1)
102#else
103 movdqu ($0),%xmm0
104 movdqu %xmm0,($1)
105 movdqu 16($0),%xmm0
106 movdqu %xmm0,16($1)
107#endif
108 add $$32,$0
109 add $$32,$1
110.endm
111
112// copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR
113// Copy 32 bytes, and increment both addresses by 32, clobber: xmm0,xmm1
114.macro copy_1x32_and_increment
115#ifdef __AVX2__
116 vmovdqu ($0),%ymm0
117 vmovdqu %ymm0,($1)
118#else
119 movdqu ($0),%xmm0
120 movdqu 16($0),%xmm1
121 movdqu %xmm0,($1)
122 movdqu %xmm1,16($1)
123#endif
124 add $$32,$0
125 add $$32,$1
126.endm
127
128.macro check_src_end
129 cmp src,src_end
130 jbe L_done // done if src >= src_end
131.endm
132
133.macro check_dst_end
134 cmp dst,dst_end
135 jbe L_done // done if dst >= dst_end
136.endm
137
138.text
139.p2align 6
140_lz4_decode_asm:
141 establish_frame
142 push dst // keep uint8_t ** dst on stack
143 mov (dst),dst // load current dst from *dst
144 push src // keep const uint8_t ** src on stack
145 mov (src),src // load current src from *src
146
147L_decode_command:
148 // Keep last known good command
149 mov dst,dst_good
150 mov src,src_good
151
152 // Check limits
153 check_src_end
154 check_dst_end
155
156 // Decode command
157 movzb (src),%rax // read command byte LLLLMMMM
158 add $1,src
159 mov %rax,n_literals
160 shr $4,n_literals // n_literals in 0..15
161 mov %rax,n_matches
162 and $0xf,n_matches
163 add $4,n_matches // n_matches in 4..19
164
165 // Short literal?
166 cmp $15,n_literals
167 je L_decode_long_literal
168
169 // Copy literals, n_literals <= 14: copy 16 bytes
170L_copy_short_literal:
171 copy_1x16 src,dst
172 add n_literals,src // src += n_literals
173 add n_literals,dst // dst += n_literals
174 jmp L_expand_match // continue to match
175
176 // the number of literals is encoded on more bytes, we need to decode them
177L_decode_long_literal:
178 check_src_end // required here, since we may loop an arbitrarily high number of times
179 movzb (src),%rax
180 add $1,src
181 add %rax,n_literals
182 cmp $255,%rax
183 je L_decode_long_literal
184
185 // Copy literals, n_literals >= 15
186L_copy_long_literal:
187 mov src,copy_src // literal copy source
188 mov dst,copy_dst // literal copy destination
189 add n_literals,src // update src,dst for next step
190 add n_literals,dst
191 check_src_end // required here, since n_literals can be arbitrarily high
192 check_dst_end
193
194 // fixed + loop
195 copy_1x32_and_increment copy_src,copy_dst
196 copy_1x32_and_increment copy_src,copy_dst
197L_copy_long_literal_loop:
198 copy_1x32_and_increment copy_src,copy_dst
199 cmp copy_dst,dst
200 ja L_copy_long_literal_loop
201 // continue to match
202
203L_expand_match:
204 // Load match distance, and get match copy source
205 movzw (src),match_distance
206 add $2,src
207 test match_distance,match_distance
208 jz L_fail // match_distance == 0: FAIL
209 mov dst,copy_src
210 sub match_distance,copy_src // copy_src = match copy source
211 cmp copy_src,dst_begin
212 ja L_fail // dst_begin > copy_src: FAIL
213
214 // Long n_matches encoding?
215 cmp $19,n_matches
216 je L_decode_long_match // unlikely
217 // Long n_matches with short encoding (17 or 18)?
218 cmp $16,n_matches
219 ja L_long_match // unlikely
220
221 // Copy match, n_matches <= 16
222L_copy_short_match:
223 cmp $16,match_distance
224 jb L_copy_short_match_overlap
225
226 // Copy match, n_matches <= 16 and match_distance >= 16: copy 16 bytes
227 copy_1x16 copy_src,dst
228 add n_matches,dst // update dst
229 jmp L_decode_command // to next command
230
231 // Copy match, n_matches <= 16 and match_distance < 16: replicate pattern
232L_copy_short_match_overlap:
233 lea L_match_permtable(%rip),%rax
234 shl $5,match_distance
235#ifdef __AVX2__
236 vmovdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
237 vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
238 vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0
239 vmovdqu %xmm0,(dst) // and store the result
240#else
241 movdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
242 movdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
243 pshufb %xmm2,%xmm0 // replicate the pattern in xmm0
244 movdqu %xmm0,(dst) // and store the result
245#endif
246 add n_matches,dst // update dst
247 jmp L_decode_command // to next command
248
249 // n_matches == 19: the number of matches in encoded on more bytes, we need to decode them
250L_decode_long_match:
251 mov $255,%rbx
252L_decode_long_match_loop:
253 check_src_end // required here, since we may loop an arbitrarily high number of times
254 mov (src),%rax
255 add $1,src
256 and %rbx,%rax
257 add %rax,n_matches
258 cmp %rbx,%rax
259 je L_decode_long_match_loop
260
261 // n_matches > 16
262L_long_match:
263 mov dst,copy_dst // copy_dst = match copy destination
264 add n_matches,dst // update dst
265 check_dst_end // n_matches may be arbitrarily high
266
267 cmp $16,match_distance
268 jb L_copy_long_match_overlap // match_distance < 16: overlapping copy
269
270 // Copy match, n_matches >= 16, match_distance >= 16
271 // fixed + loop
272 copy_1x16_and_increment copy_src,copy_dst
273L_copy_long_match_loop:
274 copy_2x16_and_increment copy_src,copy_dst
275 cmp copy_dst,dst
276 ja L_copy_long_match_loop
277 jmp L_decode_command // to next command
278
279 // Copy match, n_matches >= 16, match_distance < 16: replicate pattern
280L_copy_long_match_overlap:
281 lea L_match_permtable(%rip),%rax
282 mov match_distance,%rbx
283 shl $5,%rbx
284#ifdef __AVX2__
285 vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
286 vmovdqa %xmm0,%xmm1 // keep a copy for the high bytes
287 vmovdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes
288 vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0
289 vmovdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
290 vpshufb %xmm2,%xmm1,%xmm1 // replicate the pattern in xmm1
291 vinserti128 $1,%xmm1,%ymm0,%ymm0 // store all 32 bytes into a single register
292#else
293 movdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
294 movdqa %xmm0,%xmm1 // keep a copy for the high bytes
295 movdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes
296 pshufb %xmm2,%xmm0 // replicate the pattern in xmm0
297 movdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
298 pshufb %xmm2,%xmm1 // replicate the pattern in xmm1
299#endif
300 // Here, %xmm0:%xmm1 (or %ymm0 for AVX2) is a 32-byte pattern replicating the first match_distance bytes up to 32 bytes
301 lea L_match_disttable(%rip),%rax
302 movzb (%rax,match_distance),%rax // and %rax is now the usable length of this pattern, the largest multiple of match_distance less than or equal to 32.
303
304 // fixed
305#ifdef __AVX2__
306 vmovdqu %ymm0,(copy_dst)
307#else
308 movdqu %xmm0,(copy_dst)
309 movdqu %xmm1,16(copy_dst)
310#endif
311 add %rax,copy_dst
312L_copy_long_match_overlap_loop:
313 // loop
314#ifdef __AVX2__
315 vmovdqu %ymm0,(copy_dst)
316#else
317 movdqu %xmm0,(copy_dst)
318 movdqu %xmm1,16(copy_dst)
319#endif
320 add %rax,copy_dst
321 cmp copy_dst,dst
322 ja L_copy_long_match_overlap
323 jmp L_decode_command // to next command
324
325L_fail:
326 xor %rax,%rax
327 dec %rax // -1
328 jmp L_exit
329
330L_done:
331 xor %rax,%rax
332 // continue to exit
333
334L_exit:
335 pop src
336 mov src_good,(src)
337 pop dst
338 mov dst_good,(dst)
339 clear_frame_and_return
340
341// permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15
342// value(d)[i] = i%d for i = 0..31
343.p2align 6
344L_match_permtable:
345.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0
346.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 1
347.byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 // 2
348.byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 // 3
349.byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 // 4
350.byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 // 5
351.byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 // 6
352.byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 // 7
353.byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 // 8
354.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 // 9
355.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10
356.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11
357.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12
358.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13
359.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14
360.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15
361
362// valid repeating pattern size, for each match_distance = 0 to 15
363// value(d) = 32 - (32%d), is the largest a multiple of d <= 32
364.p2align 6
365L_match_disttable:
366.byte 32,32,32,30 // 0 .. 3
367.byte 16,30,30,28 // 4 .. 7
368.byte 16,27,30,22 // 8 .. 11
369.byte 24,26,28,30 // 12 .. 15
370
371#endif // LZ4_ENABLE_ASSEMBLY_DECODE_X86_64