#include <vm/lz4_assembly_select.h>
#if LZ4_ENABLE_ASSEMBLY_DECODE_X86_64

/*

int64_t lz4_decode_asm(
  uint8_t ** dst_ptr,       *dst_ptr points to next output byte to write
  uint8_t * dst_begin,      points to first valid output byte we can access, dst_begin <= dst
  uint8_t * dst_end,        "relaxed" end of output buffer (see below)
  const uint8_t ** src_ptr, *src_ptr points to next input byte to read
  const uint8_t * src_end)  "relaxed" end of input buffer (see below)

We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant.
We never read before dst_begin.

Return 0 on success, -1 on failure
On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of an LZ4 instruction.

*/
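
/*
 Illustrative only: a minimal C caller sketch, assuming the prototype above.
 The buffer names and sizes (out, out_size, in, in_size) are hypothetical; per the
 "relaxed" end semantics above, the caller must tolerate accesses a small fixed
 distance past dst_end/src_end.

   uint8_t *dst = out;                  // next output byte to write
   const uint8_t *src = in;             // next input byte to read
   int64_t status = lz4_decode_asm(&dst, out, out + out_size,
                                   &src, in + in_size);
   // status == 0: success, status == -1: failure.
   // dst and src now point at the beginning of the last LZ4 command reached
   // in each buffer.
*/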

#if MSVC_CALLING_CONVENTIONS
#error TODO implement MSVC calling conventions for LZ4 x86_64 assembly
#endif

// %rax and %rbx are free to use

#define dst %rdi            // arg0
#define dst_begin %rsi      // arg1
#define dst_end %rdx        // arg2
#define src %rcx            // arg3
#define src_end %r8         // arg4

#define n_literals %r9
#define n_matches %r10

#define copy_src %r11       // match/literal copy source
#define copy_dst %r12       // match/literal copy destination
#define match_distance %r13 // match distance

#define src_good %r14
#define dst_good %r15

.globl _lz4_decode_asm

.macro establish_frame
push %rbp
mov %rsp,%rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
.endm

.macro clear_frame_and_return
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
#ifdef __AVX2__
vzeroupper
#endif
ret
.endm

// copy_1x16 SOURCE_ADDR DESTINATION_ADDR
// Copy 16 bytes, clobber: xmm0
.macro copy_1x16
#ifdef __AVX2__
vmovdqu ($0),%xmm0
vmovdqu %xmm0,($1)
#else
movdqu ($0),%xmm0
movdqu %xmm0,($1)
#endif
.endm

// copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR
// Copy 16 bytes, and increment both addresses by 16, clobber: xmm0
.macro copy_1x16_and_increment
#ifdef __AVX2__
vmovdqu ($0),%xmm0
vmovdqu %xmm0,($1)
#else
movdqu ($0),%xmm0
movdqu %xmm0,($1)
#endif
add $$16,$0
add $$16,$1
.endm

// copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR
// Copy 2 times 16 bytes, and increment both addresses by 32, clobber: xmm0
.macro copy_2x16_and_increment
#ifdef __AVX2__
vmovdqu ($0),%xmm0
vmovdqu %xmm0,($1)
vmovdqu 16($0),%xmm0
vmovdqu %xmm0,16($1)
#else
movdqu ($0),%xmm0
movdqu %xmm0,($1)
movdqu 16($0),%xmm0
movdqu %xmm0,16($1)
#endif
add $$32,$0
add $$32,$1
.endm

// copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR
// Copy 32 bytes, and increment both addresses by 32, clobber: xmm0,xmm1
.macro copy_1x32_and_increment
#ifdef __AVX2__
vmovdqu ($0),%ymm0
vmovdqu %ymm0,($1)
#else
movdqu ($0),%xmm0
movdqu 16($0),%xmm1
movdqu %xmm0,($1)
movdqu %xmm1,16($1)
#endif
add $$32,$0
add $$32,$1
.endm

.macro check_src_end
cmp src,src_end
jbe L_done // done if src >= src_end
.endm

.macro check_dst_end
cmp dst,dst_end
jbe L_done // done if dst >= dst_end
.endm

.text
.p2align 6
_lz4_decode_asm:
establish_frame
push dst // keep uint8_t ** dst on stack
mov (dst),dst // load current dst from *dst
push src // keep const uint8_t ** src on stack
mov (src),src // load current src from *src

L_decode_command:
// Keep last known good command
mov dst,dst_good
mov src,src_good

// Check limits
check_src_end
check_dst_end

// Decode command
movzb (src),%rax // read command byte LLLLMMMM
add $1,src
mov %rax,n_literals
shr $4,n_literals // n_literals in 0..15
mov %rax,n_matches
and $0xf,n_matches
add $4,n_matches // n_matches in 4..19
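// Example (illustrative): command byte 0x52 gives n_literals = 0x5 = 5
// and n_matches = 0x2 + 4 = 6.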

// Short literal?
cmp $15,n_literals
je L_decode_long_literal

// Copy literals, n_literals <= 14: copy 16 bytes
L_copy_short_literal:
copy_1x16 src,dst
add n_literals,src // src += n_literals
add n_literals,dst // dst += n_literals
jmp L_expand_match // continue to match

// the number of literals is encoded on more bytes, we need to decode them
L_decode_long_literal:
check_src_end // required here, since we may loop an arbitrarily high number of times
movzb (src),%rax
add $1,src
add %rax,n_literals
cmp $255,%rax
je L_decode_long_literal
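// Example (illustrative): a literal nibble of 15 followed by extra bytes 255, 255, 10
// gives n_literals = 15 + 255 + 255 + 10 = 535; decoding stops at the first extra byte below 255.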

// Copy literals, n_literals >= 15
L_copy_long_literal:
mov src,copy_src // literal copy source
mov dst,copy_dst // literal copy destination
add n_literals,src // update src,dst for next step
add n_literals,dst
check_src_end // required here, since n_literals can be arbitrarily high
check_dst_end

// fixed + loop
copy_1x32_and_increment copy_src,copy_dst
copy_1x32_and_increment copy_src,copy_dst
L_copy_long_literal_loop:
copy_1x32_and_increment copy_src,copy_dst
cmp copy_dst,dst
ja L_copy_long_literal_loop
// continue to match

L_expand_match:
// Load match distance, and get match copy source
movzw (src),match_distance
add $2,src
test match_distance,match_distance
jz L_fail // match_distance == 0: FAIL
mov dst,copy_src
sub match_distance,copy_src // copy_src = match copy source
cmp copy_src,dst_begin
ja L_fail // dst_begin > copy_src: FAIL

// Long n_matches encoding?
cmp $19,n_matches
je L_decode_long_match // unlikely
// Long n_matches with short encoding (17 or 18)?
cmp $16,n_matches
ja L_long_match // unlikely

// Copy match, n_matches <= 16
L_copy_short_match:
cmp $16,match_distance
jb L_copy_short_match_overlap

// Copy match, n_matches <= 16 and match_distance >= 16: copy 16 bytes
copy_1x16 copy_src,dst
add n_matches,dst // update dst
jmp L_decode_command // to next command

// Copy match, n_matches <= 16 and match_distance < 16: replicate pattern
L_copy_short_match_overlap:
lea L_match_permtable(%rip),%rax
shl $5,match_distance
#ifdef __AVX2__
vmovdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0
vmovdqu %xmm0,(dst) // and store the result
#else
movdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
movdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
pshufb %xmm2,%xmm0 // replicate the pattern in xmm0
movdqu %xmm0,(dst) // and store the result
#endif
add n_matches,dst // update dst
jmp L_decode_command // to next command
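// Example (illustrative): match_distance = 3 selects permtable row 3 (0,1,2,0,1,2,...),
// so the pshufb above expands the 3 bytes at copy_src into a 16-byte repetition of that pattern.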

// n_matches == 19: the number of matches is encoded on more bytes, we need to decode them
L_decode_long_match:
mov $255,%rbx
L_decode_long_match_loop:
check_src_end // required here, since we may loop an arbitrarily high number of times
mov (src),%rax
add $1,src
and %rbx,%rax
add %rax,n_matches
cmp %rbx,%rax
je L_decode_long_match_loop
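// Example (illustrative): with extra bytes 255, 3 this gives n_matches = 19 + 255 + 3 = 277.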

// n_matches > 16
L_long_match:
mov dst,copy_dst // copy_dst = match copy destination
add n_matches,dst // update dst
check_dst_end // n_matches may be arbitrarily high

cmp $16,match_distance
jb L_copy_long_match_overlap // match_distance < 16: overlapping copy

// Copy match, n_matches >= 16, match_distance >= 16
// fixed + loop
copy_1x16_and_increment copy_src,copy_dst
L_copy_long_match_loop:
copy_2x16_and_increment copy_src,copy_dst
cmp copy_dst,dst
ja L_copy_long_match_loop
jmp L_decode_command // to next command

// Copy match, n_matches >= 16, match_distance < 16: replicate pattern
L_copy_long_match_overlap:
lea L_match_permtable(%rip),%rax
mov match_distance,%rbx
shl $5,%rbx
#ifdef __AVX2__
vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
vmovdqa %xmm0,%xmm1 // keep a copy for the high bytes
vmovdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes
vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0
vmovdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
vpshufb %xmm2,%xmm1,%xmm1 // replicate the pattern in xmm1
vinserti128 $1,%xmm1,%ymm0,%ymm0 // store all 32 bytes into a single register
#else
movdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
movdqa %xmm0,%xmm1 // keep a copy for the high bytes
movdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes
pshufb %xmm2,%xmm0 // replicate the pattern in xmm0
movdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
pshufb %xmm2,%xmm1 // replicate the pattern in xmm1
#endif
// Here, %xmm0:%xmm1 (or %ymm0 for AVX2) is a 32-byte pattern replicating the first match_distance bytes up to 32 bytes
lea L_match_disttable(%rip),%rax
movzb (%rax,match_distance),%rax // and %rax is now the usable length of this pattern: a multiple of match_distance no larger than 32, taken from L_match_disttable.
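// Example (illustrative): match_distance = 5 gives a usable pattern length of 30
// (L_match_disttable[5]), so each 32-byte store below advances copy_dst by 30 bytes,
// keeping the stores phase-aligned with the repeating pattern.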

// fixed
#ifdef __AVX2__
vmovdqu %ymm0,(copy_dst)
#else
movdqu %xmm0,(copy_dst)
movdqu %xmm1,16(copy_dst)
#endif
add %rax,copy_dst
L_copy_long_match_overlap_loop:
// loop
#ifdef __AVX2__
vmovdqu %ymm0,(copy_dst)
#else
movdqu %xmm0,(copy_dst)
movdqu %xmm1,16(copy_dst)
#endif
add %rax,copy_dst
cmp copy_dst,dst
ja L_copy_long_match_overlap_loop
jmp L_decode_command // to next command

L_fail:
xor %rax,%rax
dec %rax // -1
jmp L_exit

L_done:
xor %rax,%rax
// continue to exit

L_exit:
pop src
mov src_good,(src)
pop dst
mov dst_good,(dst)
clear_frame_and_return

// permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15
// value(d)[i] = i%d for i = 0..31
.p2align 6
L_match_permtable:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 1
.byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 // 2
.byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 // 3
.byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 // 4
.byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 // 5
.byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 // 6
.byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 // 7
.byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 // 8
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 // 9
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15

// valid repeating pattern size, for each match_distance = 0 to 15
// value(d) = 32 - (32%d), the largest multiple of d <= 32 (except d = 4 and d = 8, which use 16, a smaller multiple of d that also keeps the pattern aligned)
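// e.g. value(5) = 32 - (32 % 5) = 30, value(7) = 32 - (32 % 7) = 28, value(12) = 32 - (32 % 12) = 24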
.p2align 6
L_match_disttable:
.byte 32,32,32,30 // 0 .. 3
.byte 16,30,30,28 // 4 .. 7
.byte 16,27,30,22 // 8 .. 11
.byte 24,26,28,30 // 12 .. 15

#endif // LZ4_ENABLE_ASSEMBLY_DECODE_X86_64