/*
 * Copyright (c) 2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <vm/lz4_assembly_select.h>
#if LZ4_ENABLE_ASSEMBLY_DECODE_X86_64

/*

int64_t lz4_decode_asm(
    uint8_t ** dst_ptr,         *dst_ptr points to next output byte to write
    uint8_t * dst_begin,        points to first valid output byte we can access, dst_begin <= dst
    uint8_t * dst_end,          "relaxed" end of output buffer (see below)
    const uint8_t ** src_ptr,   *src_ptr points to next input byte to read
    const uint8_t * src_end)    "relaxed" end of input buffer (see below)

We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant.
We never read before dst_begin.

Return 0 on success, -1 on failure.
On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of an LZ4 instruction.

*/
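
/*
 Illustrative usage (editor's addition, not part of the original sources): a
 minimal sketch of how a caller might drive this routine, derived only from
 the contract above. Buffer names and the headroom constant are hypothetical;
 in xnu the real driver is the C code in osfmk/vm/lz4.c.

     #define SLOP 64   // hypothetical margin kept past the "relaxed" ends

     uint8_t *dst = out_buf;        // next output byte to write
     const uint8_t *src = in_buf;   // next input byte to read
     int64_t rc = lz4_decode_asm(&dst, out_buf, out_buf + out_size - SLOP,
                                 &src, in_buf + in_size - SLOP);
     // rc == 0: stopped cleanly at a "relaxed" end
     // rc == -1: invalid stream (zero or out-of-range match distance)
     // Either way, dst/src point at the start of the last fully decoded
     // LZ4 command, so a scalar decoder can resume from there.
*/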

#if MSVC_CALLING_CONVENTIONS
#error TODO implement MSVC calling conventions for LZ4 x86_64 assembly
#endif

// %rax and %rbx are free to use

#define dst             %rdi    // arg0
#define dst_begin       %rsi    // arg1
#define dst_end         %rdx    // arg2
#define src             %rcx    // arg3
#define src_end         %r8     // arg4

#define n_literals      %r9
#define n_matches       %r10

#define copy_src        %r11    // match/literal copy source
#define copy_dst        %r12    // match/literal copy destination
#define match_distance  %r13    // match distance

#define src_good        %r14
#define dst_good        %r15

.globl _lz4_decode_asm

.macro establish_frame
    push    %rbp
    mov     %rsp,%rbp
    push    %rbx
    push    %r12
    push    %r13
    push    %r14
    push    %r15
.endm
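
// (establish_frame/clear_frame_and_return save and restore %rbx, %rbp and
// %r12-%r15, the callee-saved registers of the System V AMD64 ABI; every
// other register used in this file is caller-saved)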

.macro clear_frame_and_return
    pop     %r15
    pop     %r14
    pop     %r13
    pop     %r12
    pop     %rbx
    pop     %rbp
#ifdef __AVX2__
    vzeroupper
#endif
    ret
.endm
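
// (vzeroupper clears the upper halves of the %ymm registers before returning,
// avoiding the penalty for mixing VEX-encoded and legacy SSE code in the caller)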

// copy_1x16 SOURCE_ADDR DESTINATION_ADDR
// Copy 16 bytes, clobber: xmm0
.macro copy_1x16
#ifdef __AVX2__
    vmovdqu ($0),%xmm0
    vmovdqu %xmm0,($1)
#else
    movdqu  ($0),%xmm0
    movdqu  %xmm0,($1)
#endif
.endm

// copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR
// Copy 16 bytes, and increment both addresses by 16, clobber: xmm0
.macro copy_1x16_and_increment
#ifdef __AVX2__
    vmovdqu ($0),%xmm0
    vmovdqu %xmm0,($1)
#else
    movdqu  ($0),%xmm0
    movdqu  %xmm0,($1)
#endif
    add     $$16,$0
    add     $$16,$1
.endm

// copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR
// Copy 2 times 16 bytes, and increment both addresses by 32, clobber: xmm0
.macro copy_2x16_and_increment
#ifdef __AVX2__
    vmovdqu ($0),%xmm0
    vmovdqu %xmm0,($1)
    vmovdqu 16($0),%xmm0
    vmovdqu %xmm0,16($1)
#else
    movdqu  ($0),%xmm0
    movdqu  %xmm0,($1)
    movdqu  16($0),%xmm0
    movdqu  %xmm0,16($1)
#endif
    add     $$32,$0
    add     $$32,$1
.endm

// copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR
// Copy 32 bytes, and increment both addresses by 32, clobber: xmm0,xmm1
.macro copy_1x32_and_increment
#ifdef __AVX2__
    vmovdqu ($0),%ymm0
    vmovdqu %ymm0,($1)
#else
    movdqu  ($0),%xmm0
    movdqu  16($0),%xmm1
    movdqu  %xmm0,($1)
    movdqu  %xmm1,16($1)
#endif
    add     $$32,$0
    add     $$32,$1
.endm

.macro check_src_end
    cmp     src,src_end
    jbe     L_done              // done if src >= src_end
.endm

.macro check_dst_end
    cmp     dst,dst_end
    jbe     L_done              // done if dst >= dst_end
.endm

.text
.p2align 6
_lz4_decode_asm:
    establish_frame
    push    dst                 // keep uint8_t ** dst on stack
    mov     (dst),dst           // load current dst from *dst
    push    src                 // keep const uint8_t ** src on stack
    mov     (src),src           // load current src from *src

L_decode_command:
    // Keep last known good command
    mov     dst,dst_good
    mov     src,src_good

    // Check limits
    check_src_end
    check_dst_end

    // Decode command
    movzb   (src),%rax          // read command byte LLLLMMMM
    add     $1,src
    mov     %rax,n_literals
    shr     $4,n_literals       // n_literals in 0..15
    mov     %rax,n_matches
    and     $0xf,n_matches
    add     $4,n_matches        // n_matches in 4..19
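    // e.g. command byte 0x52 yields n_literals = 5 and n_matches = 2 + 4 = 6;
    // the escape values 15 (literals) and 19 (matches) select the long
    // encodings decoded below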

    // Short literal?
    cmp     $15,n_literals
    je      L_decode_long_literal

// Copy literals, n_literals <= 14: copy 16 bytes in one shot. The over-copy
// past the end of the literal run is safe: the "relaxed" src_end/dst_end
// checked above leave headroom beyond both buffers.
L_copy_short_literal:
    copy_1x16 src,dst
    add     n_literals,src      // src += n_literals
    add     n_literals,dst      // dst += n_literals
    jmp     L_expand_match      // continue to match

// the number of literals is encoded in additional bytes; decode them here
L_decode_long_literal:
    check_src_end               // required here, since we may loop an arbitrarily high number of times
    movzb   (src),%rax
    add     $1,src
    add     %rax,n_literals
    cmp     $255,%rax
    je      L_decode_long_literal
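    // e.g. a run of 300 literals is encoded as 15 in the command byte plus
    // extension bytes 255 and 30 (15 + 255 + 30 = 300); decoding stops at the
    // first extension byte != 255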

// Copy literals, n_literals >= 15
L_copy_long_literal:
    mov     src,copy_src        // literal copy source
    mov     dst,copy_dst        // literal copy destination
    add     n_literals,src      // update src,dst for next step
    add     n_literals,dst
    check_src_end               // required here, since n_literals can be arbitrarily high
    check_dst_end

    // fixed + loop
    copy_1x32_and_increment copy_src,copy_dst
    copy_1x32_and_increment copy_src,copy_dst
L_copy_long_literal_loop:
    copy_1x32_and_increment copy_src,copy_dst
    cmp     copy_dst,dst
    ja      L_copy_long_literal_loop
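    // (the copy runs in whole 32-byte chunks and may overshoot dst; the
    // overshoot is bounded and covered by the headroom the caller guarantees
    // past the "relaxed" dst_end)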
    // continue to match

L_expand_match:
    // Load match distance, and get match copy source
    movzw   (src),match_distance
    add     $2,src
    test    match_distance,match_distance
    jz      L_fail              // match_distance == 0: FAIL
    mov     dst,copy_src
    sub     match_distance,copy_src // copy_src = match copy source
    cmp     copy_src,dst_begin
    ja      L_fail              // dst_begin > copy_src: FAIL

    // Long n_matches encoding?
    cmp     $19,n_matches
    je      L_decode_long_match // unlikely
    // Long n_matches with short encoding (17 or 18)?
    cmp     $16,n_matches
    ja      L_long_match        // unlikely

// Copy match, n_matches <= 16
L_copy_short_match:
    cmp     $16,match_distance
    jb      L_copy_short_match_overlap

    // Copy match, n_matches <= 16 and match_distance >= 16: copy 16 bytes
    copy_1x16 copy_src,dst
    add     n_matches,dst       // update dst
    jmp     L_decode_command    // to next command

// Copy match, n_matches <= 16 and match_distance < 16: replicate pattern
L_copy_short_match_overlap:
    lea     L_match_permtable(%rip),%rax
    shl     $5,match_distance
#ifdef __AVX2__
    vmovdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
    vmovdqu (copy_src),%xmm0    // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
    vpshufb %xmm2,%xmm0,%xmm0   // replicate the pattern in xmm0
    vmovdqu %xmm0,(dst)         // and store the result
#else
    movdqa  (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
    movdqu  (copy_src),%xmm0    // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
    pshufb  %xmm2,%xmm0         // replicate the pattern in xmm0
    movdqu  %xmm0,(dst)         // and store the result
#endif
    add     n_matches,dst       // update dst
    jmp     L_decode_command    // to next command
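    // worked example of the replication above: match_distance = 3 with source
    // bytes "abc" uses permutation row 3 (0,1,2,0,1,2,...), so pshufb expands
    // %xmm0 to the repeating pattern "abcabcabcabcabca"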

// n_matches == 19: the number of matches is encoded in additional bytes; decode them here
L_decode_long_match:
    mov     $255,%rbx
L_decode_long_match_loop:
    check_src_end               // required here, since we may loop an arbitrarily high number of times
    mov     (src),%rax
    add     $1,src
    and     %rbx,%rax           // keep only the low byte of the 8-byte load
    add     %rax,n_matches
    cmp     %rbx,%rax
    je      L_decode_long_match_loop
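    // (same 255-continuation scheme as the long literal decode: on exit,
    // n_matches = 19 + sum of the extension bytes)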

// n_matches > 16
L_long_match:
    mov     dst,copy_dst        // copy_dst = match copy destination
    add     n_matches,dst       // update dst
    check_dst_end               // n_matches may be arbitrarily high

    cmp     $16,match_distance
    jb      L_copy_long_match_overlap // match_distance < 16: overlapping copy

    // Copy match, n_matches >= 16, match_distance >= 16
    // fixed + loop
    copy_1x16_and_increment copy_src,copy_dst
L_copy_long_match_loop:
    copy_2x16_and_increment copy_src,copy_dst
    cmp     copy_dst,dst
    ja      L_copy_long_match_loop
    jmp     L_decode_command    // to next command

// Copy match, n_matches >= 16, match_distance < 16: replicate pattern
L_copy_long_match_overlap:
    lea     L_match_permtable(%rip),%rax
    mov     match_distance,%rbx
    shl     $5,%rbx
#ifdef __AVX2__
    vmovdqu (copy_src),%xmm0    // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
    vmovdqa %xmm0,%xmm1         // keep a copy for the high bytes
    vmovdqa (%rax,%rbx),%xmm2   // pattern for low 16 bytes
    vpshufb %xmm2,%xmm0,%xmm0   // replicate the pattern in xmm0
    vmovdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
    vpshufb %xmm2,%xmm1,%xmm1   // replicate the pattern in xmm1
    vinserti128 $1,%xmm1,%ymm0,%ymm0 // store all 32 bytes into a single register
#else
    movdqu  (copy_src),%xmm0    // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
    movdqa  %xmm0,%xmm1         // keep a copy for the high bytes
    movdqa  (%rax,%rbx),%xmm2   // pattern for low 16 bytes
    pshufb  %xmm2,%xmm0         // replicate the pattern in xmm0
    movdqa  16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
    pshufb  %xmm2,%xmm1         // replicate the pattern in xmm1
#endif
    // Here, %xmm0:%xmm1 (or %ymm0 for AVX2) is a 32-byte pattern replicating the first match_distance bytes up to 32 bytes
    lea     L_match_disttable(%rip),%rax
    movzb   (%rax,match_distance),%rax // %rax is now the usable length of this pattern: a multiple of match_distance no larger than 32 (see L_match_disttable below)

    // fixed
#ifdef __AVX2__
    vmovdqu %ymm0,(copy_dst)
#else
    movdqu  %xmm0,(copy_dst)
    movdqu  %xmm1,16(copy_dst)
#endif
    add     %rax,copy_dst
L_copy_long_match_overlap_loop:
    // loop
#ifdef __AVX2__
    vmovdqu %ymm0,(copy_dst)
#else
    movdqu  %xmm0,(copy_dst)
    movdqu  %xmm1,16(copy_dst)
#endif
    add     %rax,copy_dst
    cmp     copy_dst,dst
    ja      L_copy_long_match_overlap_loop // loop back to the store, not to the pattern setup
    jmp     L_decode_command    // to next command

L_fail:
    xor     %rax,%rax
    dec     %rax                // return -1
    jmp     L_exit

L_done:
    xor     %rax,%rax           // return 0
    // continue to exit

L_exit:
    pop     src                 // restore const uint8_t ** src_ptr
    mov     src_good,(src)      // *src_ptr = start of last fully decoded command
    pop     dst                 // restore uint8_t ** dst_ptr
    mov     dst_good,(dst)      // *dst_ptr = start of last fully decoded command
    clear_frame_and_return

// permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15
// value(d)[i] = i%d for i = 0..31
.p2align 6
L_match_permtable:
.byte  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 //  0
.byte  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 //  1
.byte  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 //  2
.byte  0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 //  3
.byte  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 //  4
.byte  0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 //  5
.byte  0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 //  6
.byte  0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 //  7
.byte  0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 //  8
.byte  0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 //  9
.byte  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10
.byte  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11
.byte  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12
.byte  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13
.byte  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14
.byte  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15

// valid repeating pattern size, for each match_distance = 0 to 15
// value(d) = 32 - (32%d), the largest multiple of d that is <= 32
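// e.g. value(11) = 32 - (32 % 11) = 22. The entries for d = 4 and d = 8 use 16
// rather than 32; any multiple of d <= 32 is a valid pattern stride, not only
// the largest one.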
.p2align 6
L_match_disttable:
.byte 32,32,32,30   //  0 ..  3
.byte 16,30,30,28   //  4 ..  7
.byte 16,27,30,22   //  8 .. 11
.byte 24,26,28,30   // 12 .. 15

#endif // LZ4_ENABLE_ASSEMBLY_DECODE_X86_64