]> git.saurik.com Git - apple/xnu.git/blob - osfmk/arm/lz4_decode_armv7NEON.s
xnu-4570.71.2.tar.gz
[apple/xnu.git] / osfmk / arm / lz4_decode_armv7NEON.s
1 /*
2 * Copyright (c) 2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <vm/lz4_assembly_select.h>
29 #if LZ4_ENABLE_ASSEMBLY_DECODE_ARMV7
30
31 /*
32
33 int64_t lz4_decode_asm(
34 uint8_t ** dst_ptr, *dst_ptr points to next output byte to write
35 uint8_t * dst_begin, points to first valid output byte we can access, dst_begin <= dst
36 uint8_t * dst_end, "relaxed" end of output buffer (see below)
37 const uint8_t ** src_ptr, *src_ptr points to next input byte to read
38 const uint8_t * src_end) "relaxed" end of input buffer (see below)
39
40 We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant.
41 We never read before dst_begin.
42
43 Return 0 on success, -1 on failure
44 On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of a LZ4 instruction.
45
46 */
47
48 .globl _lz4_decode_asm
49
50 #define dst r0 // arg0
51 #define dst_begin r1 // arg1
52 #define dst_end r2 // arg2
53 #define src r3 // arg3
54
55 #define src_end r4 // arg4
56
57 #define n_matches r5
58 #define n_literals r10
59 #define copy_src r11 // match/literal copy source
60 #define copy_dst r8 // match/literal copy destination
61
62 #define aux1 r9
63
64 #define match_distance r6
65
66 #define match_permtable r12
67 #define match_disttable lr
68
69 #define dst_good [sp, #0]
70 #define src_good [sp, #4]
71
72 .macro establish_frame
73 ldr ip, [sp, #0] // read src_end
74 push {r4-r7, lr} // Save registers
75 add r7, sp, #12 // Establish stack frame
76 push {r8-r11}
77 mov src_end, ip
78 push {r0, r3} // save dst/src
79 sub sp, sp, #4+16 // 4 for 16-byte stack alignment, extra 16-bytes for local
80 .endm
81
82 .macro clear_frame_and_return
83 add sp, sp, #12+16 // skip r0/r3
84 pop {r8-r11}
85 pop {r4-r7,pc}
86 .endm
87
88 // copy_1x16 SOURCE_ADDR DESTINATION_ADDR
89 // Copy 16 bytes, clobber: q0
90 .macro copy_1x16
91 vld1.8 {q0}, [$0]
92 vst1.8 {q0}, [$1]
93 .endm
94
95 // copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR
96 // Copy 16 bytes, and increment both addresses by 16, clobber: q0
97 .macro copy_1x16_and_increment
98 vld1.8 {q0}, [$0]!
99 vst1.8 {q0}, [$1]!
100 .endm
101
102 // copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR
103 // Copy 2 times 16 bytes, and increment both addresses by 32, clobber: q0
104 .macro copy_2x16_and_increment
105 vld1.8 {q0}, [$0]!
106 vst1.8 {q0}, [$1]!
107 vld1.8 {q0}, [$0]!
108 vst1.8 {q0}, [$1]!
109 .endm
110
111 // copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR
112 // Copy 32 bytes, and increment both addresses by 32, clobber: q0,q1
113 .macro copy_1x32_and_increment
114 vld1.8 {q0,q1}, [$0]!
115 vst1.8 {q0,q1}, [$1]!
116 .endm
117
118 // If we don't branch, src < src_end after this
119 .macro check_src_end
120 cmp src,src_end
121 bhs L_done // extremely unlikely, DONE when src >= src_end
122 .endm
123
124 // If we don't branch, dst < dst_end after this
125 .macro check_dst_end
126 cmp dst,dst_end
127 bhs L_done // extremely unlikely, DONE when dst >= dst_end
128 .endm
129
130 .text
131 .syntax unified
132 .thumb
133 .thumb_func _lz4_decode_asm
134 .p2align 1
135 _lz4_decode_asm:
136 establish_frame
137 ldr src,[src] // src = *src_ptr
138 ldr dst,[dst] // dst = *dst_ptr
139
140 adr match_permtable,L_match_permtable
141 adr match_disttable,L_match_disttable
142
143 L_decode_command:
144 // Keep last known good positions in both streams
145 str dst, dst_good
146 str src, src_good
147
148 // Check limits
149 check_src_end
150 check_dst_end
151
152 // Decode 1-byte command
153 ldrb aux1,[src],#1 // read command byte LLLLMMMM
154 lsr n_literals,aux1,#4 // 0000LLLL. n_literals is now 0..15
155 and n_matches,aux1,#0xf // 0000MMMM. n_matches is now 0..15
156 add n_matches,n_matches,#4 // n_matches is now 4..19
157
158 // Test number of literals (do not test if n_literals==0, because branch prediction fails on it)
159 cmp n_literals,#14
160 bls L_copy_short_literal // 96% likely: n_literals in 0..14
161 // continue to decode_long_literal
162
163 // the number of literals is encoded on more bytes, we need to decode them
164 L_decode_long_literal:
165 check_src_end // required here, since we may loop an arbitrarily high number of times
166 ldrb aux1,[src],#1
167 add n_literals,n_literals,aux1
168 cmp aux1,#255
169 beq L_decode_long_literal // extremely unlikely
170 // continue to copy_long_literal
171
172 // Copy literals, n_literals >= 15
173 L_copy_long_literal:
174 mov copy_src,src // literal copy origin
175 mov copy_dst,dst // literal copy destination
176 add src,src,n_literals
177 add dst,dst,n_literals
178 check_src_end // required here, since n_literals can be arbitrarily high
179 check_dst_end
180
181 // fixed + loop
182 copy_1x32_and_increment copy_src,copy_dst
183 L_copy_long_literal_loop:
184 copy_1x32_and_increment copy_src,copy_dst
185 cmp dst,copy_dst
186 bhi L_copy_long_literal_loop // first test occurs after 64 bytes have been copied, and is unlikely to loop back
187 b L_expand_match
188
189 // Copy literals, n_literals <= 14: copy 16 bytes
190 L_copy_short_literal:
191 copy_1x16 src,dst
192 add src,src,n_literals
193 add dst,dst,n_literals
194 // continue to expand match
195
196 L_expand_match:
197
198 // Decode match distance
199 ldrh match_distance,[src],#2 // 16-bit distance
200 cbz match_distance,L_fail // distance == 0 is invalid
201 sub copy_src,dst,match_distance // copy_src is the match copy source
202 cmp copy_src,dst_begin
203 blo L_fail // copy_src < dst_begin: FAIL
204 mov copy_dst,dst // copy_dst is the match copy destination
205 add dst,dst,n_matches // dst is updated to be the byte after the match; n_matches <= 19 here
206
207 // Do we need to decode a long match?
208 cmp n_matches,#19
209 beq L_decode_long_match // unlikely, n_matches >= 19 encoded on more bytes
210 cmp n_matches,#16
211 bhi L_long_match // unlikely, n_matches == 17 or 18
212 // continue to short match (most probable case)
213
214 // Copy match, n_matches <= 16
215 L_short_match:
216 cmp match_distance,#15
217 bls L_copy_short_match_small_distance
218
219 // Copy match, n_matches <= 16, match_distance >= 16: copy 16 bytes
220 copy_1x16 copy_src,copy_dst
221 b L_decode_command
222
223 L_fail:
224 mov aux1,#-1 // FAIL
225 b L_exit
226
227 L_done:
228 mov aux1,#0 // OK
229 // continue to L_exit
230
231 L_exit:
232
233 ldr dst,[sp, #20] // get back src_ptr,dst_ptr from stack
234 ldr src,[sp, #24] // get back src_ptr,dst_ptr from stack
235 ldr ip, src_good
236 ldr lr, dst_good
237 str ip,[src] // *src_ptr = src_good
238 str lr,[dst] // *dst_ptr = dst_good
239
240 mov r0,aux1 // x0 = return value
241 clear_frame_and_return
242
243 // Copy match, n_matches <= 16, match_distance < 16:
244 // load shuffle table, and permute to replicate the pattern on 16 bytes
245 L_copy_short_match_small_distance:
246 vld1.8 {q0},[copy_src]
247 add aux1,match_permtable,match_distance,lsl #5 // index in table
248 vld1.8 {q1},[aux1] // load only permutation for the low 16 bytes
249 vtbl.8 d4, {q0}, d2 // low 16 bytes of pattern
250 vtbl.8 d5, {q0}, d3 // low 16 bytes of pattern
251 vst1.8 {q2},[copy_dst]
252 b L_decode_command
253
254 // n_matches == 19: the number of matches in encoded on more bytes, we need to decode them
255 L_decode_long_match:
256 check_src_end // required here, since we may loop an arbitrarily high number of times
257 ldrb aux1,[src],#1
258 add dst,dst,aux1
259 cmp aux1,#255
260 beq L_decode_long_match // very unlikely
261 check_dst_end // required here, since dst was incremented by a arbitrarily high value
262 // continue to long_match
263
264 // n_matches > 16
265 L_long_match:
266 cmp match_distance,#31
267 bhi L_copy_long_match_32
268 cmp match_distance,#15
269 bhi L_copy_long_match_16
270
271 // Copy match, n_matches >= 16, match_distance < 16:
272 // load shuffle table, and permute to replicate the pattern on 32 bytes
273 L_copy_long_match_small_distance:
274 vld1.8 {q1}, [copy_src] // 16 pattern bytes
275 add aux1,match_permtable,match_distance,lsl #5 // index in table
276 vld1.8 {q2-q3}, [aux1] // load 32-byte permutation
277
278 vtbl.8 d4, {q1}, d4 // low 16 bytes of pattern
279 vtbl.8 d5, {q1}, d5 // low 16 bytes of pattern
280 vtbl.8 d6, {q1}, d6 // low 16 bytes of pattern
281 vtbl.8 d7, {q1}, d7 // low 16 bytes of pattern
282
283 ldrb aux1,[match_disttable,match_distance] // valid pattern length in aux1
284 // fixed
285 vst1.8 {q2-q3},[copy_dst]
286 add copy_dst,copy_dst,aux1
287 L_copy_long_match_small_distance_loop:
288 // loop
289 vst1.8 {q2-q3},[copy_dst]
290 add copy_dst,copy_dst,aux1
291 vst1.8 {q2-q3},[copy_dst]
292 add copy_dst,copy_dst,aux1
293 cmp dst,copy_dst
294 bhi L_copy_long_match_small_distance_loop
295 b L_decode_command
296
297 // Copy match, n_matches >= 16, match_distance >= 32
298 L_copy_long_match_32:
299 // fixed + loop
300 copy_1x16_and_increment copy_src,copy_dst
301 L_copy_long_match_32_loop:
302 copy_1x32_and_increment copy_src,copy_dst
303 cmp dst,copy_dst
304 bhi L_copy_long_match_32_loop
305 b L_decode_command
306
307 // Copy match, n_matches >= 16, match_distance >= 16
308 L_copy_long_match_16:
309 // fixed + loop
310 copy_1x16_and_increment copy_src,copy_dst
311 L_copy_long_match_16_loop:
312 copy_2x16_and_increment copy_src,copy_dst
313 cmp dst,copy_dst
314 bhi L_copy_long_match_16_loop
315 b L_decode_command
316
317
318 // permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15
319 // value(d)[i] = i%d for i = 0..31
320 .p2align 6
321 L_match_permtable:
322 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0
323 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 1
324 .byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 // 2
325 .byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 // 3
326 .byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 // 4
327 .byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 // 5
328 .byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 // 6
329 .byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 // 7
330 .byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 // 8
331 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 // 9
332 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10
333 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11
334 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12
335 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13
336 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14
337 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15
338
339 // valid repeating pattern size, for each match_distance = 0 to 15
340 // value(d) = 32 - (32%d), is the largest a multiple of d <= 32
341 .p2align 6
342 L_match_disttable:
343 .byte 32,32,32,30 // 0 .. 3
344 .byte 16,30,30,28 // 4 .. 7
345 .byte 16,27,30,22 // 8 .. 11
346 .byte 24,26,28,30 // 12 .. 15
347
348 #endif // LZ4_ENABLE_ASSEMBLY_DECODE_ARMV7