]> git.saurik.com Git - apple/xnu.git/blob - osfmk/arm64/lz4_decode_arm64.s
xnu-4570.61.1.tar.gz
[apple/xnu.git] / osfmk / arm64 / lz4_decode_arm64.s
1 /*
2 * Copyright (c) 2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <vm/lz4_assembly_select.h>
29 #if LZ4_ENABLE_ASSEMBLY_DECODE_ARM64
30
31 /*
32
33 int64_t lz4_decode_asm(
34 uint8_t ** dst_ptr, *dst_ptr points to next output byte to write
35 uint8_t * dst_begin, points to first valid output byte we can access, dst_begin <= dst
36 uint8_t * dst_end, "relaxed" end of output buffer (see below)
37 const uint8_t ** src_ptr, *src_ptr points to next input byte to read
38 const uint8_t * src_end) "relaxed" end of input buffer (see below)
39
40 We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant.
41 We never read before dst_begin.
42
43 Return 0 on success, -1 on failure
44 On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of a LZ4 instruction.
45
46 */
47
48 .globl _lz4_decode_asm
49
50 #define dst x0 // arg0
51 #define dst_begin x1 // arg1
52 #define dst_end x2 // arg2
53 #define src x3 // arg3
54 #define src_end x4 // arg4
55
56 #define w_n_matches w5 // lower 32 bits of n_matches
57 #define n_matches x5
58 #define n_literals x6
59 #define copy_src x7 // match/literal copy source
60 #define copy_dst x8 // match/literal copy destination
61
62 #define w_aux1 w9 // lower 32 bits of aux1
63 #define aux1 x9
64 #define aux2 x10
65
66 #define w_match_distance w11 // lower 32 bits of match_distance
67 #define match_distance x11
68
69 #define match_permtable x12
70 #define match_disttable x13
71
72 #define dst_good x19
73 #define src_good x20
74
75 .macro establish_frame
76 stp fp, lr, [sp, #-16]!
77 mov fp, sp
78 .endm
79
80 .macro clear_frame_and_return
81 ldp fp, lr, [sp], #16
82 ret lr
83 .endm
84
85 // copy_1x16 SOURCE_ADDR DESTINATION_ADDR
86 // Copy 16 bytes, clobber: q0
87 .macro copy_1x16
88 ldr q0,[$0]
89 str q0,[$1]
90 .endm
91
92 // copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR
93 // Copy 16 bytes, and increment both addresses by 16, clobber: q0
94 .macro copy_1x16_and_increment
95 ldr q0,[$0],#16
96 str q0,[$1],#16
97 .endm
98
99 // copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR
100 // Copy 2 times 16 bytes, and increment both addresses by 32, clobber: q0
101 .macro copy_2x16_and_increment
102 ldr q0,[$0],#16
103 str q0,[$1],#16
104 ldr q0,[$0],#16
105 str q0,[$1],#16
106 .endm
107
108 // copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR
109 // Copy 32 bytes, and increment both addresses by 32, clobber: q0,q1
110 .macro copy_1x32_and_increment
111 ldp q0,q1,[$0],#32
112 stp q0,q1,[$1],#32
113 .endm
114
115 // If we don't branch, src < src_end after this
116 .macro check_src_end
117 cmp src,src_end
118 b.hs L_done // extremely unlikely, DONE when src >= src_end
119 .endm
120
121 // If we don't branch, dst < dst_end after this
122 .macro check_dst_end
123 cmp dst,dst_end
124 b.hs L_done // extremely unlikely, DONE when dst >= dst_end
125 .endm
126
127 .text
128 .p2align 4
129 _lz4_decode_asm:
130 establish_frame
131 stp x19,x20,[sp,#-16]! // need to preserve these
132 stp src,dst,[sp,#-16]! // save src_ptr,dst_ptr on stack
133 ldr src,[src] // src = *src_ptr
134 ldr dst,[dst] // dst = *dst_ptr
135 adr match_permtable,L_match_permtable
136 adr match_disttable,L_match_disttable
137
138 L_decode_command:
139 // Keep last known good positions in both streams
140 mov dst_good,dst
141 mov src_good,src
142
143 // Check limits
144 check_src_end
145 check_dst_end
146
147 // Decode 1-byte command
148 ldrb w_aux1,[src],#1 // read command byte LLLLMMMM
149 lsr n_literals,aux1,#4 // 0000LLLL. n_literals is now 0..15
150 and n_matches,aux1,#0xf // 0000MMMM. n_matches is now 0..15
151 add n_matches,n_matches,#4 // n_matches is now 4..19
152
153 // Test number of literals (do not test if n_literals==0, because branch prediction fails on it)
154 cmp n_literals,#14
155 b.ls L_copy_short_literal // 96% likely: n_literals in 0..14
156 // continue to decode_long_literal
157
158 // the number of literals is encoded on more bytes, we need to decode them
159 L_decode_long_literal:
160 check_src_end // required here, since we may loop an arbitrarily high number of times
161 ldrb w_aux1,[src],#1
162 add n_literals,n_literals,aux1
163 cmp aux1,#255
164 b.eq L_decode_long_literal // extremely unlikely
165 // continue to copy_long_literal
166
167 // Copy literals, n_literals >= 15
168 L_copy_long_literal:
169 mov copy_src,src // literal copy origin
170 mov copy_dst,dst // literal copy destination
171 add src,src,n_literals
172 add dst,dst,n_literals
173 check_src_end // required here, since n_literals can be arbitrarily high
174 check_dst_end
175
176 // fixed + loop
177 copy_1x32_and_increment copy_src,copy_dst
178 L_copy_long_literal_loop:
179 copy_1x32_and_increment copy_src,copy_dst
180 cmp dst,copy_dst
181 b.hi L_copy_long_literal_loop // first test occurs after 64 bytes have been copied, and is unlikely to loop back
182 b L_expand_match
183
184 // Copy literals, n_literals <= 14: copy 16 bytes
185 L_copy_short_literal:
186 copy_1x16 src,dst
187 add src,src,n_literals
188 add dst,dst,n_literals
189 // continue to expand match
190
191 L_expand_match:
192
193 // Decode match distance
194 ldrh w_match_distance,[src],#2 // 16-bit distance
195 cbz match_distance,L_fail // distance == 0 is invalid
196 sub copy_src,dst,match_distance // copy_src is the match copy source
197 cmp copy_src,dst_begin
198 b.lo L_fail // copy_src < dst_begin: FAIL
199 mov copy_dst,dst // copy_dst is the match copy destination
200 add dst,dst,n_matches // dst is updated to be the byte after the match; n_matches <= 19 here
201
202 // Do we need to decode a long match?
203 cmp n_matches,#19
204 b.eq L_decode_long_match // unlikely, n_matches >= 19 encoded on more bytes
205 cmp n_matches,#16
206 b.hi L_long_match // unlikely, n_matches == 17 or 18
207 // continue to short match (most probable case)
208
209 // Copy match, n_matches <= 16
210 L_short_match:
211 cmp match_distance,#15
212 b.ls L_copy_short_match_small_distance
213
214 // Copy match, n_matches <= 16, match_distance >= 16: copy 16 bytes
215 copy_1x16 copy_src,copy_dst
216 b L_decode_command
217
218 // Copy match, n_matches <= 16, match_distance < 16:
219 // load shuffle table, and permute to replicate the pattern on 16 bytes
220 L_copy_short_match_small_distance:
221 ldr q0,[copy_src]
222 add aux1,match_permtable,match_distance,lsl #5 // index in table
223 ldr q1,[aux1] // load only permutation for the low 16 bytes
224 tbl v0.16b,{v0.16b},v1.16b // low 16 bytes of pattern
225 str q0,[copy_dst]
226 b L_decode_command
227
228 // n_matches == 19: the number of matches in encoded on more bytes, we need to decode them
229 L_decode_long_match:
230 check_src_end // required here, since we may loop an arbitrarily high number of times
231 ldrb w_aux1,[src],#1
232 add dst,dst,aux1
233 cmp aux1,#255
234 b.eq L_decode_long_match // very unlikely
235 check_dst_end // required here, since dst was incremented by a arbitrarily high value
236 // continue to long_match
237
238 // n_matches > 16
239 L_long_match:
240 cmp match_distance,#31
241 b.hi L_copy_long_match_32
242 cmp match_distance,#15
243 b.hi L_copy_long_match_16
244
245 // Copy match, n_matches >= 16, match_distance < 16:
246 // load shuffle table, and permute to replicate the pattern on 32 bytes
247 L_copy_long_match_small_distance:
248 ldr q1,[copy_src] // 16 pattern bytes
249 add aux1,match_permtable,match_distance,lsl #5 // index in table
250 ldp q2,q3,[aux1] // load 32-byte permutation
251 tbl v0.16b,{v1.16b},v2.16b // low 16 bytes of pattern in q0
252 tbl v1.16b,{v1.16b},v3.16b // high 16 bytes of pattern in q1
253 ldrb w_aux1,[match_disttable,match_distance] // valid pattern length in aux1
254 // fixed
255 stp q0,q1,[copy_dst]
256 add copy_dst,copy_dst,aux1
257 L_copy_long_match_small_distance_loop:
258 // loop
259 stp q0,q1,[copy_dst]
260 add copy_dst,copy_dst,aux1
261 stp q0,q1,[copy_dst]
262 add copy_dst,copy_dst,aux1
263 cmp dst,copy_dst
264 b.hi L_copy_long_match_small_distance_loop
265 b L_decode_command
266
267 // Copy match, n_matches >= 16, match_distance >= 32
268 L_copy_long_match_32:
269 // fixed + loop
270 copy_1x16_and_increment copy_src,copy_dst
271 L_copy_long_match_32_loop:
272 copy_1x32_and_increment copy_src,copy_dst
273 cmp dst,copy_dst
274 b.hi L_copy_long_match_32_loop
275 b L_decode_command
276
277 // Copy match, n_matches >= 16, match_distance >= 16
278 L_copy_long_match_16:
279 // fixed + loop
280 copy_1x16_and_increment copy_src,copy_dst
281 L_copy_long_match_16_loop:
282 copy_2x16_and_increment copy_src,copy_dst
283 cmp dst,copy_dst
284 b.hi L_copy_long_match_16_loop
285 b L_decode_command
286
287 L_fail:
288 mov aux1,#-1 // FAIL
289 b L_exit
290
291 L_done:
292 mov aux1,#0 // OK
293 // continue to L_exit
294
295 L_exit:
296 ldp src,dst,[sp],#16 // get back src_ptr,dst_ptr from stack
297 str src_good,[src] // *src_ptr = src_good
298 str dst_good,[dst] // *dst_ptr = dst_good
299 mov x0,aux1 // x0 = return value
300 ldp x19,x20,[sp],#16 // restore
301 clear_frame_and_return
302
303 // permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15
304 // value(d)[i] = i%d for i = 0..31
305 .p2align 6
306 L_match_permtable:
307 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0
308 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 1
309 .byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 // 2
310 .byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 // 3
311 .byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 // 4
312 .byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 // 5
313 .byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 // 6
314 .byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 // 7
315 .byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 // 8
316 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 // 9
317 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10
318 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11
319 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12
320 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13
321 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14
322 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15
323
324 // valid repeating pattern size, for each match_distance = 0 to 15
325 // value(d) = 32 - (32%d), is the largest a multiple of d <= 32
326 .p2align 6
327 L_match_disttable:
328 .byte 32,32,32,30 // 0 .. 3
329 .byte 16,30,30,28 // 4 .. 7
330 .byte 16,27,30,22 // 8 .. 11
331 .byte 24,26,28,30 // 12 .. 15
332
333 #endif // LZ4_ENABLE_ASSEMBLY_DECODE_ARM64