2 * Copyright (c) 2017 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 #include <vm/lz4_assembly_select.h>
29 #include <arm64/asm.h>
30 #if LZ4_ENABLE_ASSEMBLY_DECODE_ARM64
34 int64_t lz4_decode_asm(
35 uint8_t ** dst_ptr, *dst_ptr points to next output byte to write
36 uint8_t * dst_begin, points to first valid output byte we can access, dst_begin <= dst
37 uint8_t * dst_end, "relaxed" end of output buffer (see below)
38 const uint8_t ** src_ptr, *src_ptr points to next input byte to read
39 const uint8_t * src_end) "relaxed" end of input buffer (see below)
41 We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant.
42 We never read before dst_begin.
44 Return 0 on success, -1 on failure
45 On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of a LZ4 instruction.
49 .globl _lz4_decode_asm
51 #define dst x0 // arg0
52 #define dst_begin x1 // arg1
53 #define dst_end x2 // arg2
54 #define src x3 // arg3
55 #define src_end x4 // arg4
57 #define w_n_matches w5 // lower 32 bits of n_matches
60 #define copy_src x7 // match/literal copy source
61 #define copy_dst x8 // match/literal copy destination
63 #define w_aux1 w9 // lower 32 bits of aux1
67 #define w_match_distance w11 // lower 32 bits of match_distance
68 #define match_distance x11
70 #define match_permtable x12
71 #define match_disttable x13
76 .macro establish_frame
78 stp fp, lr, [sp, #-16]!
82 .macro clear_frame_and_return
87 // copy_1x16 SOURCE_ADDR DESTINATION_ADDR
88 // Copy 16 bytes, clobber: q0
94 // copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR
95 // Copy 16 bytes, and increment both addresses by 16, clobber: q0
96 .macro copy_1x16_and_increment
101 // copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR
102 // Copy 2 times 16 bytes, and increment both addresses by 32, clobber: q0
103 .macro copy_2x16_and_increment
110 // copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR
111 // Copy 32 bytes, and increment both addresses by 32, clobber: q0,q1
112 .macro copy_1x32_and_increment
117 // If we don't branch, src < src_end after this
120 b.hs L_done // extremely unlikely, DONE when src >= src_end
123 // If we don't branch, dst < dst_end after this
126 b.hs L_done // extremely unlikely, DONE when dst >= dst_end
133 stp x19,x20,[sp,#-16]! // need to preserve these
134 stp src,dst,[sp,#-16]! // save src_ptr,dst_ptr on stack
135 ldr src,[src] // src = *src_ptr
136 ldr dst,[dst] // dst = *dst_ptr
137 adr match_permtable,L_match_permtable
138 adr match_disttable,L_match_disttable
141 // Keep last known good positions in both streams
149 // Decode 1-byte command
150 ldrb w_aux1,[src],#1 // read command byte LLLLMMMM
151 lsr n_literals,aux1,#4 // 0000LLLL. n_literals is now 0..15
152 and n_matches,aux1,#0xf // 0000MMMM. n_matches is now 0..15
153 add n_matches,n_matches,#4 // n_matches is now 4..19
155 // Test number of literals (do not test if n_literals==0, because branch prediction fails on it)
157 b.ls L_copy_short_literal // 96% likely: n_literals in 0..14
158 // continue to decode_long_literal
160 // the number of literals is encoded on more bytes, we need to decode them
161 L_decode_long_literal:
162 check_src_end // required here, since we may loop an arbitrarily high number of times
164 add n_literals,n_literals,aux1
166 b.eq L_decode_long_literal // extremely unlikely
167 // continue to copy_long_literal
169 // Copy literals, n_literals >= 15
171 mov copy_src,src // literal copy origin
172 mov copy_dst,dst // literal copy destination
173 add src,src,n_literals
174 add dst,dst,n_literals
175 check_src_end // required here, since n_literals can be arbitrarily high
179 copy_1x32_and_increment copy_src,copy_dst
180 L_copy_long_literal_loop:
181 copy_1x32_and_increment copy_src,copy_dst
183 b.hi L_copy_long_literal_loop // first test occurs after 64 bytes have been copied, and is unlikely to loop back
186 // Copy literals, n_literals <= 14: copy 16 bytes
187 L_copy_short_literal:
189 add src,src,n_literals
190 add dst,dst,n_literals
191 // continue to expand match
195 // Decode match distance
196 ldrh w_match_distance,[src],#2 // 16-bit distance
197 cbz match_distance,L_fail // distance == 0 is invalid
198 sub copy_src,dst,match_distance // copy_src is the match copy source
199 cmp copy_src,dst_begin
200 b.lo L_fail // copy_src < dst_begin: FAIL
201 mov copy_dst,dst // copy_dst is the match copy destination
202 add dst,dst,n_matches // dst is updated to be the byte after the match; n_matches <= 19 here
204 // Do we need to decode a long match?
206 b.eq L_decode_long_match // unlikely, n_matches >= 19 encoded on more bytes
208 b.hi L_long_match // unlikely, n_matches == 17 or 18
209 // continue to short match (most probable case)
211 // Copy match, n_matches <= 16
213 cmp match_distance,#15
214 b.ls L_copy_short_match_small_distance
216 // Copy match, n_matches <= 16, match_distance >= 16: copy 16 bytes
217 copy_1x16 copy_src,copy_dst
220 // Copy match, n_matches <= 16, match_distance < 16:
221 // load shuffle table, and permute to replicate the pattern on 16 bytes
222 L_copy_short_match_small_distance:
224 add aux1,match_permtable,match_distance,lsl #5 // index in table
225 ldr q1,[aux1] // load only permutation for the low 16 bytes
226 tbl v0.16b,{v0.16b},v1.16b // low 16 bytes of pattern
230 // n_matches == 19: the number of matches in encoded on more bytes, we need to decode them
232 check_src_end // required here, since we may loop an arbitrarily high number of times
236 b.eq L_decode_long_match // very unlikely
237 check_dst_end // required here, since dst was incremented by a arbitrarily high value
238 // continue to long_match
242 cmp match_distance,#31
243 b.hi L_copy_long_match_32
244 cmp match_distance,#15
245 b.hi L_copy_long_match_16
247 // Copy match, n_matches >= 16, match_distance < 16:
248 // load shuffle table, and permute to replicate the pattern on 32 bytes
249 L_copy_long_match_small_distance:
250 ldr q1,[copy_src] // 16 pattern bytes
251 add aux1,match_permtable,match_distance,lsl #5 // index in table
252 ldp q2,q3,[aux1] // load 32-byte permutation
253 tbl v0.16b,{v1.16b},v2.16b // low 16 bytes of pattern in q0
254 tbl v1.16b,{v1.16b},v3.16b // high 16 bytes of pattern in q1
255 ldrb w_aux1,[match_disttable,match_distance] // valid pattern length in aux1
258 add copy_dst,copy_dst,aux1
259 L_copy_long_match_small_distance_loop:
262 add copy_dst,copy_dst,aux1
264 add copy_dst,copy_dst,aux1
266 b.hi L_copy_long_match_small_distance_loop
269 // Copy match, n_matches >= 16, match_distance >= 32
270 L_copy_long_match_32:
272 copy_1x16_and_increment copy_src,copy_dst
273 L_copy_long_match_32_loop:
274 copy_1x32_and_increment copy_src,copy_dst
276 b.hi L_copy_long_match_32_loop
279 // Copy match, n_matches >= 16, match_distance >= 16
280 L_copy_long_match_16:
282 copy_1x16_and_increment copy_src,copy_dst
283 L_copy_long_match_16_loop:
284 copy_2x16_and_increment copy_src,copy_dst
286 b.hi L_copy_long_match_16_loop
295 // continue to L_exit
298 ldp src,dst,[sp],#16 // get back src_ptr,dst_ptr from stack
299 str src_good,[src] // *src_ptr = src_good
300 str dst_good,[dst] // *dst_ptr = dst_good
301 mov x0,aux1 // x0 = return value
302 ldp x19,x20,[sp],#16 // restore
303 clear_frame_and_return
305 // permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15
306 // value(d)[i] = i%d for i = 0..31
309 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0
310 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 1
311 .byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 // 2
312 .byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 // 3
313 .byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 // 4
314 .byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 // 5
315 .byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 // 6
316 .byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 // 7
317 .byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 // 8
318 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 // 9
319 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10
320 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11
321 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12
322 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13
323 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14
324 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15
326 // valid repeating pattern size, for each match_distance = 0 to 15
327 // value(d) = 32 - (32%d), is the largest a multiple of d <= 32
330 .byte 32,32,32,30 // 0 .. 3
331 .byte 16,30,30,28 // 4 .. 7
332 .byte 16,27,30,22 // 8 .. 11
333 .byte 24,26,28,30 // 12 .. 15
335 #endif // LZ4_ENABLE_ASSEMBLY_DECODE_ARM64