]> git.saurik.com Git - apple/xnu.git/blob - osfmk/arm64/lz4_encode_arm64.s
xnu-4570.51.1.tar.gz
[apple/xnu.git] / osfmk / arm64 / lz4_encode_arm64.s
1 /*
2 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <vm/lz4_assembly_select.h>
30 #include <vm/lz4_constants.h>
31
32 #if LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64
33
34 /* void lz4_encode_2gb(uint8_t ** dst_ptr,
35 size_t dst_size,
36 const uint8_t ** src_ptr,
37 const uint8_t * src_begin,
38 size_t src_size,
39 lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
40 int skip_final_literals) */
41
// Encodes an LZ4 block stream from *src_ptr into *dst_ptr, advancing both
// pointers past the last byte consumed/written on return.  Positions are
// stored in the hash table as 32-bit offsets from src_begin (hence the
// "2gb" in the name — assumes offsets fit in 32 bits; TODO confirm callers
// guarantee this).  If skip_final_literals is non-zero, the trailing bytes
// that cannot form a match are left unencoded for the caller to handle.
// ABI: AAPCS64; x19-x28 are saved/restored in the prologue/epilogue below.
42 .globl _lz4_encode_2gb
43
44 #define dst_ptr x0
45 #define dst_size x1
46 #define src_ptr x2
47 #define src_begin x3
48 #define src_size x4
49 #define hash_table x5
50 #define skip_final_literals x6
51
52 .text
53 .p2align 4
54 _lz4_encode_2gb:
55
56 // establish frame
57 stp fp, lr, [sp, #-16]!
58 mov fp, sp
59
60 stp x19, x20, [sp, #-16]! // save callee-saved registers x19-x28
61 stp x21, x22, [sp, #-16]!
62 stp x23, x24, [sp, #-16]!
63 stp x25, x26, [sp, #-16]!
64 stp x27, x28, [sp, #-16]!
65
66 // constant registers
67 adr x7, L_constant
68 ldr w28, [x7, #4] // x28 = 0x80808081 (magic number to compute 1/255)
69 ldr w7, [x7] // x7 = LZ4_COMPRESS_HASH_MULTIPLY
70 mov x27, #-1 // x27 = 0xffffffffffffffff
71 dup.4s v1, w27 // q1 = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff} (16 x 0xff bytes, used to bulk-write 255-tokens)
72
73
74 // x9 - is current dst
75 // x10 - dst_end - safety_margin
76 ldr x9, [x0] // dst
77 add x10, x9, x1 // dst_end
78 sub x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // dst_end - safety_margin
79 cmp x10, x9 // if dst_size < safety_margin abort
80 b.lt L_done
81
82 // x11 - is current src
83 // x12 - is src_end - safety margin
84 ldr x11, [x2] // src
85 add x12, x11, x4 // src_end
86 sub x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // src_end - safety_margin
87 cmp x12, x11 // if src_size < safety_margin skip to trailing_literals
88 b.lt L_trailing_literals
89
90
91 // this block searches for the next available match
92 // set match_begin to current src (which is also where the last match ended)
93 L_search_next_available_match:
94 mov x13, x11 // match_begin = src
95 sub x14, x13, x3 // match_position = match_begin - src_begin
96
97 // compute the hash value for the next 5 "quads" (the loop below is unrolled 5 ways)
98 // hash distance needs to be 0 < D < 0x10000
99
100 L_hash_match:
101 ldr x15, [x13] // load 8 bytes; low 4 are match_first_4_bytes (safe because of safety margin)
102 umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
103 lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
104 add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash_table + 8*index)
105
106 ldp w19, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos)
107 stp w14, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes)
108
109 add x26, x14, #1 // next_match pos
110 lsr x25, x15, #8 // next_match_first_4_bytes
111 umull x21, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
112 lsr w21, w21, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
113 add x21, x5, x21, lsl #3 // hash_table_entry ptr (hash_table + 8*index)
114
115 ldp w23, w24, [x21] // read entry values (w23 - pos, w24 - 4 bytes at pos)
116 stp w26, w25, [x21] // write entry values (w26 - next pos, w25 - next 4 bytes)
117
118 cmp w15, w22
119 b.ne L_try_next_match_0 // compare the 4 bytes to see if there is a match
120 sub w19, w14, w19 // w19 - match_dist (current_pos - match_pos)
121 cmp w19, #0x10000
122 ccmp w19, #0, #0xf, lo // if dist < 0x10000, test dist == 0; else force NZCV=0xf (eq)
123 b.eq L_try_next_match_0 // reject unless 0 < dist < 0x10000
124 b L_found_valid_match
125
126 L_try_next_match_0:
127 add x13, x13, #1
128 add x14, x14, #1
129
130 add x26, x14, #1 // next_match pos
131 lsr x15, x15, #16 // next_match_first_4_bytes (bytes 2..5 of the 8-byte load)
132 umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
133 lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
134 add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash_table + 8*index)
135
136 ldp w21, w22, [x20] // read entry values (w21 - pos, w22 - 4 bytes at pos)
137 stp w26, w15, [x20] // write entry values (w26 - next pos, w15 - next 4 bytes)
138
139 cmp w25, w24 // test the candidate hashed in the previous round (w25 vs its table bytes w24)
140 b.ne L_try_next_match_1 // compare the 4 bytes to see if there is a match
141 sub w19, w14, w23 // w19 - match_dist (current_pos - match_pos)
142 cmp w19, #0x10000
143 ccmp w19, #0, #0xf, lo // if dist < 0x10000, test dist == 0; else force NZCV=0xf (eq)
144 b.eq L_try_next_match_1 // reject unless 0 < dist < 0x10000
145 b L_found_valid_match
146
147 L_try_next_match_1:
148 add x13, x13, #1
149 add x14, x14, #1
150
151 add x26, x14, #1 // next_match pos
152 lsr x25, x15, #8 // next_match_first_4_bytes
153 umull x20, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
154 lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
155 add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash_table + 8*index)
156
157 ldp w23, w24, [x20] // read entry values (w23 - pos, w24 - 4 bytes at pos)
158 stp w26, w25, [x20] // write entry values (w26 - next pos, w25 - next 4 bytes)
159
160 cmp w15, w22 // test the candidate hashed in the previous round (w15 vs its table bytes w22)
161 b.ne L_try_next_match_2 // compare the 4 bytes to see if there is a match
162 sub w19, w14, w21 // w19 - match_dist (current_pos - match_pos)
163 cmp w19, #0x10000
164 ccmp w19, #0, #0xf, lo // if dist < 0x10000, test dist == 0; else force NZCV=0xf (eq)
165 b.eq L_try_next_match_2 // reject unless 0 < dist < 0x10000
166 b L_found_valid_match
167
168 L_try_next_match_2:
169 add x13, x13, #1
170 add x14, x14, #1
171
172 add x26, x14, #1 // next_match pos
173 lsr x15, x15, #16 // next_match_first_4_bytes
174 umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
175 lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
176 add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash_table + 8*index)
177
178 ldp w21, w22, [x20] // read entry values (w21 - pos, w22 - 4 bytes at pos)
179 stp w26, w15, [x20] // write entry values (w26 - next pos, w15 - next 4 bytes)
180
181 cmp w25, w24 // test the candidate hashed in the previous round (w25 vs its table bytes w24)
182 b.ne L_try_next_match_3 // compare the 4 bytes to see if there is a match
183 sub w19, w14, w23 // w19 - match_dist (current_pos - match_pos)
184 cmp w19, #0x10000
185 ccmp w19, #0, #0xf, lo // if dist < 0x10000, test dist == 0; else force NZCV=0xf (eq)
186 b.eq L_try_next_match_3 // reject unless 0 < dist < 0x10000
187 b L_found_valid_match
188
189 L_try_next_match_3:
190 add x13, x13, #1
191 add x14, x14, #1
192
193 cmp w15, w22 // test the candidate hashed in the previous round (w15 vs its table bytes w22)
194 b.ne L_try_next_matchs // compare the 4 bytes to see if there is a match
195 sub w19, w14, w21 // w19 - match_dist (current_pos - match_pos)
196 cmp w19, #0x10000
197 ccmp w19, #0, #0xf, lo // if dist < 0x10000, test dist == 0; else force NZCV=0xf (eq)
198 b.eq L_try_next_matchs // reject unless 0 < dist < 0x10000
199 b L_found_valid_match
200
201 // this block expands the valid match as much as possible
202 // first it tries to expand the match forward
203 // next it tries to expand the match backward
204 L_found_valid_match:
205 add x20, x13, #4 // match_end = match_begin+4 (already confirmed the first 4 bytes)
206 sub x21, x20, x19 // ref_end = match_end - dist
207 L_found_valid_match_expand_forward_loop:
208 ldr x22, [x20], #8 // load match_current_8_bytes (safe to load because of safety margin)
209 ldr x23, [x21], #8 // load ref_current_8_bytes
210 cmp x22, x23
211 b.ne L_found_valid_match_expand_forward_partial
212 cmp x20, x12 // check if match_end reached src_end
213 b.lo L_found_valid_match_expand_forward_loop
214 b L_found_valid_match_expand_backward
215 L_found_valid_match_expand_forward_partial:
216 sub x20, x20, #8 // revert match_end by 8 and compute the actual match length of the current 8 bytes
217 eor x22, x22, x23 // compare the bits using xor; equal prefix bytes become 0x00
218 rbit x22, x22 // reverse the bits so clz counts matching low-order bytes (the first non-equal byte has at least 1 set bit)
219 clz x22, x22 // after the reverse, each matching prefix byte contributes 8 to clz
220 add x20, x20, x22, lsr #3 // the actual number of matching bytes is (clz result)>>3
221 L_found_valid_match_expand_backward:
222 sub x15, x13, x19 // ref_begin = match_begin - dist
223 L_found_valid_match_expand_backward_loop:
224 cmp x13, x11 // check if match_begin reached src (previous match end)
225 ccmp x15, x3, #0xd, gt // check if ref_begin reached src_begin; if not 'gt', force NZCV=0xd so 'le' is taken
226 b.le L_found_valid_match_emit_match
227 ldrb w22, [x13, #-1]! // load preceding match byte
228 ldrb w23, [x15, #-1]! // load preceding ref byte
229 cmp w22, w23
230 b.eq L_found_valid_match_expand_backward_loop
231 add x13, x13, #1 // revert x13, last compare didn't match
232
233 // this block writes the match into dst
234 // it writes the L/M token [extra L tokens] [literals] <2-byte dist> [extra M tokens]
235 // it updates src & dst positions and proceeds to L_search_next_available_match
236 L_found_valid_match_emit_match:
237 sub x21, x20, x13 // match_length = match_end - match_begin
238 sub x21, x21, #4 // match_length - 4 (first 4 bytes are guaranteed)
239 sub x22, x13, x11 // literals_length = match_begin - src
240 sub x26, x10, x9 // dst_remaining_space = dst_end - dst
241 sub x26, x26, x22 // dst_remaining_space -= literals_length
242 subs x26, x26, #3 // dst_remaining_space -= 2_dist_bytes + L/M_token
243 b.lo L_done // exit if dst isn't sufficient
244
245 and x23, x21, #0xf // store M 4 LSbits
246 add x23, x23, x22, lsl #4 // add L 4 LSbits
247 add x15, x9, #1 // tmp_dst = dst + 1
248 cmp x22, #15 // if L >= 15 need to write more L tokens
249 b.lo L_found_valid_match_copy_literals
250 orr x23, x23, #0xf0 // update L/M token to be 0xfM
251 sub x24, x22, #15 // reduce 15 from number_of_literals
252 sub x26, x26, #1 // check if there is space for the extra L token -- NOTE(review): 'sub' does not set flags, so the b.lo below tests the flags of 'cmp x22, #15' above (here L >= 15, so it is never taken); 'subs' may have been intended -- confirm
253 b.lo L_done
254 cmp x24, #255 // check if need to compute number of 255 tokens
255 b.lo L_found_valid_match_skip_L_255_tokens
256 umull x25, w24, w28 // x25 - (literals_to_token * 1_DIV_255_magic_number)
257 lsr x25, x25, #39 // x25 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
258 subs x26, x26, x25 // check if there is sufficient space for the 255_tokens
259 b.lo L_done
260 mov x13, #255
261 umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
262 L_found_valid_match_L_255_tokens_loop:
263 str q1, [x15], #16 // store 16 255-tokens into dst_tmp. safe to store because dst has safety_margin
264 subs x25, x25, #16 // check if there are any 255-tokens left after the current 16
265 b.hi L_found_valid_match_L_255_tokens_loop
266 add x15, x15, x25 // revert tmp_dst if written too many 255-tokens (x25 is <= 0 here).
267 L_found_valid_match_skip_L_255_tokens:
268 strb w24, [x15], #1 // write last L token
269 L_found_valid_match_copy_literals:
270 ldr q0, [x11], #16 // load current 16 literals. (safe because src_end has safety margin)
271 str q0, [x15], #16 // store current 16 literals. (safe because dst_end has safety margin)
272 subs x22, x22, #16
273 b.gt L_found_valid_match_copy_literals
274 add x15, x15, x22 // revert tmp_dst if written too many literals
275 strh w19, [x15], #2 // store the 2 dist bytes (little-endian)
276 cmp x21, #15 // if M >= 15 need to write more M tokens
277 b.lo L_found_valid_match_finish_writing_match
278 orr x23, x23, #0xf // update L/M token to be 0xLf
279 sub x24, x21, #15 // reduce 15 from match_length
280 sub x26, x26, #1 // check if there is space for the extra M token -- NOTE(review): 'sub' does not set flags, so the b.lo below tests the flags of 'cmp x21, #15' above (here M >= 15, so it is never taken); 'subs' may have been intended -- confirm
281 b.lo L_done
282 cmp x24, #255 // check if need to compute number of 255 tokens
283 b.lo L_found_valid_match_skip_M_255_tokens
284 umull x25, w24, w28 // x25 - (match_length * 1_DIV_255_magic_number)
285 lsr x25, x25, #39 // x25 - number_of_255_tokens = (match_length * 1_DIV_255_magic_number)>>39
286 subs x26, x26, x25 // check if there is sufficient space for the 255_tokens
287 b.lo L_done
288 mov x13, #255
289 umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = match_length_to_token - (number_of_255_tokens*255)
290 L_found_valid_match_M_255_tokens_loop:
291 str q1, [x15], #16 // store 16 255-tokens into dst_tmp. safe to store because dst has safety_margin
292 subs x25, x25, #16 // check if there are any 255-tokens left after the current 16
293 b.hi L_found_valid_match_M_255_tokens_loop
294 add x15, x15, x25 // revert tmp_dst if written too many 255-tokens (x25 is <= 0 here).
295 L_found_valid_match_skip_M_255_tokens:
296 strb w24, [x15], #1 // write last M token
297 L_found_valid_match_finish_writing_match:
298 strb w23, [x9] // store the first (L/M) token of the match in dst
299 mov x9, x15 // update dst to last position written
300 mov x11, x20 // update src to match_end (last byte that was encoded)
301 cmp x11, x12 // check if src reached src_end
302 ccmp x9, x10, #9, lt // check if dst reached dst_end; if src already passed src_end, force NZCV=9 so 'ge' is taken
303 b.ge L_trailing_literals
304 b L_search_next_available_match
305 // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
306 // attempted to hash three quad values from the end of each emitted match;
307 // this ended up being slower and giving less compression (???)
308 // this block sets match_begin and pos for the next hash search and
309 // computes the hash values for the last 3 bytes of the currently emitted match
310 // only these hashes need computing because the other "quads" were hashed when the original
311 // data was read.
312
313 L_try_next_matchs:
314 add x13, x13, #1 // move to next match
315 add x14, x14, #1 // update next match pos
316 cmp x13, x12 // check match_begin didn't reach src_end
317 b.lo L_hash_match
318
319 L_trailing_literals:
320 // unless skip_final_literals is set,
321 // write the trailing bytes as literals
322 // trailing bytes include the whole remaining src (with the safety margin)
323 // need to verify the whole dst (with the safety margin) has sufficient space
324
325 tst x6, x6
326 b.ne L_done // if skip_final_literals is set skip writing them
327
328 add x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin back (x12 = true src_end)
329 subs x13, x12, x11 // remaining_src
330 b.eq L_done // finish if there are 0 trailing literals
331
332 add x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin back (x10 = true dst_end)
333 sub x14, x10, x9 // remaining dst (dst_end - dst)
334 sub x14, x14, #1 // at least 1 byte is needed to write the literals token
335 subs x14, x14, x13 // finish if dst can't contain all remaining literals + 1 literals token
336 b.le L_done // (need to verify that it has room for literals tokens)
337
338 cmp x13, #15
339 b.lt L_trailing_literals_store_less_than_15_literals
340 subs x14, x14, #1 // 1 extra byte is needed for literals tokens
341 b.mi L_done
342 mov w15, #0xf0
343 strb w15, [x9], #1 // write literals first token (Important!!! if 255-tokens exist but dst isn't sufficient, need to revert dst by 1)
344 sub x15, x13, #15
345 cmp x15, #255
346 b.lo L_trailing_literals_no_255_tokens
347 umull x19, w15, w28 // x19 - (literals_to_token * 1_DIV_255_magic_number)
348 lsr x19, x19, #39 // x19 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
349 subs x14, x14, x19
350 b.mi L_revert_x9_and_done
351 mov x26, #255
352 umsubl x15, w26, w19, x15 // x15 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
353 L_tariling_literals_write_16_255_tokens:
354 str q1, [x9], #16 // store 16 255-tokens each iteration (this is safe because there is space for 15 or more literals + remainder token)
355 subs x19, x19, #16
356 b.gt L_tariling_literals_write_16_255_tokens
357 add x9, x9, x19 // fixes dst to the actual number of tokens (x19 might not be a multiple of 16)
358 L_trailing_literals_no_255_tokens:
359 strb w15, [x9], #1 // store remainder_token
360 lsr x14, x13, #4 // check if there are 16 or more literals left to be written
361 tst x14, x14
362 b.eq L_trailing_literals_copy_less_than_16_literals
363 L_trailing_literals_copy_16_literals:
364 ldr q0, [x11], #16 // load current_16_literals
365 str q0, [ x9], #16 // *dst16++ = current_16_literals
366 subs x14, x14, #1
367 b.gt L_trailing_literals_copy_16_literals
368 cmp x11, x12
369 b.lo L_trailing_literals_copy_less_than_16_literals
370 b L_done
371
372 L_trailing_literals_store_less_than_15_literals:
373 lsl x14, x13, #4 // literals_only_token is 0xL0 (where L is 4 bits)
374 strb w14, [x9], #1 // *dst++ = literals_only_token
375 L_trailing_literals_copy_less_than_16_literals:
376 ldrb w13, [x11], #1 // load current_literal
377 strb w13, [ x9], #1 // *dst++ = current_literal
378 cmp x11, x12
379 b.lo L_trailing_literals_copy_less_than_16_literals
380
381 // this block updates the caller's dst & src pointers and removes the frame
382 L_done:
383 str x9, [x0]
384 str x11, [x2]
385
386 ldp x27, x28, [sp], #16 // restore callee-saved registers
387 ldp x25, x26, [sp], #16
388 ldp x23, x24, [sp], #16
389 ldp x21, x22, [sp], #16
390 ldp x19, x20, [sp], #16
391
392 // clear frame
393 ldp fp, lr, [sp], #16
394 ret lr
395
396 L_revert_x9_and_done:
397 sub x9, x9, #1 // undo the literals-first-token byte written before the 255-token space check failed
398 b L_done
399
400 .p2align 2
401 L_constant:
402 .long LZ4_COMPRESS_HASH_MULTIPLY
403 .long 0x80808081 // magic multiplier: (x * 0x80808081) >> 39 == x / 255 for the ranges used here
404
405 #endif
406