]>
Commit | Line | Data |
---|---|---|
5ba3f43e A |
1 | /* |
2 | * Copyright (c) 2016-2016 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | ||
29 | #include <vm/lz4_assembly_select.h> | |
30 | #include <vm/lz4_constants.h> | |
31 | ||
32 | #if LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64 | |
33 | ||
34 | /* void lz4_encode_2gb(uint8_t ** dst_ptr, | |
35 | size_t dst_size, | |
36 | const uint8_t ** src_ptr, | |
37 | const uint8_t * src_begin, | |
38 | size_t src_size, | |
39 | lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES], | |
40 | int skip_final_literals) */ | |
41 | ||
42 | .globl _lz4_encode_2gb | |
43 | ||
44 | #define dst_ptr x0 | |
45 | #define dst_size x1 | |
46 | #define src_ptr x2 | |
47 | #define src_begin x3 | |
48 | #define src_size x4 | |
49 | #define hash_table x5 | |
50 | #define skip_final_literals x6 | |
51 | ||
52 | .text | |
53 | .p2align 4 | |
54 | _lz4_encode_2gb: | |
55 | ||
56 | // esteblish frame | |
57 | stp fp, lr, [sp, #-16]! | |
58 | mov fp, sp | |
59 | ||
60 | stp x19, x20, [sp, #-16]! | |
61 | stp x21, x22, [sp, #-16]! | |
62 | stp x23, x24, [sp, #-16]! | |
63 | stp x25, x26, [sp, #-16]! | |
64 | stp x27, x28, [sp, #-16]! | |
65 | ||
66 | // constant registers | |
67 | adr x7, L_constant | |
68 | ldr w28, [x7, #4] // x28 = 0x80808081 (magic number to cmopute 1/255) | |
69 | ldr w7, [x7] // x7 = LZ4_COMPRESS_HASH_MULTIPLY | |
70 | mov x27, #-1 // x27 = 0xffffffffffffffff | |
71 | dup.4s v1, w27 // q1 = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff} | |
72 | ||
73 | ||
74 | // x9 - is current dst | |
75 | // x10 - dst_end - safety_margin | |
76 | ldr x9, [x0] // dst | |
77 | add x10, x9, x1 // dst_end | |
78 | sub x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // dst_end - safety_margin | |
79 | cmp x10, x9 // if dst_size < safety_margin abort | |
80 | b.lt L_done | |
81 | ||
82 | // x11 - is current src | |
83 | // x12 - is src_end - safety margin | |
84 | ldr x11, [x2] // src | |
85 | add x12, x11, x4 // src_end | |
86 | sub x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // src_end - safety_margin | |
87 | cmp x12, x11 // if src_size < safety_margin skip to trailing_literals | |
88 | b.lt L_trailing_literals | |
89 | ||
90 | ||
91 | // this block search for the next available match | |
92 | // set match_begin to current src (which is also where last match ended) | |
93 | L_search_next_available_match: | |
94 | mov x13, x11 // match_begin = src | |
95 | sub x14, x13, x3 // match_postion = match_begin - src_begin | |
96 | ||
97 | // compute hash value for the next 5 "quads" | |
98 | // hash distance need to be 0 < D < 0x10000 | |
99 | ||
100 | L_hash_match: | |
101 | ldr x15, [x13] // match_first_4_bytes | |
102 | umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY | |
103 | lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index | |
104 | add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index) | |
105 | ||
106 | ldp w19, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos) | |
107 | stp w14, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes) | |
108 | ||
109 | add x26, x14, #1 // next_match pos | |
110 | lsr x25, x15, #8 // next_match_first_4_bytes | |
111 | umull x21, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY | |
112 | lsr w21, w21, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index | |
113 | add x21, x5, x21, lsl #3 // hash_table_entry ptr (hash + 8*index) | |
114 | ||
115 | ldp w23, w24, [x21] // read entry values (w23 - pos, w24 - 4 bytes at pos) | |
116 | stp w26, w25, [x21] // write entry values (w26 - next pos, w25 - next 4 bytes) | |
117 | ||
118 | cmp w15, w22 | |
119 | b.ne L_try_next_match_0 // compare the 4 bytes to see if there is a match | |
120 | sub w19, w14, w19 // x19 - match_dist (current_pos - match_pos) | |
121 | cmp w19, #0x10000 | |
122 | ccmp w19, #0, #0xf, lo | |
123 | b.eq L_try_next_match_0 // verify the 0 < dist < 0x10000 | |
124 | b L_found_valid_match | |
125 | ||
126 | L_try_next_match_0: | |
127 | add x13, x13, #1 | |
128 | add x14, x14, #1 | |
129 | ||
130 | add x26, x14, #1 // next_match pos | |
131 | lsr x15, x15, #16 // next_match_first_4_bytes | |
132 | umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY | |
133 | lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index | |
134 | add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index) | |
135 | ||
136 | ldp w21, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos) | |
137 | stp w26, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes) | |
138 | ||
139 | cmp w25, w24 | |
140 | b.ne L_try_next_match_1 // compare the 4 bytes to see if there is a match | |
141 | sub w19, w14, w23 // x19 - match_dist (current_pos - match_pos) | |
142 | cmp w19, #0x10000 | |
143 | ccmp w19, #0, #0xf, lo | |
144 | b.eq L_try_next_match_1 // verify the 0 < dist < 0x10000 | |
145 | b L_found_valid_match | |
146 | ||
147 | L_try_next_match_1: | |
148 | add x13, x13, #1 | |
149 | add x14, x14, #1 | |
150 | ||
151 | add x26, x14, #1 // next_match pos | |
152 | lsr x25, x15, #8 // next_match_first_4_bytes | |
153 | umull x20, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY | |
154 | lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index | |
155 | add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index) | |
156 | ||
157 | ldp w23, w24, [x20] // read entry values (w23 - pos, w24 - 4 bytes at pos) | |
158 | stp w26, w25, [x20] // write entry values (w26 - next pos, w25 - next 4 bytes) | |
159 | ||
160 | cmp w15, w22 | |
161 | b.ne L_try_next_match_2 // compare the 4 bytes to see if there is a match | |
162 | sub w19, w14, w21 // x19 - match_dist (current_pos - match_pos) | |
163 | cmp w19, #0x10000 | |
164 | ccmp w19, #0, #0xf, lo | |
165 | b.eq L_try_next_match_2 // verify the 0 < dist < 0x10000 | |
166 | b L_found_valid_match | |
167 | ||
168 | L_try_next_match_2: | |
169 | add x13, x13, #1 | |
170 | add x14, x14, #1 | |
171 | ||
172 | add x26, x14, #1 // next_match pos | |
173 | lsr x15, x15, #16 // next_match_first_4_bytes | |
174 | umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY | |
175 | lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index | |
176 | add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index) | |
177 | ||
178 | ldp w21, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos) | |
179 | stp w26, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes) | |
180 | ||
181 | cmp w25, w24 | |
182 | b.ne L_try_next_match_3 // compare the 4 bytes to see if there is a match | |
183 | sub w19, w14, w23 // x19 - match_dist (current_pos - match_pos) | |
184 | cmp w19, #0x10000 | |
185 | ccmp w19, #0, #0xf, lo | |
186 | b.eq L_try_next_match_3 // verify the 0 < dist < 0x10000 | |
187 | b L_found_valid_match | |
188 | ||
189 | L_try_next_match_3: | |
190 | add x13, x13, #1 | |
191 | add x14, x14, #1 | |
192 | ||
193 | cmp w15, w22 | |
194 | b.ne L_try_next_matchs // compare the 4 bytes to see if there is a match | |
195 | sub w19, w14, w21 // x19 - match_dist (current_pos - match_pos) | |
196 | cmp w19, #0x10000 | |
197 | ccmp w19, #0, #0xf, lo | |
198 | b.eq L_try_next_matchs // verify the 0 < dist < 0x10000 | |
199 | b L_found_valid_match | |
200 | ||
201 | // this block exapnd the valid match as much as possible | |
202 | // first it try to expand the match forward | |
203 | // next it try to expand the match backword | |
204 | L_found_valid_match: | |
205 | add x20, x13, #4 // match_end = match_begin+4 (already confirmd the first 4 bytes) | |
206 | sub x21, x20, x19 // ref_end = match_end - dist | |
207 | L_found_valid_match_expand_forward_loop: | |
208 | ldr x22, [x20], #8 // load match_current_8_bytes (safe to load becasue of safety margin) | |
209 | ldr x23, [x21], #8 // load ref_current_8_bytes | |
210 | cmp x22, x23 | |
211 | b.ne L_found_valid_match_expand_forward_partial | |
212 | cmp x20, x12 // check if match_end reached src_end | |
213 | b.lo L_found_valid_match_expand_forward_loop | |
214 | b L_found_valid_match_expand_backward | |
215 | L_found_valid_match_expand_forward_partial: | |
216 | sub x20, x20, #8 // revert match_end by 8 and compute actual match of current 8 bytes | |
217 | eor x22, x22, x23 // compare the bits using xor | |
218 | rbit x22, x22 // revert the bits to use clz (the none equivalent bytes would have at least 1 set bit) | |
219 | clz x22, x22 // after the revrse for every equal prefix byte clz would count 8 | |
220 | add x20, x20, x22, lsr #3 // add the actual number of matching bytes is (clz result)>>3 | |
221 | L_found_valid_match_expand_backward: | |
222 | sub x15, x13, x19 // ref_begin = match_begin - dist | |
223 | L_found_valid_match_expand_backward_loop: | |
224 | cmp x13, x11 // check if match_begin reached src (previous match end) | |
225 | ccmp x15, x3, #0xd, gt // check if ref_begin reached src_begin | |
226 | b.le L_found_valid_match_emit_match | |
227 | ldrb w22, [x13, #-1]! // load match_current_8_bytes (safe to load becasue of safety margin) | |
228 | ldrb w23, [x15, #-1]! // load ref_current_8_bytes | |
229 | cmp w22, w23 | |
230 | b.eq L_found_valid_match_expand_backward_loop | |
231 | add x13, x13, #1 // revert x13, last compare didn't match | |
232 | ||
233 | // this block write the match into dst | |
234 | // it write the ML token [extra L tokens] [literals] <2byte dist> [extar M tokens] | |
235 | // it update src & dst positions and progress to L_search_next_available_match | |
236 | L_found_valid_match_emit_match: | |
237 | sub x21, x20, x13 // match_length - match_end - match_begin | |
238 | sub x21, x21, #4 // match_length - 4 (first 4 bytes are guaranteed) | |
239 | sub x22, x13, x11 // literals_length = match_begin - src // compute | |
240 | sub x26, x10, x9 // dst_remaining_space = dst_end - dst | |
241 | sub x26, x26, x22 // dst_remaining_space -= literals_length | |
242 | subs x26, x26, #3 // dst_remaining_space -= 2_dist_bytes + L/M_token | |
243 | b.lo L_done // exit if dst isn't sufficent | |
244 | ||
245 | and x23, x21, #0xf // store M 4 LSbits | |
246 | add x23, x23, x22, lsl #4 // add L 4 LSbits | |
247 | add x15, x9, #1 // tmp_dst = dst + 1 | |
248 | cmp x22, #15 // if L >= 15 need to write more L tokens | |
249 | b.lo L_found_valid_match_copy_literals | |
250 | orr x23, x23, #0xf0 // update L/M token to be 0xfM | |
251 | sub x24, x22, #15 // reduce 15 from number_of_literals | |
252 | sub x26, x26, #1 // check if there is space for the extra L token | |
253 | b.lo L_done | |
254 | cmp x24, #255 // check if need to compute number of 255 tokens | |
255 | b.lo L_found_valid_match_skip_L_255_tokens | |
256 | umull x25, w24, w28 // x25 - (literals_to_token * 1_DIV_255_magic_number) | |
257 | lsr x25, x25, #39 // x25 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39 | |
258 | subs x26, x26, x25 // check if there is sufficent space for the 255_tokens | |
259 | b.lo L_done | |
260 | mov x13, #255 | |
261 | umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255) | |
262 | L_found_valid_match_L_255_tokens_loop: | |
263 | str q1, [x15], #16 // store 16 255 tokens into dst_tmp. safe to store because dst has safety_margin | |
264 | subs x25, x25, #16 // check if there are any 255 token left after current 16 | |
265 | b.hi L_found_valid_match_L_255_tokens_loop | |
266 | add x15, x15, x25 // revert tmp_dst if written too many 255 tokens. | |
267 | L_found_valid_match_skip_L_255_tokens: | |
268 | strb w24, [x15], #1 // write last L token | |
269 | L_found_valid_match_copy_literals: | |
270 | ldr q0, [x11], #16 // load current 16 literals. (safe becasue src_end has safety margin) | |
271 | str q0, [x15], #16 // store current 16 literals. (safe becasue dst_end has safety margin) | |
272 | subs x22, x22, #16 | |
273 | b.gt L_found_valid_match_copy_literals | |
274 | add x15, x15, x22 // revert tmp_dst if written too many literals | |
275 | strh w19, [x15], #2 // store dist bytes | |
276 | cmp x21, #15 // if M >= 15 need to write more M tokens | |
277 | b.lo L_found_valid_match_finish_writing_match | |
278 | orr x23, x23, #0xf // update L/M token to be 0xLf | |
279 | sub x24, x21, #15 // reduce 15 from match_length | |
280 | sub x26, x26, #1 // check if there is space for the extra M token | |
281 | b.lo L_done | |
282 | cmp x24, #255 // check if need to compute number of 255 tokens | |
283 | b.lo L_found_valid_match_skip_M_255_tokens | |
284 | umull x25, w24, w28 // x25 - (match_length * 1_DIV_255_magic_number) | |
285 | lsr x25, x25, #39 // x25 - number_of_255_tokens = (match_length * 1_DIV_255_magic_number)>>39 | |
286 | subs x26, x26, x25 // check if there is sufficent space for the 255_tokens | |
287 | b.lo L_done | |
288 | mov x13, #255 | |
289 | umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = literals_to_token - (match_length*255) | |
290 | L_found_valid_match_M_255_tokens_loop: | |
291 | str q1, [x15], #16 // store 16 255 tokens into dst_tmp. safe to store because dst has safety_margin | |
292 | subs x25, x25, #16 // check if there are any 255 token left after current 16 | |
293 | b.hi L_found_valid_match_M_255_tokens_loop | |
294 | add x15, x15, x25 // revert tmp_dst if written too many 255 tokens. | |
295 | L_found_valid_match_skip_M_255_tokens: | |
296 | strb w24, [x15], #1 // write last M token | |
297 | L_found_valid_match_finish_writing_match: | |
298 | strb w23, [x9] // store first token of match in dst | |
299 | mov x9, x15 // update dst to last postion written | |
300 | mov x11, x20 // update src to match_end (last byte that was encoded) | |
301 | cmp x11, x12 // check if src reached src_end | |
302 | ccmp x9, x10, #9, lt // check if dst reached dst_end | |
303 | b.ge L_trailing_literals | |
304 | b L_search_next_available_match | |
305 | // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
306 | // attempted to hash three quad values from the end of each emited match | |
307 | // this eneded up being slower and less compression (???) | |
308 | // this block set match_begin and pos for next hash search and | |
309 | // compute the hash values for the last 3 bytes of currently emited match | |
310 | // only need to comute these hash becasue other "quads" were hashed when the original | |
311 | // data was read. | |
312 | ||
313 | L_try_next_matchs: | |
314 | add x13, x13, #1 // move to next match | |
315 | add x14, x14, #1 // update next match pos | |
316 | cmp x13, x12 // check match_begin didn't reach src_end | |
317 | b.lo L_hash_match | |
318 | ||
319 | L_trailing_literals: | |
320 | // unless skip_final_literals is set | |
321 | // write the trailing bytes as literals | |
322 | // traliing bytes include the whole src (with the safty margin) | |
323 | // need to verify whole dst (withthe safty margin) has sufficent space | |
324 | ||
325 | tst x6, x6 | |
326 | b.ne L_done // if skip_final_literals is set skip writing them | |
327 | ||
328 | add x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin | |
329 | subs x13, x12, x11 // remaining_src | |
330 | b.eq L_done // finish if there are 0 trailing literals | |
331 | ||
332 | add x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin | |
333 | sub x14, x10, x9 // remaining dst (dst_end - dst) | |
334 | sub x14, x14, #1 // 1 byte is needed at least to write literals token | |
335 | subs x14, x14, x13 // finish if dst can't contain all remaining literals + 1 literals token | |
336 | b.le L_done // (need to verify that it has room for literals tokens | |
337 | ||
338 | cmp x13, #15 | |
339 | b.lt L_trailing_literals_store_less_than_15_literals | |
340 | subs x14, x14, #1 // 1-extra byte is needed for literals tokens | |
341 | b.mi L_done | |
342 | mov w15, #0xf0 | |
343 | strb w15, [x9], #1 // write literals first token (Important !!! if 255 tokens exist but dst isn't sufficent need to revert dst by 1) | |
344 | sub x15, x13, #15 | |
345 | cmp x15, #255 | |
346 | b.lo L_trailing_literals_no_255_tokens | |
347 | umull x19, w15, w28 // x19 - (literals_to_token * 1_DIV_255_magic_number) | |
348 | lsr x19, x19, #39 // x19 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39 | |
349 | subs x14, x14, x19 | |
350 | b.mi L_revert_x9_and_done | |
351 | mov x26, #255 | |
352 | umsubl x15, w26, w19, x15 // x15 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255) | |
353 | L_tariling_literals_write_16_255_tokens: | |
354 | str q1, [x9], #16 // store 16 255 tokens each iteration (this is safe becasue there is space for 15 or more literals + remainder token) | |
355 | subs x19, x19, #16 | |
356 | b.gt L_tariling_literals_write_16_255_tokens | |
357 | add x9, x9, x19 // fixes dst to actual number of tokens (x19 might not be a mulitple of 16) | |
358 | L_trailing_literals_no_255_tokens: | |
359 | strb w15, [x9], #1 // store remainder_token | |
360 | lsr x14, x13, #4 // check if there are more than 16 literals left to be written | |
361 | tst x14, x14 | |
362 | b.eq L_trailing_literals_copy_less_than_16_literals | |
363 | L_trailing_literals_copy_16_literals: | |
364 | ldr q0, [x11], #16 // load current_16_literals | |
365 | str q0, [ x9], #16 // *dst16++ = current_16_literals | |
366 | subs x14, x14, #1 | |
367 | b.gt L_trailing_literals_copy_16_literals | |
368 | cmp x11, x12 | |
369 | b.lo L_trailing_literals_copy_less_than_16_literals | |
370 | b L_done | |
371 | ||
372 | L_trailing_literals_store_less_than_15_literals: | |
373 | lsl x14, x13, #4 // literals_only_token is 0xL0 (where L is 4 bits) | |
374 | strb w14, [x9], #1 // *dst++ = literals_only_token | |
375 | L_trailing_literals_copy_less_than_16_literals: | |
376 | ldrb w13, [x11], #1 // load current_literal | |
377 | strb w13, [ x9], #1 // *dst++ = current_literal | |
378 | cmp x11, x12 | |
379 | b.lo L_trailing_literals_copy_less_than_16_literals | |
380 | ||
381 | // this block upadte dst & src pointers and remove frame | |
382 | L_done: | |
383 | str x9, [x0] | |
384 | str x11, [x2] | |
385 | ||
386 | ldp x27, x28, [sp], #16 | |
387 | ldp x25, x26, [sp], #16 | |
388 | ldp x23, x24, [sp], #16 | |
389 | ldp x21, x22, [sp], #16 | |
390 | ldp x19, x20, [sp], #16 | |
391 | ||
392 | // clear frame | |
393 | ldp fp, lr, [sp], #16 | |
394 | ret lr | |
395 | ||
396 | L_revert_x9_and_done: | |
397 | sub x9, x9, #1 | |
398 | b L_done | |
399 | ||
400 | .p2align 2 | |
401 | L_constant: | |
402 | .long LZ4_COMPRESS_HASH_MULTIPLY | |
403 | .long 0x80808081 | |
404 | ||
405 | #endif | |
406 |