/*
 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <vm/lz4_assembly_select.h>
#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARMV7

/* void lz4_encode_2gb(uint8_t ** dst_ptr,
                       size_t dst_size,
                       const uint8_t ** src_ptr,
                       const uint8_t * src_begin,
                       size_t src_size,
                       lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
                       int skip_final_literals) */
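
/* Each match found below is emitted as one LZ4 sequence (a summary of the
   standard LZ4 block format, added here for the reader's convenience):
       token byte: high nibble = literal length L, low nibble = match length - 4 (M)
       extra length bytes for L if L == 15 (a run of 0xff bytes, then a final byte < 0xff)
       L literal bytes
       2-byte little-endian match distance D
       extra length bytes for M if M == 15
   The names L, M, and D in the comments below refer to these fields. */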

.globl _lz4_encode_2gb
.syntax unified

#define dst_ptr  r0
#define dst_end  r1
#define src_ptr  r2
#define src_beg  r3
#define src_end  r4
#define table    r5
#define mch_ptr  r6
#define mch_dis  r8
#define mch_len  r9
#define mch_ref  r10

#define margin 128
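// (margin is subtracted from both buffer ends in establish_frame so the copy
//  and match-growth loops below may over-read and over-write by up to a vector
//  width without per-byte bounds checks; explicit checks then only happen at
//  candidate and sequence boundaries.)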

.macro establish_frame
    push   {r4-r7, lr}
    add    r7, sp, #12
    push   {r8-r11}
    ldrd   r4, r5, [sp, #36]
    push   {r0, r2}
    ldr    dst_ptr, [r0]
    ldr    src_ptr, [r2]
    subs   r1, r1, margin        // subtract safety margin from dst_size
    bls    L_done
    add    dst_end, dst_ptr, r1  // dst end - margin
    sub    r4, r4, margin        // subtract safety margin from src_size (src_size < margin is detected by check on mch_ptr in match_candidate_loop).
    add    src_end, src_ptr, r4  // src end - margin.
    vmov.i8 q1, #255             // vector of all 1s used to emit
.endm
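
// (Frame note, added: per the AAPCS the first four arguments arrive in r0-r3,
//  and the remaining three, src_size, hash_table, skip_final_literals, sit on
//  the stack. After the 36 bytes pushed above they are read at [sp, #36] and
//  [sp, #40]; once {r0, r2} are also pushed, skip_final_literals ends up at
//  [sp, #52], which is where L_trailing_literal reloads it.)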

.macro clear_frame_and_return
    pop    {r1, r3}
    str    dst_ptr, [r1]
    str    src_ptr, [r3]
    pop    {r8-r11}
    pop    {r4-r7, pc}
.endm

.p2align 4
_lz4_encode_2gb:
    establish_frame
L_next_match:
    // Start searching for the next match, starting at the end of the last one.
    // [We begin with mch_ptr = src_ptr - 1 because we pre-increment mch_ptr
    // within the search loop itself]. Load the hash magic number in lr, and
    // zero out mch_len (when we find a match, its length will initially be
    // four, but we actually work with the match length minus four at all times).
    ldr    lr, L_hash
    sub    mch_ptr, src_ptr, #1

L_match_candidate_loop:
    // If mch_ptr >= src_end, we are near the end of the source buffer (remember
    // that we subtracted margin from src_end, so we are *not* actually past the
    // end just yet).
    cmp    mch_ptr, src_end
    bhs    L_trailing_literal

    // Load the four-byte word starting at mch_ptr, and get the address of the
    // corresponding row of the hash table.
    ldr    r9, [mch_ptr, #1]!
    sub    r8, mch_ptr, src_beg
    mul    r12, r9, lr
    mvn    r10, #0x7
    and    r12, r10, r12, lsr #17   // byte offset of table entry.
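    // (Roughly, in C: offset = ((value * 2654435761U) >> 17) & ~7U, a
    //  multiplicative hash selecting one 8-byte {offset, value} row of the
    //  table; the implied row count is an inference from the shift, not
    //  something this file states.)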

    // Load offset and word from hash table row, then update with the new offset
    // and word that we just computed.
    ldrd   r10, r11, [table, r12]
    strd   r8, r9, [table, r12]

    // At this point, we only know that the hashes of the words match; check to
    // see if the words themselves match. If not, move on to the next candidate.
    cmp    r9, r11
    bne    L_match_candidate_loop

    // It's not enough for the words to match; the match distance must also be
    // in the representable range (i.e. less than 0x10000).
    sub    mch_dis, r8, r10
    add    mch_ref, src_beg, r10
    cmp    mch_dis, #0x10000
    bhs    L_match_candidate_loop

    // We have found a match; registers at this point are as follows:
    //
    //  register    symbolic name   meaning
    //  r0          dst_ptr         pointer into destination buffer where the
    //                              match information will be stored.
    //  r1          dst_end         pointer to the end of the destination buffer,
    //                              less margin.
    //  r2          src_ptr         pointer to the byte after the last match that
    //                              we found, or to the point from which we
    //                              started searching if this is the first match.
    //  r3          src_beg         pointer to the actual start of the buffer.
    //  r4          src_end         pointer to the end of the source buffer, less
    //                              margin.
    //  r5          table           address of hash table.
    //  r6          mch_ptr         pointer to match.
    //  r8          mch_dis         match distance ("D")
    //  r9          mch_len         length of match less four ("M")
    //  r10         mch_ref         pointer to match reference.
    //  r11         -
    //  r12         -
    //  lr          -
    //
    // Attempt to grow the match backwards (typically we only grow backwards by
    // a byte or two, if at all, so we use a byte-by-byte scan).
    eor    mch_len, mch_len
0:  cmp    mch_ref, src_beg
    cmpne  mch_ptr, src_ptr
    beq    1f
    ldrb   r11, [mch_ref, #-1]
    ldrb   r12, [mch_ptr, #-1]
    cmp    r11, r12
    bne    1f
    sub    mch_ref, #1
    sub    mch_ptr, #1
    add    mch_len, #1
    b      0b

    // Now that we have the start of the match, we can compute the literal
    // length. Then advance the mch and ref pointers to the end of the match
    // and its reference. Because mch_len is the real match length minus four,
    // we actually advance to four before the end of the match, but our loop
    // to grow the matches uses pre-incremented loads with writeback, so this
    // works out correctly.
#define lit_len lr
1:  sub    lit_len, mch_ptr, src_ptr
    add    mch_ptr, mch_len
    add    mch_ref, mch_len

    // Now attempt to grow the match forwards. This is much more common, and
    // there is a safety margin at the end of the buffer, so we grow forwards
    // in four-byte chunks.
0:  ldr    r11, [mch_ptr, #4]!
    ldr    r12, [mch_ref, #4]!
    eors   r11, r12
    bne    1f
    add    mch_len, #4
    cmp    mch_ptr, src_end
    blo    0b
    b      L_emit_match
    // At least one of the bytes in the last comparison did not match. Identify
    // which byte had the mismatch and compute the final length (less four).
1:  rev    r11, r11
    clz    r11, r11
    add    mch_len, r11, lsr #3
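    // (rev + clz counts how many low-order bytes of the xor are zero, i.e. how
    //  many of the four freshly compared bytes still matched; the lsr #3 in the
    //  add converts that bit count into a byte count.)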

L_emit_match:
    // Time to emit what we've found!
    //
    //  register    symbolic name   meaning
    //  r0          dst_ptr         pointer into destination buffer where the
    //                              match information will be stored.
    //  r1          dst_end         pointer to the end of the destination buffer,
    //                              less margin.
    //  r2          src_ptr         pointer to the byte after the last match that
    //                              we found, or to the point from which we
    //                              started searching if this is the first match.
    //  r3          src_beg         pointer to the actual start of the buffer.
    //  r4          src_end         pointer to the end of the source buffer, less
    //                              margin.
    //  r5          table           address of hash table.
    //  r6          -
    //  r8          mch_dis         match distance ("D")
    //  r9          mch_len         length of match ("M")
    //  r10         -
    //  r11         -
    //  r12         -
    //  lr          lit_len         literal length ("L")
    //  q1                          vector of all ones
    //
    // Synthesize control byte under the assumption that L and M are both less
    // than 15, jumping out of the fast path if one of them is not.
    cmp    lit_len, #15
    orr    r10, mch_len, lit_len, lsl #4
    cmplo  mch_len, #15
    bhs    L_emit_careful
    // L and M are both less than 15, which means (a) we use the most compact
    // encoding for the match and (b) we do not need to do a bounds check on
    // the destination buffer before writing, only before continuing our search.
    // Store the command byte.
    strb   r10, [dst_ptr], #1
    // Copy literal.
    vld1.8 q0, [src_ptr]
    add    src_ptr, lit_len
    vst1.8 q0, [dst_ptr]
    add    dst_ptr, lit_len
    // Restore "true" match length before updating src_ptr.
    add    mch_len, #4
    // Store match distance (D) and update the source pointer.
    strh   r8, [dst_ptr], #2
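    // (On this little-endian target, strh emits D exactly as the 2-byte
    //  little-endian offset field the LZ4 format calls for.)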
    add    src_ptr, mch_len
    // If we're not into the safety margin of the destination buffer, look for
    // another match.
    cmp    dst_ptr, dst_end
    blo    L_next_match
    // If we *are* into the safety margin of the destination buffer, we're done
    // encoding this block; update the source and destination pointers and
    // return.
L_done:
    clear_frame_and_return

    // Constant island
L_hash:  .long 2654435761
L_magic: .long 0x80808081
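    // (L_hash is 0x9E3779B1, the multiplicative-hash prime LZ4 uses; L_magic is
    //  the reciprocal constant for unsigned division by 255, used below when a
    //  length does not fit in its 4-bit token nibble.)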

L_emit_careful:
    // Either L or M is >= 15, which means that we don't get to use the compact
    // encoding, and that we need to do extra bounds checks while writing.
    //
    //  register    symbolic name   meaning
    //  r0          dst_ptr         pointer into destination buffer where the
    //                              match information will be stored.
    //  r1          dst_end         pointer to the end of the destination buffer,
    //                              less margin.
    //  r2          src_ptr         pointer to the byte after the last match that
    //                              we found, or to the point from which we
    //                              started searching if this is the first match.
    //  r3          src_beg         pointer to the actual start of the buffer.
    //  r4          src_end         pointer to the end of the source buffer, less
    //                              margin.
    //  r5          table           address of hash table.
    //  r6          -
    //  r8          mch_dis         match distance ("D")
    //  r9          mch_len         length of match ("M") less four
    //  r10         -
    //  r11         -
    //  r12         -
    //  lr          lit_len         literal length ("L")
    //  q1                          vector of all ones
    //
    // Start by creating the low 4 bits of the control word; M if M < 15, 0xf
    // otherwise. We also load 0x80808081, which is the magic number for
    // division by 255; this will be required later on.
    ldr    r12, L_magic
    cmp    mch_len, #15
    mov    r10, mch_len
    movhs  r10, #0x0f
    subs   r6, lit_len, #15
    bhs    L_large_L
    // M is large, but L is < 15. This means we can use the simple approach
    // for copying the literal with no bounds checks.
    orr    r10, lit_len, lsl #4
    strb   r10, [dst_ptr], #1
    // Copy literal.
    vld1.8 q0, [src_ptr]
    add    src_ptr, lit_len
    vst1.8 q0, [dst_ptr]
    add    dst_ptr, lit_len
    // Store match distance (D).
    strh   r8, [dst_ptr], #2
    sub    r6, mch_len, #15
    b      L_large_M

L_large_L:
    // L is large, M may or may not be. We need to encode extra literal length
    // bytes, and we need to do bounds checks while storing both those bytes and
    // the literal itself.
    orr    r10, #0xf0
    strb   r10, [dst_ptr], #1
    // How many extra literal bytes do we need to store? We need to store
    // (L - 15)/255 extra literal bytes of 0xff, plus one more byte that is
    // (L - 15)%255. Get these quantities via magic number multiplication:
    //   (L - 15)*0x80808081 >> (32 + 7)
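    // (Worked example, added: L = 300 gives L - 15 = 285, so one 0xff byte
    //  (285 / 255 = 1) followed by a final byte of 30 (285 % 255); the decoder
    //  reconstructs 15 + 255 + 30 = 300.)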
    umull  r10, r11, r6, r12
    mov    r12, #255
    lsr    r10, r11, #7         // (L - 15) / 255
    mls    r11, r10, r12, r6    // (L - 15) % 255
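    // (Equivalent C sketch: q = (uint32_t)(((uint64_t)n * 0x80808081u) >> 39);
    //  r = n - 255 * q; valid for any 32-bit n because 0x80808081 is
    //  ceil(2^39 / 255).)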
    ldr    r12, L_magic         // may need magic number again for M.
    // Compute address dst_ptr will have after storing all 0xff bytes, and
    // check that we won't exceed dst_end in doing so.
    add    r10, dst_ptr, r10
    cmp    r10, dst_end
    bhs    L_done
    // There's enough space for all the 0xff bytes, so go ahead and store them.
0:  vst1.8 q1, [dst_ptr]!
    cmp    dst_ptr, r10
    blo    0b
    // Store the (L - 15) % 255 byte.
    strb   r11, [r10], #1
    // Compute the address we'll have reached after storing all literal bytes.
    // If that passes dst_end, we're done.
    add    dst_ptr, r10, lit_len
    cmp    dst_ptr, dst_end
    bhs    L_done
    // Copy the literal.
0:  vld1.8 q0, [src_ptr]!
    vst1.8 q0, [r10]!
    subs   r6, r10, dst_ptr
    blo    0b
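    // (r6 now holds the copy overshoot, 0..15 bytes, since the loop above moves
    //  data in 16-byte chunks; src_ptr is pulled back by that amount below.)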
    // Fixup src_ptr, store match distance (D), and check whether or not M is
    // bigger than 14. If not, go find the next match.
    strh   r8, [dst_ptr], #2
    sub    src_ptr, r6
    subs   r6, mch_len, #15
    bhs    L_large_M
    // M is small, so we're all done; we just need to update the source pointer
    // and we can go look for the next match.
    add    mch_len, #4
    add    src_ptr, mch_len
    b      L_next_match

L_large_M:
    // Just like with large L, we split (M - 15) into (M - 15) / 255 and
    // (M - 15) % 255 via magic number multiply.
    umull  r10, r11, r6, r12
    mov    r12, #255
    lsr    r10, r11, #7         // (M - 15) / 255
    mls    r11, r10, r12, r6    // (M - 15) % 255
    // Compute address dst_ptr will have after storing all 0xff bytes, and
    // check that we won't exceed dst_end in doing so.
    add    r10, dst_ptr, r10
    cmp    r10, dst_end
    bhs    L_done
    // There's enough space for all the 0xff bytes, so go ahead and store them.
0:  vst1.8 q1, [dst_ptr]!
    cmp    dst_ptr, r10
    blo    0b
    // Store final M % 255 byte, update dst_ptr and src_ptr, and look for next
    // match.
    strb   r11, [r10]
    add    mch_len, #4
    add    dst_ptr, r10, #1
    add    src_ptr, mch_len
    b      L_next_match

L_trailing_literal:
    // Check if skip_final_literals is set.
    ldr    r5, [sp, #52]
    tst    r5, r5
    bne    L_done
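    // (skip_final_literals is the seventh argument; see the frame note after
    //  establish_frame for why it is found at [sp, #52] here.)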
    // Emit a trailing literal that covers the remainder of the source buffer,
    // if we can do so without exceeding the bounds of the destination buffer.
    add    src_end, margin
    sub    lit_len, src_end, src_ptr
    subs   r6, lit_len, #15
    bhs    L_trailing_literal_long
    lsl    r10, lit_len, #4
    strb   r10, [dst_ptr], #1
    vld1.8 q0, [src_ptr]
    mov    src_ptr, src_end
    vst1.8 q0, [dst_ptr]
    add    dst_ptr, lit_len
    b      L_done

L_trailing_literal_long:
    ldr    r12, L_magic
    mov    r10, #0xf0
    add    dst_end, margin
    strb   r10, [dst_ptr], #1
    umull  r10, r11, r6, r12
    mov    r12, #255
    lsr    r10, r11, #7         // (L - 15) / 255
    mls    r11, r10, r12, r6    // (L - 15) % 255
    // We want to write out lit_len + (L - 15)/255 + 1 bytes. Check if we have
    // space for all of them.
    add    r10, dst_ptr
    add    r12, r10, lit_len
    cmp    r12, dst_end
    bhs    L_done
    // We have enough space, so go ahead and write them all out. Because we
    // know that we have enough space, and that the literal is at least 15 bytes,
    // we can write the block of 0xffs using vector stores, even without a
    // safety margin.
0:  vst1.8 q1, [dst_ptr]!
    cmp    dst_ptr, r10
    blo    0b
    // Store the (L - 15) % 255 byte.
    strb   r11, [r10], #1
    mov    dst_ptr, r10
    // Now store the literal itself; here we need to actually be somewhat
    // careful to ensure that we don't write past the end of the destination
    // buffer or read past the end of the source buffer.
    subs   lit_len, #16
    blo    1f
0:  vld1.8 q0, [src_ptr]!
    subs   lit_len, #16
    vst1.8 q0, [dst_ptr]!
    bhs    0b
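    // (The loop above copies whole 16-byte chunks while at least 16 literal
    //  bytes remain; the byte loop below finishes the 1..15-byte tail, and a
    //  zero-length tail exits straight to L_done.)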
1:  adds   lit_len, #16
    beq    L_done
2:  ldrb   r6, [src_ptr], #1
    subs   lit_len, #1
    strb   r6, [dst_ptr], #1
    bne    2b
    b      L_done

#endif