]> git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/arm/lz4_encode_armv7.s
xnu-4570.71.2.tar.gz
[apple/xnu.git] / osfmk / arm / lz4_encode_armv7.s
... / ...
CommitLineData
1/*
2 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <vm/lz4_assembly_select.h>
30#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARMV7
31
32/* void lz4_encode_2gb(uint8_t ** dst_ptr,
33 size_t dst_size,
34 const uint8_t ** src_ptr,
35 const uint8_t * src_begin,
36 size_t src_size,
37 lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
38 int skip_final_literals) */
39
40.globl _lz4_encode_2gb
41.syntax unified
42
43#define dst_ptr r0
44#define dst_end r1
45#define src_ptr r2
46#define src_beg r3
47#define src_end r4
48#define table r5
49#define mch_ptr r6
50#define mch_dis r8
51#define mch_len r9
52#define mch_ref r10
53
54#define margin 128
55
56.macro establish_frame
57 push {r4-r7, lr}
58 add r7, sp, #12
59 push {r8-r11}
60 ldrd r4, r5, [sp, #36]
61 push {r0, r2}
62 ldr dst_ptr, [r0]
63 ldr src_ptr, [r2]
64 subs r1, r1, margin // subtract safety margin from dst_size
65 bls L_done
66 add dst_end, dst_ptr, r1 // dst end - margin
67 sub r4, r4, margin // subtract safety margin from src_size (src_size < margin is detected by check on mch_ptr in match_candidate_loop).
68 add src_end, src_ptr, r4 // src end - margin.
69 vmov.i8 q1, #255 // vector of all 1s used to emit
70.endm
71
72.macro clear_frame_and_return
73 pop {r1, r3}
74 str dst_ptr, [r1]
75 str src_ptr, [r3]
76 pop {r8-r11}
77 pop {r4-r7, pc}
78.endm
79
80.p2align 4
81_lz4_encode_2gb:
82 establish_frame
83L_next_match:
84 // Start searching for the next match, starting at the end of the last one.
85 // [We begin with mch_ptr = src_ptr - 1 because we pre-increment mch_ptr
86 // within the search loop itself]. Load the hash magic number in lr, and
87 // zero out mch_len (when we find a match, its length will initially be
88 // four, but we actually work with the match length minus four at all times).
89 ldr lr, L_hash
90 sub mch_ptr, src_ptr, #1
91
92L_match_candidate_loop:
93 // If mch_ptr >= src_end, we are near the end of the source buffer (remember
94 // that we subtracted margin from src_end, so we are *not* actually past the
95 // end just yet).
96 cmp mch_ptr, src_end
97 bhs L_trailing_literal
98
99 // Load the four-byte word starting at mch_ptr, and get the address of the
100 // corresponding row of the hash table.
101 ldr r9, [mch_ptr, #1]!
102 sub r8, mch_ptr, src_beg
103 mul r12, r9, lr
104 mvn r10, #0x7
105 and r12, r10, r12, lsr #17 // byte offset of table entry.
106
107 // Load offset and word from hash table row, then update with the new offset
108 // and word that we just computed.
109 ldrd r10,r11, [table, r12]
110 strd r8, r9, [table, r12]
111
112 // At this point, we only know that the hashes of the words match; check to
113 // see if the words themselves match. If not, move on to the next candidate.
114 cmp r9, r11
115 bne L_match_candidate_loop
116
117 // It's not enough for the words to match; the match distance must also be
118 // in the representable range (i.e. less than 0x10000).
119 sub mch_dis, r8, r10
120 add mch_ref, src_beg, r10
121 cmp mch_dis, #0x10000
122 bhs L_match_candidate_loop
123
124 // We have found a match; registers at this point are as follows:
125 //
126 // register symbolic name meaning
127 // r0 dst_ptr pointer into destination buffer where the
128 // match information will be stored.
129 // r1 dst_end pointer to the end of the destination buffer,
130 // less margin.
131 // r2 src_ptr pointer to the byte after the last match that
132 // we found, or to the point from which we
133 // started searching if this is the first match.
134 // r3 src_beg pointer to the actual start of the buffer.
135 // r4 src_end pointer to the end of the source buffer, less
136 // margin.
137 // r5 table address of hash table.
138 // r6 mch_ptr pointer to match.
139 // r8 mch_dis match distance ("D")
140 // r9 mch_len length of match less four ("M")
141 // r10 mch_ref pointer to match reference.
142 // r11 -
143 // r12 -
144 // lr -
145 //
146 // Attempt to grow the match backwards (typically we only grow backwards by
147 // a byte or two, if at all, so we use a byte-by-byte scan).
148 eor mch_len, mch_len
1490:cmp mch_ref, src_beg
150 cmpne mch_ptr, src_ptr
151 beq 1f
152 ldrb r11, [mch_ref, #-1]
153 ldrb r12, [mch_ptr, #-1]
154 cmp r11, r12
155 bne 1f
156 sub mch_ref, #1
157 sub mch_ptr, #1
158 add mch_len, #1
159 b 0b
160
161 // Now that we have the start of the match, we can compute the literal
162 // length. Then advance the mch and ref pointers to the end of the match
163 // and its reference. Because mch_len is the real match length minus four,
164 // we actually advance to four before the end of the match, but our loop
165 // to grow the matches uses pre-incremented loads with writeback, so this
166 // works out correctly.
167#define lit_len lr
1681:sub lit_len, mch_ptr, src_ptr
169 add mch_ptr, mch_len
170 add mch_ref, mch_len
171
172 // Now attempt to grow the match forwards. This is much more common, and
173 // there is a safety margin at the end of the buffer, so we grow forwards
174 // in four-byte chunks.
1750:ldr r11, [mch_ptr, #4]!
176 ldr r12, [mch_ref, #4]!
177 eors r11, r12
178 bne 1f
179 add mch_len, #4
180 cmp mch_ptr, src_end
181 blo 0b
182 b L_emit_match
183 // At least one of the bytes in the last comparison did not match. Identify
184 // which byte had the mismatch and compute the final length (less four).
1851:rev r11, r11
186 clz r11, r11
187 add mch_len, r11, lsr #3
188
189L_emit_match:
190 // Time to emit what we've found!
191 //
192 // register symbolic name meaning
193 // r0 dst_ptr pointer into destination buffer where the
194 // match information will be stored.
195 // r1 dst_end pointer to the end of the destination buffer,
196 // less margin.
197 // r2 src_ptr pointer to the byte after the last match that
198 // we found, or to the point from which we
199 // started searching if this is the first match.
200 // r3 src_beg pointer to the actual start of the buffer.
201 // r4 src_end pointer to the end of the source buffer, less
202 // margin.
203 // r5 table address of hash table.
204 // r6 -
205 // r8 mch_dis match distance ("D")
206 // r9 mch_len length of match ("M")
207 // r10 -
208 // r11 -
209 // r12 -
210 // lr lit_len literal length ("L")
211 // q1 vector of all ones
212 //
213 // Synthesize control byte under the assumption that L and M are both less
214 // than 15, jumping out of the fast path if one of them is not.
215 cmp lit_len, #15
216 orr r10, mch_len, lit_len, lsl #4
217 cmplo mch_len, #15
218 bhs L_emit_careful
219 // L and M are both less than 15, which means (a) we use the most compact
220 // encoding for the match and (b) we do not need to do a bounds check on
221 // the destination buffer before writing, only before continuing our search.
222 // Store the command byte.
223 strb r10, [dst_ptr], #1
224 // Copy literal.
225 vld1.8 q0, [src_ptr]
226 add src_ptr, lit_len
227 vst1.8 q0, [dst_ptr]
228 add dst_ptr, lit_len
229 // Restore "true" match length before updating src_ptr.
230 add mch_len, #4
231 // Store match distance (D) and update the source pointer.
232 strh r8, [dst_ptr], #2
233 add src_ptr, mch_len
234 // If we're not into the safety margin of the destination buffer, look for
235 // another match.
236 cmp dst_ptr, dst_end
237 blo L_next_match
238 // If we *are* into the safety margin of the destination buffer, we're done
239 // encoding this block; update the source and destination pointers and
240 // return.
241L_done:
242 clear_frame_and_return
243
244// Constant island
245L_hash: .long 2654435761
246L_magic: .long 0x80808081
247
248L_emit_careful:
249 // Either L or M is >= 15, which means that we don't get to use the compact
250 // encoding, and that we need to do extra bounds checks while writing.
251 //
252 // register symbolic name meaning
253 // r0 dst_ptr pointer into destination buffer where the
254 // match information will be stored.
255 // r1 dst_end pointer to the end of the destination buffer,
256 // less margin.
257 // r2 src_ptr pointer to the byte after the last match that
258 // we found, or to the point from which we
259 // started searching if this is the first match.
260 // r3 src_beg pointer to the actual start of the buffer.
261 // r4 src_end pointer to the end of the source buffer, less
262 // margin.
263 // r5 table address of hash table.
264 // r6 -
265 // r8 mch_dis match distance ("D")
266 // r9 mch_len length of match ("M") less four
267 // r10 -
268 // r11 -
269 // r12 -
270 // lr lit_len literal length ("L")
271 // q1 vector of all ones
272 //
273 // Start by creating the low 4 bits of the control word; M if M < 15, 0xf
274 // otherwise. We also load 0x80808081, which is the magic number for
275 // division by 255; this will be required later on.
276 ldr r12, L_magic
277 cmp mch_len, #15
278 mov r10, mch_len
279 movhs r10, #0x0f
280 subs r6, lit_len, #15
281 bhs L_large_L
282 // M is large, but L is < 15. This means we can use the simple approach
283 // for copying the literal with no bounds checks.
284 orr r10, lit_len, lsl #4
285 strb r10, [dst_ptr], #1
286 // Copy literal.
287 vld1.8 q0, [src_ptr]
288 add src_ptr, lit_len
289 vst1.8 q0, [dst_ptr]
290 add dst_ptr, lit_len
291 // Store match distance (D).
292 strh r8, [dst_ptr], #2
293 sub r6, mch_len, #15
294 b L_large_M
295
296L_large_L:
297 // L is large, M may or may not be. We need to encode extra literal length
298 // bytes and we need to do bounds checks while store both those byte and the
299 // literal itself.
300 orr r10, #0xf0
301 strb r10, [dst_ptr], #1
302 // How many extra literal bytes do we need to store? We need to store
303 // (L - 15)/255 extra literal bytes of 0xff, plus one more byte that is
304 // (L - 15)%255. Get these quantities via magic number multiplication:
305 // (L - 15)*0x80808081 >> (32 + 7)
306 umull r10, r11, r6, r12
307 mov r12, #255
308 lsr r10, r11, #7 // (L - 15) / 255
309 mls r11, r10, r12, r6 // (L - 15) % 255
310 ldr r12, L_magic // may need magic number again for M.
311 // Compute address dst_ptr will have after storing all 0xff bytes, and
312 // check that we won't exceed dst_end in doing so.
313 add r10, dst_ptr, r10
314 cmp r10, dst_end
315 bhs L_done
316 // There's enough space for all the 0xff bytes, so go ahead and store them.
3170:vst1.8 q1, [dst_ptr]!
318 cmp dst_ptr, r10
319 blo 0b
320 // Store the (L - 15) % 255 byte.
321 strb r11, [r10], #1
322 // Compute the address we'll have reached after storing all literal bytes.
323 // If that passes dst_end, we're done.
324 add dst_ptr, r10, lit_len
325 cmp dst_ptr, dst_end
326 bhs L_done
327 // Copy the literal.
3280:vld1.8 q0, [src_ptr]!
329 vst1.8 q0, [r10]!
330 subs r6, r10, dst_ptr
331 blo 0b
332 // Fixup src_ptr, store match distance (D), and check whether or not M is
333 // bigger than 14. If not, go find the next match.
334 strh r8, [dst_ptr], #2
335 sub src_ptr, r6
336 subs r6, mch_len, #15
337 bhs L_large_M
338 // M is small, so we're all done; we just need to update the source pointer
339 // and we can go look for the next match.
340 add mch_len, #4
341 add src_ptr, mch_len
342 b L_next_match
343
344L_large_M:
345 // Just like with large L, we split (M - 15) into (M - 15) / 255 and
346 // (M - 15) % 255 via magic number multiply.
347 umull r10, r11, r6, r12
348 mov r12, #255
349 lsr r10, r11, #7 // (M - 15) / 255
350 mls r11, r10, r12, r6 // (M - 15) % 255
351 // Compute address dst_ptr will have after storing all 0xff bytes, and
352 // check that we won't exceed dst_end in doing so.
353 add r10, dst_ptr, r10
354 cmp r10, dst_end
355 bhs L_done
356 // There's enough space for all the 0xff bytes, so go ahead and store them.
3570:vst1.8 q1, [dst_ptr]!
358 cmp dst_ptr, r10
359 blo 0b
360 // Store final M % 255 byte, update dst_ptr and src_ptr, and look for next
361 // match.
362 strb r11, [r10]
363 add mch_len, #4
364 add dst_ptr, r10, #1
365 add src_ptr, mch_len
366 b L_next_match
367
368L_trailing_literal:
369 // Check if skip_final_literals is set.
370 ldr r5, [sp, #52]
371 tst r5, r5
372 bne L_done
373 // Emit a trailing literal that covers the remainder of the source buffer,
374 // if we can do so without exceeding the bounds of the destination buffer.
375 add src_end, margin
376 sub lit_len, src_end, src_ptr
377 subs r6, lit_len, #15
378 bhs L_trailing_literal_long
379 lsl r10, lit_len, #4
380 strb r10, [dst_ptr], #1
381 vld1.8 q0, [src_ptr]
382 mov src_ptr, src_end
383 vst1.8 q0, [dst_ptr]
384 add dst_ptr, lit_len
385 b L_done
386
387L_trailing_literal_long:
388 ldr r12, L_magic
389 mov r10, #0xf0
390 add dst_end, margin
391 strb r10, [dst_ptr], #1
392 umull r10, r11, r6, r12
393 mov r12, #255
394 lsr r10, r11, #7 // (L - 15) / 255
395 mls r11, r10, r12, r6 // (L - 15) % 255
396 // We want to write out lit_len + (L - 15)/255 + 1 bytes. Check if we have
397 // space for all of them.
398 add r10, dst_ptr
399 add r12, r10, lit_len
400 cmp r12, dst_end
401 bhs L_done
402 // We have enough space, so go ahead and write them all out. Because we
403 // know that we have enough space, and that the literal is at least 15 bytes,
404 // we can write the block of 0xffs using vector stores, even without a
405 // safety margin.
4060:vst1.8 q1, [dst_ptr]!
407 cmp dst_ptr, r10
408 blo 0b
409 // Store the (L - 15) % 255 byte.
410 strb r11, [r10], #1
411 mov dst_ptr, r10
412 // Now store the literal itself; here we need to actually be somewhat
413 // careful to ensure that we don't write past the end of the destination
414 // buffer or read past the end of the source buffer.
415 subs lit_len, #16
416 blo 1f
4170:vld1.8 q0, [src_ptr]!
418 subs lit_len, #16
419 vst1.8 q0, [dst_ptr]!
420 bhs 0b
4211:adds lit_len, #16
422 beq L_done
4232:ldrb r6, [src_ptr], #1
424 subs lit_len, #1
425 strb r6, [dst_ptr], #1
426 bne 2b
427 b L_done
428
429#endif