/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the Swift micro-arch:
 *
 *    void memset_pattern4(void *b, const void *pattern4, size_t len);
 *    void memset_pattern8(void *b, const void *pattern8, size_t len);
 *    void memset_pattern16(void *b, const void *pattern16, size_t len);
 *
 * The implementation of all three functions is fundamentally the same.
 * memset_pattern4 is extensively commented to explain the algorithm;
 * refer to it if you have any questions about the other two.
 */
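
// For reference, a C sketch of the behavior these routines provide (an
// illustrative scalar equivalent, not the code assembled here; the name
// memset_pattern4_ref is hypothetical). memset_pattern8 and memset_pattern16
// are the same except for the pattern period:
//
//    #include <stddef.h>
//
//    static void memset_pattern4_ref(void *b, const void *pattern4, size_t len) {
//        unsigned char *dst = b;
//        const unsigned char *pat = pattern4;
//        for (size_t i = 0; i < len; i++)
//            dst[i] = pat[i % 4];    // pattern repeats every 4 bytes,
//    }                               // truncated if len % 4 != 0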

#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

.globl _memset_pattern4$VARIANT$Swift
.globl _memset_pattern8$VARIANT$Swift
.globl _memset_pattern16$VARIANT$Swift

/******************************************************************************/

_memset_pattern4$VARIANT$Swift:
// Load the pattern and splat it to q0, then check if the buffer is at least
// 64 bytes long. If not, branch to a short-buffer implementation.
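//
// In outline (illustrative C; the variable and label names are stand-ins,
// q0 ends up holding the 4-byte pattern replicated to 16 bytes):
//
//    uint32x4_t q0 = vdupq_n_u32(pattern);   // NEON intrinsic equivalent
//    if (len < 64) goto short_buffer;        // short path handled below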
//
// We want to use aligned vector stores to fill the bulk of the buffer. In
// order to make that work, we need to rotate the pattern as necessary to
// match up with aligned locations, and we also need to extract the alignment
// of the destination pointer mod 16.
    and     lr, r0, #0xf        // alignment of destination pointer mod 16
    rsb     ip, ip, #32         // low five bits contain 32 - 8*(address%4).

// Before we start the aligned stores, we do a single unaligned store of
// 16 bytes of the pattern to the start of the buffer. Since the buffer is
// at least 64 bytes long, this store is known to lie entirely inside the
// buffer. In the picture below, each byte of the buffer is labeled with its
// address mod 16; the first aligned address in the buffer is the byte
// labeled "0":
//
// ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
// ... | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | 0 | 1 | 2 | ...
// ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
//
// The unaligned store starts at the byte labeled "3" (the start of the
// buffer) and writes:
//
//      [ 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ]
//
// Subsequent stores will be aligned, and will start at the first aligned
// address in the buffer. We apply the rotation that we calculated before
// the vector store (in the low five bits of ip) to get the pattern that
// is to be stored starting at the aligned location. For example, in the
// picture above, the buffer had alignment of 3 mod 4, so the rotation to
// be applied is 32 - 8*3 = 8. Rotating the pattern right by 8 bits gives
// us [ 1 2 3 0 ] (remember, we're little-endian), which we see is what
// needs to be stored starting at the first aligned location.
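//
// As a concrete check (illustrative C, not part of this file; ror32 is a
// hypothetical helper): with pattern bytes 00 11 22 33 in memory, the
// little-endian register value is 0x33221100, and
//
//    static inline uint32_t ror32(uint32_t x, unsigned r) {
//        r &= 31;                                 // ARM ROR by 32 leaves the value unchanged
//        return r ? (x >> r) | (x << (32 - r)) : x;
//    }
//
//    ror32(0x33221100, 8) == 0x00332211           // memory bytes 11 22 33 00,
//                                                 // i.e. [ 1 2 3 0 ] as above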
//
// Besides rotating the pattern, we also need to adjust the length (by
// subtracting 16 - alignment mod 16), and to advance the pointer to the
// first aligned location.
    ror     r1, ip              // Pattern to use for aligned memory
    bic     r0, #0xf            // destination for first aligned store
    subs    r3, #16             // updated length
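
// Roughly, in C-like terms (illustrative only; this assumes the unaligned
// head store above already advanced the destination pointer by 16, and uses
// ror32 as sketched earlier):
//
//    dst = (uint8_t *)(((uintptr_t)b + 16) & ~(uintptr_t)0xf);  // 1st aligned store
//    rem = len - (size_t)(dst - (uint8_t *)b);                  // == len + (b & 0xf) - 16
//    pat = ror32(pattern, (32 - 8*((uintptr_t)b & 3)) & 31);    // pattern for aligned stores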

// Splat the rotated value across q1 and q2

// Main store loop. We write the splatted aligned pattern across 64 bytes
// per iteration, terminating the loop when the remaining length of the
// buffer is 64 bytes or less.
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
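
// In outline (illustrative C; store32_aligned is a stand-in for a
// vst1.32 {q1,q2}, [r0,:128]! of the rotated pattern): each trip stores
// 64 bytes with two 32-byte aligned stores, and the loop exits once 64 or
// fewer bytes remain:
//
//    while (rem > 64) {
//        store32_aligned(dst +  0, pat);
//        store32_aligned(dst + 32, pat);
//        dst += 64;
//        rem -= 64;
//    }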

// The remaining length of the buffer is 64 bytes or less (but the total
// length of the buffer is at least 64 bytes; otherwise we would have
// branched to the "short" path). Thus, we can handle the entirety of the
// remaining buffer with two 32-byte unaligned stores.
//
// Again, we need to rotate the pattern to match the alignment, this time
// by 8*(length%4), and we also need to back up the destination pointer
// so that it points to precisely 64 bytes before the end of the buffer.
// We accomplish this by adding r3, which contains the remaining length of
// the buffer minus 64.
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
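
// Roughly, in C-like terms (illustrative; ror32 as sketched above and
// store64_unaligned is a stand-in for the two stores above): the tail is
// written as one unaligned 64-byte block that ends exactly at b + len:
//
//    tail     = (uint8_t *)b + len - 64;
//    pat_tail = ror32(pat, 8 * (rem & 3));   // re-match the tail's alignment
//    store64_unaligned(tail, pat_tail);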

// If we branch here, the buffer is less than 64 bytes long. At this point,
// register contents are as follows:
//
//      r0      pointer to the buffer
//      q0      splatted pattern
//
// To begin, we store eight bytes at a time until the remaining length is
// less than eight bytes.

// Then we store one byte at a time, rotating the pattern to get the next
// byte, until we reach the end of the buffer.
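//
// In outline (illustrative C with stand-in helper names, not the NEON code
// used here; pat8 denotes the 4-byte pattern repeated twice):
//
//    while (rem >= 8) {                    // eight bytes of pattern at a time
//        store8_unaligned(dst, pat8);
//        dst += 8;  rem -= 8;
//    }
//    while (rem--) {                       // then one byte at a time
//        *dst++ = (uint8_t)pat;            // low byte of the pattern
//        pat = ror32(pat, 8);              // rotate to expose the next byte
//    }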

/******************************************************************************/

_memset_pattern8$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
    bic     sp, #0xf            // Align stack to 16 bytes and write 32 bytes
    sub     sp, #16             // of pattern to the stack. We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    and     ip, r0, #0x7        // Now generate an unaligned pointer to the
    rsb     ip, ip, #8          // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
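
// The trick, roughly (illustrative C; unaligned_load16 is a stand-in): the
// scratch area now holds 32 bytes of the 8-byte pattern, so the form of the
// pattern rotated for 16-byte-aligned addresses can be fetched with a single
// unaligned load from inside it:
//
//    uint8_t scratch[32];                  // = the 8-byte pattern repeated 4 times
//    rotated = unaligned_load16(&scratch[8 - ((uintptr_t)b & 7)]);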

    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!

1:  and     lr, r3, #0x7        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer

2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  d0, d0, d0, #1      // Use VEXT to rotate pattern

/******************************************************************************/

_memset_pattern16$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
    bic     sp, #0xf            // Align stack to 16 bytes and write 48 bytes
    sub     sp, #16             // of pattern to the stack. We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    sub     sp, #16
    vst1.8  {q0}, [sp,:128]
    and     lr, r0, #0xf        // Now generate an unaligned pointer to the
    rsb     ip, lr, #16         // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
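
// As with memset_pattern8, roughly (illustrative C; unaligned_load16 is a
// stand-in): the scratch area holds 48 bytes of the 16-byte pattern, and the
// form rotated for 16-byte-aligned addresses is fetched with one unaligned
// load from inside it:
//
//    uint8_t scratch[48];                  // = the 16-byte pattern repeated 3 times
//    rotated = unaligned_load16(&scratch[16 - ((uintptr_t)b & 15)]);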

    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!

1:  and     lr, r3, #0xf        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer

2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  q0, q0, q0, #1      // Use VEXT to rotate pattern

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD