/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the Swift micro-arch:
 *
 * void memset_pattern4(void *b, const void *pattern4, size_t len);
 * void memset_pattern8(void *b, const void *pattern8, size_t len);
 * void memset_pattern16(void *b, const void *pattern16, size_t len);
 *
 * The implementation of all three functions is fundamentally the same.
 * memset_pattern4 is extensively commented to explain the algorithm;
 * refer to it if you have any questions about the other two.
 */
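
/*
 * Added for reference: a hedged C-level sketch of the behavior these routines
 * provide. The name memset_pattern4_ref is hypothetical, and none of the
 * alignment or NEON tricks used below appear here.
 *
 *     static void memset_pattern4_ref(void *b, const void *pattern4, size_t len) {
 *         unsigned char *dst = (unsigned char *)b;
 *         const unsigned char *pat = (const unsigned char *)pattern4;
 *         for (size_t i = 0; i < len; ++i)
 *             dst[i] = pat[i % 4];    // pattern repeats every 4 bytes; the tail may be a partial copy
 *     }
 *
 * memset_pattern8 and memset_pattern16 behave the same way with i % 8 and
 * i % 16, respectively.
 */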

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

.syntax unified
.code 32
.text
.globl _memset_pattern4$VARIANT$Swift
.globl _memset_pattern8$VARIANT$Swift
.globl _memset_pattern16$VARIANT$Swift

/******************************************************************************/

.align 4
_memset_pattern4$VARIANT$Swift:
    push    {r7,lr}
    mov     r7, sp

// Load the pattern and splat it to q0, then check if the buffer is at least
// 64 bytes long. If not, branch to a short-buffer implementation.
    ldr     r1, [r1]
    vdup.32 q0, r1
    subs    r3, r2, #64
    blo     L_short4

// We want to use aligned vector stores to fill the bulk of the buffer. In
// order to make that work, we need to rotate the pattern as necessary to
// match up with aligned locations, and we also need to extract the alignment
// of the destination pointer mod 16.
    lsl     ip, r0, #3
    and     lr, r0, #0xf        // alignment of destination pointer mod 16
    rsb     ip, ip, #32         // low five bits contain 32 - 8*(address%4).
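
// Illustration (hedged, added sketch; addr, pat, and rot are hypothetical names,
// not the registers used here): the rotation amount that matters is just
// (32 - 8*(address%4)) mod 32, since ROR by register only uses the low bits of
// ip. For example, address%4 == 1 gives a rotate right by 24 bits, turning the
// pattern [ 0 1 2 3 ] (in memory order) into [ 3 0 1 2 ], which is what belongs
// at the next 4-byte-aligned address. In C-style terms:
//
//      uint32_t rot = (32 - 8 * (addr & 3)) & 31;                      // 0, 24, 16, or 8
//      uint32_t aligned_pat = rot ? (pat >> rot) | (pat << (32 - rot)) // rotate right
//                                 : pat;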

// Before we start the aligned stores, we do a single unaligned store of
// 16 bytes of the pattern to the start of the buffer. Since the buffer is
// at least 64 bytes long, this store is known to lie entirely inside the
// buffer:
//                                                           first aligned address in buffer
//                                                           v
//  ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
// ... | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | 0 | 1 | 2 | ...
//  ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
//       ^
//       unaligned store starts here:
//     [ 0   1   2   3   0   1   2   3   0   1   2   3   0   1   2   3 ]
    vst1.8  {q0}, [r0]!

// Subsequent stores will be aligned, and will start at the first aligned
// address in the buffer. We apply the rotation that we calculated before
// the vector store (in the low five bits of ip) to get the pattern that
// is to be stored starting at the aligned location. For example, in the
// picture above, the buffer had alignment of 3 mod 4, so the rotation to
// be applied is 32 - 8*3 = 8. Rotating the pattern right by 8 bits gives
// us [ 1 2 3 0 ] (remember, we're little-endian), which we see is what
// needs to be stored starting at the first aligned location.
//
// Besides rotating the pattern, we also need to adjust the length (by
// subtracting 16 - alignment mod 16), and to advance the pointer to the
// first aligned location.
    ror     r1, ip              // Pattern to use for aligned memory
    add     r3, lr
    bic     r0, #0xf            // destination for first aligned store
    subs    r3, #16             // updated length
    blo     1f
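
// Worked example (added for illustration): suppose r0%16 == 3 and len == 100
// on entry. Then r3 starts as 100-64 == 36, "add r3, lr" gives 39, and
// "subs r3, #16" gives 23. The bytes remaining past the first aligned address
// are 100 - (16-3) == 87, so r3 now holds that count minus 64, which is the
// invariant the store loop below relies on.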

// Splat the rotated value across q1 and q2
    vdup.32 q1, r1
    vmov    q2, q1

// Main store loop. We write the splatted aligned pattern across 64 bytes
// per iteration, terminating the loop when the remaining length of the
// buffer is 64 bytes or less.
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
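
// In rough C terms (a hedged sketch with hypothetical names: r3 plays the role
// of remaining_minus_64, a signed count, and r0 the role of dst, which is
// 16-byte aligned):
//
//      do {
//          remaining_minus_64 -= 64;
//          store_64_bytes_of_rotated_pattern(dst);   // hypothetical helper for the two vst1.32
//          dst += 64;
//      } while (remaining_minus_64 > 0);             // i.e. while more than 64 bytes remain past dst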

// The remaining length of the buffer is 64 bytes or less (but the total
// length of the buffer is at least 64 bytes; otherwise we would have
// branched to the "short" path). Thus, we can handle the entirety of the
// remaining buffer with two 32-byte unaligned stores.
//
// Again, we need to rotate the pattern to match the alignment, this time
// by 8*(length%4), and we also need to back up the destination pointer
// so that it points to precisely 64 bytes before the end of the buffer.
// We accomplish this by adding r3, which contains the remaining length of
// the buffer minus 64.
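
// Worked example (added for illustration): if 23 bytes remain past the current
// aligned destination, r3 holds 23-64 == -41. The lsl/ror below then rotate
// the pattern right by (8*(-41)) mod 32 == 24 == 8*(23%4) bits, and
// "add r0, r3" backs the pointer up by 41 bytes so that it points exactly
// 64 bytes before the end of the buffer.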
1:  lsl     ip, r3, #3
    ror     r1, ip
    vdup.32 q8, r1
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    pop     {r7,pc}

L_short4:
// If we branch here, the buffer is less than 64 bytes long. At this point,
// register contents are as follows:
//
//      r0  pointer to the buffer
//      r1  pattern
//      r2  buffer length
//      q0  splatted pattern
//
// To begin, we store eight bytes at a time until the remaining length is
// less than eight bytes.
    subs    r3, r2, #8
    blo     1f
0:  subs    r3, #8
    vst1.32 {d0}, [r0]!
    bhs     0b

// Then we store one byte at a time, rotating the pattern to get the next
// byte, until we reach the end of the buffer.
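
// In rough C terms (a hedged sketch; pat is the 32-bit pattern held in r1 and
// remaining is the byte count held in r2, both hypothetical names):
//
//      while (remaining > 0) {
//          *dst++ = (unsigned char)pat;        // store the low byte of the pattern
//          pat = (pat >> 8) | (pat << 24);     // rotate right by 8 to expose the next byte
//          remaining--;
//      }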
    add     r2, r3, #8
1:  subs    r2, #1
    strbhs  r1, [r0],#1
    ror     r1, #8
    bhi     1b
    pop     {r7,pc}

/******************************************************************************/

.align 4
_memset_pattern8$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7, sp
    vld1.8  {d0}, [r1]
    vmov    d1, d0
    subs    r3, r2, #64
    blo     L_short8

    bic     sp, #0xf            // Align stack to 16 bytes and write 32 bytes
    sub     sp, #16             // of pattern to the stack. We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    and     ip, r0, #0x7        // Now generate an unaligned pointer to the
    rsb     ip, ip, #8          // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
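
// Worked example (added for illustration): the 32 bytes at sp now hold the
// 8-byte pattern repeated four times, so the byte at sp+k is pattern byte k%8.
// If the destination has r0%8 == 3, then ip == sp+5, and the 16 bytes loaded
// from ip begin with pattern byte 5 -- exactly the byte that the unaligned
// head store below leaves at the first 16-byte-aligned address.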
    and     lr, r0, #0xf
    vst1.8  {q0}, [r0]!
    add     r3, lr
    bic     r0, #0xf
    subs    r3, #16
    blo     1f
    vld1.8  {q1}, [ip]
    vmov    q2, q1
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
1:  and     lr, r3, #0x7        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vld1.8  {q8}, [ip]
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer
    pop     {r7,pc}

L_short8:
    subs    r2, #8
    blo     1f
0:  subs    r2, #8
    vst1.32 {d0}, [r0]!
    bhs     0b
1:  adds    r2, #8
    beq     3f
2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  d0, d0, d0, #1      // Use VEXT to rotate pattern
    subs    r2, #1
    bhi     2b
3:  pop     {r7,pc}

/******************************************************************************/

.align 4
_memset_pattern16$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7, sp
    vld1.8  {q0}, [r1]
    subs    r3, r2, #64
    blo     L_short16

    bic     sp, #0xf            // Align stack to 16 bytes and write 48 bytes
    sub     sp, #16             // of pattern to the stack. We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    sub     sp, #16
    vst1.8  {q0}, [sp,:128]
    and     lr, r0, #0xf        // Now generate an unaligned pointer to the
    rsb     ip, lr, #16         // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
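
// Worked example (added for illustration): the 48 bytes at sp now hold the
// 16-byte pattern repeated three times, so the byte at sp+k is pattern byte
// k%16. If the destination has r0%16 == 3, then ip == sp+13, and the 16 bytes
// loaded from ip begin with pattern byte 13 -- exactly the byte that the
// unaligned head store below leaves at the first 16-byte-aligned address.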
    vst1.8  {q0}, [r0]!
    add     r3, lr
    bic     r0, #0xf
    subs    r3, #16
    blo     1f
    vld1.8  {q1}, [ip]
    vmov    q2, q1
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
1:  and     lr, r3, #0xf        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vld1.8  {q8}, [ip]
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer
    pop     {r7,pc}

L_short16:
    subs    r2, #16
    blo     1f
0:  subs    r2, #16
    vst1.32 {q0}, [r0]!
    bhs     0b
1:  adds    r2, #16
    beq     3f
2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  q0, q0, q0, #1      // Use VEXT to rotate pattern
    subs    r2, #1
    bhi     2b
3:  pop     {r7,pc}

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD