/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the Swift micro-arch:
 *
 *  void memset_pattern4(void *b, const void *pattern4, size_t len);
 *  void memset_pattern8(void *b, const void *pattern8, size_t len);
 *  void memset_pattern16(void *b, const void *pattern16, size_t len);
 *
 * The implementation of all three functions is fundamentally the same.
 * memset_pattern4 is extensively commented to explain the algorithm;
 * refer to it if you have any questions about the other two.
 */

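/*
 * For reference, the semantics these routines implement are equivalent to
 * the following portable C sketch (a hedged illustration only; it is not
 * part of this file's build, and the helper name memset_pattern_ref is
 * hypothetical):
 *
 *  #include <stddef.h>
 *  #include <string.h>
 *
 *  // Tile a patlen-byte pattern across len bytes starting at b.  len need
 *  // not be a multiple of patlen; the final copy of the pattern is
 *  // truncated as necessary.
 *  static void memset_pattern_ref(void *b, const void *pat,
 *                                 size_t patlen, size_t len) {
 *      unsigned char *dst = b;
 *      while (len >= patlen) {
 *          memcpy(dst, pat, patlen);
 *          dst += patlen;
 *          len -= patlen;
 *      }
 *      memcpy(dst, pat, len);      // trailing partial pattern, if any
 *  }
 *
 * memset_pattern4, memset_pattern8, and memset_pattern16 correspond to
 * patlen == 4, 8, and 16; the assembly below exists only to do the same
 * job faster on Swift.
 */
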
#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

.syntax unified
.code 32
.text
.globl _memset_pattern4$VARIANT$Swift
.globl _memset_pattern8$VARIANT$Swift
.globl _memset_pattern16$VARIANT$Swift

/******************************************************************************/

.align 4
_memset_pattern4$VARIANT$Swift:
    push    {r7,lr}
    mov     r7, sp

//  Load the pattern and splat it to q0, then check if the buffer is at least
//  64 bytes long.  If not, branch to a short-buffer implementation.
    ldr     r1, [r1]
    vdup.32 q0, r1
    subs    r3, r2, #64
    blo     L_short4

//  We want to use aligned vector stores to fill the bulk of the buffer.  In
//  order to make that work, we need to rotate the pattern as necessary to
//  match up with aligned locations, and we also need to extract the alignment
//  of the destination pointer mod 16.
    lsl     ip, r0, #3
    and     lr, r0, #0xf        // alignment of destination pointer mod 16
    rsb     ip, ip, #32         // low five bits contain 32 - 8*(address%4).

//  Before we start the aligned stores, we do a single unaligned store of
//  16 bytes of the pattern to the start of the buffer.  Since the buffer is
//  at least 64 bytes long, this store is known to lie entirely inside the
//  buffer:
//                              first aligned address in buffer
//                                                            v
//   ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
//  ... | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | 0 | 1 | 2 | ...
//   ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
//        ^
//        unaligned store starts here:
//        [ 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ]
    vst1.8  {q0}, [r0]!

//  Subsequent stores will be aligned, and will start at the first aligned
//  address in the buffer.  We apply the rotation that we calculated before
//  the vector store (in the low five bits of ip) to get the pattern that
//  is to be stored starting at the aligned location.  For example, in the
//  picture above, the buffer had alignment of 3 mod 4, so the rotation to
//  be applied is 32 - 8*3 = 8.  Rotating the pattern right by 8 bits gives
//  us [ 1 2 3 0 ] (remember, we're little-endian), which we see is what
//  needs to be stored starting at the first aligned location.
//
//  Besides rotating the pattern, we also need to adjust the length (by
//  subtracting 16 - alignment mod 16), and to advance the pointer to the
//  first aligned location.
    ror     r1, ip              // Pattern to use for aligned memory
    add     r3, lr
    bic     r0, #0xf            // destination for first aligned store
    subs    r3, #16             // updated length
    blo     1f
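
//  (A hedged C model of the rotation just computed, for illustration only;
//  the names pat, dst, and ror32 are hypothetical and appear nowhere else
//  in this file:
//
//      static uint32_t ror32(uint32_t x, unsigned r) {
//          r &= 31;                            // rotate right by r bits
//          return r ? (x >> r) | (x << (32 - r)) : x;
//      }
//
//      unsigned rot = (32 - 8 * ((uintptr_t)dst & 3)) & 31;
//      uint32_t aligned_pat = ror32(pat, rot); // pattern as it must appear
//                                              // at a 4-byte-aligned address
//
//  An ARM ROR by a register amount behaves like a rotate modulo 32 as far as
//  the result value is concerned, which is what the "& 31" mirrors.)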

//  Splat the rotated value across q1 and q2
    vdup.32 q1, r1
    vmov    q2, q1

//  Main store loop.  We write the splatted aligned pattern across 64 bytes
//  per iteration, terminating the loop when the remaining length of the
//  buffer is 64 bytes or less.
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b

//  The remaining length of the buffer is 64 bytes or less (but the total
//  length of the buffer is at least 64 bytes; otherwise we would have
//  branched to the "short" path).  Thus, we can handle the entirety of the
//  remaining buffer with two 32-byte unaligned stores.
//
//  Again, we need to rotate the pattern to match the alignment, this time
//  by 8*(length%4), and we also need to back up the destination pointer
//  so that it points to precisely 64 bytes before the end of the buffer.
//  We accomplish this by adding r3, which contains the remaining length of
//  the buffer minus 64.
1:  lsl     ip, r3, #3
    ror     r1, ip
    vdup.32 q8, r1
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    pop     {r7,pc}
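
//  (In hedged C terms, with dst and remaining standing in for r0 and r3+64,
//  and ror32 as sketched earlier, the cleanup above is roughly:
//
//      dst += remaining - 64;                  // dst now equals end - 64
//      uint32_t tail_pat = ror32(aligned_pat, (8 * (remaining & 3)) & 31);
//      // splat tail_pat and write it to dst[0..63] with two unaligned
//      // 32-byte stores
//
//  Any bytes these stores rewrite were already written with the same values,
//  since the pattern phase is consistent across the whole buffer.)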

L_short4:
//  If we branch here, the buffer is less than 64 bytes long.  At this point,
//  register contents are as follows:
//
//      r0      pointer to the buffer
//      r1      pattern
//      r2      buffer length
//      q0      splatted pattern
//
//  To begin, we store eight bytes at a time until the remaining length is
//  less than eight bytes.
    subs    r3, r2, #8
    blo     1f
0:  subs    r3, #8
    vst1.32 {d0}, [r0]!
    bhs     0b

//  Then we store one byte at a time, rotating the pattern to get the next
//  byte, until we reach the end of the buffer.
    add     r2, r3, #8
1:  subs    r2, #1
    strbhs  r1, [r0],#1
    ror     r1, #8
    bhi     1b
    pop     {r7,pc}
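
//  (A hedged C sketch of this short path; dst, len, pat, and ror32 are
//  illustrative names only:
//
//      uint64_t splat8 = ((uint64_t)pat << 32) | pat;  // contents of d0
//      while (len >= 8) {
//          memcpy(dst, &splat8, 8);            // vst1.32 {d0}, [r0]!
//          dst += 8;  len -= 8;
//      }
//      while (len > 0) {
//          *dst++ = (uint8_t)pat;              // strb of the low byte
//          pat = ror32(pat, 8);                // next pattern byte, since
//          --len;                              // we are little-endian
//      }
//  )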

/******************************************************************************/

.align 4
_memset_pattern8$VARIANT$Swift:
//  The implementation of this function is substantially identical to that of
//  memset_pattern4.  The only differences are in how we rotate the pattern for
//  the purposes of extracting the bytes to store.  For clarity, only those
//  differences are commented here; consult memset_pattern4 (above) for
//  a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7, sp
    vld1.8  {d0}, [r1]
    vmov    d1, d0
    subs    r3, r2, #64
    blo     L_short8

    bic     sp, #0xf            // Align stack to 16 bytes and write 32 bytes
    sub     sp, #16             // of pattern to the stack.  We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    and     ip, r0, #0x7        // Now generate an unaligned pointer to the
    rsb     ip, ip, #8          // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
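
//  (A hedged C model of the scratch-buffer trick above; scratch, dst, and
//  pat8 are illustrative names only:
//
//      unsigned char scratch[32];
//      memcpy(scratch,      pat8, 8);      // 32 bytes of stack now hold the
//      memcpy(scratch +  8, pat8, 8);      // 8-byte pattern repeated, so any
//      memcpy(scratch + 16, pat8, 8);      // 16-byte window into scratch is
//      memcpy(scratch + 24, pat8, 8);      // a byte-rotation of the pattern
//      unsigned off = 8 - ((uintptr_t)dst & 7);
//      // a 16-byte unaligned load from &scratch[off] yields the pattern
//      // rotated to line up with the 16-byte-aligned store addresses
//
//  Because the pattern is wider than a 32-bit register, a simple ROR can no
//  longer produce the rotated forms, so they are read back from memory
//  instead.)
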
    and     lr, r0, #0xf
    vst1.8  {q0}, [r0]!
    add     r3, lr
    bic     r0, #0xf
    subs    r3, #16
    blo     1f
    vld1.8  {q1}, [ip]
    vmov    q2, q1
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
1:  and     lr, r3, #0x7        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vld1.8  {q8}, [ip]
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer
    pop     {r7,pc}

L_short8:
    subs    r2, #8
    blo     1f
0:  subs    r2, #8
    vst1.32 {d0}, [r0]!
    bhs     0b
1:  adds    r2, #8
    beq     3f
2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  d0, d0, d0, #1      // Use VEXT to rotate pattern
    subs    r2, #1
    bhi     2b
3:  pop     {r7,pc}

/******************************************************************************/

.align 4
_memset_pattern16$VARIANT$Swift:
//  The implementation of this function is substantially identical to that of
//  memset_pattern4.  The only differences are in how we rotate the pattern for
//  the purposes of extracting the bytes to store.  For clarity, only those
//  differences are commented here; consult memset_pattern4 (above) for
//  a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7, sp
    vld1.8  {q0}, [r1]
    subs    r3, r2, #64
    blo     L_short16

    bic     sp, #0xf            // Align stack to 16 bytes and write 48 bytes
    sub     sp, #16             // of pattern to the stack.  We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    sub     sp, #16
    vst1.8  {q0}, [sp,:128]
    and     lr, r0, #0xf        // Now generate an unaligned pointer to the
    rsb     ip, lr, #16         // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
    vst1.8  {q0}, [r0]!
    add     r3, lr
    bic     r0, #0xf
    subs    r3, #16
    blo     1f
    vld1.8  {q1}, [ip]
    vmov    q2, q1
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
1:  and     lr, r3, #0xf        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vld1.8  {q8}, [ip]
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer
    pop     {r7,pc}

L_short16:
    subs    r2, #16
    blo     1f
0:  subs    r2, #16
    vst1.32 {q0}, [r0]!
    bhs     0b
1:  adds    r2, #16
    beq     3f
2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  q0, q0, q0, #1      // Use VEXT to rotate pattern
    subs    r2, #1
    bhi     2b
3:  pop     {r7,pc}

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD