/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the Swift micro-arch:
 *
 * void memset_pattern4(void *b, const void *pattern4, size_t len);
 * void memset_pattern8(void *b, const void *pattern8, size_t len);
 * void memset_pattern16(void *b, const void *pattern16, size_t len);
 *
 * The implementation of all three functions is fundamentally the same.
 * memset_pattern4 is extensively commented to explain the algorithm; refer
 * to it if you have any questions about the other two.
 */
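
/*
 * For reference, the behavior of these routines can be summarized by the
 * following C sketch (illustrative only; this is not the code assembled
 * below, and memset_pattern4_ref is a made-up name):
 *
 *     void memset_pattern4_ref(void *b, const void *pattern4, size_t len) {
 *         unsigned char *dst = b;
 *         const unsigned char *pat = pattern4;
 *         // The pattern is laid down starting at the first byte of the
 *         // buffer; if len is not a multiple of the pattern size, the
 *         // final copy of the pattern is truncated.
 *         for (size_t i = 0; i < len; ++i)
 *             dst[i] = pat[i % 4];
 *     }
 *
 * memset_pattern8 and memset_pattern16 behave the same way, with "% 8" and
 * "% 16" in place of "% 4".
 */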

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

.syntax unified
.code 32
.text
.globl _memset_pattern4$VARIANT$Swift
.globl _memset_pattern8$VARIANT$Swift
.globl _memset_pattern16$VARIANT$Swift

/******************************************************************************/

.align 4
_memset_pattern4$VARIANT$Swift:
    push    {r7,lr}
    mov     r7, sp

// Load the pattern and splat it to q0, then check if the buffer is at least
// 64 bytes long. If not, branch to a short-buffer implementation.
    ldr     r1, [r1]
    vdup.32 q0, r1
    subs    r3, r2, #64
    blo     L_short4

// We want to use aligned vector stores to fill the bulk of the buffer. In
// order to make that work, we need to rotate the pattern as necessary to
// match up with aligned locations, and we also need to extract the alignment
// of the destination pointer mod 16.
    lsl     ip, r0, #3
    and     lr, r0, #0xf        // alignment of destination pointer mod 16
    rsb     ip, ip, #32         // low five bits contain 32 - 8*(address%4).
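// (The full value of 32 - 8*address does not matter here: a rotate by n and
// a rotate by n mod 32 are the same rotate, so only the low five bits of ip
// are significant, and those are 32 - 8*(address%4).)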

// Before we start the aligned stores, we do a single unaligned store of
// 16 bytes of the pattern to the start of the buffer. Since the buffer is
// at least 64 bytes long, this store is known to lie entirely inside the
// buffer:
//
//                             first aligned address in buffer
//                                                             v
//  ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
// ... | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | 0 | 1 | 2 | ...
//  ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
//       ^
//       unaligned store starts here:
//     [ 0   1   2   3   0   1   2   3   0   1   2   3   0   1   2   3 ]
    vst1.8  {q0}, [r0]!

// Subsequent stores will be aligned, and will start at the first aligned
// address in the buffer. We apply the rotation that we calculated before
// the vector store (in the low five bits of ip) to get the pattern that
// is to be stored starting at the aligned location. For example, in the
// picture above, the buffer had alignment of 3 mod 4, so the rotation to
// be applied is 32 - 8*3 = 8. Rotating the pattern right by 8 bits gives
// us [ 1 2 3 0 ] (remember, we're little-endian), which we see is what
// needs to be stored starting at the first aligned location.
//
// Besides rotating the pattern, we also need to adjust the length (by
// subtracting 16 - alignment mod 16), and to advance the pointer to the
// first aligned location.
    ror     r1, ip              // Pattern to use for aligned memory
    add     r3, lr
    bic     r0, #0xf            // destination for first aligned store
    subs    r3, #16             // updated length
    blo     1f

// Splat the rotated value across q1 and q2
    vdup.32 q1, r1
    vmov    q2, q1

// Main store loop. We write the splatted aligned pattern across 64 bytes
// per iteration, terminating the loop when the remaining length of the
// buffer is 64 bytes or less.
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b

// The remaining length of the buffer is 64 bytes or less (but the total
// length of the buffer is at least 64 bytes; otherwise we would have
// branched to the "short" path). Thus, we can handle the entirety of the
// remaining buffer with two 32-byte unaligned stores.
//
// Again, we need to rotate the pattern to match the alignment, this time
// by 8*(length%4), and we also need to back up the destination pointer
// so that it points to precisely 64 bytes before the end of the buffer.
// We accomplish this by adding r3, which contains the remaining length of
// the buffer minus 64.
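//
// (Worked example: if 37 bytes remain, r3 is 37 - 64 = -27. Adding r3 to r0
// backs the pointer up to exactly 64 bytes before the end of the buffer, and
// 8*(-27) is congruent to 8 mod 32, the same rotation as 8*(37%4), so the
// ror realigns the pattern for the new starting address.)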
1:  lsl     ip, r3, #3
    ror     r1, ip
    vdup.32 q8, r1
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    pop     {r7,pc}

L_short4:
// If we branch here, the buffer is less than 64 bytes long. At this point,
// register contents are as follows:
//
//    r0    pointer to the buffer
//    r1    pattern
//    r2    buffer length
//    q0    splatted pattern
//
// To begin, we store eight bytes at a time until the remaining length is
// less than eight bytes.
    subs    r3, r2, #8
    blo     1f
0:  subs    r3, #8
    vst1.32 {d0}, [r0]!
    bhs     0b

// Then we store one byte at a time, rotating the pattern to get the next
// byte, until we reach the end of the buffer.
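//
// (The pattern was loaded little-endian, so the low byte of r1 is the byte
// that belongs at the current address; strb stores it, and ror #8 brings the
// next pattern byte into the low position. With pattern bytes p0 p1 p2 p3,
// r1 holds p3:p2:p1:p0, so successive iterations store p0, p1, p2, ...)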
    add     r2, r3, #8
1:  subs    r2, #1
    strbhs  r1, [r0],#1
    ror     r1, #8
    bhi     1b
    pop     {r7,pc}

/******************************************************************************/

.align 4
_memset_pattern8$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7, sp
    vld1.8  {d0}, [r1]
    vmov    d1, d0
    subs    r3, r2, #64
    blo     L_short8

    bic     sp, #0xf            // Align stack to 16 bytes and write 32 bytes
    sub     sp, #16             // of pattern to the stack. We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    and     ip, r0, #0x7        // Now generate an unaligned pointer to the
    rsb     ip, ip, #8          // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
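//
// (The two stores above leave four back-to-back copies of the 8-byte pattern
// in the 32 bytes at sp. The byte that belongs at any 8-byte aligned address
// is pattern byte (8 - r0%8) mod 8, where r0 is still the incoming
// destination pointer, so a 16-byte unaligned load from sp + (8 - r0%8)
// picks up the pattern already rotated for the aligned stores. For example,
// if the destination is 3 mod 8, ip is sp+5 and the load begins with pattern
// byte 5.)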
    and     lr, r0, #0xf
    vst1.8  {q0}, [r0]!
    add     r3, lr
    bic     r0, #0xf
    subs    r3, #16
    blo     1f
    vld1.8  {q1}, [ip]
    vmov    q2, q1
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
1:  and     lr, r3, #0x7        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vld1.8  {q8}, [ip]
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer
    pop     {r7,pc}

L_short8:
    subs    r2, #8
    blo     1f
0:  subs    r2, #8
    vst1.32 {d0}, [r0]!
    bhs     0b
1:  adds    r2, #8
    beq     3f
2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  d0, d0, d0, #1      // Use VEXT to rotate pattern
    subs    r2, #1
    bhi     2b
3:  pop     {r7,pc}

/******************************************************************************/

.align 4
_memset_pattern16$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7, sp
    vld1.8  {q0}, [r1]
    subs    r3, r2, #64
    blo     L_short16

    bic     sp, #0xf            // Align stack to 16 bytes and write 48 bytes
    sub     sp, #16             // of pattern to the stack. We will use
    vst1.8  {q0}, [sp,:128]     // unaligned loads from this scratch buffer
    sub     sp, #16             // to get rotated forms of the pattern.
    vst1.8  {q0}, [sp,:128]
    sub     sp, #16
    vst1.8  {q0}, [sp,:128]
    and     lr, r0, #0xf        // Now generate an unaligned pointer to the
    rsb     ip, lr, #16         // rotated pattern that we need to use for
    add     ip, sp              // aligned stores in the main loop.
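//
// (Same idea as in memset_pattern8, but with a 16-byte pattern we keep three
// copies, 48 bytes, of scratch: the main-loop load comes from
// sp + (16 - r0%16), the cleanup load below comes from up to 15 bytes beyond
// that, and both stay within the 48-byte scratch area. The main-loop load
// starts with pattern byte (16 - r0%16) mod 16, the byte that belongs at the
// first aligned store address.)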
    vst1.8  {q0}, [r0]!
    add     r3, lr
    bic     r0, #0xf
    subs    r3, #16
    blo     1f
    vld1.8  {q1}, [ip]
    vmov    q2, q1
0:  subs    r3, #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
1:  and     lr, r3, #0xf        // Generate an unaligned pointer to the
    add     ip, lr              // rotated pattern to use for cleanup.
    vld1.8  {q8}, [ip]
    vmov    q9, q8
    add     r0, r3
    vst1.32 {q8,q9}, [r0]!
    vst1.32 {q8,q9}, [r0]
    mov     sp, r7              // Restore stack pointer
    pop     {r7,pc}

L_short16:
    subs    r2, #16
    blo     1f
0:  subs    r2, #16
    vst1.32 {q0}, [r0]!
    bhs     0b
1:  adds    r2, #16
    beq     3f
2:  vst1.8  {d0[0]}, [r0]!      // Store one byte from NEON
    vext.8  q0, q0, q0, #1      // Use VEXT to rotate pattern
    subs    r2, #1
    bhi     2b
3:  pop     {r7,pc}

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD