/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the Swift micro-arch:
 *
 *      void bcopy(const void * source,
 *                 void * destination,
 *                 size_t length);
 *
 *      void *memmove(void * destination,
 *                    const void * source,
 *                    size_t n);
 *
 *      void *memcpy(void * restrict destination,
 *                    const void * restrict source,
 *                    size_t n);
 *
 * All copy n successive bytes from source to destination. Memmove and memcpy
 * return destination, whereas bcopy has no return value. Copying takes place
 * as if it were through a temporary buffer -- after return, destination
 * contains exactly the bytes from source, even if the buffers overlap (this is
 * not required of memcpy by the C standard; its behavior is undefined if the
 * buffers overlap, but we are holding ourselves to the historical behavior of
 * this function on OS X and iOS).
 */
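
/*
 * For reference, a minimal C sketch of the semantics implemented below.
 * This is illustrative only (ref_memmove, d, and s are names of ours, not
 * part of the original source); the real implementation is the hand-tuned
 * assembly that follows.
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      static void *ref_memmove(void *dst, const void *src, size_t n) {
 *          unsigned char *d = dst;
 *          const unsigned char *s = src;
 *          if ((uintptr_t)dst - (uintptr_t)src >= n) {
 *              while (n--) *d++ = *s++;        // ascending copy is safe
 *          } else {
 *              d += n; s += n;
 *              while (n--) *--d = *--s;        // descending copy for overlap
 *          }
 *          return dst;
 *      }
 */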

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

.syntax unified
.code 16
.globl _bcopy$VARIANT$Swift
.thumb_func _bcopy$VARIANT$Swift
.globl _memmove$VARIANT$Swift
.thumb_func _memmove$VARIANT$Swift
.globl _memcpy$VARIANT$Swift
.thumb_func _memcpy$VARIANT$Swift

.text
.align 4
_bcopy$VARIANT$Swift:
//  Translate bcopy calls into memcpy calls by swapping the first and second
//  arguments.
    mov     r3,     r0
    mov     r0,     r1
    mov     r1,     r3

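/*
 * Equivalently, in C (a hedged sketch of the shim above; ref_bcopy is an
 * illustrative name, and using memcpy here relies on this platform's
 * historical overlap-tolerant memcpy described in the header comment):
 *
 *      #include <string.h>
 *
 *      void ref_bcopy(const void *src, void *dst, size_t n) {
 *          memcpy(dst, src, n);                // same copy, arguments swapped
 *      }
 */
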
_memmove$VARIANT$Swift:
_memcpy$VARIANT$Swift:
//  Our preference is to copy the data in ascending address order, but if the
//  buffers overlap such that the beginning of the destination buffer aliases
//  the end of the source buffer, we need to copy in descending address order
//  instead to preserve the memmove semantics.  We detect this case with the
//  test:
//
//      destination - source < length    (unsigned compare)
//
//  If the address of the source buffer is higher than the address of the
//  destination buffer, this arithmetic can overflow, but the overflowed value
//  can only be smaller than length if the buffers do not overlap, so we don't
//  need to worry about false positives due to the overflow (they happen, but
//  only in cases where copying in either order is correct).
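
/*
 * The test above as a hedged C sketch (the function name is illustrative):
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      static int must_copy_descending(void *dst, const void *src, size_t n) {
 *          // One unsigned compare answers both "is dst above src?" and
 *          // "are they closer together than n bytes?".  Wraparound when
 *          // src > dst can give a false positive, but only when the buffers
 *          // do not overlap, where either copy direction is correct.
 *          return (uintptr_t)dst - (uintptr_t)src < n;
 *      }
 */
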
    push    {r7,lr}
    mov     r7,     sp
    subs    r3,     r0, r1
    beq     L_exit
    mov     ip,     r0
    cmp     r3,     r2
    blo     L_descendingCopy

/*****************************************************************************
 *  Ascending copy                                                           *
 *****************************************************************************/

    subs    r3,     r2, #32         // If length < 32, jump to a dedicated code
    blo     L_ascendingShort        //  path for short buffers.

    orr     lr,     r0, r1          // If the length is not a multiple of 16, or
    orr     lr,     r2              //  either buffer is not 16-byte aligned, then
    ands    lr,     #0xf            //  some edging is needed; jump to a separate
    bne     L_ascendingEdging       //  branch to handle it.
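
/*
 * A hedged C sketch of the alignment test above (illustrative): OR-ing both
 * pointers with the length lets a single mask check all three conditions.
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      static int needs_edging(const void *dst, const void *src, size_t n) {
 *          return (((uintptr_t)dst | (uintptr_t)src | n) & 0xf) != 0;
 *      }
 */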

/*****************************************************************************
 *  Ascending vector aligned copy                                            *
 *****************************************************************************/

0:  subs    r3,     #32             // Copy 32 bytes at a time from src to dst,
    vld1.8  {q0,q1}, [r1,:128]!     //  both of which have 16-byte alignment.
    vst1.8  {q0,q1}, [ip,:128]!     //  Terminate this loop when 32 or fewer
    bhi     0b                      //  bytes remain to be copied.

    add     r1,     r3              // Backtrack both pointers by 32 - remaining
    vld1.8  {q0,q1}, [r1,:128]      //  and copy 32 bytes from src to dst.  This
    add     ip,     r3              //  copy may overlap the previous copy, and
    vst1.8  {q0,q1}, [ip,:128]      //  takes us precisely to the end of the
    pop     {r7,pc}                 //  buffer.
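
/*
 * The tail handling above, sketched in C (illustrative only): rather than a
 * scalar cleanup loop, back both pointers up so that one final 32-byte block
 * copy ends exactly at the end of the buffer.  It may re-copy bytes the main
 * loop already wrote, which is harmless here; for simplicity this sketch
 * assumes the two buffers are disjoint.
 *
 *      #include <string.h>
 *
 *      // d and s point just past the bytes copied so far; 0 < remaining <= 32.
 *      static void copy_tail32(unsigned char *d, const unsigned char *s,
 *                              size_t remaining) {
 *          memcpy(d - (32 - remaining), s - (32 - remaining), 32);
 *      }
 */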

/*****************************************************************************
 *  Ascending vector misaligned copy                                         *
 *****************************************************************************/

L_ascendingEdging:
    tst     ip,     #0xf            // Copy one byte at a time until the
    itttt   ne                      //  destination pointer has 16-byte alignment.
    ldrbne  r3,    [r1],#1
    strbne  r3,    [ip],#1
    subne   r2,     #1
    bne     L_ascendingEdging

    and     lr,     r1, #0xf        // Back the source pointer up to a 16-byte
    bic     r1,     #0xf            //  aligned location, and check if length > 32.
    subs    r3,     r2, #32
    blo     L_ascendingEdgingExit
    tbh    [pc, lr, lsl #1]         // Otherwise, we have a jump table based on
0:                                  //  the relative alignment of the buffers.
.short (L_ascendingExtract0x0-0b)/2
.short (L_ascendingExtract0x1-0b)/2
.short (L_ascendingExtract0x2-0b)/2
.short (L_ascendingExtract0x3-0b)/2
.short (L_ascendingExtract0x4-0b)/2
.short (L_ascendingExtract0x5-0b)/2
.short (L_ascendingExtract0x6-0b)/2
.short (L_ascendingExtract0x7-0b)/2
.short (L_ascendingExtract0x8-0b)/2
.short (L_ascendingExtract0x9-0b)/2
.short (L_ascendingExtract0xa-0b)/2
.short (L_ascendingExtract0xb-0b)/2
.short (L_ascendingExtract0xc-0b)/2
.short (L_ascendingExtract0xd-0b)/2
.short (L_ascendingExtract0xe-0b)/2
.short (L_ascendingExtract0xf-0b)/2
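
/*
 * A hedged C model of the tbh dispatch above (extract_table and its entries
 * are illustrative stand-ins for the sixteen labels in the jump table):
 *
 *      #include <stdint.h>
 *
 *      typedef void copy_loop_fn(void);
 *      extern copy_loop_fn *extract_table[16];
 *
 *      static void dispatch(const void *unaligned_src) {
 *          extract_table[(uintptr_t)unaligned_src & 0xf]();
 *      }
 */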

L_ascendingExtract0x0:              // If the two buffers are similarly aligned,
    subs    r3,     #32             //  we use a slightly simpler loop that just
    vld1.8  {q0,q1}, [r1,:128]!     //  copies 32 bytes at a time.
    vst1.8  {q0,q1}, [ip,:128]!
    bhs     L_ascendingExtract0x0
    b       L_ascendingEdgingExit

#define ASCENDING_EXTRACT(shift)\
L_ascendingExtract ## shift:\
    vld1.8  {q8},     [r1,:128]!   ;\
0:  vld1.8  {q9,q10}, [r1,:128]!   ;\
    vext.8  q0, q8, q9,  $(shift)  ;\
    vext.8  q1, q9, q10, $(shift)  ;\
    vmov    q8, q10                ;\
    vst1.8  {q0,q1},  [ip,:128]!   ;\
    subs    r3, $32                ;\
    bhs     0b                     ;\
    sub     r1, $16                ;\
    b       L_ascendingEdgingExit

ASCENDING_EXTRACT(0x1)              // Otherwise, we use the loop implemented in
ASCENDING_EXTRACT(0x2)              //  the above macro.  It loads 32 bytes per
ASCENDING_EXTRACT(0x3)              //  iteration, combines them with the residual
ASCENDING_EXTRACT(0x4)              //  bytes from the previous iteration, and
ASCENDING_EXTRACT(0x5)              //  uses the VEXT instruction to extract 32
ASCENDING_EXTRACT(0x6)              //  bytes that can be stored to a 16-byte
ASCENDING_EXTRACT(0x7)              //  aligned location in the destination buffer.
ASCENDING_EXTRACT(0x8)              //  This continues until 32 or fewer bytes
ASCENDING_EXTRACT(0x9)              //  remain to be copied.  This is significantly
ASCENDING_EXTRACT(0xa)              //  faster than using misaligned loads and
ASCENDING_EXTRACT(0xb)              //  stores, which are very inefficient on
ASCENDING_EXTRACT(0xc)              //  Swift.
ASCENDING_EXTRACT(0xd)
ASCENDING_EXTRACT(0xe)
ASCENDING_EXTRACT(0xf)
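
/*
 * A portable C sketch of the extraction technique in the macro above
 * (illustrative only; the real code keeps the 48-byte window in q8-q10 and
 * does the shifted extraction with VEXT instead of memcpy):
 *
 *      #include <stddef.h>
 *      #include <string.h>
 *
 *      // Copy n32 * 32 bytes to 16-byte-aligned dst from a 16-byte-aligned
 *      // src_line, where the wanted data begins shift bytes into the first
 *      // line (1 <= shift <= 15).
 *      static void extract_copy(unsigned char *dst,
 *                               const unsigned char *src_line,
 *                               size_t shift, size_t n32) {
 *          unsigned char win[48];
 *          memcpy(win, src_line, 16);          // residual line (q8)
 *          src_line += 16;
 *          while (n32--) {
 *              memcpy(win + 16, src_line, 32); // next two lines (q9,q10)
 *              memcpy(dst, win + shift, 32);   // the VEXT extraction
 *              memcpy(win, win + 32, 16);      // carry last line forward (vmov)
 *              src_line += 32;
 *              dst += 32;
 *          }
 *      }
 */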

L_ascendingEdgingExit:
    add     r1,     lr              // Restore the source pointer
    add     r2,     r3, #32         // Restore the length
L_ascendingShort:
    subs    r2,     #1              // Copy one byte at a time until the buffer
    itt     hs                      //  is exhausted, then return.
    ldrbhs  r3,    [r1],#1
    strbhs  r3,    [ip],#1
    bhi     L_ascendingShort
L_exit:
    pop     {r7,pc}
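
/*
 * In C, the short-copy loop above is simply the following (a sketch; the
 * hs/hi conditions let the assembly handle a zero length without a pre-test):
 *
 *      #include <stddef.h>
 *
 *      static void byte_copy_up(unsigned char *d, const unsigned char *s,
 *                               size_t n) {
 *          while (n--) *d++ = *s++;
 *      }
 */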

/*****************************************************************************
 *  Descending copy                                                          *
 *****************************************************************************/

L_descendingCopy:
    add     r1,     r2              // Advance source and destination pointers to
    add     ip,     r2              //  the end of the buffer.

    subs    r3,     r2, #32         // If length < 32, jump to a dedicated code
    blo     L_descendingShort       //  path for short buffers.

    orr     lr,     r0, r1          // If the length is not a multiple of 16, or
    orr     lr,     r2              //  either buffer is not 16-byte aligned, then
    ands    lr,     #0xf            //  some edging is needed; jump to a separate
    bne     L_descendingEdging      //  branch to handle it.

/*****************************************************************************
 *  Descending vector aligned copy                                           *
 *****************************************************************************/

0:  sub     r1,     #32             // Copies 32 bytes (16-byte aligned) from
    vld1.8  {q0,q1}, [r1,:128]      //  source to destination on each pass through
    sub     ip,     #32             //  the loop.  The loop ends when 32 or fewer
    vst1.8  {q0,q1}, [ip,:128]      //  bytes remain to be copied.
    subs    r3,     #32
    bhi     0b
    add     r3,     #32             // Copy the remaining up-to-32 bytes.
    sub     r1,     r3              //  This copy may overlap the copy performed
    vld1.8  {q0,q1}, [r1,:128]      //  in the final iteration through the
    sub     ip,     r3              //  previous loop, but this is more efficient
    vst1.8  {q0,q1}, [ip,:128]      //  than figuring out exactly which bytes
    pop     {r7,pc}                 //  need to be copied.
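
/*
 * The descending block loop and its tail, sketched in C (illustrative only;
 * on entry d and s point one past the ends of their buffers and n >= 32).
 * The final block may re-copy bytes the loop already wrote, which is
 * harmless; for simplicity this sketch assumes the buffers are disjoint.
 *
 *      #include <string.h>
 *
 *      static void copy_down32(unsigned char *d, const unsigned char *s,
 *                              size_t n) {
 *          while (n > 32) {
 *              d -= 32; s -= 32; n -= 32;
 *              memcpy(d, s, 32);
 *          }
 *          memcpy(d - n, s - n, 32);           // final block ends the copy
 *      }
 */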

/*****************************************************************************
 *  Descending vector misaligned copy                                        *
 *****************************************************************************/

L_descendingEdging:
    tst     ip,     #0xf            // Identical to how we handle misalignment for
    itttt   ne                      //  ascending copies.  First we move one byte
    ldrbne  r3,    [r1,#-1]!        //  at a time until the destination has
    strbne  r3,    [ip,#-1]!        //  16-byte alignment.
    subne   r2,     #1
    bne     L_descendingEdging

    and     lr,     r1, #0xf        // Then we extract the alignment of the source
    bic     r1,     #0xf            //  buffer and use a jump table to dispatch
    subs    r3,     r2, #32         //  into code that does the appropriate
    blo     L_descendingEdgingExit  //  software alignment fixup.
    tbh    [pc, lr, lsl #1]
0:
.short (L_descendingExtract0x0-0b)/2
.short (L_descendingExtract0x1-0b)/2
.short (L_descendingExtract0x2-0b)/2
.short (L_descendingExtract0x3-0b)/2
.short (L_descendingExtract0x4-0b)/2
.short (L_descendingExtract0x5-0b)/2
.short (L_descendingExtract0x6-0b)/2
.short (L_descendingExtract0x7-0b)/2
.short (L_descendingExtract0x8-0b)/2
.short (L_descendingExtract0x9-0b)/2
.short (L_descendingExtract0xa-0b)/2
.short (L_descendingExtract0xb-0b)/2
.short (L_descendingExtract0xc-0b)/2
.short (L_descendingExtract0xd-0b)/2
.short (L_descendingExtract0xe-0b)/2
.short (L_descendingExtract0xf-0b)/2

L_descendingExtract0x0:             // If the two buffers are similarly aligned,
    sub     r1,     #32             //  we have a fast path identical to the
    vld1.8  {q0,q1}, [r1,:128]      //  aligned copy loop.
    sub     ip,     #32
    vst1.8  {q0,q1}, [ip,:128]
    subs    r3,     #32
    bhs     L_descendingExtract0x0
    b       L_descendingEdgingExit

#define DESCENDING_EXTRACT(shift)\
L_descendingExtract ## shift:\
    vld1.8  {q10},    [r1,:128]    ;\
0:  sub     r1,     #32            ;\
    vld1.8  {q8,q9},  [r1,:128]    ;\
    vext.8  q1, q9, q10, $(shift)  ;\
    vext.8  q0, q8, q9,  $(shift)  ;\
    vmov    q10, q8                ;\
    sub     ip,     #32            ;\
    vst1.8  {q0,q1},  [ip,:128]    ;\
    subs    r3, $32                ;\
    bhs     0b                     ;\
    b       L_descendingEdgingExit

DESCENDING_EXTRACT(0x1)             // Otherwise, we use the loop above (almost
DESCENDING_EXTRACT(0x2)             //  identical to the one we use in the
DESCENDING_EXTRACT(0x3)             //  ascending copy case).
DESCENDING_EXTRACT(0x4)
DESCENDING_EXTRACT(0x5)
DESCENDING_EXTRACT(0x6)
DESCENDING_EXTRACT(0x7)
DESCENDING_EXTRACT(0x8)
DESCENDING_EXTRACT(0x9)
DESCENDING_EXTRACT(0xa)
DESCENDING_EXTRACT(0xb)
DESCENDING_EXTRACT(0xc)
DESCENDING_EXTRACT(0xd)
DESCENDING_EXTRACT(0xe)
DESCENDING_EXTRACT(0xf)

L_descendingEdgingExit:
    add     r1,     lr              // Restore the source pointer
    add     r2,     r3, #32         // Restore the length
L_descendingShort:
    subs    r2,     #1              // Byte-by-byte copy loop for short overlapping
    itt     hs                      //  buffers.
    ldrbhs  r3,    [r1,#-1]!
    strbhs  r3,    [ip,#-1]!
    bhi     L_descendingShort
    pop     {r7,pc}

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD