/*
 * Copyright (c) 2010 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the Cortex-A9 processor:
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 * All copy n successive bytes from source to destination. Memmove and memcpy
 * return destination, whereas bcopy has no return value. Copying takes place
 * as if it were through a temporary buffer -- after return, destination
 * contains exactly the bytes from source, even if the buffers overlap (this is
 * not required of memcpy by the C standard; its behavior is undefined if the
 * buffers overlap, but we are holding ourselves to the historical behavior of
 * this function on OS X and iOS).
 */
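
/*
 * For reference, a hedged C sketch of the "as if through a temporary buffer"
 * semantics described above (purely illustrative; not part of this file, and
 * the helper name "copy_via_temp" is ours, not Apple's):
 *
 *  #include <stddef.h>
 *  #include <stdlib.h>
 *
 *  void *copy_via_temp(void *destination, const void *source, size_t n) {
 *      unsigned char *tmp = malloc(n);                // error handling omitted
 *      const unsigned char *s = source;
 *      unsigned char *d = destination;
 *      for (size_t i = 0; i < n; ++i) tmp[i] = s[i];  // read all of source...
 *      for (size_t i = 0; i < n; ++i) d[i] = tmp[i];  // ...then write destination
 *      free(tmp);
 *      return destination;
 *  }
 */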

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

//  A9_ENTRY(name) defines the global symbol _name$VARIANT$CortexA9; for
//  example, A9_ENTRY(memcpy) defines _memcpy$VARIANT$CortexA9.
#define A9_ENTRY(name) \
    .align 2;\
    .globl _ ## name ## $VARIANT$CortexA9;\
    _ ## name ## $VARIANT$CortexA9:

//  ESTABLISH_FRAME saves r0 (the destination pointer) along with the usual
//  frame registers, so that CLEAR_FRAME_AND_RETURN pops the original
//  destination back into r0 as the return value of memcpy and memmove.
#define ESTABLISH_FRAME \
    push    {r0,r4,r7,lr};\
    add     r7, sp, #8

#define CLEAR_FRAME_AND_RETURN \
    pop     {r0,r4,r7,pc}

#define ADDITIONAL_CALLEE_SAVE_REGISTERS {r5,r6,r8,r10}

#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r12}

/*****************************************************************************
 *  entry points                                                             *
 *****************************************************************************/

.text
.syntax unified
.code 32

A9_ENTRY(bcopy)
//  Translate bcopy calls into memcpy calls by swapping the first and second
//  arguments.
    mov     r3, r0
    mov     r0, r1
    mov     r1, r3

A9_ENTRY(memcpy)
A9_ENTRY(memmove)
//  Our preference is to copy the data in ascending address order, but if the
//  buffers overlap such that the beginning of the destination buffer aliases
//  the end of the source buffer, we need to copy in descending address order
//  instead to preserve the memmove semantics. We detect this case with the
//  test:
//
//      destination - source < length (unsigned compare)
//
//  If the address of the source buffer is higher than the address of the
//  destination buffer, this arithmetic can overflow, but the overflowed value
//  can only be smaller than length if the buffers do not overlap, so we don't
//  need to worry about false positives due to the overflow (they happen, but
//  only in cases where copying in either order is correct).
    subs    r3, r0, r1          // r3 = destination - source
    bxeq    lr                  // destination == source; nothing to copy
    ESTABLISH_FRAME
    cmp     r3, r2              // if destination - source < length (unsigned),
    blo     L_descendingCopy    // the buffers overlap; copy backwards

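// A hedged C sketch of the overlap test above (illustrative only; the helper
// name "overlap_requires_descending" is ours, not part of this file):
//
//  #include <stddef.h>
//  #include <stdint.h>
//
//  static int overlap_requires_descending(void *dst, const void *src, size_t n) {
//      // Unsigned wraparound is well defined here: when src > dst the
//      // difference wraps to a huge value, which is only below n in cases
//      // where copying in either direction is correct anyway.
//      return (uintptr_t)dst - (uintptr_t)src < n;
//  }
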
/*****************************************************************************
 *  ascending copy                                                           *
 *****************************************************************************/

//  The layout of the two buffers is such that we can use our preferred
//  (ascending address order) copy implementation. Throughout this copy,
//  registers are used as follows:
//
//      r0      lowest unwritten address in the destination buffer.
//      r1      lowest unread address in the source buffer.
//      r2      number of bytes remaining to copy less an offset that varies
//              with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//              temporary registers used to hold the data during copies.
//      r12     also used as a scratch register for alignment / length
//              calculations.

L_ascendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return. Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment. ip holds the destination's misalignment (1, 2, or 3), so
//  4 - ip bytes are copied: the first ldrb/strb pair runs unconditionally,
//  the ls-conditioned pair runs when ip <= 2, and the lo-conditioned pair
//  runs only when ip == 1.
    subs    r2, #4
    blo     L_ascendingLengthLessThanFour
    ands    ip, r0, #0x3
    beq     L_ascendingDestinationWordAligned
    ldrb    r3, [r1],#1
    cmp     ip, #2
    ldrbls  r4, [r1],#1
    strb    r3, [r0],#1
    ldrblo  r3, [r1],#1
    add     r2, ip              // together with the subs below, r2 -= 4 - ip
    strbls  r4, [r0],#1
    strblo  r3, [r0],#1
    subs    r2, #4
    bhs     L_ascendingDestinationWordAligned

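// A hedged C sketch of the word-alignment prologue above (illustrative only):
//
//  size_t head = (uintptr_t)dst & 3;       // "ip" in the code above
//  if (head != 0) {                        // copy 4 - head bytes
//      for (size_t i = head; i < 4; ++i) {
//          *dst++ = *src++;
//          --length;
//      }
//  }
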
L_ascendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment. This is
//  only used if the original length of the buffer is smaller than four.
//  Subtracting four left the low two bits of r2 unchanged, so the lsls moves
//  bit 1 of the length into the carry flag and bit 0 into the sign flag; the
//  cs-conditioned pairs copy two bytes and the mi-conditioned pair copies one
//  more (e.g. length == 3 copies 2 + 1 bytes).
    lsls    ip, r2, #31
    ldrbcs  r3, [r1],#1
    ldrbcs  ip, [r1],#1
    ldrbmi  r4, [r1]
    strbcs  r3, [r0],#1
    strbcs  ip, [r0],#1
    strbmi  r4, [r0]
    CLEAR_FRAME_AND_RETURN

L_ascendingDestinationWordAligned:
//  We know that the destination has word alignment. If the source is not
//  similarly aligned, jump to an unaligned copy loop.
    tst     r1, #0x3
    bne     L_ascendingUnalignedCopy

/*****************************************************************************
 *  ascending copy, both buffers have word alignment                         *
 *****************************************************************************/

//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
    subs    r2, r2, #0x3c
    blo     L_ascendingLengthLessThanSixtyFour
0:  tst     r0, #0x1c
    beq     L_ascendingDestinationCachelineAligned
    ldr     r3, [r1],#4
    subs    r2, #4
    str     r3, [r0],#4
    bhs     0b
    b       L_ascendingLengthLessThanSixtyFour

L_ascendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that 0x60 is the optimal lookahead for preload,
//  though anything between 0x40 and 0x100 seems to be "acceptable".
    push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:  ldm     r1!, COPY_REGISTERS
    subs    r2, r2, #0x40
    stm     r0!, COPY_REGISTERS
    pld     [r1, #0x60]
    ldm     r1!, COPY_REGISTERS
    pld     [r1, #0x60]
    stm     r0!, COPY_REGISTERS
    bhs     0b
    pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

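// A hedged C analogue of the unrolled loop above (illustrative only;
// __builtin_prefetch is the GCC/Clang counterpart of pld, and the helper
// name "copy_cachelines" is ours):
//
//  #include <stddef.h>
//  #include <stdint.h>
//
//  static void copy_cachelines(uint32_t *dst, const uint32_t *src, size_t words) {
//      // words is a multiple of 16; both pointers are cacheline aligned.
//      for (size_t i = 0; i < words; i += 16) {
//          __builtin_prefetch(&src[i + 24]);   // roughly 0x60 bytes ahead
//          for (size_t j = 0; j < 16; ++j)
//              dst[i + j] = src[i + j];
//      }
//  }
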
L_ascendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes. We can assume that both the source and
//  destination addresses have word alignment here. The lsls instructions
//  move bits of the remaining length into the flags: the first shift puts
//  bit 3 into C and bit 2 into N (selecting eight- and four-byte copies),
//  and the second puts bit 1 into C and bit 0 into N (two- and one-byte
//  copies).
    tst     r2, #0x30
    beq     1f
0:  ldm     r1!, {r3,r4,r9,ip}
    sub     r2, r2, #0x10
    stm     r0!, {r3,r4,r9,ip}
    tst     r2, #0x30
    bne     0b
1:  tst     r2, #0xf
    beq     2f
    lsls    ip, r2, #29
    ldmcs   r1!, {r3,ip}
    stmcs   r0!, {r3,ip}
    ldrmi   r3, [r1],#4
    strmi   r3, [r0],#4
    lsls    ip, r2, #31
    ldrhcs  r3, [r1],#2
    strhcs  r3, [r0],#2
    ldrbmi  ip, [r1]
    strbmi  ip, [r0]
2:  CLEAR_FRAME_AND_RETURN

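// A hedged C sketch of the bit-testing cleanup above (illustrative only;
// the helper name "cleanup_copy" is ours):
//
//  #include <string.h>
//
//  static void cleanup_copy(unsigned char *dst, const unsigned char *src, size_t n) {
//      // n < 64; copy 16-byte blocks, then 8/4/2/1 as the length bits dictate.
//      while (n & 0x30) { memcpy(dst, src, 16); dst += 16; src += 16; n -= 16; }
//      if (n & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }
//      if (n & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
//      if (n & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
//      if (n & 1) { *dst = *src; }
//  }
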
/*****************************************************************************
 *  ascending copy, source buffer is not word aligned                        *
 *****************************************************************************/

L_ascendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not. Copy
//  byte-by-byte until the destination buffer has eight-byte alignment.
    subs    r2, #4
    blo     L_ascendingUnalignedByteCleanup
0:  tst     r0, #0x7
    beq     L_ascendingUnalignedVectorCopy
    ldrb    r3, [r1],#1
    subs    r2, #1
    strb    r3, [r0],#1
    bhs     0b
L_ascendingUnalignedByteCleanup:
    adds    r2, #8
    beq     1f
0:  ldrb    r3, [r1],#1
    subs    r2, #1
    strb    r3, [r0],#1
    bne     0b
1:  CLEAR_FRAME_AND_RETURN

L_ascendingUnalignedVectorCopy:
//  Destination buffer is eight-byte aligned. Source buffer has unknown
//  alignment. Use NEON to handle the misaligned copies. We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
    subs    r2, #0x18
    blo     L_ascendingUnalignedVectorCleanup
0:  tst     r0, #0x18
    beq     L_ascendingUnalignedCachelineCopy
    vld1.8  {d0}, [r1]!
    subs    r2, #8
    vst1.8  {d0}, [r0,:64]!
    bhs     0b
L_ascendingUnalignedVectorCleanup:
    adds    r2, #0x18
    blo     L_ascendingUnalignedByteCleanup
0:  vld1.8  {d0}, [r1]!
    subs    r2, #8
    vst1.8  {d0}, [r0,:64]!
    bhs     0b
    b       L_ascendingUnalignedByteCleanup

L_ascendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
//  of the source address.
    vld1.8  {q0,q1}, [r1]!
    pld     [r1, #0x60]
    vst1.8  {q0,q1}, [r0,:256]!
    subs    r2, #0x20
    bhs     L_ascendingUnalignedCachelineCopy
    b       L_ascendingUnalignedVectorCleanup

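// A hedged C sketch of the NEON copy above, using arm_neon.h intrinsics
// (illustrative only; the helper name "neon_copy" is ours -- it assumes dst
// is 32-byte aligned and n is a multiple of 32, while vld1q_u8 places no
// alignment requirement on src):
//
//  #include <stddef.h>
//  #include <arm_neon.h>
//
//  static void neon_copy(uint8_t *dst, const uint8_t *src, size_t n) {
//      for (size_t i = 0; i < n; i += 32) {
//          uint8x16_t lo = vld1q_u8(src + i);
//          uint8x16_t hi = vld1q_u8(src + i + 16);
//          vst1q_u8(dst + i, lo);
//          vst1q_u8(dst + i + 16, hi);
//      }
//  }
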
/*****************************************************************************
 *  descending copy                                                          *
 *****************************************************************************/

//  The layout of the two buffers is such that we must copy in descending-
//  address order. Throughout this copy, registers are used as follows:
//
//      r0      lowest address in the destination buffer that has been written to.
//      r1      lowest address in the source buffer that has been read from.
//      r2      number of bytes remaining to copy less an offset that varies
//              with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//              temporary registers used to hold the data during copies.
//      r12     also used as a scratch register for alignment / length
//              calculations.

L_descendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return. Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment. First, both pointers are advanced to one past the end of
//  their buffers; ip then holds the number of bytes (1, 2, or 3) needed to
//  bring the destination down to word alignment: the first ldrb/strb pair
//  runs unconditionally, the hs-conditioned pair runs when ip >= 2, and the
//  hi-conditioned pair runs only when ip == 3.
    add     r1, r2              // point one past the end of the source
    add     r0, r2              // point one past the end of the destination
    subs    r2, #4
    blo     L_descendingLengthLessThanFour
    ands    ip, r0, #0x3
    beq     L_descendingDestinationWordAligned
    ldrb    r3, [r1, #-1]!
    cmp     ip, #2
    ldrbhs  r4, [r1, #-1]!
    strb    r3, [r0, #-1]!
    ldrbhi  r3, [r1, #-1]!
    strbhs  r4, [r0, #-1]!
    strbhi  r3, [r0, #-1]!
    subs    r2, ip
    bhs     L_descendingDestinationWordAligned

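// A hedged C sketch of the descending setup and head copy above
// (illustrative only):
//
//  dst += length;  src += length;          // one past the end of each buffer
//  size_t head = (uintptr_t)dst & 3;       // "ip" in the code above
//  while (head--) {                        // copy down to word alignment
//      *--dst = *--src;
//      --length;
//  }
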
L_descendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment. This is
//  only used if the original length of the buffer is smaller than four. As
//  in the ascending case, the lsls moves bit 1 of the length into the carry
//  flag and bit 0 into the sign flag to select the conditional copies.
    lsls    ip, r2, #31
    ldrbcs  r3, [r1, #-1]!
    ldrbcs  ip, [r1, #-1]!
    ldrbmi  r4, [r1, #-1]
    strbcs  r3, [r0, #-1]!
    strbcs  ip, [r0, #-1]!
    strbmi  r4, [r0, #-1]
    CLEAR_FRAME_AND_RETURN

L_descendingDestinationWordAligned:
//  We know that the destination has word alignment. If the source is not
//  similarly aligned, jump to an unaligned copy loop.
    tst     r1, #0x3
    bne     L_descendingUnalignedCopy

/*****************************************************************************
 *  descending copy, both buffers have word alignment                        *
 *****************************************************************************/

//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
    subs    r2, r2, #0x3c
    blo     L_descendingLengthLessThanSixtyFour
0:  tst     r0, #0x1c
    beq     L_descendingDestinationCachelineAligned
    ldr     r3, [r1, #-4]!
    subs    r2, #4
    str     r3, [r0, #-4]!
    bhs     0b
    b       L_descendingLengthLessThanSixtyFour

L_descendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that -0x80 is the optimal lookahead for preload,
//  though anything between -0x40 and -0x100 seems to be "acceptable".
    push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:  ldmdb   r1!, COPY_REGISTERS
    subs    r2, r2, #0x40
    stmdb   r0!, COPY_REGISTERS
    pld     [r1, #-0x80]
    ldmdb   r1!, COPY_REGISTERS
    pld     [r1, #-0x80]
    stmdb   r0!, COPY_REGISTERS
    bhs     0b
    pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

L_descendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes. We can assume that both the source and
//  destination addresses have word alignment here. The lsls flag tricks
//  mirror those in the ascending cleanup path.
    tst     r2, #0x30
    beq     1f
0:  ldmdb   r1!, {r3,r4,r9,ip}
    sub     r2, r2, #0x10
    stmdb   r0!, {r3,r4,r9,ip}
    tst     r2, #0x30
    bne     0b
1:  tst     r2, #0xf
    beq     2f
    lsls    ip, r2, #29
    ldmdbcs r1!, {r3,ip}
    stmdbcs r0!, {r3,ip}
    ldrmi   r3, [r1, #-4]!
    strmi   r3, [r0, #-4]!
    lsls    ip, r2, #31
    ldrhcs  r3, [r1, #-2]!
    strhcs  r3, [r0, #-2]!
    ldrbmi  ip, [r1, #-1]
    strbmi  ip, [r0, #-1]
2:  CLEAR_FRAME_AND_RETURN

/*****************************************************************************
 *  descending copy, source buffer is not word aligned                       *
 *****************************************************************************/

L_descendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not. Copy
//  byte-by-byte until the destination buffer has eight-byte alignment.
    subs    r2, #4
    blo     L_descendingUnalignedByteCleanup
0:  tst     r0, #0x7
    beq     L_descendingUnalignedVectorCopy
    ldrb    r3, [r1, #-1]!
    subs    r2, #1
    strb    r3, [r0, #-1]!
    bhs     0b
L_descendingUnalignedByteCleanup:
    adds    r2, #8
    beq     1f
0:  ldrb    r3, [r1, #-1]!
    subs    r2, #1
    strb    r3, [r0, #-1]!
    bne     0b
1:  CLEAR_FRAME_AND_RETURN

L_descendingUnalignedVectorCopy:
//  Destination buffer is eight-byte aligned. Source buffer has unknown
//  alignment. Use NEON to handle the misaligned copies. We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
//  vld1/vst1 cannot pre-decrement, so each pointer is explicitly stepped
//  back by eight bytes before the transfer.
    subs    r2, #0x18
    blo     L_descendingUnalignedVectorCleanup
0:  tst     r0, #0x18
    beq     L_descendingUnalignedCachelineCopy
    sub     r1, #8
    vld1.8  {d0}, [r1]
    sub     r0, #8
    vst1.8  {d0}, [r0,:64]
    subs    r2, #8
    bhs     0b
L_descendingUnalignedVectorCleanup:
    adds    r2, #0x18
    blo     L_descendingUnalignedByteCleanup
0:  sub     r1, #8
    vld1.8  {d0}, [r1]
    sub     r0, #8
    vst1.8  {d0}, [r0,:64]
    subs    r2, #8
    bhs     0b
    b       L_descendingUnalignedByteCleanup

L_descendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
//  of the source address. r4 (preserved by ESTABLISH_FRAME) holds a stride of
//  -32 for the post-indexed loads and stores; both pointers are pre-biased by
//  32 bytes before the loop and restored afterwards.
    sub     r1, #32
    sub     r0, #32
    mov     r4, #-32
0:  vld1.8  {q0,q1}, [r1], r4
    pld     [r1, #-0x60]
    vst1.8  {q0,q1}, [r0,:256], r4
    subs    r2, #0x20
    bhs     0b
    add     r1, #32
    add     r0, #32
    b       L_descendingUnalignedVectorCleanup

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD