/*
 * Copyright (c) 2010 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the Cortex-A9 processor:
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 * All three copy n successive bytes from source to destination. memmove and
 * memcpy return destination, whereas bcopy has no return value. Copying takes
 * place as if it were through a temporary buffer -- after return, destination
 * contains exactly the bytes from source, even if the buffers overlap (this is
 * not required of memcpy by the C standard; its behavior is undefined if the
 * buffers overlap, but we are holding ourselves to the historical behavior of
 * this function on OS X and iOS).
 */

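//  Illustrative usage from C (a sketch of the caller's view, not part of
//  this file):
//
//      char buf[8] = "abcdefg";
//      memmove(buf + 1, buf, 6);    // overlapping copy; buf is now "aabcdef"
//      bcopy(buf, buf + 1, 6);      // same semantics, source argument first
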
#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

#define A9_ENTRY(name) \
    .align 2;\
    .globl _ ## name ## $VARIANT$CortexA9;\
    _ ## name ## $VARIANT$CortexA9:

#define ESTABLISH_FRAME \
    push    {r0,r4,r7,lr};\
    add     r7,     sp, #8

#define CLEAR_FRAME_AND_RETURN \
    pop     {r0,r4,r7,pc}
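
//  Note: ESTABLISH_FRAME saves r0 -- the destination pointer, which is also
//  the return value of memcpy and memmove -- along with r4, r7, and lr, and
//  points r7 at the saved frame pointer slot to maintain the frame chain.
//  CLEAR_FRAME_AND_RETURN pops the original destination back into r0 and
//  returns by popping lr into pc, so every exit path restores the correct
//  return value with no additional instructions.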

#define ADDITIONAL_CALLEE_SAVE_REGISTERS {r5,r6,r8,r10}

#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r12}

/*****************************************************************************
 *  entry points                                                             *
 *****************************************************************************/

.text
.syntax unified
.code 32

A9_ENTRY(bcopy)
//  Translate bcopy calls into memcpy calls by swapping the first and second
//  arguments.
    mov     r3,     r0
    mov     r0,     r1
    mov     r1,     r3

A9_ENTRY(memcpy)
A9_ENTRY(memmove)
//  Our preference is to copy the data in ascending address order, but if the
//  buffers overlap such that the beginning of the destination buffer aliases
//  the end of the source buffer, we need to copy in descending address order
//  instead to preserve the memmove semantics. We detect this case with the
//  test:
//
//      destination - source < length   (unsigned compare)
//
//  If the address of the source buffer is higher than the address of the
//  destination buffer, this arithmetic can overflow, but the overflowed value
//  can only be smaller than length if the buffers do not overlap, so we don't
//  need to worry about false positives due to the overflow (they happen, but
//  only in cases where copying in either order is correct).
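//
//  Worked example: with destination 0x1000, source 0x2000, length 0x100,
//  destination - source wraps to 0xfffff000, which is not below 0x100, so the
//  ascending copy runs (safe, since the source lies entirely above the
//  destination). With destination 0x1080, source 0x1000, length 0x100, the
//  difference is 0x80 < 0x100, so the descending copy is chosen because the
//  start of the destination aliases the tail of the source.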
    subs    r3,     r0, r1
    bxeq    lr
    ESTABLISH_FRAME
    cmp     r3,     r2
    blo     L_descendingCopy

/*****************************************************************************
 *  ascending copy                                                           *
 *****************************************************************************/

//  The layout of the two buffers is such that we can use our preferred
//  (ascending address order) copy implementation. Throughout this copy,
//  registers are used as follows:
//
//      r0      lowest unwritten address in the destination buffer.
//      r1      lowest unread address in the source buffer.
//      r2      number of bytes remaining to copy less an offset that varies
//              with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//              temporary registers used to hold the data during copies.
//      r12     also used as a scratch register for alignment / length
//              calculations.

L_ascendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return. Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment.
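//
//  The cmp against 2 steers the conditional byte copies: one byte always
//  moves, a second when (destination & 3) <= 2, and a third when
//  (destination & 3) == 1, for a total of 4 - (destination & 3) bytes. The
//  add of ip followed by the second subs leaves r2 once again holding the
//  number of bytes remaining to copy, less four.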
    subs    r2,     #4
    blo     L_ascendingLengthLessThanFour
    ands    ip,     r0, #0x3
    beq     L_ascendingDestinationWordAligned
    ldrb    r3,    [r1],#1
    cmp     ip,     #2
    ldrbls  r4,    [r1],#1
    strb    r3,    [r0],#1
    ldrblo  r3,    [r1],#1
    add     r2,     ip
    strbls  r4,    [r0],#1
    strblo  r3,    [r0],#1
    subs    r2,     #4
    bhs     L_ascendingDestinationWordAligned

L_ascendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment. This path
//  is used when fewer than four bytes remain to be copied: either the
//  original length was smaller than four, or only a few bytes were left
//  after the alignment copy above.
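//
//  r2 holds (bytes remaining - 4) here, but subtracting four does not
//  disturb the low two bits. The lsls shifts bit 1 of r2 into the carry flag
//  and bit 0 into the sign flag, so the CS-conditional pair copies two bytes
//  and the MI-conditional load/store copies one, covering any residue of
//  zero to three bytes without further branches.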
    lsls    ip,     r2, #31
    ldrbcs  r3,    [r1],#1
    ldrbcs  ip,    [r1],#1
    ldrbmi  r4,    [r1]
    strbcs  r3,    [r0],#1
    strbcs  ip,    [r0],#1
    strbmi  r4,    [r0]
    CLEAR_FRAME_AND_RETURN

L_ascendingDestinationWordAligned:
//  We know that the destination has word alignment. If the source is not
//  similarly aligned, jump to an unaligned copy loop.
    tst     r1,     #0x3
    bne     L_ascendingUnalignedCopy

/*****************************************************************************
 *  ascending copy, both buffers have word alignment                         *
 *****************************************************************************/

//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
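//
//  r2 already holds (bytes remaining - 4) on entry here, so subtracting
//  0x3c leaves it at (bytes remaining - 0x40); the loop below keeps that
//  invariant while word-copying until the destination reaches a 32-byte
//  (cacheline) boundary.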
    subs    r2,     r2, #0x3c
    blo     L_ascendingLengthLessThanSixtyFour
0:  tst     r0,     #0x1c
    beq     L_ascendingDestinationCachelineAligned
    ldr     r3,    [r1],#4
    subs    r2,     #4
    str     r3,    [r0],#4
    bhs     0b
    b       L_ascendingLengthLessThanSixtyFour

L_ascendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that 0x60 is the optimal lookahead for preload,
//  though anything between 0x40 and 0x100 seems to be "acceptable".
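//
//  COPY_REGISTERS names eight registers, so each ldm/stm pair below moves 32
//  bytes (one cacheline) and two pairs move the full 64 bytes per iteration;
//  r5, r6, r8, and r10 are pushed here only so the register list can be that
//  wide. None of the pld/ldm/stm instructions set flags, so the subs result
//  survives to the bhs at the bottom of the loop.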
    push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:  ldm     r1!,    COPY_REGISTERS
    subs    r2,     r2, #0x40
    stm     r0!,    COPY_REGISTERS
    pld     [r1, #0x60]
    ldm     r1!,    COPY_REGISTERS
    pld     [r1, #0x60]
    stm     r0!,    COPY_REGISTERS
    bhs     0b
    pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

L_ascendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes. We can assume that both the source and
//  destination addresses have word alignment here.
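//
//  r2 holds (bytes remaining - 0x40) at this point; that subtraction only
//  borrows from bit 6 upward, so testing #0x30 and #0xf against r2 still
//  reads the true remainder below 64.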
    tst     r2,     #0x30
    beq     1f
0:  ldm     r1!,   {r3,r4,r9,ip}
    sub     r2,     r2, #0x10
    stm     r0!,   {r3,r4,r9,ip}
    tst     r2,     #0x30
    bne     0b
1:  tst     r2,     #0xf
    beq     2f
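//  Same flag trick as the short-copy path, two bits at a time: lsls #29 puts
//  bit 3 of the remainder in the carry flag and bit 2 in the sign flag,
//  copying eight then four bytes; lsls #31 does the same for bits 1 and 0,
//  copying a halfword then a byte. Any residue of one to fifteen bytes is
//  thus handled without branching.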
    lsls    ip,     r2, #29
    ldmcs   r1!,   {r3,ip}
    stmcs   r0!,   {r3,ip}
    ldrmi   r3,    [r1],#4
    strmi   r3,    [r0],#4
    lsls    ip,     r2, #31
    ldrhcs  r3,    [r1],#2
    strhcs  r3,    [r0],#2
    ldrbmi  ip,    [r1]
    strbmi  ip,    [r0]
2:  CLEAR_FRAME_AND_RETURN

/*****************************************************************************
 *  ascending copy, source buffer is not word aligned                        *
 *****************************************************************************/

L_ascendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not. Copy
//  byte-by-byte until the destination buffer has eight-byte alignment.
    subs    r2,     #4
    blo     L_ascendingUnalignedByteCleanup
0:  tst     r0,     #0x7
    beq     L_ascendingUnalignedVectorCopy
    ldrb    r3,    [r1],#1
    subs    r2,     #1
    strb    r3,    [r0],#1
    bhs     0b
L_ascendingUnalignedByteCleanup:
    adds    r2,     #8
    beq     1f
0:  ldrb    r3,    [r1],#1
    subs    r2,     #1
    strb    r3,    [r0],#1
    bne     0b
1:  CLEAR_FRAME_AND_RETURN

L_ascendingUnalignedVectorCopy:
//  Destination buffer is eight-byte aligned. Source buffer has unknown
//  alignment. Use NEON to handle the misaligned copies. We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
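//
//  The :64 and :256 qualifiers on the vst1.8 instructions assert 64-bit and
//  256-bit alignment of the destination address, which the preceding
//  alignment loops guarantee; the vld1.8 loads carry no qualifier because
//  the source may have arbitrary alignment.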
    subs    r2,     #0x18
    blo     L_ascendingUnalignedVectorCleanup
0:  tst     r0,     #0x18
    beq     L_ascendingUnalignedCachelineCopy
    vld1.8  {d0},  [r1]!
    subs    r2,     #8
    vst1.8  {d0},  [r0,:64]!
    bhs     0b
L_ascendingUnalignedVectorCleanup:
    adds    r2,     #0x18
    blo     L_ascendingUnalignedByteCleanup
0:  vld1.8  {d0},  [r1]!
    subs    r2,     #8
    vst1.8  {d0},  [r0,:64]!
    bhs     0b
    b       L_ascendingUnalignedByteCleanup

L_ascendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
//  of the source address.
    vld1.8  {q0,q1},[r1]!
    pld     [r1, #0x60]
    vst1.8  {q0,q1},[r0,:256]!
    subs    r2,     #0x20
    bhs     L_ascendingUnalignedCachelineCopy
    b       L_ascendingUnalignedVectorCleanup

/*****************************************************************************
 *  descending copy                                                          *
 *****************************************************************************/

//  The layout of the two buffers is such that we must copy in descending-
//  address order. Throughout this copy, registers are used as follows:
//
//      r0      lowest address in the destination buffer that has been
//              written to.
//      r1      lowest address in the source buffer that has been read from.
//      r2      number of bytes remaining to copy less an offset that varies
//              with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//              temporary registers used to hold the data during copies.
//      r12     also used as a scratch register for alignment / length
//              calculations.

L_descendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return. Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment.
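//
//  The two adds move r0 and r1 to one past the ends of their buffers; all of
//  the descending loops then step downward with pre-decrement addressing
//  ([rN, #-k]! or the db variants of ldm/stm).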
    add     r1,     r2
    add     r0,     r2
    subs    r2,     #4
    blo     L_descendingLengthLessThanFour
    ands    ip,     r0, #0x3
    beq     L_descendingDestinationWordAligned
    ldrb    r3,    [r1, #-1]!
    cmp     ip,     #2
    ldrbhs  r4,    [r1, #-1]!
    strb    r3,    [r0, #-1]!
    ldrbhi  r3,    [r1, #-1]!
    strbhs  r4,    [r0, #-1]!
    strbhi  r3,    [r0, #-1]!
    subs    r2,     ip
    bhs     L_descendingDestinationWordAligned

L_descendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment. This path
//  is used when fewer than four bytes remain to be copied: either the
//  original length was smaller than four, or only a few bytes were left
//  after the alignment copy above.
    lsls    ip,     r2, #31
    ldrbcs  r3,    [r1, #-1]!
    ldrbcs  ip,    [r1, #-1]!
    ldrbmi  r4,    [r1, #-1]
    strbcs  r3,    [r0, #-1]!
    strbcs  ip,    [r0, #-1]!
    strbmi  r4,    [r0, #-1]
    CLEAR_FRAME_AND_RETURN

L_descendingDestinationWordAligned:
//  We know that the destination has word alignment. If the source is not
//  similarly aligned, jump to an unaligned copy loop.
    tst     r1,     #0x3
    bne     L_descendingUnalignedCopy

/*****************************************************************************
 *  descending copy, both buffers have word alignment                        *
 *****************************************************************************/

//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
    subs    r2,     r2, #0x3c
    blo     L_descendingLengthLessThanSixtyFour
0:  tst     r0,     #0x1c
    beq     L_descendingDestinationCachelineAligned
    ldr     r3,    [r1, #-4]!
    subs    r2,     #4
    str     r3,    [r0, #-4]!
    bhs     0b
    b       L_descendingLengthLessThanSixtyFour

L_descendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that -0x80 is the optimal lookahead for preload,
//  though anything between -0x40 and -0x100 seems to be "acceptable".
    push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:  ldmdb   r1!,    COPY_REGISTERS
    subs    r2,     r2, #0x40
    stmdb   r0!,    COPY_REGISTERS
    pld     [r1, #-0x80]
    ldmdb   r1!,    COPY_REGISTERS
    pld     [r1, #-0x80]
    stmdb   r0!,    COPY_REGISTERS
    bhs     0b
    pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

L_descendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes. We can assume that both the source and
//  destination addresses have word alignment here.
    tst     r2,     #0x30
    beq     1f
0:  ldmdb   r1!,   {r3,r4,r9,ip}
    sub     r2,     r2, #0x10
    stmdb   r0!,   {r3,r4,r9,ip}
    tst     r2,     #0x30
    bne     0b
1:  tst     r2,     #0xf
    beq     2f
    lsls    ip,     r2, #29
    ldmdbcs r1!,   {r3,ip}
    stmdbcs r0!,   {r3,ip}
    ldrmi   r3,    [r1, #-4]!
    strmi   r3,    [r0, #-4]!
    lsls    ip,     r2, #31
    ldrhcs  r3,    [r1, #-2]!
    strhcs  r3,    [r0, #-2]!
    ldrbmi  ip,    [r1, #-1]
    strbmi  ip,    [r0, #-1]
2:  CLEAR_FRAME_AND_RETURN

/*****************************************************************************
 *  descending copy, source buffer is not word aligned                       *
 *****************************************************************************/

L_descendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not. Copy
//  byte-by-byte until the destination buffer has eight-byte alignment.
    subs    r2,     #4
    blo     L_descendingUnalignedByteCleanup
0:  tst     r0,     #0x7
    beq     L_descendingUnalignedVectorCopy
    ldrb    r3,    [r1, #-1]!
    subs    r2,     #1
    strb    r3,    [r0, #-1]!
    bhs     0b
L_descendingUnalignedByteCleanup:
    adds    r2,     #8
    beq     1f
0:  ldrb    r3,    [r1, #-1]!
    subs    r2,     #1
    strb    r3,    [r0, #-1]!
    bne     0b
1:  CLEAR_FRAME_AND_RETURN

L_descendingUnalignedVectorCopy:
//  Destination buffer is eight-byte aligned. Source buffer has unknown
//  alignment. Use NEON to handle the misaligned copies. We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
    subs    r2,     #0x18
    blo     L_descendingUnalignedVectorCleanup
0:  tst     r0,     #0x18
    beq     L_descendingUnalignedCachelineCopy
    sub     r1,     #8
    vld1.8  {d0},  [r1]
    sub     r0,     #8
    vst1.8  {d0},  [r0,:64]
    subs    r2,     #8
    bhs     0b
L_descendingUnalignedVectorCleanup:
    adds    r2,     #0x18
    blo     L_descendingUnalignedByteCleanup
0:  sub     r1,     #8
    vld1.8  {d0},  [r1]
    sub     r0,     #8
    vst1.8  {d0},  [r0,:64]
    subs    r2,     #8
    bhs     0b
    b       L_descendingUnalignedByteCleanup

L_descendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration. Requires only byte alignment
//  of the source address.
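//
//  vld1/vst1 writeback with '!' always advances the pointer upward by the
//  transfer size, so this loop instead post-indexes by r4 = -32 to walk
//  downward; r4 was saved by ESTABLISH_FRAME, so it is free to clobber here.
//  The initial subtraction of 32 converts the one-past-the-end pointers into
//  the base of the highest remaining 32-byte block, and the adds afterward
//  undo it for the cleanup path.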
    sub     r1,     #32
    sub     r0,     #32
    mov     r4,     #-32
0:  vld1.8  {q0,q1},[r1], r4
    pld     [r1, #-0x60]
    vst1.8  {q0,q1},[r0,:256], r4
    subs    r2,     #0x20
    bhs     0b
    add     r1,     #32
    add     r0,     #32
    b       L_descendingUnalignedVectorCleanup

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD