/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*****************************************************************************
 * Cortex-A8 implementation                                                  *
 *****************************************************************************/
// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression. Thus, we detect that particular case, and
// pass those copies through the ARM core registers. All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009
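//
// As a rough illustration of that dispatch policy, the decision reduces to
// the following C sketch (an illustration only; the thresholds restate the
// comment above, and in the real code they are applied after the alignment
// handling, not up front):
//
//     #include <stddef.h>
//     enum copy_path { PATH_SCALAR, PATH_CORE_REGISTERS, PATH_NEON };
//     static enum copy_path choose_path(size_t length) {
//         if (length <= 8)                      return PATH_SCALAR;
//         if (length >= 1024 && length < 32768) return PATH_CORE_REGISTERS;
//         return PATH_NEON;
//     }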
// void bcopy(const void * source,
//            void * destination,
//            size_t length);
//
// void *memmove(void * destination,
//               const void * source,
//               size_t length);
//
// void *memcpy(void * restrict destination,
//              const void * restrict source,
//              size_t length);
//
// All three copy length successive bytes from source to destination. memmove
// and memcpy return destination, whereas bcopy has no return value. Copying
// takes place as if it were through a temporary buffer -- after return,
// destination contains exactly the bytes from source, even if the buffers
// overlap.
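//
// For reference, the relationship between the three entry points in C (a
// minimal sketch; the _ref suffix is mine, not a name used by this file):
//
//     #include <stddef.h>
//     #include <string.h>
//     // bcopy is memmove with the first two arguments swapped and no
//     // return value, which is why the bcopy entry below only shuffles
//     // registers before falling into memmove.
//     void bcopy_ref(const void *source, void *destination, size_t length) {
//         memmove(destination, source, length);
//     }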
        mov     r3, r0          // swap the first and second arguments
        mov     r0, r1          // and fall through into memmove
        mov     r1, r3          //
        subs    r3, r0, r1      // offset = destination addr - source addr
        it      eq              //
        bxeq    lr              // if source == destination, early out
// Our preference is for using a (faster) front-to-back copy. However, if
// 0 < offset < length, it is necessary to copy back-to-front for correctness.
// We have already ruled out offset == 0, so we can use an unsigned compare
// with length -- if offset is higher, offset is either greater than length
// or negative, and in either case a front-to-back copy is safe.
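//
// In C terms the direction test is a single unsigned comparison (a sketch;
// the pointer arithmetic is written with uintptr_t to keep it well defined):
//
//     #include <stdint.h>
//     #include <stddef.h>
//     static int must_copy_backwards(void *dst, const void *src, size_t length) {
//         // offset wraps to a huge unsigned value when dst precedes src, so
//         // after the offset == 0 early-out one unsigned compare decides it.
//         size_t offset = (size_t)((uintptr_t)dst - (uintptr_t)src);
//         return offset != 0 && offset < length;
//     }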
/*****************************************************************************
 * back to front copy                                                        *
 *****************************************************************************/
        mov     ip, r0          // copy destination pointer.
        add     r1, r2          // move source pointer to end of source array
        add     ip, r2          // move destination pointer to end of dest array

        subs    r2, $8          // if length - 8 is negative (i.e. length
        blt     L_scalarReverseCopy // is less than 8), jump to cleanup path.
        tst     ip, $7          // if (destination + length) is doubleword
        beq     L_vectorReverseCopy // aligned, jump to fast path.
0:      ldrb    r3, [r1, $-1]!  // load byte
        sub     r2, $1          // decrement length
        strb    r3, [ip, $-1]!  // store byte
        tst     ip, $7          // test alignment
        bne     0b              //

        cmp     r2, $0          // if length - 8 is negative,
        blt     L_scalarReverseCopy // jump to the cleanup code
/*****************************************************************************
 * destination is doubleword aligned                                         *
 *****************************************************************************/
L_vectorReverseCopy:
        ands    r3, r1, $3      // Extract the alignment of the source
        tbh     [pc, r3, lsl $1] // Dispatch table on source alignment
0:
        .short (L_reverseAligned0-0b)/2 // The NEON alignment hardware does not work
        .short (L_reverseAligned1-0b)/2 // properly with sub 4-byte alignment and
        .short (L_reverseAligned2-0b)/2 // buffers that are uncacheable, so we need
        .short (L_reverseAligned3-0b)/2 // to have a software workaround.
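//
// The tbh above is a jump table indexed by the low two bits of the source
// address. A C sketch of the same dispatch (the handler names here are
// hypothetical stand-ins for the labels in the table):
//
//     #include <stdint.h>
//     extern void reverse_aligned0(void), reverse_aligned1(void),
//                 reverse_aligned2(void), reverse_aligned3(void);
//     static void (*const reverse_dispatch[4])(void) = {
//         reverse_aligned0, reverse_aligned1, reverse_aligned2, reverse_aligned3
//     };
//     static void dispatch_on_source_alignment(const void *src) {
//         reverse_dispatch[(uintptr_t)src & 3]();
//     }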
/*****************************************************************************
 * source is also at least word aligned                                      *
 *****************************************************************************/

L_reverseAligned0:
        subs    r2, $0x38       // if length - 64 is negative, jump to
        blt     L_reverseVectorCleanup // the cleanup path.
        tst     ip, $0x38       // if (destination + length) is cacheline
        beq     L_reverseCachelineAligned // aligned, jump to the fast path.
0:      sub     r1, $8          // copy eight bytes at a time until the
        vld1.32 {d0}, [r1]      // destination is 64-byte aligned.
        sub     ip, $8          //
        sub     r2, $8          //
        vst1.64 {d0}, [ip, :64] //
        tst     ip, $0x38       //
        bne     0b              //

        cmp     r2, $0          // if length - 64 is negative,
        blt     L_reverseVectorCleanup // jump to the cleanup code
L_reverseCachelineAligned:
        sub     r3, r2, $0x3c0  // If 1024 < length < 32768, use core
        cmp     r3, $0x7c00     // register copies instead of NEON to
        blo     L_useSTMDB      // control energy usage.
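//
// The sub/cmp/blo pair is a single-compare range check: r2 holds the
// remaining length minus 64 here, so r3 is the remaining length minus 1024,
// and the unsigned "blo" is taken exactly when 1024 <= length < 32768
// (0x7c00 == 32768 - 1024). A C sketch of the same test:
//
//     #include <stdint.h>
//     static int use_core_registers(uint32_t length) {
//         // (length - 1024) wraps to a huge value when length < 1024, so
//         // one unsigned compare covers both ends of the range.
//         return (length - 1024u) < (32768u - 1024u);
//     }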
        sub     r1, $32         // decrement source
        sub     ip, $32         // decrement destination
        mov     r3, $-32        // load address increment
        tst     r1, $0x1f       // if source shares 32 byte alignment
        beq     L_reverseSourceAligned // jump to loop with more alignment hints
        vld1.32 {q2,q3}, [r1], r3 // This loop handles 4-byte aligned copies
        vld1.32 {q0,q1}, [r1], r3 // as generally as possible.
        subs    r2, $64         //
        vst1.64 {q2,q3}, [ip,:256], r3 // The Cortex-A8 NEON unit does not always
        blt     1f              // properly handle misalignment in vld1
        .align 3                // with an element size of 8 or 16, so
0:      vld1.32 {q2,q3}, [r1], r3 // this is the best we can do without
        vst1.64 {q0,q1}, [ip,:256], r3 // handling alignment in software.
        vld1.32 {q0,q1}, [r1], r3 //
        subs    r2, $64         //
        vst1.64 {q2,q3}, [ip,:256], r3 //
        bge     0b              //
        b       1f              //
L_reverseSourceAligned:
        vld1.64 {q2,q3}, [r1,:256], r3 // Identical to loop above except for
        vld1.64 {q0,q1}, [r1,:256], r3 // additional alignment information; this
        subs    r2, $64         // gets an additional .5 bytes per cycle
        vst1.64 {q2,q3}, [ip,:256], r3 // on Cortex-A8.
        blt     1f              //
        .align 3                //
0:      vld1.64 {q2,q3}, [r1,:256], r3 //
        vst1.64 {q0,q1}, [ip,:256], r3 //
        vld1.64 {q0,q1}, [r1,:256], r3 //
        subs    r2, $64         //
        vst1.64 {q2,q3}, [ip,:256], r3 //
        bge     0b              //

1:      vst1.64 {q0,q1}, [ip,:256], r3 // loop cleanup: final 32 byte store
        add     r1, $32         // point source at last element stored
        add     ip, $32         // point destination at last element stored
L_reverseVectorCleanup:
        adds    r2, $0x38       // If (length - 8) < 0, goto scalar cleanup
        blt     L_scalarReverseCopy //

0:      sub     r1, $8          // copy eight bytes at a time until
        vld1.32 {d0}, [r1]      // (length - 8) < 0.
        sub     ip, $8          //
        subs    r2, $8          //
        vst1.64 {d0}, [ip, :64] //
        bge     0b              //
/*****************************************************************************
 * sub-doubleword cleanup copies                                             *
 *****************************************************************************/
L_scalarReverseCopy:
        adds    r2, #0x8        // restore length
        it      eq              // if this is zero
        bxeq    lr              // early out

0:      ldrb    r3, [r1, #-1]!  // load a byte from source
        strb    r3, [ip, #-1]!  // store to destination
        subs    r2, #0x1        // subtract one from length
        bne     0b              // if non-zero, repeat
        bx      lr              //
/*****************************************************************************
 * STMDB loop for 1k-32k buffers                                             *
 *****************************************************************************/

L_useSTMDB:
        push    {r4-r8,r10,r11} // save the callee-saved registers we use
0:      ldmdb   r1!, {r3-r8,r10,r11}
        subs    r2, $64         // copy 64 bytes per iteration
        stmdb   ip!, {r3-r8,r10,r11}
        ldmdb   r1!, {r3-r8,r10,r11}
        stmdb   ip!, {r3-r8,r10,r11}
        bge     0b
        pop     {r4-r8,r10,r11}
        b       L_reverseVectorCleanup
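//
// Each LDMDB/STMDB pair moves 32 bytes through eight core registers. A rough
// C analogue of one 64-byte iteration of this loop (a sketch only; the
// pointers are word aligned on this path):
//
//     #include <stdint.h>
//     static void copy64_backwards(uint32_t **dst, const uint32_t **src) {
//         for (int i = 0; i < 16; ++i)   // 16 words = 64 bytes, moved back
//             *--(*dst) = *--(*src);     // to front through core registers
//     }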
/*****************************************************************************
 * Misaligned reverse vld1 loop                                              *
 *****************************************************************************/
// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes. Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.
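//
// The same realignment step written with NEON intrinsics (a sketch for the
// offset == 1 case; vext requires a compile-time constant shift, which is
// why the assembly expands one copy of the macro per alignment case):
//
//     #include <arm_neon.h>
//     // lo and hi are two consecutive 4-byte aligned doublewords; the
//     // result is the misaligned doubleword starting 1 byte into lo.
//     static uint8x8_t realign_by_1(uint8x8_t lo, uint8x8_t hi) {
//         return vext_u8(lo, hi, 1);
//     }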
#define RCOPY_UNALIGNED(offset) \
        vld1.32 {d2,d3}, [r1], r3      ;\
0:      vext.8  d0, d2, d3, $(offset)  ;\
        vld1.32 {d2}, [r1], r3         ;\
        vst1.64 {d0}, [ip, :64], r3    ;\
1:      vext.8  d0, d2, d3, $(offset)  ;\
        vst1.64 {d0}, [ip, :64]        ;\
        b       L_scalarReverseCopy

L_reverseAligned1: RCOPY_UNALIGNED(1)
L_reverseAligned2: RCOPY_UNALIGNED(2)
L_reverseAligned3: RCOPY_UNALIGNED(3)
/*****************************************************************************
 * front to back copy                                                        *
 *****************************************************************************/
        mov     ip, r0          // copy destination pointer.
        subs    r2, $8          // if length - 8 is negative (i.e. length
        blt     L_scalarCopy    // is less than 8), jump to cleanup path.
        tst     ip, $7          // if the destination is doubleword
        beq     L_vectorCopy    // aligned, jump to fast path.
0:      ldrb    r3, [r1], $1    // load byte
        sub     r2, $1          // decrement length
        strb    r3, [ip], $1    // store byte
        tst     ip, $7          // test alignment
        bne     0b              //

        cmp     r2, $0          // if length - 8 is negative,
        blt     L_scalarCopy    // jump to the cleanup code
/*****************************************************************************
 * destination is doubleword aligned                                         *
 *****************************************************************************/
L_vectorCopy:
        ands    r3, r1, $3      // Extract the alignment of the source
        tbh     [pc, r3, lsl $1] // Dispatch table on source alignment
0:
        .short (L_sourceAligned0-0b)/2 // The NEON alignment hardware does not work
        .short (L_sourceAligned1-0b)/2 // properly with sub 4-byte alignment and
        .short (L_sourceAligned2-0b)/2 // buffers that are uncacheable, so we need
        .short (L_sourceAligned3-0b)/2 // to have a software workaround.
/*****************************************************************************
 * source is also at least word aligned                                      *
 *****************************************************************************/

L_sourceAligned0:
        subs    r2, $0x38       // If (length - 64) < 0
        blt     L_vectorCleanup // jump to cleanup code
        tst     ip, $0x38       // If destination is 64 byte aligned
        beq     L_cachelineAligned // jump to main loop
0:      vld1.32 {d0}, [r1]!     // Copy one doubleword at a time until
        sub     r2, $8          // the destination is 64-byte aligned.
        vst1.64 {d0}, [ip, :64]! //
        tst     ip, $0x38       //
        bne     0b              //

        cmp     r2, $0          // If (length - 64) < 0, goto cleanup
        blt     L_vectorCleanup //
L_cachelineAligned:
        sub     r3, r2, $0x3c0  // If 1024 < length < 32768, use core
        cmp     r3, $0x7c00     // register copies instead of NEON to
        blo     L_useSTMIA      // control energy usage.
        tst     r1, $0x1f       // If source has 32-byte alignment, use
        beq     L_sourceAligned32 // an optimized loop.
        vld1.32 {q2,q3}, [r1]!  // This is the most common path for small
        vld1.32 {q0,q1}, [r1]!  // copies, which are alarmingly frequent.
        subs    r2, #0x40       // It requires 4-byte alignment on the
        vst1.64 {q2,q3}, [ip, :256]! // source. For ordinary malloc'd buffers,
        blt     1f              // this path could handle even single-byte
        .align 3                // alignment at speed by using vld1.8
0:      vld1.32 {q2,q3}, [r1]!  // instead of vld1.32; however, the NEON
        vst1.64 {q0,q1}, [ip, :256]! // alignment handler misbehaves for some
        vld1.32 {q0,q1}, [r1]!  // special copies if the element size is
        subs    r2, #0x40       // 8 or 16, so we need to work around
        vst1.64 {q2,q3}, [ip, :256]! // sub 4-byte alignment in software, in
        bge     0b              // another code path.
        b       1f              //

L_sourceAligned32:
        vld1.64 {q2,q3}, [r1, :256]! // When the source shares 32-byte alignment
        vld1.64 {q0,q1}, [r1, :256]! // with the destination, we use this loop
        subs    r2, #0x40       // instead, which specifies the maximum
        vst1.64 {q2,q3}, [ip, :256]! // :256 alignment on all loads and stores.
        blt     1f              //
        .align 3                // This gets an additional .5 bytes per
0:      vld1.64 {q2,q3}, [r1, :256]! // cycle for in-cache copies, which is not
        vst1.64 {q0,q1}, [ip, :256]! // insignificant for this (rather common)
        vld1.64 {q0,q1}, [r1, :256]! // case.
        subs    r2, #0x40       //
        vst1.64 {q2,q3}, [ip, :256]! // This is identical to the above loop,
        bge     0b              // except for the additional alignment.
1:      vst1.64 {q0,q1}, [ip, :256]! //
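//
// Both loops above are software pipelined: each trip stores the 32 bytes
// loaded on the previous trip while loading the next 32, hiding the vld1
// latency. A simplified forward-only sketch with NEON intrinsics (assumes
// the byte count is a positive multiple of 32 and the buffers do not
// overlap; the function name is mine):
//
//     #include <arm_neon.h>
//     #include <stddef.h>
//     #include <stdint.h>
//     static void copy_pipelined(uint8_t *dst, const uint8_t *src, size_t n) {
//         uint8x16_t a0 = vld1q_u8(src), a1 = vld1q_u8(src + 16);
//         src += 32;  n -= 32;              // prime the pipeline
//         while (n >= 32) {
//             uint8x16_t b0 = vld1q_u8(src), b1 = vld1q_u8(src + 16);
//             src += 32;  n -= 32;          // load the next block ...
//             vst1q_u8(dst, a0);  vst1q_u8(dst + 16, a1);
//             dst += 32;                    // ... while storing the previous
//             a0 = b0;  a1 = b1;
//         }
//         vst1q_u8(dst, a0);  vst1q_u8(dst + 16, a1);  // drain the pipeline
//     }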
L_vectorCleanup:
        adds    r2, $0x38       // If (length - 8) < 0, goto scalar cleanup
        blt     L_scalarCopy    //

0:      vld1.32 {d0}, [r1]!     // Copy one doubleword at a time until
        subs    r2, $8          // (length - 8) < 0.
        vst1.64 {d0}, [ip, :64]! //
        bge     0b              //
/*****************************************************************************
 * sub-doubleword cleanup copies                                             *
 *****************************************************************************/
L_scalarCopy:
        adds    r2, #0x8        // restore length
        it      eq              // if this is zero
        bxeq    lr              // early out

0:      ldrb    r3, [r1], #1    // load a byte from source
        strb    r3, [ip], #1    // store to destination
        subs    r2, #1          // subtract one from length
        bne     0b              // if non-zero, repeat
        bx      lr              //
/*****************************************************************************
 * STMIA loop for 1k-32k buffers                                             *
 *****************************************************************************/

L_useSTMIA:
        push    {r4-r8,r10,r11} // save the callee-saved registers we use
0:      ldmia   r1!, {r3-r8,r10,r11}
        subs    r2, #0x40       // copy 64 bytes per iteration
        stmia   ip!, {r3-r8,r10,r11}
        ldmia   r1!, {r3-r8,r10,r11}
        stmia   ip!, {r3-r8,r10,r11}
        bge     0b
        pop     {r4-r8,r10,r11}
        b       L_vectorCleanup
/*****************************************************************************
 * Misaligned vld1 loop                                                      *
 *****************************************************************************/
// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes. Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.
#define COPY_UNALIGNED(offset) \
        vld1.32 {d2,d3}, [r1]!         ;\
0:      vext.8  d0, d2, d3, $(offset)  ;\
        vld1.32 {d3}, [r1]!            ;\
        vst1.64 {d0}, [ip, :64]!       ;\
1:      vext.8  d0, d2, d3, $(offset)  ;\
        vst1.64 {d0}, [ip, :64]!       ;\
2:      add     r1, $(offset)          ;\